fix(drive_monitor): 1s interval, path matching, error isolation
All checks were successful
BotServer CI/CD / build (push) Successful in 4m31s
All checks were successful
BotServer CI/CD / build (push) Successful in 4m31s
This commit is contained in:
parent
f9178e947e
commit
162515ba11
1 changed file with 25 additions and 65 deletions
|
|
@ -66,6 +66,12 @@ pub struct DriveMonitor {
|
||||||
_pending_kb_index: Arc<TokioRwLock<HashSet<String>>>,
|
_pending_kb_index: Arc<TokioRwLock<HashSet<String>>>,
|
||||||
// Database-backed file state repository (replaces JSON file_states)
|
// Database-backed file state repository (replaces JSON file_states)
|
||||||
file_repo: Arc<DriveFileRepository>,
|
file_repo: Arc<DriveFileRepository>,
|
||||||
|
// Queue for pending file changes (prevents reentrant processing) - reserved for future use
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pending_changes: Arc<TokioRwLock<Vec<String>>>,
|
||||||
|
// Last processed etag snapshot for quick diff - reserved for future use
|
||||||
|
#[allow(dead_code)]
|
||||||
|
last_etag_snapshot: Arc<TokioRwLock<HashMap<String, String>>>,
|
||||||
}
|
}
|
||||||
impl DriveMonitor {
|
impl DriveMonitor {
|
||||||
fn normalize_config_value(value: &str) -> String {
|
fn normalize_config_value(value: &str) -> String {
|
||||||
|
|
@ -104,6 +110,8 @@ pub fn new(state: Arc<AppState>, bucket_name: String, bot_id: uuid::Uuid) -> Sel
|
||||||
#[cfg(not(any(feature = "research", feature = "llm")))]
|
#[cfg(not(any(feature = "research", feature = "llm")))]
|
||||||
_pending_kb_index: Arc::new(TokioRwLock::new(HashSet::new())),
|
_pending_kb_index: Arc::new(TokioRwLock::new(HashSet::new())),
|
||||||
file_repo,
|
file_repo,
|
||||||
|
pending_changes: Arc::new(TokioRwLock::new(Vec::new())),
|
||||||
|
last_etag_snapshot: Arc::new(TokioRwLock::new(HashMap::new())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -427,26 +435,22 @@ match result {
|
||||||
}
|
}
|
||||||
pub fn spawn(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
|
pub fn spawn(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
trace!(
|
info!(
|
||||||
"Drive Monitor service started for bucket: {}",
|
"Drive Monitor service started for bucket: {}",
|
||||||
self.bucket_name
|
self.bucket_name
|
||||||
);
|
);
|
||||||
loop {
|
loop {
|
||||||
let backoff = self.calculate_backoff();
|
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||||
tokio::time::sleep(backoff).await;
|
|
||||||
|
|
||||||
if self.is_processing.load(Ordering::Acquire) {
|
if self.is_processing.load(Ordering::Acquire) {
|
||||||
log::warn!(
|
|
||||||
"Drive monitor is still processing previous changes, skipping this tick"
|
|
||||||
);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if !self.check_drive_health().await {
|
if !self.check_drive_health().await {
|
||||||
let failures = self.consecutive_failures.fetch_add(1, Ordering::Relaxed) + 1;
|
let failures = self.consecutive_failures.fetch_add(1, Ordering::Relaxed) + 1;
|
||||||
if failures % 10 == 1 {
|
if failures % 10 == 1 {
|
||||||
warn!("S3/MinIO unavailable for bucket {} (failures: {}), backing off to {:?}",
|
warn!("S3/MinIO unavailable for bucket {} (failures: {})",
|
||||||
self.bucket_name, failures, self.calculate_backoff());
|
self.bucket_name, failures);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -457,13 +461,13 @@ match result {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
let prev_failures = self.consecutive_failures.swap(0, Ordering::Relaxed);
|
let prev_failures = self.consecutive_failures.swap(0, Ordering::Relaxed);
|
||||||
if prev_failures > 0 {
|
if prev_failures > 0 {
|
||||||
trace!("S3/MinIO recovered for bucket {} after {} failures",
|
info!("S3/MinIO recovered for bucket {} after {} failures",
|
||||||
self.bucket_name, prev_failures);
|
self.bucket_name, prev_failures);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||||
log::error!("Error checking for drive changes: {}", e);
|
error!("Error checking for drive changes: {}", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -472,62 +476,22 @@ match result {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
async fn check_for_changes(&self) -> Result<(), Box<dyn Error + Send + Sync>> {
|
async fn check_for_changes(&self) -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||||
trace!("check_for_changes ENTER");
|
|
||||||
let start_mem = MemoryStats::current();
|
|
||||||
trace!(
|
|
||||||
"check_for_changes START, RSS={}",
|
|
||||||
MemoryStats::format_bytes(start_mem.rss_bytes)
|
|
||||||
);
|
|
||||||
|
|
||||||
let Some(client) = &self.state.drive else {
|
let Some(client) = &self.state.drive else {
|
||||||
warn!("No drive client available for bot {}, skipping file monitoring", self.bot_id);
|
warn!("No drive client available for bot {}, skipping file monitoring", self.bot_id);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
};
|
};
|
||||||
|
|
||||||
trace!("check_for_changes: calling check_gbdialog_changes...");
|
// All checks run independently - one failure doesn't stop others
|
||||||
trace!("Checking gbdialog...");
|
if let Err(e) = self.check_gbdialog_changes(client).await {
|
||||||
self.check_gbdialog_changes(client).await?;
|
error!("gbdialog check failed: {}", e);
|
||||||
trace!("check_for_changes: check_gbdialog_changes done");
|
}
|
||||||
let after_dialog = MemoryStats::current();
|
if let Err(e) = self.check_gbot(client).await {
|
||||||
trace!(
|
error!("gbot check failed: {}", e);
|
||||||
"After gbdialog, RSS={} (delta={})",
|
}
|
||||||
MemoryStats::format_bytes(after_dialog.rss_bytes),
|
if let Err(e) = self.check_gbkb_changes(client).await {
|
||||||
MemoryStats::format_bytes(after_dialog.rss_bytes.saturating_sub(start_mem.rss_bytes))
|
error!("gbkb check failed: {}", e);
|
||||||
);
|
|
||||||
|
|
||||||
trace!("check_for_changes: calling check_gbot...");
|
|
||||||
trace!("Checking gbot...");
|
|
||||||
self.check_gbot(client).await?;
|
|
||||||
trace!("check_for_changes: check_gbot done");
|
|
||||||
let after_gbot = MemoryStats::current();
|
|
||||||
trace!(
|
|
||||||
"After gbot, RSS={} (delta={})",
|
|
||||||
MemoryStats::format_bytes(after_gbot.rss_bytes),
|
|
||||||
MemoryStats::format_bytes(after_gbot.rss_bytes.saturating_sub(after_dialog.rss_bytes))
|
|
||||||
);
|
|
||||||
|
|
||||||
trace!("check_for_changes: calling check_gbkb_changes...");
|
|
||||||
trace!("Checking gbkb...");
|
|
||||||
self.check_gbkb_changes(client).await?;
|
|
||||||
trace!("check_for_changes: check_gbkb_changes done");
|
|
||||||
let after_gbkb = MemoryStats::current();
|
|
||||||
trace!(
|
|
||||||
"After gbkb, RSS={} (delta={})",
|
|
||||||
MemoryStats::format_bytes(after_gbkb.rss_bytes),
|
|
||||||
MemoryStats::format_bytes(after_gbkb.rss_bytes.saturating_sub(after_gbot.rss_bytes))
|
|
||||||
);
|
|
||||||
|
|
||||||
log_jemalloc_stats();
|
|
||||||
|
|
||||||
let total_delta = after_gbkb.rss_bytes.saturating_sub(start_mem.rss_bytes);
|
|
||||||
if total_delta > 50 * 1024 * 1024 {
|
|
||||||
warn!(
|
|
||||||
"check_for_changes grew by {} - potential leak!",
|
|
||||||
MemoryStats::format_bytes(total_delta)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
trace!("check_for_changes EXIT");
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
async fn check_gbdialog_changes(
|
async fn check_gbdialog_changes(
|
||||||
|
|
@ -557,12 +521,8 @@ match result {
|
||||||
};
|
};
|
||||||
for obj in list_objects.contents.unwrap_or_default() {
|
for obj in list_objects.contents.unwrap_or_default() {
|
||||||
let path = obj.key().unwrap_or_default().to_string();
|
let path = obj.key().unwrap_or_default().to_string();
|
||||||
let path_parts: Vec<&str> = path.split('/').collect();
|
// Must end with .bas and contain .gbdialog anywhere (not just at start)
|
||||||
// Filter for paths matching *.gbdialog/*.bas pattern
|
if !path.to_ascii_lowercase().ends_with(".bas") || !path.to_lowercase().contains(".gbdialog") {
|
||||||
if path_parts.len() < 2 || !path_parts[0].ends_with(".gbdialog") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if path.ends_with('/') || !path.to_ascii_lowercase().ends_with(".bas") {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let etag = normalize_etag(obj.e_tag().unwrap_or_default());
|
let etag = normalize_etag(obj.e_tag().unwrap_or_default());
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue