From 1ae0ad70513af82003140b226a8872e9017274c4 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Tue, 21 Apr 2026 16:16:39 +0000 Subject: [PATCH] fix: DriveMonitor skips unchanged files on rescan, skips directory entries - Only upsert drive_files when ETag actually changed (was re-processing all files every 60s cycle) - Skip S3 directory entries (keys ending with '/') to avoid storing stale directory markers - Add debug-level logging for unchanged file skips - Fixes noisy 'Added/updated drive_files' spam on every scan cycle --- botserver/src/drive/drive_monitor/types.rs | 59 ++++++++++++++-------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/botserver/src/drive/drive_monitor/types.rs b/botserver/src/drive/drive_monitor/types.rs index 3731db4a..5e6960d6 100644 --- a/botserver/src/drive/drive_monitor/types.rs +++ b/botserver/src/drive/drive_monitor/types.rs @@ -34,32 +34,47 @@ impl DriveMonitor { let current_keys: Vec = objects.iter().map(|o| o.key.clone()).collect(); - for obj in &objects { - let file_type = classify_file(&obj.key); + for obj in &objects { + if obj.key.ends_with('/') { + log::debug!("Skipping directory entry: {}", obj.key); + continue; + } + + let file_type = classify_file(&obj.key); let full_key = format!("{}.gbai/{}", bot_name, obj.key); let etag = obj.etag.as_deref().map(normalize_etag); - let existing = self.file_repo.get_file_state(self.bot_id, &full_key); - let needs_reindex = match &existing { - Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false, - Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => { - log::info!("ETag changed for {}, will reindex", full_key); - true - } - Some(_) => !existing.as_ref().map_or(false, |f| f.indexed), - None => true, - }; + let existing = self.file_repo.get_file_state(self.bot_id, &full_key); + let needs_reindex = match &existing { + Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false, + Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => { + log::info!("ETag changed for {}, will reindex", full_key); + true + } + Some(prev) if !prev.indexed && prev.etag.as_deref() == etag.as_deref() => { + log::debug!("{} unchanged but not yet indexed, will index", full_key); + true + } + Some(_) => true, + None => true, + }; - match self.file_repo.upsert_file( - self.bot_id, - &full_key, - file_type, - etag, - None, - ) { - Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type), - Err(e) => log::error!("Failed to upsert {}: {}", full_key, e), - } + let etag_changed = existing.as_ref().map_or(true, |prev| prev.etag.as_deref() != etag.as_deref()); + + if etag_changed || existing.is_none() { + match self.file_repo.upsert_file( + self.bot_id, + &full_key, + file_type, + etag, + None, + ) { + Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type), + Err(e) => log::error!("Failed to upsert {}: {}", full_key, e), + } + } else { + log::debug!("{} unchanged, skipping upsert", full_key); + } if needs_reindex && file_type == "kb" { #[cfg(any(feature = "research", feature = "llm"))]