From 32fbdb4b171edf5a106173b33cfe9a299fc8fe6d Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Mon, 13 Apr 2026 16:03:22 -0300 Subject: [PATCH] fix: detect new PDFs in already-indexed KB folders - Don't skip entire GBKB scan when all KBs are indexed - Instead, skip individual files that are already tracked (not new) - This allows new PDFs added to existing KB folders to be detected and indexed Co-authored-by: Qwen-Coder --- src/drive/drive_monitor/mod.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/drive/drive_monitor/mod.rs b/src/drive/drive_monitor/mod.rs index 05a84dbd..3e9e42b1 100644 --- a/src/drive/drive_monitor/mod.rs +++ b/src/drive/drive_monitor/mod.rs @@ -1541,9 +1541,9 @@ let file_state = FileState { } debug!("[GBKB] Found {} files total, acquiring file_states lock...", current_files.len()); - + // Check if ALL KBs for this bot are already indexed in Qdrant - // If so, skip the entire scan to avoid deadlock and unnecessary downloads + // If so, only scan for NEW files - skip re-indexing existing ones let mut kb_folders: HashSet = HashSet::new(); for (path, _) in current_files.iter() { let parts: Vec<&str> = path.split('/').collect(); @@ -1551,7 +1551,7 @@ let file_state = FileState { kb_folders.insert(parts[1].to_string()); } } - + let mut all_indexed = true; for kb_name in &kb_folders { let kb_key = format!("{}_{}", bot_name, kb_name); @@ -1564,20 +1564,20 @@ let file_state = FileState { break; } } - - if all_indexed && !kb_folders.is_empty() { - trace!("[GBKB] All {} KB folders already indexed, skipping scan for bot {}", - kb_folders.len(), self.bot_id); - return Ok(()); - } - + let mut file_states = self.file_states.write().await; - debug!("[GBKB] file_states lock acquired, processing {} files", current_files.len()); + debug!("[GBKB] file_states lock acquired, processing {} files (all_indexed={})", current_files.len(), all_indexed); for (path, current_state) in current_files.iter() { let is_new = !file_states.contains_key(path); debug!("[GBKB] DEBUG: path={} in_file_states={}", path, !is_new); - + + // When all KBs are indexed, skip files that are already tracked (not new) + if all_indexed && !is_new { + trace!("[GBKB] Skipping already indexed file: {}", path); + continue; + } + // Use last_modified as primary change detector (more stable than ETag) // ETags can change due to metadata updates even when content is identical let is_modified = if let Some(prev) = file_states.get(path) {