fix: detect new PDFs in already-indexed KB folders
All checks were successful
BotServer CI/CD / build (push) Successful in 3m11s

- Don't skip the entire GBKB scan when all KBs are already indexed
- Instead, skip only the individual files that are already tracked (i.e., not new)
- This allows new PDFs added to existing KB folders to be detected and indexed

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-04-13 16:03:22 -03:00
parent 6d987c0eea
commit 32fbdb4b17

View file

@@ -1543,7 +1543,7 @@ let file_state = FileState {
debug!("[GBKB] Found {} files total, acquiring file_states lock...", current_files.len()); debug!("[GBKB] Found {} files total, acquiring file_states lock...", current_files.len());
// Check if ALL KBs for this bot are already indexed in Qdrant // Check if ALL KBs for this bot are already indexed in Qdrant
// If so, skip the entire scan to avoid deadlock and unnecessary downloads // If so, only scan for NEW files - skip re-indexing existing ones
let mut kb_folders: HashSet<String> = HashSet::new(); let mut kb_folders: HashSet<String> = HashSet::new();
for (path, _) in current_files.iter() { for (path, _) in current_files.iter() {
let parts: Vec<&str> = path.split('/').collect(); let parts: Vec<&str> = path.split('/').collect();
@@ -1565,19 +1565,19 @@ let file_state = FileState {
} }
} }
if all_indexed && !kb_folders.is_empty() {
trace!("[GBKB] All {} KB folders already indexed, skipping scan for bot {}",
kb_folders.len(), self.bot_id);
return Ok(());
}
let mut file_states = self.file_states.write().await; let mut file_states = self.file_states.write().await;
debug!("[GBKB] file_states lock acquired, processing {} files", current_files.len()); debug!("[GBKB] file_states lock acquired, processing {} files (all_indexed={})", current_files.len(), all_indexed);
for (path, current_state) in current_files.iter() { for (path, current_state) in current_files.iter() {
let is_new = !file_states.contains_key(path); let is_new = !file_states.contains_key(path);
debug!("[GBKB] DEBUG: path={} in_file_states={}", path, !is_new); debug!("[GBKB] DEBUG: path={} in_file_states={}", path, !is_new);
// When all KBs are indexed, skip files that are already tracked (not new)
if all_indexed && !is_new {
trace!("[GBKB] Skipping already indexed file: {}", path);
continue;
}
// Use last_modified as primary change detector (more stable than ETag) // Use last_modified as primary change detector (more stable than ETag)
// ETags can change due to metadata updates even when content is identical // ETags can change due to metadata updates even when content is identical
let is_modified = if let Some(prev) = file_states.get(path) { let is_modified = if let Some(prev) = file_states.get(path) {