fix: DriveMonitor skips unchanged files on rescan, skips directory entries
- Only upsert drive_files when ETag actually changed (was re-processing all files every 60s cycle) - Skip S3 directory entries (keys ending with '/') to avoid storing stale directory markers - Add debug-level logging for unchanged file skips - Fixes noisy 'Added/updated drive_files' spam on every scan cycle
This commit is contained in:
parent
c70fbba099
commit
1ae0ad7051
1 changed files with 37 additions and 22 deletions
|
|
@ -34,32 +34,47 @@ impl DriveMonitor {
|
|||
|
||||
let current_keys: Vec<String> = objects.iter().map(|o| o.key.clone()).collect();
|
||||
|
||||
for obj in &objects {
|
||||
let file_type = classify_file(&obj.key);
|
||||
for obj in &objects {
|
||||
if obj.key.ends_with('/') {
|
||||
log::debug!("Skipping directory entry: {}", obj.key);
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_type = classify_file(&obj.key);
|
||||
let full_key = format!("{}.gbai/{}", bot_name, obj.key);
|
||||
let etag = obj.etag.as_deref().map(normalize_etag);
|
||||
|
||||
let existing = self.file_repo.get_file_state(self.bot_id, &full_key);
|
||||
let needs_reindex = match &existing {
|
||||
Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false,
|
||||
Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => {
|
||||
log::info!("ETag changed for {}, will reindex", full_key);
|
||||
true
|
||||
}
|
||||
Some(_) => !existing.as_ref().map_or(false, |f| f.indexed),
|
||||
None => true,
|
||||
};
|
||||
let existing = self.file_repo.get_file_state(self.bot_id, &full_key);
|
||||
let needs_reindex = match &existing {
|
||||
Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false,
|
||||
Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => {
|
||||
log::info!("ETag changed for {}, will reindex", full_key);
|
||||
true
|
||||
}
|
||||
Some(prev) if !prev.indexed && prev.etag.as_deref() == etag.as_deref() => {
|
||||
log::debug!("{} unchanged but not yet indexed, will index", full_key);
|
||||
true
|
||||
}
|
||||
Some(_) => true,
|
||||
None => true,
|
||||
};
|
||||
|
||||
match self.file_repo.upsert_file(
|
||||
self.bot_id,
|
||||
&full_key,
|
||||
file_type,
|
||||
etag,
|
||||
None,
|
||||
) {
|
||||
Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type),
|
||||
Err(e) => log::error!("Failed to upsert {}: {}", full_key, e),
|
||||
}
|
||||
let etag_changed = existing.as_ref().map_or(true, |prev| prev.etag.as_deref() != etag.as_deref());
|
||||
|
||||
if etag_changed || existing.is_none() {
|
||||
match self.file_repo.upsert_file(
|
||||
self.bot_id,
|
||||
&full_key,
|
||||
file_type,
|
||||
etag,
|
||||
None,
|
||||
) {
|
||||
Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type),
|
||||
Err(e) => log::error!("Failed to upsert {}: {}", full_key, e),
|
||||
}
|
||||
} else {
|
||||
log::debug!("{} unchanged, skipping upsert", full_key);
|
||||
}
|
||||
|
||||
if needs_reindex && file_type == "kb" {
|
||||
#[cfg(any(feature = "research", feature = "llm"))]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue