fix: DriveMonitor skips unchanged files on rescan, skips directory entries
- Only upsert drive_files when the ETag actually changed (previously all files were re-processed on every 60s cycle)
- Skip S3 directory entries (keys ending with '/') to avoid storing stale directory markers
- Add debug-level logging for unchanged file skips
- Fixes noisy 'Added/updated drive_files' spam on every scan cycle
This commit is contained in:
parent
c70fbba099
commit
1ae0ad7051
1 changed file with 37 additions and 22 deletions
|
|
@@ -34,32 +34,47 @@ impl DriveMonitor {
|
||||||
|
|
||||||
let current_keys: Vec<String> = objects.iter().map(|o| o.key.clone()).collect();
|
let current_keys: Vec<String> = objects.iter().map(|o| o.key.clone()).collect();
|
||||||
|
|
||||||
for obj in &objects {
|
for obj in &objects {
|
||||||
let file_type = classify_file(&obj.key);
|
if obj.key.ends_with('/') {
|
||||||
|
log::debug!("Skipping directory entry: {}", obj.key);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let file_type = classify_file(&obj.key);
|
||||||
let full_key = format!("{}.gbai/{}", bot_name, obj.key);
|
let full_key = format!("{}.gbai/{}", bot_name, obj.key);
|
||||||
let etag = obj.etag.as_deref().map(normalize_etag);
|
let etag = obj.etag.as_deref().map(normalize_etag);
|
||||||
|
|
||||||
let existing = self.file_repo.get_file_state(self.bot_id, &full_key);
|
let existing = self.file_repo.get_file_state(self.bot_id, &full_key);
|
||||||
let needs_reindex = match &existing {
|
let needs_reindex = match &existing {
|
||||||
Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false,
|
Some(prev) if prev.indexed && prev.etag.as_deref() == etag.as_deref() => false,
|
||||||
Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => {
|
Some(prev) if prev.indexed && prev.etag.as_deref() != etag.as_deref() => {
|
||||||
log::info!("ETag changed for {}, will reindex", full_key);
|
log::info!("ETag changed for {}, will reindex", full_key);
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
Some(_) => !existing.as_ref().map_or(false, |f| f.indexed),
|
Some(prev) if !prev.indexed && prev.etag.as_deref() == etag.as_deref() => {
|
||||||
None => true,
|
log::debug!("{} unchanged but not yet indexed, will index", full_key);
|
||||||
};
|
true
|
||||||
|
}
|
||||||
|
Some(_) => true,
|
||||||
|
None => true,
|
||||||
|
};
|
||||||
|
|
||||||
match self.file_repo.upsert_file(
|
let etag_changed = existing.as_ref().map_or(true, |prev| prev.etag.as_deref() != etag.as_deref());
|
||||||
self.bot_id,
|
|
||||||
&full_key,
|
if etag_changed || existing.is_none() {
|
||||||
file_type,
|
match self.file_repo.upsert_file(
|
||||||
etag,
|
self.bot_id,
|
||||||
None,
|
&full_key,
|
||||||
) {
|
file_type,
|
||||||
Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type),
|
etag,
|
||||||
Err(e) => log::error!("Failed to upsert {}: {}", full_key, e),
|
None,
|
||||||
}
|
) {
|
||||||
|
Ok(_) => log::info!("Added/updated drive_files for: {} ({})", full_key, file_type),
|
||||||
|
Err(e) => log::error!("Failed to upsert {}: {}", full_key, e),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log::debug!("{} unchanged, skipping upsert", full_key);
|
||||||
|
}
|
||||||
|
|
||||||
if needs_reindex && file_type == "kb" {
|
if needs_reindex && file_type == "kb" {
|
||||||
#[cfg(any(feature = "research", feature = "llm"))]
|
#[cfg(any(feature = "research", feature = "llm"))]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue