Fix KB indexing: upsert kb_collections, consistent collection names, preserve indexed flag
All checks were successful
BotServer CI/CD / build (push) Successful in 3m23s
All checks were successful
BotServer CI/CD / build (push) Successful in 3m23s
- Bug 1: check_gbkb_changes now preserves indexed=true from previous state when etag matches, preventing redundant re-indexing every cycle
- Bug 2: USE KB fallback uses bot_id_short (8 chars) instead of random UUID, matching the collection name convention used by DriveMonitor
- Bug 3: handle_gbkb_change now upserts into kb_collections table after successful indexing, so USE KB can find the collection at runtime
- Changed ON CONFLICT DO NOTHING to DO UPDATE for kb_collections inserts
- Changed process_gbkb_folder return type to Result<IndexingResult>
This commit is contained in:
parent
e81aee6221
commit
7a1ec157f1
4 changed files with 61 additions and 9 deletions
|
|
@@ -220,18 +220,21 @@ fn add_kb_to_session(
|
||||||
(kb_result.folder_path, kb_result.qdrant_collection)
|
(kb_result.folder_path, kb_result.qdrant_collection)
|
||||||
} else {
|
} else {
|
||||||
let default_path = format!("work/{}/{}.gbkb/{}", bot_name, bot_name, kb_name);
|
let default_path = format!("work/{}/{}.gbkb/{}", bot_name, bot_name, kb_name);
|
||||||
|
let bot_id_short: String = bot_id.to_string().chars().take(8).collect();
|
||||||
|
let default_collection = format!("{}_{}_{}", bot_name, bot_id_short, kb_name);
|
||||||
let kb_id = Uuid::new_v4();
|
let kb_id = Uuid::new_v4();
|
||||||
let default_collection = format!("{}_{}_{}", bot_name, kb_id, kb_name);
|
|
||||||
|
|
||||||
warn!(
|
warn!(
|
||||||
"KB '{}' not found in kb_collections for bot {}. Using default path: {}",
|
"KB '{}' not found in kb_collections for bot {}. Using default path: {}, collection: {}",
|
||||||
kb_name, bot_name, default_path
|
kb_name, bot_name, default_path, default_collection
|
||||||
);
|
);
|
||||||
|
|
||||||
diesel::sql_query(
|
diesel::sql_query(
|
||||||
"INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count)
|
"INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count)
|
||||||
VALUES ($1, $2, $3, $4, $5, 0)
|
VALUES ($1, $2, $3, $4, $5, 0)
|
||||||
ON CONFLICT (bot_id, name) DO NOTHING"
|
ON CONFLICT (bot_id, name) DO UPDATE SET
|
||||||
|
folder_path = EXCLUDED.folder_path,
|
||||||
|
qdrant_collection = EXCLUDED.qdrant_collection"
|
||||||
)
|
)
|
||||||
.bind::<diesel::sql_types::Uuid, _>(kb_id)
|
.bind::<diesel::sql_types::Uuid, _>(kb_id)
|
||||||
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
||||||
|
|
|
||||||
|
|
@@ -941,7 +941,7 @@ impl KbFolderMonitor {
|
||||||
Self { indexer, work_root }
|
Self { indexer, work_root }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn process_gbkb_folder(&self, bot_id: Uuid, bot_name: &str, kb_folder: &Path) -> Result<()> {
|
pub async fn process_gbkb_folder(&self, bot_id: Uuid, bot_name: &str, kb_folder: &Path) -> Result<IndexingResult> {
|
||||||
let kb_name = kb_folder
|
let kb_name = kb_folder
|
||||||
.file_name()
|
.file_name()
|
||||||
.and_then(|n| n.to_str())
|
.and_then(|n| n.to_str())
|
||||||
|
|
@@ -965,6 +965,6 @@ impl KbFolderMonitor {
|
||||||
result.documents_processed, result.chunks_indexed, result.collection_name
|
result.documents_processed, result.chunks_indexed, result.collection_name
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(())
|
Ok(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -9,11 +9,12 @@ pub use document_processor::{DocumentFormat, DocumentProcessor, TextChunk};
|
||||||
pub use embedding_generator::{
|
pub use embedding_generator::{
|
||||||
EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator,
|
EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator,
|
||||||
};
|
};
|
||||||
pub use kb_indexer::{CollectionInfo, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult};
|
pub use kb_indexer::{CollectionInfo, IndexingResult, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult};
|
||||||
pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig};
|
pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig};
|
||||||
pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService};
|
pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use diesel::prelude::*;
|
||||||
use log::{error, info, warn};
|
use log::{error, info, warn};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
@@ -172,7 +173,50 @@ impl KnowledgeBaseManager {
|
||||||
);
|
);
|
||||||
|
|
||||||
let monitor = self.monitor.read().await;
|
let monitor = self.monitor.read().await;
|
||||||
monitor.process_gbkb_folder(bot_id, bot_name, kb_folder).await
|
let result = monitor.process_gbkb_folder(bot_id, bot_name, kb_folder).await?;
|
||||||
|
|
||||||
|
let kb_name = kb_folder
|
||||||
|
.file_name()
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
let collection_name = result.collection_name.clone();
|
||||||
|
let folder_path = kb_folder.to_string_lossy().to_string();
|
||||||
|
let doc_count = result.documents_processed;
|
||||||
|
|
||||||
|
if let Some(pool) = self.indexer.get_db_pool() {
|
||||||
|
if let Ok(mut conn) = pool.get() {
|
||||||
|
diesel::sql_query(
|
||||||
|
"INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count)
|
||||||
|
VALUES ($1, $2, $3, $4, $5, $6)
|
||||||
|
ON CONFLICT (bot_id, name) DO UPDATE SET
|
||||||
|
folder_path = EXCLUDED.folder_path,
|
||||||
|
qdrant_collection = EXCLUDED.qdrant_collection,
|
||||||
|
document_count = EXCLUDED.document_count,
|
||||||
|
updated_at = NOW()"
|
||||||
|
)
|
||||||
|
.bind::<diesel::sql_types::Uuid, _>(Uuid::new_v4())
|
||||||
|
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(kb_name)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(&folder_path)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(&collection_name)
|
||||||
|
.bind::<diesel::sql_types::Integer, _>(doc_count as i32)
|
||||||
|
.execute(&mut conn)
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("Failed to upsert kb_collections for {}/{}: {}", bot_name, kb_name, e);
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
info!(
|
||||||
|
"Upserted kb_collections: bot={}/{}, kb={}, collection={}, docs={}",
|
||||||
|
bot_name, bot_id, kb_name, collection_name, doc_count
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
warn!("No DB connection available to upsert kb_collections for {}/{}", bot_name, kb_name);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
warn!("No DB pool available to upsert kb_collections for {}/{}", bot_name, kb_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn clear_kb(&self, bot_id: Uuid, bot_name: &str, kb_name: &str) -> Result<()> {
|
pub async fn clear_kb(&self, bot_id: Uuid, bot_name: &str, kb_name: &str) -> Result<()> {
|
||||||
|
|
|
||||||
|
|
@@ -1373,7 +1373,12 @@ impl DriveMonitor {
|
||||||
files_processed, pdf_files_found
|
files_processed, pdf_files_found
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for (path, state) in current_files {
|
for (path, mut state) in current_files {
|
||||||
|
if let Some(previous) = file_states.get(&path) {
|
||||||
|
if previous.indexed && state.etag == previous.etag {
|
||||||
|
state.indexed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
file_states.insert(path, state);
|
file_states.insert(path, state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue