fix: add 60s timeout to LLM stream reads and add concurrent scan guard
All checks were successful
BotServer CI/CD / build (push) Successful in 3m53s
All checks were successful
BotServer CI/CD / build (push) Successful in 3m53s
- Add tokio timeout to SSE stream reads in OpenAI client (60s)
- Prevents indefinite hang when Kimi/Nvidia stops responding
- Add scanning AtomicBool to prevent concurrent check_gbkb_changes calls
- Skip GBKB scan entirely when all KBs already indexed in Qdrant

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
parent
c1df15eb48
commit
723407cfd6
2 changed files with 86 additions and 44 deletions
|
|
@ -55,6 +55,7 @@ pub struct DriveMonitor {
|
||||||
kb_manager: Arc<KnowledgeBaseManager>,
|
kb_manager: Arc<KnowledgeBaseManager>,
|
||||||
work_root: PathBuf,
|
work_root: PathBuf,
|
||||||
is_processing: Arc<AtomicBool>,
|
is_processing: Arc<AtomicBool>,
|
||||||
|
scanning: Arc<AtomicBool>,
|
||||||
consecutive_failures: Arc<AtomicU32>,
|
consecutive_failures: Arc<AtomicU32>,
|
||||||
#[cfg(any(feature = "research", feature = "llm"))]
|
#[cfg(any(feature = "research", feature = "llm"))]
|
||||||
files_being_indexed: Arc<TokioRwLock<HashSet<String>>>,
|
files_being_indexed: Arc<TokioRwLock<HashSet<String>>>,
|
||||||
|
|
@ -89,6 +90,7 @@ impl DriveMonitor {
|
||||||
kb_manager,
|
kb_manager,
|
||||||
work_root,
|
work_root,
|
||||||
is_processing: Arc::new(AtomicBool::new(false)),
|
is_processing: Arc::new(AtomicBool::new(false)),
|
||||||
|
scanning: Arc::new(AtomicBool::new(false)),
|
||||||
consecutive_failures: Arc::new(AtomicU32::new(0)),
|
consecutive_failures: Arc::new(AtomicU32::new(0)),
|
||||||
#[cfg(any(feature = "research", feature = "llm"))]
|
#[cfg(any(feature = "research", feature = "llm"))]
|
||||||
files_being_indexed: Arc::new(TokioRwLock::new(HashSet::new())),
|
files_being_indexed: Arc::new(TokioRwLock::new(HashSet::new())),
|
||||||
|
|
@ -1457,6 +1459,16 @@ etag: normalize_etag(obj.e_tag().unwrap_or_default()),
|
||||||
&self,
|
&self,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<(), Box<dyn Error + Send + Sync>> {
|
) -> Result<(), Box<dyn Error + Send + Sync>> {
|
||||||
|
// Prevent concurrent scans - if already scanning, skip this tick
|
||||||
|
if self
|
||||||
|
.scanning
|
||||||
|
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
trace!("[GBKB] Scan already in progress for bot {}, skipping", self.bot_id);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
debug!("[GBKB] check_gbkb_changes ENTER for bot {} (prefix: {})", self.bot_id, self.bucket_name);
|
debug!("[GBKB] check_gbkb_changes ENTER for bot {} (prefix: {})", self.bot_id, self.bucket_name);
|
||||||
let bot_name = self
|
let bot_name = self
|
||||||
.bucket_name
|
.bucket_name
|
||||||
|
|
@ -1796,6 +1808,7 @@ let file_state = FileState {
|
||||||
|
|
||||||
debug!("[GBKB] check_gbkb_changes EXIT for bot {}", self.bot_id);
|
debug!("[GBKB] check_gbkb_changes EXIT for bot {}", self.bot_id);
|
||||||
trace!("check_gbkb_changes EXIT");
|
trace!("check_gbkb_changes EXIT");
|
||||||
|
self.scanning.store(false, Ordering::Release);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -452,8 +452,27 @@ impl LLMProvider for OpenAIClient {
|
||||||
// Accumulate tool calls here because OpenAI streams them in fragments
|
// Accumulate tool calls here because OpenAI streams them in fragments
|
||||||
let mut active_tool_calls: Vec<serde_json::Value> = Vec::new();
|
let mut active_tool_calls: Vec<serde_json::Value> = Vec::new();
|
||||||
|
|
||||||
while let Some(chunk_result) = stream.next().await {
|
// Add timeout to stream reads - if Kimi/Nvidia stops responding, fail gracefully
|
||||||
let chunk = chunk_result?;
|
const STREAM_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let chunk_opt = match tokio::time::timeout(
|
||||||
|
STREAM_TIMEOUT,
|
||||||
|
stream.next(),
|
||||||
|
).await {
|
||||||
|
Ok(opt) => opt,
|
||||||
|
Err(_) => {
|
||||||
|
// Timeout - LLM stopped sending data
|
||||||
|
log::warn!("[LLM] Stream timed out after {}s for model {}",
|
||||||
|
STREAM_TIMEOUT.as_secs(), model);
|
||||||
|
let _ = tx.send(format!("[ERROR] LLM response timed out after {} seconds.",
|
||||||
|
STREAM_TIMEOUT.as_secs())).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match chunk_opt {
|
||||||
|
Some(Ok(chunk)) => {
|
||||||
let chunk_str = String::from_utf8_lossy(&chunk);
|
let chunk_str = String::from_utf8_lossy(&chunk);
|
||||||
for line in chunk_str.lines() {
|
for line in chunk_str.lines() {
|
||||||
if line.starts_with("data: ") && !line.contains("[DONE]") {
|
if line.starts_with("data: ") && !line.contains("[DONE]") {
|
||||||
|
|
@ -507,6 +526,16 @@ if line.starts_with("data: ") && !line.contains("[DONE]") {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Some(Err(e)) => {
|
||||||
|
log::error!("[LLM] Stream error: {}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// Stream ended
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Send accumulated tool calls when stream finishes
|
// Send accumulated tool calls when stream finishes
|
||||||
for tool_call in active_tool_calls {
|
for tool_call in active_tool_calls {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue