revert: restore llm/mod.rs to stable April 9 version
All checks were successful
BotServer CI/CD / build (push) Successful in 3m26s

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
Rodrigo Rodriguez (Pragmatismo) 2026-04-13 15:07:19 -03:00
parent 765bd624f4
commit c5d30adebe

llm/mod.rs

@@ -1,6 +1,6 @@
 use async_trait::async_trait;
 use futures::StreamExt;
-use log::{error, info, trace};
+use log::{error, info};
 use serde_json::Value;
 use std::sync::Arc;
 use tokio::sync::{mpsc, RwLock};
@@ -11,7 +11,6 @@ pub mod episodic_memory;
 pub mod glm;
 pub mod hallucination_detector;
 pub mod llm_models;
-#[cfg(feature = "llm")]
 pub mod local;
 pub mod observability;
 pub mod rate_limiter;
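
Before the revert, local was compiled only when an llm cargo feature was
enabled; the restored version declares the module unconditionally, so it is
always built. For reference, the gated form works like this (assuming an
llm = [] entry under [features] in Cargo.toml):

    // Compiled only when building with --features llm; otherwise the
    // module and everything it pulls in are skipped entirely.
    #[cfg(feature = "llm")]
    pub mod local;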
@@ -290,7 +289,7 @@ impl LLMProvider for OpenAIClient {
             128000 // Cerebras gpt-oss models and GPT-4 variants
         } else if model.contains("gpt-3.5") {
             16385
-        } else if model == "local" || model.is_empty() {
+        } else if model.starts_with("http://localhost:808") || model == "local" {
             768 // Local llama.cpp server context limit
         } else {
             32768 // Default conservative limit for modern models
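
The restored branch keys the context window off the configured model string,
treating a localhost URL or the literal "local" as the small llama.cpp
server. A minimal sketch of that selection logic, written as a free function
for illustration (the name context_limit_for and the first condition are
assumptions; in the file this sits inside the OpenAIClient impl, and the
gpt-oss/gpt-4 match is inferred from the 128000 comment):

    // Context-window heuristic mirroring the restored logic.
    fn context_limit_for(model: &str) -> usize {
        if model.contains("gpt-oss") || model.contains("gpt-4") {
            128000 // Cerebras gpt-oss models and GPT-4 variants
        } else if model.contains("gpt-3.5") {
            16385
        } else if model.starts_with("http://localhost:808") || model == "local" {
            768 // Local llama.cpp server context limit
        } else {
            32768 // Conservative default for modern models
        }
    }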
@@ -379,7 +378,7 @@ impl LLMProvider for OpenAIClient {
             128000 // Cerebras gpt-oss models and GPT-4 variants
         } else if model.contains("gpt-3.5") {
             16385
-        } else if model == "local" || model.is_empty() {
+        } else if model.starts_with("http://localhost:808") || model == "local" {
             768 // Local llama.cpp server context limit
         } else {
             32768 // Default conservative limit for modern models
@@ -413,8 +412,7 @@ impl LLMProvider for OpenAIClient {
         let mut request_body = serde_json::json!({
             "model": model,
             "messages": messages,
-            "stream": true,
-            "max_tokens": 16384
+            "stream": true
         });

         // Add tools to the request if provided
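
The restored request body simply turns streaming on and leaves max_tokens to
the server default rather than pinning it to 16384. A sketch of the
construction, with a hypothetical build_request signature (the real code
assembles this inline and then appends tools per the comment above):

    use serde_json::{json, Value};

    // Build the chat-completions payload; tools are appended only when given.
    fn build_request(model: &str, messages: Value, tools: Option<Value>) -> Value {
        let mut request_body = json!({
            "model": model,
            "messages": messages,
            "stream": true
        });
        if let Some(tools) = tools {
            request_body["tools"] = tools; // inserts the key on the JSON object
        }
        request_body
    }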
@@ -453,43 +451,13 @@ impl LLMProvider for OpenAIClient {
         // Accumulate tool calls here because OpenAI streams them in fragments
         let mut active_tool_calls: Vec<serde_json::Value> = Vec::new();

-        // Add timeout to stream reads - if Kimi/Nvidia stops responding, fail gracefully
-        const STREAM_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
-        loop {
-            let chunk_opt = match tokio::time::timeout(
-                STREAM_TIMEOUT,
-                stream.next(),
-            ).await {
-                Ok(opt) => opt,
-                Err(_) => {
-                    // Timeout - LLM stopped sending data
-                    log::warn!("[LLM] Stream timed out after {}s for model {}",
-                        STREAM_TIMEOUT.as_secs(), model);
-                    let _ = tx.send(format!("[ERROR] LLM response timed out after {} seconds.",
-                        STREAM_TIMEOUT.as_secs())).await;
-                    break;
-                }
-            };
-            match chunk_opt {
-                Some(Ok(chunk)) => {
+        while let Some(chunk_result) = stream.next().await {
+            let chunk = chunk_result?;
             let chunk_str = String::from_utf8_lossy(&chunk);
             for line in chunk_str.lines() {
                 if line.starts_with("data: ") && !line.contains("[DONE]") {
                     if let Ok(data) = serde_json::from_str::<Value>(&line[6..]) {
-                        // Kimi K2.5 and other reasoning models send thinking in "reasoning" field
-                        // Only process "content" (actual response), ignore "reasoning" (thinking)
-                        let content = data["choices"][0]["delta"]["content"].as_str();
-                        let reasoning = data["choices"][0]["delta"]["reasoning"].as_str();
-                        // Log first chunk to help debug reasoning models
-                        if reasoning.is_some() && content.is_none() {
-                            trace!("[LLM] Kimi reasoning chunk (no content yet): {} chars",
-                                reasoning.unwrap_or("").len());
-                        }
-                        if let Some(content) = content {
+                        if let Some(content) = data["choices"][0]["delta"]["content"].as_str() {
                             let processed = handler.process_content(content);
                             if !processed.is_empty() {
                                 let _ = tx.send(processed).await;
@@ -537,16 +505,6 @@ impl LLMProvider for OpenAIClient {
                             }
                         }
                     }
-                Some(Err(e)) => {
-                    log::error!("[LLM] Stream error: {}", e);
-                    break;
-                }
-                None => {
-                    // Stream ended
-                    break;
-                }
-            }
-        }

         // Send accumulated tool calls when stream finishes
         for tool_call in active_tool_calls {
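
Net effect of these two hunks: the 60-second tokio::time::timeout wrapper,
the explicit Some(Err)/None match arms, and the Kimi "reasoning"-field
handling are all dropped in favor of the stable while-let read loop. The
per-chunk SSE parsing that loop performs can be sketched as a pure function
(extract_delta_content is a hypothetical name; the real loop also
accumulates tool-call fragments and forwards text through the mpsc sender):

    use serde_json::Value;

    // Pull the delta text out of one raw SSE chunk from the completions stream.
    fn extract_delta_content(chunk: &[u8]) -> Vec<String> {
        let chunk_str = String::from_utf8_lossy(chunk);
        let mut out = Vec::new();
        for line in chunk_str.lines() {
            // Each event line looks like: data: {"choices":[{"delta":{"content":"..."}}]}
            if line.starts_with("data: ") && !line.contains("[DONE]") {
                if let Ok(data) = serde_json::from_str::<Value>(&line[6..]) {
                    if let Some(content) = data["choices"][0]["delta"]["content"].as_str() {
                        out.push(content.to_string());
                    }
                }
            }
        }
        out
    }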
@@ -927,10 +885,10 @@ mod tests {
     fn test_openai_client_new_custom_url() {
         let client = OpenAIClient::new(
             "test_key".to_string(),
-            Some("".to_string()),
+            Some("http://localhost:9000".to_string()),
             None,
         );
-        assert_eq!(client.base_url, "");
+        assert_eq!(client.base_url, "http://localhost:9000");
     }

     #[test]