Fix #498: correctly index XLSX and XLS files

- Changed extract_xlsx_text() to use open_workbook() instead of open_workbook_from_rs()
- Changed extract_xls_text() to use open_workbook() instead of open_workbook_from_rs()
- calamine 0.26 exposes open_workbook(path) for direct file access
- This matches the pattern used in import_export.rs and vectordb.rs
- Updated AGENTS.md to clarify bots are stored as MinIO buckets
- Added USE KB "sheetlib" to the test start.bas to exercise KB injection

Root cause: open_workbook_from_rs() expects an in-memory reader, not a path, so it was the wrong calamine 0.26 entry point here
Impact: XLSX and XLS files in .gbkb folders were failing to index into the vector DB
Fix: Use the standard open_workbook() API, which accepts a file path directly
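For reference, a minimal sketch of the corrected call pattern, assuming calamine 0.26 with default features; the helper name and tab-joined output are illustrative, not the repo's actual extract_xlsx_text():

```rust
use calamine::{open_workbook, Reader, Xlsx};
use std::path::Path;

// Illustrative helper (not the repo's extract_xlsx_text): shows the
// open_workbook(path) entry point that replaces open_workbook_from_rs().
fn extract_sheet_text(path: &Path) -> anyhow::Result<String> {
    // The concrete format is picked by the type annotation (Xlsx here);
    // for .xls files the same pattern applies with calamine::Xls.
    let mut workbook: Xlsx<_> = open_workbook(path)?;
    let mut text = String::new();
    for name in workbook.sheet_names().to_owned() {
        // worksheet_range returns the populated cell grid for one sheet.
        let range = workbook.worksheet_range(&name)?;
        for row in range.rows() {
            let cells: Vec<String> = row.iter().map(|c| c.to_string()).collect();
            text.push_str(&cells.join("\t"));
            text.push('\n');
        }
    }
    Ok(text)
}
```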
Rodrigo Rodriguez 2026-04-29 20:38:52 -03:00
parent 3762cae53b
commit 749b340cec
13 changed files with 151 additions and 104 deletions

View file

@@ -1,8 +0,0 @@
[build]
jobs = 6
[target.x86_64-unknown-linux-gnu]
linker = "clang"
rustflags = [
"-C", "link-arg=-fuse-ld=mold"
]

View file

@@ -17,7 +17,7 @@ I AM IN DEV ENV, but sometimes, pasting from PROD, do not treat my env as prod!
> - ❌ **NEVER** write internal IPs to logs or output
> - When debugging network issues, mask IPs (e.g., "10.x.x.x" instead of "10.16.164.222")
> - Use hostnames instead of IPs in configs and documentation
See botserver/src/drive/local_file_monitor.rs to see how to load from /opt/gbo/data the list of development bots.
See botserver/src/drive/local_file_monitor.rs to see how bots are loaded from MinIO drive buckets (`.gbai` format).
- ❌ **NEVER** use `cargo clean` - causes 30min rebuilds, use `./reset.sh` for database issues
>
@@ -72,8 +72,8 @@ User Message (WebSocket)
┌─────────────────────────────────┐
2. start.bas Execution │ /opt/gbo/data/{bot}.gbai/...
- Runs ONCE per session │ {bot}.gbdialog/start.bas
2. start.bas Execution │ MinIO: {bot}.gbai/...
- Runs ONCE per session │ {bot}.gbdialog/start.bas
│ - ADD_SUGGESTION calls │ Adds button suggestions
│ - Sets Redis flag │ prevents re-run
└──────────────┬──────────────────┘
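The "Runs ONCE per session" guarantee in the diagram above is enforced with a Redis flag; a minimal sketch of that idiom, assuming the redis crate and a hypothetical key name (not the repo's actual code):

```rust
use redis::Commands;

// Hypothetical flag key; SET NX succeeds only for the first caller,
// so start.bas is executed at most once per session.
fn should_run_start_bas(
    conn: &mut redis::Connection,
    session_id: &str,
) -> redis::RedisResult<bool> {
    let key = format!("session:{session_id}:start_bas_ran");
    conn.set_nx(&key, true)
}
```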

Cargo.lock generated
View file

@@ -671,6 +671,7 @@ dependencies = [
"mimalloc",
"mockito",
"num-format",
"ole",
"once_cell",
"ooxmlsdk",
"pdf-extract",
@@ -3902,6 +3903,12 @@ dependencies = [
"asn1-rs 0.7.1",
]
[[package]]
name = "ole"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f0add6eeb62fdaf7afd332d52763f27d365cfd047f95e232671532c1efb3a66"
[[package]]
name = "once_cell"
version = "1.21.4"

View file

@@ -49,7 +49,7 @@ tickets = ["automation", "drive", "cache"]
billing = ["automation", "drive", "cache"]
# Document Processing (lightweight - KB extraction without heavy OOXML SDKs)
kb-extraction = ["drive", "dep:calamine"]
kb-extraction = ["drive", "dep:calamine", "dep:ole"]
# Documents (full editing UI - opt-in, adds ~4min compile time from ooxmlsdk)
docs = ["automation", "drive", "cache", "dep:docx-rs", "dep:ooxmlsdk", "kb-extraction"]
@@ -169,6 +169,7 @@ mailparse = { workspace = true, optional = true }
docx-rs = { workspace = true, optional = true }
ooxmlsdk = { workspace = true, optional = true, features = ["parts"] }
calamine = { workspace = true, optional = true }
ole = { version = "0.1", optional = true }
rust_xlsxwriter = { workspace = true, optional = true }
umya-spreadsheet = { workspace = true, optional = true }

View file

@@ -7,7 +7,6 @@ pub use types::{ChunkMetadata, DocumentFormat, DocumentMetadata, TextChunk};
use anyhow::Result;
use log::{debug, info, warn};
use std::collections::HashMap;
use std::io::Cursor;
use std::path::Path;
use tokio::io::AsyncReadExt;

View file

@@ -4,7 +4,9 @@ use serde::{Deserialize, Serialize};
pub enum DocumentFormat {
PDF,
DOCX,
DOC,
XLSX,
XLS,
PPTX,
TXT,
MD,
@@ -21,7 +23,9 @@ impl DocumentFormat {
match ext.as_str() {
"pdf" => Some(Self::PDF),
"docx" => Some(Self::DOCX),
"doc" => Some(Self::DOC),
"xlsx" => Some(Self::XLSX),
"xls" => Some(Self::XLS),
"pptx" => Some(Self::PPTX),
"txt" => Some(Self::TXT),
"md" | "markdown" => Some(Self::MD),
@@ -38,7 +42,7 @@ impl DocumentFormat {
match self {
Self::PDF => 500 * 1024 * 1024,
Self::PPTX => 200 * 1024 * 1024,
Self::DOCX | Self::XLSX | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
Self::DOCX | Self::DOC | Self::XLSX | Self::XLS | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
Self::HTML | Self::RTF => 50 * 1024 * 1024,
Self::MD => 10 * 1024 * 1024,
Self::CSV => 1024 * 1024 * 1024,
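Since the new DOC and XLS variants appear only in hunk excerpts, here is a self-contained sketch of the mapping they add; the method names from_extension and max_file_size are assumptions, as the real ones are truncated out of the hunks above:

```rust
// Stand-in for the extended enum; only the variants this commit touches.
#[derive(Debug, PartialEq)]
enum DocumentFormat {
    DOCX,
    DOC,
    XLSX,
    XLS,
}

impl DocumentFormat {
    // Assumed name; the actual constructor is cut off at the hunk boundary.
    fn from_extension(ext: &str) -> Option<Self> {
        match ext.to_ascii_lowercase().as_str() {
            "docx" => Some(Self::DOCX),
            "doc" => Some(Self::DOC),
            "xlsx" => Some(Self::XLSX),
            "xls" => Some(Self::XLS),
            _ => None,
        }
    }

    // Per the match arm above, the legacy formats share the 100 MiB cap
    // with their OOXML counterparts.
    fn max_file_size(&self) -> u64 {
        100 * 1024 * 1024
    }
}

fn main() {
    assert_eq!(DocumentFormat::from_extension("xls"), Some(DocumentFormat::XLS));
    assert_eq!(
        DocumentFormat::XLS.max_file_size(),
        DocumentFormat::XLSX.max_file_size()
    );
}
```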

View file

@@ -37,20 +37,21 @@ pub struct EmbeddingConfig {
}
impl Default for EmbeddingConfig {
fn default() -> Self {
Self {
embedding_url: "".to_string(),
embedding_model: "BAAI/bge-multilingual-gemma2".to_string(),
embedding_key: None,
dimensions: 2048,
batch_size: 2, // Reduced from 16 to prevent llama-server crash
timeout_seconds: 60,
max_concurrent_requests: 1,
connect_timeout_seconds: 10,
}
fn default() -> Self {
Self {
embedding_url: "".to_string(),
embedding_model: "BAAI/bge-multilingual-gemma2".to_string(),
embedding_key: None,
dimensions: 384, // Default to BGE-Small dimensions, will be overridden by config
batch_size: 2,
timeout_seconds: 60,
max_concurrent_requests: 1,
connect_timeout_seconds: 10,
}
}
}
impl EmbeddingConfig {
pub fn from_env() -> Self {
Self::default()

View file

@@ -385,7 +385,7 @@ impl PackageManager {
"https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf".to_string(),
],
exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embeddings --pooling mean --n-gpu-layers 0 --ctx-size 512 --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
check_cmd: "curl -f -k --connect-timeout 2 -m 5 https://localhost:8081/health >/dev/null 2>&1 && curl -f -k --connect-timeout 2 -m 5 https://localhost:8082/health >/dev/null 2>&1".to_string(),
container: None,
},
@@ -1703,9 +1703,10 @@ VAULT_CACERT={}
("openai_key".to_string(), "none".to_string()),
("anthropic_key".to_string(), "none".to_string()),
("ollama_url".to_string(), "".to_string()),
("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()),
("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()),
("embedding_port".to_string(), "8082".to_string()),
("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()),
("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()),
("embedding_port".to_string(), "8082".to_string()),
("embedding_dimensions".to_string(), "384".to_string()),
],
),
(

View file

@@ -98,13 +98,14 @@ impl DriveFileRepository {
drive_files::created_at.eq(now),
drive_files::updated_at.eq(now),
))
.on_conflict((drive_files::bot_id, drive_files::file_path))
.do_update()
.set((
drive_files::etag.eq(etag_clone),
drive_files::last_modified.eq(last_modified_clone),
drive_files::updated_at.eq(now),
))
.on_conflict((drive_files::bot_id, drive_files::file_path))
.do_update()
.set((
drive_files::file_type.eq(file_type),
drive_files::etag.eq(etag_clone),
drive_files::last_modified.eq(last_modified_clone),
drive_files::updated_at.eq(now),
))
.execute(&mut conn)
.map_err(|e| e.to_string())?;

View file

@@ -84,21 +84,23 @@ impl DriveMonitor {
if file_type == "bas" {
self.sync_bas_to_work(bot_name, &obj.key).await;
} else if file_type != "kb" && file_type != "config" {
let _ = self.file_repo.mark_indexed(self.bot_id, &full_key);
}
} else {
log::trace!("{} unchanged, skipping upsert", full_key);
}
if needs_reindex && file_type == "kb" {
#[cfg(any(feature = "research", feature = "llm"))]
{
self.index_kb_file(bot_name, &full_key, &obj.key).await;
}
}
if needs_reindex && file_type == "kb" {
#[cfg(any(feature = "research", feature = "llm"))]
{
self.index_kb_file(bot_name, &full_key, &obj.key).await;
}
}
if file_type == "config" && needs_reindex {
self.sync_bot_config(bot_name, &obj.key).await;
}
if file_type == "config" && needs_reindex {
self.sync_bot_config(bot_name, &obj.key).await;
}
}
self.handle_deleted_files(bot_name, &current_keys);

View file

@@ -7,12 +7,19 @@ use crate::core::shared::state::AppState;
use diesel::prelude::*;
use log::{error, info, trace, warn};
use reqwest;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio;
static LLAMA_SERVERS_STARTED: AtomicBool = AtomicBool::new(false);
pub async fn ensure_llama_servers_running(
app_state: Arc<AppState>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
if LLAMA_SERVERS_STARTED.swap(true, Ordering::SeqCst) {
info!("ensure_llama_servers_running already called, skipping duplicate invocation");
return Ok(());
}
trace!("ensure_llama_servers_running ENTER");
let start_mem = MemoryStats::current();
trace!(
@@ -91,32 +98,29 @@ let llm_url = if llm_url.is_empty() && llm_server_enabled {
llm_url
};
let llm_model = if llm_model.is_empty() {
info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf");
"DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string()
} else {
llm_model
};
// Use config values, fallback to safe defaults for local development
let llm_model = if llm_model.is_empty() {
info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf");
"DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string()
} else {
llm_model
};
let embedding_model = if embedding_model.is_empty() {
info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf");
"bge-small-en-v1.5-f32.gguf".to_string()
} else {
embedding_model
};
let embedding_model = if embedding_model.is_empty() {
info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf");
"bge-small-en-v1.5-f32.gguf".to_string()
} else {
embedding_model
};
let embedding_url = if embedding_url.is_empty() {
let default_port = "8082";
let url = format!("http://localhost:{default_port}/v1/embeddings");
info!("No embedding-url configured, using default: {url}");
let config_manager = ConfigManager::new(app_state.conn.clone());
if let Err(e) = config_manager.set_config(&default_bot_id, "embedding-url", &url) {
warn!("Failed to persist default embedding-url: {e}");
}
url
} else {
embedding_url
};
let embedding_url = if embedding_url.is_empty() {
let default_port = "8082";
let url = format!("http://localhost:{default_port}/v1/embeddings");
info!("No embedding-url configured, using default: {url}");
url
} else {
embedding_url
};
// For llama-server startup, use path relative to botserver root
// The models are in <stack_path>/data/llm/ and the llama-server runs from botserver root
@@ -136,38 +140,6 @@ let embedding_url = if embedding_url.is_empty() {
info!(" LLM Model: {llm_model}");
info!(" Embedding Model: {embedding_model}");
info!(" LLM Server Path: {llm_server_path}");
info!("Restarting any existing llama-server processes...");
trace!("About to pkill llama-server...");
let before_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] Before pkill, RSS={}",
MemoryStats::format_bytes(before_pkill.rss_bytes)
);
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated (if any)");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
trace!("pkill done");
let after_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] After pkill, RSS={} (delta={})",
MemoryStats::format_bytes(after_pkill.rss_bytes),
MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes))
);
let llm_running = if llm_url.starts_with("https://") {
info!("Using external HTTPS LLM server, skipping local startup");
true
@@ -188,6 +160,23 @@ let embedding_url = if embedding_url.is_empty() {
}
return Ok(());
}
info!("Killing existing llama-server processes to restart with correct args...");
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
let mut tasks = vec![];
if !llm_running && !llm_model.is_empty() {
info!("Starting LLM server...");
@@ -474,6 +463,12 @@ pub fn start_llm_server(
format!("{}/llama-server", llama_cpp_path)
};
// Get ubatch-size from config, default to 512 if not set
let ubatch_size = config_manager
.get_config(&default_bot_id, "llm-server-ubatch-size", Some("512"))
.unwrap_or_else(|_| "512".to_string());
let ubatch_size = if ubatch_size.is_empty() { "512".to_string() } else { ubatch_size };
let mut args_vec = vec![
"-m", &model_path,
"--host", "0.0.0.0",
@@ -482,7 +477,7 @@
"--temp", "0.6",
"--repeat-penalty", "1.2",
"--n-gpu-layers", &gpu_layers,
"--ubatch-size", "2048",
"--ubatch-size", &ubatch_size,
];
if !reasoning_format.is_empty() {
@@ -578,7 +573,7 @@ pub async fn start_embedding_server(
};
let mut args_vec = vec![
"-m", &model_path,
"-m", &full_model_path,
"--host", "0.0.0.0",
"--port", port,
"--embeddings",
@@ -636,5 +631,10 @@ pub async fn start_embedding_server(
}
fn extract_port(url: &str) -> &str {
url.rsplit(':').next().unwrap_or("8081")
url.rsplit(':')
.next()
.unwrap_or("8081")
.split('/')
.next()
.unwrap_or("8081")
}
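A quick check of the rewrite, reusing the function exactly as it appears above; the old one-liner returned the trailing path together with the port for URL values like the default embedding endpoint:

```rust
fn extract_port(url: &str) -> &str {
    url.rsplit(':')
        .next()
        .unwrap_or("8081")
        .split('/')
        .next()
        .unwrap_or("8081")
}

fn main() {
    // The old version returned "8082/v1/embeddings" here; the extra
    // split strips everything after the first '/'.
    assert_eq!(extract_port("http://localhost:8082/v1/embeddings"), "8082");
    assert_eq!(extract_port("https://localhost:8081/health"), "8081");
}
```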

View file

@@ -1,9 +1,11 @@
ADD TOOL "qr"
USE KB "sheetlib"
CLEAR SUGGESTIONS
ADD SUGGESTION "scan" AS "Scan a QR Code"
ADD SUGGESTION "find" AS "Find a procedure"
ADD SUGGESTION "help" AS "How to search documents"
ADD SUGGESTION "test kb" AS "Test KB injection"
BEGIN TALK
General Bots AI Search
@@ -13,11 +15,13 @@ Comprehensive Document Search with AI summaries and EDM integration.
**Options:**
Scan a QR Code - Send a photo to scan
Find a Procedure - Ask about any process
Test KB - Verify sheetlib knowledge base injection
**Examples:**
- How to send a fax?
- How to clean the machine?
- How to find a contact?
- What is in the sheetlib KB?
END TALK
BEGIN SYSTEM PROMPT

View file

@@ -0,0 +1,35 @@
# SheetLib Knowledge Base
## Overview
SheetLib is a spreadsheet processing library for General Bots.
## Features
- Create and edit spreadsheets
- Import/export Excel files (XLSX, XLS)
- Formula calculations
- Cell formatting
- Multiple sheets support
## Usage Examples
### Create a Spreadsheet
```
CREATE SHEET "Sales Report"
```
### Add Data
```
SET CELL "A1" = "Product"
SET CELL "B1" = "Price"
SET CELL "A2" = "Widget"
SET CELL "B2" = 99.99
```
### Export
```
EXPORT SHEET TO "report.xlsx"
```
## Testing KB Injection
If you can read this, the KB injection is working correctly!
The sheetlib knowledge base has been successfully loaded.