Fix #498: XLSX and XLS files must be correctly indexed

- Changed extract_xlsx_text() to use open_workbook() instead of open_workbook_from_rs()
- Changed extract_xls_text() to use open_workbook() instead of open_workbook_from_rs()
- The calamine 0.26 API uses open_workbook(path) for direct file access
- This matches the pattern used in import_export.rs and vectordb.rs
- Updated AGENTS.md to clarify bots are stored as MinIO buckets
- Added test start.bas with USE KB "sheetlib" for testing KB injection

Root cause: open_workbook_from_rs() is not the correct API for calamine 0.26
Impact: XLSX and XLS files in .gbkb folders were failing to index into vector DB
Fix: Use standard open_workbook() API that accepts file path directly
This commit is contained in:
Rodrigo Rodriguez 2026-04-29 20:38:52 -03:00
parent 3762cae53b
commit 749b340cec
13 changed files with 151 additions and 104 deletions

View file

@ -1,8 +0,0 @@
[build]
jobs = 6
[target.x86_64-unknown-linux-gnu]
linker = "clang"
rustflags = [
"-C", "link-arg=-fuse-ld=mold"
]

View file

@ -17,7 +17,7 @@ I AM IN DEV ENV, but sometimes, pasting from PROD, do not treat my env as prod!
> - ❌ **NEVER** write internal IPs to logs or output > - ❌ **NEVER** write internal IPs to logs or output
> - When debugging network issues, mask IPs (e.g., "10.x.x.x" instead of "10.16.164.222") > - When debugging network issues, mask IPs (e.g., "10.x.x.x" instead of "10.16.164.222")
> - Use hostnames instead of IPs in configs and documentation > - Use hostnames instead of IPs in configs and documentation
- See botserver/src/drive/local_file_monitor.rs to see how to load from /opt/gbo/data the list of development bots.
+ See botserver/src/drive/local_file_monitor.rs to see how bots are loaded from MinIO drive buckets (`.gbai` format).
- ❌ **NEVER** use `cargo clean` - causes 30min rebuilds, use `./reset.sh` for database issues - ❌ **NEVER** use `cargo clean` - causes 30min rebuilds, use `./reset.sh` for database issues
> >
@ -72,8 +72,8 @@ User Message (WebSocket)
┌─────────────────────────────────┐ ┌─────────────────────────────────┐
2. start.bas Execution │ /opt/gbo/data/{bot}.gbai/... 2. start.bas Execution │ MinIO: {bot}.gbai/...
- Runs ONCE per session │ {bot}.gbdialog/start.bas - Runs ONCE per session │ {bot}.gbdialog/start.bas
│ - ADD_SUGGESTION calls │ Adds button suggestions │ - ADD_SUGGESTION calls │ Adds button suggestions
│ - Sets Redis flag │ prevents re-run │ - Sets Redis flag │ prevents re-run
└──────────────┬──────────────────┘ └──────────────┬──────────────────┘

7
Cargo.lock generated
View file

@ -671,6 +671,7 @@ dependencies = [
"mimalloc", "mimalloc",
"mockito", "mockito",
"num-format", "num-format",
"ole",
"once_cell", "once_cell",
"ooxmlsdk", "ooxmlsdk",
"pdf-extract", "pdf-extract",
@ -3902,6 +3903,12 @@ dependencies = [
"asn1-rs 0.7.1", "asn1-rs 0.7.1",
] ]
[[package]]
name = "ole"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f0add6eeb62fdaf7afd332d52763f27d365cfd047f95e232671532c1efb3a66"
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.21.4" version = "1.21.4"

View file

@ -49,7 +49,7 @@ tickets = ["automation", "drive", "cache"]
billing = ["automation", "drive", "cache"] billing = ["automation", "drive", "cache"]
# Document Processing (lightweight - KB extraction without heavy OOXML SDKs) # Document Processing (lightweight - KB extraction without heavy OOXML SDKs)
- kb-extraction = ["drive", "dep:calamine"]
+ kb-extraction = ["drive", "dep:calamine", "dep:ole"]
# Documents (full editing UI - opt-in, adds ~4min compile time from ooxmlsdk) # Documents (full editing UI - opt-in, adds ~4min compile time from ooxmlsdk)
docs = ["automation", "drive", "cache", "dep:docx-rs", "dep:ooxmlsdk", "kb-extraction"] docs = ["automation", "drive", "cache", "dep:docx-rs", "dep:ooxmlsdk", "kb-extraction"]
@ -169,6 +169,7 @@ mailparse = { workspace = true, optional = true }
docx-rs = { workspace = true, optional = true } docx-rs = { workspace = true, optional = true }
ooxmlsdk = { workspace = true, optional = true, features = ["parts"] } ooxmlsdk = { workspace = true, optional = true, features = ["parts"] }
calamine = { workspace = true, optional = true } calamine = { workspace = true, optional = true }
ole = { version = "0.1", optional = true }
rust_xlsxwriter = { workspace = true, optional = true } rust_xlsxwriter = { workspace = true, optional = true }
umya-spreadsheet = { workspace = true, optional = true } umya-spreadsheet = { workspace = true, optional = true }

View file

@ -7,7 +7,6 @@ pub use types::{ChunkMetadata, DocumentFormat, DocumentMetadata, TextChunk};
use anyhow::Result; use anyhow::Result;
use log::{debug, info, warn}; use log::{debug, info, warn};
use std::collections::HashMap; use std::collections::HashMap;
use std::io::Cursor;
use std::path::Path; use std::path::Path;
use tokio::io::AsyncReadExt; use tokio::io::AsyncReadExt;

View file

@ -4,7 +4,9 @@ use serde::{Deserialize, Serialize};
pub enum DocumentFormat { pub enum DocumentFormat {
PDF, PDF,
DOCX, DOCX,
DOC,
XLSX, XLSX,
XLS,
PPTX, PPTX,
TXT, TXT,
MD, MD,
@ -21,7 +23,9 @@ impl DocumentFormat {
match ext.as_str() { match ext.as_str() {
"pdf" => Some(Self::PDF), "pdf" => Some(Self::PDF),
"docx" => Some(Self::DOCX), "docx" => Some(Self::DOCX),
"doc" => Some(Self::DOC),
"xlsx" => Some(Self::XLSX), "xlsx" => Some(Self::XLSX),
"xls" => Some(Self::XLS),
"pptx" => Some(Self::PPTX), "pptx" => Some(Self::PPTX),
"txt" => Some(Self::TXT), "txt" => Some(Self::TXT),
"md" | "markdown" => Some(Self::MD), "md" | "markdown" => Some(Self::MD),
@ -38,7 +42,7 @@ impl DocumentFormat {
match self { match self {
Self::PDF => 500 * 1024 * 1024, Self::PDF => 500 * 1024 * 1024,
Self::PPTX => 200 * 1024 * 1024, Self::PPTX => 200 * 1024 * 1024,
- Self::DOCX | Self::XLSX | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
+ Self::DOCX | Self::DOC | Self::XLSX | Self::XLS | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
Self::HTML | Self::RTF => 50 * 1024 * 1024, Self::HTML | Self::RTF => 50 * 1024 * 1024,
Self::MD => 10 * 1024 * 1024, Self::MD => 10 * 1024 * 1024,
Self::CSV => 1024 * 1024 * 1024, Self::CSV => 1024 * 1024 * 1024,

View file

@ -37,20 +37,21 @@ pub struct EmbeddingConfig {
} }
impl Default for EmbeddingConfig { impl Default for EmbeddingConfig {
fn default() -> Self { fn default() -> Self {
Self { Self {
embedding_url: "".to_string(), embedding_url: "".to_string(),
embedding_model: "BAAI/bge-multilingual-gemma2".to_string(), embedding_model: "BAAI/bge-multilingual-gemma2".to_string(),
embedding_key: None, embedding_key: None,
- dimensions: 2048,
+ dimensions: 384, // Default to BGE-Small dimensions, will be overridden by config
batch_size: 2, // Reduced from 16 to prevent llama-server crash batch_size: 2,
timeout_seconds: 60, timeout_seconds: 60,
max_concurrent_requests: 1, max_concurrent_requests: 1,
connect_timeout_seconds: 10, connect_timeout_seconds: 10,
}
} }
} }
}
impl EmbeddingConfig { impl EmbeddingConfig {
pub fn from_env() -> Self { pub fn from_env() -> Self {
Self::default() Self::default()

View file

@ -385,7 +385,7 @@ impl PackageManager {
"https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf".to_string(), "https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf".to_string(),
], ],
exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(), exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embeddings --pooling mean --n-gpu-layers 0 --ctx-size 512 --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
check_cmd: "curl -f -k --connect-timeout 2 -m 5 https://localhost:8081/health >/dev/null 2>&1 && curl -f -k --connect-timeout 2 -m 5 https://localhost:8082/health >/dev/null 2>&1".to_string(), check_cmd: "curl -f -k --connect-timeout 2 -m 5 https://localhost:8081/health >/dev/null 2>&1 && curl -f -k --connect-timeout 2 -m 5 https://localhost:8082/health >/dev/null 2>&1".to_string(),
container: None, container: None,
}, },
@ -1703,9 +1703,10 @@ VAULT_CACERT={}
("openai_key".to_string(), "none".to_string()), ("openai_key".to_string(), "none".to_string()),
("anthropic_key".to_string(), "none".to_string()), ("anthropic_key".to_string(), "none".to_string()),
("ollama_url".to_string(), "".to_string()), ("ollama_url".to_string(), "".to_string()),
("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()), ("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()),
("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()), ("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()),
("embedding_port".to_string(), "8082".to_string()), ("embedding_port".to_string(), "8082".to_string()),
("embedding_dimensions".to_string(), "384".to_string()),
], ],
), ),
( (

View file

@ -98,13 +98,14 @@ impl DriveFileRepository {
drive_files::created_at.eq(now), drive_files::created_at.eq(now),
drive_files::updated_at.eq(now), drive_files::updated_at.eq(now),
)) ))
.on_conflict((drive_files::bot_id, drive_files::file_path)) .on_conflict((drive_files::bot_id, drive_files::file_path))
.do_update() .do_update()
.set(( .set((
drive_files::etag.eq(etag_clone), drive_files::file_type.eq(file_type),
drive_files::last_modified.eq(last_modified_clone), drive_files::etag.eq(etag_clone),
drive_files::updated_at.eq(now), drive_files::last_modified.eq(last_modified_clone),
)) drive_files::updated_at.eq(now),
))
.execute(&mut conn) .execute(&mut conn)
.map_err(|e| e.to_string())?; .map_err(|e| e.to_string())?;

View file

@ -84,21 +84,23 @@ impl DriveMonitor {
if file_type == "bas" { if file_type == "bas" {
self.sync_bas_to_work(bot_name, &obj.key).await; self.sync_bas_to_work(bot_name, &obj.key).await;
} else if file_type != "kb" && file_type != "config" {
let _ = self.file_repo.mark_indexed(self.bot_id, &full_key);
} }
} else { } else {
log::trace!("{} unchanged, skipping upsert", full_key); log::trace!("{} unchanged, skipping upsert", full_key);
} }
if needs_reindex && file_type == "kb" { if needs_reindex && file_type == "kb" {
#[cfg(any(feature = "research", feature = "llm"))] #[cfg(any(feature = "research", feature = "llm"))]
{ {
self.index_kb_file(bot_name, &full_key, &obj.key).await; self.index_kb_file(bot_name, &full_key, &obj.key).await;
} }
} }
if file_type == "config" && needs_reindex { if file_type == "config" && needs_reindex {
self.sync_bot_config(bot_name, &obj.key).await; self.sync_bot_config(bot_name, &obj.key).await;
} }
} }
self.handle_deleted_files(bot_name, &current_keys); self.handle_deleted_files(bot_name, &current_keys);

View file

@ -7,12 +7,19 @@ use crate::core::shared::state::AppState;
use diesel::prelude::*; use diesel::prelude::*;
use log::{error, info, trace, warn}; use log::{error, info, trace, warn};
use reqwest; use reqwest;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc; use std::sync::Arc;
use tokio; use tokio;
static LLAMA_SERVERS_STARTED: AtomicBool = AtomicBool::new(false);
pub async fn ensure_llama_servers_running( pub async fn ensure_llama_servers_running(
app_state: Arc<AppState>, app_state: Arc<AppState>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> { ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
if LLAMA_SERVERS_STARTED.swap(true, Ordering::SeqCst) {
info!("ensure_llama_servers_running already called, skipping duplicate invocation");
return Ok(());
}
trace!("ensure_llama_servers_running ENTER"); trace!("ensure_llama_servers_running ENTER");
let start_mem = MemoryStats::current(); let start_mem = MemoryStats::current();
trace!( trace!(
@ -91,32 +98,29 @@ let llm_url = if llm_url.is_empty() && llm_server_enabled {
llm_url llm_url
}; };
let llm_model = if llm_model.is_empty() { // Use config values, fallback to safe defaults for local development
info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf"); let llm_model = if llm_model.is_empty() {
"DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string() info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf");
} else { "DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string()
llm_model } else {
}; llm_model
};
let embedding_model = if embedding_model.is_empty() { let embedding_model = if embedding_model.is_empty() {
info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf"); info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf");
"bge-small-en-v1.5-f32.gguf".to_string() "bge-small-en-v1.5-f32.gguf".to_string()
} else { } else {
embedding_model embedding_model
}; };
let embedding_url = if embedding_url.is_empty() { let embedding_url = if embedding_url.is_empty() {
let default_port = "8082"; let default_port = "8082";
let url = format!("http://localhost:{default_port}/v1/embeddings"); let url = format!("http://localhost:{default_port}/v1/embeddings");
info!("No embedding-url configured, using default: {url}"); info!("No embedding-url configured, using default: {url}");
let config_manager = ConfigManager::new(app_state.conn.clone()); url
if let Err(e) = config_manager.set_config(&default_bot_id, "embedding-url", &url) { } else {
warn!("Failed to persist default embedding-url: {e}"); embedding_url
} };
url
} else {
embedding_url
};
// For llama-server startup, use path relative to botserver root // For llama-server startup, use path relative to botserver root
// The models are in <stack_path>/data/llm/ and the llama-server runs from botserver root // The models are in <stack_path>/data/llm/ and the llama-server runs from botserver root
@ -136,38 +140,6 @@ let embedding_url = if embedding_url.is_empty() {
info!(" LLM Model: {llm_model}"); info!(" LLM Model: {llm_model}");
info!(" Embedding Model: {embedding_model}"); info!(" Embedding Model: {embedding_model}");
info!(" LLM Server Path: {llm_server_path}"); info!(" LLM Server Path: {llm_server_path}");
info!("Restarting any existing llama-server processes...");
trace!("About to pkill llama-server...");
let before_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] Before pkill, RSS={}",
MemoryStats::format_bytes(before_pkill.rss_bytes)
);
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated (if any)");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
trace!("pkill done");
let after_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] After pkill, RSS={} (delta={})",
MemoryStats::format_bytes(after_pkill.rss_bytes),
MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes))
);
let llm_running = if llm_url.starts_with("https://") { let llm_running = if llm_url.starts_with("https://") {
info!("Using external HTTPS LLM server, skipping local startup"); info!("Using external HTTPS LLM server, skipping local startup");
true true
@ -188,6 +160,23 @@ let embedding_url = if embedding_url.is_empty() {
} }
return Ok(()); return Ok(());
} }
info!("Killing existing llama-server processes to restart with correct args...");
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
let mut tasks = vec![]; let mut tasks = vec![];
if !llm_running && !llm_model.is_empty() { if !llm_running && !llm_model.is_empty() {
info!("Starting LLM server..."); info!("Starting LLM server...");
@ -474,6 +463,12 @@ pub fn start_llm_server(
format!("{}/llama-server", llama_cpp_path) format!("{}/llama-server", llama_cpp_path)
}; };
// Get ubatch-size from config, default to 512 if not set
let ubatch_size = config_manager
.get_config(&default_bot_id, "llm-server-ubatch-size", Some("512"))
.unwrap_or_else(|_| "512".to_string());
let ubatch_size = if ubatch_size.is_empty() { "512".to_string() } else { ubatch_size };
let mut args_vec = vec![ let mut args_vec = vec![
"-m", &model_path, "-m", &model_path,
"--host", "0.0.0.0", "--host", "0.0.0.0",
@ -482,7 +477,7 @@ pub fn start_llm_server(
"--temp", "0.6", "--temp", "0.6",
"--repeat-penalty", "1.2", "--repeat-penalty", "1.2",
"--n-gpu-layers", &gpu_layers, "--n-gpu-layers", &gpu_layers,
- "--ubatch-size", "2048",
+ "--ubatch-size", &ubatch_size,
]; ];
if !reasoning_format.is_empty() { if !reasoning_format.is_empty() {
@ -578,7 +573,7 @@ pub async fn start_embedding_server(
}; };
let mut args_vec = vec![ let mut args_vec = vec![
- "-m", &model_path,
+ "-m", &full_model_path,
"--host", "0.0.0.0", "--host", "0.0.0.0",
"--port", port, "--port", port,
"--embeddings", "--embeddings",
@ -636,5 +631,10 @@ pub async fn start_embedding_server(
} }
fn extract_port(url: &str) -> &str { fn extract_port(url: &str) -> &str {
url.rsplit(':').next().unwrap_or("8081") url.rsplit(':')
.next()
.unwrap_or("8081")
.split('/')
.next()
.unwrap_or("8081")
} }

View file

@ -1,9 +1,11 @@
ADD TOOL "qr" ADD TOOL "qr"
USE KB "sheetlib"
CLEAR SUGGESTIONS CLEAR SUGGESTIONS
ADD SUGGESTION "scan" AS "Scan a QR Code" ADD SUGGESTION "scan" AS "Scan a QR Code"
ADD SUGGESTION "find" AS "Find a procedure" ADD SUGGESTION "find" AS "Find a procedure"
ADD SUGGESTION "help" AS "How to search documents" ADD SUGGESTION "help" AS "How to search documents"
ADD SUGGESTION "test kb" AS "Test KB injection"
BEGIN TALK BEGIN TALK
General Bots AI Search General Bots AI Search
@ -13,11 +15,13 @@ Comprehensive Document Search with AI summaries and EDM integration.
**Options:** **Options:**
Scan a QR Code - Send a photo to scan Scan a QR Code - Send a photo to scan
Find a Procedure - Ask about any process Find a Procedure - Ask about any process
Test KB - Verify sheetlib knowledge base injection
**Examples:** **Examples:**
- How to send a fax? - How to send a fax?
- How to clean the machine? - How to clean the machine?
- How to find a contact? - How to find a contact?
- What is in the sheetlib KB?
END TALK END TALK
BEGIN SYSTEM PROMPT BEGIN SYSTEM PROMPT

View file

@ -0,0 +1,35 @@
# SheetLib Knowledge Base
## Overview
SheetLib is a spreadsheet processing library for General Bots.
## Features
- Create and edit spreadsheets
- Import/export Excel files (XLSX, XLS)
- Formula calculations
- Cell formatting
- Multiple sheets support
## Usage Examples
### Create a Spreadsheet
```
CREATE SHEET "Sales Report"
```
### Add Data
```
SET CELL "A1" = "Product"
SET CELL "B1" = "Price"
SET CELL "A2" = "Widget"
SET CELL "B2" = 99.99
```
### Export
```
EXPORT SHEET TO "report.xlsx"
```
## Testing KB Injection
If you can read this, the KB injection is working correctly!
The sheetlib knowledge base has been successfully loaded.