Fix #498: correctly index XLSX and XLS files

- Changed extract_xlsx_text() to use open_workbook() instead of open_workbook_from_rs()
- Changed extract_xls_text() to use open_workbook() instead of open_workbook_from_rs()
- calamine 0.26 exposes open_workbook(path) for direct file access
- This matches the pattern used in import_export.rs and vectordb.rs
- Updated AGENTS.md to clarify bots are stored as MinIO buckets
- Added USE KB "sheetlib" to the test start.bas to exercise KB injection

Root cause: open_workbook_from_rs() expects an in-memory reader, not a path, so it was the wrong calamine 0.26 entry point here
Impact: XLSX and XLS files in .gbkb folders were failing to index into the vector DB
Fix: Use the standard open_workbook() API, which accepts a file path directly
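For reference, a minimal sketch of the corrected call pattern, assuming calamine 0.26 with default features; the helper name and tab-joined output are illustrative, not the repo's actual extract_xlsx_text():

```rust
use calamine::{open_workbook, Reader, Xlsx};
use std::path::Path;

// Illustrative helper (not the repo's extract_xlsx_text): shows the
// open_workbook(path) entry point that replaces open_workbook_from_rs().
fn extract_sheet_text(path: &Path) -> anyhow::Result<String> {
    // The concrete format is picked by the type annotation (Xlsx here);
    // for .xls files the same pattern applies with calamine::Xls.
    let mut workbook: Xlsx<_> = open_workbook(path)?;
    let mut text = String::new();
    for name in workbook.sheet_names().to_owned() {
        // worksheet_range returns the populated cell grid for one sheet.
        let range = workbook.worksheet_range(&name)?;
        for row in range.rows() {
            let cells: Vec<String> = row.iter().map(|c| c.to_string()).collect();
            text.push_str(&cells.join("\t"));
            text.push('\n');
        }
    }
    Ok(text)
}
```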
Rodrigo Rodriguez 2026-04-29 20:38:52 -03:00
parent 3762cae53b
commit 749b340cec
13 changed files with 151 additions and 104 deletions

View file

@@ -1,8 +0,0 @@
[build]
jobs = 6
[target.x86_64-unknown-linux-gnu]
linker = "clang"
rustflags = [
"-C", "link-arg=-fuse-ld=mold"
]

View file

@@ -17,7 +17,7 @@ I AM IN DEV ENV, but sometimes, pasting from PROD, do not treat my env as prod!
> - ❌ **NEVER** write internal IPs to logs or output
> - When debugging network issues, mask IPs (e.g., "10.x.x.x" instead of "10.16.164.222")
> - Use hostnames instead of IPs in configs and documentation
See botserver/src/drive/local_file_monitor.rs to see how to load from /opt/gbo/data the list of development bots.
See botserver/src/drive/local_file_monitor.rs to see how bots are loaded from MinIO drive buckets (`.gbai` format).
- ❌ **NEVER** use `cargo clean` - causes 30min rebuilds, use `./reset.sh` for database issues
>
@@ -72,8 +72,8 @@ User Message (WebSocket)
┌─────────────────────────────────┐
2. start.bas Execution │ /opt/gbo/data/{bot}.gbai/...
- Runs ONCE per session │ {bot}.gbdialog/start.bas
2. start.bas Execution │ MinIO: {bot}.gbai/...
- Runs ONCE per session │ {bot}.gbdialog/start.bas
│ - ADD_SUGGESTION calls │ Adds button suggestions
│ - Sets Redis flag │ prevents re-run
└──────────────┬──────────────────┘
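The "Runs ONCE per session" guarantee in the diagram above is enforced with a Redis flag; a minimal sketch of that idiom, assuming the redis crate and a hypothetical key name (not the repo's actual code):

```rust
use redis::Commands;

// Hypothetical flag key; SET NX succeeds only for the first caller,
// so start.bas is executed at most once per session.
fn should_run_start_bas(
    conn: &mut redis::Connection,
    session_id: &str,
) -> redis::RedisResult<bool> {
    let key = format!("session:{session_id}:start_bas_ran");
    conn.set_nx(&key, true)
}
```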

Cargo.lock generated
View file

@@ -671,6 +671,7 @@ dependencies = [
"mimalloc",
"mockito",
"num-format",
"ole",
"once_cell",
"ooxmlsdk",
"pdf-extract",
@@ -3902,6 +3903,12 @@ dependencies = [
"asn1-rs 0.7.1",
]
[[package]]
name = "ole"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f0add6eeb62fdaf7afd332d52763f27d365cfd047f95e232671532c1efb3a66"
[[package]]
name = "once_cell"
version = "1.21.4"

View file

@@ -49,7 +49,7 @@ tickets = ["automation", "drive", "cache"]
billing = ["automation", "drive", "cache"]
# Document Processing (lightweight - KB extraction without heavy OOXML SDKs)
kb-extraction = ["drive", "dep:calamine"]
kb-extraction = ["drive", "dep:calamine", "dep:ole"]
# Documents (full editing UI - opt-in, adds ~4min compile time from ooxmlsdk)
docs = ["automation", "drive", "cache", "dep:docx-rs", "dep:ooxmlsdk", "kb-extraction"]
@@ -169,6 +169,7 @@ mailparse = { workspace = true, optional = true }
docx-rs = { workspace = true, optional = true }
ooxmlsdk = { workspace = true, optional = true, features = ["parts"] }
calamine = { workspace = true, optional = true }
ole = { version = "0.1", optional = true }
rust_xlsxwriter = { workspace = true, optional = true }
umya-spreadsheet = { workspace = true, optional = true }

View file

@@ -7,7 +7,6 @@ pub use types::{ChunkMetadata, DocumentFormat, DocumentMetadata, TextChunk};
use anyhow::Result;
use log::{debug, info, warn};
use std::collections::HashMap;
use std::io::Cursor;
use std::path::Path;
use tokio::io::AsyncReadExt;

View file

@@ -4,7 +4,9 @@ use serde::{Deserialize, Serialize};
pub enum DocumentFormat {
PDF,
DOCX,
DOC,
XLSX,
XLS,
PPTX,
TXT,
MD,
@@ -21,7 +23,9 @@ impl DocumentFormat {
match ext.as_str() {
"pdf" => Some(Self::PDF),
"docx" => Some(Self::DOCX),
"doc" => Some(Self::DOC),
"xlsx" => Some(Self::XLSX),
"xls" => Some(Self::XLS),
"pptx" => Some(Self::PPTX),
"txt" => Some(Self::TXT),
"md" | "markdown" => Some(Self::MD),
@@ -38,7 +42,7 @@ impl DocumentFormat {
match self {
Self::PDF => 500 * 1024 * 1024,
Self::PPTX => 200 * 1024 * 1024,
Self::DOCX | Self::XLSX | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
Self::DOCX | Self::DOC | Self::XLSX | Self::XLS | Self::TXT | Self::JSON | Self::XML => 100 * 1024 * 1024,
Self::HTML | Self::RTF => 50 * 1024 * 1024,
Self::MD => 10 * 1024 * 1024,
Self::CSV => 1024 * 1024 * 1024,
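Since the new DOC and XLS variants appear only in hunk excerpts, here is a self-contained sketch of the mapping they add; the method names from_extension and max_file_size are assumptions, as the real ones are truncated out of the hunks above:

```rust
// Stand-in for the extended enum; only the variants this commit touches.
#[derive(Debug, PartialEq)]
enum DocumentFormat {
    DOCX,
    DOC,
    XLSX,
    XLS,
}

impl DocumentFormat {
    // Assumed name; the actual constructor is cut off at the hunk boundary.
    fn from_extension(ext: &str) -> Option<Self> {
        match ext.to_ascii_lowercase().as_str() {
            "docx" => Some(Self::DOCX),
            "doc" => Some(Self::DOC),
            "xlsx" => Some(Self::XLSX),
            "xls" => Some(Self::XLS),
            _ => None,
        }
    }

    // Per the match arm above, the legacy formats share the 100 MiB cap
    // with their OOXML counterparts.
    fn max_file_size(&self) -> u64 {
        100 * 1024 * 1024
    }
}

fn main() {
    assert_eq!(DocumentFormat::from_extension("xls"), Some(DocumentFormat::XLS));
    assert_eq!(
        DocumentFormat::XLS.max_file_size(),
        DocumentFormat::XLSX.max_file_size()
    );
}
```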

View file

@@ -37,20 +37,21 @@ pub struct EmbeddingConfig {
}
impl Default for EmbeddingConfig {
fn default() -> Self {
Self {
embedding_url: "".to_string(),
embedding_model: "BAAI/bge-multilingual-gemma2".to_string(),
embedding_key: None,
dimensions: 2048,
batch_size: 2, // Reduced from 16 to prevent llama-server crash
timeout_seconds: 60,
max_concurrent_requests: 1,
connect_timeout_seconds: 10,
}
fn default() -> Self {
Self {
embedding_url: "".to_string(),
embedding_model: "BAAI/bge-multilingual-gemma2".to_string(),
embedding_key: None,
dimensions: 384, // Default to BGE-Small dimensions, will be overridden by config
batch_size: 2,
timeout_seconds: 60,
max_concurrent_requests: 1,
connect_timeout_seconds: 10,
}
}
}
impl EmbeddingConfig {
pub fn from_env() -> Self {
Self::default()

View file

@@ -385,7 +385,7 @@ impl PackageManager {
"https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf".to_string(),
],
exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embeddings --pooling mean --n-gpu-layers 0 --ctx-size 512 --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
check_cmd: "curl -f -k --connect-timeout 2 -m 5 https://localhost:8081/health >/dev/null 2>&1 && curl -f -k --connect-timeout 2 -m 5 https://localhost:8082/health >/dev/null 2>&1".to_string(),
container: None,
},
@@ -1703,9 +1703,10 @@ VAULT_CACERT={}
("openai_key".to_string(), "none".to_string()),
("anthropic_key".to_string(), "none".to_string()),
("ollama_url".to_string(), "".to_string()),
("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()),
("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()),
("embedding_port".to_string(), "8082".to_string()),
("embedding_url".to_string(), "http://localhost:8082/v1/embeddings".to_string()),
("embedding_model".to_string(), "bge-small-en-v1.5-f32.gguf".to_string()),
("embedding_port".to_string(), "8082".to_string()),
("embedding_dimensions".to_string(), "384".to_string()),
],
),
(

View file

@@ -98,13 +98,14 @@ impl DriveFileRepository {
drive_files::created_at.eq(now),
drive_files::updated_at.eq(now),
))
.on_conflict((drive_files::bot_id, drive_files::file_path))
.do_update()
.set((
drive_files::etag.eq(etag_clone),
drive_files::last_modified.eq(last_modified_clone),
drive_files::updated_at.eq(now),
))
.on_conflict((drive_files::bot_id, drive_files::file_path))
.do_update()
.set((
drive_files::file_type.eq(file_type),
drive_files::etag.eq(etag_clone),
drive_files::last_modified.eq(last_modified_clone),
drive_files::updated_at.eq(now),
))
.execute(&mut conn)
.map_err(|e| e.to_string())?;

View file

@@ -84,21 +84,23 @@ impl DriveMonitor {
if file_type == "bas" {
self.sync_bas_to_work(bot_name, &obj.key).await;
} else if file_type != "kb" && file_type != "config" {
let _ = self.file_repo.mark_indexed(self.bot_id, &full_key);
}
} else {
log::trace!("{} unchanged, skipping upsert", full_key);
}
if needs_reindex && file_type == "kb" {
#[cfg(any(feature = "research", feature = "llm"))]
{
self.index_kb_file(bot_name, &full_key, &obj.key).await;
}
}
if needs_reindex && file_type == "kb" {
#[cfg(any(feature = "research", feature = "llm"))]
{
self.index_kb_file(bot_name, &full_key, &obj.key).await;
}
}
if file_type == "config" && needs_reindex {
self.sync_bot_config(bot_name, &obj.key).await;
}
if file_type == "config" && needs_reindex {
self.sync_bot_config(bot_name, &obj.key).await;
}
}
self.handle_deleted_files(bot_name, &current_keys);

View file

@@ -7,12 +7,19 @@ use crate::core::shared::state::AppState;
use diesel::prelude::*;
use log::{error, info, trace, warn};
use reqwest;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio;
static LLAMA_SERVERS_STARTED: AtomicBool = AtomicBool::new(false);
pub async fn ensure_llama_servers_running(
app_state: Arc<AppState>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
if LLAMA_SERVERS_STARTED.swap(true, Ordering::SeqCst) {
info!("ensure_llama_servers_running already called, skipping duplicate invocation");
return Ok(());
}
trace!("ensure_llama_servers_running ENTER");
let start_mem = MemoryStats::current();
trace!(
@@ -91,32 +98,29 @@ let llm_url = if llm_url.is_empty() && llm_server_enabled {
llm_url
};
let llm_model = if llm_model.is_empty() {
info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf");
"DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string()
} else {
llm_model
};
// Use config values, fallback to safe defaults for local development
let llm_model = if llm_model.is_empty() {
info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf");
"DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string()
} else {
llm_model
};
let embedding_model = if embedding_model.is_empty() {
info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf");
"bge-small-en-v1.5-f32.gguf".to_string()
} else {
embedding_model
};
let embedding_model = if embedding_model.is_empty() {
info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf");
"bge-small-en-v1.5-f32.gguf".to_string()
} else {
embedding_model
};
let embedding_url = if embedding_url.is_empty() {
let default_port = "8082";
let url = format!("http://localhost:{default_port}/v1/embeddings");
info!("No embedding-url configured, using default: {url}");
let config_manager = ConfigManager::new(app_state.conn.clone());
if let Err(e) = config_manager.set_config(&default_bot_id, "embedding-url", &url) {
warn!("Failed to persist default embedding-url: {e}");
}
url
} else {
embedding_url
};
let embedding_url = if embedding_url.is_empty() {
let default_port = "8082";
let url = format!("http://localhost:{default_port}/v1/embeddings");
info!("No embedding-url configured, using default: {url}");
url
} else {
embedding_url
};
// For llama-server startup, use path relative to botserver root
// The models are in <stack_path>/data/llm/ and the llama-server runs from botserver root
@@ -136,38 +140,6 @@ let embedding_url = if embedding_url.is_empty() {
info!(" LLM Model: {llm_model}");
info!(" Embedding Model: {embedding_model}");
info!(" LLM Server Path: {llm_server_path}");
info!("Restarting any existing llama-server processes...");
trace!("About to pkill llama-server...");
let before_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] Before pkill, RSS={}",
MemoryStats::format_bytes(before_pkill.rss_bytes)
);
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated (if any)");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
trace!("pkill done");
let after_pkill = MemoryStats::current();
trace!(
"[LLM_LOCAL] After pkill, RSS={} (delta={})",
MemoryStats::format_bytes(after_pkill.rss_bytes),
MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes))
);
let llm_running = if llm_url.starts_with("https://") {
info!("Using external HTTPS LLM server, skipping local startup");
true
@@ -188,6 +160,23 @@ let embedding_url = if embedding_url.is_empty() {
}
return Ok(());
}
info!("Killing existing llama-server processes to restart with correct args...");
let pkill_result = SafeCommand::new("sh")
.and_then(|c| c.arg("-c"))
.and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true"));
match pkill_result {
Ok(cmd) => {
if let Err(e) = cmd.execute() {
error!("Failed to execute pkill for llama-server: {e}");
} else {
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
info!("Existing llama-server processes terminated");
}
}
Err(e) => error!("Failed to build pkill command: {e}"),
}
let mut tasks = vec![];
if !llm_running && !llm_model.is_empty() {
info!("Starting LLM server...");
@@ -474,6 +463,12 @@ pub fn start_llm_server(
format!("{}/llama-server", llama_cpp_path)
};
// Get ubatch-size from config, default to 512 if not set
let ubatch_size = config_manager
.get_config(&default_bot_id, "llm-server-ubatch-size", Some("512"))
.unwrap_or_else(|_| "512".to_string());
let ubatch_size = if ubatch_size.is_empty() { "512".to_string() } else { ubatch_size };
let mut args_vec = vec![
"-m", &model_path,
"--host", "0.0.0.0",
@@ -482,7 +477,7 @@
"--temp", "0.6",
"--repeat-penalty", "1.2",
"--n-gpu-layers", &gpu_layers,
"--ubatch-size", "2048",
"--ubatch-size", &ubatch_size,
];
if !reasoning_format.is_empty() {
@@ -578,7 +573,7 @@ pub async fn start_embedding_server(
};
let mut args_vec = vec![
"-m", &model_path,
"-m", &full_model_path,
"--host", "0.0.0.0",
"--port", port,
"--embeddings",
@@ -636,5 +631,10 @@ pub async fn start_embedding_server(
}
fn extract_port(url: &str) -> &str {
url.rsplit(':').next().unwrap_or("8081")
url.rsplit(':')
.next()
.unwrap_or("8081")
.split('/')
.next()
.unwrap_or("8081")
}
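A quick check of the rewrite, reusing the function exactly as it appears above; the old one-liner returned the trailing path together with the port for URL values like the default embedding endpoint:

```rust
fn extract_port(url: &str) -> &str {
    url.rsplit(':')
        .next()
        .unwrap_or("8081")
        .split('/')
        .next()
        .unwrap_or("8081")
}

fn main() {
    // The old version returned "8082/v1/embeddings" here; the extra
    // split strips everything after the first '/'.
    assert_eq!(extract_port("http://localhost:8082/v1/embeddings"), "8082");
    assert_eq!(extract_port("https://localhost:8081/health"), "8081");
}
```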

View file

@@ -1,9 +1,11 @@
ADD TOOL "qr"
USE KB "sheetlib"
CLEAR SUGGESTIONS
ADD SUGGESTION "scan" AS "Scan a QR Code"
ADD SUGGESTION "find" AS "Find a procedure"
ADD SUGGESTION "help" AS "How to search documents"
ADD SUGGESTION "test kb" AS "Test KB injection"
BEGIN TALK
General Bots AI Search
@@ -13,11 +15,13 @@ Comprehensive Document Search with AI summaries and EDM integration.
**Options:**
Scan a QR Code - Send a photo to scan
Find a Procedure - Ask about any process
Test KB - Verify sheetlib knowledge base injection
**Examples:**
- How to send a fax?
- How to clean the machine?
- How to find a contact?
- What is in the sheetlib KB?
END TALK
BEGIN SYSTEM PROMPT

View file

@@ -0,0 +1,35 @@
# SheetLib Knowledge Base
## Overview
SheetLib is a spreadsheet processing library for General Bots.
## Features
- Create and edit spreadsheets
- Import/export Excel files (XLSX, XLS)
- Formula calculations
- Cell formatting
- Multiple sheets support
## Usage Examples
### Create a Spreadsheet
```
CREATE SHEET "Sales Report"
```
### Add Data
```
SET CELL "A1" = "Product"
SET CELL "B1" = "Price"
SET CELL "A2" = "Widget"
SET CELL "B2" = 99.99
```
### Export
```
EXPORT SHEET TO "report.xlsx"
```
## Testing KB Injection
If you can read this, the KB injection is working correctly!
The sheetlib knowledge base has been successfully loaded.