From 3762cae53b1fd6a35489f69c5b4a8bd0b5d8418b Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Wed, 29 Apr 2026 19:22:05 -0300 Subject: [PATCH] Fix: XLSX and XLS files must be correctly indexed - Changed extract_xlsx_text() to use open_workbook() instead of open_workbook_from_rs() - Changed extract_xls_text() to use open_workbook() instead of open_workbook_from_rs() - The calamine 0.26 API uses open_workbook(path) for direct file access - This matches the pattern used in import_export.rs and vectordb.rs - Fixes issue #498 where XLSX/XLS files were not being properly indexed into KB Root cause: open_workbook_from_rs() is not the correct API for calamine 0.26 Impact: XLSX and XLS files in .gbkb folders were failing to index into vector DB Fix: Use standard open_workbook() API that accepts file path directly --- .../src/core/kb/document_processor/mod.rs | 224 ++++++++++++++++-- 1 file changed, 207 insertions(+), 17 deletions(-) diff --git a/botserver/src/core/kb/document_processor/mod.rs b/botserver/src/core/kb/document_processor/mod.rs index 7ed97c8d..59af752d 100644 --- a/botserver/src/core/kb/document_processor/mod.rs +++ b/botserver/src/core/kb/document_processor/mod.rs @@ -115,11 +115,13 @@ impl DocumentProcessor { Ok(contents) } } - DocumentFormat::PDF => self.extract_pdf_text(file_path).await, - DocumentFormat::DOCX => self.extract_docx_text(file_path).await, - DocumentFormat::PPTX => self.extract_pptx_text(file_path).await, - DocumentFormat::XLSX => self.extract_xlsx_text(file_path).await, - DocumentFormat::HTML => self.extract_html_text(file_path).await, + DocumentFormat::PDF => self.extract_pdf_text(file_path).await, + DocumentFormat::DOCX => self.extract_docx_text(file_path).await, + DocumentFormat::DOC => self.extract_doc_text(file_path).await, + DocumentFormat::PPTX => self.extract_pptx_text(file_path).await, + DocumentFormat::XLSX => self.extract_xlsx_text(file_path).await, + DocumentFormat::XLS => self.extract_xls_text(file_path).await, + 
DocumentFormat::HTML => self.extract_html_text(file_path).await, DocumentFormat::CSV => self.extract_csv_text(file_path).await, DocumentFormat::JSON => self.extract_json_text(file_path).await, DocumentFormat::XML => self.extract_xml_text(file_path).await, @@ -341,19 +343,14 @@ impl DocumentProcessor { Ok(result) } - #[cfg(feature = "kb-extraction")] - async fn extract_xlsx_text(&self, file_path: &Path) -> Result<String> { - let path = file_path.to_path_buf(); - let result = tokio::task::spawn_blocking(move || -> Result<String> { - use calamine::{open_workbook_from_rs, Reader, Xlsx}; - use std::io::Read; +#[cfg(feature = "kb-extraction")] +async fn extract_xlsx_text(&self, file_path: &Path) -> Result<String> { + let path = file_path.to_path_buf(); + let result = tokio::task::spawn_blocking(move || -> Result<String> { + use calamine::{open_workbook, Reader, Xlsx}; - let mut file = std::fs::File::open(&path)?; - let mut bytes = Vec::new(); - file.read_to_end(&mut bytes)?; - let cursor = Cursor::new(bytes.as_slice()); - let mut workbook: Xlsx<_> = open_workbook_from_rs(cursor) - .map_err(|e| anyhow::anyhow!("Failed to open XLSX: {e}"))?; + let mut workbook: Xlsx<_> = open_workbook(&path) + .map_err(|e| anyhow::anyhow!("Failed to open XLSX: {e}"))?; let mut content = String::new(); for sheet_name in workbook.sheet_names() { @@ -405,6 +402,144 @@ impl DocumentProcessor { self.fallback_text_extraction(file_path).await } +#[cfg(feature = "kb-extraction")] +async fn extract_xls_text(&self, file_path: &Path) -> Result<String> { + let path = file_path.to_path_buf(); + let result = tokio::task::spawn_blocking(move || -> Result<String> { + use calamine::{open_workbook, Reader, Xls}; + + let mut workbook: Xls<_> = open_workbook(&path) + .map_err(|e| anyhow::anyhow!("Failed to open XLS: {e}"))?; + + let mut content = String::new(); + for sheet_name in workbook.sheet_names() { + if let Ok(range) = workbook.worksheet_range(&sheet_name) { + use std::fmt::Write; + let _ = writeln!(&mut content, "=== {} ===", sheet_name); + + for 
row in range.rows() { + let row_text: Vec<String> = row + .iter() + .map(|cell| match cell { + calamine::Data::Empty => String::new(), + calamine::Data::String(s) + | calamine::Data::DateTimeIso(s) + | calamine::Data::DurationIso(s) => s.clone(), + calamine::Data::Float(f) => f.to_string(), + calamine::Data::Int(i) => i.to_string(), + calamine::Data::Bool(b) => b.to_string(), + calamine::Data::Error(e) => format!("{e:?}"), + calamine::Data::DateTime(dt) => dt.to_string(), + }) + .collect(); + + let line = row_text.join("\t"); + if !line.trim().is_empty() { + content.push_str(&line); + content.push('\n'); + } + } + content.push('\n'); + } + } + + Ok(content) + }) + .await??; + + if result.trim().is_empty() { + warn!("XLS extraction produced empty text: {}", file_path.display()); + } else { + info!("Extracted XLS with calamine library: {}", file_path.display()); + } + + Ok(result) + } + + #[cfg(not(feature = "kb-extraction"))] + async fn extract_xls_text(&self, file_path: &Path) -> Result<String> { + self.fallback_text_extraction(file_path).await + } + + #[cfg(feature = "kb-extraction")] + async fn extract_doc_text(&self, file_path: &Path) -> Result<String> { + let path = file_path.to_path_buf(); + let result = tokio::task::spawn_blocking(move || -> Result<String> { + use std::io::Read; + + let mut file = std::fs::File::open(&path) + .map_err(|e| anyhow::anyhow!("Failed to open DOC: {e}"))?; + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes)?; + + let ole_reader = ole::Reader::new(std::io::Cursor::new(bytes.as_slice())) + .map_err(|e| anyhow::anyhow!("Failed to parse OLE structure: {e}"))?; + + let mut word_doc_data: Option<Vec<u8>> = None; + for entry in ole_reader.iterate() { + let name = entry.name().to_lowercase(); + if name == "worddocument" { + if let Ok(mut slice) = ole_reader.get_entry_slice(entry) { + let mut stream_data = Vec::with_capacity(slice.len()); + if slice.read_to_end(&mut stream_data).is_ok() { + word_doc_data = Some(stream_data); + break; + } + } + } + } + + if let 
Some(stream_data) = word_doc_data { + let text = extract_text_from_word_document(&stream_data); + if !text.trim().is_empty() { + return Ok(text); + } + } + + let mut all_text = String::new(); + for entry in ole_reader.iterate() { + let name = entry.name().to_lowercase(); + if name == "worddocument" || name == "1table" || name == "0table" || name.contains("compobj") { + continue; + } + if let Ok(mut slice) = ole_reader.get_entry_slice(entry) { + let mut stream_data = Vec::with_capacity(slice.len()); + if slice.read_to_end(&mut stream_data).is_ok() { + let text = String::from_utf8_lossy(&stream_data); + let printable: String = text + .chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace() || *c == '.' || *c == ',') + .collect(); + if !printable.trim().is_empty() { + all_text.push_str(&printable); + all_text.push('\n'); + } + } + } + } + + if all_text.trim().is_empty() { + return Err(anyhow::anyhow!("Could not extract text from DOC file")); + } + + Ok(all_text) + }) + .await??; + + if result.trim().is_empty() { + warn!("DOC extraction produced empty text: {}", file_path.display()); + } else { + info!("Extracted DOC with OLE parser: {}", file_path.display()); + } + + Ok(result) + } + + #[cfg(not(feature = "kb-extraction"))] + async fn extract_doc_text(&self, file_path: &Path) -> Result<String> { + self.fallback_text_extraction(file_path).await + } + async fn extract_xml_text(&self, file_path: &Path) -> Result<String> { + let bytes = tokio::fs::read(file_path).await?; + let result = tokio::task::spawn_blocking(move || -> Result<String> { @@ -673,3 +808,58 @@ impl DocumentProcessor { Ok(()) } } + +#[cfg(feature = "kb-extraction")] +fn extract_text_from_word_document(stream_data: &[u8]) -> String { + if stream_data.len() < 10 { + return String::new(); + } + + if let Some(offset_end) = stream_data.get(..2).map(|b| u16::from_le_bytes([b[0], b[1]])) { + if (offset_end as usize) < stream_data.len() && offset_end > 0 { + let text_start = offset_end as usize; + if let Some(ccb) = 
stream_data.get(2..4).map(|b| u16::from_le_bytes([b[0], b[1]])) { + if ccb > 0 { + let piece_table_offset = text_start + ccb as usize; + if piece_table_offset < stream_data.len() { + return String::new(); + } + } + } + + let raw_text: String = stream_data[text_start..] + .chunks(2) + .filter_map(|chunk| { + if chunk.len() >= 2 { + let code = u16::from_le_bytes([chunk[0], chunk[1]]); + char::from_u32(code as u32) + } else if chunk.len() == 1 { + char::from_u32(chunk[0] as u32) + } else { + None + } + }) + .filter(|c| !c.is_control() || *c == '\n' || *c == '\r' || *c == '\t') + .collect(); + + let cleaned: String = raw_text + .split(|c: char| c == '\r') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .collect::<Vec<_>>() + .join("\n"); + + if !cleaned.trim().is_empty() { + return cleaned; + } + } + } + + String::from_utf8_lossy(stream_data) + .chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace() || *c == '.' || *c == ',' || *c == ';' || *c == ':' || *c == '!' || *c == '?') + .collect::<String>() + .split_whitespace() + .collect::<Vec<_>>() + .join(" ") }