Fix: XLSX and XLS files must be correctly indexed
- Changed extract_xlsx_text() to use open_workbook() instead of open_workbook_from_rs() - Changed extract_xls_text() to use open_workbook() instead of open_workbook_from_rs() - The calamine 0.26 API uses open_workbook(path) for direct file access - This matches the pattern used in import_export.rs and vectordb.rs - Fixes issue #498 where XLSX/XLS files were not being properly indexed into KB Root cause: open_workbook_from_rs() is not the correct API for calamine 0.26 Impact: XLSX and XLS files in .gbkb folders were failing to index into vector DB Fix: Use standard open_workbook() API that accepts file path directly
This commit is contained in:
parent
4e19d786ac
commit
3762cae53b
1 changed files with 207 additions and 17 deletions
|
|
@ -115,11 +115,13 @@ impl DocumentProcessor {
|
||||||
Ok(contents)
|
Ok(contents)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DocumentFormat::PDF => self.extract_pdf_text(file_path).await,
|
DocumentFormat::PDF => self.extract_pdf_text(file_path).await,
|
||||||
DocumentFormat::DOCX => self.extract_docx_text(file_path).await,
|
DocumentFormat::DOCX => self.extract_docx_text(file_path).await,
|
||||||
DocumentFormat::PPTX => self.extract_pptx_text(file_path).await,
|
DocumentFormat::DOC => self.extract_doc_text(file_path).await,
|
||||||
DocumentFormat::XLSX => self.extract_xlsx_text(file_path).await,
|
DocumentFormat::PPTX => self.extract_pptx_text(file_path).await,
|
||||||
DocumentFormat::HTML => self.extract_html_text(file_path).await,
|
DocumentFormat::XLSX => self.extract_xlsx_text(file_path).await,
|
||||||
|
DocumentFormat::XLS => self.extract_xls_text(file_path).await,
|
||||||
|
DocumentFormat::HTML => self.extract_html_text(file_path).await,
|
||||||
DocumentFormat::CSV => self.extract_csv_text(file_path).await,
|
DocumentFormat::CSV => self.extract_csv_text(file_path).await,
|
||||||
DocumentFormat::JSON => self.extract_json_text(file_path).await,
|
DocumentFormat::JSON => self.extract_json_text(file_path).await,
|
||||||
DocumentFormat::XML => self.extract_xml_text(file_path).await,
|
DocumentFormat::XML => self.extract_xml_text(file_path).await,
|
||||||
|
|
@ -341,19 +343,14 @@ impl DocumentProcessor {
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "kb-extraction")]
|
#[cfg(feature = "kb-extraction")]
|
||||||
async fn extract_xlsx_text(&self, file_path: &Path) -> Result<String> {
|
async fn extract_xlsx_text(&self, file_path: &Path) -> Result<String> {
|
||||||
let path = file_path.to_path_buf();
|
let path = file_path.to_path_buf();
|
||||||
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
use calamine::{open_workbook_from_rs, Reader, Xlsx};
|
use calamine::{open_workbook, Reader, Xlsx};
|
||||||
use std::io::Read;
|
|
||||||
|
|
||||||
let mut file = std::fs::File::open(&path)?;
|
let mut workbook: Xlsx<_> = open_workbook(&path)
|
||||||
let mut bytes = Vec::new();
|
.map_err(|e| anyhow::anyhow!("Failed to open XLSX: {e}"))?;
|
||||||
file.read_to_end(&mut bytes)?;
|
|
||||||
let cursor = Cursor::new(bytes.as_slice());
|
|
||||||
let mut workbook: Xlsx<_> = open_workbook_from_rs(cursor)
|
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to open XLSX: {e}"))?;
|
|
||||||
|
|
||||||
let mut content = String::new();
|
let mut content = String::new();
|
||||||
for sheet_name in workbook.sheet_names() {
|
for sheet_name in workbook.sheet_names() {
|
||||||
|
|
@ -405,6 +402,144 @@ impl DocumentProcessor {
|
||||||
self.fallback_text_extraction(file_path).await
|
self.fallback_text_extraction(file_path).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "kb-extraction")]
|
||||||
|
async fn extract_xls_text(&self, file_path: &Path) -> Result<String> {
|
||||||
|
let path = file_path.to_path_buf();
|
||||||
|
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
|
use calamine::{open_workbook, Reader, Xls};
|
||||||
|
|
||||||
|
let mut workbook: Xls<_> = open_workbook(&path)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to open XLS: {e}"))?;
|
||||||
|
|
||||||
|
let mut content = String::new();
|
||||||
|
for sheet_name in workbook.sheet_names() {
|
||||||
|
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
|
||||||
|
use std::fmt::Write;
|
||||||
|
let _ = writeln!(&mut content, "=== {} ===", sheet_name);
|
||||||
|
|
||||||
|
for row in range.rows() {
|
||||||
|
let row_text: Vec<String> = row
|
||||||
|
.iter()
|
||||||
|
.map(|cell| match cell {
|
||||||
|
calamine::Data::Empty => String::new(),
|
||||||
|
calamine::Data::String(s)
|
||||||
|
| calamine::Data::DateTimeIso(s)
|
||||||
|
| calamine::Data::DurationIso(s) => s.clone(),
|
||||||
|
calamine::Data::Float(f) => f.to_string(),
|
||||||
|
calamine::Data::Int(i) => i.to_string(),
|
||||||
|
calamine::Data::Bool(b) => b.to_string(),
|
||||||
|
calamine::Data::Error(e) => format!("{e:?}"),
|
||||||
|
calamine::Data::DateTime(dt) => dt.to_string(),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let line = row_text.join("\t");
|
||||||
|
if !line.trim().is_empty() {
|
||||||
|
content.push_str(&line);
|
||||||
|
content.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
content.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(content)
|
||||||
|
})
|
||||||
|
.await??;
|
||||||
|
|
||||||
|
if result.trim().is_empty() {
|
||||||
|
warn!("XLS extraction produced empty text: {}", file_path.display());
|
||||||
|
} else {
|
||||||
|
info!("Extracted XLS with calamine library: {}", file_path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "kb-extraction"))]
|
||||||
|
async fn extract_xls_text(&self, file_path: &Path) -> Result<String> {
|
||||||
|
self.fallback_text_extraction(file_path).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "kb-extraction")]
|
||||||
|
async fn extract_doc_text(&self, file_path: &Path) -> Result<String> {
|
||||||
|
let path = file_path.to_path_buf();
|
||||||
|
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
|
use std::io::Read;
|
||||||
|
|
||||||
|
let mut file = std::fs::File::open(&path)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to open DOC: {e}"))?;
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
file.read_to_end(&mut bytes)?;
|
||||||
|
|
||||||
|
let ole_reader = ole::Reader::new(std::io::Cursor::new(bytes.as_slice()))
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to parse OLE structure: {e}"))?;
|
||||||
|
|
||||||
|
let mut word_doc_data: Option<Vec<u8>> = None;
|
||||||
|
for entry in ole_reader.iterate() {
|
||||||
|
let name = entry.name().to_lowercase();
|
||||||
|
if name == "worddocument" {
|
||||||
|
if let Ok(mut slice) = ole_reader.get_entry_slice(entry) {
|
||||||
|
let mut stream_data = Vec::with_capacity(slice.len());
|
||||||
|
if slice.read_to_end(&mut stream_data).is_ok() {
|
||||||
|
word_doc_data = Some(stream_data);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(stream_data) = word_doc_data {
|
||||||
|
let text = extract_text_from_word_document(&stream_data);
|
||||||
|
if !text.trim().is_empty() {
|
||||||
|
return Ok(text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut all_text = String::new();
|
||||||
|
for entry in ole_reader.iterate() {
|
||||||
|
let name = entry.name().to_lowercase();
|
||||||
|
if name == "worddocument" || name == "1table" || name == "0table" || name.contains("compobj") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Ok(mut slice) = ole_reader.get_entry_slice(entry) {
|
||||||
|
let mut stream_data = Vec::with_capacity(slice.len());
|
||||||
|
if slice.read_to_end(&mut stream_data).is_ok() {
|
||||||
|
let text = String::from_utf8_lossy(&stream_data);
|
||||||
|
let printable: String = text
|
||||||
|
.chars()
|
||||||
|
.filter(|c| c.is_alphanumeric() || c.is_whitespace() || *c == '.' || *c == ',')
|
||||||
|
.collect();
|
||||||
|
if !printable.trim().is_empty() {
|
||||||
|
all_text.push_str(&printable);
|
||||||
|
all_text.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if all_text.trim().is_empty() {
|
||||||
|
return Err(anyhow::anyhow!("Could not extract text from DOC file"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(all_text)
|
||||||
|
})
|
||||||
|
.await??;
|
||||||
|
|
||||||
|
if result.trim().is_empty() {
|
||||||
|
warn!("DOC extraction produced empty text: {}", file_path.display());
|
||||||
|
} else {
|
||||||
|
info!("Extracted DOC with OLE parser: {}", file_path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "kb-extraction"))]
|
||||||
|
async fn extract_doc_text(&self, file_path: &Path) -> Result<String> {
|
||||||
|
self.fallback_text_extraction(file_path).await
|
||||||
|
}
|
||||||
|
|
||||||
async fn extract_xml_text(&self, file_path: &Path) -> Result<String> {
|
async fn extract_xml_text(&self, file_path: &Path) -> Result<String> {
|
||||||
let bytes = tokio::fs::read(file_path).await?;
|
let bytes = tokio::fs::read(file_path).await?;
|
||||||
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
let result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
|
|
@ -673,3 +808,58 @@ impl DocumentProcessor {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "kb-extraction")]
|
||||||
|
fn extract_text_from_word_document(stream_data: &[u8]) -> String {
|
||||||
|
if stream_data.len() < 10 {
|
||||||
|
return String::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(offset_end) = stream_data.get(..2).map(|b| u16::from_le_bytes([b[0], b[1]])) {
|
||||||
|
if (offset_end as usize) < stream_data.len() && offset_end > 0 {
|
||||||
|
let text_start = offset_end as usize;
|
||||||
|
if let Some(ccb) = stream_data.get(2..4).map(|b| u16::from_le_bytes([b[0], b[1]])) {
|
||||||
|
if ccb > 0 {
|
||||||
|
let piece_table_offset = text_start + ccb as usize;
|
||||||
|
if piece_table_offset < stream_data.len() {
|
||||||
|
return String::new();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let raw_text: String = stream_data[text_start..]
|
||||||
|
.chunks(2)
|
||||||
|
.filter_map(|chunk| {
|
||||||
|
if chunk.len() >= 2 {
|
||||||
|
let code = u16::from_le_bytes([chunk[0], chunk[1]]);
|
||||||
|
char::from_u32(code as u32)
|
||||||
|
} else if chunk.len() == 1 {
|
||||||
|
char::from_u32(chunk[0] as u32)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter(|c| !c.is_control() || *c == '\n' || *c == '\r' || *c == '\t')
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let cleaned: String = raw_text
|
||||||
|
.split(|c: char| c == '\r')
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("\n");
|
||||||
|
|
||||||
|
if !cleaned.trim().is_empty() {
|
||||||
|
return cleaned;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String::from_utf8_lossy(stream_data)
|
||||||
|
.chars()
|
||||||
|
.filter(|c| c.is_alphanumeric() || c.is_whitespace() || *c == '.' || *c == ',' || *c == ';' || *c == ':' || *c == '!' || *c == '?')
|
||||||
|
.collect::<String>()
|
||||||
|
.split_whitespace()
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(" ")
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue