Fix PDF tabular parsing spacing
This commit is contained in:
parent
8069fbab28
commit
058000cd1f
1 changed file with 12 additions and 12 deletions
|
|
@@ -482,20 +482,20 @@ impl DocumentProcessor {
|
|||
}
|
||||
|
||||
fn clean_text(text: &str) -> String {
|
||||
let cleaned = text
|
||||
.lines()
|
||||
.map(|line| line.trim())
|
||||
text.lines()
|
||||
.map(|line| {
|
||||
let cleaned_line: String = line
|
||||
.chars()
|
||||
.filter(|c| !c.is_control() || c.is_whitespace())
|
||||
.collect();
|
||||
cleaned_line
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
})
|
||||
.filter(|line| !line.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
cleaned
|
||||
.chars()
|
||||
.filter(|c| !c.is_control() || c.is_whitespace())
|
||||
.collect::<String>()
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
fn create_chunks(&self, text: &str, file_path: &Path) -> Vec<TextChunk> {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue