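//! Background vector-DB indexing service.
//!
//! A periodic loop walks every active user session, initializes per-user
//! email and drive vector collections, and pushes unindexed emails and files
//! through the embedding generator into Qdrant. Email indexing is gated
//! behind the `mail` feature; vector storage behind `vectordb`.
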
use anyhow::Result;
use chrono::{DateTime, Utc};
use log::{error, info, warn};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio::time::{sleep, Duration};
use uuid::Uuid;

#[cfg(feature = "vectordb")]
use crate::drive::vectordb::{FileContentExtractor, FileDocument, UserDriveVectorDB};
#[cfg(all(feature = "vectordb", feature = "mail"))]
use crate::email::vectordb::{EmailDocument, UserEmailVectorDB};
use crate::shared::utils::DbPool;
use crate::vector_db::embedding::EmbeddingGenerator;

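/// Per-user working area, scoped by bot and user. `get_path()` resolves to
/// `<root>/<bot_id>/<user_id>`, and the `*_vectordb()` helpers derive the
/// matching per-user collection names (`email_…` / `drive_…`).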
#[derive(Debug, Clone)]
struct UserWorkspace {
    root: PathBuf,
    bot_id: Uuid,
    user_id: Uuid,
}

impl UserWorkspace {
    fn new(root: PathBuf, bot_id: Uuid, user_id: Uuid) -> Self {
        Self { root, bot_id, user_id }
    }

    fn get_path(&self) -> PathBuf {
        self.root
            .join(self.bot_id.to_string())
            .join(self.user_id.to_string())
    }

    #[cfg(feature = "mail")]
    fn email_vectordb(&self) -> String {
        format!("email_{}_{}", self.bot_id, self.user_id)
    }

    fn drive_vectordb(&self) -> String {
        format!("drive_{}_{}", self.bot_id, self.user_id)
    }
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexingStatus {
    Idle,
    Running,
    Paused,
    Failed(String),
}

#[derive(Debug, Clone, Default)]
pub struct IndexingStats {
    pub emails_indexed: u64,
    pub files_indexed: u64,
    pub emails_pending: u64,
    pub files_pending: u64,
    pub last_run: Option<DateTime<Utc>>,
    pub errors: u64,
}

struct UserIndexingJob {
    user_id: Uuid,
    bot_id: Uuid,
    workspace: UserWorkspace,
    #[cfg(all(feature = "vectordb", feature = "mail"))]
    email_db: Option<UserEmailVectorDB>,
    #[cfg(feature = "vectordb")]
    drive_db: Option<UserDriveVectorDB>,
    stats: IndexingStats,
    status: IndexingStatus,
}

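/// Long-running background service that embeds each active user's emails
/// and files and indexes them into per-user Qdrant collections.
///
/// A minimal wiring sketch (not a prescribed setup): `db_pool` is an
/// already-built `DbPool`, and both URLs below are placeholders for
/// wherever Qdrant and the embedding endpoint actually run.
///
/// ```ignore
/// let indexer = Arc::new(VectorDBIndexer::new(
///     db_pool,
///     PathBuf::from("/srv/app/work"),      // root for per-user workspaces
///     "http://localhost:6334".to_string(), // Qdrant URL (placeholder)
///     "http://localhost:8080".to_string(), // embedding endpoint (placeholder)
/// ));
/// indexer.clone().start().await?; // spawns the indexing loop
/// // ...later, during shutdown:
/// indexer.stop().await;
/// ```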
pub struct VectorDBIndexer {
    db_pool: DbPool,
    work_root: PathBuf,
    qdrant_url: String,
    embedding_generator: Arc<EmbeddingGenerator>,
    jobs: Arc<RwLock<HashMap<Uuid, UserIndexingJob>>>,
    running: Arc<RwLock<bool>>,
    /// Seconds to sleep between indexing cycles.
    interval_seconds: u64,
    /// Documents processed per batch before yielding.
    batch_size: usize,
}

impl VectorDBIndexer {
    pub fn new(
        db_pool: DbPool,
        work_root: PathBuf,
        qdrant_url: String,
        llm_endpoint: String,
    ) -> Self {
        Self {
            db_pool,
            work_root,
            qdrant_url,
            embedding_generator: Arc::new(EmbeddingGenerator::new(llm_endpoint)),
            jobs: Arc::new(RwLock::new(HashMap::new())),
            running: Arc::new(RwLock::new(false)),
            interval_seconds: 300, // one cycle every five minutes
            batch_size: 10,
        }
    }

pub async fn start(self: Arc<Self>) -> Result<()> {
|
|
let mut running = self.running.write().await;
|
|
if *running {
|
|
warn!("Vector DB indexer already running");
|
|
return Ok(());
|
|
}
|
|
*running = true;
|
|
drop(running);
|
|
|
|
info!(" Starting Vector DB Indexer background service");
|
|
|
|
let indexer = Arc::clone(&self);
|
|
tokio::spawn(async move {
|
|
indexer.run_indexing_loop().await;
|
|
});
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn stop(&self) {
|
|
let mut running = self.running.write().await;
|
|
*running = false;
|
|
info!("🛑 Stopping Vector DB Indexer");
|
|
}
|
|
|
|
async fn run_indexing_loop(self: Arc<Self>) {
|
|
loop {
|
|
{
|
|
let running = self.running.read().await;
|
|
if !*running {
|
|
break;
|
|
}
|
|
}
|
|
|
|
info!(" Running vector DB indexing cycle...");
|
|
|
|
match self.get_active_users().await {
|
|
Ok(users) => {
|
|
info!("Found {} active users to index", users.len());
|
|
|
|
for (user_id, bot_id) in users {
|
|
if let Err(e) = self.index_user_data(user_id, bot_id).await {
|
|
error!("Failed to index user {}: {}", user_id, e);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to get active users: {}", e);
|
|
}
|
|
}
|
|
|
|
info!(" Indexing cycle complete");
|
|
|
|
sleep(Duration::from_secs(self.interval_seconds)).await;
|
|
}
|
|
|
|
info!("Vector DB Indexer stopped");
|
|
}
|
|
|
|
async fn get_active_users(&self) -> Result<Vec<(Uuid, Uuid)>> {
|
|
let pool = self.db_pool.clone();
|
|
|
|
tokio::task::spawn_blocking(move || {
|
|
use crate::shared::models::schema::user_sessions::dsl::*;
|
|
use diesel::prelude::*;
|
|
|
|
let mut db_conn = pool.get()?;
|
|
|
|
let results: Vec<(Uuid, Uuid)> = user_sessions
|
|
.select((user_id, bot_id))
|
|
.distinct()
|
|
.load(&mut db_conn)?;
|
|
|
|
Ok::<_, anyhow::Error>(results)
|
|
})
|
|
.await?
|
|
}
|
|
|
|
async fn index_user_data(&self, user_id: Uuid, bot_id: Uuid) -> Result<()> {
|
|
info!("Indexing user: {} (bot: {})", user_id, bot_id);
|
|
|
|
let mut jobs = self.jobs.write().await;
|
|
let job = jobs.entry(user_id).or_insert_with(|| {
|
|
let workspace = UserWorkspace::new(self.work_root.clone(), &bot_id, &user_id);
|
|
info!("User workspace path: {}", workspace.get_path().display());
|
|
|
|
UserIndexingJob {
|
|
user_id,
|
|
bot_id,
|
|
workspace,
|
|
#[cfg(all(feature = "vectordb", feature = "mail"))]
|
|
email_db: None,
|
|
drive_db: None,
|
|
stats: IndexingStats {
|
|
emails_indexed: 0,
|
|
files_indexed: 0,
|
|
emails_pending: 0,
|
|
files_pending: 0,
|
|
last_run: None,
|
|
errors: 0,
|
|
},
|
|
status: IndexingStatus::Idle,
|
|
}
|
|
});
|
|
|
|
if job.status == IndexingStatus::Running {
|
|
warn!(
|
|
"Job already running for user {} (bot: {})",
|
|
job.user_id, job.bot_id
|
|
);
|
|
return Ok(());
|
|
}
|
|
|
|
job.status = IndexingStatus::Running;
|
|
|
|
#[cfg(all(feature = "vectordb", feature = "mail"))]
|
|
if job.email_db.is_none() {
|
|
let mut email_db =
|
|
UserEmailVectorDB::new(user_id, bot_id, job.workspace.email_vectordb().into());
|
|
if let Err(e) = email_db.initialize(&self.qdrant_url).await {
|
|
warn!(
|
|
"Failed to initialize email vector DB for user {}: {}",
|
|
user_id, e
|
|
);
|
|
} else {
|
|
job.email_db = Some(email_db);
|
|
}
|
|
}
|
|
|
|
if job.drive_db.is_none() {
|
|
let mut drive_db =
|
|
UserDriveVectorDB::new(user_id, bot_id, job.workspace.drive_vectordb().into());
|
|
if let Err(e) = drive_db.initialize(&self.qdrant_url).await {
|
|
warn!(
|
|
"Failed to initialize drive vector DB for user {}: {}",
|
|
user_id, e
|
|
);
|
|
} else {
|
|
job.drive_db = Some(drive_db);
|
|
}
|
|
}
|
|
|
|
drop(jobs);
|
|
|
|
#[cfg(feature = "mail")]
|
|
if let Err(e) = self.index_user_emails(user_id).await {
|
|
error!("Failed to index emails for user {}: {}", user_id, e);
|
|
}
|
|
|
|
if let Err(e) = self.index_user_files(user_id).await {
|
|
error!("Failed to index files for user {}: {}", user_id, e);
|
|
}
|
|
|
|
let mut jobs = self.jobs.write().await;
|
|
if let Some(job) = jobs.get_mut(&user_id) {
|
|
job.status = IndexingStatus::Idle;
|
|
job.stats.last_run = Some(Utc::now());
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(feature = "mail")]
|
|
async fn index_user_emails(&self, user_id: Uuid) -> Result<()> {
|
|
let jobs = self.jobs.read().await;
|
|
let job = jobs
|
|
.get(&user_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Job not found"))?;
|
|
|
|
let Some(email_db) = &job.email_db else {
|
|
warn!("Email vector DB not initialized for user {}", user_id);
|
|
return Ok(());
|
|
};
|
|
|
|
let accounts = self.get_user_email_accounts(user_id).await?;
|
|
|
|
info!(
|
|
"Found {} email accounts for user {}",
|
|
accounts.len(),
|
|
user_id
|
|
);
|
|
|
|
for account_id in accounts {
|
|
match self.get_unindexed_emails(user_id, &account_id).await {
|
|
Ok(emails) => {
|
|
if emails.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
info!(
|
|
"Indexing {} emails for account {}",
|
|
emails.len(),
|
|
account_id
|
|
);
|
|
|
|
for chunk in emails.chunks(self.batch_size) {
|
|
for email in chunk {
|
|
let text = format!(
|
|
"From: {} <{}>\nSubject: {}\n\n{}",
|
|
email.from_name, email.from_email, email.subject, email.body_text
|
|
);
|
|
let text = if text.len() > 8000 {
|
|
&text[..8000]
|
|
} else {
|
|
&text
|
|
};
|
|
|
|
match self.embedding_generator.generate_text_embedding(text).await {
|
|
Ok(embedding) => {
|
|
if let Err(e) = email_db.index_email(&email, embedding).await {
|
|
error!("Failed to index email {}: {}", email.id, e);
|
|
} else {
|
|
info!(" Indexed email: {}", email.subject);
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!(
|
|
"Failed to generate embedding for email {}: {}",
|
|
email.id, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
sleep(Duration::from_millis(100)).await;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!(
|
|
"Failed to get unindexed emails for account {}: {}",
|
|
account_id, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
async fn index_user_files(&self, user_id: Uuid) -> Result<()> {
|
|
let jobs = self.jobs.read().await;
|
|
let job = jobs
|
|
.get(&user_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Job not found"))?;
|
|
|
|
let Some(drive_db) = &job.drive_db else {
|
|
warn!("Drive vector DB not initialized for user {}", user_id);
|
|
return Ok(());
|
|
};
|
|
|
|
match self.get_unindexed_files(user_id).await {
|
|
Ok(files) => {
|
|
if files.is_empty() {
|
|
return Ok(());
|
|
}
|
|
|
|
info!("Indexing {} files for user {}", files.len(), user_id);
|
|
|
|
for chunk in files.chunks(self.batch_size) {
|
|
for file in chunk {
|
|
let mime_type = file.mime_type.as_deref().unwrap_or("");
|
|
if !FileContentExtractor::should_index(&mime_type, file.file_size) {
|
|
continue;
|
|
}
|
|
|
|
let text = format!(
|
|
"File: {}\nType: {}\n\n{}",
|
|
file.file_name, file.file_type, file.content_text
|
|
);
|
|
|
|
match self
|
|
.embedding_generator
|
|
.generate_text_embedding(&text)
|
|
.await
|
|
{
|
|
Ok(embedding) => {
|
|
if let Err(e) = drive_db.index_file(&file, embedding).await {
|
|
error!("Failed to index file {}: {}", file.id, e);
|
|
} else {
|
|
info!(" Indexed file: {}", file.file_name);
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to generate embedding for file {}: {}", file.id, e);
|
|
}
|
|
}
|
|
}
|
|
|
|
sleep(Duration::from_millis(100)).await;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to get unindexed files for user {}: {}", user_id, e);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(feature = "mail")]
|
|
async fn get_user_email_accounts(&self, user_id: Uuid) -> Result<Vec<String>> {
|
|
let pool = self.db_pool.clone();
|
|
|
|
tokio::task::spawn_blocking(move || {
|
|
use diesel::prelude::*;
|
|
|
|
let mut db_conn = pool.get()?;
|
|
|
|
#[derive(diesel::QueryableByName)]
|
|
struct AccountIdRow {
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
id: String,
|
|
}
|
|
|
|
let results: Vec<String> = diesel::sql_query(
|
|
"SELECT id::text FROM user_email_accounts WHERE user_id = $1 AND is_active = true",
|
|
)
|
|
.bind::<diesel::sql_types::Uuid, _>(user_id)
|
|
.load::<AccountIdRow>(&mut db_conn)?
|
|
.into_iter()
|
|
.map(|row| row.id)
|
|
.collect();
|
|
|
|
Ok::<_, anyhow::Error>(results)
|
|
})
|
|
.await?
|
|
}
|
|
|
|
#[cfg(feature = "mail")]
|
|
async fn get_unindexed_emails(
|
|
&self,
|
|
user_id: Uuid,
|
|
account_id: &str,
|
|
) -> Result<Vec<EmailDocument>, Box<dyn std::error::Error + Send + Sync>> {
|
|
let pool = self.db_pool.clone();
|
|
let account_id = account_id.to_string();
|
|
|
|
let results = tokio::task::spawn_blocking(move || {
|
|
use diesel::prelude::*;
|
|
let mut conn = pool.get()?;
|
|
|
|
#[derive(diesel::QueryableByName)]
|
|
struct EmailRow {
|
|
#[diesel(sql_type = diesel::sql_types::Uuid)]
|
|
id: Uuid,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
message_id: String,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
subject: String,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
from_address: String,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
to_addresses: String,
|
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
|
body_text: Option<String>,
|
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
|
body_html: Option<String>,
|
|
#[diesel(sql_type = diesel::sql_types::Timestamptz)]
|
|
received_at: DateTime<Utc>,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
folder: String,
|
|
}
|
|
|
|
let query = r"
|
|
SELECT e.id, e.message_id, e.subject, e.from_address, e.to_addresses,
|
|
e.body_text, e.body_html, e.received_at, e.folder
|
|
FROM emails e
|
|
LEFT JOIN email_index_status eis ON e.id = eis.email_id
|
|
WHERE e.user_id = $1
|
|
AND e.account_id = $2
|
|
AND (eis.indexed_at IS NULL OR eis.needs_reindex = true)
|
|
ORDER BY e.received_at DESC
|
|
LIMIT 100
|
|
";
|
|
|
|
let rows: Vec<EmailRow> = diesel::sql_query(query)
|
|
.bind::<diesel::sql_types::Uuid, _>(user_id)
|
|
.bind::<diesel::sql_types::Text, _>(&account_id)
|
|
.load(&mut conn)
|
|
.unwrap_or_default();
|
|
|
|
let emails: Vec<EmailDocument> = rows
|
|
.into_iter()
|
|
.map(|row| EmailDocument {
|
|
id: row.id.to_string(),
|
|
account_id: account_id.clone(),
|
|
from_email: row.from_address.clone(),
|
|
from_name: row.from_address,
|
|
to_email: row.to_addresses,
|
|
subject: row.subject,
|
|
body_text: row
|
|
.body_html
|
|
.unwrap_or_else(|| row.body_text.unwrap_or_default()),
|
|
date: row.received_at,
|
|
folder: row.folder,
|
|
has_attachments: false,
|
|
thread_id: Some(row.message_id),
|
|
})
|
|
.collect();
|
|
|
|
Ok::<_, anyhow::Error>(emails)
|
|
})
|
|
.await??;
|
|
|
|
Ok(results)
|
|
}
|
|
|
|
    async fn get_unindexed_files(&self, user_id: Uuid) -> Result<Vec<FileDocument>> {
        let pool = self.db_pool.clone();

        tokio::task::spawn_blocking(move || {
            use diesel::prelude::*;
            let mut conn = pool.get()?;

            #[derive(diesel::QueryableByName)]
            struct FileRow {
                #[diesel(sql_type = diesel::sql_types::Uuid)]
                id: Uuid,
                #[diesel(sql_type = diesel::sql_types::Text)]
                file_path: String,
                #[diesel(sql_type = diesel::sql_types::Text)]
                file_name: String,
                #[diesel(sql_type = diesel::sql_types::Text)]
                file_type: String,
                #[diesel(sql_type = diesel::sql_types::BigInt)]
                file_size: i64,
                #[diesel(sql_type = diesel::sql_types::Text)]
                bucket: String,
                #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
                mime_type: Option<String>,
                #[diesel(sql_type = diesel::sql_types::Timestamptz)]
                created_at: DateTime<Utc>,
                #[diesel(sql_type = diesel::sql_types::Timestamptz)]
                modified_at: DateTime<Utc>,
            }

            let query = r"
                SELECT f.id, f.file_path, f.file_name, f.file_type, f.file_size,
                       f.bucket, f.mime_type, f.created_at, f.modified_at
                FROM user_files f
                LEFT JOIN file_index_status fis ON f.id = fis.file_id
                WHERE f.user_id = $1
                  AND (fis.indexed_at IS NULL OR fis.needs_reindex = true)
                ORDER BY f.modified_at DESC
                LIMIT 100
            ";

            // Query failures yield an empty batch rather than aborting the cycle.
            let rows: Vec<FileRow> = diesel::sql_query(query)
                .bind::<diesel::sql_types::Uuid, _>(user_id)
                .load(&mut conn)
                .unwrap_or_default();

            let files: Vec<FileDocument> = rows
                .into_iter()
                .map(|row| FileDocument {
                    id: row.id.to_string(),
                    file_path: row.file_path,
                    file_name: row.file_name,
                    file_type: row.file_type,
                    file_size: row.file_size as u64,
                    bucket: row.bucket,
                    // This query does not fetch content; content_text starts empty.
                    content_text: String::new(),
                    content_summary: None,
                    created_at: row.created_at,
                    modified_at: row.modified_at,
                    indexed_at: Utc::now(),
                    mime_type: row.mime_type,
                    tags: Vec::new(),
                })
                .collect();

            Ok::<_, anyhow::Error>(files)
        })
        .await?
    }

    pub async fn get_user_stats(&self, user_id: Uuid) -> Option<IndexingStats> {
        let jobs = self.jobs.read().await;
        jobs.get(&user_id).map(|job| job.stats.clone())
    }

    pub async fn get_overall_stats(&self) -> IndexingStats {
        let jobs = self.jobs.read().await;

        let mut total_stats = IndexingStats::default();

        for job in jobs.values() {
            total_stats.emails_indexed += job.stats.emails_indexed;
            total_stats.files_indexed += job.stats.files_indexed;
            total_stats.emails_pending += job.stats.emails_pending;
            total_stats.files_pending += job.stats.files_pending;
            total_stats.errors += job.stats.errors;

            // Report the most recent completed run across all users.
            if let Some(last_run) = job.stats.last_run {
                if total_stats.last_run.map_or(true, |lr| lr < last_run) {
                    total_stats.last_run = Some(last_run);
                }
            }
        }

        total_stats
    }

    pub async fn pause_user_indexing(&self, user_id: Uuid) -> Result<()> {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(&user_id) {
            job.status = IndexingStatus::Paused;
            info!("⏸️ Paused indexing for user {}", user_id);
        }
        Ok(())
    }

    pub async fn resume_user_indexing(&self, user_id: Uuid) -> Result<()> {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(&user_id) {
            job.status = IndexingStatus::Idle;
            info!("▶️ Resumed indexing for user {}", user_id);
        }
        Ok(())
    }

    pub async fn trigger_user_indexing(&self, user_id: Uuid, bot_id: Uuid) -> Result<()> {
        info!("Triggering immediate indexing for user {}", user_id);
        self.index_user_data(user_id, bot_id).await
    }
}
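
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sanity check for the pure path/name helpers above; it does
    // not touch the database, Qdrant, or the embedding service, and assumes
    // the `uuid` crate's `v4` feature for `Uuid::new_v4`.
    #[test]
    fn workspace_paths_are_scoped_by_bot_and_user() {
        let bot = Uuid::new_v4();
        let user = Uuid::new_v4();
        let ws = UserWorkspace::new(PathBuf::from("/tmp/work"), bot, user);

        assert_eq!(
            ws.get_path(),
            PathBuf::from("/tmp/work")
                .join(bot.to_string())
                .join(user.to_string())
        );
        assert_eq!(ws.drive_vectordb(), format!("drive_{}_{}", bot, user));
    }
}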