use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use tokio::sync::{broadcast, RwLock}; use uuid::Uuid; use crate::security::command_guard::SafeCommand; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum MetricType { Counter, Gauge, Histogram, Summary, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum AlertSeverity { Info, Warning, Error, Critical, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum AlertStatus { Firing, Resolved, Acknowledged, Silenced, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MetricDataPoint { pub timestamp: DateTime, pub value: f64, pub labels: HashMap, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Metric { pub name: String, pub metric_type: MetricType, pub description: String, pub unit: Option, pub labels: HashMap, pub current_value: f64, pub data_points: Vec, pub updated_at: DateTime, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct AlertRule { pub id: Uuid, pub name: String, pub description: String, pub metric_name: String, pub condition: AlertCondition, pub severity: AlertSeverity, pub duration_seconds: u64, pub labels: HashMap, pub annotations: HashMap, pub enabled: bool, pub created_at: DateTime, pub updated_at: DateTime, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum AlertCondition { GreaterThan(f64), LessThan(f64), Equals(f64), NotEquals(f64), GreaterThanOrEqual(f64), LessThanOrEqual(f64), AbsentFor(u64), RateOfChange { threshold: f64, window_seconds: u64 }, } impl AlertCondition { pub fn evaluate(&self, current: f64, previous: Option, absent_seconds: u64) -> bool { match self { Self::GreaterThan(threshold) => current > *threshold, Self::LessThan(threshold) => current < *threshold, Self::Equals(threshold) => (current - threshold).abs() < f64::EPSILON, Self::NotEquals(threshold) => (current - threshold).abs() >= f64::EPSILON, Self::GreaterThanOrEqual(threshold) => current >= *threshold, Self::LessThanOrEqual(threshold) => current <= *threshold, Self::AbsentFor(seconds) => absent_seconds >= *seconds, Self::RateOfChange { threshold, window_seconds: _, } => { if let Some(prev) = previous { let rate = (current - prev).abs(); rate > *threshold } else { false } } } } } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Alert { pub id: Uuid, pub rule_id: Uuid, pub rule_name: String, pub severity: AlertSeverity, pub status: AlertStatus, pub metric_name: String, pub metric_value: f64, pub threshold: f64, pub message: String, pub labels: HashMap, pub annotations: HashMap, pub started_at: DateTime, pub resolved_at: Option>, pub acknowledged_at: Option>, pub acknowledged_by: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SystemHealth { pub status: HealthStatus, pub cpu_usage_percent: f64, pub memory_usage_percent: f64, pub disk_usage_percent: f64, pub active_connections: u64, pub requests_per_second: f64, pub error_rate_percent: f64, pub average_latency_ms: f64, pub uptime_seconds: u64, pub last_check: DateTime, pub components: Vec, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum HealthStatus { Healthy, Degraded, Unhealthy, Unknown, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ComponentHealth { pub name: String, pub status: HealthStatus, pub latency_ms: Option, pub message: Option, pub last_check: DateTime, } #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type")] pub enum MonitoringMessage { MetricUpdate { metric: Metric, }, AlertFired { alert: Alert, }, AlertResolved { alert_id: Uuid, }, HealthUpdate { health: SystemHealth, }, Subscribe { metrics: Vec, alerts: bool, health: bool, }, Unsubscribe { metrics: Vec, }, Ping, Pong, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DashboardQuery { pub metrics: Option>, pub from: Option>, pub to: Option>, pub resolution: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DashboardResponse { pub metrics: Vec, pub health: SystemHealth, pub active_alerts: Vec, pub recent_alerts: Vec, } pub struct MetricsCollector { metrics: Arc>>, alerts: Arc>>, alert_rules: Arc>>, health: Arc>, broadcast_tx: broadcast::Sender, request_counter: AtomicU64, error_counter: AtomicU64, total_latency_ms: AtomicU64, start_time: DateTime, } impl MetricsCollector { pub fn new() -> Self { let (tx, _) = broadcast::channel(1024); let health = SystemHealth { status: HealthStatus::Unknown, cpu_usage_percent: 0.0, memory_usage_percent: 0.0, disk_usage_percent: 0.0, active_connections: 0, requests_per_second: 0.0, error_rate_percent: 0.0, average_latency_ms: 0.0, uptime_seconds: 0, last_check: Utc::now(), components: Vec::new(), }; Self { metrics: Arc::new(RwLock::new(HashMap::new())), alerts: Arc::new(RwLock::new(HashMap::new())), alert_rules: Arc::new(RwLock::new(Vec::new())), health: Arc::new(RwLock::new(health)), broadcast_tx: tx, request_counter: AtomicU64::new(0), error_counter: AtomicU64::new(0), total_latency_ms: AtomicU64::new(0), start_time: Utc::now(), } } pub fn subscribe(&self) -> broadcast::Receiver { self.broadcast_tx.subscribe() } pub async fn record_metric(&self, name: &str, value: f64, labels: HashMap) { let now = Utc::now(); let data_point = MetricDataPoint { timestamp: now, value, labels: labels.clone(), }; let mut metrics = self.metrics.write().await; if let Some(metric) = metrics.get_mut(name) { metric.current_value = value; metric.updated_at = now; metric.data_points.push(data_point); if metric.data_points.len() > 1000 { metric.data_points.remove(0); } let _ = self.broadcast_tx.send(MonitoringMessage::MetricUpdate { metric: metric.clone(), }); } else { let metric = Metric { name: name.to_string(), metric_type: MetricType::Gauge, description: String::new(), unit: None, labels, current_value: value, data_points: vec![data_point], updated_at: now, }; let _ = self.broadcast_tx.send(MonitoringMessage::MetricUpdate { metric: metric.clone(), }); metrics.insert(name.to_string(), metric); } drop(metrics); self.check_alert_rules(name, value).await; } pub async fn increment_counter(&self, name: &str, labels: HashMap) { let metrics = self.metrics.read().await; let current = metrics.get(name).map(|m| m.current_value).unwrap_or(0.0); drop(metrics); self.record_metric(name, current + 1.0, labels).await; } pub async fn record_histogram(&self, name: &str, value: f64, labels: HashMap) { self.record_metric(name, value, labels).await; } pub async fn register_metric( &self, name: &str, metric_type: MetricType, description: &str, unit: Option, ) { let metric = Metric { name: name.to_string(), metric_type, description: description.to_string(), unit, labels: HashMap::new(), current_value: 0.0, data_points: Vec::new(), updated_at: Utc::now(), }; let mut metrics = self.metrics.write().await; metrics.insert(name.to_string(), metric); } pub async fn add_alert_rule(&self, rule: AlertRule) { let mut rules = self.alert_rules.write().await; rules.push(rule); } pub async fn remove_alert_rule(&self, rule_id: Uuid) -> bool { let mut rules = self.alert_rules.write().await; let initial_len = rules.len(); rules.retain(|r| r.id != rule_id); rules.len() < initial_len } async fn check_alert_rules(&self, metric_name: &str, value: f64) { let rules = self.alert_rules.read().await; let relevant_rules: Vec<_> = rules .iter() .filter(|r| r.enabled && r.metric_name == metric_name) .cloned() .collect(); drop(rules); for rule in relevant_rules { let should_fire = rule.condition.evaluate(value, None, 0); if should_fire { self.fire_alert(&rule, value).await; } else { self.resolve_alert_for_rule(rule.id).await; } } } async fn fire_alert(&self, rule: &AlertRule, value: f64) { let mut alerts = self.alerts.write().await; let existing = alerts.values().find(|a| { a.rule_id == rule.id && matches!(a.status, AlertStatus::Firing | AlertStatus::Acknowledged) }); if existing.is_some() { return; } let threshold = match &rule.condition { AlertCondition::GreaterThan(t) | AlertCondition::LessThan(t) | AlertCondition::Equals(t) | AlertCondition::NotEquals(t) | AlertCondition::GreaterThanOrEqual(t) | AlertCondition::LessThanOrEqual(t) => *t, AlertCondition::AbsentFor(s) => *s as f64, AlertCondition::RateOfChange { threshold, .. } => *threshold, }; let alert = Alert { id: Uuid::new_v4(), rule_id: rule.id, rule_name: rule.name.clone(), severity: rule.severity.clone(), status: AlertStatus::Firing, metric_name: rule.metric_name.clone(), metric_value: value, threshold, message: format!( "{}: {} is {} (threshold: {})", rule.name, rule.metric_name, value, threshold ), labels: rule.labels.clone(), annotations: rule.annotations.clone(), started_at: Utc::now(), resolved_at: None, acknowledged_at: None, acknowledged_by: None, }; let _ = self.broadcast_tx.send(MonitoringMessage::AlertFired { alert: alert.clone(), }); alerts.insert(alert.id, alert); } async fn resolve_alert_for_rule(&self, rule_id: Uuid) { let mut alerts = self.alerts.write().await; let alert_ids: Vec<_> = alerts .iter() .filter(|(_, a)| a.rule_id == rule_id && a.status == AlertStatus::Firing) .map(|(id, _)| *id) .collect(); for alert_id in alert_ids { if let Some(alert) = alerts.get_mut(&alert_id) { alert.status = AlertStatus::Resolved; alert.resolved_at = Some(Utc::now()); let _ = self .broadcast_tx .send(MonitoringMessage::AlertResolved { alert_id }); } } } pub async fn acknowledge_alert(&self, alert_id: Uuid, acknowledged_by: &str) -> bool { let mut alerts = self.alerts.write().await; if let Some(alert) = alerts.get_mut(&alert_id) { if alert.status == AlertStatus::Firing { alert.status = AlertStatus::Acknowledged; alert.acknowledged_at = Some(Utc::now()); alert.acknowledged_by = Some(acknowledged_by.to_string()); return true; } } false } pub async fn get_active_alerts(&self) -> Vec { let alerts = self.alerts.read().await; alerts .values() .filter(|a| matches!(a.status, AlertStatus::Firing | AlertStatus::Acknowledged)) .cloned() .collect() } pub async fn get_recent_alerts(&self, limit: usize) -> Vec { let alerts = self.alerts.read().await; let mut all_alerts: Vec<_> = alerts.values().cloned().collect(); all_alerts.sort_by(|a, b| b.started_at.cmp(&a.started_at)); all_alerts.truncate(limit); all_alerts } pub fn record_request(&self, latency_ms: u64, is_error: bool) { self.request_counter.fetch_add(1, Ordering::Relaxed); self.total_latency_ms.fetch_add(latency_ms, Ordering::Relaxed); if is_error { self.error_counter.fetch_add(1, Ordering::Relaxed); } } pub async fn update_system_health(&self) { let now = Utc::now(); let cpu_usage = self.collect_cpu_usage().await; let memory_usage = self.collect_memory_usage().await; let disk_usage = self.collect_disk_usage().await; let request_count = self.request_counter.load(Ordering::Relaxed); let error_count = self.error_counter.load(Ordering::Relaxed); let total_latency = self.total_latency_ms.load(Ordering::Relaxed); let error_rate = if request_count > 0 { (error_count as f64 / request_count as f64) * 100.0 } else { 0.0 }; let avg_latency = if request_count > 0 { total_latency as f64 / request_count as f64 } else { 0.0 }; let uptime = (now - self.start_time).num_seconds() as u64; let rps = if uptime > 0 { request_count as f64 / uptime as f64 } else { 0.0 }; let components = self.check_component_health().await; let overall_status = self.calculate_overall_status( cpu_usage, memory_usage, error_rate, &components, ); let health = SystemHealth { status: overall_status, cpu_usage_percent: cpu_usage, memory_usage_percent: memory_usage, disk_usage_percent: disk_usage, active_connections: 0, requests_per_second: rps, error_rate_percent: error_rate, average_latency_ms: avg_latency, uptime_seconds: uptime, last_check: now, components, }; { let mut h = self.health.write().await; *h = health.clone(); } let _ = self .broadcast_tx .send(MonitoringMessage::HealthUpdate { health }); } async fn collect_cpu_usage(&self) -> f64 { #[cfg(target_os = "linux")] { if let Ok(contents) = tokio::fs::read_to_string("/proc/stat").await { if let Some(line) = contents.lines().next() { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 5 { let user: u64 = parts.get(1).and_then(|s| s.parse().ok()).unwrap_or(0); let nice: u64 = parts.get(2).and_then(|s| s.parse().ok()).unwrap_or(0); let system: u64 = parts.get(3).and_then(|s| s.parse().ok()).unwrap_or(0); let idle: u64 = parts.get(4).and_then(|s| s.parse().ok()).unwrap_or(0); let total = user + nice + system + idle; let active = user + nice + system; if total > 0 { return (active as f64 / total as f64) * 100.0; } } } } } 0.0 } async fn collect_memory_usage(&self) -> f64 { #[cfg(target_os = "linux")] { if let Ok(contents) = tokio::fs::read_to_string("/proc/meminfo").await { let mut total: u64 = 0; let mut available: u64 = 0; for line in contents.lines() { if line.starts_with("MemTotal:") { total = line .split_whitespace() .nth(1) .and_then(|s| s.parse().ok()) .unwrap_or(0); } else if line.starts_with("MemAvailable:") { available = line .split_whitespace() .nth(1) .and_then(|s| s.parse().ok()) .unwrap_or(0); } } if total > 0 { return ((total - available) as f64 / total as f64) * 100.0; } } } 0.0 } async fn collect_disk_usage(&self) -> f64 { #[cfg(target_os = "linux")] { if let Ok(output) = SafeCommand::new("df")? .args(["-h", "/"]) .execute() { if let Ok(stdout) = String::from_utf8(output.stdout) { if let Some(line) = stdout.lines().nth(1) { let parts: Vec<&str> = line.split_whitespace().collect(); if let Some(usage_str) = parts.get(4) { if let Ok(usage) = usage_str.trim_end_matches('%').parse::() { return usage; } } } } } } 0.0 } async fn check_component_health(&self) -> Vec { let mut components = Vec::new(); let db_health = self.check_database_health().await; components.push(db_health); let cache_health = self.check_cache_health().await; components.push(cache_health); let vector_db_health = self.check_vector_db_health().await; components.push(vector_db_health); let llm_health = self.check_llm_health().await; components.push(llm_health); components } async fn check_database_health(&self) -> ComponentHealth { let start = std::time::Instant::now(); let (status, message) = (HealthStatus::Healthy, None); ComponentHealth { name: "database".to_string(), status, latency_ms: Some(start.elapsed().as_secs_f64() * 1000.0), message, last_check: Utc::now(), } } async fn check_cache_health(&self) -> ComponentHealth { let start = std::time::Instant::now(); let (status, message) = (HealthStatus::Healthy, None); ComponentHealth { name: "cache".to_string(), status, latency_ms: Some(start.elapsed().as_secs_f64() * 1000.0), message, last_check: Utc::now(), } } async fn check_vector_db_health(&self) -> ComponentHealth { let start = std::time::Instant::now(); let (status, message) = (HealthStatus::Healthy, None); ComponentHealth { name: "vector_db".to_string(), status, latency_ms: Some(start.elapsed().as_secs_f64() * 1000.0), message, last_check: Utc::now(), } } async fn check_llm_health(&self) -> ComponentHealth { let start = std::time::Instant::now(); let (status, message) = (HealthStatus::Healthy, None); ComponentHealth { name: "llm".to_string(), status, latency_ms: Some(start.elapsed().as_secs_f64() * 1000.0), message, last_check: Utc::now(), } } fn calculate_overall_status( &self, cpu_usage: f64, memory_usage: f64, error_rate: f64, components: &[ComponentHealth], ) -> HealthStatus { let unhealthy_components = components .iter() .filter(|c| c.status == HealthStatus::Unhealthy) .count(); let degraded_components = components .iter() .filter(|c| c.status == HealthStatus::Degraded) .count(); if unhealthy_components > 0 || cpu_usage > 95.0 || memory_usage > 95.0 || error_rate > 10.0 { HealthStatus::Unhealthy } else if degraded_components > 0 || cpu_usage > 80.0 || memory_usage > 80.0 || error_rate > 5.0 { HealthStatus::Degraded } else { HealthStatus::Healthy } } pub async fn get_health(&self) -> SystemHealth { let health = self.health.read().await; health.clone() } pub async fn get_metrics(&self) -> Vec { let metrics = self.metrics.read().await; metrics.values().cloned().collect() } pub async fn get_metric(&self, name: &str) -> Option { let metrics = self.metrics.read().await; metrics.get(name).cloned() } pub async fn get_metric_history( &self, name: &str, from: DateTime, to: DateTime, ) -> Vec { let metrics = self.metrics.read().await; if let Some(metric) = metrics.get(name) { metric .data_points .iter() .filter(|p| p.timestamp >= from && p.timestamp <= to) .cloned() .collect() } else { Vec::new() } } pub async fn get_dashboard(&self) -> DashboardResponse { DashboardResponse { metrics: self.get_metrics().await, health: self.get_health().await, active_alerts: self.get_active_alerts().await, recent_alerts: self.get_recent_alerts(10).await, } } pub async fn start_background_collection(self: Arc) { let collector = self.clone(); tokio::spawn(async move { let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(30)); loop { interval.tick().await; collector.update_system_health().await; } }); let collector = self.clone(); tokio::spawn(async move { let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(60)); loop { interval.tick().await; collector.record_builtin_metrics().await; } }); } async fn record_builtin_metrics(&self) { let health = self.get_health().await; self.record_metric("system_cpu_usage_percent", health.cpu_usage_percent, HashMap::new()) .await; self.record_metric( "system_memory_usage_percent", health.memory_usage_percent, HashMap::new(), ) .await; self.record_metric( "system_disk_usage_percent", health.disk_usage_percent, HashMap::new(), ) .await; self.record_metric( "system_requests_per_second", health.requests_per_second, HashMap::new(), ) .await; self.record_metric( "system_error_rate_percent", health.error_rate_percent, HashMap::new(), ) .await; self.record_metric( "system_average_latency_ms", health.average_latency_ms, HashMap::new(), ) .await; } pub async fn setup_default_alert_rules(&self) { let rules = vec![ AlertRule { id: Uuid::new_v4(), name: "High CPU Usage".to_string(), description: "CPU usage exceeds 90%".to_string(), metric_name: "system_cpu_usage_percent".to_string(), condition: AlertCondition::GreaterThan(90.0), severity: AlertSeverity::Warning, duration_seconds: 300, labels: HashMap::new(), annotations: HashMap::new(), enabled: true, created_at: Utc::now(), updated_at: Utc::now(), }, AlertRule { id: Uuid::new_v4(), name: "Critical CPU Usage".to_string(), description: "CPU usage exceeds 95%".to_string(), metric_name: "system_cpu_usage_percent".to_string(), condition: AlertCondition::GreaterThan(95.0), severity: AlertSeverity::Critical, duration_seconds: 60, labels: HashMap::new(), annotations: HashMap::new(), enabled: true, created_at: Utc::now(), updated_at: Utc::now(), }, AlertRule { id: Uuid::new_v4(), name: "High Memory Usage".to_string(), description: "Memory usage exceeds 85%".to_string(), metric_name: "system_memory_usage_percent".to_string(), condition: AlertCondition::GreaterThan(85.0), severity: AlertSeverity::Warning, duration_seconds: 300, labels: HashMap::new(), annotations: HashMap::new(), enabled: true, created_at: Utc::now(), updated_at: Utc::now(), }, AlertRule { id: Uuid::new_v4(), name: "High Error Rate".to_string(), description: "Error rate exceeds 5%".to_string(), metric_name: "system_error_rate_percent".to_string(), condition: AlertCondition::GreaterThan(5.0), severity: AlertSeverity::Error, duration_seconds: 120, labels: HashMap::new(), annotations: HashMap::new(), enabled: true, created_at: Utc::now(), updated_at: Utc::now(), }, AlertRule { id: Uuid::new_v4(), name: "High Latency".to_string(), description: "Average latency exceeds 1000ms".to_string(), metric_name: "system_average_latency_ms".to_string(), condition: AlertCondition::GreaterThan(1000.0), severity: AlertSeverity::Warning, duration_seconds: 300, labels: HashMap::new(), annotations: HashMap::new(), enabled: true, created_at: Utc::now(), updated_at: Utc::now(), }, ]; for rule in rules { self.add_alert_rule(rule).await; } } }