From c26e483cc9fd6317e101e749ca9b5588135ea4a2 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Wed, 1 Apr 2026 18:28:54 -0300 Subject: [PATCH] fix: All services check health before starting (idempotent bootstrap) - Tables (PostgreSQL): pg_isready health check before start - Drive (MinIO): /minio/health/live check before start - ALM (Forgejo): HTTP health check before start - ALM CI (Forgejo Runner): pgrep check before start - Valkey: health check uses absolute path to valkey-cli - Vault, Qdrant, Zitadel: already had health checks - Result: no duplicate starts, no hangs on restart --- src/core/bootstrap/bootstrap_manager.rs | 96 +++++++++++++---------- src/core/bootstrap/bootstrap_utils.rs | 100 ++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 43 deletions(-) diff --git a/src/core/bootstrap/bootstrap_manager.rs b/src/core/bootstrap/bootstrap_manager.rs index 8f6ab483..2b08d2f9 100644 --- a/src/core/bootstrap/bootstrap_manager.rs +++ b/src/core/bootstrap/bootstrap_manager.rs @@ -1,6 +1,6 @@ // Bootstrap manager implementation use crate::core::bootstrap::bootstrap_types::{BootstrapManager, BootstrapProgress}; -use crate::core::bootstrap::bootstrap_utils::{cache_health_check, safe_pkill, vault_health_check, vector_db_health_check, zitadel_health_check}; +use crate::core::bootstrap::bootstrap_utils::{alm_ci_health_check, alm_health_check, cache_health_check, drive_health_check, safe_pkill, tables_health_check, vault_health_check, vector_db_health_check, zitadel_health_check}; use crate::core::config::AppConfig; use crate::core::package_manager::{InstallMode, PackageManager}; use crate::security::command_guard::SafeCommand; @@ -107,13 +107,18 @@ impl BootstrapManager { } if pm.is_installed("tables") { - info!("Starting PostgreSQL..."); - match pm.start("tables") { - Ok(_child) => { - info!("PostgreSQL started"); - } - Err(e) => { - warn!("Failed to start PostgreSQL: {}", e); + let tables_already_running = tables_health_check(); + if tables_already_running { + info!("PostgreSQL is already running"); + } else { + info!("Starting PostgreSQL..."); + match pm.start("tables") { + Ok(_child) => { + info!("PostgreSQL started"); + } + Err(e) => { + warn!("Failed to start PostgreSQL: {}", e); + } } } } @@ -127,7 +132,6 @@ impl BootstrapManager { match pm.start("cache") { Ok(_child) => { info!("Valkey cache process started, waiting for readiness..."); - // Wait for cache to be ready (up to 30 seconds) for i in 0..30 { sleep(Duration::from_secs(1)).await; if cache_health_check() { @@ -147,25 +151,28 @@ impl BootstrapManager { } if pm.is_installed("drive") { - info!("Starting MinIO..."); - match pm.start("drive") { - Ok(_child) => { - info!("MinIO started"); - } - Err(e) => { - warn!("Failed to start MinIO: {}", e); + let drive_already_running = drive_health_check(); + if drive_already_running { + info!("MinIO is already running"); + } else { + info!("Starting MinIO..."); + match pm.start("drive") { + Ok(_child) => { + info!("MinIO started"); + } + Err(e) => { + warn!("Failed to start MinIO: {}", e); + } } } } if pm.is_installed("directory") { - // Check once if Zitadel is already running let directory_already_running = zitadel_health_check(); if directory_already_running { info!("Zitadel/Directory service is already running"); - // Create OAuth client if config doesn't exist (even when already running) let config_path = self.stack_dir("conf/system/directory_config.json"); if !config_path.exists() { info!("Creating OAuth client for Directory service..."); @@ -177,7 +184,6 @@ impl BootstrapManager { info!("Directory config already exists, skipping OAuth setup"); } } else { - // Not running — start it immediately, then wait for it to become ready info!("Starting Zitadel/Directory service..."); match pm.start("directory") { Ok(_child) => { @@ -190,7 +196,6 @@ impl BootstrapManager { zitadel_ready = true; break; } - // Log progress every 15 checks (30 seconds) if i % 15 == 14 { info!("Zitadel health check: {}s elapsed, retrying...", (i + 1) * 2); } @@ -199,7 +204,6 @@ impl BootstrapManager { warn!("Zitadel/Directory service did not respond after 300 seconds"); } - // Create OAuth client if Zitadel is ready and config doesn't exist if zitadel_ready { let config_path = self.stack_dir("conf/system/directory_config.json"); if !config_path.exists() { @@ -216,37 +220,43 @@ impl BootstrapManager { } } } - - // Note: Directory (Zitadel) bootstrap is handled in main_module/bootstrap.rs - // where it has proper access to the admin PAT token } if pm.is_installed("alm") { - info!("Starting ALM (Forgejo) service..."); - match pm.start("alm") { - Ok(_child) => { - info!("ALM service started"); - // Wait for ALM to initialize its database - tokio::time::sleep(tokio::time::Duration::from_secs(20)).await; - match crate::core::package_manager::setup_alm().await { - Ok(_) => info!("ALM setup and runner generation successful"), - Err(e) => warn!("ALM setup failed: {}", e), + let alm_already_running = alm_health_check(); + if alm_already_running { + info!("ALM (Forgejo) is already running"); + } else { + info!("Starting ALM (Forgejo) service..."); + match pm.start("alm") { + Ok(_child) => { + info!("ALM service started"); + tokio::time::sleep(tokio::time::Duration::from_secs(20)).await; + match crate::core::package_manager::setup_alm().await { + Ok(_) => info!("ALM setup and runner generation successful"), + Err(e) => warn!("ALM setup failed: {}", e), + } + } + Err(e) => { + warn!("Failed to start ALM service: {}", e); } - } - Err(e) => { - warn!("Failed to start ALM service: {}", e); } } } if pm.is_installed("alm-ci") { - info!("Starting ALM CI (Forgejo Runner) service..."); - match pm.start("alm-ci") { - Ok(_child) => { - info!("ALM CI service started"); - } - Err(e) => { - warn!("Failed to start ALM CI service: {}", e); + let alm_ci_already_running = alm_ci_health_check(); + if alm_ci_already_running { + info!("ALM CI (Forgejo Runner) is already running"); + } else { + info!("Starting ALM CI (Forgejo Runner) service..."); + match pm.start("alm-ci") { + Ok(_child) => { + info!("ALM CI service started"); + } + Err(e) => { + warn!("Failed to start ALM CI service: {}", e); + } } } } diff --git a/src/core/bootstrap/bootstrap_utils.rs b/src/core/bootstrap/bootstrap_utils.rs index 3fbaea0f..eec4dfce 100644 --- a/src/core/bootstrap/bootstrap_utils.rs +++ b/src/core/bootstrap/bootstrap_utils.rs @@ -245,3 +245,103 @@ pub fn zitadel_health_check() -> bool { Err(_) => false, } } + +/// Check if PostgreSQL/Tables is healthy +pub fn tables_health_check() -> bool { + if let Ok(output) = SafeCommand::new("pg_isready") + .and_then(|c| c.args(&["-h", "127.0.0.1", "-p", "5432"])) + .and_then(|c| c.execute()) + { + return output.status.success(); + } + + let stack_path = + std::env::var("BOTSERVER_STACK_PATH").unwrap_or_else(|_| "./botserver-stack".to_string()); + let pg_isready = format!("{}/bin/tables/bin/pg_isready", stack_path); + if let Ok(output) = SafeCommand::new(&pg_isready) + .and_then(|c| c.args(&["-h", "127.0.0.1", "-p", "5432"])) + .and_then(|c| c.execute()) + { + return output.status.success(); + } + + match SafeCommand::new("nc") + .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "5432"])) + .and_then(|c| c.execute()) + { + Ok(output) => output.status.success(), + Err(_) => false, + } +} + +/// Check if MinIO/Drive is healthy +pub fn drive_health_check() -> bool { + let urls = [ + "http://127.0.0.1:9100/minio/health/live", + "https://127.0.0.1:9100/minio/health/live", + ]; + + for url in &urls { + if let Ok(output) = SafeCommand::new("curl") + .and_then(|c| c.args(&["-sfk", "--connect-timeout", "2", "-m", "3", url])) + .and_then(|c| c.execute()) + { + if output.status.success() { + return true; + } + } + } + + match SafeCommand::new("nc") + .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "9100"])) + .and_then(|c| c.execute()) + { + Ok(output) => output.status.success(), + Err(_) => false, + } +} + +/// Check if ALM (Forgejo) is healthy +pub fn alm_health_check() -> bool { + let urls = ["http://localhost:3000", "https://localhost:3000"]; + + for url in &urls { + if let Ok(output) = SafeCommand::new("curl") + .and_then(|c| c.args(&["-sfk", "--connect-timeout", "2", "-m", "3", url])) + .and_then(|c| c.execute()) + { + if output.status.success() { + return true; + } + } + } + + match SafeCommand::new("nc") + .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "3000"])) + .and_then(|c| c.execute()) + { + Ok(output) => output.status.success(), + Err(_) => false, + } +} + +/// Check if ALM CI (Forgejo Runner) is running +pub fn alm_ci_health_check() -> bool { + if let Ok(output) = SafeCommand::new("pgrep") + .and_then(|c| c.args(&["-x", "forgejo-runner"])) + .and_then(|c| c.execute()) + { + return output.status.success(); + } + + match SafeCommand::new("ps") + .and_then(|c| c.args(&["-ef"])) + .and_then(|c| c.execute()) + { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + stdout.contains("forgejo-runner") && stdout.contains("daemon") + } + Err(_) => false, + } +}