fix: All services check health before starting (idempotent bootstrap)
All checks were successful
BotServer CI/CD / build (push) Successful in 4m9s

- Tables (PostgreSQL): pg_isready health check before start
- Drive (MinIO): /minio/health/live check before start
- ALM (Forgejo): HTTP health check before start
- ALM CI (Forgejo Runner): pgrep check before start
- Valkey: health check uses absolute path to valkey-cli
- Vault, Qdrant, Zitadel: already had health checks
- Result: no duplicate starts, no hangs on restart
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-04-01 18:28:54 -03:00
parent b47d928608
commit c26e483cc9
2 changed files with 153 additions and 43 deletions

View file

@ -1,6 +1,6 @@
// Bootstrap manager implementation
use crate::core::bootstrap::bootstrap_types::{BootstrapManager, BootstrapProgress};
use crate::core::bootstrap::bootstrap_utils::{cache_health_check, safe_pkill, vault_health_check, vector_db_health_check, zitadel_health_check};
use crate::core::bootstrap::bootstrap_utils::{alm_ci_health_check, alm_health_check, cache_health_check, drive_health_check, safe_pkill, tables_health_check, vault_health_check, vector_db_health_check, zitadel_health_check};
use crate::core::config::AppConfig;
use crate::core::package_manager::{InstallMode, PackageManager};
use crate::security::command_guard::SafeCommand;
@ -107,13 +107,18 @@ impl BootstrapManager {
}
if pm.is_installed("tables") {
info!("Starting PostgreSQL...");
match pm.start("tables") {
Ok(_child) => {
info!("PostgreSQL started");
}
Err(e) => {
warn!("Failed to start PostgreSQL: {}", e);
let tables_already_running = tables_health_check();
if tables_already_running {
info!("PostgreSQL is already running");
} else {
info!("Starting PostgreSQL...");
match pm.start("tables") {
Ok(_child) => {
info!("PostgreSQL started");
}
Err(e) => {
warn!("Failed to start PostgreSQL: {}", e);
}
}
}
}
@ -127,7 +132,6 @@ impl BootstrapManager {
match pm.start("cache") {
Ok(_child) => {
info!("Valkey cache process started, waiting for readiness...");
// Wait for cache to be ready (up to 30 seconds)
for i in 0..30 {
sleep(Duration::from_secs(1)).await;
if cache_health_check() {
@ -147,25 +151,28 @@ impl BootstrapManager {
}
if pm.is_installed("drive") {
info!("Starting MinIO...");
match pm.start("drive") {
Ok(_child) => {
info!("MinIO started");
}
Err(e) => {
warn!("Failed to start MinIO: {}", e);
let drive_already_running = drive_health_check();
if drive_already_running {
info!("MinIO is already running");
} else {
info!("Starting MinIO...");
match pm.start("drive") {
Ok(_child) => {
info!("MinIO started");
}
Err(e) => {
warn!("Failed to start MinIO: {}", e);
}
}
}
}
if pm.is_installed("directory") {
// Check once if Zitadel is already running
let directory_already_running = zitadel_health_check();
if directory_already_running {
info!("Zitadel/Directory service is already running");
// Create OAuth client if config doesn't exist (even when already running)
let config_path = self.stack_dir("conf/system/directory_config.json");
if !config_path.exists() {
info!("Creating OAuth client for Directory service...");
@ -177,7 +184,6 @@ impl BootstrapManager {
info!("Directory config already exists, skipping OAuth setup");
}
} else {
// Not running — start it immediately, then wait for it to become ready
info!("Starting Zitadel/Directory service...");
match pm.start("directory") {
Ok(_child) => {
@ -190,7 +196,6 @@ impl BootstrapManager {
zitadel_ready = true;
break;
}
// Log progress every 15 checks (30 seconds)
if i % 15 == 14 {
info!("Zitadel health check: {}s elapsed, retrying...", (i + 1) * 2);
}
@ -199,7 +204,6 @@ impl BootstrapManager {
warn!("Zitadel/Directory service did not respond after 300 seconds");
}
// Create OAuth client if Zitadel is ready and config doesn't exist
if zitadel_ready {
let config_path = self.stack_dir("conf/system/directory_config.json");
if !config_path.exists() {
@ -216,37 +220,43 @@ impl BootstrapManager {
}
}
}
// Note: Directory (Zitadel) bootstrap is handled in main_module/bootstrap.rs
// where it has proper access to the admin PAT token
}
if pm.is_installed("alm") {
info!("Starting ALM (Forgejo) service...");
match pm.start("alm") {
Ok(_child) => {
info!("ALM service started");
// Wait for ALM to initialize its database
tokio::time::sleep(tokio::time::Duration::from_secs(20)).await;
match crate::core::package_manager::setup_alm().await {
Ok(_) => info!("ALM setup and runner generation successful"),
Err(e) => warn!("ALM setup failed: {}", e),
let alm_already_running = alm_health_check();
if alm_already_running {
info!("ALM (Forgejo) is already running");
} else {
info!("Starting ALM (Forgejo) service...");
match pm.start("alm") {
Ok(_child) => {
info!("ALM service started");
tokio::time::sleep(tokio::time::Duration::from_secs(20)).await;
match crate::core::package_manager::setup_alm().await {
Ok(_) => info!("ALM setup and runner generation successful"),
Err(e) => warn!("ALM setup failed: {}", e),
}
}
Err(e) => {
warn!("Failed to start ALM service: {}", e);
}
}
Err(e) => {
warn!("Failed to start ALM service: {}", e);
}
}
}
if pm.is_installed("alm-ci") {
info!("Starting ALM CI (Forgejo Runner) service...");
match pm.start("alm-ci") {
Ok(_child) => {
info!("ALM CI service started");
}
Err(e) => {
warn!("Failed to start ALM CI service: {}", e);
let alm_ci_already_running = alm_ci_health_check();
if alm_ci_already_running {
info!("ALM CI (Forgejo Runner) is already running");
} else {
info!("Starting ALM CI (Forgejo Runner) service...");
match pm.start("alm-ci") {
Ok(_child) => {
info!("ALM CI service started");
}
Err(e) => {
warn!("Failed to start ALM CI service: {}", e);
}
}
}
}

View file

@ -245,3 +245,103 @@ pub fn zitadel_health_check() -> bool {
Err(_) => false,
}
}
/// Check if PostgreSQL/Tables is healthy.
///
/// Probes, in order:
/// 1. `pg_isready` from the system PATH,
/// 2. the stack-bundled `pg_isready` under `BOTSERVER_STACK_PATH`
///    (default `./botserver-stack`),
/// 3. a raw TCP connect to 127.0.0.1:5432 via `nc` as a last resort.
///
/// The first `pg_isready` binary that actually executes decides the answer:
/// a reachable binary reporting "not ready" returns `false` without falling
/// through to the next probe (matching pg_isready's exit-status contract).
pub fn tables_health_check() -> bool {
    // Run one pg_isready binary against localhost:5432.
    // Some(ready) if the binary executed; None if it could not be run at all.
    fn pg_isready_at(binary: &str) -> Option<bool> {
        SafeCommand::new(binary)
            .and_then(|c| c.args(&["-h", "127.0.0.1", "-p", "5432"]))
            .and_then(|c| c.execute())
            .ok()
            .map(|output| output.status.success())
    }

    if let Some(ready) = pg_isready_at("pg_isready") {
        return ready;
    }

    let stack_path =
        std::env::var("BOTSERVER_STACK_PATH").unwrap_or_else(|_| "./botserver-stack".to_string());
    let bundled = format!("{}/bin/tables/bin/pg_isready", stack_path);
    if let Some(ready) = pg_isready_at(&bundled) {
        return ready;
    }

    // NOTE(review): `nc -z` only proves the port accepts connections, not that
    // PostgreSQL is accepting queries — acceptable as a last-resort heuristic.
    match SafeCommand::new("nc")
        .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "5432"]))
        .and_then(|c| c.execute())
    {
        Ok(output) => output.status.success(),
        Err(_) => false,
    }
}
/// Check if MinIO/Drive is healthy.
///
/// Tries MinIO's liveness endpoint on port 9100 over HTTP and then HTTPS
/// (curl `-k` tolerates self-signed certificates, `-f` treats HTTP errors as
/// failure). If neither probe succeeds — including when curl cannot run —
/// falls back to a bare TCP port check with `nc`.
pub fn drive_health_check() -> bool {
    let endpoints = [
        "http://127.0.0.1:9100/minio/health/live",
        "https://127.0.0.1:9100/minio/health/live",
    ];
    let live_via_curl = endpoints.iter().any(|endpoint| {
        SafeCommand::new("curl")
            .and_then(|c| c.args(&["-sfk", "--connect-timeout", "2", "-m", "3", endpoint]))
            .and_then(|c| c.execute())
            .map(|output| output.status.success())
            .unwrap_or(false)
    });
    if live_via_curl {
        return true;
    }
    // Last resort: is anything listening on the MinIO port at all?
    SafeCommand::new("nc")
        .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "9100"]))
        .and_then(|c| c.execute())
        .map(|output| output.status.success())
        .unwrap_or(false)
}
/// Check if ALM (Forgejo) is healthy.
///
/// Probes the Forgejo web root on port 3000, first over HTTP and then HTTPS
/// (curl `-k` accepts self-signed certificates). If neither probe responds
/// successfully — or curl cannot be executed — falls back to checking whether
/// the port is open at all with `nc`.
pub fn alm_health_check() -> bool {
    for scheme in ["http", "https"] {
        let url = format!("{}://localhost:3000", scheme);
        let reachable = SafeCommand::new("curl")
            .and_then(|c| c.args(&["-sfk", "--connect-timeout", "2", "-m", "3", url.as_str()]))
            .and_then(|c| c.execute())
            .map(|output| output.status.success())
            .unwrap_or(false);
        if reachable {
            return true;
        }
    }
    // Last resort: a plain TCP connect test against the Forgejo port.
    match SafeCommand::new("nc")
        .and_then(|c| c.args(&["-z", "-w", "1", "127.0.0.1", "3000"]))
        .and_then(|c| c.execute())
    {
        Ok(output) => output.status.success(),
        Err(_) => false,
    }
}
/// Check if ALM CI (Forgejo Runner) is running.
///
/// Prefers `pgrep -x` for an exact process-name match; its verdict is final
/// when it executes (exit 0 iff at least one process matched). Only when
/// `pgrep` itself cannot be run does this scan the `ps -ef` process table
/// for a `forgejo-runner` daemon instead.
pub fn alm_ci_health_check() -> bool {
    let pgrep_result = SafeCommand::new("pgrep")
        .and_then(|c| c.args(&["-x", "forgejo-runner"]))
        .and_then(|c| c.execute());
    if let Ok(output) = pgrep_result {
        // pgrep ran: trust its exit status, do not fall through.
        return output.status.success();
    }
    // Fallback for systems without pgrep: grep the full process listing.
    let ps_result = SafeCommand::new("ps")
        .and_then(|c| c.args(&["-ef"]))
        .and_then(|c| c.execute());
    match ps_result {
        Ok(output) => {
            let listing = String::from_utf8_lossy(&output.stdout);
            listing.contains("forgejo-runner") && listing.contains("daemon")
        }
        Err(_) => false,
    }
}