From a377af5ba303bf55b9fb3f1cbd3d27a9fb148227 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Fri, 3 Apr 2026 18:46:24 -0300 Subject: [PATCH] docs: update AGENTS.md with CI/CD directives and remove deprecated patterns - Added explicit CI/CD deployment directives (NEVER use scp, ALWAYS use CI) - Updated CI runner documentation: runs as gbuser, sccache config, workspace paths - Fixed deprecated block_in_place code example to use std::thread::spawn pattern - Added security headers: NEVER include sensitive data in documentation - Updated container architecture table with all services --- AGENTS.md | 465 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 433 insertions(+), 32 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index d4399a8..4dc515a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,6 +1,10 @@ # General Bots AI Agent Guidelines - stop saving .png on root! Use /tmp. never allow new files on root. - never push to alm without asking first - because it is production! +- **❌ NEVER deploy to production manually — ALWAYS use CI/CD pipeline** +- **❌ NEVER include sensitive data (IPs, tokens, passwords, keys) in AGENTS.md or any documentation** +- **❌ NEVER use `scp`, direct SSH binary copy, or manual deployment to system container** +- **✅ ALWAYS push to ALM → CI builds on alm-ci → CI deploys to system container automatically** 8080 is server 3000 is client ui if you are in trouble with some tool, please go to the official website to get proper install or instructions To test web is http://localhost:3000 (botui!) 
@@ -252,6 +256,14 @@ match x { - ❌ **NEVER** run `cargo build` - use `cargo check` for syntax verification - ❌ **NEVER** compile directly for production - ALWAYS use push + CI/CD pipeline - ❌ **NEVER** use `scp` or manual transfer to deploy - ONLY CI/CD ensures correct deployment +- ❌ **NEVER** manually copy binaries to production system container - ALWAYS push to ALM and let CI/CD build and deploy +- ❌ **NEVER** SSH into system container to deploy binaries - CI workflow handles build, transfer, and restart via alm-ci SSH +- ✅ **ALWAYS** push code to ALM → CI builds on alm-ci → CI deploys to system container via SSH from alm-ci +- ✅ **CI deploy path**: alm-ci builds at `/opt/gbo/data/botserver/target/debug/botserver` → tar+gzip via SSH → `/opt/gbo/bin/botserver` on system container → restart +- ❌ **NEVER** manually copy binaries to production system container - ALWAYS push to ALM and let CI/CD build and deploy +- ❌ **NEVER** SSH into system container to deploy binaries - CI workflow handles build, transfer, and restart via alm-ci SSH +- ✅ **ALWAYS** push code to ALM → CI builds on alm-ci → CI deploys to system container via SSH from alm-ci +- ✅ **CI deploy path**: alm-ci builds at `/opt/gbo/data/botserver/target/debug/botserver` → tar+gzip via SSH → `/opt/gbo/bin/botserver` on system container → restart **Current Status:** ✅ **0 clippy warnings** (down from 61 - PERFECT SCORE in YOLO mode) - ❌ **NEVER** use `panic!()`, `todo!()`, `unimplemented!()` @@ -455,12 +467,21 @@ pub fn new_feature_keyword( move |context, inputs| { let param = context.eval_expression_tree(&inputs[0])?.to_string(); - // Call async function from sync context - let result = tokio::task::block_in_place(|| { - tokio::runtime::Handle::current().block_on(async { - create_feature(&state_clone, param).await - }) + // Call async function from sync context using separate thread + let (tx, rx) = std::sync::mpsc::channel(); + std::thread::spawn(move || { + let rt = 
tokio::runtime::Builder::new_current_thread() + .enable_all().build().ok(); + let result = if let Some(rt) = rt { + rt.block_on(async { + create_feature(&state_clone, param).await + }) + } else { + Err("Failed to create runtime".into()) + }; + let _ = tx.send(result); }); + let result = rx.recv().unwrap_or(Err("Channel error".into())); match result { Ok(feature) => Ok(Dynamic::from(feature.name)), @@ -470,10 +491,6 @@ pub fn new_feature_keyword( ) .expect("valid syntax registration"); } - -// Register in botserver/src/basic/keywords/mod.rs -pub mod new_feature; -pub use new_feature::new_feature_keyword; ``` ### Step 4: Test the Feature @@ -880,22 +897,44 @@ Continue on gb/ workspace. Follow AGENTS.md strictly: git push origin main ``` -2. **Wait for CI** — build takes ~3-4 minutes. Check status: +2. **Wait for CI programmatically** — poll Forgejo API until build completes: ```bash - # Via web: https://alm.pragmatismo.com.br/GeneralBots/botserver/actions - # Or check binary timestamp after ~4 min sleep: + # ALM is at http://<alm-host>:4747 (port 4747, NOT 3000) + # The runner is in container alm-ci, registered with token from DB + + # Method 1: Poll API for latest workflow run status + ALM_URL="http://<alm-host>:4747" + REPO="GeneralBots/BotServer" + MAX_WAIT=600 # 10 minutes + ELAPSED=0 + + while [ $ELAPSED -lt $MAX_WAIT ]; do + STATUS=$(curl -sf "$ALM_URL/api/v1/repos/$REPO/actions/runs?per_page=1" | python3 -c "import sys,json; runs=json.load(sys.stdin); print(runs[0]['status'] if runs else 'unknown')") + if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failure" ] || [ "$STATUS" = "cancelled" ]; then + echo "CI finished with status: $STATUS" + break + fi + echo "CI status: $STATUS (waiting ${ELAPSED}s...)" + sleep 15 + ELAPSED=$((ELAPSED + 15)) + done + + # Method 2: Check runner logs directly + ssh <user>@<prod-host> "sudo incus exec alm-ci -- tail -20 /opt/gbo/logs/forgejo-runner.log" + + # Method 3: Check binary timestamp after CI completes sleep 240 - ssh -o StrictHostKeyChecking=no -o 
ConnectTimeout=10 administrator@prod-host \ + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 <user>@<prod-host> \ "sudo incus exec system -- stat -c '%y' /opt/gbo/bin/botserver" ``` 3. **Restart in prod** — after binary updates: ```bash - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 administrator@prod-host \ + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 <user>@<prod-host> \ "sudo incus exec system -- pkill -f botserver || true" sleep 2 - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 administrator@prod-host \ - "sudo incus exec system -- bash -c 'cd /opt/gbo/bin && sudo -u gbuser RUST_LOG=info ./botserver --noconsole > /opt/gbo/logs/botserver-output.log 2>&1 &'" + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 <user>@<prod-host> \ + "sudo incus exec system -- bash -c 'cd /opt/gbo/bin && RUST_LOG=info nohup ./botserver --noconsole > /opt/gbo/logs/stdout.log 2>&1 &'" ``` 4. **Verify deployment**: @@ -903,28 +942,390 @@ Continue on gb/ workspace. Follow AGENTS.md strictly: # Wait for bootstrap (~2 min) sleep 120 # Check health - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 administrator@prod-host \ + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 <user>@<prod-host> \ "sudo incus exec system -- curl -s -o /dev/null -w '%{http_code}' http://localhost:8080/health" # Check logs - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 administrator@prod-host \ - "sudo incus exec system -- tail -30 /opt/gbo/logs/botserver-output.log" + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 <user>@<prod-host> \ + "sudo incus exec system -- tail -30 /opt/gbo/logs/stdout.log" ``` -### Troubleshooting +### Production Container Architecture + +| Container | Service | Port | Notes | +|-----------|---------|------|-------| +| system | BotServer | 8080 | Main API server | +| vault | Vault | 8200 | Secrets management (isolated) | +| tables | PostgreSQL | 5432 | Database | +| cache | Valkey | 6379 | Cache | +| drive | MinIO | 9100 | Object storage | +| directory | Zitadel | 9000 | Identity provider | +| meet | LiveKit | 7880 | Video 
conferencing | +| vectordb | Qdrant | 6333 | Vector database | +| llm | llama.cpp | 8081 | Local LLM | +| email | Stalwart | 25/587 | Mail server | +| alm | Forgejo | 4747 | Git server (NOT 3000!) | +| alm-ci | Forgejo Runner | - | CI runner | +| proxy | Caddy | 80/443 | Reverse proxy | + +**Important:** ALM (Forgejo) listens on port **4747**, not 3000. The runner token is stored in the `action_runner_token` table in the `PROD-ALM` database. + +### CI Runner Troubleshooting | Symptom | Cause | Fix | |---------|-------|-----| -| `valkey-cli ping` hangs indefinitely | Valkey requires password auth | Install `nc` or `ss` in container for health checks | -| `nc: command not found` | Prod container lacks netcat | `sudo incus exec system -- apt-get install -y netcat-openbsd` | -| Cache connection timeout | iptables DROP rule on port 6379 | `sudo incus exec system -- iptables -I INPUT -i lo -j ACCEPT` | -| `AUTH failed` on Valkey | Valkey runs without password but Vault has one | Code tries no-password URL first, then with password | -| `Cannot start a runtime from within a runtime` | `block_on()` called from async context | Use `.await` directly, never `runtime.block_on()` in async functions | -| Secret not found in Vault | Path mismatch between seeding and reading | Seeding: `secret/gbo/cache`, Reading: `gbo/cache` (kv2 prepends `secret/`) | -| CI completed but binary not updated | Deploy step fails silently (SSH/transfer issue) | Build locally and transfer, or re-push to trigger CI again | +| Runner not connecting | Wrong ALM port (3000 vs 4747) | Use port 4747 in runner registration | +| `registration file not found` | `.runner` file missing or wrong format | Re-register: `forgejo-runner register --instance http://<alm-host>:4747 --token <token> --name gbo --labels ubuntu-latest:docker://node:20-bookworm --no-interactive` | +| `unsupported protocol scheme` | `.runner` file has wrong JSON format | Delete `.runner` and re-register | +| `connection refused` to ALM | iptables blocking or 
ALM not running | Check `sudo incus exec alm -- ss -tlnp \| grep 4747` | +| CI not picking up jobs | Runner not registered or labels mismatch | Check runner labels match workflow `runs-on` field | -### Critical Paths in Vault +--- -- **Seeding writes to**: `secret/gbo/{service}` (e.g., `secret/gbo/cache`) -- **Code reads via**: `SecretPaths::{SERVICE}` which maps to `gbo/{service}` -- **kv2::read** prepends `secret/` automatically and looks up `secret/data/gbo/{service}` -- **All paths must match**: `gbo/cache`, `gbo/drive`, `gbo/tables`, `gbo/directory`, `gbo/llm`, `gbo/meet`, `gbo/alm`, `gbo/vectordb`, `gbo/encryption`, `gbo/email` +## 🖥️ Production Operations Guide + +### ⚠️ CRITICAL SAFETY RULES +1. **NEVER modify iptables rules without explicit confirmation** — always confirm the exact rules, source IPs, ports, and destinations before applying +2. **NEVER touch the PROD project without asking first** — no changes to production services, configs, or containers without user approval +3. **ALWAYS backup files to `/tmp` before editing** — e.g. 
`cp /path/to/file /tmp/$(basename /path/to/file).bak-$(date +%Y%m%d%H%M%S)` + +### Infrastructure Overview +- **Host OS:** Ubuntu LTS +- **Container engine:** Incus (LXC-based) +- **Base path:** `/opt/gbo/` (General Bots Operations) +- **Data path:** `/opt/gbo/data` — shared data, configs, bot definitions +- **Bin path:** `/opt/gbo/bin` — compiled binaries +- **Conf path:** `/opt/gbo/conf` — service configurations +- **Log path:** `/opt/gbo/logs` — application logs + +### Container Architecture + +| Role | Service | Typical Port | Notes | +|------|---------|-------------|-------| +| **dns** | CoreDNS | 53 | DNS resolution, zone files in `/opt/gbo/data` | +| **proxy** | Caddy | 80/443 | Reverse proxy, TLS termination | +| **tables** | PostgreSQL | 5432 | Primary database | +| **email** | Stalwart | 993/465/587 | Mail server (IMAPS, SMTPS, Submission) | +| **system** | BotServer + Valkey | 8080/6379 | Main API + cache | +| **webmail** | Roundcube | behind proxy | PHP-FPM webmail frontend | +| **alm** | Forgejo | 4747 | Git/ALM server (NOT 3000!) 
| +| **alm-ci** | Forgejo Runner | - | CI/CD runner | +| **drive** | MinIO | 9000/9100 | Object storage | +| **table-editor** | NocoDB | behind proxy | Database UI, connects to tables | +| **vault** | Vault | 8200 | Secrets management | +| **directory** | Zitadel | 9000 | Identity provider | +| **meet** | LiveKit | 7880 | Video conferencing | +| **vectordb** | Qdrant | 6333 | Vector database | +| **llm** | llama.cpp | 8081 | Local LLM inference | + +### Container Management + +```bash +# List all containers +sudo incus list + +# Start/Stop/Restart +sudo incus start <container> +sudo incus stop <container> +sudo incus restart <container> + +# Exec into container +sudo incus exec <container> -- bash + +# View container logs +sudo incus info <container> --show-log +sudo incus console <container> --show-log + +# File operations +sudo incus file pull <container>/path/to/file /local/dest +sudo incus file push /local/src <container>/path/to/dest + +# Create snapshot before changes +sudo incus snapshot create <container> pre-change-$(date +%Y%m%d%H%M%S) +``` + +### Service Management (inside container) + +```bash +# Check if process is running +sudo incus exec <container> -- pgrep -a <process> + +# Restart service (systemd) +sudo incus exec <container> -- systemctl restart <service> + +# Follow logs +sudo incus exec <container> -- journalctl -u <service> -f + +# Check listening ports +sudo incus exec <container> -- ss -tlnp +``` + +### Quick Health Check + +```bash +# Check all containers status +sudo incus list --format csv + +# Quick service check across containers +for c in dns proxy tables system email webmail alm alm-ci drive table-editor; do + echo -n "$c: " + sudo incus exec $c -- pgrep -a $(case $c in + dns) echo "coredns";; + proxy) echo "caddy";; + tables) echo "postgres";; + system) echo "botserver";; + email) echo "stalwart";; + webmail) echo "php-fpm";; + alm) echo "forgejo";; + alm-ci) echo "runner";; + drive) echo "minio";; + table-editor) echo "nocodb";; + esac) >/dev/null && echo OK || echo FAIL +done +``` + +### Network & NAT + +#### Port Forwarding Pattern +External ports on the host are DNAT'd to container IPs via iptables. 
NAT rules live in `/etc/iptables.rules`. + +**Critical rule pattern** — always use the external interface (`-i <external-iface>`) to avoid loopback issues: +``` +-A PREROUTING -i <external-iface> -p tcp --dport <port> -j DNAT --to-destination <container-ip>:<port> +``` + +#### Typical Port Map + +| External | Service | Notes | +|----------|---------|-------| +| 53 | DNS | Public DNS resolution | +| 80/443 | HTTP/HTTPS | Via Caddy proxy | +| 5432 | PostgreSQL | Restricted access only | +| 993 | IMAPS | Secure email retrieval | +| 465 | SMTPS | Secure email sending | +| 587 | SMTP Submission | STARTTLS | +| 25 | SMTP | Often blocked by ISPs | +| 4747 | Forgejo | Behind proxy | +| 9000 | MinIO API | Internal only | +| 8200 | Vault | Isolated | + +#### Network Diagnostics + +```bash +# Check NAT rules +sudo iptables -t nat -L -n | grep DNAT + +# Test connectivity from container +sudo incus exec <container> -- ping -c 3 8.8.8.8 + +# Test DNS resolution +sudo incus exec <container> -- dig <domain> + +# Test port connectivity +nc -zv <host> <port> +``` + +### Key Service Operations + +#### DNS (CoreDNS) +- **Config:** `/opt/gbo/conf/Corefile` +- **Zones:** `/opt/gbo/data/<domain>.zone` +- **Test:** `dig @<dns-ip> <domain>` + +#### Database (PostgreSQL) +- **Data:** `/opt/gbo/data` +- **Backup:** `pg_dump -U postgres -F c -f /tmp/backup.dump <dbname>` +- **Restore:** `pg_restore -U postgres -d <dbname> /tmp/backup.dump` + +#### Email (Stalwart) +- **Config:** `/opt/gbo/conf/config.toml` +- **DKIM:** Check TXT records for `selector._domainkey.<domain>` +- **Webmail:** Behind proxy +- **Admin:** Accessible via configured admin port + +**Recovery from crash:** +```bash +# Check if service starts with config validation +sudo incus exec email -- /opt/gbo/bin/stalwart -c /opt/gbo/conf/config.toml --help + +# Check error logs +sudo incus exec email -- cat /opt/gbo/logs/stderr.log + +# Restore from snapshot if config corrupted +sudo incus snapshot list email +sudo incus copy email/<snapshot> email-temp +sudo incus start email-temp +sudo incus file pull email-temp/opt/gbo/conf/config.toml /tmp/config.toml +sudo incus file push /tmp/config.toml 
email/opt/gbo/conf/config.toml +``` + +#### Proxy (Caddy) +- **Config:** `/opt/gbo/conf/config` +- **Backup before edit:** `cp /opt/gbo/conf/config /opt/gbo/conf/config.bak-$(date +%Y%m%d)` +- **Validate:** `caddy validate --config /opt/gbo/conf/config` +- **Reload:** `caddy reload --config /opt/gbo/conf/config` + +#### Storage (MinIO) +- **Console:** Behind proxy +- **Internal API:** http://<drive-ip>:9000 +- **Data:** `/opt/gbo/data` + +#### Bot System (system) +- **Service:** BotServer + Valkey (Redis-compatible) +- **Binary:** `/opt/gbo/bin/botserver` +- **Valkey:** port 6379 + +#### Git/ALM (Forgejo) +- **Port:** 4747 (NOT 3000!) +- **Behind proxy:** Access via configured hostname +- **CI Runner:** Separate container, registered with token from DB + +#### CI/CD (Forgejo Runner) +- **Config:** `/opt/gbo/bin/config.yaml` +- **Init:** `/etc/systemd/system/alm-ci-runner.service` (runs as `gbuser`, NOT root) +- **Logs:** `/opt/gbo/logs/out.log`, `/opt/gbo/logs/err.log` +- **Auto-start:** Via systemd (enabled) +- **Runner user:** `gbuser` (uid 1000) — all `/opt/gbo/` files owned by `gbuser:gbuser` +- **sccache:** Installed at `/usr/local/bin/sccache`, configured via `RUSTC_WRAPPER=sccache` in workflow +- **Workspace:** `/opt/gbo/data/` (NOT `/opt/gbo/ci/`) +- **Cargo cache:** `/home/gbuser/.cargo/` (registry + git db) +- **Rustup:** `/home/gbuser/.rustup/` +- **SSH keys:** `/home/gbuser/.ssh/id_ed25519` (for deploy to system container) +- **Deploy mechanism:** CI builds binary → tar+gzip via SSH → `/opt/gbo/bin/botserver` on system container + +### Backup & Recovery + +#### Snapshot Recovery +```bash +# List snapshots +sudo incus snapshot list <container> + +# Restore from snapshot +sudo incus copy <container>/<snapshot> <container>-restored +sudo incus start <container>-restored + +# Get files from snapshot without starting +sudo incus file pull <container>/<snapshot>/path/to/file . 
+``` + +#### Backup Scripts +- Host config backup: `/opt/gbo/bin/backup-local-host.sh` +- Remote backup to S3: `/opt/gbo/bin/backup-remote.sh` + +### Troubleshooting + +#### Container Won't Start +```bash +# Check status +sudo incus list +sudo incus info <container> + +# Check logs +sudo incus info <container> --show-log + +# Try starting with verbose +sudo incus start <container> -v +``` + +#### Service Not Running +```bash +# Find process +sudo incus exec <container> -- pgrep -a <process> + +# Check listening ports +sudo incus exec <container> -- ss -tlnp | grep <port> + +# Check application logs +sudo incus exec <container> -- tail -50 /opt/gbo/logs/stderr.log +``` + +#### Email Delivery Issues +```bash +# Check mail server is running +sudo incus exec email -- pgrep -a stalwart + +# Check IMAP/SMTP ports +nc -zv <mail-host> 993 +nc -zv <mail-host> 465 +nc -zv <mail-host> 587 + +# Check DKIM DNS records +dig TXT <selector>._domainkey.<domain> + +# Check mail logs +sudo incus exec email -- tail -100 /opt/gbo/logs/email.log +``` + +### Maintenance + +#### Update Container +```bash +# Stop container +sudo incus stop <container> + +# Create snapshot backup +sudo incus snapshot create <container> pre-update-$(date +%Y%m%d) + +# Start container again (exec requires a running container) +sudo incus start <container> + +# Update packages (run both commands inside the container) +sudo incus exec <container> -- bash -c 'apt update && apt upgrade -y' + +# Restart +sudo incus restart <container> +``` + +#### Disk Space Management +```bash +# Check host disk usage +df -h / + +# Check btrfs pool (if applicable) +sudo btrfs filesystem df /var/lib/incus + +# Clean old logs in container +sudo incus exec <container> -- find /opt/gbo/logs -name "*.log.*" -mtime +7 -delete +``` + +### Container Tricks & Optimizations + +#### Resource Limits +```bash +# Set CPU limit +sudo incus config set <container> limits.cpu 2 + +# Set memory limit +sudo incus config set <container> limits.memory 4GiB + +# Set disk limit +sudo incus config device set <container> root size 20GiB +``` + +#### Profile Management +```bash +# List profiles +sudo incus profile list + +# Apply profile to container +sudo incus profile add <container> <profile> + +# Clone container for testing +sudo incus copy <container> <container>-clone --ephemeral +``` + +#### Network Optimization +```bash +# Add static DHCP-like assignment +sudo incus config 
device add <container> eth0 nic nictype=bridged parent=<bridge> + +# Set a static IPv4 address for container (raw LXC config) +sudo incus config set <container> raw.lxc "lxc.net.0.ipv4.address=<ip>/<prefix>" +``` + +#### Quick Container Cloning for Testing +```bash +# Snapshot and clone for safe testing +sudo incus snapshot create <container> test-base +sudo incus copy <container>/test-base <container>-test +sudo incus start <container>-test +# ... test safely ... +sudo incus stop <container>-test +sudo incus delete <container>-test +```