//! Corrosion Host Agent — multi-game ops runtime. //! //! Phase 0: NATS connectivity, real host telemetry, multi-instance config, //! connectivity prober, host command channel. Process control, file ops, and //! game adapters arrive in Phase 1+ (see PROTOCOL.md). use corrosion_host_agent::{ agent, bus, config, docker_compose, filemanager, hostcmd, instancecmd, prober, process, service, subjects, supervisor, telemetry, version, }; use anyhow::{Context, Result}; use clap::{Parser, Subcommand}; use std::path::PathBuf; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; use crate::agent::Agent; #[derive(Parser)] #[command(name = "corrosion-host-agent", version = version::VERSION, about)] struct Cli { /// Path to agent.toml (default: /etc/corrosion/agent.toml on Linux, /// C:\ProgramData\Corrosion\agent.toml on Windows) #[arg(long, short = 'c')] config: Option, #[command(subcommand)] command: Option, } #[derive(Subcommand)] enum Command { /// Validate the config file and exit. Check, /// Print full version (semver, git hash, build timestamp) and exit. Version, /// Install as a systemd service and start it (Linux; requires root). Install, /// Stop and remove the systemd service (Linux; requires root). Uninstall, } fn main() -> Result<()> { let cli = Cli::parse(); let config_path = cli.config.unwrap_or_else(config::default_config_path); match cli.command { Some(Command::Version) => { println!("corrosion-host-agent {}", version::long()); Ok(()) } Some(Command::Check) => { let settings = config::load(&config_path)?; println!( "config ok: license {}, {} instance(s), nats {}", settings.license_id, settings.instances.len(), settings.nats_url ); Ok(()) } Some(Command::Install) => service::install(&config_path), Some(Command::Uninstall) => service::uninstall(), None => { let settings = config::load(&config_path)?; init_logging(&settings.log_level); tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .context("building tokio runtime")? .block_on(run(settings)) } } } fn init_logging(level: &str) { let filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(level)); tracing_subscriber::fmt() .with_env_filter(filter) .with_target(false) .init(); } async fn run(settings: config::Settings) -> Result<()> { tracing::info!( "corrosion-host-agent {} starting: license {}, {} instance(s)", version::long(), settings.license_id, settings.instances.len() ); for inst in &settings.instances { tracing::info!(" instance '{}' ({}) at {}", inst.id, inst.game, inst.root.display()); } let nats = bus::connect(&settings).await?; // Per-game supervisor factory: container-managed games (Dune) get a // docker-compose supervisor; everything else is a spawned-process // supervisor. Both satisfy the `Supervisor` trait, so the rest of the agent // is game-agnostic. let supervisors: std::collections::HashMap> = settings .instances .iter() .map(|inst| { let sup: Arc = match inst.game.as_str() { "dune" => docker_compose::DockerComposeSupervisor::new(inst), _ => process::ProcessSupervisor::new(inst), }; (inst.id.clone(), sup) }) .collect(); let agent = Arc::new(Agent { cfg: settings, nats, started: Instant::now(), last_probe: RwLock::new(None), supervisors, shutdown: CancellationToken::new(), }); let mut handles = Vec::new(); handles.push(tokio::spawn(telemetry::run(agent.clone()))); handles.push(tokio::spawn(prober::run_loop(agent.clone()))); { let agent = agent.clone(); handles.push(tokio::spawn(async move { if let Err(e) = hostcmd::run(agent).await { tracing::error!("host command handler failed: {e:#}"); } })); } for (instance_id, sup) in &agent.supervisors { { let agent = agent.clone(); let sup = sup.clone(); handles.push(tokio::spawn(async move { if let Err(e) = instancecmd::run(agent, sup).await { tracing::error!("instance command handler failed: {e:#}"); } })); } handles.push(tokio::spawn(instancecmd::publish_state_changes( agent.clone(), sup.clone(), ))); // File manager: one handler task per instance, jailed to root. { let agent = agent.clone(); let inst_cfg = agent .cfg .instances .iter() .find(|i| &i.id == instance_id) .cloned(); if let Some(cfg) = inst_cfg { let id = instance_id.clone(); handles.push(tokio::spawn(async move { if let Err(e) = filemanager::run(agent, id, cfg.root).await { tracing::error!("file manager handler failed: {e:#}"); } })); } } } wait_for_shutdown_signal().await; tracing::info!("shutdown signal received"); agent.shutdown.cancel(); // Best-effort offline beacon so the panel flips to offline immediately // instead of waiting out the heartbeat staleness window. let beacon = subjects::host_going_offline(&agent.cfg.license_id); let _ = tokio::time::timeout( Duration::from_millis(500), agent.nats.publish(beacon, "{}".into()), ) .await; match tokio::time::timeout( Duration::from_secs(10), futures::future::join_all(handles), ) .await { Ok(_) => tracing::info!("all subsystems stopped cleanly"), Err(_) => tracing::warn!("shutdown timeout: some subsystems did not stop within 10s"), } let _ = agent.nats.flush().await; tracing::info!("corrosion-host-agent stopped"); Ok(()) } async fn wait_for_shutdown_signal() { #[cfg(unix)] { use tokio::signal::unix::{signal, SignalKind}; let mut sigterm = match signal(SignalKind::terminate()) { Ok(s) => s, Err(e) => { tracing::error!("SIGTERM handler failed: {e}; falling back to ctrl-c only"); let _ = tokio::signal::ctrl_c().await; return; } }; tokio::select! { _ = tokio::signal::ctrl_c() => {} _ = sigterm.recv() => {} } } #[cfg(not(unix))] { let _ = tokio::signal::ctrl_c().await; } }