For Saturday's Ubuntu host + Linux VM: 'corrosion-host-agent install' writes a systemd unit (Type=simple — the agent already handles SIGTERM cleanly), daemon-reloads, and enables+starts the service; 'uninstall' reverses it. - new service.rs: pure unit_file_contents() generator (unit-tested) + Linux install/uninstall via systemctl; non-Linux returns a clear 'Linux only' error (Windows SCM is the follow-up). - ExecStart honors the resolved --config path (default or explicit). - Runs as root: the agent supervises game processes + their files, needs broad filesystem access. cargo check + service unit test green. Tag agent-v2.0.0-alpha.11 -> CI signs -> CDN /host-agent/alpha/. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
221 lines
7.1 KiB
Rust
221 lines
7.1 KiB
Rust
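//! Corrosion Host Agent — multi-game ops runtime.
//!
//! Phase 0: NATS connectivity, real host telemetry, multi-instance config,
//! connectivity prober, host command channel. Process control, file ops, and
//! game adapters arrive in Phase 1+ (see PROTOCOL.md).
//!
//! NOTE(review): this file already wires up per-instance supervisors, instance
//! command handlers and the file manager, so the phase summary above looks
//! stale — confirm against PROTOCOL.md.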
|
|
|
|
use corrosion_host_agent::{
|
|
agent, bus, config, docker_compose, filemanager, hostcmd, instancecmd, prober, process,
|
|
service, subjects, supervisor, telemetry, version,
|
|
};
|
|
|
|
use anyhow::{Context, Result};
|
|
use clap::{Parser, Subcommand};
|
|
use std::path::PathBuf;
|
|
use std::sync::Arc;
|
|
use std::time::{Duration, Instant};
|
|
use tokio::sync::RwLock;
|
|
use tokio_util::sync::CancellationToken;
|
|
|
|
use crate::agent::Agent;
|
|
|
|
#[derive(Parser)]
|
|
#[command(name = "corrosion-host-agent", version = version::VERSION, about)]
|
|
struct Cli {
|
|
/// Path to agent.toml (default: /etc/corrosion/agent.toml on Linux,
|
|
/// C:\ProgramData\Corrosion\agent.toml on Windows)
|
|
#[arg(long, short = 'c')]
|
|
config: Option<PathBuf>,
|
|
|
|
#[command(subcommand)]
|
|
command: Option<Command>,
|
|
}
|
|
|
|
#[derive(Subcommand)]
|
|
enum Command {
|
|
/// Validate the config file and exit.
|
|
Check,
|
|
/// Print full version (semver, git hash, build timestamp) and exit.
|
|
Version,
|
|
/// Install as a systemd service and start it (Linux; requires root).
|
|
Install,
|
|
/// Stop and remove the systemd service (Linux; requires root).
|
|
Uninstall,
|
|
}
|
|
|
|
fn main() -> Result<()> {
|
|
let cli = Cli::parse();
|
|
let config_path = cli.config.unwrap_or_else(config::default_config_path);
|
|
|
|
match cli.command {
|
|
Some(Command::Version) => {
|
|
println!("corrosion-host-agent {}", version::long());
|
|
Ok(())
|
|
}
|
|
Some(Command::Check) => {
|
|
let settings = config::load(&config_path)?;
|
|
println!(
|
|
"config ok: license {}, {} instance(s), nats {}",
|
|
settings.license_id,
|
|
settings.instances.len(),
|
|
settings.nats_url
|
|
);
|
|
Ok(())
|
|
}
|
|
Some(Command::Install) => service::install(&config_path),
|
|
Some(Command::Uninstall) => service::uninstall(),
|
|
None => {
|
|
let settings = config::load(&config_path)?;
|
|
init_logging(&settings.log_level);
|
|
tokio::runtime::Builder::new_multi_thread()
|
|
.enable_all()
|
|
.build()
|
|
.context("building tokio runtime")?
|
|
.block_on(run(settings))
|
|
}
|
|
}
|
|
}
|
|
|
|
fn init_logging(level: &str) {
|
|
let filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
|
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(level));
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(filter)
|
|
.with_target(false)
|
|
.init();
|
|
}
|
|
|
|
async fn run(settings: config::Settings) -> Result<()> {
|
|
tracing::info!(
|
|
"corrosion-host-agent {} starting: license {}, {} instance(s)",
|
|
version::long(),
|
|
settings.license_id,
|
|
settings.instances.len()
|
|
);
|
|
for inst in &settings.instances {
|
|
tracing::info!(" instance '{}' ({}) at {}", inst.id, inst.game, inst.root.display());
|
|
}
|
|
|
|
let nats = bus::connect(&settings).await?;
|
|
|
|
// Per-game supervisor factory: container-managed games (Dune) get a
|
|
// docker-compose supervisor; everything else is a spawned-process
|
|
// supervisor. Both satisfy the `Supervisor` trait, so the rest of the agent
|
|
// is game-agnostic.
|
|
let supervisors: std::collections::HashMap<String, Arc<dyn supervisor::Supervisor>> = settings
|
|
.instances
|
|
.iter()
|
|
.map(|inst| {
|
|
let sup: Arc<dyn supervisor::Supervisor> = match inst.game.as_str() {
|
|
"dune" => docker_compose::DockerComposeSupervisor::new(inst),
|
|
_ => process::ProcessSupervisor::new(inst),
|
|
};
|
|
(inst.id.clone(), sup)
|
|
})
|
|
.collect();
|
|
|
|
let agent = Arc::new(Agent {
|
|
cfg: settings,
|
|
nats,
|
|
started: Instant::now(),
|
|
last_probe: RwLock::new(None),
|
|
supervisors,
|
|
shutdown: CancellationToken::new(),
|
|
});
|
|
|
|
let mut handles = Vec::new();
|
|
handles.push(tokio::spawn(telemetry::run(agent.clone())));
|
|
handles.push(tokio::spawn(prober::run_loop(agent.clone())));
|
|
{
|
|
let agent = agent.clone();
|
|
handles.push(tokio::spawn(async move {
|
|
if let Err(e) = hostcmd::run(agent).await {
|
|
tracing::error!("host command handler failed: {e:#}");
|
|
}
|
|
}));
|
|
}
|
|
for (instance_id, sup) in &agent.supervisors {
|
|
{
|
|
let agent = agent.clone();
|
|
let sup = sup.clone();
|
|
handles.push(tokio::spawn(async move {
|
|
if let Err(e) = instancecmd::run(agent, sup).await {
|
|
tracing::error!("instance command handler failed: {e:#}");
|
|
}
|
|
}));
|
|
}
|
|
handles.push(tokio::spawn(instancecmd::publish_state_changes(
|
|
agent.clone(),
|
|
sup.clone(),
|
|
)));
|
|
// File manager: one handler task per instance, jailed to root.
|
|
{
|
|
let agent = agent.clone();
|
|
let inst_cfg = agent
|
|
.cfg
|
|
.instances
|
|
.iter()
|
|
.find(|i| &i.id == instance_id)
|
|
.cloned();
|
|
if let Some(cfg) = inst_cfg {
|
|
let id = instance_id.clone();
|
|
handles.push(tokio::spawn(async move {
|
|
if let Err(e) = filemanager::run(agent, id, cfg.root).await {
|
|
tracing::error!("file manager handler failed: {e:#}");
|
|
}
|
|
}));
|
|
}
|
|
}
|
|
}
|
|
|
|
wait_for_shutdown_signal().await;
|
|
tracing::info!("shutdown signal received");
|
|
agent.shutdown.cancel();
|
|
|
|
// Best-effort offline beacon so the panel flips to offline immediately
|
|
// instead of waiting out the heartbeat staleness window.
|
|
let beacon = subjects::host_going_offline(&agent.cfg.license_id);
|
|
let _ = tokio::time::timeout(
|
|
Duration::from_millis(500),
|
|
agent.nats.publish(beacon, "{}".into()),
|
|
)
|
|
.await;
|
|
|
|
match tokio::time::timeout(
|
|
Duration::from_secs(10),
|
|
futures::future::join_all(handles),
|
|
)
|
|
.await
|
|
{
|
|
Ok(_) => tracing::info!("all subsystems stopped cleanly"),
|
|
Err(_) => tracing::warn!("shutdown timeout: some subsystems did not stop within 10s"),
|
|
}
|
|
|
|
let _ = agent.nats.flush().await;
|
|
tracing::info!("corrosion-host-agent stopped");
|
|
Ok(())
|
|
}
|
|
|
|
async fn wait_for_shutdown_signal() {
|
|
#[cfg(unix)]
|
|
{
|
|
use tokio::signal::unix::{signal, SignalKind};
|
|
let mut sigterm = match signal(SignalKind::terminate()) {
|
|
Ok(s) => s,
|
|
Err(e) => {
|
|
tracing::error!("SIGTERM handler failed: {e}; falling back to ctrl-c only");
|
|
let _ = tokio::signal::ctrl_c().await;
|
|
return;
|
|
}
|
|
};
|
|
tokio::select! {
|
|
_ = tokio::signal::ctrl_c() => {}
|
|
_ = sigterm.recv() => {}
|
|
}
|
|
}
|
|
#[cfg(not(unix))]
|
|
{
|
|
let _ = tokio::signal::ctrl_c().await;
|
|
}
|
|
}
|