feat(host-agent): Phase 1a process supervision — instance start/stop/restart/status + push state events
Per-instance ProcessSupervisor: tokio child spawn with proper arg list
(fixes Go's naive space-splitting), graceful SIGTERM with 30s budget
then force kill, monitor task classifying ordered-stop vs crash (exit
code captured), watch-channel state observable everywhere. Instance cmd
channel live on corrosion.{license}.{instance}.cmd (start/stop/restart/
status) with state events pushed on {instance}.status (keep-latest
semantics, documented). Heartbeats now carry live process state +
uptime per instance. Crate restructured lib+bin for integration tests.
Verified: 5 integration tests with real OS processes (lifecycle, crash
exit-code, restart recovery, unmanaged rejection, clean spawn failure)
+ live-NATS contract test (request-reply roundtrips, double-start
rejection, push events, heartbeat state) — all green.
Known limitation (documented): no PID adoption yet — after an agent restart,
a still-running game process is orphaned and reported as 'stopped' until panel restart.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -4,14 +4,9 @@
|
||||
//! connectivity prober, host command channel. Process control, file ops, and
|
||||
//! game adapters arrive in Phase 1+ (see PROTOCOL.md).
|
||||
|
||||
mod agent;
|
||||
mod bus;
|
||||
mod config;
|
||||
mod hostcmd;
|
||||
mod prober;
|
||||
mod subjects;
|
||||
mod telemetry;
|
||||
mod version;
|
||||
use corrosion_host_agent::{
|
||||
agent, bus, config, hostcmd, instancecmd, prober, process, subjects, telemetry, version,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, Subcommand};
|
||||
@@ -96,11 +91,18 @@ async fn run(settings: config::Settings) -> Result<()> {
|
||||
|
||||
let nats = bus::connect(&settings).await?;
|
||||
|
||||
let supervisors = settings
|
||||
.instances
|
||||
.iter()
|
||||
.map(|inst| (inst.id.clone(), process::ProcessSupervisor::new(inst)))
|
||||
.collect();
|
||||
|
||||
let agent = Arc::new(Agent {
|
||||
cfg: settings,
|
||||
nats,
|
||||
started: Instant::now(),
|
||||
last_probe: RwLock::new(None),
|
||||
supervisors,
|
||||
shutdown: CancellationToken::new(),
|
||||
});
|
||||
|
||||
@@ -115,6 +117,21 @@ async fn run(settings: config::Settings) -> Result<()> {
|
||||
}
|
||||
}));
|
||||
}
|
||||
for sup in agent.supervisors.values() {
|
||||
{
|
||||
let agent = agent.clone();
|
||||
let sup = sup.clone();
|
||||
handles.push(tokio::spawn(async move {
|
||||
if let Err(e) = instancecmd::run(agent, sup).await {
|
||||
tracing::error!("instance command handler failed: {e:#}");
|
||||
}
|
||||
}));
|
||||
}
|
||||
handles.push(tokio::spawn(instancecmd::publish_state_changes(
|
||||
agent.clone(),
|
||||
sup.clone(),
|
||||
)));
|
||||
}
|
||||
|
||||
wait_for_shutdown_signal().await;
|
||||
tracing::info!("shutdown signal received");
|
||||
|
||||
Reference in New Issue
Block a user