feat(host-agent): Phase 1a process supervision — instance start/stop/restart/status + push state events
Per-instance ProcessSupervisor: tokio child spawn with proper arg list
(fixes Go's naive space-splitting), graceful SIGTERM with 30s budget
then force kill, monitor task classifying ordered-stop vs crash (exit
code captured), watch-channel state observable everywhere. Instance cmd
channel live on corrosion.{license}.{instance}.cmd (start/stop/restart/
status) with state events pushed on {instance}.status (keep-latest
semantics, documented). Heartbeats now carry live process state +
uptime per instance. Crate restructured lib+bin for integration tests.
Verified: 5 integration tests with real OS processes (lifecycle, crash
exit-code, restart recovery, unmanaged rejection, clean spawn failure)
+ live-NATS contract test (request-reply roundtrips, double-start
rejection, push events, heartbeat state) — all green.
Known limitation (documented): no PID adoption yet — agent restart
orphans a running game process to 'stopped' until panel restart.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -65,9 +65,10 @@ pub struct InstanceInfo {
|
||||
pub game: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub label: Option<String>,
|
||||
/// Phase 0 states: `configured` (root exists) or `missing_root`.
|
||||
/// Phase 1 adds live process states (running/stopped/crashed).
|
||||
/// Process-managed: running/stopped/starting/stopping/crashed.
|
||||
/// Unmanaged (no executable configured): configured/missing_root.
|
||||
pub state: String,
|
||||
pub uptime_seconds: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub root_disk_free_mb: Option<u64>,
|
||||
}
|
||||
@@ -125,21 +126,30 @@ pub async fn collect(agent: &Agent, sys: &mut System) -> HeartbeatPayload {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let instances = agent
|
||||
.cfg
|
||||
.instances
|
||||
.iter()
|
||||
.map(|inst| {
|
||||
let exists = inst.root.exists();
|
||||
InstanceInfo {
|
||||
id: inst.id.clone(),
|
||||
game: inst.game.clone(),
|
||||
label: inst.label.clone(),
|
||||
state: if exists { "configured" } else { "missing_root" }.to_string(),
|
||||
root_disk_free_mb: disk_free_for_path(&disks, &inst.root),
|
||||
let mut instances = Vec::with_capacity(agent.cfg.instances.len());
|
||||
for inst in &agent.cfg.instances {
|
||||
let (state, uptime_seconds) = match agent.supervisors.get(&inst.id) {
|
||||
Some(sup) if !matches!(sup.state(), crate::process::InstanceState::Unmanaged) => {
|
||||
(sup.state().as_label().to_string(), sup.uptime_seconds().await)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
_ => {
|
||||
let exists = inst.root.exists();
|
||||
(
|
||||
if exists { "configured" } else { "missing_root" }.to_string(),
|
||||
0,
|
||||
)
|
||||
}
|
||||
};
|
||||
instances.push(InstanceInfo {
|
||||
id: inst.id.clone(),
|
||||
game: inst.game.clone(),
|
||||
label: inst.label.clone(),
|
||||
state,
|
||||
uptime_seconds,
|
||||
root_disk_free_mb: disk_free_for_path(&disks, &inst.root),
|
||||
});
|
||||
}
|
||||
let instances = instances;
|
||||
|
||||
HeartbeatPayload {
|
||||
schema: 2,
|
||||
|
||||
Reference in New Issue
Block a user