feat(host-agent): Phase 1a process supervision — instance start/stop/restart/status + push state events
Per-instance ProcessSupervisor: tokio child spawn with proper arg list
(fixes Go's naive space-splitting), graceful SIGTERM with 30s budget
then force kill, monitor task classifying ordered-stop vs crash (exit
code captured), watch-channel state observable everywhere. Instance cmd
channel live on corrosion.{license}.{instance}.cmd (start/stop/restart/
status) with state events pushed on {instance}.status (keep-latest
semantics, documented). Heartbeats now carry live process state +
uptime per instance. Crate restructured lib+bin for integration tests.
Verified: 5 integration tests with real OS processes (lifecycle, crash
exit-code, restart recovery, unmanaged rejection, clean spawn failure)
+ live-NATS contract test (request-reply roundtrips, double-start
rejection, push events, heartbeat state) — all green.
Known limitation (documented): no PID adoption yet — agent restart
orphans a running game process to 'stopped' until panel restart.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
//! Shared agent handle: every subsystem task holds an `Arc<Agent>`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::config::Settings;
|
||||
use crate::process::ProcessSupervisor;
|
||||
use crate::prober::ProbeReport;
|
||||
|
||||
pub struct Agent {
|
||||
@@ -12,5 +15,8 @@ pub struct Agent {
|
||||
pub nats: async_nats::Client,
|
||||
pub started: Instant,
|
||||
pub last_probe: RwLock<Option<ProbeReport>>,
|
||||
/// One supervisor per instance (unmanaged instances included — they
|
||||
/// report `unmanaged` state and reject process commands).
|
||||
pub supervisors: HashMap<String, Arc<ProcessSupervisor>>,
|
||||
pub shutdown: CancellationToken,
|
||||
}
|
||||
|
||||
@@ -49,6 +49,29 @@ pub struct InstanceConfig {
|
||||
/// Optional human label shown in the panel.
|
||||
#[serde(default)]
|
||||
pub label: Option<String>,
|
||||
/// Game server executable. Relative paths resolve against `root`.
|
||||
/// Absent = unmanaged instance (telemetry only, no process control).
|
||||
#[serde(default)]
|
||||
pub executable: Option<PathBuf>,
|
||||
/// Arguments as a proper list — no shell splitting, quoted values survive.
|
||||
#[serde(default)]
|
||||
pub args: Vec<String>,
|
||||
/// Working directory for the process. Defaults to the executable's directory.
|
||||
#[serde(default)]
|
||||
pub working_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
impl InstanceConfig {
|
||||
/// Absolute executable path, if this instance is process-managed.
|
||||
pub fn resolved_executable(&self) -> Option<PathBuf> {
|
||||
self.executable.as_ref().map(|exe| {
|
||||
if exe.is_absolute() {
|
||||
exe.clone()
|
||||
} else {
|
||||
self.root.join(exe)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Deserialize)]
|
||||
|
||||
147
corrosion-host-agent/src/instancecmd.rs
Normal file
147
corrosion-host-agent/src/instancecmd.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
//! Per-instance command channel + state-change events.
|
||||
//!
|
||||
//! Each process-managed instance gets a request-reply subscriber on
|
||||
//! `corrosion.{license}.{instance_id}.cmd` (funcs: start/stop/restart/status)
|
||||
//! and a publisher task that pushes every supervisor state change to
|
||||
//! `corrosion.{license}.{instance_id}.status` — the panel sees crashes when
|
||||
//! they happen, not when the next heartbeat ambles in.
|
||||
|
||||
use chrono::{SecondsFormat, Utc};
|
||||
use futures::StreamExt;
|
||||
use serde::Deserialize;
|
||||
use serde_json::json;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::agent::Agent;
|
||||
use crate::process::ProcessSupervisor;
|
||||
use crate::subjects;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct InstanceCommand {
|
||||
func: String,
|
||||
}
|
||||
|
||||
/// Forward every supervisor state change as a status event.
|
||||
pub async fn publish_state_changes(agent: Arc<Agent>, sup: Arc<ProcessSupervisor>) {
|
||||
let subject = subjects::instance_status(&agent.cfg.license_id, &sup.instance_id);
|
||||
let mut rx = sup.watch_state();
|
||||
let cancel = agent.shutdown.clone();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
changed = rx.changed() => {
|
||||
if changed.is_err() {
|
||||
break;
|
||||
}
|
||||
let state = rx.borrow().clone();
|
||||
let event = json!({
|
||||
"timestamp": Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true),
|
||||
"instance_id": sup.instance_id,
|
||||
"event": state,
|
||||
});
|
||||
match serde_json::to_vec(&event) {
|
||||
Ok(bytes) => {
|
||||
if let Err(e) = agent.nats.publish(subject.clone(), bytes.into()).await {
|
||||
tracing::warn!("status publish failed for '{}': {e}", sup.instance_id);
|
||||
}
|
||||
}
|
||||
Err(e) => tracing::error!("status serialize failed: {e}"),
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Request-reply command handler for one instance.
|
||||
pub async fn run(agent: Arc<Agent>, sup: Arc<ProcessSupervisor>) -> anyhow::Result<()> {
|
||||
let subject = subjects::instance_cmd(&agent.cfg.license_id, &sup.instance_id);
|
||||
let mut sub = agent.nats.subscribe(subject.clone()).await?;
|
||||
tracing::info!("instance command handler listening on {subject}");
|
||||
|
||||
let cancel = agent.shutdown.clone();
|
||||
loop {
|
||||
tokio::select! {
|
||||
msg = sub.next() => {
|
||||
match msg {
|
||||
Some(msg) => {
|
||||
let agent = agent.clone();
|
||||
let sup = sup.clone();
|
||||
tokio::spawn(async move { handle(agent, sup, msg).await });
|
||||
}
|
||||
None => {
|
||||
tracing::warn!("instance command subscription ended for '{}'", sup.instance_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("instance command handler stopping for '{}'", sup.instance_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle(agent: Arc<Agent>, sup: Arc<ProcessSupervisor>, msg: async_nats::Message) {
|
||||
let Some(reply) = msg.reply.clone() else {
|
||||
tracing::warn!("instance command without reply subject ignored");
|
||||
return;
|
||||
};
|
||||
|
||||
let response = match serde_json::from_slice::<InstanceCommand>(&msg.payload) {
|
||||
Ok(cmd) => dispatch(&sup, &cmd.func).await,
|
||||
Err(e) => json!({ "status": "error", "message": format!("invalid command payload: {e}") }),
|
||||
};
|
||||
|
||||
let bytes = match serde_json::to_vec(&response) {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
tracing::error!("response serialize failed: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
if let Err(e) = agent.nats.publish(reply, bytes.into()).await {
|
||||
tracing::warn!("response publish failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
async fn dispatch(sup: &Arc<ProcessSupervisor>, func: &str) -> serde_json::Value {
|
||||
let outcome = match func {
|
||||
"start" => sup.start().await.map(|_| "starting"),
|
||||
"stop" => sup.stop().await.map(|_| "stopped"),
|
||||
"restart" => sup.restart().await.map(|_| "restarted"),
|
||||
"status" => {
|
||||
return json!({
|
||||
"status": "success",
|
||||
"func": "status",
|
||||
"instance_id": sup.instance_id,
|
||||
"state": sup.state(),
|
||||
"uptime_seconds": sup.uptime_seconds().await,
|
||||
});
|
||||
}
|
||||
other => {
|
||||
return json!({
|
||||
"status": "error",
|
||||
"message": format!("unknown func '{other}' (supported: start, stop, restart, status)"),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
match outcome {
|
||||
Ok(result) => json!({
|
||||
"status": "success",
|
||||
"func": func,
|
||||
"instance_id": sup.instance_id,
|
||||
"result": result,
|
||||
"state": sup.state(),
|
||||
}),
|
||||
Err(e) => json!({
|
||||
"status": "error",
|
||||
"func": func,
|
||||
"instance_id": sup.instance_id,
|
||||
"message": format!("{e:#}"),
|
||||
}),
|
||||
}
|
||||
}
|
||||
13
corrosion-host-agent/src/lib.rs
Normal file
13
corrosion-host-agent/src/lib.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
//! Corrosion Host Agent library surface — modules are public so integration
|
||||
//! tests can drive subsystems (notably the process supervisor) directly.
|
||||
|
||||
pub mod agent;
|
||||
pub mod bus;
|
||||
pub mod config;
|
||||
pub mod hostcmd;
|
||||
pub mod instancecmd;
|
||||
pub mod prober;
|
||||
pub mod process;
|
||||
pub mod subjects;
|
||||
pub mod telemetry;
|
||||
pub mod version;
|
||||
@@ -4,14 +4,9 @@
|
||||
//! connectivity prober, host command channel. Process control, file ops, and
|
||||
//! game adapters arrive in Phase 1+ (see PROTOCOL.md).
|
||||
|
||||
mod agent;
|
||||
mod bus;
|
||||
mod config;
|
||||
mod hostcmd;
|
||||
mod prober;
|
||||
mod subjects;
|
||||
mod telemetry;
|
||||
mod version;
|
||||
use corrosion_host_agent::{
|
||||
agent, bus, config, hostcmd, instancecmd, prober, process, subjects, telemetry, version,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, Subcommand};
|
||||
@@ -96,11 +91,18 @@ async fn run(settings: config::Settings) -> Result<()> {
|
||||
|
||||
let nats = bus::connect(&settings).await?;
|
||||
|
||||
let supervisors = settings
|
||||
.instances
|
||||
.iter()
|
||||
.map(|inst| (inst.id.clone(), process::ProcessSupervisor::new(inst)))
|
||||
.collect();
|
||||
|
||||
let agent = Arc::new(Agent {
|
||||
cfg: settings,
|
||||
nats,
|
||||
started: Instant::now(),
|
||||
last_probe: RwLock::new(None),
|
||||
supervisors,
|
||||
shutdown: CancellationToken::new(),
|
||||
});
|
||||
|
||||
@@ -115,6 +117,21 @@ async fn run(settings: config::Settings) -> Result<()> {
|
||||
}
|
||||
}));
|
||||
}
|
||||
for sup in agent.supervisors.values() {
|
||||
{
|
||||
let agent = agent.clone();
|
||||
let sup = sup.clone();
|
||||
handles.push(tokio::spawn(async move {
|
||||
if let Err(e) = instancecmd::run(agent, sup).await {
|
||||
tracing::error!("instance command handler failed: {e:#}");
|
||||
}
|
||||
}));
|
||||
}
|
||||
handles.push(tokio::spawn(instancecmd::publish_state_changes(
|
||||
agent.clone(),
|
||||
sup.clone(),
|
||||
)));
|
||||
}
|
||||
|
||||
wait_for_shutdown_signal().await;
|
||||
tracing::info!("shutdown signal received");
|
||||
|
||||
278
corrosion-host-agent/src/process.rs
Normal file
278
corrosion-host-agent/src/process.rs
Normal file
@@ -0,0 +1,278 @@
|
||||
//! Per-instance game-server process supervision.
|
||||
//!
|
||||
//! One `ProcessSupervisor` per process-managed instance. Lifecycle mirrors the
|
||||
//! proven Go agent behavior — graceful SIGTERM with a 30s budget before force
|
||||
//! kill, a monitor task that reaps the child and records crash-vs-stop — with
|
||||
//! two fixes the Go version needed: args are a proper list (no naive space
|
||||
//! splitting), and every state change is observable through a watch channel
|
||||
//! so the panel gets push events instead of waiting for the next heartbeat.
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::process::{Child, Command};
|
||||
use tokio::sync::{watch, Mutex};
|
||||
|
||||
use crate::config::InstanceConfig;
|
||||
|
||||
const GRACEFUL_STOP_BUDGET: Duration = Duration::from_secs(30);
|
||||
const RESTART_PAUSE: Duration = Duration::from_secs(2);
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Serialize)]
|
||||
#[serde(rename_all = "snake_case", tag = "state")]
|
||||
pub enum InstanceState {
|
||||
/// Not process-managed (no executable configured).
|
||||
Unmanaged,
|
||||
Stopped,
|
||||
Starting,
|
||||
Running,
|
||||
Stopping,
|
||||
/// Process exited without a stop request.
|
||||
Crashed {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
exit_code: Option<i32>,
|
||||
},
|
||||
}
|
||||
|
||||
impl InstanceState {
|
||||
pub fn as_label(&self) -> &'static str {
|
||||
match self {
|
||||
InstanceState::Unmanaged => "unmanaged",
|
||||
InstanceState::Stopped => "stopped",
|
||||
InstanceState::Starting => "starting",
|
||||
InstanceState::Running => "running",
|
||||
InstanceState::Stopping => "stopping",
|
||||
InstanceState::Crashed { .. } => "crashed",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Inner {
|
||||
child: Option<Child>,
|
||||
started_at: Option<Instant>,
|
||||
/// True while a stop was requested — the monitor uses it to distinguish
|
||||
/// an ordered shutdown from a crash.
|
||||
stop_requested: bool,
|
||||
}
|
||||
|
||||
pub struct ProcessSupervisor {
|
||||
pub instance_id: String,
|
||||
executable: Option<PathBuf>,
|
||||
args: Vec<String>,
|
||||
working_dir: Option<PathBuf>,
|
||||
inner: Mutex<Inner>,
|
||||
state_tx: watch::Sender<InstanceState>,
|
||||
}
|
||||
|
||||
impl ProcessSupervisor {
|
||||
pub fn new(cfg: &InstanceConfig) -> Arc<Self> {
|
||||
let executable = cfg.resolved_executable();
|
||||
let initial = if executable.is_some() {
|
||||
InstanceState::Stopped
|
||||
} else {
|
||||
InstanceState::Unmanaged
|
||||
};
|
||||
let (state_tx, _) = watch::channel(initial);
|
||||
Arc::new(Self {
|
||||
instance_id: cfg.id.clone(),
|
||||
executable,
|
||||
args: cfg.args.clone(),
|
||||
working_dir: cfg.working_dir.clone(),
|
||||
inner: Mutex::new(Inner {
|
||||
child: None,
|
||||
started_at: None,
|
||||
stop_requested: false,
|
||||
}),
|
||||
state_tx,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn state(&self) -> InstanceState {
|
||||
self.state_tx.borrow().clone()
|
||||
}
|
||||
|
||||
pub fn watch_state(&self) -> watch::Receiver<InstanceState> {
|
||||
self.state_tx.subscribe()
|
||||
}
|
||||
|
||||
pub async fn uptime_seconds(&self) -> u64 {
|
||||
let inner = self.inner.lock().await;
|
||||
match (&*self.state_tx.borrow(), inner.started_at) {
|
||||
(InstanceState::Running, Some(t)) => t.elapsed().as_secs(),
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn start(self: &Arc<Self>) -> Result<()> {
|
||||
let Some(exe) = self.executable.clone() else {
|
||||
bail!("instance '{}' has no executable configured", self.instance_id);
|
||||
};
|
||||
if !exe.exists() {
|
||||
bail!("executable not found: {}", exe.display());
|
||||
}
|
||||
|
||||
let mut inner = self.inner.lock().await;
|
||||
if matches!(*self.state_tx.borrow(), InstanceState::Running | InstanceState::Starting) {
|
||||
bail!("instance '{}' is already running", self.instance_id);
|
||||
}
|
||||
|
||||
self.set_state(InstanceState::Starting);
|
||||
|
||||
let workdir = self
|
||||
.working_dir
|
||||
.clone()
|
||||
.or_else(|| exe.parent().map(|p| p.to_path_buf()))
|
||||
.unwrap_or_else(|| PathBuf::from("."));
|
||||
|
||||
let child = Command::new(&exe)
|
||||
.args(&self.args)
|
||||
.current_dir(&workdir)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::inherit())
|
||||
.stderr(Stdio::inherit())
|
||||
.spawn()
|
||||
.with_context(|| format!("spawning {}", exe.display()))?;
|
||||
|
||||
let pid = child.id();
|
||||
inner.child = Some(child);
|
||||
inner.started_at = Some(Instant::now());
|
||||
inner.stop_requested = false;
|
||||
drop(inner);
|
||||
|
||||
self.set_state(InstanceState::Running);
|
||||
tracing::info!(
|
||||
"instance '{}' started: {} (pid {:?})",
|
||||
self.instance_id,
|
||||
exe.display(),
|
||||
pid
|
||||
);
|
||||
|
||||
// Monitor: reap the child and classify the exit.
|
||||
let sup = Arc::clone(self);
|
||||
tokio::spawn(async move { sup.monitor().await });
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn monitor(self: Arc<Self>) {
|
||||
// Take a waiter without holding the lock across the whole child
|
||||
// lifetime: Child::wait needs &mut, so the child stays in inner and
|
||||
// we poll it.
|
||||
loop {
|
||||
let status = {
|
||||
let mut inner = self.inner.lock().await;
|
||||
let Some(child) = inner.child.as_mut() else { return };
|
||||
match child.try_wait() {
|
||||
Ok(Some(status)) => Some(status),
|
||||
Ok(None) => None,
|
||||
Err(e) => {
|
||||
tracing::error!("instance '{}' wait failed: {e}", self.instance_id);
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match status {
|
||||
Some(status) => {
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.child = None;
|
||||
inner.started_at = None;
|
||||
let ordered = inner.stop_requested;
|
||||
inner.stop_requested = false;
|
||||
drop(inner);
|
||||
|
||||
if ordered {
|
||||
self.set_state(InstanceState::Stopped);
|
||||
tracing::info!("instance '{}' stopped ({status})", self.instance_id);
|
||||
} else {
|
||||
let exit_code = status.code();
|
||||
self.set_state(InstanceState::Crashed { exit_code });
|
||||
tracing::warn!(
|
||||
"instance '{}' exited unexpectedly ({status}) — marked crashed",
|
||||
self.instance_id
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
None => tokio::time::sleep(Duration::from_millis(500)).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn stop(self: &Arc<Self>) -> Result<()> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
if inner.child.is_none() {
|
||||
bail!("instance '{}' is not running", self.instance_id);
|
||||
}
|
||||
inner.stop_requested = true;
|
||||
self.set_state(InstanceState::Stopping);
|
||||
let child = inner.child.as_mut().expect("checked above");
|
||||
|
||||
// Graceful first: SIGTERM on unix; Windows has no SIGTERM equivalent
|
||||
// for console processes, so it goes straight to kill there.
|
||||
#[cfg(unix)]
|
||||
if let Some(pid) = child.id() {
|
||||
unsafe {
|
||||
libc::kill(pid as i32, libc::SIGTERM);
|
||||
}
|
||||
}
|
||||
#[cfg(not(unix))]
|
||||
{
|
||||
let _ = child.start_kill();
|
||||
}
|
||||
drop(inner);
|
||||
|
||||
// Wait for the monitor to observe the exit; force kill on budget.
|
||||
let mut rx = self.watch_state();
|
||||
let deadline = tokio::time::timeout(GRACEFUL_STOP_BUDGET, async {
|
||||
loop {
|
||||
if matches!(*rx.borrow(), InstanceState::Stopped) {
|
||||
return;
|
||||
}
|
||||
if rx.changed().await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
if deadline.is_err() {
|
||||
tracing::warn!(
|
||||
"instance '{}' ignored SIGTERM for {}s — force killing",
|
||||
self.instance_id,
|
||||
GRACEFUL_STOP_BUDGET.as_secs()
|
||||
);
|
||||
let mut inner = self.inner.lock().await;
|
||||
if let Some(child) = inner.child.as_mut() {
|
||||
let _ = child.start_kill();
|
||||
}
|
||||
drop(inner);
|
||||
|
||||
let mut rx = self.watch_state();
|
||||
let _ = tokio::time::timeout(Duration::from_secs(5), async {
|
||||
while !matches!(*rx.borrow(), InstanceState::Stopped) {
|
||||
if rx.changed().await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn restart(self: &Arc<Self>) -> Result<()> {
|
||||
if !matches!(*self.state_tx.borrow(), InstanceState::Stopped | InstanceState::Crashed { .. } | InstanceState::Unmanaged) {
|
||||
self.stop().await?;
|
||||
}
|
||||
tokio::time::sleep(RESTART_PAUSE).await;
|
||||
self.start().await
|
||||
}
|
||||
|
||||
fn set_state(&self, state: InstanceState) {
|
||||
// send_replace never fails even with zero receivers.
|
||||
let _ = self.state_tx.send_replace(state);
|
||||
}
|
||||
}
|
||||
@@ -17,14 +17,12 @@ pub fn host_going_offline(license: &str) -> String {
|
||||
format!("corrosion.{license}.host.going_offline")
|
||||
}
|
||||
|
||||
/// Phase 1: per-instance command channel (start/stop/restart/rcon/...).
|
||||
#[allow(dead_code)]
|
||||
/// Per-instance command channel (start/stop/restart/status; rcon et al. to come).
|
||||
pub fn instance_cmd(license: &str, instance: &str) -> String {
|
||||
format!("corrosion.{license}.{instance}.cmd")
|
||||
}
|
||||
|
||||
/// Phase 1: per-instance state-change events.
|
||||
#[allow(dead_code)]
|
||||
/// Per-instance state-change events.
|
||||
pub fn instance_status(license: &str, instance: &str) -> String {
|
||||
format!("corrosion.{license}.{instance}.status")
|
||||
}
|
||||
|
||||
@@ -65,9 +65,10 @@ pub struct InstanceInfo {
|
||||
pub game: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub label: Option<String>,
|
||||
/// Phase 0 states: `configured` (root exists) or `missing_root`.
|
||||
/// Phase 1 adds live process states (running/stopped/crashed).
|
||||
/// Process-managed: running/stopped/starting/stopping/crashed.
|
||||
/// Unmanaged (no executable configured): configured/missing_root.
|
||||
pub state: String,
|
||||
pub uptime_seconds: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub root_disk_free_mb: Option<u64>,
|
||||
}
|
||||
@@ -125,21 +126,30 @@ pub async fn collect(agent: &Agent, sys: &mut System) -> HeartbeatPayload {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let instances = agent
|
||||
.cfg
|
||||
.instances
|
||||
.iter()
|
||||
.map(|inst| {
|
||||
let exists = inst.root.exists();
|
||||
InstanceInfo {
|
||||
id: inst.id.clone(),
|
||||
game: inst.game.clone(),
|
||||
label: inst.label.clone(),
|
||||
state: if exists { "configured" } else { "missing_root" }.to_string(),
|
||||
root_disk_free_mb: disk_free_for_path(&disks, &inst.root),
|
||||
let mut instances = Vec::with_capacity(agent.cfg.instances.len());
|
||||
for inst in &agent.cfg.instances {
|
||||
let (state, uptime_seconds) = match agent.supervisors.get(&inst.id) {
|
||||
Some(sup) if !matches!(sup.state(), crate::process::InstanceState::Unmanaged) => {
|
||||
(sup.state().as_label().to_string(), sup.uptime_seconds().await)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
_ => {
|
||||
let exists = inst.root.exists();
|
||||
(
|
||||
if exists { "configured" } else { "missing_root" }.to_string(),
|
||||
0,
|
||||
)
|
||||
}
|
||||
};
|
||||
instances.push(InstanceInfo {
|
||||
id: inst.id.clone(),
|
||||
game: inst.game.clone(),
|
||||
label: inst.label.clone(),
|
||||
state,
|
||||
uptime_seconds,
|
||||
root_disk_free_mb: disk_free_for_path(&disks, &inst.root),
|
||||
});
|
||||
}
|
||||
let instances = instances;
|
||||
|
||||
HeartbeatPayload {
|
||||
schema: 2,
|
||||
|
||||
Reference in New Issue
Block a user