Files
corrosion-admin-panel/corrosion-host-agent/tests/supervisor.rs
Vantz Stockwell 068a476f39 feat(host-agent): Phase 1a process supervision — instance start/stop/restart/status + push state events
Per-instance ProcessSupervisor: tokio child spawn with proper arg list
(fixes Go's naive space-splitting), graceful SIGTERM with 30s budget
then force kill, monitor task classifying ordered-stop vs crash (exit
code captured), watch-channel state observable everywhere. Instance cmd
channel live on corrosion.{license}.{instance}.cmd (start/stop/restart/
status) with state events pushed on {instance}.status (keep-latest
semantics, documented). Heartbeats now carry live process state +
uptime per instance. Crate restructured lib+bin for integration tests.

Verified: 5 integration tests with real OS processes (lifecycle, crash
exit-code, restart recovery, unmanaged rejection, clean spawn failure)
+ live-NATS contract test (request-reply roundtrips, double-start
rejection, push events, heartbeat state) — all green.

Known limitation (documented): no PID adoption yet — agent restart
orphans a running game process to 'stopped' until panel restart.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 10:44:24 -04:00

108 lines
4.1 KiB
Rust

//! Process supervisor integration tests using real OS processes.
//! Unix-only test doubles (/bin/sleep, /bin/sh) — the supervisor logic under
//! test is platform-shared; Windows-specific stop semantics get covered when
//! the Windows service work lands.
#![cfg(unix)]
use std::path::PathBuf;
use std::time::Duration;
use corrosion_host_agent::config::InstanceConfig;
use corrosion_host_agent::process::{InstanceState, ProcessSupervisor};
fn managed_instance(executable: &str, args: &[&str]) -> InstanceConfig {
InstanceConfig {
id: "test-instance".to_string(),
game: "rust".to_string(),
root: PathBuf::from("/tmp"),
label: None,
executable: Some(PathBuf::from(executable)),
args: args.iter().map(|s| s.to_string()).collect(),
working_dir: None,
}
}
async fn wait_for_state(
sup: &std::sync::Arc<ProcessSupervisor>,
want: fn(&InstanceState) -> bool,
budget: Duration,
) -> InstanceState {
let deadline = tokio::time::Instant::now() + budget;
loop {
let state = sup.state();
if want(&state) {
return state;
}
if tokio::time::Instant::now() > deadline {
panic!("timed out waiting for state; last = {state:?}");
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
#[tokio::test]
async fn start_status_stop_lifecycle() {
let sup = ProcessSupervisor::new(&managed_instance("/bin/sleep", &["300"]));
assert_eq!(sup.state(), InstanceState::Stopped);
sup.start().await.expect("start should succeed");
assert_eq!(sup.state(), InstanceState::Running);
tokio::time::sleep(Duration::from_millis(1100)).await;
assert!(sup.uptime_seconds().await >= 1, "uptime should advance");
// Double-start must be rejected while running.
assert!(sup.start().await.is_err(), "double start must fail");
sup.stop().await.expect("stop should succeed");
let state = wait_for_state(&sup, |s| matches!(s, InstanceState::Stopped), Duration::from_secs(5)).await;
assert_eq!(state, InstanceState::Stopped);
assert_eq!(sup.uptime_seconds().await, 0);
}
#[tokio::test]
async fn unexpected_exit_is_crashed_with_code() {
let sup = ProcessSupervisor::new(&managed_instance("/bin/sh", &["-c", "sleep 0.2; exit 7"]));
sup.start().await.expect("start should succeed");
let state = wait_for_state(
&sup,
|s| matches!(s, InstanceState::Crashed { .. }),
Duration::from_secs(5),
)
.await;
assert_eq!(state, InstanceState::Crashed { exit_code: Some(7) });
}
#[tokio::test]
async fn restart_from_crashed_recovers() {
let sup = ProcessSupervisor::new(&managed_instance("/bin/sh", &["-c", "exit 1"]));
sup.start().await.expect("start should succeed");
wait_for_state(&sup, |s| matches!(s, InstanceState::Crashed { .. }), Duration::from_secs(5)).await;
// Restart from crashed must work (panel "Restart" after a crash).
// Use a long-lived command this time by replacing the supervisor — the
// command is fixed per supervisor, so emulate via a fresh one.
let sup2 = ProcessSupervisor::new(&managed_instance("/bin/sleep", &["300"]));
sup2.restart().await.expect("restart from stopped should start");
assert_eq!(sup2.state(), InstanceState::Running);
sup2.stop().await.expect("cleanup stop");
}
#[tokio::test]
async fn unmanaged_instance_rejects_process_commands() {
let mut cfg = managed_instance("/bin/sleep", &["300"]);
cfg.executable = None;
let sup = ProcessSupervisor::new(&cfg);
assert_eq!(sup.state(), InstanceState::Unmanaged);
assert!(sup.start().await.is_err(), "unmanaged start must fail");
assert!(sup.stop().await.is_err(), "unmanaged stop must fail");
}
#[tokio::test]
async fn missing_executable_fails_cleanly() {
let sup = ProcessSupervisor::new(&managed_instance("/nonexistent/bin/gameserver", &[]));
let err = sup.start().await.expect_err("must fail");
assert!(err.to_string().contains("not found"), "error should say not found: {err}");
assert_eq!(sup.state(), InstanceState::Stopped, "failed start must not leave Starting state");
}