All checks were successful
Test Asgard Runner / test (push) Successful in 3s
New corrosion-host-agent/ crate (Go companion-agent stays as behavior
reference until parity). Wire protocol v2 per COA-B: instance-scoped
subjects corrosion.{license}.{instance}.* + host-level .host.* — spec
in PROTOCOL.md, designed for the license->host->instance fleet model.
- Multi-instance TOML config in the foundation, not retrofitted
- NATS layer on the Vigilance production profile (infinite reconnect,
capped backoff, 30s ping, 8192-msg offline buffer)
- Heartbeat with real sysinfo telemetry — Go agent shipped hardcoded
disk/cpu placeholders; this is the panel's first true Resources data
- Connectivity prober (outbound TCP, periodic + on-demand)
- Host cmd channel (ping/probe/sysinfo), going-offline beacon,
CancellationToken shutdown
- Live-fire verified against production NATS; artifacts: 3.7MB static
linux-musl, 3.8MB windows .exe (static CRT)
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
122 lines
3.9 KiB
Rust
122 lines
3.9 KiB
Rust
//! Connectivity prober.
|
|
//!
|
|
//! Answers "is it the box or is it the network?" before a support ticket gets
|
|
//! written. Phase 0 scope is OUTBOUND reachability: TCP connect timing from
|
|
//! the host to known endpoints. Inbound port-forward verification (the thing
|
|
//! panel users actually struggle with) requires a backend-side reverse probe
|
|
//! and is specified in PROTOCOL.md as a later phase.
|
|
|
|
use chrono::{SecondsFormat, Utc};
|
|
use serde::Serialize;
|
|
use std::sync::Arc;
|
|
use std::time::{Duration, Instant};
|
|
use tokio::net::TcpStream;
|
|
|
|
use crate::agent::Agent;
|
|
use crate::config::ProbeTargetConfig;
|
|
|
|
/// Upper bound on how long a single probe waits for its TCP connect to
/// complete; an attempt that exceeds it is reported as a timeout failure.
const CONNECT_TIMEOUT: Duration = Duration::from_secs(3);
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct ProbeResult {
|
|
pub name: String,
|
|
pub host: String,
|
|
pub port: u16,
|
|
pub ok: bool,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub latency_ms: Option<u64>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
pub error: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct ProbeReport {
|
|
pub timestamp: String,
|
|
pub results: Vec<ProbeResult>,
|
|
}
|
|
|
|
/// Built-in targets every agent checks, before config extras.
|
|
fn default_targets() -> Vec<ProbeTargetConfig> {
|
|
vec![ProbeTargetConfig {
|
|
name: "corrosion-cdn".to_string(),
|
|
host: "cdn.corrosionmgmt.com".to_string(),
|
|
port: 443,
|
|
}]
|
|
}
|
|
|
|
pub async fn run_probe(extra_targets: &[ProbeTargetConfig]) -> ProbeReport {
|
|
let mut targets = default_targets();
|
|
targets.extend(extra_targets.iter().cloned());
|
|
|
|
let checks = targets.into_iter().map(|t| async move {
|
|
let started = Instant::now();
|
|
let addr = format!("{}:{}", t.host, t.port);
|
|
let outcome = tokio::time::timeout(CONNECT_TIMEOUT, TcpStream::connect(&addr)).await;
|
|
match outcome {
|
|
Ok(Ok(_stream)) => ProbeResult {
|
|
name: t.name,
|
|
host: t.host,
|
|
port: t.port,
|
|
ok: true,
|
|
latency_ms: Some(started.elapsed().as_millis() as u64),
|
|
error: None,
|
|
},
|
|
Ok(Err(e)) => ProbeResult {
|
|
name: t.name,
|
|
host: t.host,
|
|
port: t.port,
|
|
ok: false,
|
|
latency_ms: None,
|
|
error: Some(e.to_string()),
|
|
},
|
|
Err(_) => ProbeResult {
|
|
name: t.name,
|
|
host: t.host,
|
|
port: t.port,
|
|
ok: false,
|
|
latency_ms: None,
|
|
error: Some(format!("timeout after {}s", CONNECT_TIMEOUT.as_secs())),
|
|
},
|
|
}
|
|
});
|
|
|
|
let results = futures::future::join_all(checks).await;
|
|
|
|
ProbeReport {
|
|
timestamp: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true),
|
|
results,
|
|
}
|
|
}
|
|
|
|
/// Periodic probe loop; results land in shared state and ride the next
|
|
/// heartbeat. Jittered interval to avoid fleet-wide synchronization.
|
|
pub async fn run_loop(agent: Arc<Agent>) {
|
|
let cancel = agent.shutdown.clone();
|
|
loop {
|
|
let report = run_probe(&agent.cfg.probe_targets).await;
|
|
let failed: Vec<&str> = report
|
|
.results
|
|
.iter()
|
|
.filter(|r| !r.ok)
|
|
.map(|r| r.name.as_str())
|
|
.collect();
|
|
if failed.is_empty() {
|
|
tracing::debug!("probe ok ({} targets)", report.results.len());
|
|
} else {
|
|
tracing::warn!("probe failures: {}", failed.join(", "));
|
|
}
|
|
*agent.last_probe.write().await = Some(report);
|
|
|
|
let jitter = rand::Rng::gen_range(&mut rand::thread_rng(), 0.8..1.2);
|
|
let interval =
|
|
Duration::from_secs_f64(agent.cfg.probe_interval_seconds as f64 * jitter);
|
|
tokio::select! {
|
|
_ = tokio::time::sleep(interval) => {}
|
|
_ = cancel.cancelled() => {
|
|
tracing::info!("prober stopping");
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|