feat(api): host-agent protocol v2 consumer — heartbeat persistence, auto-register, staleness sweep
Nothing persisted agent heartbeats before: companion_last_seen was written once at setup and connection_status stayed 'connected' forever. HostAgentConsumerService now consumes corrosion.*.host.heartbeat (updates last_seen + status, auto-creates the bare_metal connection row on first contact), host.going_offline (graceful offline), and sweeps connections offline after 180s of heartbeat silence. License-existence tenant validation with caching per NATS-consumer doctrine. WS bridge forwards host_heartbeat/host_going_offline to the panel. Contract-verified against production NATS with the backend's own nats lib: v2 subjects, schema 2, real telemetry, offline beacon. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -49,6 +49,9 @@ import { EarlyAccessModule } from './modules/early-access/early-access.module';
|
||||
// Shared Services
|
||||
import { NatsService } from './services/nats.service';
|
||||
import { NatsBridgeService } from './services/nats-bridge.service';
|
||||
import { HostAgentConsumerService } from './services/host-agent-consumer.service';
|
||||
import { ServerConnection } from './entities/server-connection.entity';
|
||||
import { License } from './entities/license.entity';
|
||||
import { SteamService } from './services/steam.service';
|
||||
|
||||
// Gateway
|
||||
@@ -91,6 +94,9 @@ import { NatsBridgeGateway } from './gateways/nats-bridge.gateway';
|
||||
// Scheduler
|
||||
ScheduleModule.forRoot(),
|
||||
|
||||
// Repositories for app-level shared services (host-agent consumer)
|
||||
TypeOrmModule.forFeature([ServerConnection, License]),
|
||||
|
||||
// Feature Modules
|
||||
AuthModule,
|
||||
UsersModule,
|
||||
@@ -134,6 +140,7 @@ import { NatsBridgeGateway } from './gateways/nats-bridge.gateway';
|
||||
// Shared services
|
||||
NatsService,
|
||||
NatsBridgeService,
|
||||
HostAgentConsumerService,
|
||||
SteamService,
|
||||
|
||||
// WebSocket gateway
|
||||
|
||||
152
backend-nest/src/services/host-agent-consumer.service.ts
Normal file
152
backend-nest/src/services/host-agent-consumer.service.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
|
||||
import { Interval } from '@nestjs/schedule';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository } from 'typeorm';
|
||||
import { NatsService } from './nats.service';
|
||||
import { ServerConnection } from '../entities/server-connection.entity';
|
||||
import { License } from '../entities/license.entity';
|
||||
|
||||
/**
|
||||
* Consumes Corrosion wire protocol v2 host-agent subjects
|
||||
* (corrosion-host-agent/PROTOCOL.md) and keeps server_connections truthful.
|
||||
*
|
||||
* Before this service existed, NOTHING persisted agent heartbeats:
|
||||
* companion_last_seen was written once at setup and connection_status stayed
|
||||
* 'connected' forever. Now: heartbeat -> last_seen + connected (row
|
||||
* auto-created on first contact), going_offline beacon -> offline, and a
|
||||
* staleness sweep marks hosts offline when heartbeats stop arriving.
|
||||
*/
|
||||
@Injectable()
|
||||
export class HostAgentConsumerService implements OnModuleInit {
|
||||
private readonly logger = new Logger(HostAgentConsumerService.name);
|
||||
|
||||
/** licenseId -> cache expiry epoch-ms. Positive = exists, absent = unknown. */
|
||||
private knownLicenses = new Map<string, number>();
|
||||
/** Unknown/garbage license ids we already warned about (anti log-spam). */
|
||||
private warnedUnknown = new Set<string>();
|
||||
|
||||
private static readonly UUID_RE =
|
||||
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
||||
private static readonly LICENSE_CACHE_TTL_MS = 5 * 60_000;
|
||||
/** 3x the agent's default 60s heartbeat (which jitters to max 72s). */
|
||||
private static readonly OFFLINE_AFTER_MS = 180_000;
|
||||
|
||||
constructor(
|
||||
private readonly nats: NatsService,
|
||||
@InjectRepository(ServerConnection)
|
||||
private readonly connectionRepository: Repository<ServerConnection>,
|
||||
@InjectRepository(License)
|
||||
private readonly licenseRepository: Repository<License>,
|
||||
) {}
|
||||
|
||||
onModuleInit() {
|
||||
this.nats.subscribe('corrosion.*.host.heartbeat', (data, subject) => {
|
||||
const licenseId = subject.split('.')[1];
|
||||
void this.onHeartbeat(licenseId).catch((err) =>
|
||||
this.logger.error(`heartbeat handling failed for ${licenseId}: ${err.message}`, err.stack),
|
||||
);
|
||||
void data; // payload telemetry is bridged to the browser; persistence here is liveness only
|
||||
});
|
||||
|
||||
this.nats.subscribe('corrosion.*.host.going_offline', (_data, subject) => {
|
||||
const licenseId = subject.split('.')[1];
|
||||
void this.onGoingOffline(licenseId).catch((err) =>
|
||||
this.logger.error(`going_offline handling failed for ${licenseId}: ${err.message}`, err.stack),
|
||||
);
|
||||
});
|
||||
|
||||
this.logger.log('Host agent (protocol v2) consumer subscriptions initialized');
|
||||
}
|
||||
|
||||
private async onHeartbeat(licenseId: string): Promise<void> {
|
||||
if (!(await this.isValidTenant(licenseId))) return;
|
||||
|
||||
const now = new Date();
|
||||
const existing = await this.connectionRepository.findOne({
|
||||
where: { license_id: licenseId },
|
||||
});
|
||||
|
||||
if (existing) {
|
||||
await this.connectionRepository.update(
|
||||
{ id: existing.id },
|
||||
{ companion_last_seen: now, connection_status: 'connected', updated_at: now },
|
||||
);
|
||||
if (existing.connection_status !== 'connected') {
|
||||
this.logger.log(`host agent for license ${licenseId} is back online`);
|
||||
}
|
||||
} else {
|
||||
// First contact from a host agent: auto-register the connection so the
|
||||
// panel lights up without a manual setup step.
|
||||
await this.connectionRepository.save(
|
||||
this.connectionRepository.create({
|
||||
license_id: licenseId,
|
||||
connection_type: 'bare_metal',
|
||||
connection_status: 'connected',
|
||||
companion_last_seen: now,
|
||||
}),
|
||||
);
|
||||
this.logger.log(`host agent registered for license ${licenseId} (first heartbeat)`);
|
||||
}
|
||||
}
|
||||
|
||||
private async onGoingOffline(licenseId: string): Promise<void> {
|
||||
if (!(await this.isValidTenant(licenseId))) return;
|
||||
|
||||
await this.connectionRepository.update(
|
||||
{ license_id: licenseId },
|
||||
{ connection_status: 'offline', updated_at: new Date() },
|
||||
);
|
||||
this.logger.log(`host agent for license ${licenseId} went offline (graceful beacon)`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Heartbeats stopping must flip the panel to offline — an agent that
|
||||
* crashes or loses network never sends the goodbye beacon.
|
||||
*/
|
||||
@Interval(60_000)
|
||||
async sweepStaleConnections(): Promise<void> {
|
||||
const threshold = new Date(Date.now() - HostAgentConsumerService.OFFLINE_AFTER_MS);
|
||||
const result = await this.connectionRepository
|
||||
.createQueryBuilder()
|
||||
.update(ServerConnection)
|
||||
.set({ connection_status: 'offline', updated_at: () => 'NOW()' })
|
||||
.where('connection_status = :connected', { connected: 'connected' })
|
||||
.andWhere('companion_last_seen IS NOT NULL')
|
||||
.andWhere('companion_last_seen < :threshold', { threshold })
|
||||
.execute();
|
||||
|
||||
if (result.affected) {
|
||||
this.logger.warn(`marked ${result.affected} stale host connection(s) offline`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tenant validation: the subject segment must be a real license UUID.
|
||||
* NATS consumers must never write rows for subjects an arbitrary publisher
|
||||
* invented. Existence is cached to avoid a query per heartbeat.
|
||||
*/
|
||||
private async isValidTenant(licenseId: string): Promise<boolean> {
|
||||
if (!HostAgentConsumerService.UUID_RE.test(licenseId)) {
|
||||
this.warnUnknownOnce(licenseId, 'not a UUID');
|
||||
return false;
|
||||
}
|
||||
|
||||
const cachedUntil = this.knownLicenses.get(licenseId);
|
||||
if (cachedUntil && cachedUntil > Date.now()) return true;
|
||||
|
||||
const exists = await this.licenseRepository.exist({ where: { id: licenseId } });
|
||||
if (!exists) {
|
||||
this.warnUnknownOnce(licenseId, 'no such license');
|
||||
return false;
|
||||
}
|
||||
|
||||
this.knownLicenses.set(licenseId, Date.now() + HostAgentConsumerService.LICENSE_CACHE_TTL_MS);
|
||||
return true;
|
||||
}
|
||||
|
||||
private warnUnknownOnce(licenseId: string, reason: string): void {
|
||||
if (this.warnedUnknown.has(licenseId)) return;
|
||||
this.warnedUnknown.add(licenseId);
|
||||
this.logger.warn(`ignoring host-agent traffic for invalid license '${licenseId}' (${reason})`);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
export { NatsService } from './nats.service';
|
||||
export { NatsBridgeService } from './nats-bridge.service';
|
||||
export { HostAgentConsumerService } from './host-agent-consumer.service';
|
||||
export { SteamService } from './steam.service';
|
||||
|
||||
@@ -44,6 +44,17 @@ export class NatsBridgeService implements OnModuleInit {
|
||||
this.emit(licenseId, 'oxide_status', data);
|
||||
});
|
||||
|
||||
// Wire protocol v2 (corrosion-host-agent) — host-level telemetry
|
||||
this.nats.subscribe('corrosion.*.host.heartbeat', (data, subject) => {
|
||||
const licenseId = subject.split('.')[1];
|
||||
this.emit(licenseId, 'host_heartbeat', data);
|
||||
});
|
||||
|
||||
this.nats.subscribe('corrosion.*.host.going_offline', (data, subject) => {
|
||||
const licenseId = subject.split('.')[1];
|
||||
this.emit(licenseId, 'host_going_offline', data);
|
||||
});
|
||||
|
||||
this.logger.log('NATS bridge subscriptions initialized');
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user