add graceful shutdown signal to coordinator and all background tasks

This commit is contained in:
damocles 2026-05-20 13:05:13 +02:00
parent 67b47872e0
commit e27984b74c
6 changed files with 86 additions and 12 deletions

View file

@ -8,7 +8,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use tokio::sync::broadcast; use tokio::sync::{broadcast, watch};
use crate::agent_server::{self, AgentSocket}; use crate::agent_server::{self, AgentSocket};
use crate::approvals::Approvals; use crate::approvals::Approvals;
@ -73,6 +73,10 @@ pub struct Coordinator {
/// tokio mutex so the rescan can `await` `lifecycle::list` / /// tokio mutex so the rescan can `await` `lifecycle::list` /
/// `is_running` without blocking other coordinator paths. /// `is_running` without blocking other coordinator paths.
last_containers: tokio::sync::Mutex<HashMap<String, ContainerView>>, last_containers: tokio::sync::Mutex<HashMap<String, ContainerView>>,
/// Shutdown signal broadcast to all background tasks. Sending
/// `true` asks every loop to exit after its current work item.
/// Use `shutdown_rx()` to subscribe; `request_shutdown()` to fire.
shutdown_tx: watch::Sender<bool>,
} }
/// Per-agent in-progress state that the dashboard surfaces between approve /// Per-agent in-progress state that the dashboard surfaces between approve
@ -140,6 +144,7 @@ impl Coordinator {
let approvals = Approvals::open(db_path).context("open approvals")?; let approvals = Approvals::open(db_path).context("open approvals")?;
let questions = OperatorQuestions::open(db_path).context("open operator_questions")?; let questions = OperatorQuestions::open(db_path).context("open operator_questions")?;
let (dashboard_events, _) = broadcast::channel(DASHBOARD_CHANNEL); let (dashboard_events, _) = broadcast::channel(DASHBOARD_CHANNEL);
let (shutdown_tx, _) = watch::channel(false);
Ok(Self { Ok(Self {
broker: Arc::new(broker), broker: Arc::new(broker),
approvals: Arc::new(approvals), approvals: Arc::new(approvals),
@ -152,9 +157,27 @@ impl Coordinator {
dashboard_events, dashboard_events,
event_seq: AtomicU64::new(0), event_seq: AtomicU64::new(0),
last_containers: tokio::sync::Mutex::new(HashMap::new()), last_containers: tokio::sync::Mutex::new(HashMap::new()),
shutdown_tx,
}) })
} }
/// Subscribe to the shutdown watch channel. Background tasks call
/// this at spawn time and break their loop when the receiver
/// transitions to `true` (via `Coordinator::request_shutdown`).
/// A closed channel (i.e. the Coordinator was dropped) also
/// signals tasks to exit.
pub fn shutdown_rx(&self) -> watch::Receiver<bool> {
self.shutdown_tx.subscribe()
}
/// Signal all background tasks to exit cleanly. The tasks break
/// out of their poll loop after completing their current work item.
/// Best-effort — does nothing if all receivers have already been
/// dropped (e.g. process is already mid-shutdown).
pub fn request_shutdown(&self) {
let _ = self.shutdown_tx.send(true);
}
/// Subscribe to the unified dashboard event channel. Used by the /// Subscribe to the unified dashboard event channel. Used by the
/// `/dashboard/stream` SSE handler and by the broker-to-dashboard /// `/dashboard/stream` SSE handler and by the broker-to-dashboard
/// forwarder task. /// forwarder task.

View file

@ -26,6 +26,7 @@ use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
const POLL_INTERVAL: Duration = Duration::from_secs(10); const POLL_INTERVAL: Duration = Duration::from_secs(10);
pub fn spawn(coord: Arc<Coordinator>) { pub fn spawn(coord: Arc<Coordinator>) {
let mut shutdown = coord.shutdown_rx();
tokio::spawn(async move { tokio::spawn(async move {
let mut prev_running: HashSet<String> = HashSet::new(); let mut prev_running: HashSet<String> = HashSet::new();
let mut prev_logged_in: HashSet<String> = HashSet::new(); let mut prev_logged_in: HashSet<String> = HashSet::new();
@ -80,7 +81,13 @@ pub fn spawn(coord: Arc<Coordinator>) {
prev_updated = current_updated; prev_updated = current_updated;
seeded = true; seeded = true;
tokio::time::sleep(POLL_INTERVAL).await; tokio::select! {
_ = tokio::time::sleep(POLL_INTERVAL) => {}
_ = shutdown.changed() => {
tracing::info!("crash watcher: shutdown signal received");
break;
}
}
} }
}); });
} }

View file

@ -24,13 +24,17 @@ const KEEP_SECS: i64 = 7 * 24 * 3600;
/// the vacuum SQL against its events.sqlite if present. Errors are /// the vacuum SQL against its events.sqlite if present. Errors are
/// logged but don't tear the loop down. /// logged but don't tear the loop down.
pub fn spawn(coord: Arc<Coordinator>) { pub fn spawn(coord: Arc<Coordinator>) {
let mut shutdown = coord.shutdown_rx();
tokio::spawn(async move { tokio::spawn(async move {
loop { loop {
sweep_once(); sweep_once();
// touching coord keeps the type wired in case future sweeps tokio::select! {
// need approvals/etc.; the ref is otherwise unused today. _ = tokio::time::sleep(VACUUM_INTERVAL) => {}
let _ = &coord; _ = shutdown.changed() => {
tokio::time::sleep(VACUUM_INTERVAL).await; tracing::info!("events vacuum: shutdown signal received");
break;
}
}
} }
}); });
} }

View file

@ -154,8 +154,9 @@ async fn main() -> Result<()> {
// requeue_inflight) and undelivered rows are always kept. // requeue_inflight) and undelivered rows are always kept.
// Runs hourly; first sweep happens immediately. // Runs hourly; first sweep happens immediately.
let vacuum_coord = coord.clone(); let vacuum_coord = coord.clone();
let mut vacuum_shutdown = coord.shutdown_rx();
tokio::spawn(async move { tokio::spawn(async move {
let interval_secs = 3600u64; let interval = std::time::Duration::from_secs(3600);
let keep_secs: i64 = 30 * 24 * 3600; let keep_secs: i64 = 30 * 24 * 3600;
loop { loop {
match vacuum_coord.broker.vacuum_delivered(keep_secs) { match vacuum_coord.broker.vacuum_delivered(keep_secs) {
@ -163,7 +164,13 @@ async fn main() -> Result<()> {
Ok(n) => tracing::info!(removed = n, "broker vacuum"), Ok(n) => tracing::info!(removed = n, "broker vacuum"),
Err(e) => tracing::warn!(error = ?e, "broker vacuum failed"), Err(e) => tracing::warn!(error = ?e, "broker vacuum failed"),
} }
tokio::time::sleep(std::time::Duration::from_secs(interval_secs)).await; tokio::select! {
_ = tokio::time::sleep(interval) => {}
_ = vacuum_shutdown.changed() => {
tracing::info!("broker vacuum: shutdown signal received");
break;
}
}
} }
}); });
// Per-agent events.sqlite vacuum: host-side so the harness // Per-agent events.sqlite vacuum: host-side so the harness
@ -191,7 +198,27 @@ async fn main() -> Result<()> {
tracing::error!(error = ?e, "dashboard failed"); tracing::error!(error = ?e, "dashboard failed");
} }
}); });
server::serve(&cli.socket, coord).await // Run the admin socket until a signal arrives; then signal
// all background tasks so they exit cleanly before the
// process terminates.
let coord_sig = coord.clone();
tokio::select! {
res = server::serve(&cli.socket, coord) => { res? }
_ = tokio::signal::ctrl_c() => {
tracing::info!("SIGINT received — requesting shutdown");
coord_sig.request_shutdown();
}
_ = async {
let mut sig = tokio::signal::unix::signal(
tokio::signal::unix::SignalKind::terminate()
).expect("failed to install SIGTERM handler");
sig.recv().await;
} => {
tracing::info!("SIGTERM received — requesting shutdown");
coord_sig.request_shutdown();
}
}
Ok(())
} }
Cmd::Spawn { name } => { Cmd::Spawn { name } => {
render(client::request(&cli.socket, HostRequest::Spawn { name }).await?) render(client::request(&cli.socket, HostRequest::Spawn { name }).await?)

View file

@ -52,10 +52,17 @@ const REMINDER_BATCH_LIMIT: u64 = 100;
const POLL_INTERVAL: Duration = Duration::from_secs(5); const POLL_INTERVAL: Duration = Duration::from_secs(5);
pub fn spawn(coord: Arc<Coordinator>) { pub fn spawn(coord: Arc<Coordinator>) {
let mut shutdown = coord.shutdown_rx();
tokio::spawn(async move { tokio::spawn(async move {
loop { loop {
tick(&coord); tick(&coord);
tokio::time::sleep(POLL_INTERVAL).await; tokio::select! {
_ = tokio::time::sleep(POLL_INTERVAL) => {}
_ = shutdown.changed() => {
tracing::info!("reminder scheduler: shutdown signal received");
break;
}
}
} }
}); });
} }

View file

@ -22,11 +22,17 @@ const KEEP_SECS: i64 = 90 * 24 * 3600;
/// the vacuum SQL against its turn-stats.sqlite if present. Errors /// the vacuum SQL against its turn-stats.sqlite if present. Errors
/// are logged but don't tear the loop down. /// are logged but don't tear the loop down.
pub fn spawn(coord: Arc<Coordinator>) { pub fn spawn(coord: Arc<Coordinator>) {
let mut shutdown = coord.shutdown_rx();
tokio::spawn(async move { tokio::spawn(async move {
loop { loop {
sweep_once(); sweep_once();
let _ = &coord; tokio::select! {
tokio::time::sleep(VACUUM_INTERVAL).await; _ = tokio::time::sleep(VACUUM_INTERVAL) => {}
_ = shutdown.changed() => {
tracing::info!("stats vacuum: shutdown signal received");
break;
}
}
} }
}); });
} }