add graceful shutdown signal to coordinator and all background tasks
This commit is contained in:
parent
67b47872e0
commit
e27984b74c
6 changed files with 86 additions and 12 deletions
|
|
@ -8,7 +8,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use tokio::sync::broadcast;
|
use tokio::sync::{broadcast, watch};
|
||||||
|
|
||||||
use crate::agent_server::{self, AgentSocket};
|
use crate::agent_server::{self, AgentSocket};
|
||||||
use crate::approvals::Approvals;
|
use crate::approvals::Approvals;
|
||||||
|
|
@ -73,6 +73,10 @@ pub struct Coordinator {
|
||||||
/// tokio mutex so the rescan can `await` `lifecycle::list` /
|
/// tokio mutex so the rescan can `await` `lifecycle::list` /
|
||||||
/// `is_running` without blocking other coordinator paths.
|
/// `is_running` without blocking other coordinator paths.
|
||||||
last_containers: tokio::sync::Mutex<HashMap<String, ContainerView>>,
|
last_containers: tokio::sync::Mutex<HashMap<String, ContainerView>>,
|
||||||
|
/// Shutdown signal broadcast to all background tasks. Sending
|
||||||
|
/// `true` asks every loop to exit after its current work item.
|
||||||
|
/// Use `shutdown_rx()` to subscribe; `request_shutdown()` to fire.
|
||||||
|
shutdown_tx: watch::Sender<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Per-agent in-progress state that the dashboard surfaces between approve
|
/// Per-agent in-progress state that the dashboard surfaces between approve
|
||||||
|
|
@ -140,6 +144,7 @@ impl Coordinator {
|
||||||
let approvals = Approvals::open(db_path).context("open approvals")?;
|
let approvals = Approvals::open(db_path).context("open approvals")?;
|
||||||
let questions = OperatorQuestions::open(db_path).context("open operator_questions")?;
|
let questions = OperatorQuestions::open(db_path).context("open operator_questions")?;
|
||||||
let (dashboard_events, _) = broadcast::channel(DASHBOARD_CHANNEL);
|
let (dashboard_events, _) = broadcast::channel(DASHBOARD_CHANNEL);
|
||||||
|
let (shutdown_tx, _) = watch::channel(false);
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
broker: Arc::new(broker),
|
broker: Arc::new(broker),
|
||||||
approvals: Arc::new(approvals),
|
approvals: Arc::new(approvals),
|
||||||
|
|
@ -152,9 +157,27 @@ impl Coordinator {
|
||||||
dashboard_events,
|
dashboard_events,
|
||||||
event_seq: AtomicU64::new(0),
|
event_seq: AtomicU64::new(0),
|
||||||
last_containers: tokio::sync::Mutex::new(HashMap::new()),
|
last_containers: tokio::sync::Mutex::new(HashMap::new()),
|
||||||
|
shutdown_tx,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Subscribe to the shutdown watch channel. Background tasks call
|
||||||
|
/// this at spawn time and break their loop when the receiver
|
||||||
|
/// transitions to `true` (via `Coordinator::request_shutdown`).
|
||||||
|
/// A closed channel (i.e. the Coordinator was dropped) also
|
||||||
|
/// signals tasks to exit.
|
||||||
|
pub fn shutdown_rx(&self) -> watch::Receiver<bool> {
|
||||||
|
self.shutdown_tx.subscribe()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Signal all background tasks to exit cleanly. The tasks break
|
||||||
|
/// out of their poll loop after completing their current work item.
|
||||||
|
/// Best-effort — does nothing if all receivers have already been
|
||||||
|
/// dropped (e.g. process is already mid-shutdown).
|
||||||
|
pub fn request_shutdown(&self) {
|
||||||
|
let _ = self.shutdown_tx.send(true);
|
||||||
|
}
|
||||||
|
|
||||||
/// Subscribe to the unified dashboard event channel. Used by the
|
/// Subscribe to the unified dashboard event channel. Used by the
|
||||||
/// `/dashboard/stream` SSE handler and by the broker-to-dashboard
|
/// `/dashboard/stream` SSE handler and by the broker-to-dashboard
|
||||||
/// forwarder task.
|
/// forwarder task.
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ use crate::lifecycle::{self, AGENT_PREFIX, MANAGER_NAME};
|
||||||
const POLL_INTERVAL: Duration = Duration::from_secs(10);
|
const POLL_INTERVAL: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
pub fn spawn(coord: Arc<Coordinator>) {
|
pub fn spawn(coord: Arc<Coordinator>) {
|
||||||
|
let mut shutdown = coord.shutdown_rx();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let mut prev_running: HashSet<String> = HashSet::new();
|
let mut prev_running: HashSet<String> = HashSet::new();
|
||||||
let mut prev_logged_in: HashSet<String> = HashSet::new();
|
let mut prev_logged_in: HashSet<String> = HashSet::new();
|
||||||
|
|
@ -80,7 +81,13 @@ pub fn spawn(coord: Arc<Coordinator>) {
|
||||||
prev_updated = current_updated;
|
prev_updated = current_updated;
|
||||||
seeded = true;
|
seeded = true;
|
||||||
|
|
||||||
tokio::time::sleep(POLL_INTERVAL).await;
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(POLL_INTERVAL) => {}
|
||||||
|
_ = shutdown.changed() => {
|
||||||
|
tracing::info!("crash watcher: shutdown signal received");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,13 +24,17 @@ const KEEP_SECS: i64 = 7 * 24 * 3600;
|
||||||
/// the vacuum SQL against its events.sqlite if present. Errors are
|
/// the vacuum SQL against its events.sqlite if present. Errors are
|
||||||
/// logged but don't tear the loop down.
|
/// logged but don't tear the loop down.
|
||||||
pub fn spawn(coord: Arc<Coordinator>) {
|
pub fn spawn(coord: Arc<Coordinator>) {
|
||||||
|
let mut shutdown = coord.shutdown_rx();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
loop {
|
loop {
|
||||||
sweep_once();
|
sweep_once();
|
||||||
// touching coord keeps the type wired in case future sweeps
|
tokio::select! {
|
||||||
// need approvals/etc.; the ref is otherwise unused today.
|
_ = tokio::time::sleep(VACUUM_INTERVAL) => {}
|
||||||
let _ = &coord;
|
_ = shutdown.changed() => {
|
||||||
tokio::time::sleep(VACUUM_INTERVAL).await;
|
tracing::info!("events vacuum: shutdown signal received");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -154,8 +154,9 @@ async fn main() -> Result<()> {
|
||||||
// requeue_inflight) and undelivered rows are always kept.
|
// requeue_inflight) and undelivered rows are always kept.
|
||||||
// Runs hourly; first sweep happens immediately.
|
// Runs hourly; first sweep happens immediately.
|
||||||
let vacuum_coord = coord.clone();
|
let vacuum_coord = coord.clone();
|
||||||
|
let mut vacuum_shutdown = coord.shutdown_rx();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let interval_secs = 3600u64;
|
let interval = std::time::Duration::from_secs(3600);
|
||||||
let keep_secs: i64 = 30 * 24 * 3600;
|
let keep_secs: i64 = 30 * 24 * 3600;
|
||||||
loop {
|
loop {
|
||||||
match vacuum_coord.broker.vacuum_delivered(keep_secs) {
|
match vacuum_coord.broker.vacuum_delivered(keep_secs) {
|
||||||
|
|
@ -163,7 +164,13 @@ async fn main() -> Result<()> {
|
||||||
Ok(n) => tracing::info!(removed = n, "broker vacuum"),
|
Ok(n) => tracing::info!(removed = n, "broker vacuum"),
|
||||||
Err(e) => tracing::warn!(error = ?e, "broker vacuum failed"),
|
Err(e) => tracing::warn!(error = ?e, "broker vacuum failed"),
|
||||||
}
|
}
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(interval_secs)).await;
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(interval) => {}
|
||||||
|
_ = vacuum_shutdown.changed() => {
|
||||||
|
tracing::info!("broker vacuum: shutdown signal received");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// Per-agent events.sqlite vacuum: host-side so the harness
|
// Per-agent events.sqlite vacuum: host-side so the harness
|
||||||
|
|
@ -191,7 +198,27 @@ async fn main() -> Result<()> {
|
||||||
tracing::error!(error = ?e, "dashboard failed");
|
tracing::error!(error = ?e, "dashboard failed");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
server::serve(&cli.socket, coord).await
|
// Run the admin socket until a signal arrives; then signal
|
||||||
|
// all background tasks so they exit cleanly before the
|
||||||
|
// process terminates.
|
||||||
|
let coord_sig = coord.clone();
|
||||||
|
tokio::select! {
|
||||||
|
res = server::serve(&cli.socket, coord) => { res? }
|
||||||
|
_ = tokio::signal::ctrl_c() => {
|
||||||
|
tracing::info!("SIGINT received — requesting shutdown");
|
||||||
|
coord_sig.request_shutdown();
|
||||||
|
}
|
||||||
|
_ = async {
|
||||||
|
let mut sig = tokio::signal::unix::signal(
|
||||||
|
tokio::signal::unix::SignalKind::terminate()
|
||||||
|
).expect("failed to install SIGTERM handler");
|
||||||
|
sig.recv().await;
|
||||||
|
} => {
|
||||||
|
tracing::info!("SIGTERM received — requesting shutdown");
|
||||||
|
coord_sig.request_shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
Cmd::Spawn { name } => {
|
Cmd::Spawn { name } => {
|
||||||
render(client::request(&cli.socket, HostRequest::Spawn { name }).await?)
|
render(client::request(&cli.socket, HostRequest::Spawn { name }).await?)
|
||||||
|
|
|
||||||
|
|
@ -52,10 +52,17 @@ const REMINDER_BATCH_LIMIT: u64 = 100;
|
||||||
const POLL_INTERVAL: Duration = Duration::from_secs(5);
|
const POLL_INTERVAL: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
pub fn spawn(coord: Arc<Coordinator>) {
|
pub fn spawn(coord: Arc<Coordinator>) {
|
||||||
|
let mut shutdown = coord.shutdown_rx();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
loop {
|
loop {
|
||||||
tick(&coord);
|
tick(&coord);
|
||||||
tokio::time::sleep(POLL_INTERVAL).await;
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(POLL_INTERVAL) => {}
|
||||||
|
_ = shutdown.changed() => {
|
||||||
|
tracing::info!("reminder scheduler: shutdown signal received");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,11 +22,17 @@ const KEEP_SECS: i64 = 90 * 24 * 3600;
|
||||||
/// the vacuum SQL against its turn-stats.sqlite if present. Errors
|
/// the vacuum SQL against its turn-stats.sqlite if present. Errors
|
||||||
/// are logged but don't tear the loop down.
|
/// are logged but don't tear the loop down.
|
||||||
pub fn spawn(coord: Arc<Coordinator>) {
|
pub fn spawn(coord: Arc<Coordinator>) {
|
||||||
|
let mut shutdown = coord.shutdown_rx();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
loop {
|
loop {
|
||||||
sweep_once();
|
sweep_once();
|
||||||
let _ = &coord;
|
tokio::select! {
|
||||||
tokio::time::sleep(VACUUM_INTERVAL).await;
|
_ = tokio::time::sleep(VACUUM_INTERVAL) => {}
|
||||||
|
_ = shutdown.changed() => {
|
||||||
|
tracing::info!("stats vacuum: shutdown signal received");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue