reminders: persist + surface delivery failures

Broker schema gains attempt_count INTEGER + last_error TEXT
columns via idempotent ALTER TABLE migration (pragma-probed so
fresh + existing dbs converge). reminder_scheduler::tick calls
record_reminder_failure on every deliver_reminder error,
bumping the counter + stashing the message. get_due_reminders
filters out rows where attempt_count >= MAX_REMINDER_ATTEMPTS
(5) so the scheduler stops retrying a stuck row until the
operator intervenes.

new POST /retry-reminder/{id} → reset_reminder_failure clears
the counters; next 5s tick re-attempts. cancel-reminder
unchanged (hard-delete).

dashboard renders failed rows with a red left rule, the error
text inline, and a ⚠ N failed badge. ↻ R3TRY button appears
when attempt_count > 0 — sits next to ✗ C4NC3L in a small
actions row below the body.
This commit is contained in:
müde 2026-05-18 00:08:09 +02:00
parent d395bdc945
commit 978a3cf391
5 changed files with 173 additions and 8 deletions

View file

@ -57,8 +57,26 @@ pub struct PendingReminder {
pub file_path: Option<String>,
pub due_at: i64,
pub created_at: i64,
/// Most recent delivery failure for this row, if any. Cleared
/// to NULL on operator retry. Surfaced inline in the dashboard
/// so a stuck reminder doesn't just silently retry forever.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_error: Option<String>,
/// Number of failed delivery attempts since the row was
/// created or last retried. After `MAX_REMINDER_ATTEMPTS` the
/// scheduler stops trying (the row stays in `pending` with the
/// error so the operator can decide between retry + cancel).
#[serde(default)]
pub attempt_count: u32,
}
/// Stop retrying a row after this many consecutive failures. The
/// scheduler quits scheduling it until an operator explicitly
/// retries (which resets the counter) or cancels (which deletes
/// the row). Below the cap the existing 5s tick re-attempts each
/// time the row is due.
pub const MAX_REMINDER_ATTEMPTS: u32 = 5;
/// Intra-process broker event. `recv_blocking` listens on the same
/// channel as the dashboard forwarder; the forwarder re-emits each
/// event as a `DashboardEvent` with a freshly-stamped seq from the
@ -95,6 +113,7 @@ impl Broker {
let conn =
Connection::open(path).with_context(|| format!("open broker db {}", path.display()))?;
conn.execute_batch(SCHEMA).context("apply broker schema")?;
ensure_reminder_columns(&conn).context("migrate reminders columns")?;
let (events, _) = broadcast::channel(EVENT_CHANNEL);
Ok(Self {
conn: Mutex::new(conn),
@ -305,12 +324,14 @@ impl Broker {
pub fn list_pending_reminders(&self) -> Result<Vec<PendingReminder>> {
let conn = self.conn.lock().unwrap();
let mut stmt = conn.prepare(
"SELECT id, agent, message, file_path, due_at, created_at \
"SELECT id, agent, message, file_path, due_at, created_at, \
last_error, attempt_count \
FROM reminders \
WHERE sent_at IS NULL \
ORDER BY due_at ASC",
)?;
let rows = stmt.query_map([], |row| {
let attempts: i64 = row.get(7)?;
Ok(PendingReminder {
id: row.get(0)?,
agent: row.get(1)?,
@ -318,12 +339,46 @@ impl Broker {
file_path: row.get(3)?,
due_at: row.get(4)?,
created_at: row.get(5)?,
last_error: row.get(6)?,
attempt_count: u32::try_from(attempts).unwrap_or(0),
})
})?;
rows.collect::<rusqlite::Result<Vec<_>>>()
.context("list pending reminders")
}
/// Mark a delivery attempt as failed: bump `attempt_count` and
/// stash the error string. Called by `reminder_scheduler::tick`
/// when `deliver_reminder` returns Err. Soft-cap behaviour
/// lives in `get_due_reminders` (rows over the cap drop out
/// of the due-list and stop being attempted until retry).
pub fn record_reminder_failure(&self, id: i64, reason: &str) -> Result<()> {
let conn = self.conn.lock().unwrap();
conn.execute(
"UPDATE reminders \
SET attempt_count = attempt_count + 1, last_error = ?1 \
WHERE id = ?2 AND sent_at IS NULL",
params![reason, id],
)?;
Ok(())
}
/// Clear the failure state on a pending reminder so the
/// scheduler picks it up again. No-op when the row is already
/// fresh (attempt_count == 0). Returns the number of rows
/// affected so callers can distinguish "retried" from "no
/// such pending reminder" (already delivered, or wrong id).
pub fn reset_reminder_failure(&self, id: i64) -> Result<usize> {
let conn = self.conn.lock().unwrap();
let n = conn.execute(
"UPDATE reminders \
SET attempt_count = 0, last_error = NULL \
WHERE id = ?1 AND sent_at IS NULL",
params![id],
)?;
Ok(n)
}
/// Count this agent's still-pending (un-delivered) reminders.
/// Used by the per-turn stats sink for a cheap "what was queued
/// at turn-end" snapshot.
@ -357,13 +412,16 @@ impl Broker {
pub fn get_due_reminders(&self, limit: u64) -> Result<Vec<DueReminder>> {
let conn = self.conn.lock().unwrap();
let limit_i = i64::try_from(limit.min(i64::MAX as u64)).unwrap_or(i64::MAX);
let max_attempts = i64::from(MAX_REMINDER_ATTEMPTS);
// attempt_count >= cap = give up; row stays pending so the
// operator sees + can retry/cancel via the dashboard.
let mut stmt = conn.prepare(
"SELECT agent, id, message, file_path FROM reminders \
WHERE due_at <= ?1 AND sent_at IS NULL \
WHERE due_at <= ?1 AND sent_at IS NULL AND attempt_count < ?3 \
ORDER BY agent, due_at ASC \
LIMIT ?2",
)?;
let rows = stmt.query_map(params![now_unix(), limit_i], |row| {
let rows = stmt.query_map(params![now_unix(), limit_i, max_attempts], |row| {
Ok((
row.get::<_, String>(0)?,
row.get::<_, i64>(1)?,
@ -410,6 +468,35 @@ impl Broker {
}
}
/// Idempotent reminder-table migrations. `ALTER TABLE ADD COLUMN`
/// has no `IF NOT EXISTS` form in sqlite, so we probe
/// `pragma_table_info` per column. New deploys (table created by
/// SCHEMA in this commit cycle) skip the ALTER; pre-existing
/// broker.sqlite files get the columns added on next boot.
fn ensure_reminder_columns(conn: &Connection) -> Result<()> {
for (name, sql) in [
(
"attempt_count",
"ALTER TABLE reminders ADD COLUMN attempt_count INTEGER NOT NULL DEFAULT 0;",
),
(
"last_error",
"ALTER TABLE reminders ADD COLUMN last_error TEXT;",
),
] {
let has: bool = conn
.prepare(&format!(
"SELECT 1 FROM pragma_table_info('reminders') WHERE name = '{name}'"
))?
.exists([])?;
if !has {
conn.execute_batch(sql)
.with_context(|| format!("add reminders.{name} column"))?;
}
}
Ok(())
}
fn now_unix() -> i64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)

View file

@ -57,6 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/api/state-file", get(get_state_file))
.route("/api/reminders", get(api_reminders))
.route("/cancel-reminder/{id}", post(post_cancel_reminder))
.route("/retry-reminder/{id}", post(post_retry_reminder))
.route("/api/agent-config/{name}", get(get_agent_config))
.route("/request-spawn", post(post_request_spawn))
.route("/op-send", post(post_op_send))
@ -1126,6 +1127,25 @@ async fn post_cancel_reminder(
}
}
/// Reset a pending reminder's failure state so the scheduler
/// retries it on the next tick. Useful when the failure was
/// transient (sqlite lock contention, disk full → freed up) and
/// the operator wants delivery to resume immediately instead of
/// the row sitting in attempt-count-capped purgatory.
async fn post_retry_reminder(
State(state): State<AppState>,
AxumPath(id): AxumPath<i64>,
) -> Response {
match state.coord.broker.reset_reminder_failure(id) {
Ok(0) => error_response(&format!("reminder {id} not pending (already delivered?)")),
Ok(_) => {
tracing::info!(%id, "operator reset reminder failure for retry");
(StatusCode::OK, "ok").into_response()
}
Err(e) => error_response(&format!("retry reminder {id} failed: {e:#}")),
}
}
async fn post_purge_tombstone(
State(state): State<AppState>,
AxumPath(name): AxumPath<String>,

View file

@ -71,12 +71,24 @@ fn tick(coord: &Arc<Coordinator>) {
for (agent, id, message, file_path) in due {
let body = prepare_body(&agent, &message, file_path.as_deref());
if let Err(e) = coord.broker.deliver_reminder(id, &agent, &body) {
let reason = format!("{e:#}");
tracing::warn!(
reminder_id = id,
%agent,
error = ?e,
error = %reason,
"failed to deliver reminder"
);
// Persist the failure so the dashboard can surface it +
// bump attempt_count. After MAX_REMINDER_ATTEMPTS the
// row drops out of `get_due_reminders` and waits for
// operator retry / cancel.
if let Err(persist_err) = coord.broker.record_reminder_failure(id, &reason) {
tracing::warn!(
reminder_id = id,
error = ?persist_err,
"failed to persist reminder failure"
);
}
}
}
}