reminders: persist + surface delivery failures

Broker schema gains attempt_count INTEGER + last_error TEXT
columns via idempotent ALTER TABLE migration (pragma-probed so
fresh + existing dbs converge). reminder_scheduler::tick calls
record_reminder_failure on every deliver_reminder error,
bumping the counter + stashing the message. get_due_reminders
filters out rows where attempt_count >= MAX_REMINDER_ATTEMPTS
(5) so the scheduler stops retrying a stuck row until the
operator intervenes.

new POST /retry-reminder/{id} → reset_reminder_failure clears
the counters; next 5s tick re-attempts. cancel-reminder
unchanged (hard-delete).

dashboard renders failed rows with a red left rule, the error
text inline, and a ⚠ N failed badge. ↻ R3TRY button appears
when attempt_count > 0 — sits next to ✗ C4NC3L in a small
actions row below the body.
This commit is contained in:
müde 2026-05-18 00:08:09 +02:00
parent d395bdc945
commit 978a3cf391
5 changed files with 173 additions and 8 deletions

View file

@ -1349,7 +1349,8 @@
} }
const ul = el('ul', { class: 'reminders' }); const ul = el('ul', { class: 'reminders' });
for (const r of rows) { for (const r of rows) {
const li = el('li', { class: 'reminder-row' }); const failed = (r.attempt_count || 0) > 0;
const li = el('li', { class: 'reminder-row' + (failed ? ' reminder-failed' : '') });
const dueIn = r.due_at - Math.floor(Date.now() / 1000); const dueIn = r.due_at - Math.floor(Date.now() / 1000);
const dueLabel = dueIn <= 0 const dueLabel = dueIn <= 0
? `overdue ${fmtAgo(r.due_at)}` ? `overdue ${fmtAgo(r.due_at)}`
@ -1364,19 +1365,45 @@
head.append(' ', el('span', { class: 'meta' }, '· payload → ')); head.append(' ', el('span', { class: 'meta' }, '· payload → '));
appendLinkified(head, r.file_path); appendLinkified(head, r.file_path);
} }
if (failed) {
head.append(' ', el('span',
{
class: 'badge badge-warn',
title: 'consecutive failed delivery attempts (capped at 5; over the cap the scheduler stops retrying until you click R3TRY or cancel)',
},
`${r.attempt_count} failed`));
}
const body = el('div', { class: 'reminder-body' }); const body = el('div', { class: 'reminder-body' });
const previews = appendLinkified(body, r.message); const previews = appendLinkified(body, r.message);
li.append(head, body); li.append(head, body);
for (const d of previews) li.appendChild(d); for (const d of previews) li.appendChild(d);
// Cancel form omits `data-no-refresh` — the resulting refreshState if (r.last_error) {
// re-fires refreshReminders so the row drops on its own. li.append(el('div', { class: 'reminder-error' },
el('span', { class: 'msg-sep' }, 'error: '),
r.last_error,
));
}
const actions = el('div', { class: 'reminder-actions' });
if (failed) {
// Retry resets the failure counters so the scheduler picks
// the row up again on its next 5s tick. No data-no-refresh
// — the resulting refreshState re-fires refreshReminders.
const retryForm = el('form', {
method: 'POST', action: '/retry-reminder/' + r.id,
class: 'inline', 'data-async': '',
});
retryForm.append(el('button',
{ type: 'submit', class: 'btn btn-restart' }, '↻ R3TRY'));
actions.append(retryForm);
}
const cancelForm = el('form', { const cancelForm = el('form', {
method: 'POST', action: '/cancel-reminder/' + r.id, method: 'POST', action: '/cancel-reminder/' + r.id,
class: 'inline', 'data-async': '', class: 'inline', 'data-async': '',
'data-confirm': `cancel reminder ${r.id} for ${r.agent}? this drops the queued delivery; no undo.`, 'data-confirm': `cancel reminder ${r.id} for ${r.agent}? this drops the queued delivery; no undo.`,
}); });
cancelForm.append(el('button', { type: 'submit', class: 'btn btn-deny' }, '✗ C4NC3L')); cancelForm.append(el('button', { type: 'submit', class: 'btn btn-deny' }, '✗ C4NC3L'));
li.append(cancelForm); actions.append(cancelForm);
li.append(actions);
ul.append(li); ul.append(li);
} }
root.append(ul); root.append(ul);

View file

@ -474,6 +474,25 @@ summary:hover { color: var(--purple); }
word-break: break-word; word-break: break-word;
margin: 0.3em 0; margin: 0.3em 0;
} }
.reminder-row.reminder-failed {
border-left: 2px solid var(--red, #f38ba8);
padding-left: 0.5em;
}
.reminder-error {
color: var(--red, #f38ba8);
background: rgba(243, 139, 168, 0.06);
border: 1px solid rgba(243, 139, 168, 0.25);
padding: 0.3em 0.5em;
font-size: 0.85em;
white-space: pre-wrap;
word-break: break-word;
margin: 0.2em 0;
}
.reminder-actions {
display: flex;
gap: 0.4em;
margin-top: 0.3em;
}
/* Path linkification agents drop pointer strings into messages /* Path linkification agents drop pointer strings into messages
constantly; clicking the anchor expands a sibling <details> that constantly; clicking the anchor expands a sibling <details> that

View file

@ -57,8 +57,26 @@ pub struct PendingReminder {
pub file_path: Option<String>, pub file_path: Option<String>,
pub due_at: i64, pub due_at: i64,
pub created_at: i64, pub created_at: i64,
/// Most recent delivery failure for this row, if any. Cleared
/// to NULL on operator retry. Surfaced inline in the dashboard
/// so a stuck reminder doesn't just silently retry forever.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_error: Option<String>,
/// Number of failed delivery attempts since the row was
/// created or last retried. After `MAX_REMINDER_ATTEMPTS` the
/// scheduler stops trying (the row stays in `pending` with the
/// error so the operator can decide between retry + cancel).
#[serde(default)]
pub attempt_count: u32,
} }
/// Stop retrying a row after this many consecutive failures. The
/// scheduler quits scheduling it until an operator explicitly
/// retries (which resets the counter) or cancels (which deletes
/// the row). Below the cap the existing 5s tick re-attempts each
/// time the row is due.
pub const MAX_REMINDER_ATTEMPTS: u32 = 5;
/// Intra-process broker event. `recv_blocking` listens on the same /// Intra-process broker event. `recv_blocking` listens on the same
/// channel as the dashboard forwarder; the forwarder re-emits each /// channel as the dashboard forwarder; the forwarder re-emits each
/// event as a `DashboardEvent` with a freshly-stamped seq from the /// event as a `DashboardEvent` with a freshly-stamped seq from the
@ -95,6 +113,7 @@ impl Broker {
let conn = let conn =
Connection::open(path).with_context(|| format!("open broker db {}", path.display()))?; Connection::open(path).with_context(|| format!("open broker db {}", path.display()))?;
conn.execute_batch(SCHEMA).context("apply broker schema")?; conn.execute_batch(SCHEMA).context("apply broker schema")?;
ensure_reminder_columns(&conn).context("migrate reminders columns")?;
let (events, _) = broadcast::channel(EVENT_CHANNEL); let (events, _) = broadcast::channel(EVENT_CHANNEL);
Ok(Self { Ok(Self {
conn: Mutex::new(conn), conn: Mutex::new(conn),
@ -305,12 +324,14 @@ impl Broker {
pub fn list_pending_reminders(&self) -> Result<Vec<PendingReminder>> { pub fn list_pending_reminders(&self) -> Result<Vec<PendingReminder>> {
let conn = self.conn.lock().unwrap(); let conn = self.conn.lock().unwrap();
let mut stmt = conn.prepare( let mut stmt = conn.prepare(
"SELECT id, agent, message, file_path, due_at, created_at \ "SELECT id, agent, message, file_path, due_at, created_at, \
last_error, attempt_count \
FROM reminders \ FROM reminders \
WHERE sent_at IS NULL \ WHERE sent_at IS NULL \
ORDER BY due_at ASC", ORDER BY due_at ASC",
)?; )?;
let rows = stmt.query_map([], |row| { let rows = stmt.query_map([], |row| {
let attempts: i64 = row.get(7)?;
Ok(PendingReminder { Ok(PendingReminder {
id: row.get(0)?, id: row.get(0)?,
agent: row.get(1)?, agent: row.get(1)?,
@ -318,12 +339,46 @@ impl Broker {
file_path: row.get(3)?, file_path: row.get(3)?,
due_at: row.get(4)?, due_at: row.get(4)?,
created_at: row.get(5)?, created_at: row.get(5)?,
last_error: row.get(6)?,
attempt_count: u32::try_from(attempts).unwrap_or(0),
}) })
})?; })?;
rows.collect::<rusqlite::Result<Vec<_>>>() rows.collect::<rusqlite::Result<Vec<_>>>()
.context("list pending reminders") .context("list pending reminders")
} }
/// Mark a delivery attempt as failed: bump `attempt_count` and
/// stash the error string. Called by `reminder_scheduler::tick`
/// when `deliver_reminder` returns Err. Soft-cap behaviour
/// lives in `get_due_reminders` (rows over the cap drop out
/// of the due-list and stop being attempted until retry).
pub fn record_reminder_failure(&self, id: i64, reason: &str) -> Result<()> {
let conn = self.conn.lock().unwrap();
conn.execute(
"UPDATE reminders \
SET attempt_count = attempt_count + 1, last_error = ?1 \
WHERE id = ?2 AND sent_at IS NULL",
params![reason, id],
)?;
Ok(())
}
/// Clear the failure state on a pending reminder so the
/// scheduler picks it up again. No-op when the row is already
/// fresh (attempt_count == 0). Returns the number of rows
/// affected so callers can distinguish "retried" from "no
/// such pending reminder" (already delivered, or wrong id).
pub fn reset_reminder_failure(&self, id: i64) -> Result<usize> {
let conn = self.conn.lock().unwrap();
let n = conn.execute(
"UPDATE reminders \
SET attempt_count = 0, last_error = NULL \
WHERE id = ?1 AND sent_at IS NULL",
params![id],
)?;
Ok(n)
}
/// Count this agent's still-pending (un-delivered) reminders. /// Count this agent's still-pending (un-delivered) reminders.
/// Used by the per-turn stats sink for a cheap "what was queued /// Used by the per-turn stats sink for a cheap "what was queued
/// at turn-end" snapshot. /// at turn-end" snapshot.
@ -357,13 +412,16 @@ impl Broker {
pub fn get_due_reminders(&self, limit: u64) -> Result<Vec<DueReminder>> { pub fn get_due_reminders(&self, limit: u64) -> Result<Vec<DueReminder>> {
let conn = self.conn.lock().unwrap(); let conn = self.conn.lock().unwrap();
let limit_i = i64::try_from(limit.min(i64::MAX as u64)).unwrap_or(i64::MAX); let limit_i = i64::try_from(limit.min(i64::MAX as u64)).unwrap_or(i64::MAX);
let max_attempts = i64::from(MAX_REMINDER_ATTEMPTS);
// attempt_count >= cap = give up; row stays pending so the
// operator sees + can retry/cancel via the dashboard.
let mut stmt = conn.prepare( let mut stmt = conn.prepare(
"SELECT agent, id, message, file_path FROM reminders \ "SELECT agent, id, message, file_path FROM reminders \
WHERE due_at <= ?1 AND sent_at IS NULL \ WHERE due_at <= ?1 AND sent_at IS NULL AND attempt_count < ?3 \
ORDER BY agent, due_at ASC \ ORDER BY agent, due_at ASC \
LIMIT ?2", LIMIT ?2",
)?; )?;
let rows = stmt.query_map(params![now_unix(), limit_i], |row| { let rows = stmt.query_map(params![now_unix(), limit_i, max_attempts], |row| {
Ok(( Ok((
row.get::<_, String>(0)?, row.get::<_, String>(0)?,
row.get::<_, i64>(1)?, row.get::<_, i64>(1)?,
@ -410,6 +468,35 @@ impl Broker {
} }
} }
/// Idempotent reminder-table migrations. `ALTER TABLE ADD COLUMN`
/// has no `IF NOT EXISTS` form in sqlite, so we probe
/// `pragma_table_info` per column. New deploys (table created by
/// SCHEMA in this commit cycle) skip the ALTER; pre-existing
/// broker.sqlite files get the columns added on next boot.
fn ensure_reminder_columns(conn: &Connection) -> Result<()> {
for (name, sql) in [
(
"attempt_count",
"ALTER TABLE reminders ADD COLUMN attempt_count INTEGER NOT NULL DEFAULT 0;",
),
(
"last_error",
"ALTER TABLE reminders ADD COLUMN last_error TEXT;",
),
] {
let has: bool = conn
.prepare(&format!(
"SELECT 1 FROM pragma_table_info('reminders') WHERE name = '{name}'"
))?
.exists([])?;
if !has {
conn.execute_batch(sql)
.with_context(|| format!("add reminders.{name} column"))?;
}
}
Ok(())
}
fn now_unix() -> i64 { fn now_unix() -> i64 {
SystemTime::now() SystemTime::now()
.duration_since(UNIX_EPOCH) .duration_since(UNIX_EPOCH)

View file

@ -57,6 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
.route("/api/state-file", get(get_state_file)) .route("/api/state-file", get(get_state_file))
.route("/api/reminders", get(api_reminders)) .route("/api/reminders", get(api_reminders))
.route("/cancel-reminder/{id}", post(post_cancel_reminder)) .route("/cancel-reminder/{id}", post(post_cancel_reminder))
.route("/retry-reminder/{id}", post(post_retry_reminder))
.route("/api/agent-config/{name}", get(get_agent_config)) .route("/api/agent-config/{name}", get(get_agent_config))
.route("/request-spawn", post(post_request_spawn)) .route("/request-spawn", post(post_request_spawn))
.route("/op-send", post(post_op_send)) .route("/op-send", post(post_op_send))
@ -1126,6 +1127,25 @@ async fn post_cancel_reminder(
} }
} }
/// Reset a pending reminder's failure state so the scheduler
/// retries it on the next tick. Useful when the failure was
/// transient (sqlite lock contention, disk full → freed up) and
/// the operator wants delivery to resume immediately instead of
/// the row sitting in attempt-count-capped purgatory.
async fn post_retry_reminder(
State(state): State<AppState>,
AxumPath(id): AxumPath<i64>,
) -> Response {
match state.coord.broker.reset_reminder_failure(id) {
Ok(0) => error_response(&format!("reminder {id} not pending (already delivered?)")),
Ok(_) => {
tracing::info!(%id, "operator reset reminder failure for retry");
(StatusCode::OK, "ok").into_response()
}
Err(e) => error_response(&format!("retry reminder {id} failed: {e:#}")),
}
}
async fn post_purge_tombstone( async fn post_purge_tombstone(
State(state): State<AppState>, State(state): State<AppState>,
AxumPath(name): AxumPath<String>, AxumPath(name): AxumPath<String>,

View file

@ -71,12 +71,24 @@ fn tick(coord: &Arc<Coordinator>) {
for (agent, id, message, file_path) in due { for (agent, id, message, file_path) in due {
let body = prepare_body(&agent, &message, file_path.as_deref()); let body = prepare_body(&agent, &message, file_path.as_deref());
if let Err(e) = coord.broker.deliver_reminder(id, &agent, &body) { if let Err(e) = coord.broker.deliver_reminder(id, &agent, &body) {
let reason = format!("{e:#}");
tracing::warn!( tracing::warn!(
reminder_id = id, reminder_id = id,
%agent, %agent,
error = ?e, error = %reason,
"failed to deliver reminder" "failed to deliver reminder"
); );
// Persist the failure so the dashboard can surface it +
// bump attempt_count. After MAX_REMINDER_ATTEMPTS the
// row drops out of `get_due_reminders` and waits for
// operator retry / cancel.
if let Err(persist_err) = coord.broker.record_reminder_failure(id, &reason) {
tracing::warn!(
reminder_id = id,
error = ?persist_err,
"failed to persist reminder failure"
);
}
} }
} }
} }