reminders: persist + surface delivery failures
Broker schema gains attempt_count INTEGER + last_error TEXT
columns via idempotent ALTER TABLE migration (pragma-probed so
fresh + existing dbs converge). reminder_scheduler::tick calls
record_reminder_failure on every deliver_reminder error,
bumping the counter + stashing the message. get_due_reminders
filters out rows where attempt_count >= MAX_REMINDER_ATTEMPTS
(5) so the scheduler stops retrying a stuck row until the
operator intervenes.
new POST /retry-reminder/{id} → reset_reminder_failure clears
the counters; next 5s tick re-attempts. cancel-reminder
unchanged (hard-delete).
dashboard renders failed rows with a red left rule, the error
text inline, and a ⚠ N failed badge. ↻ R3TRY button appears
when attempt_count > 0 — sits next to ✗ C4NC3L in a small
actions row below the body.
This commit is contained in:
parent
d395bdc945
commit
978a3cf391
5 changed files with 173 additions and 8 deletions
|
|
@ -57,8 +57,26 @@ pub struct PendingReminder {
|
|||
pub file_path: Option<String>,
|
||||
pub due_at: i64,
|
||||
pub created_at: i64,
|
||||
/// Most recent delivery failure for this row, if any. Cleared
|
||||
/// to NULL on operator retry. Surfaced inline in the dashboard
|
||||
/// so a stuck reminder doesn't just silently retry forever.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub last_error: Option<String>,
|
||||
/// Number of failed delivery attempts since the row was
|
||||
/// created or last retried. After `MAX_REMINDER_ATTEMPTS` the
|
||||
/// scheduler stops trying (the row stays in `pending` with the
|
||||
/// error so the operator can decide between retry + cancel).
|
||||
#[serde(default)]
|
||||
pub attempt_count: u32,
|
||||
}
|
||||
|
||||
/// Stop retrying a row after this many consecutive failures. The
|
||||
/// scheduler quits scheduling it until an operator explicitly
|
||||
/// retries (which resets the counter) or cancels (which deletes
|
||||
/// the row). Below the cap the existing 5s tick re-attempts each
|
||||
/// time the row is due.
|
||||
pub const MAX_REMINDER_ATTEMPTS: u32 = 5;
|
||||
|
||||
/// Intra-process broker event. `recv_blocking` listens on the same
|
||||
/// channel as the dashboard forwarder; the forwarder re-emits each
|
||||
/// event as a `DashboardEvent` with a freshly-stamped seq from the
|
||||
|
|
@ -95,6 +113,7 @@ impl Broker {
|
|||
let conn =
|
||||
Connection::open(path).with_context(|| format!("open broker db {}", path.display()))?;
|
||||
conn.execute_batch(SCHEMA).context("apply broker schema")?;
|
||||
ensure_reminder_columns(&conn).context("migrate reminders columns")?;
|
||||
let (events, _) = broadcast::channel(EVENT_CHANNEL);
|
||||
Ok(Self {
|
||||
conn: Mutex::new(conn),
|
||||
|
|
@ -305,12 +324,14 @@ impl Broker {
|
|||
pub fn list_pending_reminders(&self) -> Result<Vec<PendingReminder>> {
|
||||
let conn = self.conn.lock().unwrap();
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT id, agent, message, file_path, due_at, created_at \
|
||||
"SELECT id, agent, message, file_path, due_at, created_at, \
|
||||
last_error, attempt_count \
|
||||
FROM reminders \
|
||||
WHERE sent_at IS NULL \
|
||||
ORDER BY due_at ASC",
|
||||
)?;
|
||||
let rows = stmt.query_map([], |row| {
|
||||
let attempts: i64 = row.get(7)?;
|
||||
Ok(PendingReminder {
|
||||
id: row.get(0)?,
|
||||
agent: row.get(1)?,
|
||||
|
|
@ -318,12 +339,46 @@ impl Broker {
|
|||
file_path: row.get(3)?,
|
||||
due_at: row.get(4)?,
|
||||
created_at: row.get(5)?,
|
||||
last_error: row.get(6)?,
|
||||
attempt_count: u32::try_from(attempts).unwrap_or(0),
|
||||
})
|
||||
})?;
|
||||
rows.collect::<rusqlite::Result<Vec<_>>>()
|
||||
.context("list pending reminders")
|
||||
}
|
||||
|
||||
/// Mark a delivery attempt as failed: bump `attempt_count` and
|
||||
/// stash the error string. Called by `reminder_scheduler::tick`
|
||||
/// when `deliver_reminder` returns Err. Soft-cap behaviour
|
||||
/// lives in `get_due_reminders` (rows over the cap drop out
|
||||
/// of the due-list and stop being attempted until retry).
|
||||
pub fn record_reminder_failure(&self, id: i64, reason: &str) -> Result<()> {
|
||||
let conn = self.conn.lock().unwrap();
|
||||
conn.execute(
|
||||
"UPDATE reminders \
|
||||
SET attempt_count = attempt_count + 1, last_error = ?1 \
|
||||
WHERE id = ?2 AND sent_at IS NULL",
|
||||
params![reason, id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear the failure state on a pending reminder so the
|
||||
/// scheduler picks it up again. No-op when the row is already
|
||||
/// fresh (attempt_count == 0). Returns the number of rows
|
||||
/// affected so callers can distinguish "retried" from "no
|
||||
/// such pending reminder" (already delivered, or wrong id).
|
||||
pub fn reset_reminder_failure(&self, id: i64) -> Result<usize> {
|
||||
let conn = self.conn.lock().unwrap();
|
||||
let n = conn.execute(
|
||||
"UPDATE reminders \
|
||||
SET attempt_count = 0, last_error = NULL \
|
||||
WHERE id = ?1 AND sent_at IS NULL",
|
||||
params![id],
|
||||
)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Count this agent's still-pending (un-delivered) reminders.
|
||||
/// Used by the per-turn stats sink for a cheap "what was queued
|
||||
/// at turn-end" snapshot.
|
||||
|
|
@ -357,13 +412,16 @@ impl Broker {
|
|||
pub fn get_due_reminders(&self, limit: u64) -> Result<Vec<DueReminder>> {
|
||||
let conn = self.conn.lock().unwrap();
|
||||
let limit_i = i64::try_from(limit.min(i64::MAX as u64)).unwrap_or(i64::MAX);
|
||||
let max_attempts = i64::from(MAX_REMINDER_ATTEMPTS);
|
||||
// attempt_count >= cap = give up; row stays pending so the
|
||||
// operator sees + can retry/cancel via the dashboard.
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT agent, id, message, file_path FROM reminders \
|
||||
WHERE due_at <= ?1 AND sent_at IS NULL \
|
||||
WHERE due_at <= ?1 AND sent_at IS NULL AND attempt_count < ?3 \
|
||||
ORDER BY agent, due_at ASC \
|
||||
LIMIT ?2",
|
||||
)?;
|
||||
let rows = stmt.query_map(params![now_unix(), limit_i], |row| {
|
||||
let rows = stmt.query_map(params![now_unix(), limit_i, max_attempts], |row| {
|
||||
Ok((
|
||||
row.get::<_, String>(0)?,
|
||||
row.get::<_, i64>(1)?,
|
||||
|
|
@ -410,6 +468,35 @@ impl Broker {
|
|||
}
|
||||
}
|
||||
|
||||
/// Idempotent reminder-table migrations. `ALTER TABLE ADD COLUMN`
|
||||
/// has no `IF NOT EXISTS` form in sqlite, so we probe
|
||||
/// `pragma_table_info` per column. New deploys (table created by
|
||||
/// SCHEMA in this commit cycle) skip the ALTER; pre-existing
|
||||
/// broker.sqlite files get the columns added on next boot.
|
||||
fn ensure_reminder_columns(conn: &Connection) -> Result<()> {
|
||||
for (name, sql) in [
|
||||
(
|
||||
"attempt_count",
|
||||
"ALTER TABLE reminders ADD COLUMN attempt_count INTEGER NOT NULL DEFAULT 0;",
|
||||
),
|
||||
(
|
||||
"last_error",
|
||||
"ALTER TABLE reminders ADD COLUMN last_error TEXT;",
|
||||
),
|
||||
] {
|
||||
let has: bool = conn
|
||||
.prepare(&format!(
|
||||
"SELECT 1 FROM pragma_table_info('reminders') WHERE name = '{name}'"
|
||||
))?
|
||||
.exists([])?;
|
||||
if !has {
|
||||
conn.execute_batch(sql)
|
||||
.with_context(|| format!("add reminders.{name} column"))?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn now_unix() -> i64 {
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ pub async fn serve(port: u16, coord: Arc<Coordinator>) -> Result<()> {
|
|||
.route("/api/state-file", get(get_state_file))
|
||||
.route("/api/reminders", get(api_reminders))
|
||||
.route("/cancel-reminder/{id}", post(post_cancel_reminder))
|
||||
.route("/retry-reminder/{id}", post(post_retry_reminder))
|
||||
.route("/api/agent-config/{name}", get(get_agent_config))
|
||||
.route("/request-spawn", post(post_request_spawn))
|
||||
.route("/op-send", post(post_op_send))
|
||||
|
|
@ -1126,6 +1127,25 @@ async fn post_cancel_reminder(
|
|||
}
|
||||
}
|
||||
|
||||
/// Reset a pending reminder's failure state so the scheduler
|
||||
/// retries it on the next tick. Useful when the failure was
|
||||
/// transient (sqlite lock contention, disk full → freed up) and
|
||||
/// the operator wants delivery to resume immediately instead of
|
||||
/// the row sitting in attempt-count-capped purgatory.
|
||||
async fn post_retry_reminder(
|
||||
State(state): State<AppState>,
|
||||
AxumPath(id): AxumPath<i64>,
|
||||
) -> Response {
|
||||
match state.coord.broker.reset_reminder_failure(id) {
|
||||
Ok(0) => error_response(&format!("reminder {id} not pending (already delivered?)")),
|
||||
Ok(_) => {
|
||||
tracing::info!(%id, "operator reset reminder failure for retry");
|
||||
(StatusCode::OK, "ok").into_response()
|
||||
}
|
||||
Err(e) => error_response(&format!("retry reminder {id} failed: {e:#}")),
|
||||
}
|
||||
}
|
||||
|
||||
async fn post_purge_tombstone(
|
||||
State(state): State<AppState>,
|
||||
AxumPath(name): AxumPath<String>,
|
||||
|
|
|
|||
|
|
@ -71,12 +71,24 @@ fn tick(coord: &Arc<Coordinator>) {
|
|||
for (agent, id, message, file_path) in due {
|
||||
let body = prepare_body(&agent, &message, file_path.as_deref());
|
||||
if let Err(e) = coord.broker.deliver_reminder(id, &agent, &body) {
|
||||
let reason = format!("{e:#}");
|
||||
tracing::warn!(
|
||||
reminder_id = id,
|
||||
%agent,
|
||||
error = ?e,
|
||||
error = %reason,
|
||||
"failed to deliver reminder"
|
||||
);
|
||||
// Persist the failure so the dashboard can surface it +
|
||||
// bump attempt_count. After MAX_REMINDER_ATTEMPTS the
|
||||
// row drops out of `get_due_reminders` and waits for
|
||||
// operator retry / cancel.
|
||||
if let Err(persist_err) = coord.broker.record_reminder_failure(id, &reason) {
|
||||
tracing::warn!(
|
||||
reminder_id = id,
|
||||
error = ?persist_err,
|
||||
"failed to persist reminder failure"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue