hyperhive/hive-c0re/src/reminder_scheduler.rs

293 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Background loop that drains due reminders out of the broker and
//! delivers them as inbox messages. Mirrors the `events_vacuum` /
//! `crash_watch` shape — a single `spawn(coord)` entry point started
//! from `main.rs`.
//!
//! File-path semantics: a reminder may carry a `file_path` (the
//! agent-visible path inside its container). On delivery we:
//!
//! - Translate the container path (`/agents/<agent>/state/foo.md`) to
//! the host path (`/var/lib/hyperhive/agents/<agent>/state/foo.md`)
//! so hive-c0re can write to it from outside the container.
//! - Reject anything that isn't under the agent's own state subtree,
//! contains `..` (path traversal), or has an empty relative tail.
//! Falling outside the allowed prefix means the file write is
//! skipped and the original message is delivered inline (with a
//! noted warning) — the reminder still fires, just without the
//! payload split.
//! - Defend against symlink escape: after `create_dir_all`, the
//! parent dir is canonicalized and re-verified to live under the
//! agent's host state root. Then we open the final file with
//! `O_NOFOLLOW | O_CREAT | O_TRUNC` so an existing-symlink basename
//! can't redirect the write either. Without this an agent could
//! `ln -s /etc /agents/foo/state/escape` and bounce a write to an
//! arbitrary host path.
//! - Write the reminder body to disk and deliver a short pointer
//! message in its place, so the agent's inbox/wake-prompt stays
//! small and the bulky payload can be read out of band.
//!
//! Atomicity of the inbox INSERT + `reminders.sent_at` UPDATE is handled
//! inside `Broker::deliver_reminder`; this module only computes the
//! body string before calling it.
use std::io::Write;
use std::os::unix::fs::OpenOptionsExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use crate::coordinator::Coordinator;
/// Per-tick cap on reminders delivered; passed as the limit to
/// `Broker::get_due_reminders` on every tick. Anything over this stays
/// due in the table and gets picked up on the next tick — keeps a
/// 10k-deep backlog from flooding the broker (or hogging the broker
/// mutex) in one shot. 100/tick × 5s tick = sustained throughput cap
/// of ~20 reminders/sec; bump this together with `POLL_INTERVAL` if
/// the loose-ends tracker starts firing higher rates.
const REMINDER_BATCH_LIMIT: u64 = 100;
/// Poll interval: how long the scheduler loop sleeps between ticks.
/// Trade-off between latency on a freshly due reminder and CPU spent
/// on empty sweeps; 5s matches the original inline scheduler.
const POLL_INTERVAL: Duration = Duration::from_secs(5);
pub fn spawn(coord: Arc<Coordinator>) {
let mut shutdown = coord.shutdown_rx();
tokio::spawn(async move {
loop {
tick(&coord);
tokio::select! {
_ = tokio::time::sleep(POLL_INTERVAL) => {}
_ = shutdown.changed() => {
tracing::info!("reminder scheduler: shutdown signal received");
break;
}
}
}
});
}
/// Run one scheduler sweep: fetch up to `REMINDER_BATCH_LIMIT` due
/// reminders from the broker and deliver each one.
///
/// A failed query is logged and ends the sweep. A failed delivery is
/// logged, then recorded on the reminder row via
/// `record_reminder_failure`; the sweep carries on with the next
/// reminder either way.
fn tick(coord: &Arc<Coordinator>) {
    let due = match coord.broker.get_due_reminders(REMINDER_BATCH_LIMIT) {
        Err(e) => {
            tracing::warn!(error = ?e, "failed to query due reminders");
            return;
        }
        Ok(rows) => rows,
    };

    for (agent, id, message, file_path) in due {
        // The body is either the message itself or a pointer to the file
        // the message was persisted to.
        let body = prepare_body(&agent, &message, file_path.as_deref());

        let Err(e) = coord.broker.deliver_reminder(id, &agent, &body) else {
            continue;
        };

        let reason = format!("{e:#}");
        tracing::warn!(
            reminder_id = id,
            %agent,
            error = %reason,
            "failed to deliver reminder"
        );

        // Persist the failure so the dashboard can surface it +
        // bump attempt_count. After MAX_REMINDER_ATTEMPTS the
        // row drops out of `get_due_reminders` and waits for
        // operator retry / cancel.
        if let Err(persist_err) = coord.broker.record_reminder_failure(id, &reason) {
            tracing::warn!(
                reminder_id = id,
                error = ?persist_err,
                "failed to persist reminder failure"
            );
        }
    }
}
/// Compute the inbox body for a due reminder.
///
/// - No `file_path`: the message is returned verbatim.
/// - `file_path` set and the write succeeds: the message is persisted
///   to the resolved host file and a short pointer string (container
///   path + byte count) is returned instead.
/// - `file_path` set but rejected by `resolve_host_path`, or the write
///   fails in `write_payload`: the message is delivered inline,
///   prefixed with a note explaining why, so the reminder still fires.
fn prepare_body(agent: &str, message: &str, file_path: Option<&str>) -> String {
    let req_path = match file_path {
        Some(path) => path,
        None => return message.to_owned(),
    };

    // Step 1: container path -> host path (prefix / traversal checks).
    let host_path = match resolve_host_path(agent, req_path) {
        Err(reason) => {
            tracing::warn!(%agent, %req_path, %reason, "reminder file_path rejected; delivering inline");
            let note = format!("rejected: {reason}");
            return inline_fallback(req_path, &note, message);
        }
        Ok(resolved) => resolved,
    };

    // Step 2: write the payload; bail out to inline delivery on failure.
    if let Err(reason) = write_payload(agent, &host_path, message) {
        tracing::warn!(%agent, path = %host_path.display(), %reason, "reminder file_path write failed; delivering inline");
        return inline_fallback(req_path, &reason, message);
    }

    let bytes = message.len();
    // debug! not info! — under load this would dominate the log.
    tracing::debug!(%agent, path = %host_path.display(), bytes, "reminder body written to file");
    format!("reminder body persisted to `{req_path}` ({bytes} bytes); read with your filesystem tools")
}
/// Format the inline-delivery body used when the file write was
/// skipped: a bracketed one-line note naming the requested path and
/// the reason, a blank line, then the original message.
fn inline_fallback(req_path: &str, reason: &str, message: &str) -> String {
    let note = format!("[reminder file_path '{req_path}' {reason}; delivering body inline]");
    [note.as_str(), message].join("\n\n")
}
/// Reject `canonical` unless it is the agent root itself or lives
/// below it. Both paths must already be canonicalized.
fn ensure_under_agent_root(canonical: &Path, agent_root: &Path) -> Result<(), String> {
    if canonical.starts_with(agent_root) {
        Ok(())
    } else {
        Err(format!(
            "symlink escape: canonical parent `{}` outside agent root `{}`",
            canonical.display(),
            agent_root.display()
        ))
    }
}

/// Persist `message` to `host_path` with the symlink-escape defenses
/// described in the module docs. Returns `Ok(())` on success, or a
/// human-readable reason string on any failure (caller logs +
/// inline-falls-back). `pub` because `agent_server::handle_remind`
/// reuses it for the at-remind-time auto-file path.
///
/// Order of operations:
///
/// 1. Make sure the agent's host state root exists and canonicalize it.
/// 2. Canonicalize the deepest *already existing* ancestor of the
///    target's parent dir and verify it lives under the agent root.
///    This runs BEFORE any directory is created: `create_dir_all`
///    follows symlinks in existing components, so without this check
///    `state/escape -> /etc` plus a request for `state/escape/new/x.md`
///    would create `/etc/new` on the host before the write was refused.
/// 3. Create the missing parent dirs, canonicalize the parent, and
///    verify containment again.
/// 4. Open the final component with `O_NOFOLLOW` and write the body.
///
/// # Errors
///
/// Returns a reason string when the path has no parent or basename,
/// when any directory create / canonicalize / open / write call fails,
/// or when a canonical path falls outside the agent's state root.
///
/// NOTE(review): the checks are path-based, so an agent that swaps a
/// directory for a symlink between a check and the following syscall
/// can still race them; closing that fully needs fd-relative
/// traversal (`openat`-style), which this function does not do.
pub fn write_payload(agent: &str, host_path: &Path, message: &str) -> Result<(), String> {
    let Some(parent) = host_path.parent() else {
        return Err("internal: host path has no parent".to_owned());
    };
    // Validate the basename up front so a malformed path fails before
    // anything is created on disk.
    let basename = host_path
        .file_name()
        .ok_or_else(|| "missing basename".to_owned())?;

    // The state root must exist to be canonicalized. Previously it was
    // created implicitly by `create_dir_all(parent)`; do it explicitly
    // so the containment check can run before touching `parent`.
    let agent_root_raw = Coordinator::agent_notes_dir(agent);
    std::fs::create_dir_all(&agent_root_raw)
        .map_err(|e| format!("agent state root create failed: {e}"))?;
    let agent_root = agent_root_raw
        .canonicalize()
        .map_err(|e| format!("agent state root canonicalize failed: {e}"))?;

    // Pre-flight: resolve symlinks in whatever part of the parent chain
    // already exists and refuse to create anything if it has left the
    // agent root.
    let existing = parent
        .ancestors()
        .find(|candidate| candidate.exists())
        .ok_or_else(|| "parent dir has no existing ancestor".to_owned())?;
    let existing_canonical = existing
        .canonicalize()
        .map_err(|e| format!("parent canonicalize failed: {e}"))?;
    ensure_under_agent_root(&existing_canonical, &agent_root)?;

    std::fs::create_dir_all(parent)
        .map_err(|e| format!("parent dir create failed: {e}"))?;

    // Resolve symlinks in the parent chain, then re-verify the
    // canonical form still lives under the agent's host state root —
    // catches `ln -s /etc state/escape` style attacks.
    let parent_canonical = parent
        .canonicalize()
        .map_err(|e| format!("parent canonicalize failed: {e}"))?;
    ensure_under_agent_root(&parent_canonical, &agent_root)?;

    let target = parent_canonical.join(basename);
    // O_NOFOLLOW on the final component refuses to open if the
    // basename is itself an existing symlink. Combined with the
    // canonicalize-parent checks above, a symlink that is in place
    // when the checks run cannot redirect the write.
    let mut file = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .custom_flags(libc::O_NOFOLLOW)
        .open(&target)
        .map_err(|e| format!("open failed: {e}"))?;
    file.write_all(message.as_bytes())
        .map_err(|e| format!("write failed: {e}"))?;
    Ok(())
}
/// Container-visible directory prefix that a caller-supplied
/// `file_path` must start with. The manager agent keeps the legacy
/// `/state/` mount (see `lifecycle::set_nspawn_flags`); every other
/// agent sees its state at `/agents/<name>/state/`. Auto-file paths
/// use the same prefix so the round-trip is symmetric. The returned
/// prefix always ends with a `/`.
#[must_use]
pub fn container_state_prefix(agent: &str) -> String {
    let is_manager = agent == hive_sh4re::MANAGER_AGENT;
    match is_manager {
        true => String::from("/state/"),
        false => format!("/agents/{agent}/state/"),
    }
}
/// Translate an agent-visible container path into the corresponding
/// host path under the agent's state directory.
///
/// The request is accepted only when it starts with the agent's own
/// `container_state_prefix`, has a non-empty remainder after that
/// prefix, and that remainder consists solely of normal path
/// components (so `..`, a leading `.`, or a leading `/` are refused).
/// `pub` so `agent_server::handle_remind` can reuse it for the
/// at-remind-time auto-file path.
///
/// # Errors
///
/// Returns a human-readable reason string describing which rule the
/// requested path violated.
pub fn resolve_host_path(agent: &str, req_path: &str) -> Result<PathBuf, String> {
    let prefix = container_state_prefix(agent);

    let rel = match req_path.strip_prefix(prefix.as_str()) {
        Some(tail) => tail,
        None => {
            return Err(format!(
                "must be absolute and under `{prefix}` (got `{req_path}`)"
            ));
        }
    };
    if rel.is_empty() {
        return Err("file_path must include a filename, not just the state dir".to_owned());
    }

    // Anything other than a plain name component is refused outright.
    let rel_path = Path::new(rel);
    let offending = rel_path
        .components()
        .find(|comp| !matches!(comp, std::path::Component::Normal(_)));
    if let Some(other) = offending {
        return Err(format!(
            "path component `{other:?}` not allowed (no traversal / absolute / root)"
        ));
    }

    Ok(Coordinator::agent_notes_dir(agent).join(rel_path))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Paths outside the agent's own container state prefix are refused.
    #[test]
    fn rejects_paths_outside_agent_state() {
        for bad in ["/etc/passwd", "/agents/bar/state/x.md", "relative.md"] {
            assert!(
                resolve_host_path("foo", bad).is_err(),
                "expected rejection for `{bad}`"
            );
        }
    }

    /// `..` and a leading `.` in the relative tail are refused.
    #[test]
    fn rejects_traversal() {
        for bad in [
            "/agents/foo/state/../../etc/passwd",
            "/agents/foo/state/./x.md",
        ] {
            assert!(
                resolve_host_path("foo", bad).is_err(),
                "expected rejection for `{bad}`"
            );
        }
    }

    #[test]
    fn rejects_empty_relative_tail() {
        // Trailing slash → empty tail. Used to fall through to
        // create_dir_all + write-to-dir → confusing inline fallback;
        // explicit reject gives a cleaner log.
        match resolve_host_path("foo", "/agents/foo/state/") {
            Err(err) => assert!(err.contains("must include a filename"), "got: {err}"),
            Ok(path) => panic!("expected rejection, got {}", path.display()),
        }
    }

    #[test]
    fn accepts_well_formed_path() {
        let expected = PathBuf::from("/var/lib/hyperhive/agents/foo/state/reminders/123.md");
        assert_eq!(
            resolve_host_path("foo", "/agents/foo/state/reminders/123.md"),
            Ok(expected)
        );
    }

    #[test]
    fn manager_uses_legacy_state_prefix() {
        // The manager container mounts its state at `/state/` (legacy),
        // not `/agents/manager/state/`. Same host path; different
        // container-visible path. resolve_host_path needs to know.
        assert_eq!(container_state_prefix("manager"), "/state/");

        let expected = PathBuf::from("/var/lib/hyperhive/agents/manager/state/reminders/x.md");
        assert_eq!(
            resolve_host_path("manager", "/state/reminders/x.md"),
            Ok(expected)
        );

        // And the sub-agent prefix must NOT be accepted for the manager.
        assert!(resolve_host_path("manager", "/agents/manager/state/x.md").is_err());
    }

    #[test]
    fn prepare_body_passthrough_when_no_file_path() {
        assert_eq!(prepare_body("foo", "hello world", None), "hello world");
    }

    #[test]
    fn prepare_body_falls_back_inline_on_bad_path() {
        let body = prepare_body("foo", "payload", Some("/etc/passwd"));
        assert!(body.starts_with("[reminder file_path '/etc/passwd' rejected:"));
        assert!(body.contains("payload"));
    }
}