hyperhive/hive-c0re/src/manager_server.rs
2026-05-16 20:45:19 +02:00

416 lines
16 KiB
Rust

//! Manager socket listener. Privileged tool surface: agent-style send/recv
//! plus lifecycle verbs (Phase 4). `RequestSpawn` and `RequestApplyCommit`
//! are already queued through the approval flow; `Kill`, `Start`, `Restart`
//! and `Update` still act immediately. Phase 5 is planned to gate `Kill`
//! behind the commit-approval flow as well.
use std::sync::Arc;
use anyhow::{Context, Result};
use hive_sh4re::{MANAGER_AGENT, ManagerRequest, ManagerResponse, Message};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use tokio::net::{UnixListener, UnixStream};
use crate::coordinator::Coordinator;
use crate::lifecycle;
pub fn start(coord: Arc<Coordinator>) -> Result<()> {
let dir = Coordinator::manager_dir();
std::fs::create_dir_all(&dir)
.with_context(|| format!("create manager dir {}", dir.display()))?;
let socket = Coordinator::manager_socket_path();
if socket.exists() {
std::fs::remove_file(&socket).context("remove stale manager socket")?;
}
let listener = UnixListener::bind(&socket)
.with_context(|| format!("bind manager socket {}", socket.display()))?;
tracing::info!(socket = %socket.display(), "manager socket listening");
tokio::spawn(async move {
loop {
match listener.accept().await {
Ok((stream, _)) => {
let coord = coord.clone();
tokio::spawn(async move {
if let Err(e) = serve(stream, coord).await {
tracing::warn!(error = ?e, "manager connection failed");
}
});
}
Err(e) => {
tracing::warn!(error = ?e, "manager listener accept failed");
return;
}
}
}
});
Ok(())
}
/// Serve one manager connection using newline-delimited JSON.
///
/// Each input line is parsed as a `ManagerRequest` and answered with one
/// JSON-encoded `ManagerResponse` followed by `\n`. A line that fails to
/// parse yields a `ManagerResponse::Err` reply and the connection stays
/// open. Returns `Ok(())` when the peer closes the stream (zero-byte read).
///
/// # Errors
///
/// Propagates read, write, flush and response-serialisation failures.
async fn serve(stream: UnixStream, coord: Arc<Coordinator>) -> Result<()> {
    let (rx, mut tx) = stream.into_split();
    let mut reader = BufReader::new(rx);
    // One buffer reused for every request line on this connection.
    let mut buf = String::new();
    while reader.read_line(&mut buf).await? != 0 {
        let response = match serde_json::from_str::<ManagerRequest>(buf.trim()) {
            Ok(request) => dispatch(&request, &coord).await,
            Err(e) => ManagerResponse::Err {
                message: format!("parse error: {e}"),
            },
        };
        let mut encoded = serde_json::to_string(&response)?;
        encoded.push('\n');
        tx.write_all(encoded.as_bytes()).await?;
        tx.flush().await?;
        buf.clear();
    }
    Ok(())
}
/// Upper bound on how long a manager `Recv` may long-poll.
const MANAGER_RECV_LONG_POLL_MAX: std::time::Duration = std::time::Duration::from_secs(180);

/// Translate the optional `wait_seconds` of a manager `Recv` into the
/// timeout handed to the broker.
///
/// `None` and `Some(0)` both give a zero duration; any positive value is
/// taken as seconds and clamped to [`MANAGER_RECV_LONG_POLL_MAX`].
fn manager_recv_timeout(wait_seconds: Option<u64>) -> std::time::Duration {
    wait_seconds.map_or(std::time::Duration::ZERO, |secs| {
        std::time::Duration::from_secs(secs).min(MANAGER_RECV_LONG_POLL_MAX)
    })
}
/// Execute one decoded manager request and build the reply for it.
///
/// This function never returns a Rust error: every failure is folded into
/// `ManagerResponse::Err`, with the underlying error rendered via `{e:#}`.
///
/// `Kill`, `Start` and `Restart` refuse to act on the manager itself
/// (`lifecycle::MANAGER_NAME`). `RequestSpawn` and `RequestApplyCommit`
/// only queue an approval; the other lifecycle verbs act immediately.
// One flat match over every request variant reads better than a web of
// tiny per-variant functions, hence the length.
#[allow(clippy::too_many_lines)]
async fn dispatch(req: &ManagerRequest, coord: &Arc<Coordinator>) -> ManagerResponse {
    match req {
        ManagerRequest::Send { to, body } => {
            // "*" is the broadcast address; anything else is one recipient.
            if to == "*" {
                let errors = coord.broadcast_send(MANAGER_AGENT, body);
                if errors.is_empty() {
                    ManagerResponse::Ok
                } else {
                    ManagerResponse::Err {
                        message: format!("broadcast failed for agents: {}", errors.join(", ")),
                    }
                }
            } else {
                match coord.broker.send(&Message {
                    from: MANAGER_AGENT.to_owned(),
                    to: to.clone(),
                    body: body.clone(),
                }) {
                    Ok(()) => ManagerResponse::Ok,
                    Err(e) => ManagerResponse::Err {
                        message: format!("{e:#}"),
                    },
                }
            }
        }
        // Message addressed to the manager, sent under the operator's name.
        ManagerRequest::OperatorMsg { body } => match coord.broker.send(&Message {
            from: hive_sh4re::OPERATOR_RECIPIENT.to_owned(),
            to: MANAGER_AGENT.to_owned(),
            body: body.clone(),
        }) {
            Ok(()) => ManagerResponse::Ok,
            Err(e) => ManagerResponse::Err {
                message: format!("{e:#}"),
            },
        },
        ManagerRequest::Status => match coord.broker.count_pending(MANAGER_AGENT) {
            Ok(unread) => ManagerResponse::Status { unread },
            Err(e) => ManagerResponse::Err {
                message: format!("{e:#}"),
            },
        },
        ManagerRequest::Recent { limit } => match coord.broker.recent_for(MANAGER_AGENT, *limit) {
            Ok(rows) => ManagerResponse::Recent { rows },
            Err(e) => ManagerResponse::Err {
                message: format!("{e:#}"),
            },
        },
        // Long-poll receive; the wait is clamped by `manager_recv_timeout`.
        ManagerRequest::Recv { wait_seconds } => match coord
            .broker
            .recv_blocking(MANAGER_AGENT, manager_recv_timeout(*wait_seconds))
            .await
        {
            Ok(Some(msg)) => ManagerResponse::Message {
                from: msg.from,
                body: msg.body,
            },
            Ok(None) => ManagerResponse::Empty,
            Err(e) => ManagerResponse::Err {
                message: format!("{e:#}"),
            },
        },
        // Spawning is not performed here: it only queues an approval row.
        ManagerRequest::RequestSpawn { name, description } => {
            tracing::info!(%name, "manager: request_spawn");
            match coord.approvals.submit_kind(
                name,
                hive_sh4re::ApprovalKind::Spawn,
                "",
                description.as_deref(),
            ) {
                Ok(id) => {
                    tracing::info!(%id, %name, "spawn approval queued");
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::Kill { name } => {
            tracing::info!(%name, "manager: kill");
            if name == lifecycle::MANAGER_NAME {
                return ManagerResponse::Err {
                    message: "refusing to kill the manager".into(),
                };
            }
            // Unregister only after the kill itself succeeded.
            let result: Result<()> = async {
                lifecycle::kill(name).await?;
                coord.unregister_agent(name);
                Ok(())
            }
            .await;
            match result {
                Ok(()) => {
                    coord.notify_manager(&hive_sh4re::HelperEvent::Killed {
                        agent: name.clone(),
                    });
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::Start { name } => {
            tracing::info!(%name, "manager: start");
            if name == lifecycle::MANAGER_NAME {
                return ManagerResponse::Err {
                    message: "refusing to start the manager from itself".into(),
                };
            }
            match lifecycle::start(name).await {
                Ok(()) => {
                    coord.kick_agent(name, "container started");
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::Restart { name } => {
            tracing::info!(%name, "manager: restart");
            if name == lifecycle::MANAGER_NAME {
                return ManagerResponse::Err {
                    message: "refusing to restart the manager from itself".into(),
                };
            }
            match lifecycle::restart(name).await {
                Ok(()) => {
                    coord.kick_agent(name, "container restarted");
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::Update { name } => {
            tracing::info!(%name, "manager: update");
            let Some(current_rev) = crate::auto_update::current_flake_rev(&coord.hyperhive_flake)
            else {
                return ManagerResponse::Err {
                    message: "update: hyperhive_flake has no canonical path".into(),
                };
            };
            // The guard is held only for the duration of the rebuild and is
            // released before the agent is kicked.
            let guard = coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding);
            let result = crate::auto_update::rebuild_agent(coord, name, &current_rev).await;
            drop(guard);
            match result {
                Ok(()) => {
                    coord.kick_agent(name, "container rebuilt");
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::AskOperator {
            question,
            options,
            multi,
            ttl_seconds,
        } => {
            tracing::info!(%question, ?options, multi, ?ttl_seconds, "manager: ask_operator");
            // Absolute deadline in unix seconds. It is `None` when no ttl was
            // given, when the ttl does not fit in an i64, or when adding it
            // to "now" would overflow. `checked_add` keeps a huge ttl from
            // panicking (debug) or wrapping to a bogus deadline (release).
            let deadline_at = ttl_seconds.and_then(|s| {
                let now = std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .ok()
                    .and_then(|d| i64::try_from(d.as_secs()).ok())
                    .unwrap_or(0);
                i64::try_from(s).ok().and_then(|s| now.checked_add(s))
            });
            match coord
                .questions
                .submit(MANAGER_AGENT, question, options, *multi, deadline_at)
            {
                Ok(id) => {
                    tracing::info!(%id, ?deadline_at, "operator question queued");
                    // The watchdog resolves the question once the ttl elapses.
                    if let Some(ttl) = *ttl_seconds {
                        spawn_question_watchdog(coord, id, ttl);
                    }
                    ManagerResponse::QuestionQueued { id }
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
        ManagerRequest::GetLogs { agent, lines } => {
            let n = lines.unwrap_or(50);
            tracing::info!(%agent, %n, "manager: get_logs");
            // Arguments are passed as an argv array, never through a shell.
            match tokio::process::Command::new("journalctl")
                .args([
                    "-M",
                    agent,
                    "-n",
                    &n.to_string(),
                    "--no-pager",
                    "--output=short",
                ])
                .output()
                .await
            {
                Ok(out) => {
                    // Use stdout if journalctl succeeded or printed anything.
                    // Otherwise the exit status and stderr become the log
                    // content; this is still a `Logs` reply, not an `Err`.
                    let content = if out.status.success() || !out.stdout.is_empty() {
                        String::from_utf8_lossy(&out.stdout).into_owned()
                    } else {
                        let stderr = String::from_utf8_lossy(&out.stderr);
                        format!("journalctl exited {}: {stderr}", out.status)
                    };
                    ManagerResponse::Logs { content }
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("journalctl spawn failed: {e:#}"),
                },
            }
        }
        ManagerRequest::RequestApplyCommit {
            agent,
            commit_ref,
            description,
        } => {
            tracing::info!(%agent, %commit_ref, "manager: request_apply_commit");
            match submit_apply_commit(coord, agent, commit_ref, description.as_deref()).await {
                Ok((id, sha)) => {
                    tracing::info!(%id, %agent, manager_ref = %commit_ref, %sha, "approval queued + proposal tag planted");
                    ManagerResponse::Ok
                }
                Err(e) => ManagerResponse::Err {
                    message: format!("{e:#}"),
                },
            }
        }
    }
}
/// Submit-time half of the apply flow: queue the approval row, then
/// fetch the manager's commit from the proposed repo into applied and
/// pin it as `refs/tags/proposal/<id>`. From this point on the manager
/// repo is irrelevant for this approval — even if the manager amends
/// or force-pushes, the canonical sha hive-c0re will eventually
/// approve/deny lives in applied's object DB.
///
/// If anything fails after the row is inserted (sha missing in
/// proposed, fs error, git plumbing crash) we mark the row failed and
/// surface the error to the manager. We don't try to roll the row
/// back — the failure is part of the audit trail.
async fn submit_apply_commit(
coord: &Arc<Coordinator>,
agent: &str,
commit_ref: &str,
description: Option<&str>,
) -> anyhow::Result<(i64, String)> {
let proposed_dir = crate::coordinator::Coordinator::agent_proposed_dir(agent);
let applied_dir = crate::coordinator::Coordinator::agent_applied_dir(agent);
if !proposed_dir.exists() {
anyhow::bail!(
"proposed repo missing for agent '{agent}' (expected at {})",
proposed_dir.display()
);
}
if !applied_dir.join(".git").exists() {
anyhow::bail!(
"applied repo at {} is uninitialised — spawn the agent first",
applied_dir.display()
);
}
let id = coord
.approvals
.submit_kind(
agent,
hive_sh4re::ApprovalKind::ApplyCommit,
commit_ref,
description,
)
.map_err(|e| anyhow::anyhow!("queue approval row: {e:#}"))?;
let tag = format!("proposal/{id}");
let sha = match crate::lifecycle::git_fetch_to_tag(
&applied_dir,
&proposed_dir,
commit_ref,
&tag,
)
.await
{
Ok(s) => s,
Err(e) => {
// Surface the failure on the approval row so the
// dashboard reflects it instead of leaving a phantom
// pending entry. The note doubles as the operator-visible
// explanation of why the approval can't be approved.
let _ = coord.approvals.mark_failed(id, &format!("{e:#}"));
return Err(anyhow::anyhow!("git_fetch_to_tag: {e:#}"));
}
};
coord
.approvals
.set_fetched_sha(id, &sha)
.map_err(|e| anyhow::anyhow!("persist fetched_sha: {e:#}"))?;
Ok((id, sha))
}
/// Answer text recorded on a question that the ttl watchdog resolves.
const TTL_SENTINEL: &str = "[expired]";

/// On `AskOperator { ttl_seconds: Some(n) }`, spawn a task that sleeps
/// `ttl_secs` seconds and then tries to resolve question `id` with
/// [`TTL_SENTINEL`].
///
/// If the question was already answered, `answer()` returns `Err` and the
/// task exits silently. Otherwise the asker is sent the usual
/// `OperatorAnswered` helper event so it sees a terminal state.
pub fn spawn_question_watchdog(coord: &Arc<Coordinator>, id: i64, ttl_secs: u64) {
    let coord = Arc::clone(coord);
    let ttl = std::time::Duration::from_secs(ttl_secs);
    tokio::spawn(async move {
        tokio::time::sleep(ttl).await;
        // An `Err` here is the normal outcome when the operator responded
        // before the ttl fired, so there is nothing to report.
        let Ok((question, asker)) = coord.questions.answer(id, TTL_SENTINEL) else {
            return;
        };
        tracing::info!(%id, %asker, "operator question expired (ttl)");
        let event = hive_sh4re::HelperEvent::OperatorAnswered {
            id,
            question,
            answer: TTL_SENTINEL.to_owned(),
        };
        coord.notify_agent(&asker, &event);
    });
}