//! Manager socket listener. Privileged tool surface: agent-style send/recv //! plus lifecycle verbs (Phase 4). Phase 5 will gate Spawn/Kill behind the //! commit-approval flow; for now they hit the same code path the host admin //! socket uses. use std::sync::Arc; use anyhow::{Context, Result}; use hive_sh4re::{MANAGER_AGENT, ManagerRequest, ManagerResponse, Message}; use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader}; use tokio::net::{UnixListener, UnixStream}; use crate::coordinator::Coordinator; use crate::lifecycle; pub fn start(coord: Arc) -> Result<()> { let dir = Coordinator::manager_dir(); std::fs::create_dir_all(&dir) .with_context(|| format!("create manager dir {}", dir.display()))?; let socket = Coordinator::manager_socket_path(); if socket.exists() { std::fs::remove_file(&socket).context("remove stale manager socket")?; } let listener = UnixListener::bind(&socket) .with_context(|| format!("bind manager socket {}", socket.display()))?; tracing::info!(socket = %socket.display(), "manager socket listening"); tokio::spawn(async move { loop { match listener.accept().await { Ok((stream, _)) => { let coord = coord.clone(); tokio::spawn(async move { if let Err(e) = serve(stream, coord).await { tracing::warn!(error = ?e, "manager connection failed"); } }); } Err(e) => { tracing::warn!(error = ?e, "manager listener accept failed"); return; } } } }); Ok(()) } async fn serve(stream: UnixStream, coord: Arc) -> Result<()> { let (read, mut write) = stream.into_split(); let mut reader = BufReader::new(read); let mut line = String::new(); loop { line.clear(); let n = reader.read_line(&mut line).await?; if n == 0 { return Ok(()); } let resp = match serde_json::from_str::(line.trim()) { Ok(req) => dispatch(&req, &coord).await, Err(e) => ManagerResponse::Err { message: format!("parse error: {e}"), }, }; let mut payload = serde_json::to_string(&resp)?; payload.push('\n'); write.write_all(payload.as_bytes()).await?; write.flush().await?; } } /// Max long-poll window for manager `Recv`. Same semantics as the /// sub-agent socket: omitted `wait_seconds` (or `0`) = peek and /// return immediately, positive value = park up to that many /// seconds (clamped at MAX). const MANAGER_RECV_LONG_POLL_MAX: std::time::Duration = std::time::Duration::from_secs(180); fn manager_recv_timeout(wait_seconds: Option) -> std::time::Duration { match wait_seconds { Some(s) => std::time::Duration::from_secs(s).min(MANAGER_RECV_LONG_POLL_MAX), None => std::time::Duration::ZERO, } } #[allow(clippy::too_many_lines)] async fn dispatch(req: &ManagerRequest, coord: &Arc) -> ManagerResponse { match req { ManagerRequest::Send { to, body } => { if to == "*" { let errors = coord.broadcast_send(MANAGER_AGENT, body); if errors.is_empty() { ManagerResponse::Ok } else { ManagerResponse::Err { message: format!("broadcast failed for agents: {}", errors.join(", ")), } } } else { match coord.broker.send(&Message { from: MANAGER_AGENT.to_owned(), to: to.clone(), body: body.clone(), }) { Ok(()) => ManagerResponse::Ok, Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } } ManagerRequest::OperatorMsg { body } => match coord.broker.send(&Message { from: hive_sh4re::OPERATOR_RECIPIENT.to_owned(), to: MANAGER_AGENT.to_owned(), body: body.clone(), }) { Ok(()) => ManagerResponse::Ok, Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, }, ManagerRequest::Status => match coord.broker.count_pending(MANAGER_AGENT) { Ok(unread) => ManagerResponse::Status { unread }, Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, }, ManagerRequest::Recent { limit } => match coord.broker.recent_for(MANAGER_AGENT, *limit) { Ok(rows) => ManagerResponse::Recent { rows }, Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, }, ManagerRequest::Recv { wait_seconds } => match coord .broker .recv_blocking(MANAGER_AGENT, manager_recv_timeout(*wait_seconds)) .await { Ok(Some(msg)) => ManagerResponse::Message { from: msg.from, body: msg.body, }, Ok(None) => ManagerResponse::Empty, Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, }, ManagerRequest::RequestSpawn { name, description } => { tracing::info!(%name, "manager: request_spawn"); match coord.approvals.submit_kind( name, hive_sh4re::ApprovalKind::Spawn, "", description.as_deref(), ) { Ok(id) => { tracing::info!(%id, %name, "spawn approval queued"); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::Kill { name } => { tracing::info!(%name, "manager: kill"); if name == crate::lifecycle::MANAGER_NAME { return ManagerResponse::Err { message: "refusing to kill the manager".into(), }; } let result: Result<()> = async { lifecycle::kill(name).await?; coord.unregister_agent(name); Ok(()) } .await; match result { Ok(()) => { coord.notify_manager(&hive_sh4re::HelperEvent::Killed { agent: name.clone(), }); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::Start { name } => { tracing::info!(%name, "manager: start"); if name == crate::lifecycle::MANAGER_NAME { return ManagerResponse::Err { message: "refusing to start the manager from itself".into(), }; } match lifecycle::start(name).await { Ok(()) => { coord.kick_agent(name, "container started"); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::Restart { name } => { tracing::info!(%name, "manager: restart"); if name == crate::lifecycle::MANAGER_NAME { return ManagerResponse::Err { message: "refusing to restart the manager from itself".into(), }; } match lifecycle::restart(name).await { Ok(()) => { coord.kick_agent(name, "container restarted"); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::Update { name } => { tracing::info!(%name, "manager: update"); let Some(current_rev) = crate::auto_update::current_flake_rev(&coord.hyperhive_flake) else { return ManagerResponse::Err { message: "update: hyperhive_flake has no canonical path".into(), }; }; let _guard = coord.transient_guard(name, crate::coordinator::TransientKind::Rebuilding); let result = crate::auto_update::rebuild_agent(coord, name, ¤t_rev).await; drop(_guard); match result { Ok(()) => { coord.kick_agent(name, "container rebuilt"); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::AskOperator { question, options, multi, ttl_seconds, } => { tracing::info!(%question, ?options, multi, ?ttl_seconds, "manager: ask_operator"); let deadline_at = ttl_seconds.and_then(|s| { let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .ok() .and_then(|d| i64::try_from(d.as_secs()).ok()) .unwrap_or(0); i64::try_from(s).ok().map(|s| now + s) }); match coord .questions .submit(MANAGER_AGENT, question, options, *multi, deadline_at) { Ok(id) => { tracing::info!(%id, ?deadline_at, "operator question queued"); if let Some(ttl) = *ttl_seconds { spawn_question_watchdog(coord, id, ttl); } ManagerResponse::QuestionQueued { id } } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } ManagerRequest::GetLogs { agent, lines } => { let n = lines.unwrap_or(50); tracing::info!(%agent, %n, "manager: get_logs"); match tokio::process::Command::new("journalctl") .args([ "-M", agent, "-n", &n.to_string(), "--no-pager", "--output=short", ]) .output() .await { Ok(out) => { let content = if out.status.success() || !out.stdout.is_empty() { String::from_utf8_lossy(&out.stdout).into_owned() } else { let stderr = String::from_utf8_lossy(&out.stderr); format!("journalctl exited {}: {stderr}", out.status) }; ManagerResponse::Logs { content } } Err(e) => ManagerResponse::Err { message: format!("journalctl spawn failed: {e:#}"), }, } } ManagerRequest::RequestApplyCommit { agent, commit_ref, description, } => { tracing::info!(%agent, %commit_ref, "manager: request_apply_commit"); match submit_apply_commit(coord, agent, commit_ref, description.as_deref()).await { Ok((id, sha)) => { tracing::info!(%id, %agent, manager_ref = %commit_ref, %sha, "approval queued + proposal tag planted"); ManagerResponse::Ok } Err(e) => ManagerResponse::Err { message: format!("{e:#}"), }, } } } } /// Submit-time half of the apply flow: queue the approval row, then /// fetch the manager's commit from the proposed repo into applied and /// pin it as `refs/tags/proposal/`. From this point on the manager /// repo is irrelevant for this approval — even if the manager amends /// or force-pushes, the canonical sha hive-c0re will eventually /// approve/deny lives in applied's object DB. /// /// If anything fails after the row is inserted (sha missing in /// proposed, fs error, git plumbing crash) we mark the row failed and /// surface the error to the manager. We don't try to roll the row /// back — the failure is part of the audit trail. async fn submit_apply_commit( coord: &Arc, agent: &str, commit_ref: &str, description: Option<&str>, ) -> anyhow::Result<(i64, String)> { let proposed_dir = crate::coordinator::Coordinator::agent_proposed_dir(agent); let applied_dir = crate::coordinator::Coordinator::agent_applied_dir(agent); if !proposed_dir.exists() { anyhow::bail!( "proposed repo missing for agent '{agent}' (expected at {})", proposed_dir.display() ); } if !applied_dir.join(".git").exists() { anyhow::bail!( "applied repo at {} is uninitialised — spawn the agent first", applied_dir.display() ); } let id = coord .approvals .submit_kind( agent, hive_sh4re::ApprovalKind::ApplyCommit, commit_ref, description, ) .map_err(|e| anyhow::anyhow!("queue approval row: {e:#}"))?; let tag = format!("proposal/{id}"); let sha = match crate::lifecycle::git_fetch_to_tag( &applied_dir, &proposed_dir, commit_ref, &tag, ) .await { Ok(s) => s, Err(e) => { // Surface the failure on the approval row so the // dashboard reflects it instead of leaving a phantom // pending entry. The note doubles as the operator-visible // explanation of why the approval can't be approved. let _ = coord.approvals.mark_failed(id, &format!("{e:#}")); return Err(anyhow::anyhow!("git_fetch_to_tag: {e:#}")); } }; coord .approvals .set_fetched_sha(id, &sha) .map_err(|e| anyhow::anyhow!("persist fetched_sha: {e:#}"))?; Ok((id, sha)) } /// On `AskOperator { ttl_seconds: Some(n) }`, sleep n seconds and then /// try to resolve the question with `[expired]`. If the operator (or /// any other path) already answered it, `answer()` returns Err and /// we no-op silently. Otherwise fire the usual `OperatorAnswered` /// helper event so the manager sees a terminal state. const TTL_SENTINEL: &str = "[expired]"; pub fn spawn_question_watchdog(coord: &Arc, id: i64, ttl_secs: u64) { let coord = coord.clone(); tokio::spawn(async move { tokio::time::sleep(std::time::Duration::from_secs(ttl_secs)).await; // `answer` returns Err if already resolved — that's the // normal path when the operator responded before the ttl // fired, so no-op silently. if let Ok((question, asker)) = coord.questions.answer(id, TTL_SENTINEL) { tracing::info!(%id, %asker, "operator question expired (ttl)"); coord.notify_agent( &asker, &hive_sh4re::HelperEvent::OperatorAnswered { id, question, answer: TTL_SENTINEL.to_owned(), }, ); } }); }