submit_apply_commit (1) queues the approval row, (2) git-fetches the manager-supplied sha from proposed into applied and pins it as refs/tags/proposal/<id>, and (3) persists the resolved sha on the row via approvals.set_fetched_sha. From this point on the proposal is immutable from the manager's perspective: amends or force-pushes in proposed do not change what hive-c0re will build. Fetch failures mark the row failed and surface the error to the manager, so a phantom pending entry can't linger.
357 lines
14 KiB
Rust
357 lines
14 KiB
Rust
//! Manager socket listener. Privileged tool surface: agent-style send/recv
//! plus lifecycle verbs (Phase 4). Phase 5 will gate Spawn/Kill behind the
//! commit-approval flow; for now they hit the same code path the host admin
//! socket uses.
|
|
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Context, Result};
|
|
use hive_sh4re::{MANAGER_AGENT, ManagerRequest, ManagerResponse, Message};
|
|
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
|
use tokio::net::{UnixListener, UnixStream};
|
|
|
|
use crate::coordinator::Coordinator;
|
|
use crate::lifecycle;
|
|
|
|
pub fn start(coord: Arc<Coordinator>) -> Result<()> {
|
|
let dir = Coordinator::manager_dir();
|
|
std::fs::create_dir_all(&dir)
|
|
.with_context(|| format!("create manager dir {}", dir.display()))?;
|
|
let socket = Coordinator::manager_socket_path();
|
|
if socket.exists() {
|
|
std::fs::remove_file(&socket).context("remove stale manager socket")?;
|
|
}
|
|
let listener = UnixListener::bind(&socket)
|
|
.with_context(|| format!("bind manager socket {}", socket.display()))?;
|
|
tracing::info!(socket = %socket.display(), "manager socket listening");
|
|
|
|
tokio::spawn(async move {
|
|
loop {
|
|
match listener.accept().await {
|
|
Ok((stream, _)) => {
|
|
let coord = coord.clone();
|
|
tokio::spawn(async move {
|
|
if let Err(e) = serve(stream, coord).await {
|
|
tracing::warn!(error = ?e, "manager connection failed");
|
|
}
|
|
});
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(error = ?e, "manager listener accept failed");
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
});
|
|
Ok(())
|
|
}
|
|
|
|
async fn serve(stream: UnixStream, coord: Arc<Coordinator>) -> Result<()> {
|
|
let (read, mut write) = stream.into_split();
|
|
let mut reader = BufReader::new(read);
|
|
let mut line = String::new();
|
|
loop {
|
|
line.clear();
|
|
let n = reader.read_line(&mut line).await?;
|
|
if n == 0 {
|
|
return Ok(());
|
|
}
|
|
let resp = match serde_json::from_str::<ManagerRequest>(line.trim()) {
|
|
Ok(req) => dispatch(&req, &coord).await,
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("parse error: {e}"),
|
|
},
|
|
};
|
|
let mut payload = serde_json::to_string(&resp)?;
|
|
payload.push('\n');
|
|
write.write_all(payload.as_bytes()).await?;
|
|
write.flush().await?;
|
|
}
|
|
}
|
|
|
|
/// Long-poll window used for manager `Recv` when the caller does not pass
/// `wait_seconds`.
const MANAGER_RECV_LONG_POLL_DEFAULT: std::time::Duration = std::time::Duration::from_secs(30);

/// Upper bound on the long-poll window for manager `Recv`; larger
/// `wait_seconds` values are clamped down to this.
const MANAGER_RECV_LONG_POLL_MAX: std::time::Duration = std::time::Duration::from_secs(180);

/// Resolve the long-poll timeout for a manager `Recv` request.
///
/// `None` yields [`MANAGER_RECV_LONG_POLL_DEFAULT`]. `Some(s)` yields `s`
/// seconds, clamped to at most [`MANAGER_RECV_LONG_POLL_MAX`]; `Some(0)`
/// yields a zero duration.
fn manager_recv_timeout(wait_seconds: Option<u64>) -> std::time::Duration {
    wait_seconds.map_or(MANAGER_RECV_LONG_POLL_DEFAULT, |secs| {
        std::time::Duration::from_secs(secs).min(MANAGER_RECV_LONG_POLL_MAX)
    })
}
|
|
|
|
#[allow(clippy::too_many_lines)]
|
|
async fn dispatch(req: &ManagerRequest, coord: &Arc<Coordinator>) -> ManagerResponse {
|
|
match req {
|
|
ManagerRequest::Send { to, body } => match coord.broker.send(&Message {
|
|
from: MANAGER_AGENT.to_owned(),
|
|
to: to.clone(),
|
|
body: body.clone(),
|
|
}) {
|
|
Ok(()) => ManagerResponse::Ok,
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
},
|
|
ManagerRequest::OperatorMsg { body } => match coord.broker.send(&Message {
|
|
from: hive_sh4re::OPERATOR_RECIPIENT.to_owned(),
|
|
to: MANAGER_AGENT.to_owned(),
|
|
body: body.clone(),
|
|
}) {
|
|
Ok(()) => ManagerResponse::Ok,
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
},
|
|
ManagerRequest::Status => match coord.broker.count_pending(MANAGER_AGENT) {
|
|
Ok(unread) => ManagerResponse::Status { unread },
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
},
|
|
ManagerRequest::Recent { limit } => match coord.broker.recent_for(MANAGER_AGENT, *limit) {
|
|
Ok(rows) => ManagerResponse::Recent { rows },
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
},
|
|
ManagerRequest::Recv { wait_seconds } => match coord
|
|
.broker
|
|
.recv_blocking(MANAGER_AGENT, manager_recv_timeout(*wait_seconds))
|
|
.await
|
|
{
|
|
Ok(Some(msg)) => ManagerResponse::Message {
|
|
from: msg.from,
|
|
body: msg.body,
|
|
},
|
|
Ok(None) => ManagerResponse::Empty,
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
},
|
|
ManagerRequest::RequestSpawn { name } => {
|
|
tracing::info!(%name, "manager: request_spawn");
|
|
match coord
|
|
.approvals
|
|
.submit_kind(name, hive_sh4re::ApprovalKind::Spawn, "")
|
|
{
|
|
Ok(id) => {
|
|
tracing::info!(%id, %name, "spawn approval queued");
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::Kill { name } => {
|
|
tracing::info!(%name, "manager: kill");
|
|
if name == crate::lifecycle::MANAGER_NAME {
|
|
return ManagerResponse::Err {
|
|
message: "refusing to kill the manager".into(),
|
|
};
|
|
}
|
|
let result: Result<()> = async {
|
|
lifecycle::kill(name).await?;
|
|
coord.unregister_agent(name);
|
|
Ok(())
|
|
}
|
|
.await;
|
|
match result {
|
|
Ok(()) => {
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::Killed {
|
|
agent: name.clone(),
|
|
});
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::Start { name } => {
|
|
tracing::info!(%name, "manager: start");
|
|
if name == crate::lifecycle::MANAGER_NAME {
|
|
return ManagerResponse::Err {
|
|
message: "refusing to start the manager from itself".into(),
|
|
};
|
|
}
|
|
match lifecycle::start(name).await {
|
|
Ok(()) => {
|
|
coord.kick_agent(name, "container started");
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::Restart { name } => {
|
|
tracing::info!(%name, "manager: restart");
|
|
if name == crate::lifecycle::MANAGER_NAME {
|
|
return ManagerResponse::Err {
|
|
message: "refusing to restart the manager from itself".into(),
|
|
};
|
|
}
|
|
match lifecycle::restart(name).await {
|
|
Ok(()) => {
|
|
coord.kick_agent(name, "container restarted");
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::Update { name } => {
|
|
tracing::info!(%name, "manager: update");
|
|
let Some(current_rev) = crate::auto_update::current_flake_rev(&coord.hyperhive_flake)
|
|
else {
|
|
return ManagerResponse::Err {
|
|
message: "update: hyperhive_flake has no canonical path".into(),
|
|
};
|
|
};
|
|
coord.set_transient(name, crate::coordinator::TransientKind::Rebuilding);
|
|
let result = crate::auto_update::rebuild_agent(coord, name, ¤t_rev).await;
|
|
coord.clear_transient(name);
|
|
match result {
|
|
Ok(()) => {
|
|
coord.kick_agent(name, "container rebuilt");
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::AskOperator {
|
|
question,
|
|
options,
|
|
multi,
|
|
ttl_seconds,
|
|
} => {
|
|
tracing::info!(%question, ?options, multi, ?ttl_seconds, "manager: ask_operator");
|
|
let deadline_at = ttl_seconds.and_then(|s| {
|
|
let now = std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.ok()
|
|
.and_then(|d| i64::try_from(d.as_secs()).ok())
|
|
.unwrap_or(0);
|
|
i64::try_from(s).ok().map(|s| now + s)
|
|
});
|
|
match coord
|
|
.questions
|
|
.submit(MANAGER_AGENT, question, options, *multi, deadline_at)
|
|
{
|
|
Ok(id) => {
|
|
tracing::info!(%id, ?deadline_at, "operator question queued");
|
|
if let Some(ttl) = *ttl_seconds {
|
|
spawn_question_watchdog(coord, id, ttl);
|
|
}
|
|
ManagerResponse::QuestionQueued { id }
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
ManagerRequest::RequestApplyCommit { agent, commit_ref } => {
|
|
tracing::info!(%agent, %commit_ref, "manager: request_apply_commit");
|
|
match submit_apply_commit(coord, agent, commit_ref).await {
|
|
Ok((id, sha)) => {
|
|
tracing::info!(%id, %agent, manager_ref = %commit_ref, %sha, "approval queued + proposal tag planted");
|
|
ManagerResponse::Ok
|
|
}
|
|
Err(e) => ManagerResponse::Err {
|
|
message: format!("{e:#}"),
|
|
},
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Submit-time half of the apply flow: queue the approval row, then
|
|
/// fetch the manager's commit from the proposed repo into applied and
|
|
/// pin it as `refs/tags/proposal/<id>`. From this point on the manager
|
|
/// repo is irrelevant for this approval — even if the manager amends
|
|
/// or force-pushes, the canonical sha hive-c0re will eventually
|
|
/// approve/deny lives in applied's object DB.
|
|
///
|
|
/// If anything fails after the row is inserted (sha missing in
|
|
/// proposed, fs error, git plumbing crash) we mark the row failed and
|
|
/// surface the error to the manager. We don't try to roll the row
|
|
/// back — the failure is part of the audit trail.
|
|
async fn submit_apply_commit(
|
|
coord: &Arc<Coordinator>,
|
|
agent: &str,
|
|
commit_ref: &str,
|
|
) -> anyhow::Result<(i64, String)> {
|
|
let proposed_dir = crate::coordinator::Coordinator::agent_proposed_dir(agent);
|
|
let applied_dir = crate::coordinator::Coordinator::agent_applied_dir(agent);
|
|
if !proposed_dir.exists() {
|
|
anyhow::bail!(
|
|
"proposed repo missing for agent '{agent}' (expected at {})",
|
|
proposed_dir.display()
|
|
);
|
|
}
|
|
if !applied_dir.join(".git").exists() {
|
|
anyhow::bail!(
|
|
"applied repo at {} is uninitialised — spawn the agent first",
|
|
applied_dir.display()
|
|
);
|
|
}
|
|
let id = coord
|
|
.approvals
|
|
.submit(agent, commit_ref)
|
|
.map_err(|e| anyhow::anyhow!("queue approval row: {e:#}"))?;
|
|
let tag = format!("proposal/{id}");
|
|
let sha = match crate::lifecycle::git_fetch_to_tag(
|
|
&applied_dir,
|
|
&proposed_dir,
|
|
commit_ref,
|
|
&tag,
|
|
)
|
|
.await
|
|
{
|
|
Ok(s) => s,
|
|
Err(e) => {
|
|
// Surface the failure on the approval row so the
|
|
// dashboard reflects it instead of leaving a phantom
|
|
// pending entry. The note doubles as the operator-visible
|
|
// explanation of why the approval can't be approved.
|
|
let _ = coord.approvals.mark_failed(id, &format!("{e:#}"));
|
|
return Err(anyhow::anyhow!("git_fetch_to_tag: {e:#}"));
|
|
}
|
|
};
|
|
coord
|
|
.approvals
|
|
.set_fetched_sha(id, &sha)
|
|
.map_err(|e| anyhow::anyhow!("persist fetched_sha: {e:#}"))?;
|
|
Ok((id, sha))
|
|
}
|
|
|
|
/// On `AskOperator { ttl_seconds: Some(n) }`, sleep n seconds and then
|
|
/// try to resolve the question with `[expired]`. If the operator (or
|
|
/// any other path) already answered it, `answer()` returns Err and
|
|
/// we no-op silently. Otherwise fire the usual `OperatorAnswered`
|
|
/// helper event so the manager sees a terminal state.
|
|
const TTL_SENTINEL: &str = "[expired]";
|
|
|
|
fn spawn_question_watchdog(coord: &Arc<Coordinator>, id: i64, ttl_secs: u64) {
|
|
let coord = coord.clone();
|
|
tokio::spawn(async move {
|
|
tokio::time::sleep(std::time::Duration::from_secs(ttl_secs)).await;
|
|
// `answer` returns Err if already resolved — that's the
|
|
// normal path when the operator responded before the ttl
|
|
// fired, so no-op silently.
|
|
if let Ok(question) = coord.questions.answer(id, TTL_SENTINEL) {
|
|
tracing::info!(%id, "operator question expired (ttl)");
|
|
coord.notify_manager(&hive_sh4re::HelperEvent::OperatorAnswered {
|
|
id,
|
|
question,
|
|
answer: TTL_SENTINEL.to_owned(),
|
|
});
|
|
}
|
|
});
|
|
}
|