agent_web_port: back to pure hash, drop port-file dance
Operator's call: the probing-forward + state-file machinery is more brittle than the bug it tried to fix. Revert to the original deterministic scheme: FNV-1a hash of the agent name, mod 900. Collisions are real but rare; the operator resolves one by renaming the agent (different name → different hash) and rebuilding. There is no per-agent port file, no scan, no migration path, and nothing to drift out of sync with the running container. Existing port files on disk are silently ignored — the operator rebuilds affected agents to regenerate their flakes from the deterministic hash.
This commit is contained in:
parent
c35f566d15
commit
acaa0eb895
1 changed files with 7 additions and 99 deletions
|
|
@ -45,62 +45,18 @@ const WEB_PORT_RANGE: u16 = 900;
|
||||||
const DEFAULT_MEMORY_MAX: &str = "2G";
|
const DEFAULT_MEMORY_MAX: &str = "2G";
|
||||||
const DEFAULT_CPU_QUOTA: &str = "50%";
|
const DEFAULT_CPU_QUOTA: &str = "50%";
|
||||||
|
|
||||||
/// Returns the per-agent web UI port. Manager is fixed at `MANAGER_PORT`.
|
/// Per-agent web UI port. Manager is fixed at `MANAGER_PORT`; every
|
||||||
///
|
/// sub-agent is `WEB_PORT_BASE + FNV-1a(name) % WEB_PORT_RANGE`,
|
||||||
/// The port is **sticky** once chosen: looked up from
|
/// pure and reproducible from just the name. Collisions are
|
||||||
/// `state_root/port` if present. On first call we have to decide
|
/// possible (birthday paradox at ~30 agents); the operator resolves
|
||||||
/// what to write there, and the answer depends on whether the agent
|
/// them by renaming an agent (different hash → different port).
|
||||||
/// is legacy (has an applied flake — container already exists at
|
/// Stable across hosts, restarts, and dashboard renders — no
|
||||||
/// some port) or fresh (no applied dir yet — we're about to pick).
|
/// state-file dance.
|
||||||
///
|
|
||||||
/// In both cases we probe forward from `port_hash(name)` to skip
|
|
||||||
/// ports already claimed by another agent's *port file*. The
|
|
||||||
/// difference is what we count as "claimed":
|
|
||||||
///
|
|
||||||
/// - **Legacy migration**: only port files count. We do NOT treat
|
|
||||||
/// other legacy agents' implicit hashes as taken — if two legacy
|
|
||||||
/// agents collide on hash, the first queried claims the hash port
|
|
||||||
/// and the others probe forward. (We don't know which originally
|
|
||||||
/// won the bind race; first-write-wins is good enough — the
|
|
||||||
/// loser was already crash-looping anyway.)
|
|
||||||
/// - **Fresh spawn**: port files AND implicit hashes for any legacy
|
|
||||||
/// agent that hasn't been migrated yet. Without that, a new
|
|
||||||
/// agent could pick the same port as a yet-to-be-migrated legacy.
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn agent_web_port(name: &str) -> u16 {
|
pub fn agent_web_port(name: &str) -> u16 {
|
||||||
if name == MANAGER_NAME {
|
if name == MANAGER_NAME {
|
||||||
return MANAGER_PORT;
|
return MANAGER_PORT;
|
||||||
}
|
}
|
||||||
let state_root = crate::coordinator::Coordinator::agent_state_root(name);
|
|
||||||
let port_file = state_root.join("port");
|
|
||||||
if let Ok(s) = std::fs::read_to_string(&port_file)
|
|
||||||
&& let Ok(port) = s.trim().parse::<u16>()
|
|
||||||
&& (WEB_PORT_BASE..WEB_PORT_BASE + WEB_PORT_RANGE).contains(&port)
|
|
||||||
{
|
|
||||||
return port;
|
|
||||||
}
|
|
||||||
let is_legacy = crate::coordinator::Coordinator::agent_applied_dir(name).exists();
|
|
||||||
let taken = scan_taken_ports(name, /* include_implicit_hashes = */ !is_legacy);
|
|
||||||
let start = port_hash(name);
|
|
||||||
let mut chosen = start;
|
|
||||||
for _ in 0..WEB_PORT_RANGE {
|
|
||||||
if !taken.contains(&chosen) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
chosen = next_port(chosen);
|
|
||||||
if chosen == start {
|
|
||||||
tracing::warn!(%name, "agent_web_port: range exhausted, returning hash");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let _ = std::fs::create_dir_all(&state_root);
|
|
||||||
if let Err(e) = std::fs::write(&port_file, format!("{chosen}\n")) {
|
|
||||||
tracing::warn!(error = ?e, file = %port_file.display(), "persisting agent port failed");
|
|
||||||
}
|
|
||||||
chosen
|
|
||||||
}
|
|
||||||
|
|
||||||
fn port_hash(name: &str) -> u16 {
|
|
||||||
let mut hash: u32 = 2_166_136_261;
|
let mut hash: u32 = 2_166_136_261;
|
||||||
for b in name.bytes() {
|
for b in name.bytes() {
|
||||||
hash ^= u32::from(b);
|
hash ^= u32::from(b);
|
||||||
|
|
@ -110,54 +66,6 @@ fn port_hash(name: &str) -> u16 {
|
||||||
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
|
WEB_PORT_BASE + u16::try_from(hash % u32::from(WEB_PORT_RANGE)).unwrap_or(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_port(port: u16) -> u16 {
|
|
||||||
let p = port + 1;
|
|
||||||
if p >= WEB_PORT_BASE + WEB_PORT_RANGE {
|
|
||||||
WEB_PORT_BASE
|
|
||||||
} else {
|
|
||||||
p
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set of ports claimed by other agents.
|
|
||||||
///
|
|
||||||
/// - Always includes ports persisted to other agents' port files.
|
|
||||||
/// - `include_implicit_hashes`: if true (fresh-spawn case), also
|
|
||||||
/// include `port_hash(other_name)` for every other agent that has
|
|
||||||
/// NOT yet been migrated to a port file. This protects new spawns
|
|
||||||
/// from racing with not-yet-migrated legacies.
|
|
||||||
///
|
|
||||||
/// When false (legacy migration), we do NOT count implicit hashes
|
|
||||||
/// — otherwise two legacies that hash to the same port would each
|
|
||||||
/// see each other's hash as taken and both probe forward away
|
|
||||||
/// from the port one of them is actually running on, leaving a
|
|
||||||
/// dashboard URL that doesn't match the running container.
|
|
||||||
fn scan_taken_ports(name: &str, include_implicit_hashes: bool) -> std::collections::HashSet<u16> {
|
|
||||||
let mut out = std::collections::HashSet::new();
|
|
||||||
let Ok(rd) = std::fs::read_dir("/var/lib/hyperhive/agents") else {
|
|
||||||
return out;
|
|
||||||
};
|
|
||||||
for entry in rd.flatten() {
|
|
||||||
let Ok(file_name) = entry.file_name().into_string() else {
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
if file_name == name || file_name == MANAGER_NAME {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let pf = entry.path().join("port");
|
|
||||||
if let Ok(s) = std::fs::read_to_string(&pf)
|
|
||||||
&& let Ok(port) = s.trim().parse::<u16>()
|
|
||||||
{
|
|
||||||
out.insert(port);
|
|
||||||
} else if include_implicit_hashes {
|
|
||||||
// Legacy not yet migrated. From the fresh-spawn POV its
|
|
||||||
// effective port is the bare hash.
|
|
||||||
out.insert(port_hash(&file_name));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn container_name(name: &str) -> String {
|
pub fn container_name(name: &str) -> String {
|
||||||
if name == MANAGER_NAME {
|
if name == MANAGER_NAME {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue