feat(cluster): implement distributed clustering with etcd coordination
- Integrate etcd-client for distributed coordination and leader election - Add remote client macros with proper formatting for all services - Implement RequestMetrics for tracking RPC performance and errors - Add rate limiting mechanism across all service endpoints - Create ElectionRequest and ElectionResult message types for leader election - Add role management with primary/replica switching capabilities - Implement health checker with automatic failover detection - Add repository count metrics for cluster monitoring - Update Cargo.toml with etcd-client and dashmap dependencies - Modify RepoEntry to include read_only flag for replica handling - Implement should_accept_election logic to prevent duplicate elections - Add RoleChangedEvent handling for cluster role updates
This commit is contained in:
@@ -0,0 +1,232 @@
|
||||
//! etcd-based peer discovery for ractor_cluster.
|
||||
//!
|
||||
//! Responsibilities:
|
||||
//! - Connect to etcd and create a Lease (TTL-based health check)
|
||||
//! - Register this node under `/gitks/nodes/{storage_name}`
|
||||
//! - Discover existing peers via prefix GET
|
||||
//! - Watch for peer join/leave events and invoke callbacks
|
||||
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use etcd_client::{Client, ConnectOptions, EventType, GetOptions, PutOptions, WatchOptions};
|
||||
|
||||
use super::types::PeerInfo;
|
||||
|
||||
/// Key prefix used for all gitks entries in etcd.
|
||||
const KEY_PREFIX: &str = "/gitks/nodes/";
|
||||
|
||||
/// Wraps an etcd client with lease-based registration and peer discovery.
|
||||
pub struct EtcdRegistry {
|
||||
client: Mutex<Client>,
|
||||
lease_id: i64,
|
||||
storage_name: String,
|
||||
}
|
||||
|
||||
impl EtcdRegistry {
|
||||
/// Connect to etcd, create a lease, and register this node.
|
||||
///
|
||||
/// Returns `None` if the connection fails (caller should fall back to standalone mode).
|
||||
pub async fn register(
|
||||
endpoints: Vec<String>,
|
||||
info: &PeerInfo,
|
||||
ttl_secs: i64,
|
||||
connect_timeout_ms: u64,
|
||||
) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
|
||||
let connect_opts = ConnectOptions::new()
|
||||
.with_connect_timeout(std::time::Duration::from_millis(connect_timeout_ms))
|
||||
.with_keep_alive(
|
||||
std::time::Duration::from_secs(5),
|
||||
std::time::Duration::from_secs(3),
|
||||
)
|
||||
.with_keep_alive_while_idle(true);
|
||||
|
||||
let mut client = Client::connect(endpoints.clone(), Some(connect_opts)).await?;
|
||||
|
||||
tracing::info!(endpoints = ?endpoints, "connected to etcd");
|
||||
|
||||
// Create lease
|
||||
let lease_resp = client.lease_grant(ttl_secs, None).await?;
|
||||
let lease_id = lease_resp.id();
|
||||
tracing::info!(lease_id, ttl = ttl_secs, "etcd lease granted");
|
||||
|
||||
// Register node info under the lease
|
||||
let key = format!("{KEY_PREFIX}{}", info.storage_name);
|
||||
let value = serde_json::to_string(info)?;
|
||||
client
|
||||
.put(key, value, Some(PutOptions::new().with_lease(lease_id)))
|
||||
.await?;
|
||||
tracing::info!(
|
||||
storage_name = %info.storage_name,
|
||||
cluster_addr = %info.cluster_addr,
|
||||
"registered in etcd"
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
client: Mutex::new(client),
|
||||
lease_id,
|
||||
storage_name: info.storage_name.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Start the lease keepalive loop in a background task.
|
||||
///
|
||||
/// The loop sends periodic heartbeats to etcd to prevent the lease from expiring.
|
||||
/// If keepalive fails, it logs a warning but does not panic — the node will
|
||||
/// eventually be removed from etcd when the lease expires.
|
||||
pub fn start_keepalive(self: &Arc<Self>) -> tokio::task::JoinHandle<()> {
|
||||
let this = Arc::clone(self);
|
||||
tokio::spawn(async move {
|
||||
let lease_id = this.lease_id;
|
||||
let (mut keeper, mut stream) = {
|
||||
let mut client = this.client.lock().await;
|
||||
match client.lease_keep_alive(lease_id).await {
|
||||
Ok(pair) => pair,
|
||||
Err(e) => {
|
||||
tracing::error!(lease_id, error = %e, "failed to start lease keepalive");
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(5));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
if let Err(e) = keeper.keep_alive().await {
|
||||
tracing::warn!(lease_id, error = %e, "etcd lease keepalive failed");
|
||||
// Don't break — let the lease expire naturally if we can't recover
|
||||
}
|
||||
// Drain keepalive responses
|
||||
let _ = stream.message().await;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Discover all currently registered peers (excluding this node).
|
||||
pub async fn discover_peers(
|
||||
&self,
|
||||
) -> Result<Vec<PeerInfo>, Box<dyn std::error::Error + Send + Sync>> {
|
||||
let mut client = self.client.lock().await;
|
||||
let resp = client
|
||||
.get(KEY_PREFIX, Some(GetOptions::new().with_prefix()))
|
||||
.await?;
|
||||
|
||||
let mut peers = Vec::new();
|
||||
for kv in resp.kvs() {
|
||||
match serde_json::from_slice::<PeerInfo>(kv.value()) {
|
||||
Ok(info) if info.storage_name != self.storage_name => {
|
||||
peers.push(info);
|
||||
}
|
||||
Ok(_) => {} // skip self
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
key = %String::from_utf8_lossy(kv.key()),
|
||||
error = %e,
|
||||
"failed to parse peer info from etcd"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(peers)
|
||||
}
|
||||
|
||||
/// Start a long-running watch loop that monitors peer join/leave events.
|
||||
///
|
||||
/// Callbacks are invoked synchronously within the watch task; keep them fast
|
||||
/// (prefer sending messages to actors rather than doing blocking work).
|
||||
pub fn start_watch(
|
||||
self: &Arc<Self>,
|
||||
on_peer_joined: impl Fn(PeerInfo) + Send + Sync + 'static,
|
||||
on_peer_left: impl Fn(String) + Send + Sync + 'static,
|
||||
) -> tokio::task::JoinHandle<()> {
|
||||
let this = Arc::clone(self);
|
||||
let my_name = self.storage_name.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let on_joined = Arc::new(on_peer_joined);
|
||||
let on_left = Arc::new(on_peer_left);
|
||||
|
||||
loop {
|
||||
// Create a fresh watch client each iteration (after reconnect)
|
||||
let mut watch_stream = {
|
||||
let mut client = this.client.lock().await;
|
||||
match client
|
||||
.watch(KEY_PREFIX, Some(WatchOptions::new().with_prefix()))
|
||||
.await
|
||||
{
|
||||
Ok(stream) => stream,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "etcd watch failed, retrying in 3s");
|
||||
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
tracing::info!("etcd watch loop started");
|
||||
|
||||
loop {
|
||||
match watch_stream.message().await {
|
||||
Ok(Some(resp)) => {
|
||||
for event in resp.events() {
|
||||
match event.event_type() {
|
||||
EventType::Put => {
|
||||
if let Some(kv) = event.kv()
|
||||
&& let Ok(info) =
|
||||
serde_json::from_slice::<PeerInfo>(kv.value())
|
||||
&& info.storage_name != my_name
|
||||
{
|
||||
tracing::info!(
|
||||
peer = %info.storage_name,
|
||||
cluster_addr = %info.cluster_addr,
|
||||
"peer joined via etcd watch"
|
||||
);
|
||||
on_joined(info);
|
||||
}
|
||||
}
|
||||
EventType::Delete => {
|
||||
if let Some(kv) = event.kv() {
|
||||
let key = String::from_utf8_lossy(kv.key());
|
||||
let name = key
|
||||
.strip_prefix(KEY_PREFIX)
|
||||
.unwrap_or(&key)
|
||||
.to_string();
|
||||
if name != my_name {
|
||||
tracing::warn!(
|
||||
peer = %name,
|
||||
"peer left (etcd lease expired or key deleted)"
|
||||
);
|
||||
on_left(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
tracing::warn!("etcd watch stream ended");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "etcd watch stream error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reconnect after a delay
|
||||
tracing::info!("etcd watch loop restarting in 3s");
|
||||
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if the lease is still alive (for external health monitoring).
|
||||
pub async fn is_lease_alive(&self) -> bool {
|
||||
let mut client = self.client.lock().await;
|
||||
match client.lease_time_to_live(self.lease_id, None).await {
|
||||
Ok(resp) => resp.ttl() > 0,
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
+214
@@ -0,0 +1,214 @@
|
||||
//! Cluster discovery: etcd-driven ractor_cluster node discovery.
|
||||
//!
|
||||
//! Architecture:
|
||||
//! 1. Start a `ractor_cluster::NodeServer` (TCP listener for actor remoting)
|
||||
//! 2. Connect to etcd and register this node
|
||||
//! 3. Discover existing peers → `client_connect()` to each
|
||||
//! 4. Watch etcd for future peer join/leave → connect/disconnect dynamically
|
||||
//!
|
||||
//! Once ractor_cluster TCP connections are established, the existing
|
||||
//! `pg::get_members()` / `ractor::call_t!()` APIs automatically work
|
||||
//! cross-network — no changes needed in actor/handler.rs or server/mod.rs.
|
||||
|
||||
pub mod discovery;
|
||||
pub mod types;
|
||||
|
||||
pub use discovery::EtcdRegistry;
|
||||
pub use types::PeerInfo;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use ractor::ActorRef;
|
||||
use ractor_cluster::node::NodeConnectionMode;
|
||||
use ractor_cluster::{NodeServer, NodeServerMessage, client_connect};
|
||||
|
||||
use crate::error::{GitError, GitResult};
|
||||
|
||||
/// Configuration for the cluster subsystem.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClusterConfig {
|
||||
/// etcd endpoints (e.g. ["http://etcd1:2379", "http://etcd2:2379"])
|
||||
pub etcd_endpoints: Vec<String>,
|
||||
/// Logical name for this storage node
|
||||
pub storage_name: String,
|
||||
/// gRPC address advertised to clients
|
||||
pub grpc_addr: String,
|
||||
/// TCP port for ractor_cluster NodeServer
|
||||
pub cluster_port: u16,
|
||||
/// Shared authentication cookie for ractor_cluster
|
||||
pub cookie: String,
|
||||
/// etcd lease TTL in seconds
|
||||
pub lease_ttl_secs: i64,
|
||||
/// etcd connection timeout in milliseconds
|
||||
pub connect_timeout_ms: u64,
|
||||
/// Hostname used in the ractor_cluster node name (`name@hostname`).
|
||||
/// Also used by remote nodes to connect back via `{cluster_hostname}:{cluster_port}`.
|
||||
/// In K8s/Docker, this should be a resolvable address (Pod IP, service DNS, etc.)
|
||||
pub cluster_hostname: String,
|
||||
}
|
||||
|
||||
/// The running cluster manager. Holds references to the NodeServer and etcd registry.
|
||||
/// Dropping this will stop the background tasks.
|
||||
pub struct ClusterManager {
|
||||
/// The ractor_cluster NodeServer actor
|
||||
pub node_server: ActorRef<NodeServerMessage>,
|
||||
/// The etcd registry (for health checks, etc.)
|
||||
pub registry: Arc<EtcdRegistry>,
|
||||
/// Handles for background tasks (keepalive + watch)
|
||||
_keepalive_handle: tokio::task::JoinHandle<()>,
|
||||
_watch_handle: tokio::task::JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl ClusterManager {
|
||||
/// Start the full cluster subsystem:
|
||||
/// 1. Spawn NodeServer (TCP listener)
|
||||
/// 2. Connect to etcd + register
|
||||
/// 3. Discover peers → client_connect
|
||||
/// 4. Start keepalive + watch loops
|
||||
///
|
||||
/// Returns `Err` if etcd is unreachable (caller should fall back to standalone).
|
||||
pub async fn start(config: ClusterConfig) -> GitResult<Self> {
|
||||
// ── Step 1: Start NodeServer ──
|
||||
let node_server = spawn_node_server(&config).await?;
|
||||
tracing::info!(
|
||||
port = config.cluster_port,
|
||||
hostname = %config.cluster_hostname,
|
||||
"NodeServer started"
|
||||
);
|
||||
|
||||
// ── Step 2: Connect to etcd and register ──
|
||||
let cluster_addr = format!("{}:{}", config.cluster_hostname, config.cluster_port);
|
||||
let peer_info = PeerInfo {
|
||||
storage_name: config.storage_name.clone(),
|
||||
cluster_addr: cluster_addr.clone(),
|
||||
grpc_addr: config.grpc_addr.clone(),
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
};
|
||||
|
||||
let registry = Arc::new(
|
||||
EtcdRegistry::register(
|
||||
config.etcd_endpoints.clone(),
|
||||
&peer_info,
|
||||
config.lease_ttl_secs,
|
||||
config.connect_timeout_ms,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| GitError::Internal(format!("etcd registration failed: {e}")))?,
|
||||
);
|
||||
|
||||
// ── Step 3: Discover existing peers and connect ──
|
||||
let peers = registry
|
||||
.discover_peers()
|
||||
.await
|
||||
.map_err(|e| GitError::Internal(format!("peer discovery failed: {e}")))?;
|
||||
|
||||
for peer in &peers {
|
||||
connect_to_peer(&node_server, peer, &config.storage_name).await;
|
||||
}
|
||||
|
||||
// ── Step 4: Start background tasks ──
|
||||
let keepalive_handle = registry.start_keepalive();
|
||||
|
||||
let ns_for_watch = node_server.clone();
|
||||
let my_name_for_watch = config.storage_name.clone();
|
||||
let watch_handle = registry.start_watch(
|
||||
move |peer| {
|
||||
let ns = ns_for_watch.clone();
|
||||
let my_name = my_name_for_watch.clone();
|
||||
tokio::spawn(async move {
|
||||
connect_to_peer(&ns, &peer, &my_name).await;
|
||||
});
|
||||
},
|
||||
move |name| {
|
||||
tracing::info!(
|
||||
peer = %name,
|
||||
"peer left etcd registry (ractor_cluster will cleanup TCP session)"
|
||||
);
|
||||
// ractor_cluster automatically:
|
||||
// 1. Detects TCP disconnection
|
||||
// 2. Stops the NodeSession actor
|
||||
// 3. Stops all RemoteActors for that session
|
||||
// 4. Removes them from Process Groups
|
||||
// No manual cleanup needed.
|
||||
},
|
||||
);
|
||||
|
||||
tracing::info!(
|
||||
storage_name = %config.storage_name,
|
||||
peers_found = peers.len(),
|
||||
"cluster manager started"
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
node_server,
|
||||
registry,
|
||||
_keepalive_handle: keepalive_handle,
|
||||
_watch_handle: watch_handle,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn the ractor_cluster NodeServer actor (TCP listener for inter-node communication).
|
||||
async fn spawn_node_server(config: &ClusterConfig) -> GitResult<ActorRef<NodeServerMessage>> {
|
||||
let server = NodeServer::new(
|
||||
config.cluster_port,
|
||||
config.cookie.clone(),
|
||||
config.storage_name.clone(),
|
||||
config.cluster_hostname.clone(),
|
||||
None, // no encryption (internal network)
|
||||
Some(NodeConnectionMode::Transitive),
|
||||
);
|
||||
|
||||
let (actor_ref, _handle) = ractor::Actor::spawn(
|
||||
Some(format!("node_server_{}", config.storage_name)),
|
||||
server,
|
||||
(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| GitError::Internal(format!("failed to spawn NodeServer: {e}")))?;
|
||||
|
||||
Ok(actor_ref)
|
||||
}
|
||||
|
||||
/// Establish a ractor_cluster TCP connection to a remote peer.
|
||||
///
|
||||
/// Uses ordering optimization: only the node with the lexicographically
|
||||
/// smaller `storage_name` initiates the connection. The other side will
|
||||
/// accept the incoming connection. This prevents duplicate connections.
|
||||
async fn connect_to_peer(
|
||||
node_server: &ActorRef<NodeServerMessage>,
|
||||
peer: &PeerInfo,
|
||||
my_name: &str,
|
||||
) {
|
||||
// Ordering optimization: only smaller-named node connects
|
||||
if my_name >= peer.storage_name.as_str() {
|
||||
tracing::debug!(
|
||||
peer = %peer.storage_name,
|
||||
"skipping connect (peer has lower/equal name, they connect to us)"
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
peer = %peer.storage_name,
|
||||
cluster_addr = %peer.cluster_addr,
|
||||
"connecting to peer via ractor_cluster"
|
||||
);
|
||||
|
||||
match client_connect(node_server, peer.cluster_addr.as_str()).await {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
peer = %peer.storage_name,
|
||||
"ractor_cluster connection initiated"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
peer = %peer.storage_name,
|
||||
cluster_addr = %peer.cluster_addr,
|
||||
error = %e,
|
||||
"failed to connect to peer (will retry on next watch event)"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Information about a peer node, registered in etcd.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PeerInfo {
|
||||
/// Logical storage name (e.g. "node-a", "default")
|
||||
pub storage_name: String,
|
||||
/// ractor_cluster TCP address (e.g. "10.0.1.4:4697")
|
||||
pub cluster_addr: String,
|
||||
/// gRPC service address (e.g. "http://10.0.1.4:50051")
|
||||
pub grpc_addr: String,
|
||||
/// Software version
|
||||
pub version: String,
|
||||
}
|
||||
Reference in New Issue
Block a user