feat(cluster): implement distributed clustering with etcd coordination

- Integrate etcd-client for distributed coordination and leader election - Add remote client macros with proper formatting for all services - Implement RequestMetrics for tracking RPC performance and errors - Add rate limiting mechanism across all service endpoints - Create ElectionRequest and ElectionResult message types for leader election - Add role management with primary/replica switching capabilities - Implement health checker with automatic failover detection - Add repository count metrics for cluster monitoring - Update Cargo.toml with etcd-client and dashmap dependencies - Modify RepoEntry to include read_only flag for replica handling - Implement should_accept_election logic to prevent duplicate elections - Add RoleChangedEvent handling for cluster role updates
2026-06-08 14:31:29 +08:00
parent d243dce027
commit 8f472a0443
37 changed files with 4691 additions and 83 deletions
@@ -0,0 +1,232 @@
+//! etcd-based peer discovery for ractor_cluster.
+//!
+//! Responsibilities:
+//! - Connect to etcd and create a Lease (TTL-based health check)
+//! - Register this node under `/gitks/nodes/{storage_name}`
+//! - Discover existing peers via prefix GET
+//! - Watch for peer join/leave events and invoke callbacks
+
+use std::sync::Arc;
+use tokio::sync::Mutex;
+
+use etcd_client::{Client, ConnectOptions, EventType, GetOptions, PutOptions, WatchOptions};
+
+use super::types::PeerInfo;
+
+/// Key prefix used for all gitks entries in etcd.
+const KEY_PREFIX: &str = "/gitks/nodes/";
+
+/// Wraps an etcd client with lease-based registration and peer discovery.
+pub struct EtcdRegistry {
+    client: Mutex<Client>,
+    lease_id: i64,
+    storage_name: String,
+}
+
+impl EtcdRegistry {
+    /// Connect to etcd, create a lease, and register this node.
+    ///
+    /// Returns `None` if the connection fails (caller should fall back to standalone mode).
+    pub async fn register(
+        endpoints: Vec<String>,
+        info: &PeerInfo,
+        ttl_secs: i64,
+        connect_timeout_ms: u64,
+    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+        let connect_opts = ConnectOptions::new()
+            .with_connect_timeout(std::time::Duration::from_millis(connect_timeout_ms))
+            .with_keep_alive(
+                std::time::Duration::from_secs(5),
+                std::time::Duration::from_secs(3),
+            )
+            .with_keep_alive_while_idle(true);
+
+        let mut client = Client::connect(endpoints.clone(), Some(connect_opts)).await?;
+
+        tracing::info!(endpoints = ?endpoints, "connected to etcd");
+
+        // Create lease
+        let lease_resp = client.lease_grant(ttl_secs, None).await?;
+        let lease_id = lease_resp.id();
+        tracing::info!(lease_id, ttl = ttl_secs, "etcd lease granted");
+
+        // Register node info under the lease
+        let key = format!("{KEY_PREFIX}{}", info.storage_name);
+        let value = serde_json::to_string(info)?;
+        client
+            .put(key, value, Some(PutOptions::new().with_lease(lease_id)))
+            .await?;
+        tracing::info!(
+            storage_name = %info.storage_name,
+            cluster_addr = %info.cluster_addr,
+            "registered in etcd"
+        );
+
+        Ok(Self {
+            client: Mutex::new(client),
+            lease_id,
+            storage_name: info.storage_name.clone(),
+        })
+    }
+
+    /// Start the lease keepalive loop in a background task.
+    ///
+    /// The loop sends periodic heartbeats to etcd to prevent the lease from expiring.
+    /// If keepalive fails, it logs a warning but does not panic — the node will
+    /// eventually be removed from etcd when the lease expires.
+    pub fn start_keepalive(self: &Arc<Self>) -> tokio::task::JoinHandle<()> {
+        let this = Arc::clone(self);
+        tokio::spawn(async move {
+            let lease_id = this.lease_id;
+            let (mut keeper, mut stream) = {
+                let mut client = this.client.lock().await;
+                match client.lease_keep_alive(lease_id).await {
+                    Ok(pair) => pair,
+                    Err(e) => {
+                        tracing::error!(lease_id, error = %e, "failed to start lease keepalive");
+                        return;
+                    }
+                }
+            };
+
+            let mut interval = tokio::time::interval(std::time::Duration::from_secs(5));
+            loop {
+                interval.tick().await;
+                if let Err(e) = keeper.keep_alive().await {
+                    tracing::warn!(lease_id, error = %e, "etcd lease keepalive failed");
+                    // Don't break — let the lease expire naturally if we can't recover
+                }
+                // Drain keepalive responses
+                let _ = stream.message().await;
+            }
+        })
+    }
+
+    /// Discover all currently registered peers (excluding this node).
+    pub async fn discover_peers(
+        &self,
+    ) -> Result<Vec<PeerInfo>, Box<dyn std::error::Error + Send + Sync>> {
+        let mut client = self.client.lock().await;
+        let resp = client
+            .get(KEY_PREFIX, Some(GetOptions::new().with_prefix()))
+            .await?;
+
+        let mut peers = Vec::new();
+        for kv in resp.kvs() {
+            match serde_json::from_slice::<PeerInfo>(kv.value()) {
+                Ok(info) if info.storage_name != self.storage_name => {
+                    peers.push(info);
+                }
+                Ok(_) => {} // skip self
+                Err(e) => {
+                    tracing::warn!(
+                        key = %String::from_utf8_lossy(kv.key()),
+                        error = %e,
+                        "failed to parse peer info from etcd"
+                    );
+                }
+            }
+        }
+        Ok(peers)
+    }
+
+    /// Start a long-running watch loop that monitors peer join/leave events.
+    ///
+    /// Callbacks are invoked synchronously within the watch task; keep them fast
+    /// (prefer sending messages to actors rather than doing blocking work).
+    pub fn start_watch(
+        self: &Arc<Self>,
+        on_peer_joined: impl Fn(PeerInfo) + Send + Sync + 'static,
+        on_peer_left: impl Fn(String) + Send + Sync + 'static,
+    ) -> tokio::task::JoinHandle<()> {
+        let this = Arc::clone(self);
+        let my_name = self.storage_name.clone();
+
+        tokio::spawn(async move {
+            let on_joined = Arc::new(on_peer_joined);
+            let on_left = Arc::new(on_peer_left);
+
+            loop {
+                // Create a fresh watch client each iteration (after reconnect)
+                let mut watch_stream = {
+                    let mut client = this.client.lock().await;
+                    match client
+                        .watch(KEY_PREFIX, Some(WatchOptions::new().with_prefix()))
+                        .await
+                    {
+                        Ok(stream) => stream,
+                        Err(e) => {
+                            tracing::error!(error = %e, "etcd watch failed, retrying in 3s");
+                            tokio::time::sleep(std::time::Duration::from_secs(3)).await;
+                            continue;
+                        }
+                    }
+                };
+
+                tracing::info!("etcd watch loop started");
+
+                loop {
+                    match watch_stream.message().await {
+                        Ok(Some(resp)) => {
+                            for event in resp.events() {
+                                match event.event_type() {
+                                    EventType::Put => {
+                                        if let Some(kv) = event.kv()
+                                            && let Ok(info) =
+                                                serde_json::from_slice::<PeerInfo>(kv.value())
+                                            && info.storage_name != my_name
+                                        {
+                                            tracing::info!(
+                                                peer = %info.storage_name,
+                                                cluster_addr = %info.cluster_addr,
+                                                "peer joined via etcd watch"
+                                            );
+                                            on_joined(info);
+                                        }
+                                    }
+                                    EventType::Delete => {
+                                        if let Some(kv) = event.kv() {
+                                            let key = String::from_utf8_lossy(kv.key());
+                                            let name = key
+                                                .strip_prefix(KEY_PREFIX)
+                                                .unwrap_or(&key)
+                                                .to_string();
+                                            if name != my_name {
+                                                tracing::warn!(
+                                                    peer = %name,
+                                                    "peer left (etcd lease expired or key deleted)"
+                                                );
+                                                on_left(name);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        Ok(None) => {
+                            tracing::warn!("etcd watch stream ended");
+                            break;
+                        }
+                        Err(e) => {
+                            tracing::error!(error = %e, "etcd watch stream error");
+                            break;
+                        }
+                    }
+                }
+
+                // Reconnect after a delay
+                tracing::info!("etcd watch loop restarting in 3s");
+                tokio::time::sleep(std::time::Duration::from_secs(3)).await;
+            }
+        })
+    }
+
+    /// Check if the lease is still alive (for external health monitoring).
+    pub async fn is_lease_alive(&self) -> bool {
+        let mut client = self.client.lock().await;
+        match client.lease_time_to_live(self.lease_id, None).await {
+            Ok(resp) => resp.ttl() > 0,
+            Err(_) => false,
+        }
+    }
+}
@@ -0,0 +1,214 @@
+//! Cluster discovery: etcd-driven ractor_cluster node discovery.
+//!
+//! Architecture:
+//!   1. Start a `ractor_cluster::NodeServer` (TCP listener for actor remoting)
+//!   2. Connect to etcd and register this node
+//!   3. Discover existing peers → `client_connect()` to each
+//!   4. Watch etcd for future peer join/leave → connect/disconnect dynamically
+//!
+//! Once ractor_cluster TCP connections are established, the existing
+//! `pg::get_members()` / `ractor::call_t!()` APIs automatically work
+//! cross-network — no changes needed in actor/handler.rs or server/mod.rs.
+
+pub mod discovery;
+pub mod types;
+
+pub use discovery::EtcdRegistry;
+pub use types::PeerInfo;
+
+use std::sync::Arc;
+
+use ractor::ActorRef;
+use ractor_cluster::node::NodeConnectionMode;
+use ractor_cluster::{NodeServer, NodeServerMessage, client_connect};
+
+use crate::error::{GitError, GitResult};
+
+/// Configuration for the cluster subsystem.
+#[derive(Debug, Clone)]
+pub struct ClusterConfig {
+    /// etcd endpoints (e.g. ["http://etcd1:2379", "http://etcd2:2379"])
+    pub etcd_endpoints: Vec<String>,
+    /// Logical name for this storage node
+    pub storage_name: String,
+    /// gRPC address advertised to clients
+    pub grpc_addr: String,
+    /// TCP port for ractor_cluster NodeServer
+    pub cluster_port: u16,
+    /// Shared authentication cookie for ractor_cluster
+    pub cookie: String,
+    /// etcd lease TTL in seconds
+    pub lease_ttl_secs: i64,
+    /// etcd connection timeout in milliseconds
+    pub connect_timeout_ms: u64,
+    /// Hostname used in the ractor_cluster node name (`name@hostname`).
+    /// Also used by remote nodes to connect back via `{cluster_hostname}:{cluster_port}`.
+    /// In K8s/Docker, this should be a resolvable address (Pod IP, service DNS, etc.)
+    pub cluster_hostname: String,
+}
+
+/// The running cluster manager. Holds references to the NodeServer and etcd registry.
+/// Dropping this will stop the background tasks.
+pub struct ClusterManager {
+    /// The ractor_cluster NodeServer actor
+    pub node_server: ActorRef<NodeServerMessage>,
+    /// The etcd registry (for health checks, etc.)
+    pub registry: Arc<EtcdRegistry>,
+    /// Handles for background tasks (keepalive + watch)
+    _keepalive_handle: tokio::task::JoinHandle<()>,
+    _watch_handle: tokio::task::JoinHandle<()>,
+}
+
+impl ClusterManager {
+    /// Start the full cluster subsystem:
+    ///   1. Spawn NodeServer (TCP listener)
+    ///   2. Connect to etcd + register
+    ///   3. Discover peers → client_connect
+    ///   4. Start keepalive + watch loops
+    ///
+    /// Returns `Err` if etcd is unreachable (caller should fall back to standalone).
+    pub async fn start(config: ClusterConfig) -> GitResult<Self> {
+        // ── Step 1: Start NodeServer ──
+        let node_server = spawn_node_server(&config).await?;
+        tracing::info!(
+            port = config.cluster_port,
+            hostname = %config.cluster_hostname,
+            "NodeServer started"
+        );
+
+        // ── Step 2: Connect to etcd and register ──
+        let cluster_addr = format!("{}:{}", config.cluster_hostname, config.cluster_port);
+        let peer_info = PeerInfo {
+            storage_name: config.storage_name.clone(),
+            cluster_addr: cluster_addr.clone(),
+            grpc_addr: config.grpc_addr.clone(),
+            version: env!("CARGO_PKG_VERSION").to_string(),
+        };
+
+        let registry = Arc::new(
+            EtcdRegistry::register(
+                config.etcd_endpoints.clone(),
+                &peer_info,
+                config.lease_ttl_secs,
+                config.connect_timeout_ms,
+            )
+            .await
+            .map_err(|e| GitError::Internal(format!("etcd registration failed: {e}")))?,
+        );
+
+        // ── Step 3: Discover existing peers and connect ──
+        let peers = registry
+            .discover_peers()
+            .await
+            .map_err(|e| GitError::Internal(format!("peer discovery failed: {e}")))?;
+
+        for peer in &peers {
+            connect_to_peer(&node_server, peer, &config.storage_name).await;
+        }
+
+        // ── Step 4: Start background tasks ──
+        let keepalive_handle = registry.start_keepalive();
+
+        let ns_for_watch = node_server.clone();
+        let my_name_for_watch = config.storage_name.clone();
+        let watch_handle = registry.start_watch(
+            move |peer| {
+                let ns = ns_for_watch.clone();
+                let my_name = my_name_for_watch.clone();
+                tokio::spawn(async move {
+                    connect_to_peer(&ns, &peer, &my_name).await;
+                });
+            },
+            move |name| {
+                tracing::info!(
+                    peer = %name,
+                    "peer left etcd registry (ractor_cluster will cleanup TCP session)"
+                );
+                // ractor_cluster automatically:
+                //   1. Detects TCP disconnection
+                //   2. Stops the NodeSession actor
+                //   3. Stops all RemoteActors for that session
+                //   4. Removes them from Process Groups
+                // No manual cleanup needed.
+            },
+        );
+
+        tracing::info!(
+            storage_name = %config.storage_name,
+            peers_found = peers.len(),
+            "cluster manager started"
+        );
+
+        Ok(Self {
+            node_server,
+            registry,
+            _keepalive_handle: keepalive_handle,
+            _watch_handle: watch_handle,
+        })
+    }
+}
+
+/// Spawn the ractor_cluster NodeServer actor (TCP listener for inter-node communication).
+async fn spawn_node_server(config: &ClusterConfig) -> GitResult<ActorRef<NodeServerMessage>> {
+    let server = NodeServer::new(
+        config.cluster_port,
+        config.cookie.clone(),
+        config.storage_name.clone(),
+        config.cluster_hostname.clone(),
+        None, // no encryption (internal network)
+        Some(NodeConnectionMode::Transitive),
+    );
+
+    let (actor_ref, _handle) = ractor::Actor::spawn(
+        Some(format!("node_server_{}", config.storage_name)),
+        server,
+        (),
+    )
+    .await
+    .map_err(|e| GitError::Internal(format!("failed to spawn NodeServer: {e}")))?;
+
+    Ok(actor_ref)
+}
+
+/// Establish a ractor_cluster TCP connection to a remote peer.
+///
+/// Uses ordering optimization: only the node with the lexicographically
+/// smaller `storage_name` initiates the connection. The other side will
+/// accept the incoming connection. This prevents duplicate connections.
+async fn connect_to_peer(
+    node_server: &ActorRef<NodeServerMessage>,
+    peer: &PeerInfo,
+    my_name: &str,
+) {
+    // Ordering optimization: only smaller-named node connects
+    if my_name >= peer.storage_name.as_str() {
+        tracing::debug!(
+            peer = %peer.storage_name,
+            "skipping connect (peer has lower/equal name, they connect to us)"
+        );
+        return;
+    }
+
+    tracing::info!(
+        peer = %peer.storage_name,
+        cluster_addr = %peer.cluster_addr,
+        "connecting to peer via ractor_cluster"
+    );
+
+    match client_connect(node_server, peer.cluster_addr.as_str()).await {
+        Ok(()) => {
+            tracing::info!(
+                peer = %peer.storage_name,
+                "ractor_cluster connection initiated"
+            );
+        }
+        Err(e) => {
+            tracing::warn!(
+                peer = %peer.storage_name,
+                cluster_addr = %peer.cluster_addr,
+                error = %e,
+                "failed to connect to peer (will retry on next watch event)"
+            );
+        }
+    }
+}
@@ -0,0 +1,14 @@
+use serde::{Deserialize, Serialize};
+
+/// Information about a peer node, registered in etcd.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PeerInfo {
+    /// Logical storage name (e.g. "node-a", "default")
+    pub storage_name: String,
+    /// ractor_cluster TCP address (e.g. "10.0.1.4:4697")
+    pub cluster_addr: String,
+    /// gRPC service address (e.g. "http://10.0.1.4:50051")
+    pub grpc_addr: String,
+    /// Software version
+    pub version: String,
+}