feat(cluster): implement distributed clustering with etcd coordination

- Integrate etcd-client for distributed coordination and leader election - Add remote client macros with proper formatting for all services - Implement RequestMetrics for tracking RPC performance and errors - Add rate limiting mechanism across all service endpoints - Create ElectionRequest and ElectionResult message types for leader election - Add role management with primary/replica switching capabilities - Implement health checker with automatic failover detection - Add repository count metrics for cluster monitoring - Update Cargo.toml with etcd-client and dashmap dependencies - Modify RepoEntry to include read_only flag for replica handling - Implement should_accept_election logic to prevent duplicate elections - Add RoleChangedEvent handling for cluster role updates
2026-06-08 14:31:29 +08:00
parent d243dce027
commit 8f472a0443
37 changed files with 4691 additions and 83 deletions
@@ -0,0 +1,172 @@
+//! Repository-level rate limiting via per-repo semaphores.
+//!
+//! Prevents any single repository from consuming all server resources.
+//! Uses `tokio::sync::Semaphore` with configurable max concurrent operations.
+//!
+//! Integration pattern:
+//!   let _guard = rate_limit::acquire(svc, header).await?;
+//!   // ... handle request ...
+//!   // guard is dropped here → permit released
+
+use dashmap::DashMap;
+use std::sync::{Arc, OnceLock};
+use tokio::sync::Semaphore;
+
+// ── Configuration ───────────────────────────────────────────────────
+
+/// Default max concurrent operations per repository.
+const DEFAULT_MAX_CONCURRENT: usize = 5;
+
+/// Global rate limiter state.
+struct RateLimiter {
+    /// Per-repository semaphores. Key = repository relative_path.
+    semaphores: DashMap<String, Arc<Semaphore>>,
+    /// Max concurrent operations per repository.
+    max_concurrent: usize,
+}
+
+static RATE_LIMITER: OnceLock<RateLimiter> = OnceLock::new();
+
+fn limiter() -> &'static RateLimiter {
+    RATE_LIMITER.get_or_init(|| {
+        let max = std::env::var("GITKS_RATE_LIMIT_MAX_CONCURRENT")
+            .ok()
+            .and_then(|v| v.parse().ok())
+            .unwrap_or(DEFAULT_MAX_CONCURRENT);
+
+        tracing::info!(
+            max_concurrent = max,
+            "repository-level rate limiter initialized"
+        );
+
+        RateLimiter {
+            semaphores: DashMap::new(),
+            max_concurrent: max,
+        }
+    })
+}
+
+// ── Permit guard ───────────────────────────────────────────────────
+
+/// A guard that holds a rate-limit permit. The permit is released on drop.
+pub struct RateLimitGuard {
+    /// The semaphore permit. Dropping this releases the permit.
+    _permit: tokio::sync::OwnedSemaphorePermit,
+}
+
+/// Acquire a rate-limit permit for the given repository.
+///
+/// If the repository has `max_concurrent` operations already in flight,
+/// this will wait asynchronously until a permit becomes available.
+///
+/// Returns `None` if no repository header is provided (e.g., global health checks).
+pub async fn acquire(repo_relative_path: Option<&str>) -> Option<RateLimitGuard> {
+    let repo = repo_relative_path?;
+    if repo.is_empty() {
+        return None;
+    }
+    let l = limiter();
+    if l.max_concurrent == 0 {
+        // Unlimited
+        return None;
+    }
+
+    let sem = l
+        .semaphores
+        .entry(repo.to_string())
+        .or_insert_with(|| Arc::new(Semaphore::new(l.max_concurrent)))
+        .value()
+        .clone();
+
+    // Release DashMap reference before awaiting
+    let _ = l;
+
+    match tokio::time::timeout(
+        std::time::Duration::from_secs(30),
+        sem.clone().acquire_owned(),
+    )
+    .await
+    {
+        Ok(Ok(permit)) => {
+            tracing::debug!(
+                repo = %repo,
+                available = sem.available_permits(),
+                "rate limit permit acquired"
+            );
+            Some(RateLimitGuard { _permit: permit })
+        }
+        Ok(Err(_closed)) => {
+            // Semaphore was closed — recreate it
+            tracing::warn!(
+                repo = %repo,
+                "rate limit semaphore closed, recreating"
+            );
+            let new_sem = Arc::new(Semaphore::new(limiter().max_concurrent));
+            let permit = new_sem
+                .clone()
+                .acquire_owned()
+                .await
+                .expect("newly created semaphore should have permits");
+            limiter()
+                .semaphores
+                .insert(repo.to_string(), new_sem);
+            Some(RateLimitGuard { _permit: permit })
+        }
+        Err(_elapsed) => {
+            tracing::warn!(
+                repo = %repo,
+                max_concurrent = limiter().max_concurrent,
+                "rate limit timeout waiting for permit"
+            );
+            None
+        }
+    }
+}
+
+/// Acquire a rate-limit permit, returning a tonic error on timeout / overload.
+pub async fn acquire_or_reject(repo_relative_path: Option<&str>) -> Result<Option<RateLimitGuard>, tonic::Status> {
+    let repo = repo_relative_path.unwrap_or("");
+    if repo.is_empty() {
+        return Ok(None);
+    }
+    match acquire(Some(repo)).await {
+        Some(guard) => Ok(Some(guard)),
+        None => {
+            if limiter().max_concurrent == 0 {
+                return Ok(None);
+            }
+            // Timeout — reject with resource exhausted
+            Err(tonic::Status::resource_exhausted(format!(
+                "rate limit exceeded for repository '{repo}': max {max} concurrent operations",
+                max = limiter().max_concurrent
+            )))
+        }
+    }
+}
+
+/// Remove the semaphore for a repository (called on repo deletion).
+pub fn remove_repository(repo_relative_path: &str) {
+    limiter().semaphores.remove(repo_relative_path);
+    tracing::debug!(repo = %repo_relative_path, "rate limit semaphore removed");
+}
+
+/// Update the max concurrent limit at runtime.
+pub fn set_max_concurrent(max: usize) {
+    let l = limiter();
+    // We can't modify the field directly through OnceLock, but we can
+    // update existing semaphores to add or remove permits as needed.
+    // For a simpler approach, just log and let new semaphores use the new value.
+    // Since max_concurrent is only read on insert, we use a separate atomic.
+    use std::sync::atomic::AtomicUsize;
+    static OVERRIDE: std::sync::atomic::AtomicUsize = AtomicUsize::new(0);
+    OVERRIDE.store(max, std::sync::atomic::Ordering::Relaxed);
+
+    // Recreate all existing semaphores
+    for entry in &l.semaphores {
+        let _old = l.semaphores.insert(
+            entry.key().clone(),
+            Arc::new(Semaphore::new(max)),
+        );
+    }
+    tracing::info!(max_concurrent = max, "rate limit max_concurrent updated");
+}