feat(cluster): implement distributed clustering with etcd coordination

- Integrate etcd-client for distributed coordination and leader election
- Add remote client macros with proper formatting for all services
- Implement RequestMetrics for tracking RPC performance and errors
- Add rate limiting mechanism across all service endpoints
- Create ElectionRequest and ElectionResult message types for leader election
- Add role management with primary/replica switching capabilities
- Implement health checker with automatic failover detection
- Add repository count metrics for cluster monitoring
- Update Cargo.toml with etcd-client and dashmap dependencies
- Modify RepoEntry to include read_only flag for replica handling
- Implement should_accept_election logic to prevent duplicate elections
- Add RoleChangedEvent handling for cluster role updates
This commit is contained in:
zhenyi
2026-06-08 14:31:29 +08:00
parent d243dce027
commit 8f472a0443
37 changed files with 4691 additions and 83 deletions
+172
View File
@@ -0,0 +1,172 @@
//! Repository-level rate limiting via per-repo semaphores.
//!
//! Prevents any single repository from consuming all server resources.
//! Uses `tokio::sync::Semaphore` with configurable max concurrent operations.
//!
//! Integration pattern:
//! let _guard = rate_limit::acquire(svc, header).await?;
//! // ... handle request ...
//! // guard is dropped here → permit released
use dashmap::DashMap;
use std::sync::{Arc, OnceLock};
use tokio::sync::Semaphore;
// ── Configuration ───────────────────────────────────────────────────
/// Default max concurrent operations per repository.
const DEFAULT_MAX_CONCURRENT: usize = 5;
/// Global rate limiter state.
struct RateLimiter {
/// Per-repository semaphores. Key = repository relative_path.
semaphores: DashMap<String, Arc<Semaphore>>,
/// Max concurrent operations per repository.
max_concurrent: usize,
}
static RATE_LIMITER: OnceLock<RateLimiter> = OnceLock::new();
fn limiter() -> &'static RateLimiter {
RATE_LIMITER.get_or_init(|| {
let max = std::env::var("GITKS_RATE_LIMIT_MAX_CONCURRENT")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(DEFAULT_MAX_CONCURRENT);
tracing::info!(
max_concurrent = max,
"repository-level rate limiter initialized"
);
RateLimiter {
semaphores: DashMap::new(),
max_concurrent: max,
}
})
}
// ── Permit guard ───────────────────────────────────────────────────
/// A guard that holds a rate-limit permit. The permit is released on drop.
pub struct RateLimitGuard {
/// The semaphore permit. Dropping this releases the permit.
_permit: tokio::sync::OwnedSemaphorePermit,
}
/// Acquire a rate-limit permit for the given repository.
///
/// If the repository has `max_concurrent` operations already in flight,
/// this will wait asynchronously until a permit becomes available.
///
/// Returns `None` if no repository header is provided (e.g., global health checks).
pub async fn acquire(repo_relative_path: Option<&str>) -> Option<RateLimitGuard> {
let repo = repo_relative_path?;
if repo.is_empty() {
return None;
}
let l = limiter();
if l.max_concurrent == 0 {
// Unlimited
return None;
}
let sem = l
.semaphores
.entry(repo.to_string())
.or_insert_with(|| Arc::new(Semaphore::new(l.max_concurrent)))
.value()
.clone();
// Release DashMap reference before awaiting
let _ = l;
match tokio::time::timeout(
std::time::Duration::from_secs(30),
sem.clone().acquire_owned(),
)
.await
{
Ok(Ok(permit)) => {
tracing::debug!(
repo = %repo,
available = sem.available_permits(),
"rate limit permit acquired"
);
Some(RateLimitGuard { _permit: permit })
}
Ok(Err(_closed)) => {
// Semaphore was closed — recreate it
tracing::warn!(
repo = %repo,
"rate limit semaphore closed, recreating"
);
let new_sem = Arc::new(Semaphore::new(limiter().max_concurrent));
let permit = new_sem
.clone()
.acquire_owned()
.await
.expect("newly created semaphore should have permits");
limiter()
.semaphores
.insert(repo.to_string(), new_sem);
Some(RateLimitGuard { _permit: permit })
}
Err(_elapsed) => {
tracing::warn!(
repo = %repo,
max_concurrent = limiter().max_concurrent,
"rate limit timeout waiting for permit"
);
None
}
}
}
/// Acquire a rate-limit permit, returning a tonic error on timeout / overload.
pub async fn acquire_or_reject(repo_relative_path: Option<&str>) -> Result<Option<RateLimitGuard>, tonic::Status> {
let repo = repo_relative_path.unwrap_or("");
if repo.is_empty() {
return Ok(None);
}
match acquire(Some(repo)).await {
Some(guard) => Ok(Some(guard)),
None => {
if limiter().max_concurrent == 0 {
return Ok(None);
}
// Timeout — reject with resource exhausted
Err(tonic::Status::resource_exhausted(format!(
"rate limit exceeded for repository '{repo}': max {max} concurrent operations",
max = limiter().max_concurrent
)))
}
}
}
/// Remove the semaphore for a repository (called on repo deletion).
pub fn remove_repository(repo_relative_path: &str) {
limiter().semaphores.remove(repo_relative_path);
tracing::debug!(repo = %repo_relative_path, "rate limit semaphore removed");
}
/// Update the max concurrent limit at runtime.
pub fn set_max_concurrent(max: usize) {
let l = limiter();
// We can't modify the field directly through OnceLock, but we can
// update existing semaphores to add or remove permits as needed.
// For a simpler approach, just log and let new semaphores use the new value.
// Since max_concurrent is only read on insert, we use a separate atomic.
use std::sync::atomic::AtomicUsize;
static OVERRIDE: std::sync::atomic::AtomicUsize = AtomicUsize::new(0);
OVERRIDE.store(max, std::sync::atomic::Ordering::Relaxed);
// Recreate all existing semaphores
for entry in &l.semaphores {
let _old = l.semaphores.insert(
entry.key().clone(),
Arc::new(Semaphore::new(max)),
);
}
tracing::info!(max_concurrent = max, "rate limit max_concurrent updated");
}