feat(cluster): implement distributed clustering with etcd coordination

- Integrate etcd-client for distributed coordination and leader election
- Add remote client macros with proper formatting for all services
- Implement RequestMetrics for tracking RPC performance and errors
- Add rate limiting mechanism across all service endpoints
- Create ElectionRequest and ElectionResult message types for leader election
- Add role management with primary/replica switching capabilities
- Implement health checker with automatic failover detection
- Add repository count metrics for cluster monitoring
- Update Cargo.toml with etcd-client and dashmap dependencies
- Modify RepoEntry to include read_only flag for replica handling
- Implement should_accept_election logic to prevent duplicate elections
- Add RoleChangedEvent handling for cluster role updates
This commit is contained in:
zhenyi
2026-06-08 14:31:29 +08:00
parent d243dce027
commit 8f472a0443
37 changed files with 4691 additions and 83 deletions
+183 -5
View File
@@ -1,12 +1,37 @@
use std::path::PathBuf;
use std::time::Duration;
use gitks::actor::init_actor_cluster;
use gitks::cluster::{ClusterConfig, ClusterManager};
use gitks::disk_cache::DiskCache;
use gitks::hooks::HookManager;
use gitks::metrics;
use gitks::server::{GitksService, serve};
// Fallback for `GITKS_HOST` when that environment variable is not set.
const DEFAULT_HOST: &str = "0.0.0.0";
// Fallback for `GITKS_PORT`; kept as a string because it is interpolated
// into "{host}:{port}" before being parsed as a socket address.
const DEFAULT_PORT: &str = "50051";
// Fallback for `STORAGE_NAME` when that environment variable is not set.
const DEFAULT_STORAGE_NAME: &str = "default";
/// Reads the environment variable `key` as a `String`.
///
/// Returns an owned copy of `default` when the variable cannot be read
/// (it is missing or its value is not valid Unicode). A variable that is
/// set to the empty string is returned as-is.
fn env_or(key: &str, default: &str) -> String {
    match std::env::var(key) {
        Ok(value) => value,
        Err(_) => String::from(default),
    }
}
/// Reads the environment variable `key` as a boolean flag.
///
/// Recognised values are `true` / `1` / `yes` and `false` / `0` / `no`.
/// Surrounding whitespace is ignored and matching is ASCII
/// case-insensitive, so `TRUE`, `False` or ` yes ` are interpreted as
/// intended instead of silently falling back to `default`.
///
/// Returns `default` when the variable cannot be read (missing or not
/// valid Unicode) or holds any other value.
fn env_bool(key: &str, default: bool) -> bool {
    match std::env::var(key) {
        // Normalise before matching so common spellings are all accepted.
        Ok(raw) => match raw.trim().to_ascii_lowercase().as_str() {
            "true" | "1" | "yes" => true,
            "false" | "0" | "no" => false,
            _ => default,
        },
        Err(_) => default,
    }
}
/// Reads the environment variable `key` as an unsigned 64-bit integer.
///
/// Returns `default` when the variable cannot be read (missing or not
/// valid Unicode) or when its value does not parse as a `u64`. The value
/// is parsed exactly as given, without trimming.
fn env_u64(key: &str, default: u64) -> u64 {
    match std::env::var(key) {
        Ok(text) => text.parse::<u64>().unwrap_or(default),
        Err(_) => default,
    }
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
dotenvy::dotenv().ok();
@@ -14,10 +39,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing::info!(version = env!("CARGO_PKG_VERSION"), "gitks starting up");
let host = std::env::var("GITKS_HOST").unwrap_or_else(|_| DEFAULT_HOST.into());
let port = std::env::var("GITKS_PORT").unwrap_or_else(|_| DEFAULT_PORT.into());
let storage_name =
std::env::var("STORAGE_NAME").unwrap_or_else(|_| DEFAULT_STORAGE_NAME.into());
let host = env_or("GITKS_HOST", DEFAULT_HOST);
let port = env_or("GITKS_PORT", DEFAULT_PORT);
let storage_name = env_or("STORAGE_NAME", DEFAULT_STORAGE_NAME);
let grpc_addr =
std::env::var("GITKS_ADVERTISE_ADDR").unwrap_or_else(|_| format!("http://{host}:{port}"));
@@ -32,8 +56,162 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
std::fs::create_dir_all(&repo_prefix)?;
}
// Disk cache configuration
let disk_cache_enabled = env_bool("GITKS_DISK_CACHE_ENABLED", false);
let disk_cache_max_age = env_u64("GITKS_DISK_CACHE_MAX_AGE", 300);
let disk_cache = DiskCache::new(
repo_prefix.clone(),
env!("CARGO_PKG_VERSION").to_string(),
disk_cache_max_age,
disk_cache_enabled,
);
if disk_cache_enabled {
tracing::info!("disk cache enabled, max_age={disk_cache_max_age}s");
disk_cache.cleanup_on_startup()?;
gitks::disk_cache::start_cache_cleanup_task(disk_cache.clone(), Duration::from_secs(300));
} else {
tracing::info!("disk cache disabled");
}
// Pack cache configuration
let pack_cache_enabled = env_bool("GITKS_PACK_CACHE_ENABLED", false);
let pack_backpressure = env_bool("GITKS_PACK_CACHE_BACKPRESSURE", true);
// Pack cache: needs disk_cache. If disk_cache is enabled, info/refs cache
// is always available via PackCache wrapper. pack-objects caching is
// additionally controlled by GITKS_PACK_CACHE_ENABLED.
let pack_cache = if disk_cache_enabled {
tracing::info!(
"pack cache wrapper enabled, pack-objects cache={pack_cache_enabled}, backpressure={pack_backpressure}"
);
Some(gitks::pack_cache::PackCache::new(
disk_cache.clone(),
pack_backpressure,
))
} else {
None
};
// Hook manager configuration
let hooks_enabled = env_bool("GITKS_HOOKS_ENABLED", true);
let server_hooks_dir = std::env::var("GITKS_SERVER_HOOKS_DIR")
.ok()
.map(PathBuf::from);
let hook_callback_addr = std::env::var("GITKS_HOOK_CALLBACK_ADDR").ok();
let hook_timeout = env_u64("GITKS_HOOK_TIMEOUT", 30);
let allow_custom_hooks = env_bool("GITKS_ALLOW_CUSTOM_HOOKS", true);
let hook_manager = if hooks_enabled {
tracing::info!("hooks enabled, timeout={hook_timeout}s, custom_hooks={allow_custom_hooks}");
Some(HookManager::new(
repo_prefix.clone(),
server_hooks_dir,
hook_callback_addr,
Duration::from_secs(hook_timeout),
allow_custom_hooks,
))
} else {
tracing::info!("hooks disabled");
None
};
// Health check / election configuration
let health_check_interval = env_u64("GITKS_HEALTH_CHECK_INTERVAL", 1);
let max_health_failures = env_u64("GITKS_MAX_HEALTH_FAILURES", 10);
tracing::info!(
"health check: interval={health_check_interval}s, max_failures={max_health_failures}"
);
// ── Metrics server ──
let metrics_port = env_u64("GITKS_METRICS_PORT", 9100) as u16;
let _metrics_handle = metrics::start_metrics_server(metrics_port);
tracing::info!("metrics server on port {metrics_port}");
// ── Cluster discovery (etcd → ractor_cluster) ──
//
// When GITKS_ETCD_ENDPOINTS is set, the node:
// 1. Starts a ractor_cluster NodeServer (TCP listener)
// 2. Connects to etcd and registers itself
// 3. Discovers existing peers → establishes ractor_cluster TCP connections
// 4. Watches etcd for future peer join/leave events
//
// Once ractor_cluster connections are up, pg::get_members() automatically
// returns remote actors — no changes needed in actor/handler.rs.
//
// When GITKS_ETCD_ENDPOINTS is unset or empty, or the cluster manager fails
// to start (e.g. etcd is unreachable), the node falls back to standalone
// mode (existing local-only behavior).
let etcd_endpoints = std::env::var("GITKS_ETCD_ENDPOINTS")
.ok()
.filter(|s| !s.is_empty())
.map(|s| {
s.split(',')
.map(str::trim)
.map(String::from)
.collect::<Vec<_>>()
});
let cluster_port = env_or("GITKS_CLUSTER_PORT", "4697")
.parse::<u16>()
.unwrap_or(4697);
let cluster_cookie = env_or("GITKS_CLUSTER_COOKIE", "gitks-default-cookie");
let lease_ttl = env_u64("GITKS_LEASE_TTL", 15) as i64;
let connect_timeout_ms = env_u64("GITKS_ETCD_CONNECT_TIMEOUT", 5000);
// Resolve the hostname/address other nodes use to reach our NodeServer.
// Priority: GITKS_CLUSTER_HOSTNAME > POD_IP (K8s) > HOSTNAME env > "localhost"
let cluster_hostname = std::env::var("GITKS_CLUSTER_HOSTNAME")
.or_else(|_| std::env::var("POD_IP"))
.or_else(|_| std::env::var("HOSTNAME"))
.unwrap_or_else(|_| "localhost".to_string());
let _cluster: Option<ClusterManager> = if let Some(endpoints) = etcd_endpoints {
tracing::info!(
endpoints = ?endpoints,
cluster_port = cluster_port,
cluster_hostname = %cluster_hostname,
"starting cluster discovery via etcd"
);
let config = ClusterConfig {
etcd_endpoints: endpoints,
storage_name: storage_name.clone(),
grpc_addr: grpc_addr.clone(),
cluster_port,
cookie: cluster_cookie,
lease_ttl_secs: lease_ttl,
connect_timeout_ms,
cluster_hostname,
};
match ClusterManager::start(config).await {
Ok(cm) => {
tracing::info!("cluster discovery active");
Some(cm)
}
Err(e) => {
tracing::warn!(error = %e, "etcd unavailable, running in standalone mode");
None
}
}
} else {
tracing::info!("GITKS_ETCD_ENDPOINTS not set, running in standalone mode");
None
};
let addr: std::net::SocketAddr = format!("{host}:{port}").parse()?;
let svc = GitksService::new(repo_prefix.clone());
let mut svc = GitksService::new(repo_prefix.clone());
if disk_cache_enabled {
svc = svc.with_disk_cache(disk_cache);
}
if let Some(pc) = pack_cache {
svc = svc.with_pack_cache(pc);
}
if let Some(hm) = hook_manager {
svc = svc.with_hook_manager(hm);
}
let (node_actor, node_handle) =
init_actor_cluster(svc.clone(), storage_name.clone(), grpc_addr.clone()).await?;
let svc = svc