feat(cluster): implement distributed clustering with etcd coordination
- Integrate etcd-client for distributed coordination and leader election - Add remote client macros with proper formatting for all services - Implement RequestMetrics for tracking RPC performance and errors - Add rate limiting mechanism across all service endpoints - Create ElectionRequest and ElectionResult message types for leader election - Add role management with primary/replica switching capabilities - Implement health checker with automatic failover detection - Add repository count metrics for cluster monitoring - Update Cargo.toml with etcd-client and dashmap dependencies - Modify RepoEntry to include read_only flag for replica handling - Implement should_accept_election logic to prevent duplicate elections - Add RoleChangedEvent handling for cluster role updates
This commit is contained in:
+216
-10
@@ -1,5 +1,6 @@
|
||||
use crate::actor::message::{
|
||||
GitNodeMessage, NodeHealth, ROLE_PRIMARY, ROLE_REPLICA, RefUpdateEvent, RouteDecision,
|
||||
ElectionRequest, ElectionResult, GitNodeMessage, NodeHealth, ROLE_PRIMARY, ROLE_REPLICA,
|
||||
RefUpdateEvent, RoleChangedEvent, RouteDecision,
|
||||
};
|
||||
use crate::server::GitksService;
|
||||
use async_trait::async_trait;
|
||||
@@ -25,6 +26,7 @@ impl GitNodeActor {
|
||||
pub struct RepoEntry {
|
||||
pub role: String,
|
||||
pub last_commit: String,
|
||||
pub read_only: bool,
|
||||
}
|
||||
|
||||
pub struct GitNodeArgs {
|
||||
@@ -37,6 +39,10 @@ pub struct GitNodeState {
|
||||
actor_name: String,
|
||||
grpc_addr: String,
|
||||
repos: HashMap<String, RepoEntry>,
|
||||
current_term: u64,
|
||||
health_failures: u32,
|
||||
is_primary: bool,
|
||||
last_known_primary_grpc: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -58,11 +64,18 @@ impl Actor for GitNodeActor {
|
||||
vec![myself.get_cell()],
|
||||
);
|
||||
tracing::info!(storage_name = %args.storage_name, actor_name = %actor_name, grpc_addr = %args.grpc_addr, "GitNodeActor started");
|
||||
|
||||
start_health_checker(myself.clone(), 1, 10);
|
||||
|
||||
Ok(GitNodeState {
|
||||
storage_name: args.storage_name,
|
||||
actor_name,
|
||||
grpc_addr: args.grpc_addr,
|
||||
grpc_addr: args.grpc_addr.clone(),
|
||||
repos: HashMap::new(),
|
||||
current_term: 0,
|
||||
health_failures: 0,
|
||||
is_primary: true, // Will be refined at registration
|
||||
last_known_primary_grpc: args.grpc_addr.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -76,6 +89,7 @@ impl Actor for GitNodeActor {
|
||||
GitNodeMessage::ScanAndRegister => {
|
||||
let repos = self.service.scan_all_repo()?;
|
||||
tracing::info!(storage_name = %state.storage_name, found = repos.len(), "scanning local repositories");
|
||||
crate::metrics::set_repository_count(repos.len() as u64);
|
||||
for repo_path in repos {
|
||||
let relative_path = repo_path
|
||||
.strip_prefix(self.service.repo_prefix.to_string_lossy().as_ref())
|
||||
@@ -151,6 +165,79 @@ impl Actor for GitNodeActor {
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
// ── Election & Role Change ──────────────────────────────────
|
||||
GitNodeMessage::ElectPrimary(request, reply) => {
|
||||
let accepted = should_accept_election(&request, state);
|
||||
tracing::info!(
|
||||
candidate = %request.candidate_storage_name,
|
||||
term = request.term,
|
||||
current_term = state.current_term,
|
||||
accepted = accepted,
|
||||
"election vote"
|
||||
);
|
||||
if accepted {
|
||||
state.current_term = request.term;
|
||||
state.last_known_primary_grpc = request.candidate_grpc_addr.clone();
|
||||
}
|
||||
reply
|
||||
.send(ElectionResult {
|
||||
accepted,
|
||||
current_term: state.current_term,
|
||||
voter_storage_name: state.storage_name.clone(),
|
||||
voter_role: if state.is_primary {
|
||||
ROLE_PRIMARY
|
||||
} else {
|
||||
ROLE_REPLICA
|
||||
}
|
||||
.to_string(),
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
GitNodeMessage::RoleChanged(event) => {
|
||||
// Empty storage_name = self-promotion from health checker
|
||||
let is_self =
|
||||
event.storage_name.is_empty() || event.storage_name == state.storage_name;
|
||||
|
||||
if is_self && event.new_role == ROLE_PRIMARY {
|
||||
tracing::info!(
|
||||
storage_name = %state.storage_name,
|
||||
term = event.term,
|
||||
"promoted to PRIMARY"
|
||||
);
|
||||
state.is_primary = true;
|
||||
state.current_term = event.term;
|
||||
state.health_failures = 0;
|
||||
for entry in state.repos.values_mut() {
|
||||
entry.role = ROLE_PRIMARY.to_string();
|
||||
entry.read_only = false;
|
||||
}
|
||||
} else if is_self && event.new_role == ROLE_REPLICA {
|
||||
tracing::info!(
|
||||
storage_name = %state.storage_name,
|
||||
term = event.term,
|
||||
"demoted to REPLICA"
|
||||
);
|
||||
state.is_primary = false;
|
||||
state.current_term = event.term;
|
||||
for entry in state.repos.values_mut() {
|
||||
entry.role = ROLE_REPLICA.to_string();
|
||||
}
|
||||
} else {
|
||||
// Another node's role changed — update routing info
|
||||
tracing::info!(
|
||||
storage_name = %event.storage_name,
|
||||
new_role = %event.new_role,
|
||||
"remote node role changed"
|
||||
);
|
||||
state.last_known_primary_grpc = if event.new_role == ROLE_PRIMARY {
|
||||
event.grpc_addr.clone()
|
||||
} else {
|
||||
state.last_known_primary_grpc.clone()
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -189,6 +276,21 @@ impl Actor for GitNodeActor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine whether to accept an election request.
|
||||
fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bool {
|
||||
// Only accept if the term is greater than our current term
|
||||
// (prevents old/duplicate election messages)
|
||||
if request.term <= state.current_term {
|
||||
tracing::warn!(
|
||||
request_term = request.term,
|
||||
current_term = state.current_term,
|
||||
"rejecting election: term too old"
|
||||
);
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn build_decision(
|
||||
state: &GitNodeState,
|
||||
header: &crate::pb::RepositoryHeader,
|
||||
@@ -226,23 +328,20 @@ fn register_repo(
|
||||
return;
|
||||
}
|
||||
|
||||
// Determine role based on cluster state
|
||||
// For simplicity and correctness, we use a conservative approach:
|
||||
// If there are other nodes in the cluster, register as replica initially.
|
||||
// The route_repository logic will determine the actual primary at query time.
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let my_cell = myself.get_cell();
|
||||
let other_nodes_exist = members.iter().any(|m| m != &my_cell);
|
||||
|
||||
let role = if other_nodes_exist {
|
||||
// Conservative: assume another node might be primary
|
||||
// The actual primary will be determined by route_repository query
|
||||
ROLE_REPLICA.to_string()
|
||||
} else {
|
||||
// We're the only node, so we're primary
|
||||
ROLE_PRIMARY.to_string()
|
||||
};
|
||||
|
||||
if role == ROLE_PRIMARY {
|
||||
state.is_primary = true;
|
||||
}
|
||||
|
||||
let category = extract_category(&relative_path);
|
||||
pg::join_scoped(
|
||||
state.storage_name.clone(),
|
||||
@@ -254,6 +353,7 @@ fn register_repo(
|
||||
RepoEntry {
|
||||
role: role.clone(),
|
||||
last_commit: String::new(),
|
||||
read_only: false,
|
||||
},
|
||||
);
|
||||
tracing::info!(
|
||||
@@ -262,7 +362,7 @@ fn register_repo(
|
||||
relative_path = %relative_path,
|
||||
actor_name = %state.actor_name,
|
||||
role = %role,
|
||||
"repository route registered (role will be refined at query time)"
|
||||
"repository route registered"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -270,6 +370,101 @@ fn extract_category(relative_path: &str) -> &str {
|
||||
relative_path.split('/').next().unwrap_or("root")
|
||||
}
|
||||
|
||||
/// Start background health checker that monitors the PRIMARY node.
|
||||
/// If the PRIMARY becomes unreachable for `max_failures` consecutive checks,
|
||||
/// triggers an election.
|
||||
fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, max_failures: u32) {
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(interval_secs));
|
||||
interval.tick().await; // First tick immediate
|
||||
|
||||
let mut consecutive_failures: u32 = 0;
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let my_cell = myself.get_cell();
|
||||
let other_cells: Vec<ractor::ActorCell> =
|
||||
members.into_iter().filter(|m| m != &my_cell).collect();
|
||||
|
||||
if other_cells.is_empty() {
|
||||
// No other nodes → we are the only node → ensure we are PRIMARY
|
||||
consecutive_failures = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut any_reachable = false;
|
||||
for cell in &other_cells {
|
||||
let actor_ref: ActorRef<GitNodeMessage> = cell.clone().into();
|
||||
match ractor::call_t!(actor_ref, GitNodeMessage::GetNodeHealth, 2000) {
|
||||
Ok(health) if health.healthy => {
|
||||
any_reachable = true;
|
||||
break;
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
if any_reachable {
|
||||
consecutive_failures = 0;
|
||||
} else {
|
||||
consecutive_failures += 1;
|
||||
tracing::warn!(
|
||||
consecutive_failures = consecutive_failures,
|
||||
max_failures = max_failures,
|
||||
"no other cluster nodes reachable"
|
||||
);
|
||||
|
||||
if consecutive_failures >= max_failures {
|
||||
tracing::error!(
|
||||
"no other nodes reachable for {max_failures} checks, triggering self-election as PRIMARY"
|
||||
);
|
||||
trigger_self_election(&myself);
|
||||
consecutive_failures = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Trigger self-election: this node promotes itself to PRIMARY.
|
||||
fn trigger_self_election(myself: &ActorRef<GitNodeMessage>) {
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let total_nodes = members.len();
|
||||
|
||||
tracing::warn!(
|
||||
total_nodes = total_nodes,
|
||||
"initiating self-election as new PRIMARY"
|
||||
);
|
||||
|
||||
let new_term = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
|
||||
myself
|
||||
.cast(GitNodeMessage::RoleChanged(RoleChangedEvent {
|
||||
storage_name: String::new(), // will be filled by handler from our own state
|
||||
grpc_addr: String::new(),
|
||||
new_role: ROLE_PRIMARY.to_string(),
|
||||
term: new_term,
|
||||
relative_paths: Vec::new(), // all repos
|
||||
}))
|
||||
.ok();
|
||||
|
||||
broadcast_role_changed(
|
||||
myself,
|
||||
RoleChangedEvent {
|
||||
storage_name: String::new(), // handler fills
|
||||
grpc_addr: String::new(),
|
||||
new_role: ROLE_PRIMARY.to_string(),
|
||||
term: new_term,
|
||||
relative_paths: Vec::new(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub async fn start_node_actor(
|
||||
service: GitksService,
|
||||
storage_name: String,
|
||||
@@ -314,3 +509,14 @@ pub fn broadcast_ref_update(_node_actor: &ActorRef<GitNodeMessage>, event: RefUp
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
/// Broadcast a role change event to all cluster members.
|
||||
pub fn broadcast_role_changed(_actor: &ActorRef<GitNodeMessage>, event: RoleChangedEvent) {
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
for member in members {
|
||||
let actor_ref: ActorRef<GitNodeMessage> = member.into();
|
||||
actor_ref
|
||||
.cast(GitNodeMessage::RoleChanged(event.clone()))
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user