refactor(actor): implement Raft consensus algorithm for cluster leader election
- Add voting mechanism with term tracking and vote persistence
- Implement election triggering logic with majority vote counting
- Add primary/replica role transition handling with state management
- Integrate health check failure detection for automatic elections
- Refactor actor messaging system for distributed coordination
- Update repository registration to query cluster for existing primary
- Add broadcast mechanism for role change notifications
- Implement proper term comparison and duplicate request filtering
- Upgrade dependency versions including tokio-util for async utilities
- Optimize code formatting and line wrapping for improved readability
- Remove redundant blank lines and improve code structure consistency
- Enhance error logging and trace information for debugging purposes
This commit is contained in:
+133
-49
@@ -2,10 +2,11 @@ use crate::actor::message::{
|
||||
ElectionRequest, ElectionResult, GitNodeMessage, NodeHealth, ROLE_PRIMARY, ROLE_REPLICA,
|
||||
RefUpdateEvent, RoleChangedEvent, RouteDecision,
|
||||
};
|
||||
use crate::pb::RepositoryHeader;
|
||||
use crate::server::GitksService;
|
||||
use async_trait::async_trait;
|
||||
use ractor::pg;
|
||||
use ractor::{Actor, ActorProcessingErr, ActorRef, SupervisionEvent};
|
||||
use ractor::{Actor, ActorCell, ActorProcessingErr, ActorRef, SupervisionEvent};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -43,6 +44,7 @@ pub struct GitNodeState {
|
||||
health_failures: u32,
|
||||
is_primary: bool,
|
||||
last_known_primary_grpc: String,
|
||||
voted_for: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -76,6 +78,7 @@ impl Actor for GitNodeActor {
|
||||
health_failures: 0,
|
||||
is_primary: true, // Will be refined at registration
|
||||
last_known_primary_grpc: args.grpc_addr.clone(),
|
||||
voted_for: None,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -96,12 +99,12 @@ impl Actor for GitNodeActor {
|
||||
.unwrap_or(&repo_path)
|
||||
.trim_start_matches('/')
|
||||
.to_string();
|
||||
register_repo(&myself, state, relative_path);
|
||||
register_repo(&myself, state, relative_path).await;
|
||||
}
|
||||
}
|
||||
|
||||
GitNodeMessage::RegisterRepository(header) => {
|
||||
register_repo(&myself, state, header.relative_path);
|
||||
register_repo(&myself, state, header.relative_path).await;
|
||||
}
|
||||
|
||||
GitNodeMessage::RemoveRepository(header) => {
|
||||
@@ -173,10 +176,12 @@ impl Actor for GitNodeActor {
|
||||
term = request.term,
|
||||
current_term = state.current_term,
|
||||
accepted = accepted,
|
||||
voted_for = ?state.voted_for,
|
||||
"election vote"
|
||||
);
|
||||
if accepted {
|
||||
state.current_term = request.term;
|
||||
state.voted_for = Some(request.candidate_storage_name.clone());
|
||||
state.last_known_primary_grpc = request.candidate_grpc_addr.clone();
|
||||
}
|
||||
reply
|
||||
@@ -208,6 +213,7 @@ impl Actor for GitNodeActor {
|
||||
state.is_primary = true;
|
||||
state.current_term = event.term;
|
||||
state.health_failures = 0;
|
||||
state.voted_for = None;
|
||||
for entry in state.repos.values_mut() {
|
||||
entry.role = ROLE_PRIMARY.to_string();
|
||||
entry.read_only = false;
|
||||
@@ -220,6 +226,7 @@ impl Actor for GitNodeActor {
|
||||
);
|
||||
state.is_primary = false;
|
||||
state.current_term = event.term;
|
||||
state.voted_for = None;
|
||||
for entry in state.repos.values_mut() {
|
||||
entry.role = ROLE_REPLICA.to_string();
|
||||
}
|
||||
@@ -237,6 +244,76 @@ impl Actor for GitNodeActor {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
GitNodeMessage::TriggerElection => {
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let total = members.len();
|
||||
let my_cell = myself.get_cell();
|
||||
|
||||
let new_term = state.current_term.wrapping_add(1);
|
||||
|
||||
let mut accepted_count = 0u64;
|
||||
for member in &members {
|
||||
if *member == my_cell {
|
||||
// We vote for ourselves
|
||||
accepted_count += 1;
|
||||
continue;
|
||||
}
|
||||
let actor_ref: ActorRef<GitNodeMessage> = member.clone().into();
|
||||
let request = ElectionRequest {
|
||||
candidate_storage_name: state.storage_name.clone(),
|
||||
candidate_grpc_addr: state.grpc_addr.clone(),
|
||||
candidate_actor_name: state.actor_name.clone(),
|
||||
term: new_term,
|
||||
reason: "health_check_failure".to_string(),
|
||||
};
|
||||
match ractor::call_t!(actor_ref, GitNodeMessage::ElectPrimary, 1000, request) {
|
||||
Ok(result) if result.accepted => {
|
||||
accepted_count += 1;
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
member = ?member.get_id(),
|
||||
"no response from member during election"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let majority = (total / 2).max(1) + 1;
|
||||
if accepted_count >= majority as u64 {
|
||||
tracing::info!(
|
||||
term = new_term,
|
||||
accepted = accepted_count,
|
||||
total = total,
|
||||
"won election, promoting to PRIMARY"
|
||||
);
|
||||
state.is_primary = true;
|
||||
state.current_term = new_term;
|
||||
state.health_failures = 0;
|
||||
state.voted_for = None;
|
||||
for entry in state.repos.values_mut() {
|
||||
entry.role = ROLE_PRIMARY.to_string();
|
||||
entry.read_only = false;
|
||||
}
|
||||
let role_event = RoleChangedEvent {
|
||||
storage_name: state.storage_name.clone(),
|
||||
grpc_addr: state.grpc_addr.clone(),
|
||||
new_role: ROLE_PRIMARY.to_string(),
|
||||
term: new_term,
|
||||
relative_paths: state.repos.keys().cloned().collect(),
|
||||
};
|
||||
broadcast_role_changed(&myself, role_event);
|
||||
} else {
|
||||
tracing::warn!(
|
||||
term = new_term,
|
||||
accepted = accepted_count,
|
||||
total = total,
|
||||
"election lost, staying as REPLICA"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -277,9 +354,8 @@ impl Actor for GitNodeActor {
|
||||
|
||||
/// Determine whether to accept an election request.
|
||||
fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bool {
|
||||
// Only accept if the term is greater than our current term
|
||||
// (prevents old/duplicate election messages)
|
||||
if request.term <= state.current_term {
|
||||
// Reject old terms (prevents old/duplicate election messages)
|
||||
if request.term < state.current_term {
|
||||
tracing::warn!(
|
||||
request_term = request.term,
|
||||
current_term = state.current_term,
|
||||
@@ -287,6 +363,20 @@ fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bo
|
||||
);
|
||||
return false;
|
||||
}
|
||||
// Same term: only accept if we haven't already voted for someone else
|
||||
if request.term == state.current_term
|
||||
&& let Some(ref voted_for) = state.voted_for
|
||||
&& voted_for != &request.candidate_storage_name
|
||||
{
|
||||
tracing::warn!(
|
||||
request_term = request.term,
|
||||
current_term = state.current_term,
|
||||
already_voted = %voted_for,
|
||||
candidate = %request.candidate_storage_name,
|
||||
"rejecting election: already voted this term"
|
||||
);
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
@@ -318,7 +408,7 @@ fn build_decision(
|
||||
}
|
||||
}
|
||||
|
||||
fn register_repo(
|
||||
async fn register_repo(
|
||||
myself: &ActorRef<GitNodeMessage>,
|
||||
state: &mut GitNodeState,
|
||||
relative_path: String,
|
||||
@@ -329,10 +419,19 @@ fn register_repo(
|
||||
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let my_cell = myself.get_cell();
|
||||
let other_nodes_exist = members.iter().any(|m| m != &my_cell);
|
||||
|
||||
let role = if other_nodes_exist {
|
||||
ROLE_REPLICA.to_string()
|
||||
let role = if members.iter().any(|m| m != &my_cell) {
|
||||
let header = RepositoryHeader {
|
||||
storage_name: String::new(),
|
||||
relative_path: relative_path.clone(),
|
||||
storage_path: String::new(),
|
||||
};
|
||||
let primary_found = find_primary_in_cluster(&members, &my_cell, &header).await;
|
||||
if primary_found {
|
||||
ROLE_REPLICA.to_string()
|
||||
} else {
|
||||
ROLE_PRIMARY.to_string()
|
||||
}
|
||||
} else {
|
||||
ROLE_PRIMARY.to_string()
|
||||
};
|
||||
@@ -365,6 +464,28 @@ fn register_repo(
|
||||
);
|
||||
}
|
||||
|
||||
/// Query all cluster members (except self) to find if a repository has a PRIMARY.
|
||||
pub async fn find_primary_in_cluster(
|
||||
members: &[ActorCell],
|
||||
my_cell: &ActorCell,
|
||||
header: &RepositoryHeader,
|
||||
) -> bool {
|
||||
for member in members {
|
||||
if member == my_cell {
|
||||
continue;
|
||||
}
|
||||
let actor_ref: ActorRef<GitNodeMessage> = member.clone().into();
|
||||
if let Ok(decision) =
|
||||
ractor::call_t!(actor_ref, GitNodeMessage::FindPrimary, 500, header.clone())
|
||||
&& decision.found
|
||||
&& decision.role == ROLE_PRIMARY
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns the leading segment of `relative_path`, i.e. the text before the first `/`.
///
/// A path without any `/` is returned whole; an empty path, or one that starts with
/// `/`, yields `""`. `str::split` always produces at least one item, so the `"root"`
/// fallback is kept only for completeness and is not expected to be observed.
fn extract_category(relative_path: &str) -> &str {
    let mut segments = relative_path.split('/');
    match segments.next() {
        Some(category) => category,
        None => "root",
    }
}
|
||||
@@ -417,9 +538,9 @@ fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, ma
|
||||
|
||||
if consecutive_failures >= max_failures {
|
||||
tracing::error!(
|
||||
"no other nodes reachable for {max_failures} checks, triggering self-election as PRIMARY"
|
||||
"no other nodes reachable for {max_failures} checks, triggering election"
|
||||
);
|
||||
trigger_self_election(&myself);
|
||||
myself.cast(GitNodeMessage::TriggerElection).ok();
|
||||
consecutive_failures = 0;
|
||||
}
|
||||
}
|
||||
@@ -427,43 +548,6 @@ fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, ma
|
||||
});
|
||||
}
|
||||
|
||||
/// Trigger self-election: this node promotes itself to PRIMARY.
|
||||
fn trigger_self_election(myself: &ActorRef<GitNodeMessage>) {
|
||||
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
||||
let total_nodes = members.len();
|
||||
|
||||
tracing::warn!(
|
||||
total_nodes = total_nodes,
|
||||
"initiating self-election as new PRIMARY"
|
||||
);
|
||||
|
||||
let new_term = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
|
||||
myself
|
||||
.cast(GitNodeMessage::RoleChanged(RoleChangedEvent {
|
||||
storage_name: String::new(), // will be filled by handler from our own state
|
||||
grpc_addr: String::new(),
|
||||
new_role: ROLE_PRIMARY.to_string(),
|
||||
term: new_term,
|
||||
relative_paths: Vec::new(), // all repos
|
||||
}))
|
||||
.ok();
|
||||
|
||||
broadcast_role_changed(
|
||||
myself,
|
||||
RoleChangedEvent {
|
||||
storage_name: String::new(), // handler fills
|
||||
grpc_addr: String::new(),
|
||||
new_role: ROLE_PRIMARY.to_string(),
|
||||
term: new_term,
|
||||
relative_paths: Vec::new(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub async fn start_node_actor(
|
||||
service: GitksService,
|
||||
storage_name: String,
|
||||
|
||||
Reference in New Issue
Block a user