refactor(actor): implement Raft consensus algorithm for cluster leader election

- Add voting mechanism with term tracking and vote persistence
- Implement election triggering logic with majority vote counting
- Add primary/replica role transition handling with state management
- Integrate health check failure detection for automatic elections
- Refactor actor messaging system for distributed coordination
- Update repository registration to query cluster for existing primary
- Add broadcast mechanism for role change notifications
- Implement proper term comparison and duplicate request filtering
- Upgrade dependency versions including tokio-util for async utilities
- Optimize code formatting and line wrapping for improved readability
- Remove redundant blank lines and improve code structure consistency
- Enhance error logging and trace information for debugging purposes
This commit is contained in:
zhenyi
2026-06-10 12:35:10 +08:00
parent ab32e8826e
commit 9a0c26e5f6
40 changed files with 1184 additions and 449 deletions
+133 -49
View File
@@ -2,10 +2,11 @@ use crate::actor::message::{
ElectionRequest, ElectionResult, GitNodeMessage, NodeHealth, ROLE_PRIMARY, ROLE_REPLICA,
RefUpdateEvent, RoleChangedEvent, RouteDecision,
};
use crate::pb::RepositoryHeader;
use crate::server::GitksService;
use async_trait::async_trait;
use ractor::pg;
use ractor::{Actor, ActorProcessingErr, ActorRef, SupervisionEvent};
use ractor::{Actor, ActorCell, ActorProcessingErr, ActorRef, SupervisionEvent};
use std::collections::HashMap;
#[derive(Clone)]
@@ -43,6 +44,7 @@ pub struct GitNodeState {
health_failures: u32,
is_primary: bool,
last_known_primary_grpc: String,
voted_for: Option<String>,
}
#[async_trait]
@@ -76,6 +78,7 @@ impl Actor for GitNodeActor {
health_failures: 0,
is_primary: true, // Will be refined at registration
last_known_primary_grpc: args.grpc_addr.clone(),
voted_for: None,
})
}
@@ -96,12 +99,12 @@ impl Actor for GitNodeActor {
.unwrap_or(&repo_path)
.trim_start_matches('/')
.to_string();
register_repo(&myself, state, relative_path);
register_repo(&myself, state, relative_path).await;
}
}
GitNodeMessage::RegisterRepository(header) => {
register_repo(&myself, state, header.relative_path);
register_repo(&myself, state, header.relative_path).await;
}
GitNodeMessage::RemoveRepository(header) => {
@@ -173,10 +176,12 @@ impl Actor for GitNodeActor {
term = request.term,
current_term = state.current_term,
accepted = accepted,
voted_for = ?state.voted_for,
"election vote"
);
if accepted {
state.current_term = request.term;
state.voted_for = Some(request.candidate_storage_name.clone());
state.last_known_primary_grpc = request.candidate_grpc_addr.clone();
}
reply
@@ -208,6 +213,7 @@ impl Actor for GitNodeActor {
state.is_primary = true;
state.current_term = event.term;
state.health_failures = 0;
state.voted_for = None;
for entry in state.repos.values_mut() {
entry.role = ROLE_PRIMARY.to_string();
entry.read_only = false;
@@ -220,6 +226,7 @@ impl Actor for GitNodeActor {
);
state.is_primary = false;
state.current_term = event.term;
state.voted_for = None;
for entry in state.repos.values_mut() {
entry.role = ROLE_REPLICA.to_string();
}
@@ -237,6 +244,76 @@ impl Actor for GitNodeActor {
};
}
}
GitNodeMessage::TriggerElection => {
    // Run one election round: ask every other member of the "gitks_nodes"
    // process group to vote for this node in term `current_term + 1`, and
    // promote this node to PRIMARY if enough votes come back.
    let members = ractor::pg::get_members(&"gitks_nodes".to_string());
    let total = members.len();
    let my_cell = myself.get_cell();
    let new_term = state.current_term.wrapping_add(1);

    // The ballot is the same for every peer, so build it once and clone it per call.
    let request = ElectionRequest {
        candidate_storage_name: state.storage_name.clone(),
        candidate_grpc_addr: state.grpc_addr.clone(),
        candidate_actor_name: state.actor_name.clone(),
        term: new_term,
        reason: "health_check_failure".to_string(),
    };

    let mut accepted_count = 0u64;
    for member in &members {
        if *member == my_cell {
            // We vote for ourselves
            accepted_count += 1;
            continue;
        }
        let actor_ref: ActorRef<GitNodeMessage> = member.clone().into();
        // Each peer gets at most 1000 ms to answer; peers are asked one at a time.
        match ractor::call_t!(actor_ref, GitNodeMessage::ElectPrimary, 1000, request.clone()) {
            Ok(result) if result.accepted => {
                accepted_count += 1;
            }
            // An explicit rejection simply does not count.
            Ok(_) => {}
            Err(_) => {
                tracing::warn!(
                    member = ?member.get_id(),
                    "no response from member during election"
                );
            }
        }
    }

    // `(total / 2).max(1) + 1` equals `max(total / 2 + 1, 2)`: at least two votes
    // are always required, so a node that sees only itself in the group cannot
    // win here.
    // NOTE(review): confirm this is the intended guard against lone self-promotion.
    let majority = (total / 2).max(1) + 1;
    if accepted_count >= majority as u64 {
        tracing::info!(
            term = new_term,
            accepted = accepted_count,
            total = total,
            "won election, promoting to PRIMARY"
        );
        state.is_primary = true;
        state.current_term = new_term;
        state.health_failures = 0;
        // Record our own vote for the term we just won. Leaving this as `None`
        // would let `should_accept_election` grant a second vote to another
        // candidate in the same term, which could elect two primaries.
        state.voted_for = Some(state.storage_name.clone());
        for entry in state.repos.values_mut() {
            entry.role = ROLE_PRIMARY.to_string();
            entry.read_only = false;
        }
        let role_event = RoleChangedEvent {
            storage_name: state.storage_name.clone(),
            grpc_addr: state.grpc_addr.clone(),
            new_role: ROLE_PRIMARY.to_string(),
            term: new_term,
            relative_paths: state.repos.keys().cloned().collect(),
        };
        broadcast_role_changed(&myself, role_event);
    } else {
        // Local state is left untouched on a lost round.
        tracing::warn!(
            term = new_term,
            accepted = accepted_count,
            total = total,
            "election lost, staying as REPLICA"
        );
    }
}
}
Ok(())
}
@@ -277,9 +354,8 @@ impl Actor for GitNodeActor {
/// Determine whether to accept an election request.
fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bool {
// Only accept if the term is greater than our current term
// (prevents old/duplicate election messages)
if request.term <= state.current_term {
// Reject old terms (prevents old/duplicate election messages)
if request.term < state.current_term {
tracing::warn!(
request_term = request.term,
current_term = state.current_term,
@@ -287,6 +363,20 @@ fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bo
);
return false;
}
// Same term: only accept if we haven't already voted for someone else
if request.term == state.current_term
&& let Some(ref voted_for) = state.voted_for
&& voted_for != &request.candidate_storage_name
{
tracing::warn!(
request_term = request.term,
current_term = state.current_term,
already_voted = %voted_for,
candidate = %request.candidate_storage_name,
"rejecting election: already voted this term"
);
return false;
}
true
}
@@ -318,7 +408,7 @@ fn build_decision(
}
}
fn register_repo(
async fn register_repo(
myself: &ActorRef<GitNodeMessage>,
state: &mut GitNodeState,
relative_path: String,
@@ -329,10 +419,19 @@ fn register_repo(
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
let my_cell = myself.get_cell();
let other_nodes_exist = members.iter().any(|m| m != &my_cell);
let role = if other_nodes_exist {
ROLE_REPLICA.to_string()
let role = if members.iter().any(|m| m != &my_cell) {
let header = RepositoryHeader {
storage_name: String::new(),
relative_path: relative_path.clone(),
storage_path: String::new(),
};
let primary_found = find_primary_in_cluster(&members, &my_cell, &header).await;
if primary_found {
ROLE_REPLICA.to_string()
} else {
ROLE_PRIMARY.to_string()
}
} else {
ROLE_PRIMARY.to_string()
};
@@ -365,6 +464,28 @@ fn register_repo(
);
}
/// Reports whether some other cluster member answers that it holds the
/// PRIMARY role for the repository described by `header`.
///
/// Every entry of `members` except `my_cell` is asked in turn through a
/// `FindPrimary` call with a 500 ms timeout. The first reply that is both
/// `found` and has role `ROLE_PRIMARY` ends the search with `true`. Failed
/// calls and any other reply are skipped. Returns `false` when no member
/// qualifies.
pub async fn find_primary_in_cluster(
    members: &[ActorCell],
    my_cell: &ActorCell,
    header: &RepositoryHeader,
) -> bool {
    for peer in members.iter().filter(|cell| *cell != my_cell) {
        let peer_ref: ActorRef<GitNodeMessage> = peer.clone().into();
        let reply = ractor::call_t!(peer_ref, GitNodeMessage::FindPrimary, 500, header.clone());
        match reply {
            Ok(decision) if decision.found && decision.role == ROLE_PRIMARY => return true,
            // Call failure, "not found", or a non-primary holder: try the next peer.
            _ => {}
        }
    }
    false
}
/// Returns the category of a repository path: the text before its first `/`.
///
/// A path that contains no `/` is returned whole. An empty path, or one that
/// starts with `/`, yields the empty string.
fn extract_category(relative_path: &str) -> &str {
    match relative_path.split_once('/') {
        Some((category, _rest)) => category,
        None => relative_path,
    }
}
@@ -417,9 +538,9 @@ fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, ma
if consecutive_failures >= max_failures {
tracing::error!(
"no other nodes reachable for {max_failures} checks, triggering self-election as PRIMARY"
"no other nodes reachable for {max_failures} checks, triggering election"
);
trigger_self_election(&myself);
myself.cast(GitNodeMessage::TriggerElection).ok();
consecutive_failures = 0;
}
}
@@ -427,43 +548,6 @@ fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, ma
});
}
/// Announces this node as the new PRIMARY without collecting any votes.
///
/// The term is the current Unix time in whole seconds (0 if the clock reads
/// earlier than the epoch). The same `RoleChanged` payload is first cast to
/// this actor and then handed to `broadcast_role_changed`.
fn trigger_self_election(myself: &ActorRef<GitNodeMessage>) {
    let total_nodes = ractor::pg::get_members(&"gitks_nodes".to_string()).len();
    tracing::warn!(
        total_nodes = total_nodes,
        "initiating self-election as new PRIMARY"
    );

    let new_term = std::time::SystemTime::now()
        .duration_since(std::time::SystemTime::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();

    // Identity fields and the path list are deliberately left empty.
    // NOTE(review): the receiving handler is expected to fill them from its
    // own state and to treat an empty path list as "all repos" - confirm.
    let promotion = || RoleChangedEvent {
        storage_name: String::new(),
        grpc_addr: String::new(),
        new_role: ROLE_PRIMARY.to_string(),
        term: new_term,
        relative_paths: Vec::new(),
    };

    // Best effort: a failed cast to ourselves is ignored.
    let _ = myself.cast(GitNodeMessage::RoleChanged(promotion()));
    broadcast_role_changed(myself, promotion());
}
pub async fn start_node_actor(
service: GitksService,
storage_name: String,
+3 -1
View File
@@ -149,6 +149,9 @@ pub enum GitNodeMessage {
/// A role change has occurred in the cluster.
RoleChanged(RoleChangedEvent),
/// Health checker detected primary failure, trigger election.
TriggerElection,
}
#[derive(ractor_cluster::RactorMessage)]
@@ -156,7 +159,6 @@ pub enum RepoActorMessage {
UpdateMetadata(RepositoryHeader),
}
/// Request for a node to vote in a PRIMARY election.
#[derive(Debug, Clone)]
pub struct ElectionRequest {
+1
View File
@@ -3,6 +3,7 @@ pub mod message;
pub mod server;
pub mod sync;
pub use handler::find_primary_in_cluster;
pub use handler::{
GitNodeActor, GitNodeArgs, RepoEntry, broadcast_ref_update, broadcast_role_changed,
get_category_members, get_cluster_nodes, list_all_groups, route_group_for, start_node_actor,
+200 -99
View File
@@ -39,6 +39,57 @@ impl BundleApplicator {
}
Ok(())
}
/// Apply a bundle stored on disk by running `git bundle unbundle -` against
/// this applicator's repository.
///
/// The opened file is handed to git directly as its standard input, so the
/// bundle is never buffered in this process and no helper thread is needed
/// to feed a pipe.
///
/// # Errors
///
/// Returns a message when the bundle file cannot be opened, when git cannot
/// be spawned or waited on, or - with git's stderr as the message - when git
/// exits unsuccessfully.
pub fn apply_bundle_from_file(&self, path: &Path) -> Result<(), String> {
    let bundle = std::fs::File::open(path).map_err(|e| format!("open bundle file: {e}"))?;

    let child = std::process::Command::new("git")
        .arg("--git-dir")
        // Pass the path as an OS string so non-UTF-8 paths are not mangled.
        .arg(&self.repo_path)
        .args(["bundle", "unbundle", "-"])
        // git reads the bundle straight from the file descriptor.
        .stdin(std::process::Stdio::from(bundle))
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .map_err(|e| format!("spawn git bundle unbundle: {e}"))?;

    let output = child
        .wait_with_output()
        .map_err(|e| format!("wait bundle: {e}"))?;

    if !output.status.success() {
        return Err(String::from_utf8_lossy(&output.stderr).into_owned());
    }
    Ok(())
}
}
pub fn collect_local_haves(repo_path: &Path) -> Result<Vec<Oid>, String> {
@@ -92,20 +143,45 @@ pub async fn sync_from_primary(event: RefUpdateEvent, local_repo_path: PathBuf)
let relative_path = event.relative_path.clone();
let repo_for_haves = local_repo_path.clone();
match tokio::task::spawn_blocking(move || {
sync_via_pack_service(&grpc_addr, &relative_path, &repo_for_haves)
})
.await
// Collect haves in a blocking thread
let haves = match tokio::task::spawn_blocking(move || collect_local_haves(&repo_for_haves))
.await
{
Ok(Ok(pack_data)) if !pack_data.is_empty() => {
let pack_len = pack_data.len();
Ok(Ok(h)) => h,
Ok(Err(e)) => {
tracing::error!(relative_path = %event.relative_path, error = %e, "collect haves failed");
return;
}
Err(e) => {
tracing::error!(relative_path = %event.relative_path, error = %e, "haves task failed");
return;
}
};
// Stream pack data to a temporary file to avoid OOM
let temp_dir = local_repo_path.join(".gitks_tmp");
if let Err(e) = std::fs::create_dir_all(&temp_dir) {
tracing::error!(relative_path = %event.relative_path, error = %e, "create temp dir failed");
return;
}
let pack_result =
sync_via_pack_service_to_file(&grpc_addr, &relative_path, &haves, &temp_dir).await;
match pack_result {
Ok(Some(pack_file)) => {
let repo = local_repo_path.clone();
match tokio::task::spawn_blocking(move || apply_pack_data(&repo, &pack_data)).await {
let pack_path = pack_file.clone();
match tokio::task::spawn_blocking(move || {
let applicator = BundleApplicator::new(repo);
applicator.apply_bundle_from_file(&pack_path)
})
.await
{
Ok(Ok(())) => {
update_local_ref(&local_repo_path, &event.ref_name, &event.new_oid);
tracing::info!(
relative_path = %event.relative_path,
bytes = pack_len,
"replica sync done"
);
}
@@ -116,119 +192,144 @@ pub async fn sync_from_primary(event: RefUpdateEvent, local_repo_path: PathBuf)
tracing::error!(relative_path = %event.relative_path, error = %e, "apply task failed")
}
}
// Cleanup temp file
let _ = std::fs::remove_file(&pack_file);
}
Ok(Ok(_)) => {
Ok(None) => {
tracing::warn!(relative_path = %event.relative_path, "empty pack data from primary")
}
Ok(Err(e)) => {
Err(e) => {
tracing::error!(relative_path = %event.relative_path, error = %e, "pack fetch failed")
}
Err(e) => {
tracing::error!(relative_path = %event.relative_path, error = %e, "sync task failed")
}
}
// Cleanup temp dir if empty
let _ = std::fs::remove_dir(&temp_dir);
}
fn sync_via_pack_service(
/// Upper bound on the total bytes accepted for one streamed pack: 10 GiB
/// (10 * 1024^3 bytes). A stream that grows past this limit is rejected.
const MAX_PACK_SIZE: u64 = 10 * 1024 * 1024 * 1024;
/// Stream pack data from primary to a temporary file.
/// Returns Ok(Some(path)) on success, Ok(None) if empty, Err on failure.
async fn sync_via_pack_service_to_file(
grpc_addr: &str,
relative_path: &str,
local_repo_path: &Path,
) -> Result<Vec<u8>, String> {
let haves = collect_local_haves(local_repo_path)?;
haves: &[Oid],
temp_dir: &Path,
) -> Result<Option<PathBuf>, String> {
use crate::pb::pack_service_client::PackServiceClient;
use crate::pb::{AdvertiseRefsRequest, PackObjectsOptions, PackObjectsRequest, RepositoryHeader};
use tokio::io::AsyncWriteExt;
use tokio_stream::StreamExt;
let rt = tokio::runtime::Handle::current();
rt.block_on(async {
use crate::pb::pack_service_client::PackServiceClient;
use crate::pb::{
AdvertiseRefsRequest, PackObjectsOptions, PackObjectsRequest, RepositoryHeader,
};
use tokio_stream::StreamExt;
let endpoint = crate::server::remote_endpoint(grpc_addr)
.await
.map_err(|e| e.to_string())?;
let endpoint = crate::server::remote_endpoint(grpc_addr)
.await
.map_err(|e| e.to_string())?;
let mut client = PackServiceClient::connect(endpoint)
.await
.map_err(|e| format!("connect to primary: {e}"))?;
let mut client = PackServiceClient::connect(endpoint)
.await
.map_err(|e| format!("connect to primary: {e}"))?;
let header = RepositoryHeader {
storage_name: String::new(),
relative_path: relative_path.to_string(),
storage_path: String::new(),
};
let header = RepositoryHeader {
storage_name: String::new(),
relative_path: relative_path.to_string(),
storage_path: String::new(),
};
let refs_resp = client
.advertise_refs(AdvertiseRefsRequest {
repository: Some(header.clone()),
protocol: None,
service: "upload-pack".to_string(),
raw: false,
})
.await
.map_err(|e| format!("AdvertiseRefs: {e}"))?;
let refs = refs_resp.into_inner().references;
if refs.is_empty() {
return Ok(Vec::new());
}
let wants: Vec<Oid> = refs.iter().filter_map(|r| r.target_oid.clone()).collect();
let want_count = wants.len();
let have_count = haves.len();
tracing::info!(
relative_path = %relative_path,
want_count,
have_count,
"requesting incremental pack from primary"
);
let options = PackObjectsOptions {
wants,
haves,
shallow_revisions: Vec::new(),
deepen: 0,
thin_pack: false,
include_tag: true,
use_bitmaps: true,
delta_base_offset: true,
pathspec: Vec::new(),
};
let req = PackObjectsRequest {
let refs_resp = client
.advertise_refs(AdvertiseRefsRequest {
repository: Some(header.clone()),
options: Some(options),
};
protocol: None,
service: "upload-pack".to_string(),
raw: false,
})
.await
.map_err(|e| format!("AdvertiseRefs: {e}"))?;
let resp = client
.pack_objects(req)
.await
.map_err(|e| format!("PackObjects: {e}"))?;
let refs = refs_resp.into_inner().references;
if refs.is_empty() {
return Ok(None);
}
let mut stream = resp.into_inner();
let mut pack_data = Vec::new();
while let Some(chunk) = stream.next().await {
match chunk {
Ok(msg) => pack_data.extend_from_slice(&msg.data),
Err(e) => return Err(format!("pack stream: {e}")),
let wants: Vec<Oid> = refs.iter().filter_map(|r| r.target_oid.clone()).collect();
let want_count = wants.len();
let have_count = haves.len();
tracing::info!(
relative_path = %relative_path,
want_count,
have_count,
"requesting incremental pack from primary"
);
let options = PackObjectsOptions {
wants,
haves: haves.to_vec(),
shallow_revisions: Vec::new(),
deepen: 0,
thin_pack: false,
include_tag: true,
use_bitmaps: true,
delta_base_offset: true,
pathspec: Vec::new(),
};
let req = PackObjectsRequest {
repository: Some(header.clone()),
options: Some(options),
};
let resp = client
.pack_objects(req)
.await
.map_err(|e| format!("PackObjects: {e}"))?;
let mut stream = resp.into_inner();
// Create a temporary file for streaming
let temp_file = temp_dir.join(format!("pack_{}.bundle", std::process::id()));
let mut file = tokio::fs::File::create(&temp_file)
.await
.map_err(|e| format!("create temp file: {e}"))?;
let mut total_bytes: u64 = 0;
while let Some(chunk) = stream.next().await {
match chunk {
Ok(msg) => {
total_bytes += msg.data.len() as u64;
if total_bytes > MAX_PACK_SIZE {
let _ = tokio::fs::remove_file(&temp_file).await;
return Err(format!(
"pack data exceeds maximum size ({}GB)",
MAX_PACK_SIZE / (1024 * 1024 * 1024)
));
}
file.write_all(&msg.data)
.await
.map_err(|e| format!("write pack data: {e}"))?;
}
Err(e) => {
let _ = tokio::fs::remove_file(&temp_file).await;
return Err(format!("pack stream: {e}"));
}
}
}
tracing::info!(
relative_path = %relative_path,
pack_bytes = pack_data.len(),
"received pack data from primary"
);
// Flush and close the file
file.flush()
.await
.map_err(|e| format!("flush pack file: {e}"))?;
drop(file);
Ok(pack_data)
})
}
tracing::info!(
relative_path = %relative_path,
pack_bytes = total_bytes,
"received pack data from primary"
);
fn apply_pack_data(repo_path: &Path, pack_data: &[u8]) -> Result<(), String> {
let applicator = BundleApplicator::new(repo_path.to_path_buf());
applicator.apply_bundle(pack_data)
Ok(Some(temp_file))
}
fn update_local_ref(repo_path: &Path, ref_name: &str, new_oid: &str) {