9a0c26e5f6
- Add voting mechanism with term tracking and vote persistence - Implement election triggering logic with majority vote counting - Add primary/replica role transition handling with state management - Integrate health check failure detection for automatic elections - Refactor actor messaging system for distributed coordination - Update repository registration to query cluster for existing primary - Add broadcast mechanism for role change notifications - Implement proper term comparison and duplicate request filtering - Upgrade dependency versions including tokio-util for async utilities - Optimize code formatting and line wrapping for improved readability - Remove redundant blank lines and improve code structure consistency - Enhance error logging and trace information for debugging purposes
606 lines
21 KiB
Rust
606 lines
21 KiB
Rust
use crate::actor::message::{
|
|
ElectionRequest, ElectionResult, GitNodeMessage, NodeHealth, ROLE_PRIMARY, ROLE_REPLICA,
|
|
RefUpdateEvent, RoleChangedEvent, RouteDecision,
|
|
};
|
|
use crate::pb::RepositoryHeader;
|
|
use crate::server::GitksService;
|
|
use async_trait::async_trait;
|
|
use ractor::pg;
|
|
use ractor::{Actor, ActorCell, ActorProcessingErr, ActorRef, SupervisionEvent};
|
|
use std::collections::HashMap;
|
|
|
|
#[derive(Clone)]
|
|
pub struct GitNodeActor {
|
|
pub version: String,
|
|
pub service: GitksService,
|
|
}
|
|
|
|
impl GitNodeActor {
|
|
pub fn init(service: GitksService) -> Self {
|
|
GitNodeActor {
|
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
service,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Routing record kept for every repository registered on this node.
pub struct RepoEntry {
    /// Role of this node for the repository (`ROLE_PRIMARY` or `ROLE_REPLICA`).
    pub role: String,
    /// Last known commit; initialised to an empty string at registration.
    pub last_commit: String,
    /// Read-only marker; set to `false` at registration and on promotion.
    pub read_only: bool,
}
|
|
|
|
/// Start-up arguments handed to `GitNodeActor::pre_start`.
pub struct GitNodeArgs {
    /// Storage name; also used as the process-group scope and in the actor name.
    pub storage_name: String,
    /// gRPC address this node advertises in route decisions and elections.
    pub grpc_addr: String,
}
|
|
|
|
pub struct GitNodeState {
|
|
storage_name: String,
|
|
actor_name: String,
|
|
grpc_addr: String,
|
|
repos: HashMap<String, RepoEntry>,
|
|
current_term: u64,
|
|
health_failures: u32,
|
|
is_primary: bool,
|
|
last_known_primary_grpc: String,
|
|
voted_for: Option<String>,
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Actor for GitNodeActor {
|
|
type Msg = GitNodeMessage;
|
|
type State = GitNodeState;
|
|
type Arguments = GitNodeArgs;
|
|
|
|
async fn pre_start(
|
|
&self,
|
|
myself: ActorRef<Self::Msg>,
|
|
args: Self::Arguments,
|
|
) -> Result<Self::State, ActorProcessingErr> {
|
|
let actor_name = format!("git_node_{}", args.storage_name);
|
|
pg::join("gitks_nodes".to_string(), vec![myself.get_cell()]);
|
|
pg::join_scoped(
|
|
args.storage_name.clone(),
|
|
"node".to_string(),
|
|
vec![myself.get_cell()],
|
|
);
|
|
tracing::info!(storage_name = %args.storage_name, actor_name = %actor_name, grpc_addr = %args.grpc_addr, "GitNodeActor started");
|
|
|
|
start_health_checker(myself.clone(), 1, 10);
|
|
|
|
Ok(GitNodeState {
|
|
storage_name: args.storage_name,
|
|
actor_name,
|
|
grpc_addr: args.grpc_addr.clone(),
|
|
repos: HashMap::new(),
|
|
current_term: 0,
|
|
health_failures: 0,
|
|
is_primary: true, // Will be refined at registration
|
|
last_known_primary_grpc: args.grpc_addr.clone(),
|
|
voted_for: None,
|
|
})
|
|
}
|
|
|
|
async fn handle(
|
|
&self,
|
|
myself: ActorRef<Self::Msg>,
|
|
message: Self::Msg,
|
|
state: &mut Self::State,
|
|
) -> Result<(), ActorProcessingErr> {
|
|
match message {
|
|
GitNodeMessage::ScanAndRegister => {
|
|
let repos = self.service.scan_all_repo()?;
|
|
tracing::info!(storage_name = %state.storage_name, found = repos.len(), "scanning local repositories");
|
|
crate::metrics::set_repository_count(repos.len() as u64);
|
|
for repo_path in repos {
|
|
let relative_path = repo_path
|
|
.strip_prefix(self.service.repo_prefix.to_string_lossy().as_ref())
|
|
.unwrap_or(&repo_path)
|
|
.trim_start_matches('/')
|
|
.to_string();
|
|
register_repo(&myself, state, relative_path).await;
|
|
}
|
|
}
|
|
|
|
GitNodeMessage::RegisterRepository(header) => {
|
|
register_repo(&myself, state, header.relative_path).await;
|
|
}
|
|
|
|
GitNodeMessage::RemoveRepository(header) => {
|
|
state.repos.remove(&header.relative_path);
|
|
tracing::info!(storage_name = %state.storage_name, relative_path = %header.relative_path, "repository route removed");
|
|
}
|
|
|
|
GitNodeMessage::RefUpdated(event) => {
|
|
if let Some(entry) = state.repos.get(&event.relative_path)
|
|
&& entry.role == ROLE_REPLICA
|
|
{
|
|
let local_path = self.service.repo_prefix.join(&event.relative_path);
|
|
crate::actor::sync::sync_from_primary(event, local_path).await;
|
|
}
|
|
}
|
|
|
|
GitNodeMessage::FindPrimary(header, reply) => {
|
|
let entry = state.repos.get(&header.relative_path);
|
|
let is_primary = entry.is_some_and(|e| e.role == ROLE_PRIMARY);
|
|
reply
|
|
.send(build_decision(
|
|
state,
|
|
&header,
|
|
is_primary,
|
|
entry.map(|e| e.role.as_str()),
|
|
))
|
|
.ok();
|
|
}
|
|
|
|
GitNodeMessage::FindReplica(header, reply) => {
|
|
let entry = state.repos.get(&header.relative_path);
|
|
let has = entry.is_some();
|
|
reply
|
|
.send(build_decision(
|
|
state,
|
|
&header,
|
|
has,
|
|
entry.map(|e| e.role.as_str()),
|
|
))
|
|
.ok();
|
|
}
|
|
|
|
GitNodeMessage::ListRepositoryPaths(reply) => {
|
|
let paths: Vec<String> = state.repos.keys().cloned().collect();
|
|
reply.send(paths.join("\n")).ok();
|
|
}
|
|
|
|
GitNodeMessage::RepositoryExists(header, reply) => {
|
|
reply
|
|
.send(state.repos.contains_key(&header.relative_path))
|
|
.ok();
|
|
}
|
|
|
|
GitNodeMessage::GetNodeHealth(reply) => {
|
|
reply
|
|
.send(NodeHealth {
|
|
storage_name: state.storage_name.clone(),
|
|
repo_count: state.repos.len() as u64,
|
|
healthy: true,
|
|
version: self.version.clone(),
|
|
})
|
|
.ok();
|
|
}
|
|
|
|
GitNodeMessage::ElectPrimary(request, reply) => {
|
|
let accepted = should_accept_election(&request, state);
|
|
tracing::info!(
|
|
candidate = %request.candidate_storage_name,
|
|
term = request.term,
|
|
current_term = state.current_term,
|
|
accepted = accepted,
|
|
voted_for = ?state.voted_for,
|
|
"election vote"
|
|
);
|
|
if accepted {
|
|
state.current_term = request.term;
|
|
state.voted_for = Some(request.candidate_storage_name.clone());
|
|
state.last_known_primary_grpc = request.candidate_grpc_addr.clone();
|
|
}
|
|
reply
|
|
.send(ElectionResult {
|
|
accepted,
|
|
current_term: state.current_term,
|
|
voter_storage_name: state.storage_name.clone(),
|
|
voter_role: if state.is_primary {
|
|
ROLE_PRIMARY
|
|
} else {
|
|
ROLE_REPLICA
|
|
}
|
|
.to_string(),
|
|
})
|
|
.ok();
|
|
}
|
|
|
|
GitNodeMessage::RoleChanged(event) => {
|
|
// Empty storage_name = self-promotion from health checker
|
|
let is_self =
|
|
event.storage_name.is_empty() || event.storage_name == state.storage_name;
|
|
|
|
if is_self && event.new_role == ROLE_PRIMARY {
|
|
tracing::info!(
|
|
storage_name = %state.storage_name,
|
|
term = event.term,
|
|
"promoted to PRIMARY"
|
|
);
|
|
state.is_primary = true;
|
|
state.current_term = event.term;
|
|
state.health_failures = 0;
|
|
state.voted_for = None;
|
|
for entry in state.repos.values_mut() {
|
|
entry.role = ROLE_PRIMARY.to_string();
|
|
entry.read_only = false;
|
|
}
|
|
} else if is_self && event.new_role == ROLE_REPLICA {
|
|
tracing::info!(
|
|
storage_name = %state.storage_name,
|
|
term = event.term,
|
|
"demoted to REPLICA"
|
|
);
|
|
state.is_primary = false;
|
|
state.current_term = event.term;
|
|
state.voted_for = None;
|
|
for entry in state.repos.values_mut() {
|
|
entry.role = ROLE_REPLICA.to_string();
|
|
}
|
|
} else {
|
|
// Another node's role changed — update routing info
|
|
tracing::info!(
|
|
storage_name = %event.storage_name,
|
|
new_role = %event.new_role,
|
|
"remote node role changed"
|
|
);
|
|
state.last_known_primary_grpc = if event.new_role == ROLE_PRIMARY {
|
|
event.grpc_addr.clone()
|
|
} else {
|
|
state.last_known_primary_grpc.clone()
|
|
};
|
|
}
|
|
}
|
|
|
|
GitNodeMessage::TriggerElection => {
|
|
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
|
let total = members.len();
|
|
let my_cell = myself.get_cell();
|
|
|
|
let new_term = state.current_term.wrapping_add(1);
|
|
|
|
let mut accepted_count = 0u64;
|
|
for member in &members {
|
|
if *member == my_cell {
|
|
// We vote for ourselves
|
|
accepted_count += 1;
|
|
continue;
|
|
}
|
|
let actor_ref: ActorRef<GitNodeMessage> = member.clone().into();
|
|
let request = ElectionRequest {
|
|
candidate_storage_name: state.storage_name.clone(),
|
|
candidate_grpc_addr: state.grpc_addr.clone(),
|
|
candidate_actor_name: state.actor_name.clone(),
|
|
term: new_term,
|
|
reason: "health_check_failure".to_string(),
|
|
};
|
|
match ractor::call_t!(actor_ref, GitNodeMessage::ElectPrimary, 1000, request) {
|
|
Ok(result) if result.accepted => {
|
|
accepted_count += 1;
|
|
}
|
|
Ok(_) => {}
|
|
Err(_) => {
|
|
tracing::warn!(
|
|
member = ?member.get_id(),
|
|
"no response from member during election"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
let majority = (total / 2).max(1) + 1;
|
|
if accepted_count >= majority as u64 {
|
|
tracing::info!(
|
|
term = new_term,
|
|
accepted = accepted_count,
|
|
total = total,
|
|
"won election, promoting to PRIMARY"
|
|
);
|
|
state.is_primary = true;
|
|
state.current_term = new_term;
|
|
state.health_failures = 0;
|
|
state.voted_for = None;
|
|
for entry in state.repos.values_mut() {
|
|
entry.role = ROLE_PRIMARY.to_string();
|
|
entry.read_only = false;
|
|
}
|
|
let role_event = RoleChangedEvent {
|
|
storage_name: state.storage_name.clone(),
|
|
grpc_addr: state.grpc_addr.clone(),
|
|
new_role: ROLE_PRIMARY.to_string(),
|
|
term: new_term,
|
|
relative_paths: state.repos.keys().cloned().collect(),
|
|
};
|
|
broadcast_role_changed(&myself, role_event);
|
|
} else {
|
|
tracing::warn!(
|
|
term = new_term,
|
|
accepted = accepted_count,
|
|
total = total,
|
|
"election lost, staying as REPLICA"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
async fn handle_supervisor_evt(
|
|
&self,
|
|
_myself: ActorRef<Self::Msg>,
|
|
evt: SupervisionEvent,
|
|
_state: &mut Self::State,
|
|
) -> Result<(), ActorProcessingErr> {
|
|
match evt {
|
|
SupervisionEvent::ActorStarted(who) => {
|
|
tracing::debug!(actor = ?who.get_id(), "child started")
|
|
}
|
|
SupervisionEvent::ActorTerminated(who, _, reason) => {
|
|
tracing::warn!(actor = ?who.get_id(), reason = ?reason, "child terminated")
|
|
}
|
|
SupervisionEvent::ActorFailed(who, panic_msg) => {
|
|
tracing::error!(actor = ?who.get_id(), msg = %panic_msg, "child panicked")
|
|
}
|
|
SupervisionEvent::ProcessGroupChanged(group) => {
|
|
tracing::info!(group = ?group, "PG membership changed")
|
|
}
|
|
_ => {}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
async fn post_stop(
|
|
&self,
|
|
_myself: ActorRef<Self::Msg>,
|
|
state: &mut Self::State,
|
|
) -> Result<(), ActorProcessingErr> {
|
|
tracing::info!(storage_name = %state.storage_name, "GitNodeActor stopped");
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Determine whether to accept an election request.
|
|
fn should_accept_election(request: &ElectionRequest, state: &GitNodeState) -> bool {
|
|
// Reject old terms (prevents old/duplicate election messages)
|
|
if request.term < state.current_term {
|
|
tracing::warn!(
|
|
request_term = request.term,
|
|
current_term = state.current_term,
|
|
"rejecting election: term too old"
|
|
);
|
|
return false;
|
|
}
|
|
// Same term: only accept if we haven't already voted for someone else
|
|
if request.term == state.current_term
|
|
&& let Some(ref voted_for) = state.voted_for
|
|
&& voted_for != &request.candidate_storage_name
|
|
{
|
|
tracing::warn!(
|
|
request_term = request.term,
|
|
current_term = state.current_term,
|
|
already_voted = %voted_for,
|
|
candidate = %request.candidate_storage_name,
|
|
"rejecting election: already voted this term"
|
|
);
|
|
return false;
|
|
}
|
|
true
|
|
}
|
|
|
|
fn build_decision(
|
|
state: &GitNodeState,
|
|
header: &crate::pb::RepositoryHeader,
|
|
found: bool,
|
|
role: Option<&str>,
|
|
) -> RouteDecision {
|
|
RouteDecision {
|
|
found,
|
|
storage_name: if found {
|
|
state.storage_name.clone()
|
|
} else {
|
|
String::new()
|
|
},
|
|
relative_path: header.relative_path.clone(),
|
|
actor_name: if found {
|
|
state.actor_name.clone()
|
|
} else {
|
|
String::new()
|
|
},
|
|
grpc_addr: if found {
|
|
state.grpc_addr.clone()
|
|
} else {
|
|
String::new()
|
|
},
|
|
role: role.unwrap_or("").to_string(),
|
|
}
|
|
}
|
|
|
|
async fn register_repo(
|
|
myself: &ActorRef<GitNodeMessage>,
|
|
state: &mut GitNodeState,
|
|
relative_path: String,
|
|
) {
|
|
if state.repos.contains_key(&relative_path) {
|
|
return;
|
|
}
|
|
|
|
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
|
let my_cell = myself.get_cell();
|
|
|
|
let role = if members.iter().any(|m| m != &my_cell) {
|
|
let header = RepositoryHeader {
|
|
storage_name: String::new(),
|
|
relative_path: relative_path.clone(),
|
|
storage_path: String::new(),
|
|
};
|
|
let primary_found = find_primary_in_cluster(&members, &my_cell, &header).await;
|
|
if primary_found {
|
|
ROLE_REPLICA.to_string()
|
|
} else {
|
|
ROLE_PRIMARY.to_string()
|
|
}
|
|
} else {
|
|
ROLE_PRIMARY.to_string()
|
|
};
|
|
|
|
if role == ROLE_PRIMARY {
|
|
state.is_primary = true;
|
|
}
|
|
|
|
let category = extract_category(&relative_path);
|
|
pg::join_scoped(
|
|
state.storage_name.clone(),
|
|
category.to_string(),
|
|
vec![myself.get_cell()],
|
|
);
|
|
state.repos.insert(
|
|
relative_path.clone(),
|
|
RepoEntry {
|
|
role: role.clone(),
|
|
last_commit: String::new(),
|
|
read_only: false,
|
|
},
|
|
);
|
|
tracing::info!(
|
|
storage_name = %state.storage_name,
|
|
category = %category,
|
|
relative_path = %relative_path,
|
|
actor_name = %state.actor_name,
|
|
role = %role,
|
|
"repository route registered"
|
|
);
|
|
}
|
|
|
|
/// Query all cluster members (except self) to find if a repository has a PRIMARY.
|
|
pub async fn find_primary_in_cluster(
|
|
members: &[ActorCell],
|
|
my_cell: &ActorCell,
|
|
header: &RepositoryHeader,
|
|
) -> bool {
|
|
for member in members {
|
|
if member == my_cell {
|
|
continue;
|
|
}
|
|
let actor_ref: ActorRef<GitNodeMessage> = member.clone().into();
|
|
if let Ok(decision) =
|
|
ractor::call_t!(actor_ref, GitNodeMessage::FindPrimary, 500, header.clone())
|
|
&& decision.found
|
|
&& decision.role == ROLE_PRIMARY
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Return the category of a repository path: its first path segment.
///
/// Leading `/` characters are ignored. When no non-empty first segment
/// exists (empty path, or a path made only of slashes) the category is
/// `"root"`. `split` always yields at least one item, so without the
/// emptiness check the fallback could never trigger and such paths produced
/// an empty category name.
fn extract_category(relative_path: &str) -> &str {
    relative_path
        .trim_start_matches('/')
        .split('/')
        .next()
        .filter(|segment| !segment.is_empty())
        .unwrap_or("root")
}
|
|
|
|
/// Start background health checker that monitors the PRIMARY node.
|
|
/// If the PRIMARY becomes unreachable for `max_failures` consecutive checks,
|
|
/// triggers an election.
|
|
fn start_health_checker(myself: ActorRef<GitNodeMessage>, interval_secs: u64, max_failures: u32) {
|
|
tokio::spawn(async move {
|
|
let mut interval = tokio::time::interval(std::time::Duration::from_secs(interval_secs));
|
|
interval.tick().await; // First tick immediate
|
|
|
|
let mut consecutive_failures: u32 = 0;
|
|
|
|
loop {
|
|
interval.tick().await;
|
|
|
|
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
|
let my_cell = myself.get_cell();
|
|
let other_cells: Vec<ractor::ActorCell> =
|
|
members.into_iter().filter(|m| m != &my_cell).collect();
|
|
|
|
if other_cells.is_empty() {
|
|
// No other nodes → we are the only node → ensure we are PRIMARY
|
|
consecutive_failures = 0;
|
|
continue;
|
|
}
|
|
|
|
let mut any_reachable = false;
|
|
for cell in &other_cells {
|
|
let actor_ref: ActorRef<GitNodeMessage> = cell.clone().into();
|
|
match ractor::call_t!(actor_ref, GitNodeMessage::GetNodeHealth, 2000) {
|
|
Ok(health) if health.healthy => {
|
|
any_reachable = true;
|
|
break;
|
|
}
|
|
_ => continue,
|
|
}
|
|
}
|
|
|
|
if any_reachable {
|
|
consecutive_failures = 0;
|
|
} else {
|
|
consecutive_failures += 1;
|
|
tracing::warn!(
|
|
consecutive_failures = consecutive_failures,
|
|
max_failures = max_failures,
|
|
"no other cluster nodes reachable"
|
|
);
|
|
|
|
if consecutive_failures >= max_failures {
|
|
tracing::error!(
|
|
"no other nodes reachable for {max_failures} checks, triggering election"
|
|
);
|
|
myself.cast(GitNodeMessage::TriggerElection).ok();
|
|
consecutive_failures = 0;
|
|
}
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
pub async fn start_node_actor(
|
|
service: GitksService,
|
|
storage_name: String,
|
|
grpc_addr: String,
|
|
) -> Result<(ActorRef<GitNodeMessage>, tokio::task::JoinHandle<()>), ractor::SpawnErr> {
|
|
let actor = GitNodeActor::init(service);
|
|
let (actor_ref, handle) = Actor::spawn(
|
|
Some(format!("git_node_{storage_name}")),
|
|
actor,
|
|
GitNodeArgs {
|
|
storage_name,
|
|
grpc_addr,
|
|
},
|
|
)
|
|
.await?;
|
|
actor_ref.cast(GitNodeMessage::ScanAndRegister).ok();
|
|
Ok((actor_ref, handle))
|
|
}
|
|
|
|
pub fn get_cluster_nodes(storage_name: &str) -> Vec<ractor::ActorCell> {
|
|
pg::get_scoped_members(&storage_name.to_string(), &"node".to_string())
|
|
}
|
|
|
|
pub fn get_category_members(storage_name: &str, category: &str) -> Vec<ractor::ActorCell> {
|
|
pg::get_scoped_members(&storage_name.to_string(), &category.to_string())
|
|
}
|
|
|
|
pub fn route_group_for(header: &crate::pb::RepositoryHeader) -> String {
|
|
extract_category(&header.relative_path).to_string()
|
|
}
|
|
|
|
pub fn list_all_groups() -> Vec<String> {
|
|
pg::which_groups()
|
|
}
|
|
|
|
pub fn broadcast_ref_update(_node_actor: &ActorRef<GitNodeMessage>, event: RefUpdateEvent) {
|
|
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
|
for member in members {
|
|
let actor_ref: ActorRef<GitNodeMessage> = member.into();
|
|
actor_ref
|
|
.cast(GitNodeMessage::RefUpdated(event.clone()))
|
|
.ok();
|
|
}
|
|
}
|
|
|
|
/// Broadcast a role change event to all cluster members.
|
|
pub fn broadcast_role_changed(_actor: &ActorRef<GitNodeMessage>, event: RoleChangedEvent) {
|
|
let members = ractor::pg::get_members(&"gitks_nodes".to_string());
|
|
for member in members {
|
|
let actor_ref: ActorRef<GitNodeMessage> = member.into();
|
|
actor_ref
|
|
.cast(GitNodeMessage::RoleChanged(event.clone()))
|
|
.ok();
|
|
}
|
|
}
|