//! Prometheus-compatible metrics for GitKS. //! //! Tracks: //! - Request counts by gRPC method + status code //! - Request duration histogram by method //! - Active requests gauge //! - Repository count //! - Cache hits / misses //! - Error counts by error type //! - Git subprocess duration //! - Cache operation duration //! - Slow request detection //! //! Exposes HTTP endpoints on a configurable port (default 9100): //! - GET /metrics — Prometheus metrics //! - GET /health — Liveness probe //! - GET /ready — Readiness probe //! - GET /debug/log-level — Current log level filter //! - PUT /debug/log-level — Update log level filter (body: "gitks=debug") //! - GET /debug/config — Runtime configuration use dashmap::DashMap; use std::sync::atomic::{AtomicU64, AtomicBool, Ordering}; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant}; struct MetricsInner { /// Counter: total requests by (method, status_code) request_count: DashMap, /// Histogram buckets for request duration (ms) duration_buckets: DashMap, /// Gauge: number of currently in-flight requests active_requests: AtomicU64, /// Gauge: total number of registered repositories repository_count: AtomicU64, /// Counter: cache hits cache_hits: AtomicU64, /// Counter: cache misses cache_misses: AtomicU64, /// Counter: errors by error kind error_count: DashMap, /// Start timestamp start_time: Instant, /// Slow request threshold in ms slow_request_threshold_ms: AtomicU64, /// Server ready flag ready: AtomicBool, /// Histogram buckets for git subprocess duration (ms) git_cmd_duration_buckets: DashMap, /// Counter: git subprocess calls by command git_cmd_count: DashMap, /// Histogram buckets for cache operation duration (ms) cache_op_duration_buckets: DashMap, /// Counter: cache operations by (cache, result) cache_op_count: DashMap, /// Histogram: hook execution duration (ms) hook_duration_buckets: DashMap, /// Counter: hook executions by (hook_type, result) hook_count: DashMap, /// Counter: slow requests by method slow_request_count: DashMap, raft_term: AtomicU64, /// Gauge: current Raft commit index raft_commit_index: AtomicU64, /// Gauge: current Raft last applied index raft_last_applied: AtomicU64, /// Gauge: whether this node is the Raft leader (1 = yes, 0 = no) raft_is_leader: AtomicU64, /// Gauge: number of entries in the Raft log raft_log_entries: AtomicU64, /// Counter: total AppendEntries RPCs sent raft_append_entries_total: AtomicU64, /// Counter: successful AppendEntries RPCs raft_append_entries_success: AtomicU64, /// Counter: total elections triggered raft_elections_total: AtomicU64, /// Counter: elections won raft_elections_won: AtomicU64, } static METRICS: OnceLock> = OnceLock::new(); /// Handle for dynamic log level reload. static LOG_RELOAD_HANDLE: OnceLock>> = OnceLock::new(); use tracing_subscriber::EnvFilter; fn metrics() -> &'static Arc { METRICS.get_or_init(|| { Arc::new(MetricsInner { request_count: DashMap::new(), duration_buckets: DashMap::new(), active_requests: AtomicU64::new(0), repository_count: AtomicU64::new(0), cache_hits: AtomicU64::new(0), cache_misses: AtomicU64::new(0), error_count: DashMap::new(), start_time: Instant::now(), slow_request_threshold_ms: AtomicU64::new(5000), ready: AtomicBool::new(false), git_cmd_duration_buckets: DashMap::new(), git_cmd_count: DashMap::new(), cache_op_duration_buckets: DashMap::new(), cache_op_count: DashMap::new(), hook_duration_buckets: DashMap::new(), hook_count: DashMap::new(), slow_request_count: DashMap::new(), // Raft metrics raft_term: AtomicU64::new(0), raft_commit_index: AtomicU64::new(0), raft_last_applied: AtomicU64::new(0), raft_is_leader: AtomicU64::new(0), raft_log_entries: AtomicU64::new(0), raft_append_entries_total: AtomicU64::new(0), raft_append_entries_success: AtomicU64::new(0), raft_elections_total: AtomicU64::new(0), raft_elections_won: AtomicU64::new(0), }) }) } #[rustfmt::skip] const DURATION_BUCKET_MS: &[u64] = &[ 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000, 10_000, 30_000, 60_000, u64::MAX, ]; const BUCKET_INF: u64 = u64::MAX; fn record_duration_bucket(map: &DashMap, key_prefix: &str, duration_ms: u64) { for &bound_ms in DURATION_BUCKET_MS { if duration_ms <= bound_ms || bound_ms == BUCKET_INF { let bucket_key = format!("{key_prefix}:{bound_ms}"); map.entry(bucket_key) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); } } } pub fn set_slow_request_threshold(ms: u64) { metrics().slow_request_threshold_ms.store(ms, Ordering::Relaxed); } pub fn set_ready(ready: bool) { metrics().ready.store(ready, Ordering::Relaxed); } pub fn set_log_reload_handle( handle: tracing_subscriber::reload::Handle, ) { LOG_RELOAD_HANDLE.set(Some(handle)).ok(); } /// Record a gRPC request. pub fn record_request(method: &str, status_code: &str, duration: Duration) { let m = metrics(); let duration_ms = duration.as_millis() as u64; // Request count let key = format!("{method}:{status_code}"); m.request_count .entry(key) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); // Duration histogram record_duration_bucket(&m.duration_buckets, method, duration_ms); // Slow request detection let threshold = m.slow_request_threshold_ms.load(Ordering::Relaxed); if threshold > 0 && duration_ms >= threshold { m.slow_request_count .entry(method.to_string()) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); } } pub fn inc_active_requests() { metrics().active_requests.fetch_add(1, Ordering::Relaxed); } pub fn dec_active_requests() { metrics().active_requests.fetch_sub(1, Ordering::Relaxed); } pub fn set_repository_count(count: u64) { metrics().repository_count.store(count, Ordering::Relaxed); } pub fn inc_cache_hits(count: u64) { metrics().cache_hits.fetch_add(count, Ordering::Relaxed); } pub fn inc_cache_misses(count: u64) { metrics().cache_misses.fetch_add(count, Ordering::Relaxed); } pub fn inc_error(kind: &str) { metrics() .error_count .entry(kind.to_string()) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); } /// Record a git subprocess execution. pub fn record_git_cmd(command: &str, duration: Duration) { let m = metrics(); let duration_ms = duration.as_millis() as u64; m.git_cmd_count .entry(command.to_string()) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); record_duration_bucket(&m.git_cmd_duration_buckets, command, duration_ms); } /// Record a cache operation. pub fn record_cache_op(cache: &str, result: &str, duration: Duration) { let m = metrics(); let duration_ms = duration.as_millis() as u64; let key = format!("{cache}:{result}"); m.cache_op_count .entry(key) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); record_duration_bucket(&m.cache_op_duration_buckets, cache, duration_ms); } /// Record a hook execution. pub fn record_hook_execution(hook_type: &str, result: &str, duration: Duration) { let m = metrics(); let duration_ms = duration.as_millis() as u64; let key = format!("{hook_type}:{result}"); m.hook_count .entry(key) .or_insert_with(|| AtomicU64::new(0)) .value() .fetch_add(1, Ordering::Relaxed); record_duration_bucket(&m.hook_duration_buckets, hook_type, duration_ms); } pub fn set_raft_state(term: u64, commit_index: u64, last_applied: u64, is_leader: bool, log_entries: u64) { let m = metrics(); m.raft_term.store(term, Ordering::Relaxed); m.raft_commit_index.store(commit_index, Ordering::Relaxed); m.raft_last_applied.store(last_applied, Ordering::Relaxed); m.raft_is_leader.store(if is_leader { 1 } else { 0 }, Ordering::Relaxed); m.raft_log_entries.store(log_entries, Ordering::Relaxed); } /// Record an AppendEntries RPC attempt. pub fn inc_raft_append_entries(success: bool) { let m = metrics(); m.raft_append_entries_total.fetch_add(1, Ordering::Relaxed); if success { m.raft_append_entries_success.fetch_add(1, Ordering::Relaxed); } } /// Record an election trigger. pub fn inc_raft_election(won: bool) { let m = metrics(); m.raft_elections_total.fetch_add(1, Ordering::Relaxed); if won { m.raft_elections_won.fetch_add(1, Ordering::Relaxed); } } /// Escape a string for use as a Prometheus label value. /// Replaces `\` → `\\`, `"` → `\"`, `\n` → `\n` per the Prometheus spec. fn prom_escape(value: &str) -> String { let mut out = String::with_capacity(value.len()); for ch in value.chars() { match ch { '\\' => out.push_str("\\\\"), '"' => out.push_str("\\\""), '\n' => out.push_str("\\n"), _ => out.push(ch), } } out } fn render_counter_map(out: &mut String, name: &str, help: &str, map: &DashMap, labels: &[&str]) { out.push_str(&format!("# HELP {name} {help}\n")); out.push_str(&format!("# TYPE {name} counter\n")); for entry in map { let (key, count) = (entry.key(), entry.value().load(Ordering::Relaxed)); let parts: Vec<&str> = key.split(':').collect(); if parts.len() == labels.len() { let label_str: String = labels.iter().zip(parts.iter()) .map(|(l, v)| format!("{l}=\"{}\"", prom_escape(v))) .collect::>() .join(","); out.push_str(&format!("{name}{{{label_str}}} {count}\n")); } } out.push('\n'); } fn render_histogram(out: &mut String, name: &str, help: &str, map: &DashMap) { out.push_str(&format!("# HELP {name} {help}\n")); out.push_str(&format!("# TYPE {name} histogram\n")); for entry in map { let (key_and_bound, count) = (entry.key(), entry.value().load(Ordering::Relaxed)); if let Some((key, bound_str)) = key_and_bound.rsplit_once(':') { let le = if bound_str.parse::() == Ok(BUCKET_INF) { "+Inf".to_string() } else { bound_str.to_string() }; out.push_str(&format!("{name}_bucket{{method=\"{}\",le=\"{le}\"}} {count}\n", prom_escape(key))); } } out.push('\n'); } pub fn render_metrics() -> String { let m = metrics(); let mut out = String::with_capacity(8192); let uptime = m.start_time.elapsed().as_secs(); out.push_str("# HELP gitks_uptime_seconds Time since gitks started\n"); out.push_str("# TYPE gitks_uptime_seconds gauge\n"); out.push_str(&format!("gitks_uptime_seconds {uptime}\n\n")); let active = m.active_requests.load(Ordering::Relaxed); out.push_str("# HELP gitks_active_requests Currently in-flight requests\n"); out.push_str("# TYPE gitks_active_requests gauge\n"); out.push_str(&format!("gitks_active_requests {active}\n\n")); let repos = m.repository_count.load(Ordering::Relaxed); out.push_str("# HELP gitks_repository_count Number of registered repositories\n"); out.push_str("# TYPE gitks_repository_count gauge\n"); out.push_str(&format!("gitks_repository_count {repos}\n\n")); // gRPC requests render_counter_map(&mut out, "gitks_requests_total", "Total gRPC requests by method and status", &m.request_count, &["method", "status"]); // gRPC duration render_histogram(&mut out, "gitks_request_duration_milliseconds", "Request duration histogram in ms", &m.duration_buckets); // Slow requests render_counter_map(&mut out, "gitks_slow_requests_total", "Slow gRPC requests by method", &m.slow_request_count, &["method"]); // Cache let hits = m.cache_hits.load(Ordering::Relaxed); let misses = m.cache_misses.load(Ordering::Relaxed); out.push_str("# HELP gitks_cache_hits_total Cache hit count\n"); out.push_str("# TYPE gitks_cache_hits_total counter\n"); out.push_str(&format!("gitks_cache_hits_total {hits}\n\n")); out.push_str("# HELP gitks_cache_misses_total Cache miss count\n"); out.push_str("# TYPE gitks_cache_misses_total counter\n"); out.push_str(&format!("gitks_cache_misses_total {misses}\n\n")); // Errors render_counter_map(&mut out, "gitks_errors_total", "Total errors by kind", &m.error_count, &["kind"]); // Git subprocess render_counter_map(&mut out, "gitks_git_cmd_total", "Git subprocess calls by command", &m.git_cmd_count, &["command"]); render_histogram(&mut out, "gitks_git_cmd_duration_milliseconds", "Git subprocess duration in ms", &m.git_cmd_duration_buckets); // Cache operations render_counter_map(&mut out, "gitks_cache_ops_total", "Cache operations by cache and result", &m.cache_op_count, &["cache", "result"]); render_histogram(&mut out, "gitks_cache_op_duration_milliseconds", "Cache operation duration in ms", &m.cache_op_duration_buckets); // Hook execution render_counter_map(&mut out, "gitks_hook_executions_total", "Hook executions by type and result", &m.hook_count, &["hook_type", "result"]); render_histogram(&mut out, "gitks_hook_duration_milliseconds", "Hook execution duration in ms", &m.hook_duration_buckets); // Raft consensus metrics let raft_term = m.raft_term.load(Ordering::Relaxed); let raft_commit = m.raft_commit_index.load(Ordering::Relaxed); let raft_applied = m.raft_last_applied.load(Ordering::Relaxed); let raft_leader = m.raft_is_leader.load(Ordering::Relaxed); let raft_entries = m.raft_log_entries.load(Ordering::Relaxed); let raft_ae_total = m.raft_append_entries_total.load(Ordering::Relaxed); let raft_ae_success = m.raft_append_entries_success.load(Ordering::Relaxed); let raft_elections = m.raft_elections_total.load(Ordering::Relaxed); let raft_elections_won = m.raft_elections_won.load(Ordering::Relaxed); out.push_str("# HELP gitks_raft_term Current Raft term\n"); out.push_str("# TYPE gitks_raft_term gauge\n"); out.push_str(&format!("gitks_raft_term {raft_term}\n\n")); out.push_str("# HELP gitks_raft_commit_index Current Raft commit index\n"); out.push_str("# TYPE gitks_raft_commit_index gauge\n"); out.push_str(&format!("gitks_raft_commit_index {raft_commit}\n\n")); out.push_str("# HELP gitks_raft_last_applied Current Raft last applied index\n"); out.push_str("# TYPE gitks_raft_last_applied gauge\n"); out.push_str(&format!("gitks_raft_last_applied {raft_applied}\n\n")); out.push_str("# HELP gitks_raft_is_leader Whether this node is the Raft leader\n"); out.push_str("# TYPE gitks_raft_is_leader gauge\n"); out.push_str(&format!("gitks_raft_is_leader {raft_leader}\n\n")); out.push_str("# HELP gitks_raft_log_entries Number of entries in the Raft log\n"); out.push_str("# TYPE gitks_raft_log_entries gauge\n"); out.push_str(&format!("gitks_raft_log_entries {raft_entries}\n\n")); out.push_str("# HELP gitks_raft_append_entries_total Total AppendEntries RPCs sent\n"); out.push_str("# TYPE gitks_raft_append_entries_total counter\n"); out.push_str(&format!("gitks_raft_append_entries_total {raft_ae_total}\n\n")); out.push_str("# HELP gitks_raft_append_entries_success Successful AppendEntries RPCs\n"); out.push_str("# TYPE gitks_raft_append_entries_success counter\n"); out.push_str(&format!("gitks_raft_append_entries_success {raft_ae_success}\n\n")); out.push_str("# HELP gitks_raft_elections_total Total elections triggered\n"); out.push_str("# TYPE gitks_raft_elections_total counter\n"); out.push_str(&format!("gitks_raft_elections_total {raft_elections}\n\n")); out.push_str("# HELP gitks_raft_elections_won Elections won by this node\n"); out.push_str("# TYPE gitks_raft_elections_won counter\n"); out.push_str(&format!("gitks_raft_elections_won {raft_elections_won}\n\n")); out } use bytes::Bytes; use http_body_util::Full; use hyper::body::Incoming; use hyper::{Request, Response, Method}; use hyper::service::Service; use std::convert::Infallible; use std::future::Future; use std::pin::Pin; use std::net::SocketAddr; /// Global cancel token for the HTTP server, set from main. static HTTP_CANCEL: OnceLock = OnceLock::new(); pub fn set_http_cancel_token(token: tokio_util::sync::CancellationToken) { HTTP_CANCEL.set(token).ok(); } struct Router; impl Service> for Router { type Response = Response>; type Error = Infallible; type Future = Pin> + Send>>; fn call(&self, req: Request) -> Self::Future { Box::pin(handle_request(req)) } } fn json_response(status: u16, body: &str) -> Response> { Response::builder() .status(status) .header("Content-Type", "application/json") .header("Connection", "close") .body(Full::new(Bytes::from(body.to_string()))) .unwrap() } fn text_response(status: u16, content_type: &str, body: String) -> Response> { Response::builder() .status(status) .header("Content-Type", content_type) .header("Connection", "close") .body(Full::new(Bytes::from(body))) .unwrap() } async fn handle_request(req: Request) -> Result>, Infallible> { let method = req.method().clone(); let path = req.uri().path().to_owned(); tracing::debug!(%method, %path, "HTTP request"); let response = match (method, path.as_str()) { (Method::GET, "/metrics") => { let body = render_metrics(); text_response(200, "text/plain; version=0.0.4; charset=utf-8", body) } (Method::GET, "/health") => { json_response(200, r#"{"status":"healthy"}"#) } (Method::GET, "/ready") => { if metrics().ready.load(Ordering::Relaxed) { json_response(200, r#"{"status":"ready"}"#) } else { json_response(503, r#"{"status":"not_ready"}"#) } } (Method::GET, "/debug/log-level") => { let msg = if LOG_RELOAD_HANDLE.get().is_some() { "use PUT to change" } else { "dynamic log level not available" }; json_response(200, &format!(r#"{{"log_level":"{msg}"}}"#)) } (Method::PUT, "/debug/log-level") => { match handle_log_level_update(req).await { Ok(resp) => resp, Err(e) => json_response(400, &format!(r#"{{"error":"{e}"}}"#)), } } (Method::GET, "/debug/config") => { let threshold = metrics().slow_request_threshold_ms.load(Ordering::Relaxed); let ready = metrics().ready.load(Ordering::Relaxed); json_response(200, &format!( r#"{{"slow_request_threshold_ms":{},"ready":{}}}"#, threshold, ready )) } _ => { json_response(404, r#"{"error":"not found"}"#) } }; Ok(response) } async fn handle_log_level_update( req: Request, ) -> Result>, String> { use http_body_util::BodyExt; let body_bytes = req .collect() .await .map_err(|e| format!("failed to read body: {e}"))? .to_bytes(); let new_filter = String::from_utf8(body_bytes.to_vec()) .map_err(|e| format!("invalid UTF-8: {e}"))?; let new_filter = new_filter.trim().to_string(); if new_filter.is_empty() { return Ok(json_response(400, r#"{"error":"empty filter"}"#)); } match LOG_RELOAD_HANDLE.get() { Some(Some(handle)) => match new_filter.parse::() { Ok(filter) => { if let Err(e) = handle.reload(filter) { Ok(json_response(500, &format!(r#"{{"error":"{e}"}}"#))) } else { tracing::info!(new_filter = %new_filter, "log level updated via HTTP"); Ok(json_response(200, &format!( r#"{{"status":"ok","filter":"{}"}}"#, new_filter ))) } } Err(e) => Ok(json_response(400, &format!( r#"{{"error":"invalid filter: {e}"}}"# ))), }, _ => Ok(json_response(501, r#"{"error":"dynamic log level not configured"}"#)), } } /// Start the HTTP server (metrics + health + debug) using hyper 1.x. pub fn start_metrics_server(port: u16) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { let listener = match tokio::net::TcpListener::bind(SocketAddr::from(([0, 0, 0, 0], port))) .await { Ok(l) => l, Err(e) => { tracing::error!(port, error = %e, "failed to bind HTTP server"); return; } }; tracing::info!(port, "HTTP server started (metrics + health + debug)"); let cancel = HTTP_CANCEL .get() .cloned() .unwrap_or_else(tokio_util::sync::CancellationToken::new); loop { tokio::select! { result = listener.accept() => { match result { Ok((stream, _peer)) => { let io = hyper_util::rt::TokioIo::new(stream); tokio::spawn(async move { let builder = hyper::server::conn::http1::Builder::new(); if let Err(e) = builder .serve_connection(io, Router) .with_upgrades() .await { tracing::debug!(error = %e, "HTTP connection error"); } }); } Err(e) => { tracing::error!(error = %e, "HTTP accept error"); } } } _ = cancel.cancelled() => { tracing::info!("HTTP server shutting down"); break; } } } }) } pub struct RequestMetrics { method: &'static str, start: Instant, } impl RequestMetrics { pub fn new(method: &'static str) -> Self { inc_active_requests(); Self { method, start: Instant::now(), } } /// Record the outcome. Idempotent — safe to call before each return. pub fn record(&self, status: &str) { let duration = self.start.elapsed(); let duration_ms = duration.as_millis() as u64; record_request(self.method, status, duration); // Slow request warning let threshold = metrics().slow_request_threshold_ms.load(Ordering::Relaxed); if threshold > 0 && duration_ms >= threshold { tracing::warn!( method = self.method, status, elapsed_ms = duration_ms, threshold_ms = threshold, "slow gRPC request detected" ); } } } impl Drop for RequestMetrics { fn drop(&mut self) { dec_active_requests(); } } /// Convenience: record an error from a tonic Status. pub fn record_rpc_error(m: &RequestMetrics, status: &tonic::Status) { let kind = status.code().description(); inc_error(kind); m.record(kind); }