//! Prometheus-compatible metrics for GitKS. //! //! Tracks: //! - Request counts by gRPC method + status code //! - Request duration histogram by method //! - Active requests gauge //! - Repository count //! - Cache hits / misses //! - Error counts by error type //! //! Exposes a `/metrics` HTTP endpoint on a configurable port (default 9100). use dashmap::DashMap; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant}; struct MetricsInner { /// Counter: total requests by (method, status_code) /// Key: "method:status" request_count: DashMap, /// Histogram buckets for request duration (seconds). /// Each bucket: (method, le_bound_ms) → count duration_buckets: DashMap, /// Gauge: number of currently in-flight requests active_requests: AtomicU64, /// Gauge: total number of registered repositories repository_count: AtomicU64, /// Counter: cache hits cache_hits: AtomicU64, /// Counter: cache misses cache_misses: AtomicU64, /// Counter: errors by error kind error_count: DashMap, /// Start timestamp (seconds since Unix epoch) start_time: Instant, } static METRICS: OnceLock> = OnceLock::new(); fn metrics() -> &'static Arc { METRICS.get_or_init(|| { Arc::new(MetricsInner { request_count: DashMap::new(), duration_buckets: DashMap::new(), active_requests: AtomicU64::new(0), repository_count: AtomicU64::new(0), cache_hits: AtomicU64::new(0), cache_misses: AtomicU64::new(0), error_count: DashMap::new(), start_time: Instant::now(), }) }) } #[rustfmt::skip] const DURATION_BUCKET_MS: &[u64] = &[ 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000, 10_000, 30_000, 60_000, u64::MAX, ]; const BUCKET_INF: u64 = u64::MAX; /// Record a request. 
/// Increments the `(method, status_code)` request counter and every
/// cumulative duration bucket whose upper bound (in milliseconds) is at
/// least `duration`.
pub fn record_request(method: &str, status_code: &str, duration: Duration) {
    let m = metrics();

    m.request_count
        .entry(format!("{method}:{status_code}"))
        .or_insert_with(|| AtomicU64::new(0))
        .value()
        .fetch_add(1, Ordering::Relaxed);

    // `as_millis` returns a u128; saturate instead of silently truncating.
    let duration_ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);

    // Buckets are cumulative, so every bound >= the observed duration is
    // bumped. The last bound is `BUCKET_INF` (u64::MAX), which any duration
    // satisfies, so the +Inf bucket always counts the request.
    for &bound_ms in DURATION_BUCKET_MS
        .iter()
        .filter(|&&bound| duration_ms <= bound)
    {
        m.duration_buckets
            .entry(format!("{method}:{bound_ms}"))
            .or_insert_with(|| AtomicU64::new(0))
            .value()
            .fetch_add(1, Ordering::Relaxed);
    }
}

/// Increment the active request gauge.
pub fn inc_active_requests() {
    metrics().active_requests.fetch_add(1, Ordering::Relaxed);
}

/// Decrement the active request gauge.
///
/// Saturates at zero: a decrement without a matching increment is ignored
/// instead of wrapping the unsigned gauge around to `u64::MAX`.
pub fn dec_active_requests() {
    // The closure returns `None` when the gauge is already zero, in which
    // case `fetch_update` leaves the value untouched and reports `Err`.
    let _ = metrics()
        .active_requests
        .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
            current.checked_sub(1)
        });
}

/// Set the repository count.
pub fn set_repository_count(count: u64) {
    metrics().repository_count.store(count, Ordering::Relaxed);
}

/// Record `count` cache hits.
pub fn inc_cache_hits(count: u64) {
    metrics().cache_hits.fetch_add(count, Ordering::Relaxed);
}

/// Record `count` cache misses.
pub fn inc_cache_misses(count: u64) {
    metrics().cache_misses.fetch_add(count, Ordering::Relaxed);
}

/// Record an error by kind (e.g., "not_found", "internal", "invalid_argument").
pub fn inc_error(kind: &str) {
    metrics()
        .error_count
        .entry(kind.to_string())
        .or_insert_with(|| AtomicU64::new(0))
        .value()
        .fetch_add(1, Ordering::Relaxed);
}

/// Render all metrics in Prometheus text exposition format.
pub fn render_metrics() -> String { let m = metrics(); let mut out = String::with_capacity(4096); // Header let uptime = m.start_time.elapsed().as_secs(); out.push_str("# HELP gitks_uptime_seconds Time since gitks started\n"); out.push_str("# TYPE gitks_uptime_seconds gauge\n"); out.push_str(&format!("gitks_uptime_seconds {uptime}\n\n")); // Active requests let active = m.active_requests.load(Ordering::Relaxed); out.push_str("# HELP gitks_active_requests Currently in-flight requests\n"); out.push_str("# TYPE gitks_active_requests gauge\n"); out.push_str(&format!("gitks_active_requests {active}\n\n")); // Repository count let repos = m.repository_count.load(Ordering::Relaxed); out.push_str("# HELP gitks_repository_count Number of registered repositories\n"); out.push_str("# TYPE gitks_repository_count gauge\n"); out.push_str(&format!("gitks_repository_count {repos}\n\n")); // Request count out.push_str("# HELP gitks_requests_total Total gRPC requests by method and status\n"); out.push_str("# TYPE gitks_requests_total counter\n"); for entry in &m.request_count { let (method_and_status, count) = (entry.key(), entry.value()); let count = count.load(Ordering::Relaxed); if let Some((method, status)) = method_and_status.rsplit_once(':') { out.push_str( &format!("gitks_requests_total{{method=\"{method}\",status=\"{status}\"}} {count}\n"), ); } } out.push('\n'); // Duration histogram out.push_str( "# HELP gitks_request_duration_milliseconds Request duration histogram in ms\n", ); out.push_str("# TYPE gitks_request_duration_milliseconds histogram\n"); for entry in &m.duration_buckets { let (method_and_bound, count) = (entry.key(), entry.value()); let count = count.load(Ordering::Relaxed); if let Some((method, bound_str)) = method_and_bound.rsplit_once(':') { let bound = bound_str; let le = if bound_str.parse::() == Ok(BUCKET_INF) { "+Inf".to_string() } else { bound.to_string() }; out.push_str( 
&format!("gitks_request_duration_milliseconds_bucket{{method=\"{method}\",le=\"{le}\"}} {count}\n"), ); } } out.push('\n'); // Cache let hits = m.cache_hits.load(Ordering::Relaxed); let misses = m.cache_misses.load(Ordering::Relaxed); out.push_str("# HELP gitks_cache_hits_total Cache hit count\n"); out.push_str("# TYPE gitks_cache_hits_total counter\n"); out.push_str(&format!("gitks_cache_hits_total {hits}\n\n")); out.push_str("# HELP gitks_cache_misses_total Cache miss count\n"); out.push_str("# TYPE gitks_cache_misses_total counter\n"); out.push_str(&format!("gitks_cache_misses_total {misses}\n\n")); // Errors out.push_str("# HELP gitks_errors_total Total errors by kind\n"); out.push_str("# TYPE gitks_errors_total counter\n"); for entry in &m.error_count { let (kind, count) = (entry.key(), entry.value()); let count = count.load(Ordering::Relaxed); out.push_str(&format!("gitks_errors_total{{kind=\"{kind}\"}} {count}\n")); } out.push('\n'); out } /// Start the metrics HTTP server on the given port. /// Runs in a background task; returns the JoinHandle. 
pub fn start_metrics_server(port: u16) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { let listener = match tokio::net::TcpListener::bind(format!("0.0.0.0:{port}")).await { Ok(l) => l, Err(e) => { tracing::error!(port, error = %e, "failed to bind metrics server"); return; } }; tracing::info!(port, "metrics HTTP server started"); loop { match listener.accept().await { Ok((socket, peer)) => { tracing::debug!(%peer, "metrics request"); tokio::spawn(handle_metrics_connection(socket)); } Err(e) => { tracing::error!(error = %e, "metrics accept error"); } } } }) } async fn handle_metrics_connection(mut socket: tokio::net::TcpStream) { use tokio::io::{AsyncReadExt, AsyncWriteExt}; let mut buf = [0u8; 4096]; let _ = tokio::time::timeout(Duration::from_secs(5), socket.read(&mut buf)).await; let body = render_metrics(); let response = format!( "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}", body.len(), body ); let _ = tokio::time::timeout(Duration::from_secs(5), socket.write_all(response.as_bytes())) .await; let _ = socket.shutdown().await; } /// A guard that records metrics on drop. /// /// Usage in handlers: /// ```ignore /// let m = crate::metrics::RequestMetrics::new("Service/Method"); /// // ... handle request ... /// m.record("ok"); // on success /// // m.record("internal"); // or on error, with tonic error kind /// ``` pub struct RequestMetrics { method: &'static str, start: Instant, } impl RequestMetrics { pub fn new(method: &'static str) -> Self { inc_active_requests(); Self { method, start: Instant::now(), } } /// Record the outcome. Idempotent — safe to call before each return. pub fn record(&self, status: &str) { let duration = self.start.elapsed(); record_request(self.method, status, duration); } } impl Drop for RequestMetrics { fn drop(&mut self) { dec_active_requests(); } } /// Convenience: record an error from a tonic Status. 
/// Uses the description of the status code both as the error kind passed to
/// `inc_error` and as the outcome recorded on `m`.
pub fn record_rpc_error(m: &RequestMetrics, status: &tonic::Status) {
    // NOTE(review): `Code::description()` appears to return a human-readable
    // sentence rather than a short identifier such as "not_found" — confirm
    // these label values are what dashboards expect.
    let code = status.code();
    let label = code.description();

    inc_error(label);
    m.record(label);
}