Files
imks/telemetry/health.rs
T
zhenyi 0dbac480ae feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
2026-06-11 13:53:29 +08:00

171 lines
5.1 KiB
Rust

//! Enhanced health check endpoint with upstream dependency checks.
//!
//! Returns JSON with server status, version, uptime, connection counts,
//! and optional health checks for PostgreSQL, Redis, NATS, and gRPC.
use std::sync::Arc;
use std::sync::OnceLock;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use actix_web::HttpResponse;
use serde::Serialize;
/// Instant at which the server started.
///
/// Stays empty until it is set during startup; readers treat the empty
/// state as "no uptime yet".
static START_TIME: OnceLock<Instant> = OnceLock::new();
/// Live connection counter shared across the process.
///
/// Adjusted through `connection_connected` / `connection_disconnected` and
/// read through `connections_active_count`. The cell is empty until
/// `init_counters` has run; while it is empty the adjust calls are no-ops
/// and the reported count is 0.
static CONNECTIONS_ACTIVE: OnceLock<AtomicU64> = OnceLock::new();
/// Captures "now" as the server start time.
///
/// Intended to be called once during startup. Only the first call has an
/// effect: later calls leave the originally recorded instant untouched.
pub fn record_start_time() {
    // `set` fails only when a start time is already stored; keeping the
    // first value is the desired outcome, so the error is discarded.
    let _ = START_TIME.set(Instant::now());
}
/// Creates the shared health counters, starting the connection count at 0.
///
/// Intended to be called once during startup. Repeated calls are harmless:
/// an already-initialized counter keeps its current value.
pub fn init_counters() {
    // `set` fails only when the counter already exists; in that case the
    // live counter must not be reset, so the error is discarded.
    let _ = CONNECTIONS_ACTIVE.set(AtomicU64::new(0));
}
/// Records that a socket connection has been established.
///
/// Increments the active-connection counter by one. Does nothing when the
/// counter has not been initialized yet.
pub fn connection_connected() {
    let Some(counter) = CONNECTIONS_ACTIVE.get() else {
        return;
    };
    counter.fetch_add(1, Ordering::Relaxed);
}
/// Signal that a socket connection was closed.
///
/// Decrements the active-connection counter by one, saturating at zero: a
/// disconnect that has no matching connect (for example a duplicate
/// disconnect event) is ignored instead of wrapping the unsigned counter
/// around to `u64::MAX`. Does nothing when the counter has not been
/// initialized yet.
pub fn connection_disconnected() {
    if let Some(c) = CONNECTIONS_ACTIVE.get() {
        // `checked_sub` yields `None` at zero, which makes `fetch_update`
        // leave the stored value unchanged and return `Err`; that outcome is
        // exactly the saturation we want, so the result is discarded.
        let _ = c.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| n.checked_sub(1));
    }
}
/// Reports how many socket connections are currently counted as active.
///
/// Yields 0 when the counter has not been initialized yet.
pub fn connections_active_count() -> u64 {
    match CONNECTIONS_ACTIVE.get() {
        Some(counter) => counter.load(Ordering::Relaxed),
        None => 0,
    }
}
/// Whole seconds elapsed since the recorded server start time.
///
/// Yields 0 when no start time has been recorded yet.
pub fn uptime_secs() -> u64 {
    let started = START_TIME.get();
    started.map_or(0, |instant| instant.elapsed().as_secs())
}
/// JSON body returned by the health endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct HealthResponse {
/// Overall status; the handler in this file emits `"healthy"` or `"degraded"`.
pub status: String,
/// Crate version; the handler fills it from `CARGO_PKG_VERSION`.
pub version: String,
/// Time the response was built; the handler writes the current UTC time as RFC 3339.
pub timestamp: String,
/// Server uptime in whole seconds.
pub uptime_secs: u64,
/// Number of active socket connections as reported by the handler.
pub connections_active: u64,
/// Number of sessions as reported by the handler.
pub sessions_count: u64,
/// Per-dependency results; the key is omitted from the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub checks: Option<HealthChecks>,
}
/// Results of the individual dependency checks.
///
/// Every field is optional and is left out of the serialized JSON when it is
/// `None`.
#[derive(Debug, Clone, Serialize)]
pub struct HealthChecks {
/// PostgreSQL check result, if one was produced.
#[serde(skip_serializing_if = "Option::is_none")]
pub postgres: Option<CheckResult>,
/// Redis check result, if one was produced.
#[serde(skip_serializing_if = "Option::is_none")]
pub redis: Option<CheckResult>,
/// NATS check result, if one was produced.
#[serde(skip_serializing_if = "Option::is_none")]
pub nats: Option<CheckResult>,
/// gRPC check result, if one was produced.
#[serde(skip_serializing_if = "Option::is_none")]
pub grpc: Option<CheckResult>,
}
/// Outcome of a single dependency check.
#[derive(Debug, Clone, Serialize)]
pub struct CheckResult {
/// Check status. The health handler in this file treats exactly `"up"` as
/// passing; any other value marks the overall status as degraded.
pub status: String,
/// Latency of the check in milliseconds; omitted from the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub latency_ms: Option<u64>,
/// Error description; omitted from the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Optional external check callbacks used by the health handler.
///
/// Each field is `Some(callback)` when a check has been registered for that
/// dependency and `None` otherwise (the `Default` value has every field set
/// to `None`). A callback takes no arguments and returns a [`CheckResult`]
/// directly; it is a plain synchronous `Fn`, shareable across threads
/// (`Send + Sync`).
#[derive(Default)]
pub struct HealthCheckFns {
/// PostgreSQL check callback.
pub check_postgres: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
/// Redis check callback.
pub check_redis: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
/// NATS check callback.
pub check_nats: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
/// gRPC check callback.
pub check_grpc: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
}
/// Builder-style setters: each consumes the value, installs one check
/// callback (replacing any callback previously stored in that slot) and
/// hands the updated value back so calls can be chained.
impl HealthCheckFns {
    /// Installs the PostgreSQL check callback.
    pub fn with_postgres(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_postgres: Some(Arc::new(f)),
            ..self
        }
    }

    /// Installs the Redis check callback.
    pub fn with_redis(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_redis: Some(Arc::new(f)),
            ..self
        }
    }

    /// Installs the NATS check callback.
    pub fn with_nats(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_nats: Some(Arc::new(f)),
            ..self
        }
    }

    /// Installs the gRPC check callback.
    pub fn with_grpc(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_grpc: Some(Arc::new(f)),
            ..self
        }
    }
}
/// GET /health handler with dependency checks.
pub async fn health_check(checks: actix_web::web::Data<Arc<HealthCheckFns>>) -> HttpResponse {
let checks = checks.get_ref();
let health_checks = if checks.check_postgres.is_some()
|| checks.check_redis.is_some()
|| checks.check_nats.is_some()
|| checks.check_grpc.is_some()
{
Some(HealthChecks {
postgres: checks.check_postgres.as_ref().map(|f| f()),
redis: checks.check_redis.as_ref().map(|f| f()),
nats: checks.check_nats.as_ref().map(|f| f()),
grpc: checks.check_grpc.as_ref().map(|f| f()),
})
} else {
None
};
let overall_status = if let Some(ref hc) = health_checks {
let all_up = [&hc.postgres, &hc.redis, &hc.nats, &hc.grpc]
.iter()
.filter_map(|c| c.as_ref())
.all(|c| c.status == "up");
if all_up {
"healthy"
} else {
"degraded"
}
} else {
"healthy"
};
let response = HealthResponse {
status: overall_status.to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
timestamp: chrono::Utc::now().to_rfc3339(),
uptime_secs: uptime_secs(),
connections_active: 0,
sessions_count: 0,
checks: health_checks,
};
HttpResponse::Ok().json(response)
}