Files
imks/telemetry/metrics.rs
T
zhenyi 0dbac480ae feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
2026-06-11 13:53:29 +08:00

169 lines
6.3 KiB
Rust

//! Prometheus metrics: global meter provider, registry, and the /metrics actix-web handler.
use std::sync::OnceLock;
use opentelemetry::global;
use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
use opentelemetry::KeyValue;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::Resource;
use prometheus::{Encoder, Registry, TextEncoder};
use crate::ImksResult;
/// Shared Prometheus registry that backs the `/metrics` endpoint.
///
/// Not self-initializing: it is set exactly once by [`init_metrics`] and read
/// by [`metrics_handler`]. Until `init_metrics` has run, `get()` yields `None`.
static PROMETHEUS_REGISTRY: OnceLock<Registry> = OnceLock::new();
/// Global metrics instruments.
///
/// Set exactly once by [`init_metrics`]; read through [`get`] and [`try_get`].
static METRICS: OnceLock<MetricsInstruments> = OnceLock::new();
/// All application metrics instruments.
///
/// Built once by [`init_metrics`] and handed out as clones by [`get`] and
/// [`try_get`]. Each field is an OpenTelemetry instrument; callers record
/// measurements directly on the field they need.
#[derive(Debug, Clone)]
pub struct MetricsInstruments {
/// Number of active Socket.IO connections.
pub connections_active: UpDownCounter<i64>,
/// Total number of socket connections since start.
pub connections_total: Counter<u64>,
/// Total number of socket disconnections since start.
pub disconnections_total: Counter<u64>,
/// Total number of messages received from clients.
pub messages_received_total: Counter<u64>,
/// Total number of messages sent to clients.
pub messages_sent_total: Counter<u64>,
/// Socket.IO event handling latency, in seconds.
pub event_handling_duration: Histogram<f64>,
/// Database query duration, in seconds.
pub db_query_duration: Histogram<f64>,
/// Number of active Engine.IO sessions.
pub engine_sessions_active: UpDownCounter<i64>,
/// Number of active Socket.IO namespaces.
pub namespaces_active: UpDownCounter<i64>,
/// Total number of gRPC calls to appks.
///
/// NOTE(review): `gprc` is a misspelling of `grpc`. The field is public, so
/// renaming it requires updating every caller in the same change.
pub gprc_calls_total: Counter<u64>,
/// Total number of failed gRPC calls to appks.
///
/// NOTE(review): same `gprc` misspelling as `gprc_calls_total`.
pub gprc_call_errors_total: Counter<u64>,
/// Total number of cross-node adapter broadcasts.
pub adapter_broadcasts_total: Counter<u64>,
}
/// Initialize the Prometheus meter provider and create all metric instruments.
///
/// Builds a Prometheus exporter on a fresh [`Registry`], installs an
/// [`SdkMeterProvider`] that reads through it as the global meter provider,
/// creates every instrument in [`MetricsInstruments`], and publishes the
/// registry and the instruments through this module's process-wide statics.
///
/// `_config` is currently unused. `resource` is cloned into the provider.
///
/// Returns the provider (so the caller can shut it down) together with a
/// clone of the instruments.
///
/// # Errors
///
/// Returns [`crate::ImksError::Internal`] when metrics have already been
/// initialized, or when the Prometheus exporter cannot be built. A failed
/// exporter build leaves the global registry and instruments unset, so the
/// call can be retried.
pub fn init_metrics(
    _config: &super::config::TelemetryConfig,
    resource: &Resource,
) -> ImksResult<(SdkMeterProvider, MetricsInstruments)> {
    // Reject a repeated initialization up front, before any global state is touched.
    if PROMETHEUS_REGISTRY.get().is_some() || METRICS.get().is_some() {
        return Err(crate::ImksError::Internal(
            "metrics already initialized".to_string(),
        ));
    }

    // Build the only fallible component first: if this fails, nothing has been
    // published yet and no half-initialized state is left behind.
    let registry = Registry::new();
    let exporter = opentelemetry_prometheus::exporter()
        .with_registry(registry.clone())
        .build()
        .map_err(|e| crate::ImksError::Internal(format!("failed to build Prometheus exporter: {e}")))?;

    // `set` can still lose a race against a concurrent initializer; surface that
    // as an error instead of panicking inside a function that returns `Result`.
    PROMETHEUS_REGISTRY.set(registry).map_err(|_| {
        crate::ImksError::Internal("Prometheus registry already initialized".to_string())
    })?;

    let provider = SdkMeterProvider::builder()
        .with_resource(resource.clone())
        .with_reader(exporter)
        .build();
    global::set_meter_provider(provider.clone());

    // The meter must be obtained after the provider is installed globally so the
    // instruments are bound to it.
    let meter = global::meter_with_scope(
        opentelemetry::InstrumentationScope::builder("imks")
            .with_version(env!("CARGO_PKG_VERSION"))
            .build(),
    );
    let instruments = MetricsInstruments::new(&meter);
    METRICS.set(instruments.clone()).map_err(|_| {
        crate::ImksError::Internal("Metrics instruments already initialized".to_string())
    })?;

    Ok((provider, instruments))
}
/// Obtain the globally initialized metrics. Panics if not initialized.
pub fn get() -> MetricsInstruments {
METRICS
.get()
.expect("Metrics not initialized — call init_metrics first")
.clone()
}
/// Return a clone of the process-wide metrics instruments, or `None` when
/// they have not been initialized.
///
/// Prefer this in library code that may run before metrics are set up
/// (e.g., tests).
pub fn try_get() -> Option<MetricsInstruments> {
    let instruments = METRICS.get()?;
    Some(instruments.clone())
}
impl MetricsInstruments {
    /// Create every application instrument on `meter`.
    ///
    /// All instrument names carry the `imks_` prefix. Durations are recorded
    /// in seconds, as the `_seconds` suffix and the descriptions state.
    ///
    /// NOTE(review): the histograms are built without explicit bucket
    /// boundaries — confirm the SDK defaults suit second-scale latencies.
    fn new(meter: &Meter) -> Self {
        Self {
            connections_active: meter
                .i64_up_down_counter("imks_connections_active")
                .with_description("Number of active Socket.IO connections")
                .build(),
            connections_total: meter
                .u64_counter("imks_connections_total")
                .with_description("Total number of socket connections since start")
                .build(),
            disconnections_total: meter
                .u64_counter("imks_disconnections_total")
                .with_description("Total number of socket disconnections since start")
                .build(),
            messages_received_total: meter
                .u64_counter("imks_messages_received_total")
                .with_description("Total number of messages received from clients")
                .build(),
            messages_sent_total: meter
                .u64_counter("imks_messages_sent_total")
                .with_description("Total number of messages sent to clients")
                .build(),
            event_handling_duration: meter
                .f64_histogram("imks_event_handling_duration_seconds")
                .with_description("Socket.IO event handling latency in seconds")
                .build(),
            db_query_duration: meter
                .f64_histogram("imks_db_query_duration_seconds")
                .with_description("Database query duration in seconds")
                .build(),
            engine_sessions_active: meter
                .i64_up_down_counter("imks_engine_sessions_active")
                .with_description("Number of active Engine.IO sessions")
                .build(),
            namespaces_active: meter
                .i64_up_down_counter("imks_namespaces_active")
                .with_description("Number of active Socket.IO namespaces")
                .build(),
            // The struct fields keep their `gprc` spelling because they are
            // public, but the exported metric names are spelled `grpc` so that
            // dashboards and alerts can find them under the expected name.
            gprc_calls_total: meter
                .u64_counter("imks_grpc_calls_total")
                .with_description("Total number of gRPC calls to appks")
                .build(),
            gprc_call_errors_total: meter
                .u64_counter("imks_grpc_call_errors_total")
                .with_description("Total number of failed gRPC calls to appks")
                .build(),
            adapter_broadcasts_total: meter
                .u64_counter("imks_adapter_broadcasts_total")
                .with_description("Total number of cross-node adapter broadcasts")
                .build(),
        }
    }

    /// Helper: build the single-element attribute set `event = <event>`.
    ///
    /// The event name is copied into an owned string.
    pub fn event_attrs(event: &str) -> [KeyValue; 1] {
        [KeyValue::new("event", event.to_string())]
    }

    /// Helper: build the single-element attribute set `namespace = <ns>`.
    ///
    /// The namespace is copied into an owned string.
    pub fn namespace_attrs(ns: &str) -> [KeyValue; 1] {
        [KeyValue::new("namespace", ns.to_string())]
    }
}
/// Actix-web handler for `GET /metrics`.
///
/// Gathers all metric families from the shared registry and encodes them in
/// the Prometheus text exposition format.
///
/// Responses:
/// - `200 OK` with `text/plain; version=0.0.4` and the encoded metrics.
/// - `503 Service Unavailable` when the registry has not been initialized yet
///   (i.e. `init_metrics` has not run).
/// - `500 Internal Server Error` when encoding fails.
pub async fn metrics_handler() -> actix_web::HttpResponse {
    // A scrape that arrives before telemetry start-up must not panic inside a
    // request handler; report the service as not ready instead.
    let Some(registry) = PROMETHEUS_REGISTRY.get() else {
        return actix_web::HttpResponse::ServiceUnavailable().body("metrics not initialized");
    };

    let metric_families = registry.gather();
    let encoder = TextEncoder::new();
    let mut buffer = Vec::new();
    if encoder.encode(&metric_families, &mut buffer).is_err() {
        return actix_web::HttpResponse::InternalServerError().body("failed to encode metrics");
    }

    actix_web::HttpResponse::Ok()
        .content_type("text/plain; version=0.0.4")
        .body(buffer)
}