feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
@@ -0,0 +1,168 @@
|
||||
//! Prometheus metrics: global meter provider, registry, and the /metrics actix-web handler.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use opentelemetry::global;
|
||||
use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry_sdk::metrics::SdkMeterProvider;
|
||||
use opentelemetry_sdk::Resource;
|
||||
use prometheus::{Encoder, Registry, TextEncoder};
|
||||
|
||||
use crate::ImksResult;
|
||||
|
||||
/// Shared Prometheus registry, lazily initialized.
|
||||
// Written once by `init_metrics`; read by `metrics_handler` to gather and encode metrics.
static PROMETHEUS_REGISTRY: OnceLock<Registry> = OnceLock::new();
|
||||
|
||||
/// Global metrics instruments, initialized once at startup.
|
||||
// Written once by `init_metrics`; read (and cloned) by `get` and `try_get`.
static METRICS: OnceLock<MetricsInstruments> = OnceLock::new();
|
||||
|
||||
/// All application metrics instruments.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MetricsInstruments {
|
||||
pub connections_active: UpDownCounter<i64>,
|
||||
pub connections_total: Counter<u64>,
|
||||
pub disconnections_total: Counter<u64>,
|
||||
pub messages_received_total: Counter<u64>,
|
||||
pub messages_sent_total: Counter<u64>,
|
||||
pub event_handling_duration: Histogram<f64>,
|
||||
pub db_query_duration: Histogram<f64>,
|
||||
pub engine_sessions_active: UpDownCounter<i64>,
|
||||
pub namespaces_active: UpDownCounter<i64>,
|
||||
pub gprc_calls_total: Counter<u64>,
|
||||
pub gprc_call_errors_total: Counter<u64>,
|
||||
pub adapter_broadcasts_total: Counter<u64>,
|
||||
}
|
||||
|
||||
/// Initialize the Prometheus meter provider and create all metric instruments.
|
||||
pub fn init_metrics(
|
||||
_config: &super::config::TelemetryConfig,
|
||||
resource: &Resource,
|
||||
) -> ImksResult<(SdkMeterProvider, MetricsInstruments)> {
|
||||
let registry = Registry::new();
|
||||
PROMETHEUS_REGISTRY
|
||||
.set(registry.clone())
|
||||
.expect("Prometheus registry already initialized");
|
||||
|
||||
let exporter = opentelemetry_prometheus::exporter()
|
||||
.with_registry(registry)
|
||||
.build()
|
||||
.map_err(|e| crate::ImksError::Internal(format!("failed to build Prometheus exporter: {e}")))?;
|
||||
|
||||
let provider = SdkMeterProvider::builder()
|
||||
.with_resource(resource.clone())
|
||||
.with_reader(exporter)
|
||||
.build();
|
||||
|
||||
global::set_meter_provider(provider.clone());
|
||||
|
||||
let meter = global::meter_with_scope(
|
||||
opentelemetry::InstrumentationScope::builder("imks")
|
||||
.with_version(env!("CARGO_PKG_VERSION"))
|
||||
.build(),
|
||||
);
|
||||
|
||||
let instruments = MetricsInstruments::new(&meter);
|
||||
METRICS
|
||||
.set(instruments.clone())
|
||||
.expect("Metrics instruments already initialized");
|
||||
|
||||
Ok((provider, instruments))
|
||||
}
|
||||
|
||||
/// Obtain the globally initialized metrics. Panics if not initialized.
|
||||
pub fn get() -> MetricsInstruments {
|
||||
METRICS
|
||||
.get()
|
||||
.expect("Metrics not initialized — call init_metrics first")
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Obtain the globally initialized metrics, returning `None` if not initialized.
|
||||
/// Prefer this in library code that may run before metrics are set up (e.g., tests).
|
||||
pub fn try_get() -> Option<MetricsInstruments> {
|
||||
METRICS.get().cloned()
|
||||
}
|
||||
|
||||
impl MetricsInstruments {
|
||||
fn new(meter: &Meter) -> Self {
|
||||
Self {
|
||||
connections_active: meter
|
||||
.i64_up_down_counter("imks_connections_active")
|
||||
.with_description("Number of active Socket.IO connections")
|
||||
.build(),
|
||||
connections_total: meter
|
||||
.u64_counter("imks_connections_total")
|
||||
.with_description("Total number of socket connections since start")
|
||||
.build(),
|
||||
disconnections_total: meter
|
||||
.u64_counter("imks_disconnections_total")
|
||||
.with_description("Total number of socket disconnections since start")
|
||||
.build(),
|
||||
messages_received_total: meter
|
||||
.u64_counter("imks_messages_received_total")
|
||||
.with_description("Total number of messages received from clients")
|
||||
.build(),
|
||||
messages_sent_total: meter
|
||||
.u64_counter("imks_messages_sent_total")
|
||||
.with_description("Total number of messages sent to clients")
|
||||
.build(),
|
||||
event_handling_duration: meter
|
||||
.f64_histogram("imks_event_handling_duration_seconds")
|
||||
.with_description("Socket.IO event handling latency in seconds")
|
||||
.build(),
|
||||
db_query_duration: meter
|
||||
.f64_histogram("imks_db_query_duration_seconds")
|
||||
.with_description("Database query duration in seconds")
|
||||
.build(),
|
||||
engine_sessions_active: meter
|
||||
.i64_up_down_counter("imks_engine_sessions_active")
|
||||
.with_description("Number of active Engine.IO sessions")
|
||||
.build(),
|
||||
namespaces_active: meter
|
||||
.i64_up_down_counter("imks_namespaces_active")
|
||||
.with_description("Number of active Socket.IO namespaces")
|
||||
.build(),
|
||||
gprc_calls_total: meter
|
||||
.u64_counter("imks_gprc_calls_total")
|
||||
.with_description("Total number of gRPC calls to appks")
|
||||
.build(),
|
||||
gprc_call_errors_total: meter
|
||||
.u64_counter("imks_gprc_call_errors_total")
|
||||
.with_description("Total number of failed gRPC calls to appks")
|
||||
.build(),
|
||||
adapter_broadcasts_total: meter
|
||||
.u64_counter("imks_adapter_broadcasts_total")
|
||||
.with_description("Total number of cross-node adapter broadcasts")
|
||||
.build(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper: create KV attributes for an event.
|
||||
pub fn event_attrs(event: &str) -> [KeyValue; 1] {
|
||||
[KeyValue::new("event", event.to_string())]
|
||||
}
|
||||
|
||||
/// Helper: create KV attributes for a namespace.
|
||||
pub fn namespace_attrs(ns: &str) -> [KeyValue; 1] {
|
||||
[KeyValue::new("namespace", ns.to_string())]
|
||||
}
|
||||
}
|
||||
|
||||
/// Actix-web handler for `GET /metrics`.
|
||||
///
|
||||
/// Encodes the Prometheus text format from the shared registry.
|
||||
pub async fn metrics_handler() -> actix_web::HttpResponse {
|
||||
let registry = PROMETHEUS_REGISTRY.get().expect("Prometheus registry not initialized");
|
||||
|
||||
let metric_families = registry.gather();
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = Vec::new();
|
||||
if encoder.encode(&metric_families, &mut buffer).is_err() {
|
||||
return actix_web::HttpResponse::InternalServerError().body("failed to encode metrics");
|
||||
}
|
||||
|
||||
actix_web::HttpResponse::Ok()
|
||||
.content_type("text/plain; version=0.0.4")
|
||||
.body(buffer)
|
||||
}
|
||||
Reference in New Issue
Block a user