feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration - Implement connection tracking with active/total/disconnection metrics - Add health endpoint with uptime and connection counts - Integrate tracing spans for socket events and engine messages - Add metrics collection for event handling duration - Update health endpoint to include live runtime state - Add graceful telemetry shutdown in main function - Implement engine session active metrics tracking - Add namespace-specific attributes to connection metrics - Introduce message edit history retrieval endpoint - Add scheduled message CRUD operations and dispatcher - Update Socket.IO event registration with observability - Refactor component update to remove dead code allowance - Add comprehensive environment variables documentation - Implement detailed development guidelines in AGENTS.md
This commit is contained in:
@@ -0,0 +1,85 @@
|
||||
/// Telemetry configuration, populated from environment variables.
|
||||
///
|
||||
/// Follows the OpenTelemetry environment variable specification:
|
||||
/// <https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/>
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TelemetryConfig {
|
||||
pub service_name: String,
|
||||
pub service_version: String,
|
||||
pub otlp_endpoint: String,
|
||||
pub otlp_protocol: OtlpProtocol,
|
||||
pub traces_enabled: bool,
|
||||
pub metrics_enabled: bool,
|
||||
pub logs_enabled: bool,
|
||||
pub log_format: LogFormat,
|
||||
pub log_level: String,
|
||||
}
|
||||
|
||||
/// Transport protocol used by the OTLP exporters.
///
/// `Eq` is derived alongside `PartialEq`: the enum has no fields, so equality
/// is total and the type can be used wherever `Eq` is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OtlpProtocol {
    /// OTLP over gRPC.
    Grpc,
    /// OTLP over HTTP with a binary protobuf payload.
    HttpProtobuf,
}
|
||||
|
||||
/// Console log output format.
///
/// `Eq` is derived alongside `PartialEq`: the enum has no fields, so equality
/// is total and the type can be used wherever `Eq` is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogFormat {
    /// Structured JSON records.
    Json,
    /// Human-readable text.
    Pretty,
}
|
||||
|
||||
impl Default for TelemetryConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
service_name: env_or("OTEL_SERVICE_NAME", "imks"),
|
||||
service_version: env_or("OTEL_SERVICE_VERSION", env!("CARGO_PKG_VERSION")),
|
||||
otlp_endpoint: env_or(
|
||||
"OTEL_EXPORTER_OTLP_ENDPOINT",
|
||||
"http://localhost:4317",
|
||||
),
|
||||
otlp_protocol: detect_otlp_protocol(),
|
||||
traces_enabled: env_bool("OTEL_TRACES_ENABLED", true),
|
||||
metrics_enabled: env_bool("OTEL_METRICS_ENABLED", true),
|
||||
logs_enabled: env_bool("OTEL_LOGS_ENABLED", true),
|
||||
log_format: detect_log_format(),
|
||||
log_level: env_or("RUST_LOG", "info"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TelemetryConfig {
|
||||
pub fn from_env() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the environment variable `key`, falling back to `default`.
///
/// The fallback is used when the variable is unset, is not valid Unicode, or
/// is empty. The OpenTelemetry environment variable specification requires an
/// empty value to be interpreted the same way as an unset variable.
fn env_or(key: &str, default: &str) -> String {
    match std::env::var(key) {
        Ok(value) if !value.is_empty() => value,
        _ => default.to_owned(),
    }
}
|
||||
|
||||
/// Reads the environment variable `key` as a boolean flag.
///
/// Returns `default` when the variable is unset, unreadable, or empty after
/// trimming whitespace (an empty value is treated as unset). Otherwise returns
/// `true` for `true`, `1`, `yes` or `on` (case-insensitive) and `false` for
/// any other value.
fn env_bool(key: &str, default: bool) -> bool {
    const TRUTHY: [&str; 4] = ["true", "1", "yes", "on"];

    let Ok(raw) = std::env::var(key) else {
        return default;
    };
    let value = raw.trim();
    if value.is_empty() {
        return default;
    }
    TRUTHY.iter().any(|token| value.eq_ignore_ascii_case(token))
}
|
||||
|
||||
fn detect_otlp_protocol() -> OtlpProtocol {
|
||||
match std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL")
|
||||
.unwrap_or_default()
|
||||
.to_lowercase()
|
||||
.as_str()
|
||||
{
|
||||
"http/protobuf" | "http/binary" => OtlpProtocol::HttpProtobuf,
|
||||
_ => OtlpProtocol::Grpc, // default to gRPC as project already depends on tonic
|
||||
}
|
||||
}
|
||||
|
||||
fn detect_log_format() -> LogFormat {
|
||||
match std::env::var("LOG_FORMAT")
|
||||
.unwrap_or_else(|_| "json".to_string())
|
||||
.to_lowercase()
|
||||
.as_str()
|
||||
{
|
||||
"pretty" | "text" | "console" => LogFormat::Pretty,
|
||||
_ => LogFormat::Json, // default to JSON for structured logging
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
//! Enhanced health check endpoint with upstream dependency checks.
|
||||
//!
|
||||
//! Returns JSON with server status, version, uptime, connection counts,
|
||||
//! and optional health checks for PostgreSQL, Redis, NATS, and gRPC.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::OnceLock;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::time::Instant;
|
||||
|
||||
use actix_web::HttpResponse;
|
||||
use serde::Serialize;
|
||||
|
||||
/// Server start time, set once by [`record_start_time`].
static START_TIME: OnceLock<Instant> = OnceLock::new();

/// Process-wide count of live socket connections.
///
/// Stays unset until [`init_counters`] runs; connect and disconnect signals
/// received before that are ignored.
static CONNECTIONS_ACTIVE: OnceLock<AtomicU64> = OnceLock::new();

/// Records the server start time (call once during startup).
///
/// Later calls leave the first recorded instant untouched.
pub fn record_start_time() {
    let _ = START_TIME.set(Instant::now());
}

/// Initializes the shared health counters (call once during startup).
///
/// Later calls leave the existing counter, and its current value, untouched.
pub fn init_counters() {
    let _ = CONNECTIONS_ACTIVE.set(AtomicU64::new(0));
}

/// Signals that a new socket connection was established.
pub fn connection_connected() {
    if let Some(counter) = CONNECTIONS_ACTIVE.get() {
        counter.fetch_add(1, Ordering::Relaxed);
    }
}

/// Signals that a socket connection was closed.
///
/// The counter saturates at zero. A disconnect can arrive without a counted
/// connect (the connect may have happened before [`init_counters`]), and a
/// plain `fetch_sub` would then wrap the gauge around to `u64::MAX`.
pub fn connection_disconnected() {
    if let Some(counter) = CONNECTIONS_ACTIVE.get() {
        // `checked_sub` returns `None` at zero, which makes `fetch_update`
        // leave the stored value unchanged.
        let _ = counter.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
            current.checked_sub(1)
        });
    }
}

/// Returns the current number of active socket connections, or 0 when the
/// counters have not been initialized.
pub fn connections_active_count() -> u64 {
    match CONNECTIONS_ACTIVE.get() {
        Some(counter) => counter.load(Ordering::Relaxed),
        None => 0,
    }
}

/// Returns the server uptime in whole seconds, or 0 when the start time has
/// not been recorded.
pub fn uptime_secs() -> u64 {
    match START_TIME.get() {
        Some(started) => started.elapsed().as_secs(),
        None => 0,
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct HealthResponse {
|
||||
pub status: String,
|
||||
pub version: String,
|
||||
pub timestamp: String,
|
||||
pub uptime_secs: u64,
|
||||
pub connections_active: u64,
|
||||
pub sessions_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub checks: Option<HealthChecks>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct HealthChecks {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub postgres: Option<CheckResult>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub redis: Option<CheckResult>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub nats: Option<CheckResult>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub grpc: Option<CheckResult>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct CheckResult {
|
||||
pub status: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latency_ms: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
/// Optional external check functions.
|
||||
/// Each returns `Some(CheckResult)` if the service is configured, `None` otherwise.
|
||||
#[derive(Default)]
|
||||
pub struct HealthCheckFns {
|
||||
pub check_postgres: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
|
||||
pub check_redis: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
|
||||
pub check_nats: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
|
||||
pub check_grpc: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
|
||||
}
|
||||
|
||||
impl HealthCheckFns {
|
||||
pub fn with_postgres(mut self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
|
||||
self.check_postgres = Some(Arc::new(f));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_redis(mut self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
|
||||
self.check_redis = Some(Arc::new(f));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_nats(mut self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
|
||||
self.check_nats = Some(Arc::new(f));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_grpc(mut self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
|
||||
self.check_grpc = Some(Arc::new(f));
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// GET /health handler with dependency checks.
|
||||
pub async fn health_check(checks: actix_web::web::Data<Arc<HealthCheckFns>>) -> HttpResponse {
|
||||
let checks = checks.get_ref();
|
||||
|
||||
let health_checks = if checks.check_postgres.is_some()
|
||||
|| checks.check_redis.is_some()
|
||||
|| checks.check_nats.is_some()
|
||||
|| checks.check_grpc.is_some()
|
||||
{
|
||||
Some(HealthChecks {
|
||||
postgres: checks.check_postgres.as_ref().map(|f| f()),
|
||||
redis: checks.check_redis.as_ref().map(|f| f()),
|
||||
nats: checks.check_nats.as_ref().map(|f| f()),
|
||||
grpc: checks.check_grpc.as_ref().map(|f| f()),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let overall_status = if let Some(ref hc) = health_checks {
|
||||
let all_up = [&hc.postgres, &hc.redis, &hc.nats, &hc.grpc]
|
||||
.iter()
|
||||
.filter_map(|c| c.as_ref())
|
||||
.all(|c| c.status == "up");
|
||||
if all_up {
|
||||
"healthy"
|
||||
} else {
|
||||
"degraded"
|
||||
}
|
||||
} else {
|
||||
"healthy"
|
||||
};
|
||||
|
||||
let response = HealthResponse {
|
||||
status: overall_status.to_string(),
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
uptime_secs: uptime_secs(),
|
||||
connections_active: 0,
|
||||
sessions_count: 0,
|
||||
checks: health_checks,
|
||||
};
|
||||
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
//! Log export: JSON console output + OpenTelemetry log bridge (OTLP).
|
||||
|
||||
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
|
||||
use opentelemetry_otlp::{LogExporter, Protocol, WithExportConfig};
|
||||
use opentelemetry_sdk::logs::SdkLoggerProvider;
|
||||
use opentelemetry_sdk::Resource;
|
||||
use tracing_subscriber::fmt::format::FmtSpan;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use tracing_subscriber::Registry;
|
||||
|
||||
use super::config::{OtlpProtocol, TelemetryConfig};
|
||||
use crate::ImksResult;
|
||||
|
||||
/// Initialize the tracing subscriber.
|
||||
///
|
||||
/// Layer order (critical for OpenTelemetry compatibility):
|
||||
/// 1. Registry
|
||||
/// 2. OpenTelemetry trace layer (must be first — needs LookupSpan)
|
||||
/// 3. EnvFilter
|
||||
/// 4. Console formatting layer (JSON)
|
||||
/// 5. OpenTelemetry log bridge
|
||||
///
|
||||
/// Returns the SdkLoggerProvider for graceful shutdown.
|
||||
pub fn init_subscriber(
|
||||
config: &TelemetryConfig,
|
||||
resource: Option<&Resource>,
|
||||
otel_trace_layer: Option<
|
||||
tracing_opentelemetry::OpenTelemetryLayer<Registry, opentelemetry_sdk::trace::Tracer>,
|
||||
>,
|
||||
) -> ImksResult<SdkLoggerProvider> {
|
||||
let env_filter =
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&config.log_level));
|
||||
|
||||
let (logger_provider, log_bridge_layer) = if config.logs_enabled {
|
||||
let exporter = build_log_exporter(config)?;
|
||||
|
||||
let resource = resource.cloned().unwrap_or_else(|| Resource::builder().build());
|
||||
|
||||
let provider = SdkLoggerProvider::builder()
|
||||
.with_resource(resource)
|
||||
.with_batch_exporter(exporter)
|
||||
.build();
|
||||
|
||||
let bridge = OpenTelemetryTracingBridge::new(&provider);
|
||||
(Some(provider), Some(bridge))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
match (otel_trace_layer, log_bridge_layer) {
|
||||
(Some(trace_layer), Some(log_layer)) => {
|
||||
let subscriber = Registry::default()
|
||||
.with(trace_layer)
|
||||
.with(env_filter)
|
||||
.with(make_json_fmt())
|
||||
.with(log_layer);
|
||||
set_subscriber(subscriber);
|
||||
}
|
||||
(Some(trace_layer), None) => {
|
||||
let subscriber = Registry::default()
|
||||
.with(trace_layer)
|
||||
.with(env_filter)
|
||||
.with(make_json_fmt());
|
||||
set_subscriber(subscriber);
|
||||
}
|
||||
(None, Some(log_layer)) => {
|
||||
let subscriber = Registry::default()
|
||||
.with(env_filter)
|
||||
.with(make_json_fmt())
|
||||
.with(log_layer);
|
||||
set_subscriber(subscriber);
|
||||
}
|
||||
(None, None) => {
|
||||
let subscriber = Registry::default()
|
||||
.with(env_filter)
|
||||
.with(make_json_fmt());
|
||||
set_subscriber(subscriber);
|
||||
}
|
||||
}
|
||||
|
||||
let logger_provider = logger_provider.unwrap_or_else(|| SdkLoggerProvider::builder().build());
|
||||
|
||||
Ok(logger_provider)
|
||||
}
|
||||
|
||||
/// Create the JSON fmt layer with span context.
|
||||
fn make_json_fmt<S>() -> tracing_subscriber::fmt::Layer<
|
||||
S,
|
||||
tracing_subscriber::fmt::format::JsonFields,
|
||||
tracing_subscriber::fmt::format::Format<tracing_subscriber::fmt::format::Json>,
|
||||
>
|
||||
where
|
||||
S: tracing::Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a>,
|
||||
{
|
||||
tracing_subscriber::fmt::layer()
|
||||
.json()
|
||||
.with_span_events(FmtSpan::CLOSE)
|
||||
.with_current_span(true)
|
||||
.with_span_list(true)
|
||||
}
|
||||
|
||||
fn set_subscriber<S>(subscriber: S)
|
||||
where
|
||||
S: tracing::Subscriber + Send + Sync + 'static,
|
||||
{
|
||||
match tracing::subscriber::set_global_default(subscriber) {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
tracing::warn!("Could not set global tracing subscriber: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn build_log_exporter(config: &TelemetryConfig) -> ImksResult<LogExporter> {
|
||||
match config.otlp_protocol {
|
||||
OtlpProtocol::Grpc => LogExporter::builder()
|
||||
.with_tonic()
|
||||
.with_endpoint(&config.otlp_endpoint)
|
||||
.build()
|
||||
.map_err(|e| crate::ImksError::Internal(format!("OTLP gRPC log exporter: {e}"))),
|
||||
OtlpProtocol::HttpProtobuf => LogExporter::builder()
|
||||
.with_http()
|
||||
.with_protocol(Protocol::HttpBinary)
|
||||
.with_endpoint(&config.otlp_endpoint)
|
||||
.build()
|
||||
.map_err(|e| crate::ImksError::Internal(format!("OTLP HTTP log exporter: {e}"))),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,168 @@
|
||||
//! Prometheus metrics: global meter provider, registry, and the /metrics actix-web handler.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use opentelemetry::global;
|
||||
use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry_sdk::metrics::SdkMeterProvider;
|
||||
use opentelemetry_sdk::Resource;
|
||||
use prometheus::{Encoder, Registry, TextEncoder};
|
||||
|
||||
use crate::ImksResult;
|
||||
|
||||
/// Shared Prometheus registry, lazily initialized.
|
||||
static PROMETHEUS_REGISTRY: OnceLock<Registry> = OnceLock::new();
|
||||
|
||||
/// Global metrics instruments, initialized once at startup.
|
||||
static METRICS: OnceLock<MetricsInstruments> = OnceLock::new();
|
||||
|
||||
/// All application metrics instruments.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MetricsInstruments {
|
||||
pub connections_active: UpDownCounter<i64>,
|
||||
pub connections_total: Counter<u64>,
|
||||
pub disconnections_total: Counter<u64>,
|
||||
pub messages_received_total: Counter<u64>,
|
||||
pub messages_sent_total: Counter<u64>,
|
||||
pub event_handling_duration: Histogram<f64>,
|
||||
pub db_query_duration: Histogram<f64>,
|
||||
pub engine_sessions_active: UpDownCounter<i64>,
|
||||
pub namespaces_active: UpDownCounter<i64>,
|
||||
pub gprc_calls_total: Counter<u64>,
|
||||
pub gprc_call_errors_total: Counter<u64>,
|
||||
pub adapter_broadcasts_total: Counter<u64>,
|
||||
}
|
||||
|
||||
/// Initialize the Prometheus meter provider and create all metric instruments.
|
||||
pub fn init_metrics(
|
||||
_config: &super::config::TelemetryConfig,
|
||||
resource: &Resource,
|
||||
) -> ImksResult<(SdkMeterProvider, MetricsInstruments)> {
|
||||
let registry = Registry::new();
|
||||
PROMETHEUS_REGISTRY
|
||||
.set(registry.clone())
|
||||
.expect("Prometheus registry already initialized");
|
||||
|
||||
let exporter = opentelemetry_prometheus::exporter()
|
||||
.with_registry(registry)
|
||||
.build()
|
||||
.map_err(|e| crate::ImksError::Internal(format!("failed to build Prometheus exporter: {e}")))?;
|
||||
|
||||
let provider = SdkMeterProvider::builder()
|
||||
.with_resource(resource.clone())
|
||||
.with_reader(exporter)
|
||||
.build();
|
||||
|
||||
global::set_meter_provider(provider.clone());
|
||||
|
||||
let meter = global::meter_with_scope(
|
||||
opentelemetry::InstrumentationScope::builder("imks")
|
||||
.with_version(env!("CARGO_PKG_VERSION"))
|
||||
.build(),
|
||||
);
|
||||
|
||||
let instruments = MetricsInstruments::new(&meter);
|
||||
METRICS
|
||||
.set(instruments.clone())
|
||||
.expect("Metrics instruments already initialized");
|
||||
|
||||
Ok((provider, instruments))
|
||||
}
|
||||
|
||||
/// Obtain the globally initialized metrics. Panics if not initialized.
|
||||
pub fn get() -> MetricsInstruments {
|
||||
METRICS
|
||||
.get()
|
||||
.expect("Metrics not initialized — call init_metrics first")
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Obtain the globally initialized metrics, returning `None` if not initialized.
|
||||
/// Prefer this in library code that may run before metrics are set up (e.g., tests).
|
||||
pub fn try_get() -> Option<MetricsInstruments> {
|
||||
METRICS.get().cloned()
|
||||
}
|
||||
|
||||
impl MetricsInstruments {
|
||||
fn new(meter: &Meter) -> Self {
|
||||
Self {
|
||||
connections_active: meter
|
||||
.i64_up_down_counter("imks_connections_active")
|
||||
.with_description("Number of active Socket.IO connections")
|
||||
.build(),
|
||||
connections_total: meter
|
||||
.u64_counter("imks_connections_total")
|
||||
.with_description("Total number of socket connections since start")
|
||||
.build(),
|
||||
disconnections_total: meter
|
||||
.u64_counter("imks_disconnections_total")
|
||||
.with_description("Total number of socket disconnections since start")
|
||||
.build(),
|
||||
messages_received_total: meter
|
||||
.u64_counter("imks_messages_received_total")
|
||||
.with_description("Total number of messages received from clients")
|
||||
.build(),
|
||||
messages_sent_total: meter
|
||||
.u64_counter("imks_messages_sent_total")
|
||||
.with_description("Total number of messages sent to clients")
|
||||
.build(),
|
||||
event_handling_duration: meter
|
||||
.f64_histogram("imks_event_handling_duration_seconds")
|
||||
.with_description("Socket.IO event handling latency in seconds")
|
||||
.build(),
|
||||
db_query_duration: meter
|
||||
.f64_histogram("imks_db_query_duration_seconds")
|
||||
.with_description("Database query duration in seconds")
|
||||
.build(),
|
||||
engine_sessions_active: meter
|
||||
.i64_up_down_counter("imks_engine_sessions_active")
|
||||
.with_description("Number of active Engine.IO sessions")
|
||||
.build(),
|
||||
namespaces_active: meter
|
||||
.i64_up_down_counter("imks_namespaces_active")
|
||||
.with_description("Number of active Socket.IO namespaces")
|
||||
.build(),
|
||||
gprc_calls_total: meter
|
||||
.u64_counter("imks_gprc_calls_total")
|
||||
.with_description("Total number of gRPC calls to appks")
|
||||
.build(),
|
||||
gprc_call_errors_total: meter
|
||||
.u64_counter("imks_gprc_call_errors_total")
|
||||
.with_description("Total number of failed gRPC calls to appks")
|
||||
.build(),
|
||||
adapter_broadcasts_total: meter
|
||||
.u64_counter("imks_adapter_broadcasts_total")
|
||||
.with_description("Total number of cross-node adapter broadcasts")
|
||||
.build(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper: create KV attributes for an event.
|
||||
pub fn event_attrs(event: &str) -> [KeyValue; 1] {
|
||||
[KeyValue::new("event", event.to_string())]
|
||||
}
|
||||
|
||||
/// Helper: create KV attributes for a namespace.
|
||||
pub fn namespace_attrs(ns: &str) -> [KeyValue; 1] {
|
||||
[KeyValue::new("namespace", ns.to_string())]
|
||||
}
|
||||
}
|
||||
|
||||
/// Actix-web handler for `GET /metrics`.
|
||||
///
|
||||
/// Encodes the Prometheus text format from the shared registry.
|
||||
pub async fn metrics_handler() -> actix_web::HttpResponse {
|
||||
let registry = PROMETHEUS_REGISTRY.get().expect("Prometheus registry not initialized");
|
||||
|
||||
let metric_families = registry.gather();
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = Vec::new();
|
||||
if encoder.encode(&metric_families, &mut buffer).is_err() {
|
||||
return actix_web::HttpResponse::InternalServerError().body("failed to encode metrics");
|
||||
}
|
||||
|
||||
actix_web::HttpResponse::Ok()
|
||||
.content_type("text/plain; version=0.0.4")
|
||||
.body(buffer)
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
//! Telemetry module — OpenTelemetry-compatible observability stack.
|
||||
//!
|
||||
//! Provides:
|
||||
//! - **Traces**: distributed tracing via OTLP (gRPC or HTTP) with W3C TraceContext propagation
|
||||
//! - **Metrics**: Prometheus-compatible metrics exposed at `/metrics`
|
||||
//! - **Logs**: JSON console output, plus OTLP log export bridge
|
||||
//! - **Health**: enhanced `/health` endpoint with upstream dependency checks
|
||||
//!
|
||||
//! # Quick start
|
||||
//!
|
||||
//! ```ignore
|
||||
//! let guard = telemetry::init();
|
||||
//! // ... application runs ...
|
||||
//! guard.shutdown(); // graceful shutdown, flushes all pending telemetry (dropping the guard does not flush)
|
||||
//! ```
|
||||
//!
|
||||
//! # Environment variables
|
||||
//!
|
||||
//! | Variable | Default | Description |
|
||||
//! |---|---|---|
|
||||
//! | `OTEL_SERVICE_NAME` | `imks` | Service name in traces/metrics/logs |
|
||||
//! | `OTEL_SERVICE_VERSION` | Cargo version | Service version |
|
||||
//! | `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP collector endpoint |
|
||||
//! | `OTEL_EXPORTER_OTLP_PROTOCOL` | `grpc` | `grpc` or `http/protobuf` |
|
||||
//! | `OTEL_TRACES_ENABLED` | `true` | Enable distributed tracing |
|
||||
//! | `OTEL_METRICS_ENABLED` | `true` | Enable Prometheus metrics |
|
||||
//! | `OTEL_LOGS_ENABLED` | `true` | Enable OTLP log export |
|
||||
//! | `LOG_FORMAT` | `json` | `json` or `pretty` (aliases: `text`, `console`) |
|
||||
//! | `RUST_LOG` | `info` | Log level filter |
|
||||
|
||||
pub mod config;
|
||||
pub mod health;
|
||||
pub mod logs;
|
||||
pub mod metrics;
|
||||
pub mod traces;
|
||||
|
||||
use opentelemetry_sdk::Resource;
|
||||
|
||||
pub use config::TelemetryConfig;
|
||||
pub use health::{HealthCheckFns, health_check};
|
||||
pub use metrics::{MetricsInstruments, get as metrics, try_get as try_metrics};
|
||||
|
||||
/// Holds all telemetry providers for graceful shutdown.
|
||||
///
|
||||
/// When `shutdown()` is called, flushes and shuts down all providers in order:
|
||||
/// tracer → meter → logger.
|
||||
pub struct TelemetryGuard {
|
||||
tracer_provider: Option<opentelemetry_sdk::trace::SdkTracerProvider>,
|
||||
meter_provider: Option<opentelemetry_sdk::metrics::SdkMeterProvider>,
|
||||
logger_provider: Option<opentelemetry_sdk::logs::SdkLoggerProvider>,
|
||||
}
|
||||
|
||||
impl TelemetryGuard {
|
||||
/// Flush all pending telemetry and shut down providers.
|
||||
///
|
||||
/// Call this before process exit to avoid data loss.
|
||||
pub fn shutdown(mut self) {
|
||||
if let Some(tp) = self.tracer_provider.take()
|
||||
&& let Ok(rt) = tokio::runtime::Runtime::new()
|
||||
{
|
||||
rt.block_on(async {
|
||||
tp.shutdown().unwrap_or_default();
|
||||
});
|
||||
}
|
||||
if let Some(mp) = self.meter_provider.take()
|
||||
&& let Ok(rt) = tokio::runtime::Runtime::new()
|
||||
{
|
||||
rt.block_on(async {
|
||||
mp.shutdown().unwrap_or_default();
|
||||
});
|
||||
}
|
||||
if let Some(lp) = self.logger_provider.take()
|
||||
&& let Ok(rt) = tokio::runtime::Runtime::new()
|
||||
{
|
||||
rt.block_on(async {
|
||||
lp.shutdown().unwrap_or_default();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Force-flush all pending trace spans (non-blocking best-effort).
|
||||
pub fn flush_traces(&self) {
|
||||
if let Some(ref tp) = self.tracer_provider
|
||||
&& let Ok(rt) = tokio::runtime::Runtime::new()
|
||||
{
|
||||
rt.block_on(async {
|
||||
tp.force_flush().unwrap_or_default();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Force-flush all pending metrics.
|
||||
pub fn flush_metrics(&self) {
|
||||
if let Some(ref mp) = self.meter_provider
|
||||
&& let Ok(rt) = tokio::runtime::Runtime::new()
|
||||
{
|
||||
rt.block_on(async {
|
||||
mp.force_flush().unwrap_or_default();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Dropping the guard performs no flush and no shutdown.
impl Drop for TelemetryGuard {
    fn drop(&mut self) {
        // Intentionally empty.
        // Best-effort: the caller should call shutdown() explicitly before process exit
    }
}
|
||||
|
||||
/// Initialize the full telemetry stack.
|
||||
///
|
||||
/// 1. Creates the OTel Resource (service name, version, host)
|
||||
/// 2. Sets up tracing subscriber with console + JSON + OTel layers
|
||||
/// 3. Initializes Prometheus metrics
|
||||
/// 4. Records server start time for uptime tracking
|
||||
///
|
||||
/// Returns a `TelemetryGuard` that should be held until process exit.
|
||||
pub fn init() -> TelemetryGuard {
|
||||
let config = TelemetryConfig::from_env();
|
||||
|
||||
let resource = Resource::builder()
|
||||
.with_service_name(config.service_name.clone())
|
||||
.with_attribute(opentelemetry::KeyValue::new(
|
||||
"service.version",
|
||||
config.service_version.clone(),
|
||||
))
|
||||
.with_attribute(opentelemetry::KeyValue::new(
|
||||
"deployment.environment",
|
||||
std::env::var("OTEL_RESOURCE_ATTRIBUTES_DEPLOYMENT")
|
||||
.unwrap_or_else(|_| "development".to_string()),
|
||||
))
|
||||
.build();
|
||||
|
||||
// 1. Set up tracing (traces + subscriber)
|
||||
let (tracer_provider, logger_provider) = if config.traces_enabled {
|
||||
match traces::init_tracing(&config, &resource) {
|
||||
Ok((provider, otel_layer)) => {
|
||||
match logs::init_subscriber(&config, Some(&resource), Some(otel_layer)) {
|
||||
Ok(logger_provider) => {
|
||||
tracing::info!(
|
||||
service = %config.service_name,
|
||||
endpoint = %config.otlp_endpoint,
|
||||
protocol = ?config.otlp_protocol,
|
||||
"OpenTelemetry tracing initialized"
|
||||
);
|
||||
(Some(provider), Some(logger_provider))
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to initialize log bridge: {e}. Tracing still active."
|
||||
);
|
||||
(Some(provider), None)
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to initialize OTLP tracing: {e}. Using console-only logging."
|
||||
);
|
||||
match logs::init_subscriber(&config, Some(&resource), None) {
|
||||
Ok(lp) => (None, Some(lp)),
|
||||
Err(_) => {
|
||||
tracing_subscriber::fmt().init();
|
||||
(None, None)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match logs::init_subscriber(&config, Some(&resource), None) {
|
||||
Ok(lp) => (None, Some(lp)),
|
||||
Err(_) => {
|
||||
tracing_subscriber::fmt().init();
|
||||
(None, None)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// 2. Metrics
|
||||
let meter_provider = if config.metrics_enabled {
|
||||
match metrics::init_metrics(&config, &resource) {
|
||||
Ok((provider, _instruments)) => {
|
||||
tracing::info!("Prometheus metrics initialized (available at /metrics)");
|
||||
Some(provider)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to initialize Prometheus metrics: {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// 3. Record start time for uptime
|
||||
health::record_start_time();
|
||||
|
||||
TelemetryGuard {
|
||||
tracer_provider,
|
||||
meter_provider,
|
||||
logger_provider,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
//! OpenTelemetry distributed tracing — OTLP exporter + tracing-opentelemetry bridge.
|
||||
|
||||
use opentelemetry::trace::TracerProvider as _;
|
||||
use opentelemetry_otlp::{Protocol, SpanExporter, WithExportConfig};
|
||||
use opentelemetry_sdk::propagation::TraceContextPropagator;
|
||||
use opentelemetry_sdk::trace::{SdkTracerProvider, Tracer};
|
||||
use opentelemetry_sdk::Resource;
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::Registry;
|
||||
|
||||
use super::config::{OtlpProtocol, TelemetryConfig};
|
||||
use crate::ImksResult;
|
||||
|
||||
/// Build an OTLP SpanExporter based on the configured protocol.
|
||||
fn build_span_exporter(config: &TelemetryConfig) -> ImksResult<SpanExporter> {
|
||||
match config.otlp_protocol {
|
||||
OtlpProtocol::Grpc => SpanExporter::builder()
|
||||
.with_tonic()
|
||||
.with_endpoint(&config.otlp_endpoint)
|
||||
.build()
|
||||
.map_err(|e| crate::ImksError::Internal(format!("OTLP gRPC span exporter: {e}"))),
|
||||
OtlpProtocol::HttpProtobuf => SpanExporter::builder()
|
||||
.with_http()
|
||||
.with_protocol(Protocol::HttpBinary)
|
||||
.with_endpoint(&config.otlp_endpoint)
|
||||
.build()
|
||||
.map_err(|e| {
|
||||
crate::ImksError::Internal(format!("OTLP HTTP span exporter: {e}"))
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the tracing pipeline: OTel tracer provider + tracing-opentelemetry layer.
|
||||
///
|
||||
/// Returns (SdkTracerProvider, OpenTelemetryLayer).
|
||||
pub fn init_tracing(
|
||||
config: &TelemetryConfig,
|
||||
resource: &Resource,
|
||||
) -> ImksResult<(SdkTracerProvider, OpenTelemetryLayer<Registry, Tracer>)> {
|
||||
// Set global propagator for W3C TraceContext extraction/injection
|
||||
opentelemetry::global::set_text_map_propagator(TraceContextPropagator::new());
|
||||
|
||||
let exporter = build_span_exporter(config)?;
|
||||
|
||||
let provider = SdkTracerProvider::builder()
|
||||
.with_resource(resource.clone())
|
||||
.with_batch_exporter(exporter)
|
||||
.build();
|
||||
|
||||
let tracer = provider.tracer("imks");
|
||||
|
||||
let otel_layer = tracing_opentelemetry::layer().with_tracer(tracer);
|
||||
|
||||
Ok((provider, otel_layer))
|
||||
}
|
||||
Reference in New Issue
Block a user