feat(telemetry): integrate OpenTelemetry observability stack with health metrics

- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
zhenyi
2026-06-11 13:53:29 +08:00
parent 40241e5db3
commit 0dbac480ae
22 changed files with 3116 additions and 64 deletions
+85
View File
@@ -0,0 +1,85 @@
/// Telemetry settings resolved from the process environment.
///
/// Variable naming follows the OpenTelemetry environment variable specification:
/// <https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/>
#[derive(Debug, Clone)]
pub struct TelemetryConfig {
    /// Service name (`OTEL_SERVICE_NAME`, default `imks`).
    pub service_name: String,
    /// Service version (`OTEL_SERVICE_VERSION`, default: the crate version).
    pub service_version: String,
    /// OTLP collector endpoint (`OTEL_EXPORTER_OTLP_ENDPOINT`).
    pub otlp_endpoint: String,
    /// OTLP transport (`OTEL_EXPORTER_OTLP_PROTOCOL`).
    pub otlp_protocol: OtlpProtocol,
    /// Whether trace export is enabled (`OTEL_TRACES_ENABLED`).
    pub traces_enabled: bool,
    /// Whether metrics are enabled (`OTEL_METRICS_ENABLED`).
    pub metrics_enabled: bool,
    /// Whether OTLP log export is enabled (`OTEL_LOGS_ENABLED`).
    pub logs_enabled: bool,
    /// Requested console log format (`LOG_FORMAT`).
    pub log_format: LogFormat,
    /// Log filter directives (`RUST_LOG`, default `info`).
    pub log_level: String,
}
/// Wire protocol used to talk to the OTLP collector.
///
/// A fieldless enum, so it is `Copy` and `Eq` in addition to the original derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OtlpProtocol {
    /// OTLP over gRPC.
    Grpc,
    /// OTLP over HTTP with protobuf-encoded payloads.
    HttpProtobuf,
}
/// Console log output format selected through `LOG_FORMAT`.
///
/// A fieldless enum, so it is `Copy` and `Eq` in addition to the original derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogFormat {
    /// Structured JSON lines.
    Json,
    /// Human-readable output.
    Pretty,
}
impl Default for TelemetryConfig {
    /// Resolves every setting from the environment, falling back to the
    /// built-in default when a variable is not set.
    fn default() -> Self {
        let service_name = env_or("OTEL_SERVICE_NAME", "imks");
        let service_version = env_or("OTEL_SERVICE_VERSION", env!("CARGO_PKG_VERSION"));
        let otlp_endpoint = env_or("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317");
        let otlp_protocol = detect_otlp_protocol();

        let traces_enabled = env_bool("OTEL_TRACES_ENABLED", true);
        let metrics_enabled = env_bool("OTEL_METRICS_ENABLED", true);
        let logs_enabled = env_bool("OTEL_LOGS_ENABLED", true);

        let log_format = detect_log_format();
        let log_level = env_or("RUST_LOG", "info");

        Self {
            service_name,
            service_version,
            otlp_endpoint,
            otlp_protocol,
            traces_enabled,
            metrics_enabled,
            logs_enabled,
            log_format,
            log_level,
        }
    }
}
impl TelemetryConfig {
    /// Builds the configuration from environment variables.
    ///
    /// Equivalent to [`Default::default`], which performs the environment lookups.
    pub fn from_env() -> Self {
        <Self as Default>::default()
    }
}
/// Returns the value of environment variable `key`, or `default` when the
/// variable cannot be read (unset or not valid Unicode).
fn env_or(key: &str, default: &str) -> String {
    match std::env::var(key) {
        Ok(value) => value,
        Err(_) => default.to_owned(),
    }
}
/// Reads a boolean flag from environment variable `key`.
///
/// Recognised values (case-insensitive, surrounding whitespace ignored):
/// - `true`, `1`, `yes`, `on`  → `true`
/// - `false`, `0`, `no`, `off` → `false`
///
/// Returns `default` when the variable is unset, not valid Unicode, or holds
/// any other value. Previously every unrecognised value (including an empty
/// string) was treated as `false`, which silently disabled features whose
/// default is `true`.
fn env_bool(key: &str, default: bool) -> bool {
    let raw = match std::env::var(key) {
        Ok(value) => value,
        Err(_) => return default,
    };
    match raw.trim().to_lowercase().as_str() {
        "true" | "1" | "yes" | "on" => true,
        "false" | "0" | "no" | "off" => false,
        // Unparseable input: keep the documented default instead of guessing.
        _ => default,
    }
}
/// Selects the OTLP transport from `OTEL_EXPORTER_OTLP_PROTOCOL`.
///
/// `http/protobuf` and `http/binary` (case-insensitive) select HTTP; any
/// other value, or an unset variable, selects gRPC.
fn detect_otlp_protocol() -> OtlpProtocol {
    let requested = std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL")
        .unwrap_or_default()
        .to_lowercase();

    if matches!(requested.as_str(), "http/protobuf" | "http/binary") {
        OtlpProtocol::HttpProtobuf
    } else {
        // default to gRPC as project already depends on tonic
        OtlpProtocol::Grpc
    }
}
/// Selects the console log format from `LOG_FORMAT`.
///
/// `pretty`, `text` and `console` (case-insensitive) select
/// [`LogFormat::Pretty`]; any other value, or an unset variable, selects
/// [`LogFormat::Json`].
fn detect_log_format() -> LogFormat {
    let requested = match std::env::var("LOG_FORMAT") {
        Ok(value) => value.to_lowercase(),
        Err(_) => "json".to_string(),
    };

    if matches!(requested.as_str(), "pretty" | "text" | "console") {
        LogFormat::Pretty
    } else {
        // default to JSON for structured logging
        LogFormat::Json
    }
}
+170
View File
@@ -0,0 +1,170 @@
//! Enhanced health check endpoint with upstream dependency checks.
//!
//! Returns JSON with server status, version, uptime, connection counts,
//! and optional health checks for PostgreSQL, Redis, NATS, and gRPC.
use std::sync::Arc;
use std::sync::OnceLock;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use actix_web::HttpResponse;
use serde::Serialize;
/// Instant at which the server started; written once by `record_start_time`.
static START_TIME: OnceLock<Instant> = OnceLock::new();
/// Live connection counter shared across the process.
/// Updated by the socket layer on connect / disconnect.
///
/// The cell starts empty: `init_counters` installs the zeroed counter, and
/// `connection_connected` / `connection_disconnected` do nothing until then.
static CONNECTIONS_ACTIVE: OnceLock<AtomicU64> = OnceLock::new();
/// Records the current instant as the server start time.
///
/// Only the first call has an effect; later calls leave the stored instant
/// untouched.
pub fn record_start_time() {
    let _ = START_TIME.set(Instant::now());
}
/// Installs the zeroed connection counter.
///
/// Only the first call has an effect; later calls keep the existing counter
/// and its current value.
pub fn init_counters() {
    let _ = CONNECTIONS_ACTIVE.set(AtomicU64::new(0));
}
/// Records that a socket connection was established.
///
/// Does nothing when the counter has not been installed by `init_counters`.
pub fn connection_connected() {
    let Some(counter) = CONNECTIONS_ACTIVE.get() else {
        return;
    };
    counter.fetch_add(1, Ordering::Relaxed);
}
/// Records that a socket connection was closed.
///
/// Does nothing when the counter has not been installed by `init_counters`.
/// The decrement saturates at zero: an unmatched disconnect (for example one
/// whose connect happened before the counter was installed) would otherwise
/// wrap the unsigned counter around to `u64::MAX`.
pub fn connection_disconnected() {
    if let Some(c) = CONNECTIONS_ACTIVE.get() {
        // `checked_sub` yields `None` at zero, which makes `fetch_update`
        // leave the value unchanged and return `Err`; that case is intentional.
        let _ = c.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| n.checked_sub(1));
    }
}
/// Returns the current number of active socket connections, or `0` when the
/// counter has not been installed yet.
pub fn connections_active_count() -> u64 {
    match CONNECTIONS_ACTIVE.get() {
        Some(counter) => counter.load(Ordering::Relaxed),
        None => 0,
    }
}
/// Returns whole seconds elapsed since the recorded start time, or `0` when
/// no start time has been recorded.
pub fn uptime_secs() -> u64 {
    match START_TIME.get() {
        Some(started) => started.elapsed().as_secs(),
        None => 0,
    }
}
/// JSON body returned by the health endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct HealthResponse {
    /// Overall status: `healthy` or `degraded`.
    pub status: String,
    /// Crate version.
    pub version: String,
    /// Time the response was produced, RFC 3339 formatted.
    pub timestamp: String,
    /// Seconds since the recorded server start.
    pub uptime_secs: u64,
    /// Number of active socket connections.
    pub connections_active: u64,
    /// Number of sessions.
    pub sessions_count: u64,
    /// Per-dependency results; left out of the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub checks: Option<HealthChecks>,
}
/// Results of the dependency checks.
///
/// Each field is left out of the serialized JSON when it is `None`.
#[derive(Debug, Clone, Serialize)]
pub struct HealthChecks {
    /// PostgreSQL check result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub postgres: Option<CheckResult>,
    /// Redis check result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub redis: Option<CheckResult>,
    /// NATS check result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nats: Option<CheckResult>,
    /// gRPC check result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub grpc: Option<CheckResult>,
}
/// Outcome of a single dependency check.
#[derive(Debug, Clone, Serialize)]
pub struct CheckResult {
    /// Check status; `health_check` treats exactly `"up"` as healthy.
    pub status: String,
    /// Latency in milliseconds; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub latency_ms: Option<u64>,
    /// Error description; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
/// Optional dependency check callbacks.
///
/// A field is `Some` when a check has been registered for that dependency;
/// the callback itself always returns a [`CheckResult`]. Dependencies whose
/// field is `None` are skipped by `health_check`.
#[derive(Default)]
pub struct HealthCheckFns {
    /// PostgreSQL check callback.
    pub check_postgres: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
    /// Redis check callback.
    pub check_redis: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
    /// NATS check callback.
    pub check_nats: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
    /// gRPC check callback.
    pub check_grpc: Option<Arc<dyn Fn() -> CheckResult + Send + Sync>>,
}
impl HealthCheckFns {
    /// Registers the PostgreSQL check, replacing any previous one.
    pub fn with_postgres(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_postgres: Some(Arc::new(f)),
            ..self
        }
    }

    /// Registers the Redis check, replacing any previous one.
    pub fn with_redis(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_redis: Some(Arc::new(f)),
            ..self
        }
    }

    /// Registers the NATS check, replacing any previous one.
    pub fn with_nats(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_nats: Some(Arc::new(f)),
            ..self
        }
    }

    /// Registers the gRPC check, replacing any previous one.
    pub fn with_grpc(self, f: impl Fn() -> CheckResult + Send + Sync + 'static) -> Self {
        Self {
            check_grpc: Some(Arc::new(f)),
            ..self
        }
    }
}
/// GET /health handler with dependency checks.
pub async fn health_check(checks: actix_web::web::Data<Arc<HealthCheckFns>>) -> HttpResponse {
let checks = checks.get_ref();
let health_checks = if checks.check_postgres.is_some()
|| checks.check_redis.is_some()
|| checks.check_nats.is_some()
|| checks.check_grpc.is_some()
{
Some(HealthChecks {
postgres: checks.check_postgres.as_ref().map(|f| f()),
redis: checks.check_redis.as_ref().map(|f| f()),
nats: checks.check_nats.as_ref().map(|f| f()),
grpc: checks.check_grpc.as_ref().map(|f| f()),
})
} else {
None
};
let overall_status = if let Some(ref hc) = health_checks {
let all_up = [&hc.postgres, &hc.redis, &hc.nats, &hc.grpc]
.iter()
.filter_map(|c| c.as_ref())
.all(|c| c.status == "up");
if all_up {
"healthy"
} else {
"degraded"
}
} else {
"healthy"
};
let response = HealthResponse {
status: overall_status.to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
timestamp: chrono::Utc::now().to_rfc3339(),
uptime_secs: uptime_secs(),
connections_active: 0,
sessions_count: 0,
checks: health_checks,
};
HttpResponse::Ok().json(response)
}
+129
View File
@@ -0,0 +1,129 @@
//! Log export: JSON console output + OpenTelemetry log bridge (OTLP).
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
use opentelemetry_otlp::{LogExporter, Protocol, WithExportConfig};
use opentelemetry_sdk::logs::SdkLoggerProvider;
use opentelemetry_sdk::Resource;
use tracing_subscriber::fmt::format::FmtSpan;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::EnvFilter;
use tracing_subscriber::Registry;
use super::config::{OtlpProtocol, TelemetryConfig};
use crate::ImksResult;
/// Builds and installs the global tracing subscriber.
///
/// Layer order (critical for OpenTelemetry compatibility):
/// 1. Registry
/// 2. OpenTelemetry trace layer (must be first — needs LookupSpan)
/// 3. EnvFilter
/// 4. Console formatting layer (JSON)
/// 5. OpenTelemetry log bridge
///
/// The filter comes from the default environment variable when it parses,
/// otherwise from `config.log_level`. The log bridge is attached only when
/// `config.logs_enabled` is set.
///
/// Returns the `SdkLoggerProvider` for graceful shutdown; when log export is
/// disabled this is a provider built without any exporter.
///
/// # Errors
///
/// Returns an error when the OTLP log exporter cannot be built.
pub fn init_subscriber(
    config: &TelemetryConfig,
    resource: Option<&Resource>,
    otel_trace_layer: Option<
        tracing_opentelemetry::OpenTelemetryLayer<Registry, opentelemetry_sdk::trace::Tracer>,
    >,
) -> ImksResult<SdkLoggerProvider> {
    let filter = match EnvFilter::try_from_default_env() {
        Ok(from_env) => from_env,
        Err(_) => EnvFilter::new(&config.log_level),
    };

    let provider = if config.logs_enabled {
        let exporter = build_log_exporter(config)?;
        let resource = match resource {
            Some(shared) => shared.clone(),
            None => Resource::builder().build(),
        };
        Some(
            SdkLoggerProvider::builder()
                .with_resource(resource)
                .with_batch_exporter(exporter)
                .build(),
        )
    } else {
        None
    };
    let bridge = provider
        .as_ref()
        .map(|p| OpenTelemetryTracingBridge::new(p));

    // Every combination yields a different concrete subscriber type, so each
    // one is assembled and installed in its own arm.
    let registry = Registry::default();
    match (otel_trace_layer, bridge) {
        (Some(traces), Some(logs)) => set_subscriber(
            registry
                .with(traces)
                .with(filter)
                .with(make_json_fmt())
                .with(logs),
        ),
        (Some(traces), None) => {
            set_subscriber(registry.with(traces).with(filter).with(make_json_fmt()))
        }
        (None, Some(logs)) => {
            set_subscriber(registry.with(filter).with(make_json_fmt()).with(logs))
        }
        (None, None) => set_subscriber(registry.with(filter).with(make_json_fmt())),
    }

    Ok(provider.unwrap_or_else(|| SdkLoggerProvider::builder().build()))
}
/// Builds the JSON formatting layer.
///
/// The layer includes the current span and the span list in every record and
/// emits an event when a span closes.
fn make_json_fmt<S>() -> tracing_subscriber::fmt::Layer<
    S,
    tracing_subscriber::fmt::format::JsonFields,
    tracing_subscriber::fmt::format::Format<tracing_subscriber::fmt::format::Json>,
>
where
    S: tracing::Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a>,
{
    let json_layer = tracing_subscriber::fmt::layer().json();
    json_layer
        .with_current_span(true)
        .with_span_list(true)
        .with_span_events(FmtSpan::CLOSE)
}
/// Installs `subscriber` as the global default.
///
/// A failure to install is logged as a warning instead of being returned.
fn set_subscriber<S>(subscriber: S)
where
    S: tracing::Subscriber + Send + Sync + 'static,
{
    if let Err(e) = tracing::subscriber::set_global_default(subscriber) {
        tracing::warn!("Could not set global tracing subscriber: {e}");
    }
}
/// Returns the OTLP/HTTP logs URL for the base collector endpoint `base`.
///
/// An endpoint passed programmatically to the HTTP exporter is used as-is,
/// so the `/v1/logs` signal path is appended here unless already present.
fn otlp_http_logs_endpoint(base: &str) -> String {
    const SIGNAL_PATH: &str = "/v1/logs";
    let base = base.trim_end_matches('/');
    if base.ends_with(SIGNAL_PATH) {
        base.to_string()
    } else {
        format!("{base}{SIGNAL_PATH}")
    }
}

/// Builds the OTLP log exporter for the configured protocol.
///
/// gRPC uses `config.otlp_endpoint` unchanged. HTTP/protobuf uses the same
/// endpoint with `/v1/logs` appended; without it the exporter would post to
/// the bare collector URL.
///
/// # Errors
///
/// Returns `ImksError::Internal` when the exporter cannot be built.
fn build_log_exporter(config: &TelemetryConfig) -> ImksResult<LogExporter> {
    match config.otlp_protocol {
        OtlpProtocol::Grpc => LogExporter::builder()
            .with_tonic()
            .with_endpoint(&config.otlp_endpoint)
            .build()
            .map_err(|e| crate::ImksError::Internal(format!("OTLP gRPC log exporter: {e}"))),
        OtlpProtocol::HttpProtobuf => LogExporter::builder()
            .with_http()
            .with_protocol(Protocol::HttpBinary)
            .with_endpoint(otlp_http_logs_endpoint(&config.otlp_endpoint))
            .build()
            .map_err(|e| crate::ImksError::Internal(format!("OTLP HTTP log exporter: {e}"))),
    }
}
+168
View File
@@ -0,0 +1,168 @@
//! Prometheus metrics: global meter provider, registry, and the /metrics actix-web handler.
use std::sync::OnceLock;
use opentelemetry::global;
use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
use opentelemetry::KeyValue;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::Resource;
use prometheus::{Encoder, Registry, TextEncoder};
use crate::ImksResult;
/// Shared Prometheus registry; set once by `init_metrics` and read by
/// `metrics_handler`.
static PROMETHEUS_REGISTRY: OnceLock<Registry> = OnceLock::new();
/// Global metrics instruments, set once by `init_metrics` and handed out by
/// `get` / `try_get`.
static METRICS: OnceLock<MetricsInstruments> = OnceLock::new();
/// The application's metric instruments, created together by `init_metrics`.
#[derive(Debug, Clone)]
pub struct MetricsInstruments {
    /// Number of active Socket.IO connections.
    pub connections_active: UpDownCounter<i64>,
    /// Total socket connections since start.
    pub connections_total: Counter<u64>,
    /// Total socket disconnections since start.
    pub disconnections_total: Counter<u64>,
    /// Total messages received from clients.
    pub messages_received_total: Counter<u64>,
    /// Total messages sent to clients.
    pub messages_sent_total: Counter<u64>,
    /// Socket.IO event handling latency, in seconds.
    pub event_handling_duration: Histogram<f64>,
    /// Database query duration, in seconds.
    pub db_query_duration: Histogram<f64>,
    /// Number of active Engine.IO sessions.
    pub engine_sessions_active: UpDownCounter<i64>,
    /// Number of active Socket.IO namespaces.
    pub namespaces_active: UpDownCounter<i64>,
    /// Total gRPC calls to appks (the field name is spelled `gprc`).
    pub gprc_calls_total: Counter<u64>,
    /// Total failed gRPC calls to appks (the field name is spelled `gprc`).
    pub gprc_call_errors_total: Counter<u64>,
    /// Total cross-node adapter broadcasts.
    pub adapter_broadcasts_total: Counter<u64>,
}
/// Initialize the Prometheus meter provider and create all metric instruments.
///
/// Steps:
/// 1. build a Prometheus exporter backed by a fresh registry;
/// 2. publish the registry for the `/metrics` handler;
/// 3. install the meter provider globally;
/// 4. create the instruments and publish them for `get` / `try_get`.
///
/// `_config` is currently unused.
///
/// # Errors
///
/// Returns `ImksError::Internal` when the exporter cannot be built or when
/// metrics were already initialized. A repeated call used to panic; it now
/// reports an error, and the registry is published only after the exporter
/// has been built so a failed build leaves no global state behind.
pub fn init_metrics(
    _config: &super::config::TelemetryConfig,
    resource: &Resource,
) -> ImksResult<(SdkMeterProvider, MetricsInstruments)> {
    // Reject a second initialization before touching any global state.
    if PROMETHEUS_REGISTRY.get().is_some() || METRICS.get().is_some() {
        return Err(crate::ImksError::Internal(
            "metrics already initialized".to_string(),
        ));
    }

    let registry = Registry::new();
    let exporter = opentelemetry_prometheus::exporter()
        .with_registry(registry.clone())
        .build()
        .map_err(|e| {
            crate::ImksError::Internal(format!("failed to build Prometheus exporter: {e}"))
        })?;

    PROMETHEUS_REGISTRY.set(registry).map_err(|_| {
        crate::ImksError::Internal("Prometheus registry already initialized".to_string())
    })?;

    let provider = SdkMeterProvider::builder()
        .with_resource(resource.clone())
        .with_reader(exporter)
        .build();
    global::set_meter_provider(provider.clone());

    let meter = global::meter_with_scope(
        opentelemetry::InstrumentationScope::builder("imks")
            .with_version(env!("CARGO_PKG_VERSION"))
            .build(),
    );
    let instruments = MetricsInstruments::new(&meter);
    METRICS.set(instruments.clone()).map_err(|_| {
        crate::ImksError::Internal("Metrics instruments already initialized".to_string())
    })?;

    Ok((provider, instruments))
}
/// Returns a clone of the globally initialized metrics instruments.
///
/// # Panics
///
/// Panics when `init_metrics` has not published the instruments yet; use
/// `try_get` where that can happen.
pub fn get() -> MetricsInstruments {
    match METRICS.get() {
        Some(instruments) => instruments.clone(),
        None => panic!("Metrics not initialized — call init_metrics first"),
    }
}
/// Returns a clone of the globally initialized metrics instruments, or `None`
/// when they have not been initialized.
///
/// Prefer this in code that may run before metrics are set up (e.g., tests).
pub fn try_get() -> Option<MetricsInstruments> {
    let instruments = METRICS.get()?;
    Some(instruments.clone())
}
impl MetricsInstruments {
    /// Creates every instrument on `meter`.
    ///
    /// The two gRPC counters are exported as `imks_grpc_calls_total` and
    /// `imks_grpc_call_errors_total`; they were previously exported with the
    /// misspelling `gprc`. The struct field names keep the old spelling so
    /// existing callers continue to compile.
    fn new(meter: &Meter) -> Self {
        Self {
            connections_active: meter
                .i64_up_down_counter("imks_connections_active")
                .with_description("Number of active Socket.IO connections")
                .build(),
            connections_total: meter
                .u64_counter("imks_connections_total")
                .with_description("Total number of socket connections since start")
                .build(),
            disconnections_total: meter
                .u64_counter("imks_disconnections_total")
                .with_description("Total number of socket disconnections since start")
                .build(),
            messages_received_total: meter
                .u64_counter("imks_messages_received_total")
                .with_description("Total number of messages received from clients")
                .build(),
            messages_sent_total: meter
                .u64_counter("imks_messages_sent_total")
                .with_description("Total number of messages sent to clients")
                .build(),
            event_handling_duration: meter
                .f64_histogram("imks_event_handling_duration_seconds")
                .with_description("Socket.IO event handling latency in seconds")
                .build(),
            db_query_duration: meter
                .f64_histogram("imks_db_query_duration_seconds")
                .with_description("Database query duration in seconds")
                .build(),
            engine_sessions_active: meter
                .i64_up_down_counter("imks_engine_sessions_active")
                .with_description("Number of active Engine.IO sessions")
                .build(),
            namespaces_active: meter
                .i64_up_down_counter("imks_namespaces_active")
                .with_description("Number of active Socket.IO namespaces")
                .build(),
            gprc_calls_total: meter
                .u64_counter("imks_grpc_calls_total")
                .with_description("Total number of gRPC calls to appks")
                .build(),
            gprc_call_errors_total: meter
                .u64_counter("imks_grpc_call_errors_total")
                .with_description("Total number of failed gRPC calls to appks")
                .build(),
            adapter_broadcasts_total: meter
                .u64_counter("imks_adapter_broadcasts_total")
                .with_description("Total number of cross-node adapter broadcasts")
                .build(),
        }
    }

    /// Helper: builds the single-element attribute set `event = <event>`.
    pub fn event_attrs(event: &str) -> [KeyValue; 1] {
        [KeyValue::new("event", event.to_string())]
    }

    /// Helper: builds the single-element attribute set `namespace = <ns>`.
    pub fn namespace_attrs(ns: &str) -> [KeyValue; 1] {
        [KeyValue::new("namespace", ns.to_string())]
    }
}
/// Actix-web handler for `GET /metrics`.
///
/// Encodes the Prometheus text format from the shared registry.
///
/// Responses:
/// - `200` with the encoded metrics;
/// - `503` when the registry has not been initialized (metrics disabled or
///   initialization failed) — this case used to panic the handler;
/// - `500` when encoding fails.
pub async fn metrics_handler() -> actix_web::HttpResponse {
    let Some(registry) = PROMETHEUS_REGISTRY.get() else {
        return actix_web::HttpResponse::ServiceUnavailable().body("metrics not initialized");
    };

    let metric_families = registry.gather();
    let encoder = TextEncoder::new();
    let mut buffer = Vec::new();
    if encoder.encode(&metric_families, &mut buffer).is_err() {
        return actix_web::HttpResponse::InternalServerError().body("failed to encode metrics");
    }
    actix_web::HttpResponse::Ok()
        .content_type("text/plain; version=0.0.4")
        .body(buffer)
}
+203
View File
@@ -0,0 +1,203 @@
//! Telemetry module — OpenTelemetry-compatible observability stack.
//!
//! Provides:
//! - **Traces**: distributed tracing via OTLP (gRPC or HTTP) with W3C TraceContext propagation
//! - **Metrics**: Prometheus-compatible metrics exposed at `/metrics`
//! - **Logs**: JSON console output, plus OTLP log export bridge
//! - **Health**: enhanced `/health` endpoint with upstream dependency checks
//!
//! # Quick start
//!
//! ```ignore
//! let guard = telemetry::init();
//! // ... application runs ...
//! guard.shutdown(); // graceful shutdown, flushes all pending telemetry
//! ```
//!
//! # Environment variables
//!
//! | Variable | Default | Description |
//! |---|---|---|
//! | `OTEL_SERVICE_NAME` | `imks` | Service name in traces/metrics/logs |
//! | `OTEL_SERVICE_VERSION` | Cargo version | Service version |
//! | `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP collector endpoint |
//! | `OTEL_EXPORTER_OTLP_PROTOCOL` | `grpc` | `grpc` or `http/protobuf` |
//! | `OTEL_TRACES_ENABLED` | `true` | Enable distributed tracing |
//! | `OTEL_METRICS_ENABLED` | `true` | Enable Prometheus metrics |
//! | `OTEL_LOGS_ENABLED` | `true` | Enable OTLP log export |
//! | `LOG_FORMAT` | `json` | `json` or `pretty` (`text` and `console` are accepted as aliases for `pretty`) |
//! | `RUST_LOG` | `info` | Log level filter |
pub mod config;
pub mod health;
pub mod logs;
pub mod metrics;
pub mod traces;
use opentelemetry_sdk::Resource;
pub use config::TelemetryConfig;
pub use health::{HealthCheckFns, health_check};
pub use metrics::{MetricsInstruments, get as metrics, try_get as try_metrics};
/// Owns the telemetry providers so they can be shut down gracefully.
///
/// `shutdown()` processes the providers in order: tracer → meter → logger.
/// A field is `None` when the corresponding signal was not initialized.
pub struct TelemetryGuard {
    /// Trace provider, when tracing was initialized.
    tracer_provider: Option<opentelemetry_sdk::trace::SdkTracerProvider>,
    /// Meter provider, when metrics were initialized.
    meter_provider: Option<opentelemetry_sdk::metrics::SdkMeterProvider>,
    /// Logger provider, when the subscriber was initialized.
    logger_provider: Option<opentelemetry_sdk::logs::SdkLoggerProvider>,
}
impl TelemetryGuard {
/// Flush all pending telemetry and shut down providers.
///
/// Call this before process exit to avoid data loss.
pub fn shutdown(mut self) {
if let Some(tp) = self.tracer_provider.take()
&& let Ok(rt) = tokio::runtime::Runtime::new()
{
rt.block_on(async {
tp.shutdown().unwrap_or_default();
});
}
if let Some(mp) = self.meter_provider.take()
&& let Ok(rt) = tokio::runtime::Runtime::new()
{
rt.block_on(async {
mp.shutdown().unwrap_or_default();
});
}
if let Some(lp) = self.logger_provider.take()
&& let Ok(rt) = tokio::runtime::Runtime::new()
{
rt.block_on(async {
lp.shutdown().unwrap_or_default();
});
}
}
/// Force-flush all pending trace spans (non-blocking best-effort).
pub fn flush_traces(&self) {
if let Some(ref tp) = self.tracer_provider
&& let Ok(rt) = tokio::runtime::Runtime::new()
{
rt.block_on(async {
tp.force_flush().unwrap_or_default();
});
}
}
/// Force-flush all pending metrics.
pub fn flush_metrics(&self) {
if let Some(ref mp) = self.meter_provider
&& let Ok(rt) = tokio::runtime::Runtime::new()
{
rt.block_on(async {
mp.force_flush().unwrap_or_default();
});
}
}
}
impl Drop for TelemetryGuard {
    /// Best-effort shutdown of any provider still held by the guard.
    ///
    /// `shutdown()` takes the providers out of the guard, so after an explicit
    /// shutdown this does nothing. When the guard is dropped without one
    /// (early return, unwinding), the remaining providers are shut down here.
    /// Errors are ignored because nothing useful can be done with them
    /// during drop.
    ///
    /// Note: binding the guard to `_` drops it immediately and therefore
    /// stops telemetry right away; bind it to a named variable instead.
    fn drop(&mut self) {
        if let Some(tp) = self.tracer_provider.take() {
            let _ = tp.shutdown();
        }
        if let Some(mp) = self.meter_provider.take() {
            let _ = mp.shutdown();
        }
        if let Some(lp) = self.logger_provider.take() {
            let _ = lp.shutdown();
        }
    }
}
/// Initialize the full telemetry stack.
///
/// 1. Creates the OTel Resource (service name, version, deployment environment)
/// 2. Builds the trace pipeline when tracing is enabled
/// 3. Installs the tracing subscriber (JSON console + optional OTel layers)
/// 4. Initializes Prometheus metrics when enabled
/// 5. Records server start time and installs the health counters
///
/// Initialization never fails: each error is logged and the affected signal
/// is left disabled. Diagnostics are emitted only after a subscriber has been
/// installed, so they are no longer lost. If the subscriber cannot be
/// built, a plain `fmt` subscriber is installed as a fallback.
///
/// Returns a `TelemetryGuard` that should be held until process exit.
pub fn init() -> TelemetryGuard {
    let config = TelemetryConfig::from_env();
    let resource = Resource::builder()
        .with_service_name(config.service_name.clone())
        .with_attribute(opentelemetry::KeyValue::new(
            "service.version",
            config.service_version.clone(),
        ))
        .with_attribute(opentelemetry::KeyValue::new(
            "deployment.environment",
            std::env::var("OTEL_RESOURCE_ATTRIBUTES_DEPLOYMENT")
                .unwrap_or_else(|_| "development".to_string()),
        ))
        .build();

    // 1. Trace pipeline. A failure is kept and reported once logging works.
    let (tracer_provider, otel_layer, tracing_error) = if config.traces_enabled {
        match traces::init_tracing(&config, &resource) {
            Ok((provider, layer)) => (Some(provider), Some(layer), None),
            Err(e) => (None, None, Some(e)),
        }
    } else {
        (None, None, None)
    };

    // 2. Subscriber (with the trace layer when one was built).
    let logger_provider = match logs::init_subscriber(&config, Some(&resource), otel_layer) {
        Ok(provider) => Some(provider),
        Err(e) => {
            // `try_init` instead of `init`: the latter panics when a global
            // subscriber is already installed.
            let _ = tracing_subscriber::fmt().try_init();
            tracing::warn!("Failed to initialize log bridge: {e}. Using console-only logging.");
            None
        }
    };

    if let Some(e) = tracing_error {
        tracing::warn!("Failed to initialize OTLP tracing: {e}. Using console-only logging.");
    } else if tracer_provider.is_some() && logger_provider.is_some() {
        tracing::info!(
            service = %config.service_name,
            endpoint = %config.otlp_endpoint,
            protocol = ?config.otlp_protocol,
            "OpenTelemetry tracing initialized"
        );
    }

    // 3. Metrics
    let meter_provider = if config.metrics_enabled {
        match metrics::init_metrics(&config, &resource) {
            Ok((provider, _instruments)) => {
                tracing::info!("Prometheus metrics initialized (available at /metrics)");
                Some(provider)
            }
            Err(e) => {
                tracing::warn!("Failed to initialize Prometheus metrics: {e}");
                None
            }
        }
    } else {
        None
    };

    // 4. Health state: start time for uptime, plus the connection counter.
    //    Both calls only take effect the first time they run.
    health::record_start_time();
    health::init_counters();

    TelemetryGuard {
        tracer_provider,
        meter_provider,
        logger_provider,
    }
}
+55
View File
@@ -0,0 +1,55 @@
//! OpenTelemetry distributed tracing — OTLP exporter + tracing-opentelemetry bridge.
use opentelemetry::trace::TracerProvider as _;
use opentelemetry_otlp::{Protocol, SpanExporter, WithExportConfig};
use opentelemetry_sdk::propagation::TraceContextPropagator;
use opentelemetry_sdk::trace::{SdkTracerProvider, Tracer};
use opentelemetry_sdk::Resource;
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::Registry;
use super::config::{OtlpProtocol, TelemetryConfig};
use crate::ImksResult;
/// Returns the OTLP/HTTP traces URL for the base collector endpoint `base`.
///
/// An endpoint passed programmatically to the HTTP exporter is used as-is,
/// so the `/v1/traces` signal path is appended here unless already present.
fn otlp_http_traces_endpoint(base: &str) -> String {
    const SIGNAL_PATH: &str = "/v1/traces";
    let base = base.trim_end_matches('/');
    if base.ends_with(SIGNAL_PATH) {
        base.to_string()
    } else {
        format!("{base}{SIGNAL_PATH}")
    }
}

/// Build an OTLP SpanExporter based on the configured protocol.
///
/// gRPC uses `config.otlp_endpoint` unchanged. HTTP/protobuf uses the same
/// endpoint with `/v1/traces` appended; without it the exporter would post
/// to the bare collector URL.
///
/// # Errors
///
/// Returns `ImksError::Internal` when the exporter cannot be built.
fn build_span_exporter(config: &TelemetryConfig) -> ImksResult<SpanExporter> {
    match config.otlp_protocol {
        OtlpProtocol::Grpc => SpanExporter::builder()
            .with_tonic()
            .with_endpoint(&config.otlp_endpoint)
            .build()
            .map_err(|e| crate::ImksError::Internal(format!("OTLP gRPC span exporter: {e}"))),
        OtlpProtocol::HttpProtobuf => SpanExporter::builder()
            .with_http()
            .with_protocol(Protocol::HttpBinary)
            .with_endpoint(otlp_http_traces_endpoint(&config.otlp_endpoint))
            .build()
            .map_err(|e| crate::ImksError::Internal(format!("OTLP HTTP span exporter: {e}"))),
    }
}
/// Sets up the trace pipeline: an OTel tracer provider with a batch OTLP
/// exporter, plus the `tracing-opentelemetry` layer that feeds it.
///
/// The global W3C TraceContext propagator is installed first, before the
/// exporter is built.
///
/// Returns (SdkTracerProvider, OpenTelemetryLayer).
///
/// # Errors
///
/// Returns an error when the span exporter cannot be built.
pub fn init_tracing(
    config: &TelemetryConfig,
    resource: &Resource,
) -> ImksResult<(SdkTracerProvider, OpenTelemetryLayer<Registry, Tracer>)> {
    // Propagator for W3C TraceContext extraction/injection.
    opentelemetry::global::set_text_map_propagator(TraceContextPropagator::new());

    let span_exporter = build_span_exporter(config)?;
    let tracer_provider = SdkTracerProvider::builder()
        .with_resource(resource.clone())
        .with_batch_exporter(span_exporter)
        .build();

    let layer = tracing_opentelemetry::layer().with_tracer(tracer_provider.tracer("imks"));
    Ok((tracer_provider, layer))
}