feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
@@ -9,14 +9,12 @@ use imks::socket::message_bus::{NatsMessageBus, RedisMessageBus};
|
||||
|
||||
use imks::socket::server::SocketServerBuilder;
|
||||
use imks::svc::{DeployConfig, MessageService};
|
||||
use imks::telemetry;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
|
||||
)
|
||||
.init();
|
||||
// Initialize observability stack (traces, metrics, logs, health)
|
||||
let telemetry_guard = telemetry::init();
|
||||
telemetry::health::init_counters();
|
||||
|
||||
let deploy = DeployConfig::from_env();
|
||||
tracing::info!(
|
||||
@@ -37,7 +35,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
Arc::new(OnceLock::new());
|
||||
|
||||
// Pre-configure adapter for Redis/NATS mode.
|
||||
// The callback resolves namespaces after SocketServer is built.
|
||||
match deploy.adapter_mode.as_str() {
|
||||
"redis" => {
|
||||
let message_bus = Arc::new(
|
||||
@@ -130,27 +127,58 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
.map_err(|e| e.to_string())?;
|
||||
}
|
||||
|
||||
// Increment connection metrics
|
||||
let m = telemetry::metrics::get();
|
||||
m.connections_active.add(
|
||||
1,
|
||||
&telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
|
||||
);
|
||||
m.connections_total.add(
|
||||
1,
|
||||
&telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
|
||||
);
|
||||
telemetry::health::connection_connected();
|
||||
|
||||
tracing::info!(
|
||||
"Socket {} connected (engine: {})",
|
||||
socket.sid,
|
||||
socket.engine_sid
|
||||
socket_sid = %socket.sid,
|
||||
engine_sid = %socket.engine_sid,
|
||||
namespace = %socket.namespace,
|
||||
"Socket connected"
|
||||
);
|
||||
Ok(())
|
||||
})
|
||||
.await;
|
||||
|
||||
|
||||
|
||||
// Register Socket.IO event handlers
|
||||
if let Some(ref svc) = service {
|
||||
macro_rules! register_event {
|
||||
($svc:expr, $ns:expr, $event:expr, $method:ident) => {
|
||||
let s = $svc.clone();
|
||||
let event_name = $event.to_string();
|
||||
$ns.on_event($event, Arc::new(move |socket, data| {
|
||||
let s = s.clone();
|
||||
let data = data.clone();
|
||||
let event = event_name.clone();
|
||||
tokio::spawn(async move {
|
||||
let _span = tracing::info_span!(
|
||||
"socket_event",
|
||||
otel.name = format!("handle {event}"),
|
||||
event = %event,
|
||||
socket_sid = %socket.sid,
|
||||
);
|
||||
let _enter = _span.enter();
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
if let Err(e) = s.$method(socket, &data).await {
|
||||
tracing::error!(event = $event, error = %e, "Event handler failed");
|
||||
tracing::error!(event = %event, error = %e, "Event handler failed");
|
||||
}
|
||||
let elapsed = start.elapsed().as_secs_f64();
|
||||
telemetry::metrics::get().event_handling_duration.record(
|
||||
elapsed,
|
||||
&telemetry::MetricsInstruments::event_attrs(&event),
|
||||
);
|
||||
});
|
||||
})).await;
|
||||
};
|
||||
@@ -200,11 +228,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
register_event!(svc, namespace, "article:list", list_articles);
|
||||
register_event!(svc, namespace, "article:delete", delete_article);
|
||||
register_event!(svc, namespace, "component:interact", interact_component);
|
||||
register_event!(svc, namespace, "component:update", update_component);
|
||||
|
||||
// Start scheduled message dispatcher (background task)
|
||||
svc.clone().start_scheduled_dispatcher();
|
||||
|
||||
tracing::info!("Registered Socket.IO event handlers");
|
||||
tracing::info!("Registered Socket.IO event handlers with observability instrumentation");
|
||||
}
|
||||
|
||||
// Start servers
|
||||
@@ -233,6 +262,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
Ok::<(), Box<dyn std::error::Error>>(())
|
||||
})?;
|
||||
|
||||
// Graceful telemetry shutdown
|
||||
telemetry_guard.shutdown();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user