feat(telemetry): integrate OpenTelemetry observability stack with health metrics

- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive documentation of environment variables
- Add detailed development guidelines to AGENTS.md
This commit is contained in:
zhenyi
2026-06-11 13:53:29 +08:00
parent 40241e5db3
commit 0dbac480ae
22 changed files with 3116 additions and 64 deletions
+44 -12
View File
@@ -9,14 +9,12 @@ use imks::socket::message_bus::{NatsMessageBus, RedisMessageBus};
use imks::socket::server::SocketServerBuilder;
use imks::svc::{DeployConfig, MessageService};
use imks::telemetry;
fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt()
.with_env_filter(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
)
.init();
// Initialize observability stack (traces, metrics, logs, health)
let telemetry_guard = telemetry::init();
telemetry::health::init_counters();
let deploy = DeployConfig::from_env();
tracing::info!(
@@ -37,7 +35,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
Arc::new(OnceLock::new());
// Pre-configure adapter for Redis/NATS mode.
// The callback resolves namespaces after SocketServer is built.
match deploy.adapter_mode.as_str() {
"redis" => {
let message_bus = Arc::new(
@@ -130,27 +127,58 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.map_err(|e| e.to_string())?;
}
// Increment connection metrics
let m = telemetry::metrics::get();
m.connections_active.add(
1,
&telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
);
m.connections_total.add(
1,
&telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
);
telemetry::health::connection_connected();
tracing::info!(
"Socket {} connected (engine: {})",
socket.sid,
socket.engine_sid
socket_sid = %socket.sid,
engine_sid = %socket.engine_sid,
namespace = %socket.namespace,
"Socket connected"
);
Ok(())
})
.await;
// Register Socket.IO event handlers
if let Some(ref svc) = service {
// Registers `$method` of the service `$svc` as the handler for `$event` on the
// namespace `$ns`. Each invocation of the handler spawns a Tokio task that
// opens a `socket_event` tracing span, calls the service method, logs any
// error it returns, and records the elapsed time in the
// `event_handling_duration` metric.
macro_rules! register_event {
($svc:expr, $ns:expr, $event:expr, $method:ident) => {
let s = $svc.clone();
// Owned copy of the event name so the closure and each spawned task can hold their own.
let event_name = $event.to_string();
$ns.on_event($event, Arc::new(move |socket, data| {
// Per-invocation clones: the `async move` block below takes ownership of them.
let s = s.clone();
let data = data.clone();
let event = event_name.clone();
tokio::spawn(async move {
let _span = tracing::info_span!(
"socket_event",
otel.name = format!("handle {event}"),
event = %event,
socket_sid = %socket.sid,
);
// NOTE(review): this guard stays alive across the `.await` below. The
// `tracing` docs warn that holding a `Span::enter` guard across an await
// point can produce incorrect traces; consider wrapping the future with
// `tracing::Instrument::instrument` instead — confirm and follow up.
let _enter = _span.enter();
// Timer starts immediately before the handler call.
let start = std::time::Instant::now();
if let Err(e) = s.$method(socket, &data).await {
tracing::error!(event = $event, error = %e, "Event handler failed");
tracing::error!(event = %event, error = %e, "Event handler failed");
}
// Duration is recorded in seconds (f64) whether or not the handler returned an error.
let elapsed = start.elapsed().as_secs_f64();
telemetry::metrics::get().event_handling_duration.record(
elapsed,
&telemetry::MetricsInstruments::event_attrs(&event),
);
});
})).await;
};
@@ -200,11 +228,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
register_event!(svc, namespace, "article:list", list_articles);
register_event!(svc, namespace, "article:delete", delete_article);
register_event!(svc, namespace, "component:interact", interact_component);
register_event!(svc, namespace, "component:update", update_component);
// Start scheduled message dispatcher (background task)
svc.clone().start_scheduled_dispatcher();
tracing::info!("Registered Socket.IO event handlers");
tracing::info!("Registered Socket.IO event handlers with observability instrumentation");
}
// Start servers
@@ -233,6 +262,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok::<(), Box<dyn std::error::Error>>(())
})?;
// Graceful telemetry shutdown
telemetry_guard.shutdown();
Ok(())
}