feat(telemetry): integrate OpenTelemetry observability stack with health metrics

- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
zhenyi
2026-06-11 13:53:29 +08:00
parent 40241e5db3
commit 0dbac480ae
22 changed files with 3116 additions and 64 deletions
+12 -6
View File
@@ -1,26 +1,32 @@
//! Health check endpoint for the imks server.
//!
//! Returns JSON with server status, version, and upstream connectivity.
//! Returns JSON with server status, version, uptime, and connection counts
//! sourced from live runtime state (session store + atomic counter).
use actix_web::HttpResponse;
use actix_web::{HttpResponse, web};
use serde::Serialize;
use crate::engine::session::SessionStore;
use crate::telemetry;
/// JSON body produced by the `/health` handler in this file.
#[derive(Serialize)]
struct HealthResponse {
/// Status label; the handler sets it to `"healthy"`.
status: String,
/// Crate version, filled from `CARGO_PKG_VERSION` at compile time by the handler.
version: String,
/// Moment the response was built, formatted as an RFC 3339 UTC timestamp.
timestamp: String,
/// Value returned by `telemetry::health::uptime_secs()` when the request is served.
uptime_secs: u64,
/// Value returned by `telemetry::health::connections_active_count()` when the request is served.
connections_active: u64,
/// Value returned by `len()` on the injected `SessionStore` when the request is served.
sessions_count: usize,
}
/// GET /health — returns server health status.
pub async fn health_check() -> HttpResponse {
/// GET /health — returns server health status with live connection metrics.
///
/// Builds a [`HealthResponse`] and returns it as the JSON body of an `HttpResponse::Ok()`.
///
/// `store` is extracted from actix-web application data.
/// NOTE(review): the extractor needs a `web::Data<SessionStore>` registered on the `App`;
/// confirm that the value passed to `app_data` has exactly that type.
pub async fn health_check(store: web::Data<SessionStore>) -> HttpResponse {
HttpResponse::Ok().json(HealthResponse {
status: "healthy".into(),
// `env!` is expanded at compile time, so the version is fixed in the binary.
version: env!("CARGO_PKG_VERSION").into(),
timestamp: chrono::Utc::now().to_rfc3339(),
// NOTE(review): this diff view shows no +/- markers; the two constant initialisers below
// appear to be the previous placeholders, followed by the live lookups that replace them.
uptime_secs: 0,
sessions_count: 0,
// Live values read from the telemetry module and the injected session store.
uptime_secs: telemetry::health::uptime_secs(),
connections_active: telemetry::health::connections_active_count(),
sessions_count: store.len(),
})
}
+10 -1
View File
@@ -115,17 +115,26 @@ impl EngineServer {
));
let heartbeat_handle = heartbeat.start();
tracing::info!("Engine.IO HTTP server listening on {}", addr);
tracing::info!(
endpoint = %addr,
"Engine.IO HTTP server listening, /health and /metrics available"
);
let result = HttpServer::new(move || {
App::new()
.app_data(web::Data::new(store.clone()))
.app_data(web::Data::new(config.clone()))
.app_data(web::Data::new(on_message.clone()))
// Health check with connection metrics
.route(
"/health",
web::get().to(crate::engine::health::health_check),
)
// Prometheus metrics endpoint
.route(
"/metrics",
web::get().to(crate::telemetry::metrics::metrics_handler),
)
.route("/engine.io/", web::get().to(engine_get))
.route(
"/engine.io/",
+11 -1
View File
@@ -129,6 +129,12 @@ impl SessionStore {
sid
);
}
if let Some(m) = crate::telemetry::metrics::try_get() {
m.engine_sessions_active.add(
1,
&[opentelemetry::KeyValue::new("transport", transport.as_str())],
);
}
rx
}
@@ -137,7 +143,11 @@ impl SessionStore {
}
/// Removes the session stored under `sid` from `self.sessions`.
///
/// In the conditional form below, `engine_sessions_active` is decremented by one only when
/// the removal actually found an entry and `telemetry::metrics::try_get()` yields a handle.
/// Removing an unknown `sid` therefore leaves the metric untouched.
pub fn remove(&self, sid: &str) {
self.sessions.remove(sid);
if self.sessions.remove(sid).is_some()
&& let Some(m) = crate::telemetry::metrics::try_get()
{
// NOTE(review): the matching increment earlier in this diff records the measurement with
// a `transport` attribute, while this decrement passes an empty attribute set. Confirm
// whether both land on the same time series; if not, the per-transport value never drops.
m.engine_sessions_active.add(-1, &[]);
}
}
pub fn exists(&self, sid: &str) -> bool {