feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
+12
-6
@@ -1,26 +1,32 @@
|
||||
//! Health check endpoint for the imks server.
|
||||
//!
|
||||
//! Returns JSON with server status, version, and upstream connectivity.
|
||||
//! Returns JSON with server status, version, uptime, and connection counts
|
||||
//! sourced from live runtime state (session store + atomic counter).
|
||||
|
||||
use actix_web::HttpResponse;
|
||||
use actix_web::{HttpResponse, web};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::engine::session::SessionStore;
|
||||
use crate::telemetry;
|
||||
|
||||
/// Response body that `health_check` serialises to JSON via `HttpResponse::Ok().json(..)`.
#[derive(Serialize)]
|
||||
struct HealthResponse {
|
||||
// Filled with the literal "healthy" by `health_check`.
status: String,
|
||||
// Crate version taken from `CARGO_PKG_VERSION` at compile time.
version: String,
|
||||
// RFC 3339 rendering of `chrono::Utc::now()` at the time the response is built.
timestamp: String,
|
||||
// Populated from `telemetry::health::uptime_secs()` in the updated handler.
uptime_secs: u64,
|
||||
// Populated from `telemetry::health::connections_active_count()`.
connections_active: u64,
|
||||
// Populated from `store.len()` on the injected `SessionStore`.
sessions_count: usize,
|
||||
}
|
||||
|
||||
// NOTE(review): this span is a rendered diff, so both the pre-change and post-change
// lines are present without +/- markers. Presumably the zero-argument signature and the
// `uptime_secs: 0` / `sessions_count: 0` initialisers are the removed lines — confirm
// against the repository before treating this text as compilable source.
/// GET /health — returns server health status.
|
||||
pub async fn health_check() -> HttpResponse {
|
||||
/// GET /health — returns server health status with live connection metrics.
|
||||
// `store` is extracted from actix application data (`web::Data<SessionStore>`).
pub async fn health_check(store: web::Data<SessionStore>) -> HttpResponse {
|
||||
// Always answers with an `Ok` response; the body is the JSON-serialised `HealthResponse`.
HttpResponse::Ok().json(HealthResponse {
|
||||
status: "healthy".into(),
|
||||
// Resolved at compile time from the crate manifest.
version: env!("CARGO_PKG_VERSION").into(),
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
uptime_secs: 0,
|
||||
sessions_count: 0,
|
||||
// Uptime and active-connection count are read from the `telemetry::health` helpers.
uptime_secs: telemetry::health::uptime_secs(),
|
||||
connections_active: telemetry::health::connections_active_count(),
|
||||
// Length reported by the injected session store.
sessions_count: store.len(),
|
||||
})
|
||||
}
|
||||
|
||||
+10
-1
@@ -115,17 +115,26 @@ impl EngineServer {
|
||||
));
|
||||
let heartbeat_handle = heartbeat.start();
|
||||
|
||||
tracing::info!("Engine.IO HTTP server listening on {}", addr);
|
||||
tracing::info!(
|
||||
endpoint = %addr,
|
||||
"Engine.IO HTTP server listening, /health and /metrics available"
|
||||
);
|
||||
|
||||
let result = HttpServer::new(move || {
|
||||
App::new()
|
||||
.app_data(web::Data::new(store.clone()))
|
||||
.app_data(web::Data::new(config.clone()))
|
||||
.app_data(web::Data::new(on_message.clone()))
|
||||
// Health check with connection metrics
|
||||
.route(
|
||||
"/health",
|
||||
web::get().to(crate::engine::health::health_check),
|
||||
)
|
||||
// Prometheus metrics endpoint
|
||||
.route(
|
||||
"/metrics",
|
||||
web::get().to(crate::telemetry::metrics::metrics_handler),
|
||||
)
|
||||
.route("/engine.io/", web::get().to(engine_get))
|
||||
.route(
|
||||
"/engine.io/",
|
||||
|
||||
+11
-1
@@ -129,6 +129,12 @@ impl SessionStore {
|
||||
sid
|
||||
);
|
||||
}
|
||||
if let Some(m) = crate::telemetry::metrics::try_get() {
|
||||
m.engine_sessions_active.add(
|
||||
1,
|
||||
&[opentelemetry::KeyValue::new("transport", transport.as_str())],
|
||||
);
|
||||
}
|
||||
rx
|
||||
}
|
||||
|
||||
@@ -137,7 +143,11 @@ impl SessionStore {
|
||||
}
|
||||
|
||||
/// Removes the session stored under `sid`.
///
/// In the updated version, when an entry was actually removed and
/// `crate::telemetry::metrics::try_get()` yields a metrics handle, the
/// `engine_sessions_active` instrument is adjusted by -1.
pub fn remove(&self, sid: &str) {
|
||||
// NOTE(review): presumably the pre-change line of the diff (unconditional removal);
// the conditional form below appears to replace it — confirm.
self.sessions.remove(sid);
|
||||
// Only touch the metric when the removal reported an existing entry (`is_some()`),
// so removing an unknown `sid` does not decrement the gauge.
if self.sessions.remove(sid).is_some()
|
||||
&& let Some(m) = crate::telemetry::metrics::try_get()
|
||||
{
|
||||
// NOTE(review): the matching `add(1, ..)` in the session-creation path is recorded with
// a `transport` attribute, while this decrement passes an empty attribute slice.
// OpenTelemetry aggregates per attribute set, so this likely decrements a different
// series than the one that was incremented (per-transport counts never drop, and the
// unattributed series goes negative) — confirm and pass the same attributes here.
m.engine_sessions_active.add(-1, &[]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exists(&self, sid: &str) -> bool {
|
||||
|
||||
Reference in New Issue
Block a user