feat(telemetry): integrate OpenTelemetry observability stack with health metrics
- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
+11
-3
@@ -75,7 +75,10 @@ impl Namespace {
|
||||
}
|
||||
|
||||
/// Remove a socket by its socket SID.
|
||||
pub async fn remove_socket_by_sid(&self, socket_sid: &str) {
|
||||
///
|
||||
/// Returns `true` if a socket was actually removed, `false` if the SID
|
||||
/// was not found (already removed or never existed).
|
||||
pub async fn remove_socket_by_sid(&self, socket_sid: &str) -> bool {
|
||||
if let Some((_, socket)) = self.sockets.remove(socket_sid) {
|
||||
self.engine_to_socket.remove(&socket.engine_sid);
|
||||
self.remove_socket_from_local_rooms(socket_sid);
|
||||
@@ -86,14 +89,19 @@ impl Namespace {
|
||||
{
|
||||
tracing::warn!("Adapter del_all error for socket {}: {}", socket_sid, e);
|
||||
}
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove a socket by its engine SID (for engine-level disconnections).
|
||||
pub async fn remove_socket(&self, engine_sid: &str) {
|
||||
/// Returns `true` if a socket was actually removed.
|
||||
pub async fn remove_socket(&self, engine_sid: &str) -> bool {
|
||||
if let Some((_, socket_sid)) = self.engine_to_socket.remove(engine_sid) {
|
||||
self.remove_socket_by_sid(&socket_sid).await;
|
||||
return self.remove_socket_by_sid(&socket_sid).await;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Look up a socket by its socket SID.
|
||||
|
||||
+67
-34
@@ -143,29 +143,40 @@ async fn handle_engine_message(
|
||||
) {
|
||||
if let EnginePacketData::Text(ref text) = engine_packet.data {
|
||||
match parser::decode(text) {
|
||||
Ok(socket_packet) => match socket_packet.packet_type {
|
||||
PacketType::Connect => {
|
||||
handle_connect(
|
||||
&engine_sid,
|
||||
&socket_packet,
|
||||
namespaces,
|
||||
socket_txs,
|
||||
engine_store,
|
||||
adapter,
|
||||
)
|
||||
.await;
|
||||
Ok(socket_packet) => {
|
||||
let packet_type = format!("{:?}", socket_packet.packet_type);
|
||||
let _span = tracing::debug_span!(
|
||||
"engine_message",
|
||||
engine_sid = %engine_sid,
|
||||
packet_type = %packet_type,
|
||||
namespace = %socket_packet.namespace,
|
||||
);
|
||||
let _enter = _span.enter();
|
||||
|
||||
match socket_packet.packet_type {
|
||||
PacketType::Connect => {
|
||||
handle_connect(
|
||||
&engine_sid,
|
||||
&socket_packet,
|
||||
namespaces,
|
||||
socket_txs,
|
||||
engine_store,
|
||||
adapter,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
PacketType::Disconnect => {
|
||||
handle_disconnect(&engine_sid, &socket_packet, namespaces, socket_txs);
|
||||
}
|
||||
PacketType::Event => {
|
||||
handle_event(&engine_sid, &socket_packet, namespaces);
|
||||
}
|
||||
PacketType::Ack => {
|
||||
handle_ack(&engine_sid, &socket_packet);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
PacketType::Disconnect => {
|
||||
handle_disconnect(&engine_sid, &socket_packet, namespaces, socket_txs);
|
||||
}
|
||||
PacketType::Event => {
|
||||
handle_event(&engine_sid, &socket_packet, namespaces);
|
||||
}
|
||||
PacketType::Ack => {
|
||||
handle_ack(&engine_sid, &socket_packet);
|
||||
}
|
||||
_ => {}
|
||||
},
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(engine_sid = %engine_sid, error = %e, "Invalid Socket.IO packet");
|
||||
}
|
||||
@@ -181,6 +192,13 @@ async fn handle_connect(
|
||||
engine_store: &SessionStore,
|
||||
adapter: &Arc<dyn Adapter>,
|
||||
) {
|
||||
let _span = tracing::info_span!(
|
||||
"socket_connect",
|
||||
engine_sid = %engine_sid,
|
||||
namespace = %packet.namespace,
|
||||
);
|
||||
let _enter = _span.enter();
|
||||
|
||||
// Validate namespace path to prevent DoS via arbitrary namespace creation
|
||||
if !crate::socket::namespace::is_valid_namespace(&packet.namespace) {
|
||||
tracing::warn!(
|
||||
@@ -244,11 +262,16 @@ async fn handle_connect(
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Forwarding task ended — ensure socket is cleaned up from namespace
|
||||
// Forwarding task ended — ensure socket is cleaned up from namespace.
|
||||
// If the socket was still registered (session expiry / engine disconnect
|
||||
// without Socket.IO disconnect packet), also update the connection counter.
|
||||
socket_txs_clone.remove(&socket_sid_clone);
|
||||
namespace_clone
|
||||
let was_removed = namespace_clone
|
||||
.remove_socket_by_sid(&socket_sid_clone)
|
||||
.await;
|
||||
if was_removed {
|
||||
crate::telemetry::health::connection_disconnected();
|
||||
}
|
||||
});
|
||||
|
||||
// Send Connect response (only after handler passed)
|
||||
@@ -268,16 +291,26 @@ fn handle_disconnect(
|
||||
namespaces: &Arc<NamespaceManager>,
|
||||
socket_txs: &Arc<DashMap<String, mpsc::Sender<Packet>>>,
|
||||
) {
|
||||
if let Some(namespace) = namespaces.get_namespace(&packet.namespace) {
|
||||
// Look up socket by engine_sid, then remove by socket_sid
|
||||
if let Some(socket) = namespace.get_socket_by_engine_sid(engine_sid) {
|
||||
socket_txs.remove(&socket.sid);
|
||||
let socket_sid = socket.sid.clone();
|
||||
let ns_clone = namespace.clone();
|
||||
tokio::spawn(async move {
|
||||
ns_clone.remove_socket_by_sid(&socket_sid).await;
|
||||
});
|
||||
}
|
||||
if let Some(namespace) = namespaces.get_namespace(&packet.namespace)
|
||||
&& let Some(socket) = namespace.get_socket_by_engine_sid(engine_sid)
|
||||
{
|
||||
let m = crate::telemetry::metrics::get();
|
||||
m.connections_active.add(
|
||||
-1,
|
||||
&crate::telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
|
||||
);
|
||||
m.disconnections_total.add(
|
||||
1,
|
||||
&crate::telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
|
||||
);
|
||||
crate::telemetry::health::connection_disconnected();
|
||||
|
||||
socket_txs.remove(&socket.sid);
|
||||
let socket_sid = socket.sid.clone();
|
||||
let ns_clone = namespace.clone();
|
||||
tokio::spawn(async move {
|
||||
ns_clone.remove_socket_by_sid(&socket_sid).await;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user