feat(telemetry): integrate OpenTelemetry observability stack with health metrics

- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement tracking of active engine session metrics
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variable documentation
- Add detailed development guidelines in AGENTS.md
This commit is contained in:
zhenyi
2026-06-11 13:53:29 +08:00
parent 40241e5db3
commit 0dbac480ae
22 changed files with 3116 additions and 64 deletions
+67 -34
View File
@@ -143,29 +143,40 @@ async fn handle_engine_message(
) {
if let EnginePacketData::Text(ref text) = engine_packet.data {
match parser::decode(text) {
Ok(socket_packet) => match socket_packet.packet_type {
PacketType::Connect => {
handle_connect(
&engine_sid,
&socket_packet,
namespaces,
socket_txs,
engine_store,
adapter,
)
.await;
Ok(socket_packet) => {
let packet_type = format!("{:?}", socket_packet.packet_type);
let _span = tracing::debug_span!(
"engine_message",
engine_sid = %engine_sid,
packet_type = %packet_type,
namespace = %socket_packet.namespace,
);
let _enter = _span.enter();
match socket_packet.packet_type {
PacketType::Connect => {
handle_connect(
&engine_sid,
&socket_packet,
namespaces,
socket_txs,
engine_store,
adapter,
)
.await;
}
PacketType::Disconnect => {
handle_disconnect(&engine_sid, &socket_packet, namespaces, socket_txs);
}
PacketType::Event => {
handle_event(&engine_sid, &socket_packet, namespaces);
}
PacketType::Ack => {
handle_ack(&engine_sid, &socket_packet);
}
_ => {}
}
PacketType::Disconnect => {
handle_disconnect(&engine_sid, &socket_packet, namespaces, socket_txs);
}
PacketType::Event => {
handle_event(&engine_sid, &socket_packet, namespaces);
}
PacketType::Ack => {
handle_ack(&engine_sid, &socket_packet);
}
_ => {}
},
}
Err(e) => {
tracing::warn!(engine_sid = %engine_sid, error = %e, "Invalid Socket.IO packet");
}
@@ -181,6 +192,13 @@ async fn handle_connect(
engine_store: &SessionStore,
adapter: &Arc<dyn Adapter>,
) {
let _span = tracing::info_span!(
"socket_connect",
engine_sid = %engine_sid,
namespace = %packet.namespace,
);
let _enter = _span.enter();
// Validate namespace path to prevent DoS via arbitrary namespace creation
if !crate::socket::namespace::is_valid_namespace(&packet.namespace) {
tracing::warn!(
@@ -244,11 +262,16 @@ async fn handle_connect(
break;
}
}
// Forwarding task ended — ensure socket is cleaned up from namespace
// Forwarding task ended — ensure socket is cleaned up from namespace.
// If the socket was still registered (session expiry / engine disconnect
// without Socket.IO disconnect packet), also update the connection counter.
socket_txs_clone.remove(&socket_sid_clone);
namespace_clone
let was_removed = namespace_clone
.remove_socket_by_sid(&socket_sid_clone)
.await;
if was_removed {
crate::telemetry::health::connection_disconnected();
}
});
// Send Connect response (only after handler passed)
@@ -268,16 +291,26 @@ fn handle_disconnect(
namespaces: &Arc<NamespaceManager>,
socket_txs: &Arc<DashMap<String, mpsc::Sender<Packet>>>,
) {
if let Some(namespace) = namespaces.get_namespace(&packet.namespace) {
// Look up socket by engine_sid, then remove by socket_sid
if let Some(socket) = namespace.get_socket_by_engine_sid(engine_sid) {
socket_txs.remove(&socket.sid);
let socket_sid = socket.sid.clone();
let ns_clone = namespace.clone();
tokio::spawn(async move {
ns_clone.remove_socket_by_sid(&socket_sid).await;
});
}
if let Some(namespace) = namespaces.get_namespace(&packet.namespace)
&& let Some(socket) = namespace.get_socket_by_engine_sid(engine_sid)
{
let m = crate::telemetry::metrics::get();
m.connections_active.add(
-1,
&crate::telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
);
m.disconnections_total.add(
1,
&crate::telemetry::MetricsInstruments::namespace_attrs(&socket.namespace),
);
crate::telemetry::health::connection_disconnected();
socket_txs.remove(&socket.sid);
let socket_sid = socket.sid.clone();
let ns_clone = namespace.clone();
tokio::spawn(async move {
ns_clone.remove_socket_by_sid(&socket_sid).await;
});
}
}