feat(telemetry): integrate OpenTelemetry observability stack with health metrics

- Add OpenTelemetry SDK, OTLP exporter, Prometheus integration
- Implement connection tracking with active/total/disconnection metrics
- Add health endpoint with uptime and connection counts
- Integrate tracing spans for socket events and engine messages
- Add metrics collection for event handling duration
- Update health endpoint to include live runtime state
- Add graceful telemetry shutdown in main function
- Implement engine session active metrics tracking
- Add namespace-specific attributes to connection metrics
- Introduce message edit history retrieval endpoint
- Add scheduled message CRUD operations and dispatcher
- Update Socket.IO event registration with observability
- Refactor component update to remove dead code allowance
- Add comprehensive environment variables documentation
- Implement detailed development guidelines in AGENTS.md
This commit is contained in:
zhenyi
2026-06-11 13:53:29 +08:00
parent 40241e5db3
commit 0dbac480ae
22 changed files with 3116 additions and 64 deletions
+85
View File
@@ -0,0 +1,85 @@
/// Telemetry configuration, populated from environment variables.
///
/// Follows the OpenTelemetry environment variable specification:
/// <https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/>
///
/// Every field is filled in by the `Default` implementation in this file;
/// the per-field documentation names the variable each one is read from.
///
/// NOTE(review): some of the variables read here (the `OTEL_*_ENABLED` flags,
/// `OTEL_SERVICE_VERSION`, `LOG_FORMAT`) look project-specific rather than part
/// of the specification linked above — confirm and document them as extensions.
#[derive(Debug, Clone)]
pub struct TelemetryConfig {
/// Service name, read from `OTEL_SERVICE_NAME`; falls back to `imks`.
pub service_name: String,
/// Service version, read from `OTEL_SERVICE_VERSION`; falls back to the
/// crate version captured at compile time (`CARGO_PKG_VERSION`).
pub service_version: String,
/// OTLP endpoint, read from `OTEL_EXPORTER_OTLP_ENDPOINT`; a localhost
/// address is used when the variable is not set.
pub otlp_endpoint: String,
/// OTLP protocol choice, derived from `OTEL_EXPORTER_OTLP_PROTOCOL`.
pub otlp_protocol: OtlpProtocol,
/// Flag read from `OTEL_TRACES_ENABLED`; `true` when the variable is unset.
/// Presumably toggles the tracing pipeline — confirm against the consumer.
pub traces_enabled: bool,
/// Flag read from `OTEL_METRICS_ENABLED`; `true` when the variable is unset.
/// Presumably toggles the metrics pipeline — confirm against the consumer.
pub metrics_enabled: bool,
/// Flag read from `OTEL_LOGS_ENABLED`; `true` when the variable is unset.
/// Presumably toggles the logs pipeline — confirm against the consumer.
pub logs_enabled: bool,
/// Log output format, derived from `LOG_FORMAT`.
pub log_format: LogFormat,
/// Raw value of `RUST_LOG`; falls back to `info`.
/// Presumably used as a log filter directive — confirm against the consumer.
pub log_level: String,
}
/// OTLP protocol variants the telemetry configuration can select.
///
/// The value is derived from the `OTEL_EXPORTER_OTLP_PROTOCOL` environment
/// variable. `Eq` is derived alongside `PartialEq` because the enum is
/// fieldless, so equality is total.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OtlpProtocol {
    /// gRPC; chosen when the variable is unset or holds an unrecognised value.
    Grpc,
    /// Protobuf over HTTP; chosen for `http/protobuf` (or `http/binary`).
    HttpProtobuf,
}
/// Log output format variants the telemetry configuration can select.
///
/// The value is derived from the `LOG_FORMAT` environment variable. `Eq` is
/// derived alongside `PartialEq` because the enum is fieldless, so equality
/// is total.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogFormat {
    /// JSON output; chosen when the variable is unset or unrecognised.
    Json,
    /// Pretty output; chosen for `pretty`, `text` or `console`.
    Pretty,
}
impl Default for TelemetryConfig {
fn default() -> Self {
Self {
service_name: env_or("OTEL_SERVICE_NAME", "imks"),
service_version: env_or("OTEL_SERVICE_VERSION", env!("CARGO_PKG_VERSION")),
otlp_endpoint: env_or(
"OTEL_EXPORTER_OTLP_ENDPOINT",
"http://localhost:4317",
),
otlp_protocol: detect_otlp_protocol(),
traces_enabled: env_bool("OTEL_TRACES_ENABLED", true),
metrics_enabled: env_bool("OTEL_METRICS_ENABLED", true),
logs_enabled: env_bool("OTEL_LOGS_ENABLED", true),
log_format: detect_log_format(),
log_level: env_or("RUST_LOG", "info"),
}
}
}
impl TelemetryConfig {
    /// Creates a configuration from the current process environment.
    ///
    /// This is a named entry point for the `Default` implementation, which
    /// performs the actual environment lookups.
    pub fn from_env() -> Self {
        <Self as Default>::default()
    }
}
/// Returns the value of the environment variable `key`, or `default` when the
/// variable does not hold a usable value.
///
/// The fallback is used when the variable is unset, is not valid Unicode, or
/// is set to an empty string. The OpenTelemetry environment variable
/// specification requires an empty value to be treated as unset, which also
/// covers deployments that forward variables as `KEY=` (for example from an
/// unset compose or shell substitution).
fn env_or(key: &str, default: &str) -> String {
    match std::env::var(key) {
        Ok(value) if !value.is_empty() => value,
        // Unset, non-Unicode, or empty: all fall back to the caller's default.
        _ => default.to_string(),
    }
}
/// Reads the environment variable `key` as a boolean flag.
///
/// Returns `default` when the variable is unset, is not valid Unicode, or is
/// blank (empty or whitespace only). Otherwise surrounding whitespace is
/// ignored and the result is `true` only for `true`, `1`, `yes` or `on`
/// (ASCII case-insensitive); any other non-blank value yields `false`.
fn env_bool(key: &str, default: bool) -> bool {
    const TRUTHY: [&str; 4] = ["true", "1", "yes", "on"];

    match std::env::var(key) {
        Ok(raw) => {
            // Tolerate padding such as " true" or "on\n" from env files.
            let value = raw.trim();
            if value.is_empty() {
                // A blank value carries no intent; treat it like an unset variable.
                default
            } else {
                TRUTHY
                    .iter()
                    .any(|candidate| value.eq_ignore_ascii_case(candidate))
            }
        }
        Err(_) => default,
    }
}
/// Chooses the OTLP protocol from `OTEL_EXPORTER_OTLP_PROTOCOL`.
///
/// The value is compared case-insensitively. `http/protobuf` and
/// `http/binary` select [`OtlpProtocol::HttpProtobuf`]; an unset variable or
/// any other value selects [`OtlpProtocol::Grpc`].
fn detect_otlp_protocol() -> OtlpProtocol {
    let requested = std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL")
        .map(|raw| raw.to_lowercase())
        .unwrap_or_default();

    let wants_http = ["http/protobuf", "http/binary"].contains(&requested.as_str());
    if wants_http {
        OtlpProtocol::HttpProtobuf
    } else {
        // gRPC is the fallback because the project already depends on tonic.
        OtlpProtocol::Grpc
    }
}
/// Chooses the log output format from `LOG_FORMAT`.
///
/// The value is compared case-insensitively. `pretty`, `text` and `console`
/// select [`LogFormat::Pretty`]; an unset variable or any other value selects
/// [`LogFormat::Json`].
fn detect_log_format() -> LogFormat {
    let requested = match std::env::var("LOG_FORMAT") {
        Ok(raw) => raw.to_lowercase(),
        Err(_) => "json".to_string(),
    };

    if matches!(requested.as_str(), "pretty" | "text" | "console") {
        LogFormat::Pretty
    } else {
        // JSON is the fallback so that logs stay structured by default.
        LogFormat::Json
    }
}