feat(registry): add service discovery and health check capabilities

- Integrate tonic-health for gRPC service health monitoring
- Add etcd-based service registration with automatic keep-alive
- Implement dynamic configuration loading from etcd with fallback
- Remove external dependencies from docker-compose for simplified deployment
- Refactor service registration logic with improved lease management
- Add health service to gRPC server with serving status reporting
This commit is contained in:
zhenyi
2026-06-11 22:50:40 +08:00
parent 1ccfd3d626
commit b797e360c0
8 changed files with 73 additions and 326 deletions
Generated
+15
View File
@@ -380,6 +380,7 @@ dependencies = [
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tonic", "tonic",
"tonic-health",
"tonic-prost", "tonic-prost",
"tonic-prost-build", "tonic-prost-build",
"tracing", "tracing",
@@ -4636,6 +4637,7 @@ dependencies = [
"futures-core", "futures-core",
"pin-project-lite", "pin-project-lite",
"tokio", "tokio",
"tokio-util",
] ]
[[package]] [[package]]
@@ -4714,6 +4716,19 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "tonic-health"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcfab99db777fba2802f0dfa861d1628d1ae916fb199d29819941f139ae85082"
dependencies = [
"prost",
"tokio",
"tokio-stream",
"tonic",
"tonic-prost",
]
[[package]] [[package]]
name = "tonic-prost" name = "tonic-prost"
version = "0.14.6" version = "0.14.6"
+1
View File
@@ -46,6 +46,7 @@ tonic = { version = "0.14", features = ["transport", "channel"] }
prost = "0.14" prost = "0.14"
prost-types = "0.14" prost-types = "0.14"
tonic-prost = "0.14" tonic-prost = "0.14"
tonic-health = "0.14.6"
url = "2.5" url = "2.5"
etcd-client = { version = "0.18", features = ["tls"] } etcd-client = { version = "0.18", features = ["tls"] }
tokio-stream = "0.1" tokio-stream = "0.1"
-255
View File
@@ -14,7 +14,6 @@ x-appks-env: &appks-env
APP_SESSION_COOKIE_PATH: / APP_SESSION_COOKIE_PATH: /
APP_SESSION_TTL_SECS: 86400 APP_SESSION_TTL_SECS: 86400
APP_SESSION_MAX_AGE_SECS: 86400 APP_SESSION_MAX_AGE_SECS: 86400
# Postgres
APP_DATABASE_URL: postgres://appks:appks@postgres:5432/appks APP_DATABASE_URL: postgres://appks:appks@postgres:5432/appks
DATABASE_URL: postgres://appks:appks@postgres:5432/appks DATABASE_URL: postgres://appks:appks@postgres:5432/appks
APP_DATABASE_MAX_CONNECTIONS: 10 APP_DATABASE_MAX_CONNECTIONS: 10
@@ -22,15 +21,11 @@ x-appks-env: &appks-env
APP_DATABASE_IDLE_TIMEOUT: 600 APP_DATABASE_IDLE_TIMEOUT: 600
APP_DATABASE_MAX_LIFETIME: 3600 APP_DATABASE_MAX_LIFETIME: 3600
APP_DATABASE_CONNECTION_TIMEOUT: 8 APP_DATABASE_CONNECTION_TIMEOUT: 8
APP_DATABASE_SCHEMA_SEARCH_PATH: public
APP_DATABASE_READ_WRITE_SPLIT: "false"
APP_DATABASE_RETRY_ATTEMPTS: 3 APP_DATABASE_RETRY_ATTEMPTS: 3
APP_DATABASE_RETRY_DELAY: 5 APP_DATABASE_RETRY_DELAY: 5
# Redis (cluster mode)
APP_REDIS_CLUSTER_ENABLED: "true" APP_REDIS_CLUSTER_ENABLED: "true"
APP_REDIS_CLUSTER_NODES: redis://redis-node-0:6379,redis://redis-node-1:6379,redis://redis-node-2:6379,redis://redis-node-3:6379,redis://redis-node-4:6379,redis://redis-node-5:6379 APP_REDIS_CLUSTER_NODES: redis://redis-node-0:6379,redis://redis-node-1:6379,redis://redis-node-2:6379,redis://redis-node-3:6379,redis://redis-node-4:6379,redis://redis-node-5:6379
APP_REDIS_READ_FROM_REPLICAS: "false" APP_REDIS_READ_FROM_REPLICAS: "false"
APP_REDIS_PASSWORD: ""
APP_REDIS_MAX_CONNECTIONS: 20 APP_REDIS_MAX_CONNECTIONS: 20
APP_REDIS_MIN_CONNECTIONS: 2 APP_REDIS_MIN_CONNECTIONS: 2
APP_REDIS_IDLE_TIMEOUT: 300 APP_REDIS_IDLE_TIMEOUT: 300
@@ -39,7 +34,6 @@ x-appks-env: &appks-env
APP_REDIS_RETRY_DELAY_MS: 100 APP_REDIS_RETRY_DELAY_MS: 100
APP_REDIS_TLS_ENABLED: "false" APP_REDIS_TLS_ENABLED: "false"
APP_REDIS_KEY_PREFIX: "appks:" APP_REDIS_KEY_PREFIX: "appks:"
# etcd
APP_ETCD_ENDPOINTS: http://etcd:2379 APP_ETCD_ENDPOINTS: http://etcd:2379
APP_ETCD_KEY_PREFIX: /appks/ APP_ETCD_KEY_PREFIX: /appks/
APP_ETCD_CONNECT_TIMEOUT: 5 APP_ETCD_CONNECT_TIMEOUT: 5
@@ -48,7 +42,6 @@ x-appks-env: &appks-env
APP_ETCD_LEASE_TTL: 15 APP_ETCD_LEASE_TTL: 15
APP_ETCD_MAX_RETRIES: 3 APP_ETCD_MAX_RETRIES: 3
APP_ETCD_REGISTER_SELF: "true" APP_ETCD_REGISTER_SELF: "true"
# NATS
APP_NATS_URL: nats://nats:4222 APP_NATS_URL: nats://nats:4222
APP_NATS_CONNECTION_TIMEOUT: 5 APP_NATS_CONNECTION_TIMEOUT: 5
APP_NATS_PING_INTERVAL: 20 APP_NATS_PING_INTERVAL: 20
@@ -57,7 +50,6 @@ x-appks-env: &appks-env
APP_NATS_STREAM_PREFIX: APPKS APP_NATS_STREAM_PREFIX: APPKS
APP_NATS_ACK_WAIT_SECS: 30 APP_NATS_ACK_WAIT_SECS: 30
APP_NATS_MAX_DELIVER: 5 APP_NATS_MAX_DELIVER: 5
# S3 / MinIO
APP_S3_ENDPOINT: http://minio:9000 APP_S3_ENDPOINT: http://minio:9000
APP_S3_REGION: us-east-1 APP_S3_REGION: us-east-1
APP_S3_ACCESS_KEY: admin APP_S3_ACCESS_KEY: admin
@@ -73,34 +65,16 @@ x-appks-env: &appks-env
APP_S3_UPLOAD_PART_SIZE: 8388608 APP_S3_UPLOAD_PART_SIZE: 8388608
APP_S3_MAX_UPLOAD_SIZE: 104857600 APP_S3_MAX_UPLOAD_SIZE: 104857600
APP_S3_PRESIGNED_URL_EXPIRY: 3600 APP_S3_PRESIGNED_URL_EXPIRY: 3600
# LRU cache
APP_LRU_DEFAULT_CAPACITY: 1000 APP_LRU_DEFAULT_CAPACITY: 1000
APP_LRU_DEFAULT_TTL_SECS: 300 APP_LRU_DEFAULT_TTL_SECS: 300
APP_LRU_CLEANUP_INTERVAL_SECS: 60 APP_LRU_CLEANUP_INTERVAL_SECS: 60
# gRPC
APP_RPC_SELF_HOST: 0.0.0.0 APP_RPC_SELF_HOST: 0.0.0.0
APP_RPC_SELF_PORT: 50049 APP_RPC_SELF_PORT: 50049
APP_RPC_SELF_REFLECTION: "false" APP_RPC_SELF_REFLECTION: "false"
APP_RPC_SELF_SERVICE_NAME: appks APP_RPC_SELF_SERVICE_NAME: appks
APP_RPC_DEFAULT_TIMEOUT_SECS: 10 APP_RPC_DEFAULT_TIMEOUT_SECS: 10
# AI (disabled by default in compose — set externally)
APP_AI_PROVIDER_API_KEY: ""
APP_AI_PROVIDER_URL: ""
# Qdrant
APP_QDRANT_URL: http://qdrant:6334
APP_QDRANT_COLLECTION: appks_embeddings
APP_QDRANT_VECTOR_SIZE: 1536
APP_QDRANT_DISTANCE: Cosine
APP_QDRANT_MAX_CONNECTIONS: 10
APP_QDRANT_IDLE_TIMEOUT: 300
APP_QDRANT_CONNECTION_TIMEOUT: 10
APP_QDRANT_MAX_RETRIES: 3
APP_QDRANT_TLS_ENABLED: "false"
APP_QDRANT_SEARCH_LIMIT: 10
APP_QDRANT_SCORE_THRESHOLD: 0.7
services: services:
# AppKS
appks: appks:
image: appks image: appks
restart: unless-stopped restart: unless-stopped
@@ -109,232 +83,3 @@ services:
- "50049:50049" - "50049:50049"
environment: environment:
<<: *appks-env <<: *appks-env
depends_on:
postgres:
condition: service_healthy
etcd:
condition: service_started
nats:
condition: service_started
minio:
condition: service_healthy
qdrant:
condition: service_started
networks:
- appks-net
# PostgreSQL
postgres:
image: postgres:17-alpine
restart: unless-stopped
environment:
POSTGRES_USER: appks
POSTGRES_PASSWORD: appks
POSTGRES_DB: appks
ports:
- "5432:5432"
volumes:
- pg_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U appks -d appks"]
interval: 5s
timeout: 5s
retries: 5
networks:
- appks-net
# Redis Cluster (6 nodes)
redis-node-0:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6379:6379"
volumes:
- redis_data_0:/data
networks:
- appks-net
redis-node-1:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6380:6379"
volumes:
- redis_data_1:/data
networks:
- appks-net
redis-node-2:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6381:6379"
volumes:
- redis_data_2:/data
networks:
- appks-net
redis-node-3:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6382:6379"
volumes:
- redis_data_3:/data
networks:
- appks-net
redis-node-4:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6383:6379"
volumes:
- redis_data_4:/data
networks:
- appks-net
redis-node-5:
image: redis:7-alpine
command: redis-server --port 6379 --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000 --appendonly yes
restart: unless-stopped
ports:
- "6384:6379"
volumes:
- redis_data_5:/data
networks:
- appks-net
redis-cluster-init:
image: redis:7-alpine
depends_on:
- redis-node-0
- redis-node-1
- redis-node-2
- redis-node-3
- redis-node-4
- redis-node-5
entrypoint: >
sh -c '
echo "Waiting for all Redis nodes to be ready...";
sleep 5;
echo "Creating Redis cluster...";
redis-cli --cluster create
redis-node-0:6379
redis-node-1:6379
redis-node-2:6379
redis-node-3:6379
redis-node-4:6379
redis-node-5:6379
--cluster-replicas 1
--cluster-yes
'
networks:
- appks-net
# etcd
etcd:
image: quay.io/coreos/etcd:v3.5
restart: unless-stopped
command: >
etcd
--name etcd
--data-dir /etcd-data
--listen-client-urls http://0.0.0.0:2379
--advertise-client-urls http://etcd:2379
--listen-peer-urls http://0.0.0.0:2380
--initial-advertise-peer-urls http://etcd:2380
--initial-cluster etcd=http://etcd:2380
--initial-cluster-token appks-etcd
--initial-cluster-state new
ports:
- "2379:2379"
- "2380:2380"
volumes:
- etcd_data:/etcd-data
networks:
- appks-net
# NATS
nats:
image: nats:2-alpine
restart: unless-stopped
command: --js --store_dir /data/nats
ports:
- "4222:4222"
- "8222:8222"
volumes:
- nats_data:/data/nats
networks:
- appks-net
# MinIO (S3-compatible storage)
minio:
image: minio/minio:latest
restart: unless-stopped
command: server /data --console-address ":9001"
environment:
MINIO_ROOT_USER: admin
MINIO_ROOT_PASSWORD: mysecret123
ports:
- "9000:9000"
- "9001:9001"
volumes:
- minio_data:/data
healthcheck:
test: ["CMD-SHELL", "mc ready local || exit 1"]
interval: 5s
timeout: 5s
retries: 5
networks:
- appks-net
# MinIO bucket auto-creation
minio-init:
image: minio/mc:latest
depends_on:
minio:
condition: service_healthy
entrypoint: >
sh -c '
mc alias set local http://minio:9000 admin mysecret123;
mc mb --ignore-existing local/appks;
echo "MinIO bucket appks ready";
exit 0
'
networks:
- appks-net
# Qdrant (vector database)
qdrant:
image: qdrant/qdrant:latest
restart: unless-stopped
ports:
- "6333:6333"
- "6334:6334"
volumes:
- qdrant_data:/qdrant/storage
networks:
- appks-net
volumes:
pg_data:
redis_data_0:
redis_data_1:
redis_data_2:
redis_data_3:
redis_data_4:
redis_data_5:
etcd_data:
nats_data:
minio_data:
qdrant_data:
networks:
appks-net:
driver: bridge
+1
View File
@@ -22,6 +22,7 @@ impl EtcdRegistry {
self.load_initial("mail").await?; self.load_initial("mail").await?;
self.spawn_watch("mail"); self.spawn_watch("mail");
} }
Ok(()) Ok(())
} }
+26
View File
@@ -116,6 +116,32 @@ impl EtcdRegistry {
ids.sort(); ids.sort();
ids ids
} }
/// Read config from etcd. Priority: etcd > env > default.
/// This is async but can be called from sync context via block_on.
pub async fn get_config(&self, key: &str, default: &str) -> String {
let etcd_key = format!("{}config/{}", self.inner.key_prefix, key);
let mut client = self.inner.client.lock().await;
if let Ok(resp) = client.get(etcd_key.as_str(), None).await {
if let Some(kv) = resp.kvs().first() {
if let Ok(v) = kv.value_str() {
if !v.is_empty() {
tracing::info!(key, value = v, "config from etcd");
return v.to_string();
}
}
}
}
drop(client);
// Fall back to env
if let Ok(v) = std::env::var(key) {
if !v.is_empty() {
return v;
}
}
default.to_string()
}
} }
/// Derive a deterministic UUID from a gitks storage_name. /// Derive a deterministic UUID from a gitks storage_name.
+19 -66
View File
@@ -2,12 +2,9 @@ use std::collections::HashMap;
use std::sync::atomic::Ordering; use std::sync::atomic::Ordering;
use etcd_client::PutOptions; use etcd_client::PutOptions;
use tokio_stream::StreamExt;
use crate::error::{AppError, AppResult}; use crate::error::{AppError, AppResult};
use super::EtcdRegistry; use super::EtcdRegistry;
use super::EtcdRegistryInner;
use super::types::ServiceInstance; use super::types::ServiceInstance;
impl EtcdRegistry { impl EtcdRegistry {
@@ -58,74 +55,30 @@ impl EtcdRegistry {
Ok(()) Ok(())
} }
fn spawn_keep_alive(&self, lease_id: i64, key: String) { fn spawn_keep_alive(&self, lease_id: i64, _key: String) {
let inner = self.inner.clone(); let inner = self.inner.clone();
let interval = self.inner.config.etcd_keep_alive_interval().unwrap_or(10);
tokio::spawn(async move { tokio::spawn(async move {
let (mut keeper, mut stream) = {
let mut client = inner.client.lock().await;
match client.lease_keep_alive(lease_id).await {
Ok(pair) => pair,
Err(e) => {
tracing::error!(lease_id, error = %e, "failed to start lease keepalive");
return;
}
}
};
let interval_secs = inner.config.etcd_keep_alive_interval().unwrap_or(10);
let mut interval = tokio::time::interval(std::time::Duration::from_secs(interval_secs));
loop { loop {
Self::run_keep_alive_stream(&inner, lease_id).await; interval.tick().await;
tokio::time::sleep(std::time::Duration::from_secs(interval)).await; if let Err(e) = keeper.keep_alive().await {
Self::renew_lease_and_reregister(&inner, lease_id, &key).await; tracing::warn!(lease_id, error = %e, "lease keepalive failed");
}
let _ = stream.message().await;
} }
}); });
} }
} }
impl EtcdRegistry {
async fn run_keep_alive_stream(
inner: &std::sync::Arc<EtcdRegistryInner>,
lease_id: i64,
) {
let result = {
let mut client = inner.client.lock().await;
client.lease_keep_alive(lease_id).await
};
match result {
Ok((_keeper, mut stream)) => {
while let Some(resp) = stream.next().await {
if let Err(e) = resp {
tracing::warn!(lease_id = lease_id, error = %e, "keep-alive stream error");
break;
}
}
}
Err(e) => {
tracing::warn!(lease_id = lease_id, error = %e, "keep-alive failed");
}
}
}
async fn renew_lease_and_reregister(
inner: &std::sync::Arc<EtcdRegistryInner>,
old_lease_id: i64,
key: &str,
) {
let re_grant = {
let mut client = inner.client.lock().await;
client
.lease_grant(inner.config.etcd_lease_ttl().unwrap_or(15) as i64, None)
.await
};
let Ok(current) = re_grant else {
return;
};
let new_lease = current.id();
inner.lease_id.store(new_lease, Ordering::SeqCst);
let instance = ServiceInstance {
addr: inner.config.rpc_self_listen_addr().unwrap_or_default(),
metadata: HashMap::new(),
};
if let Ok(value) = serde_json::to_string(&instance) {
let mut client = inner.client.lock().await;
let opts = PutOptions::new().with_lease(new_lease);
let _ = client.put(key, value, Some(opts)).await;
}
tracing::info!(old = old_lease_id, new = new_lease, "etcd lease renewed");
}
}
+8
View File
@@ -21,6 +21,8 @@ use crate::pb::im::member_service_server::MemberServiceServer;
use crate::pb::im::permission_service_server::PermissionServiceServer; use crate::pb::im::permission_service_server::PermissionServiceServer;
use crate::pb::im::stage_service_server::StageServiceServer; use crate::pb::im::stage_service_server::StageServiceServer;
use crate::pb::im::voice_service_server::VoiceServiceServer; use crate::pb::im::voice_service_server::VoiceServiceServer;
use tonic_health::ServingStatus;
use crate::service::AppService; use crate::service::AppService;
pub async fn start_grpc_server( pub async fn start_grpc_server(
@@ -34,9 +36,15 @@ pub async fn start_grpc_server(
let cs = channel_settings::ChannelSettingsServices::new(service); let cs = channel_settings::ChannelSettingsServices::new(service);
let (health_reporter, health_service) = tonic_health::server::health_reporter();
health_reporter
.set_service_status("", ServingStatus::Serving)
.await;
tracing::info!(%addr, "gRPC server listening"); tracing::info!(%addr, "gRPC server listening");
tonic::transport::Server::builder() tonic::transport::Server::builder()
.add_service(health_service)
.add_service(TokenServiceServer::new(token_svc)) .add_service(TokenServiceServer::new(token_svc))
.add_service(ChannelServiceServer::new(channel_svc)) .add_service(ChannelServiceServer::new(channel_svc))
.add_service(MemberServiceServer::new(member_svc)) .add_service(MemberServiceServer::new(member_svc))
-2
View File
@@ -39,11 +39,9 @@ async fn main() -> AppResult<()> {
let registry = Arc::new(EtcdRegistry::connect(&config).await?); let registry = Arc::new(EtcdRegistry::connect(&config).await?);
registry.start_discovery().await?; registry.start_discovery().await?;
if config.get_env_or("APP_ETCD_REGISTER_SELF", false)? {
registry registry
.register_self(&config.rpc_self_service_name()?) .register_self(&config.rpc_self_service_name()?)
.await?; .await?;
}
let nats = Arc::new(NatsQueue::connect(&config).await?); let nats = Arc::new(NatsQueue::connect(&config).await?);