diff --git a/CLAUDE.md b/CLAUDE.md index d304f4c1b..1d929bfea 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -714,6 +714,59 @@ for the manual conformance workflow. --- +## Observability (uptime, metrics, tracing) + +All four server binaries (`hfs`, `hts`, `sof-server`, `fhirpath-server`) share a +uniform observability surface via the `helios-observability` crate +(`crates/observability`). Each process installs a Prometheus recorder, tracks +uptime, instruments every request, and can export OTLP traces. + +### Endpoints + +| Endpoint | Description | +|----------|-------------| +| `GET /metrics` | Prometheus exposition format (text). Always available; needs no auth (on HFS it is merged outside the auth layer). | +| `GET /health` | Existing JSON liveness probe, now enriched with `uptime_seconds` and `started_at` (RFC 3339). | + +Exposed metrics (a global `service=""` label distinguishes processes +when scraped into one backend): + +- `http_requests_total{service,method,route,status}` — request counter. +- `http_request_duration_seconds{service,method,route,status}` — latency + histogram/summary. +- `uptime_seconds{service}` — process uptime gauge (refreshed on each scrape). + +The `route` label uses the **templated** path (e.g. `/{resource_type}/{id}`), +never raw URIs, to keep cardinality bounded. Tenant is recorded as a *trace span +attribute* (read from `X-Tenant-ID`), never as a metric label. + +### OTLP traces (feature-gated) + +OTLP trace export is behind the `otel` cargo feature (kept out of the default +build because the OTel SDK + tonic stack increases build time): + +```bash +# Build any server with OTLP trace export available +cargo build --bin hfs --features otel + +# Enable at runtime by pointing at a Collector / Jaeger (gRPC OTLP, :4317) +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \ + cargo run --bin hfs --features otel +``` + +Export is inert unless built with `--features otel` **and** +`OTEL_EXPORTER_OTLP_ENDPOINT` is set. Standard OTel env vars apply +(`OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_SERVICE_NAME`). The `/metrics` endpoint +and uptime need no configuration. + +**OTLP metrics** are produced out-of-process: point an OpenTelemetry Collector's +`prometheus` receiver at `/metrics` and forward OTLP. We intentionally do not use +`opentelemetry-prometheus` (unmaintained; carries a RUSTSEC advisory via +`protobuf`) — `/metrics` uses the maintained `metrics` + +`metrics-exporter-prometheus` stack instead. + +--- + ## Docker Generic Dockerfile supporting all server binaries via `BINARY_NAME` build arg: diff --git a/Cargo.lock b/Cargo.lock index db8c610af..baf73963e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3012,6 +3012,7 @@ dependencies = [ "futures", "helios-fhir", "helios-fhirpath-support", + "helios-observability", "hex", "http 1.4.0", "lazy_static", @@ -3053,6 +3054,7 @@ dependencies = [ "helios-audit", "helios-auth", "helios-fhir", + "helios-observability", "helios-persistence", "helios-rest", "helios-subscriptions", @@ -3081,6 +3083,7 @@ dependencies = [ "form_urlencoded", "futures", "helios-fhir", + "helios-observability", "helios-persistence", "r2d2", "r2d2_sqlite", @@ -3104,6 +3107,23 @@ dependencies = [ "zip 0.6.6", ] +[[package]] +name = "helios-observability" +version = "0.1.47" +dependencies = [ + "axum", + "chrono", + "http 1.4.0", + "metrics", + "metrics-exporter-prometheus", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + [[package]] name = "helios-persistence" version = "0.1.47" @@ -3166,6 +3186,7 @@ dependencies = [ "helios-audit", "helios-auth", "helios-fhir", + "helios-observability", "helios-persistence", "helios-serde", "helios-subscriptions", @@ -3232,6 +3253,7 @@ dependencies = [ "helios-fhir", "helios-fhirpath", "helios-fhirpath-support", + "helios-observability", "http 1.4.0", "mime", "object_store", @@ -3553,7 +3575,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.3", "system-configuration", "tokio", "tower-service", @@ -4248,6 +4270,46 @@ dependencies = [ "autocfg", ] +[[package]] +name = "metrics" +version = "0.24.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89550ee9f79e88fef3119de263694973a8adb26c21d75322164fb8c493039fe2" +dependencies = [ + "portable-atomic", + "rapidhash", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034" +dependencies = [ + "base64 0.22.1", + "indexmap 2.14.0", + "metrics", + "metrics-util", + "quanta", + "thiserror 1.0.69", +] + +[[package]] +name = "metrics-util" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.15.5", + "metrics", + "quanta", + "rand 0.9.4", + "rand_xoshiro", + "sketches-ddsketch", +] + [[package]] name = "mime" version = "0.3.17" @@ -4678,6 +4740,68 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "opentelemetry" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0142c63252a9e054e68a4c61a5778f7b14f576274d593f8ce883d191a099682" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9966929966d17620d7c316c643ba62631826e10021409357772d5eea84f62c35" +dependencies = [ + "http 1.4.0", + "opentelemetry", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "thiserror 2.0.18", + "tokio", + "tonic", + "tonic-types", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56d658ba1faf63f7b9c492cfbe6e0ec365440a16132d3270c1065f7b33f1b638" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", + "tonic-prost", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b59f80e1ac4d5ff7a2db8fb6c80badb7f0f3f858211fba08dd9aaec750894f9" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "opentelemetry", + "percent-encoding", + "portable-atomic", + "rand 0.9.4", + "thiserror 2.0.18", + "tokio", + "tokio-stream", +] + [[package]] name = "ordered-float" version = "2.10.1" @@ -5305,6 +5429,21 @@ dependencies = [ "serde", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -5338,7 +5477,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.5.10", + "socket2 0.6.3", "thiserror 2.0.18", "tokio", "tracing", @@ -5375,7 +5514,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -5511,6 +5650,33 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "rayon" version = "1.12.0" @@ -6325,6 +6491,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" + [[package]] name = "slab" version = "0.4.12" @@ -6945,6 +7117,17 @@ dependencies = [ "tonic", ] +[[package]] +name = "tonic-types" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a875a902255423d34c1f20838ab374126db8eb41625b7947a1d54113b0b7399" +dependencies = [ + "prost", + "prost-types", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -7063,6 +7246,22 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adbc64cba7137545b8044cb1fe9814f7aacf3c6b5f9b45be8bb5db538befdb26" +dependencies = [ + "js-sys", + "opentelemetry", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + [[package]] name = "tracing-subscriber" version = "0.3.23" diff --git a/Cargo.toml b/Cargo.toml index 0970e3514..f1830a0ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ default-members = [ "crates/auth", "crates/audit", "crates/subscriptions", + "crates/observability", ] resolver = "2" diff --git a/crates/fhirpath/Cargo.toml b/crates/fhirpath/Cargo.toml index f4f9b0ff0..29137ac28 100644 --- a/crates/fhirpath/Cargo.toml +++ b/crates/fhirpath/Cargo.toml @@ -17,9 +17,13 @@ R4B = ["helios-fhir/R4B"] R5 = ["helios-fhir/R5"] R6 = ["helios-fhir/R6"] +# OpenTelemetry OTLP trace export (heavy deps; opt-in for production images) +otel = ["helios-observability/otel"] + [dependencies] helios-fhir = { path = "../fhir", version = "0.1.47", default-features = false } helios-fhirpath-support = { path = "../fhirpath-support", version = "0.1.47" } +helios-observability = { path = "../observability", version = "0.1.47" } chumsky = "0.10" roxmltree = "0.20" chrono = { workspace = true } # For date/time parsing and functions diff --git a/crates/fhirpath/src/handlers.rs b/crates/fhirpath/src/handlers.rs index 5e2d0831a..60d59564f 100644 --- a/crates/fhirpath/src/handlers.rs +++ b/crates/fhirpath/src/handlers.rs @@ -459,6 +459,8 @@ pub async fn health_check() -> impl IntoResponse { "status": "ok", "service": "fhirpath-server", "version": env!("CARGO_PKG_VERSION"), + "uptime_seconds": helios_observability::uptime::uptime_seconds(), + "started_at": helios_observability::uptime::started_at_rfc3339(), "timestamp": Utc::now().to_rfc3339() })) } diff --git a/crates/fhirpath/src/server.rs b/crates/fhirpath/src/server.rs index 26df5cc40..07bef25ca 100644 --- a/crates/fhirpath/src/server.rs +++ b/crates/fhirpath/src/server.rs @@ -195,16 +195,11 @@ impl From for ServerConfig { /// Run the FHIRPath server pub async fn run_server(config: ServerConfig) -> Result<(), Box> { - // Initialize tracing - let filter = format!( - "fhirpath_server={},tower_http={}", - config.log_level, config.log_level - ); - tracing_subscriber::fmt() - .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| filter.into()), - ) - .init(); + // Initialize observability: logging/tracing (+ optional OTLP), uptime, and + // the Prometheus metrics recorder. + helios_observability::uptime::init(); + helios_observability::telemetry::init("fhirpath-server", &config.log_level); + helios_observability::metrics::init("fhirpath-server"); info!("Starting FHIRPath server..."); info!("Configuration: {:?}", config); @@ -268,6 +263,13 @@ pub fn create_app(config: &ServerConfig) -> Router { // Add tracing app = app.layer(TraceLayer::new_for_http()); + // Observability: `/metrics` (state-free) + per-request metrics/trace span. + app = app + .merge(helios_observability::metrics::router()) + .layer(axum::middleware::from_fn( + helios_observability::middleware::track, + )); + app } diff --git a/crates/hfs/Cargo.toml b/crates/hfs/Cargo.toml index 5dc3ec69e..b99eab6f5 100644 --- a/crates/hfs/Cargo.toml +++ b/crates/hfs/Cargo.toml @@ -42,6 +42,9 @@ subscriptions = ["helios-rest/subscriptions", "dep:helios-subscriptions"] # Cloud audit backends cloudwatch = ["helios-audit/cloudwatch"] +# OpenTelemetry OTLP trace export (heavy deps; opt-in for production images) +otel = ["helios-observability/otel"] + [build-dependencies] reqwest = { version = "0.12", features = ["blocking"] } @@ -52,6 +55,7 @@ helios-persistence = { path = "../persistence", version = "0.1.47", default-feat helios-auth = { path = "../auth", version = "0.1.47" } helios-audit = { path = "../audit", version = "0.1.47", default-features = false } helios-subscriptions = { path = "../subscriptions", version = "0.1.47", optional = true, default-features = false } +helios-observability = { path = "../observability", version = "0.1.47" } # Web framework axum = "0.8" diff --git a/crates/hfs/src/main.rs b/crates/hfs/src/main.rs index 9d7b3b403..049d0f5d0 100644 --- a/crates/hfs/src/main.rs +++ b/crates/hfs/src/main.rs @@ -25,9 +25,7 @@ use helios_audit::{ }; use helios_auth::{AuthConfig, InMemoryJtiCache, JtiCache, JwksBearerAuthProvider, JwksCache}; use helios_persistence::{BackendKind, ResourceStorage, TenantContext}; -use helios_rest::{ - AuthMiddlewareState, ServerConfig, StorageBackendMode, create_app_with_auth, init_logging, -}; +use helios_rest::{AuthMiddlewareState, ServerConfig, StorageBackendMode, create_app_with_auth}; use tracing::info; use helios_persistence::backends::local_fs::LocalFsOutputStore; @@ -554,6 +552,15 @@ async fn serve( ) -> anyhow::Result<()> { let addr = config.socket_addr(); info!(address = %addr, "Server listening"); + + // Observability: expose `/metrics` (outside the auth layer, so scrapers + // need no token) and instrument every request with metrics + a trace span. + let app = app + .merge(helios_observability::metrics::router()) + .layer(axum::middleware::from_fn( + helios_observability::middleware::track, + )); + let listener = tokio::net::TcpListener::bind(&addr).await?; axum::serve(listener, app) @@ -564,6 +571,8 @@ async fn serve( lifecycle::record_shutdown(&*state.sink, &state.config.source_observer).await; state.sink.flush().await; } + // Flush any buffered OTLP spans (no-op without the `otel` feature). + helios_observability::telemetry::shutdown(); }) .await?; Ok(()) @@ -738,7 +747,9 @@ async fn main() -> anyhow::Result<()> { // sub-structs — both `#[arg(skip)]` for clap — are populated from // their `HFS_*` environment variables. let config = ServerConfig::from_env(); - init_logging(&config.log_level); + helios_observability::uptime::init(); + helios_observability::telemetry::init("hfs", &config.log_level); + helios_observability::metrics::init("hfs"); if let Err(errors) = config.validate() { for error in &errors { diff --git a/crates/hts/Cargo.toml b/crates/hts/Cargo.toml index dd89937cf..72b7090a3 100644 --- a/crates/hts/Cargo.toml +++ b/crates/hts/Cargo.toml @@ -28,9 +28,13 @@ R4B = ["helios-fhir/R4B", "helios-persistence/R4B"] R5 = ["helios-fhir/R5", "helios-persistence/R5"] R6 = ["helios-fhir/R6", "helios-persistence/R6"] +# OpenTelemetry OTLP trace export (heavy deps; opt-in for production images) +otel = ["helios-observability/otel"] + [dependencies] helios-fhir = { workspace = true } helios-persistence = { path = "../persistence", version = "0.1.47", default-features = false, features = ["sqlite"] } +helios-observability = { path = "../observability", version = "0.1.47" } async-trait = "0.1" axum = { version = "0.8", features = ["json", "query"] } diff --git a/crates/hts/src/main.rs b/crates/hts/src/main.rs index e0d3a8b53..3addc5b01 100644 --- a/crates/hts/src/main.rs +++ b/crates/hts/src/main.rs @@ -14,14 +14,6 @@ use helios_hts::state::AppState; #[cfg(feature = "postgres")] use helios_hts::backends::PostgresTerminologyBackend; -fn init_logging(log_level: &str) { - use tracing_subscriber::{EnvFilter, fmt}; - - let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(log_level)); - - fmt().with_env_filter(filter).with_target(false).init(); -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); @@ -30,7 +22,9 @@ async fn main() -> anyhow::Result<()> { .unwrap_or_else(|| Command::Run(HtsConfig::parse_from(["hts"]))) { Command::Run(config) => { - init_logging(&config.log_level); + helios_observability::uptime::init(); + helios_observability::telemetry::init("hts", &config.log_level); + helios_observability::metrics::init("hts"); info!( port = config.port, host = %config.host, @@ -46,7 +40,7 @@ async fn main() -> anyhow::Result<()> { } else { &args.log_level }; - init_logging(log_level); + helios_observability::telemetry::init("hts", log_level); let code = run_import(args).await?; std::process::exit(code); } diff --git a/crates/hts/src/operations/health.rs b/crates/hts/src/operations/health.rs index 400c48d63..9fd8d197f 100644 --- a/crates/hts/src/operations/health.rs +++ b/crates/hts/src/operations/health.rs @@ -33,6 +33,8 @@ where "service": "hts", "version": env!("CARGO_PKG_VERSION"), "backend": state.backend().backend_name(), + "uptime_seconds": helios_observability::uptime::uptime_seconds(), + "started_at": helios_observability::uptime::started_at_rfc3339(), "timestamp": chrono::Utc::now().to_rfc3339(), })), ) diff --git a/crates/hts/src/server.rs b/crates/hts/src/server.rs index 415c23902..960db46c0 100644 --- a/crates/hts/src/server.rs +++ b/crates/hts/src/server.rs @@ -167,6 +167,11 @@ where .delete(delete_concept_map::), ) .with_state(state) + // Observability: `/metrics` (state-free) + per-request metrics/trace span. + .merge(helios_observability::metrics::router()) + .layer(axum::middleware::from_fn( + helios_observability::middleware::track, + )) .layer(cors) .layer(TimeoutLayer::with_status_code( axum::http::StatusCode::REQUEST_TIMEOUT, diff --git a/crates/observability/Cargo.toml b/crates/observability/Cargo.toml new file mode 100644 index 000000000..c9109e8c6 --- /dev/null +++ b/crates/observability/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "helios-observability" +version.workspace = true +edition.workspace = true +license.workspace = true +authors.workspace = true +repository.workspace = true +homepage = "https://github.com/HeliosSoftware/hfs/tree/main/crates/observability" +rust-version.workspace = true +description = "Shared observability wiring (uptime, Prometheus /metrics, OTLP traces) for Helios servers" +keywords = ["helios-software", "observability", "prometheus", "opentelemetry", "metrics"] +categories = ["web-programming"] + +[features] +default = [] + +# Opt-in OpenTelemetry: OTLP trace export via tracing-opentelemetry. Kept out of +# the default build because the OTel SDK + tonic stack noticeably increases build +# time. The Prometheus `/metrics` endpoint and uptime tracking are always built. +otel = [ + "dep:opentelemetry", + "dep:opentelemetry_sdk", + "dep:opentelemetry-otlp", + "dep:tracing-opentelemetry", +] + +[dependencies] +# Web framework (MatchedPath extractor needs the matched-path feature) +axum = { version = "0.8", features = ["matched-path"] } +http = "1.0" + +# Metrics facade + maintained Prometheus pull exporter +metrics = "0.24" +metrics-exporter-prometheus = { version = "0.16", default-features = false } + +# Logging / tracing +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } + +# Date/time for started_at +chrono.workspace = true + +# OpenTelemetry (feature-gated). Versions pinned to a mutually compatible set: +# core 0.32 / tracing-opentelemetry 0.33 (runs one minor ahead of core). +opentelemetry = { version = "0.32", optional = true } +opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true } +opentelemetry-otlp = { version = "0.32", default-features = false, features = ["grpc-tonic", "trace"], optional = true } +tracing-opentelemetry = { version = "0.33", optional = true } diff --git a/crates/observability/src/lib.rs b/crates/observability/src/lib.rs new file mode 100644 index 000000000..335bd4e58 --- /dev/null +++ b/crates/observability/src/lib.rs @@ -0,0 +1,40 @@ +//! Shared observability wiring for Helios servers. +//! +//! This crate centralizes the telemetry surface so the four Helios server +//! binaries (`hfs`, `hts`, `sof-server`, `fhirpath-server`) expose a uniform +//! set of endpoints and signals without duplicating wiring: +//! +//! - [`uptime`] — process start-time tracking, surfaced on `/health` and as the +//! `uptime_seconds` metric. +//! - [`metrics`] — global Prometheus recorder + a `GET /metrics` router. +//! - [`middleware`] — per-request count/latency metrics and a tracing span. +//! - [`telemetry`] — `tracing-subscriber` init, optionally bridged to OTLP +//! (feature `otel`). +//! +//! ## Typical wiring +//! +//! ```rust,ignore +//! // in main(), before serving: +//! helios_observability::uptime::init(); +//! helios_observability::telemetry::init("hfs", &log_level); +//! helios_observability::metrics::init("hfs"); +//! +//! let app = my_router() +//! .merge(helios_observability::metrics::router()) +//! .layer(axum::middleware::from_fn(helios_observability::middleware::track)); +//! +//! // on graceful shutdown: +//! helios_observability::telemetry::shutdown(); +//! ``` +//! +//! ## Design notes +//! +//! - `/metrics` uses the maintained `metrics` + `metrics-exporter-prometheus` +//! stack, not `opentelemetry-prometheus` (unmaintained, RUSTSEC advisory). +//! - OTLP *metrics* are expected to be produced by an OpenTelemetry Collector +//! scraping `/metrics`; the app itself only pushes OTLP *traces*. + +pub mod metrics; +pub mod middleware; +pub mod telemetry; +pub mod uptime; diff --git a/crates/observability/src/metrics.rs b/crates/observability/src/metrics.rs new file mode 100644 index 000000000..ae849cbd0 --- /dev/null +++ b/crates/observability/src/metrics.rs @@ -0,0 +1,58 @@ +//! Prometheus metrics: global recorder installation and the `/metrics` endpoint. +//! +//! Uses the [`metrics`] facade with the maintained +//! [`metrics_exporter_prometheus`] pull exporter. We deliberately avoid +//! `opentelemetry-prometheus` (unmaintained, carries a RUSTSEC advisory via +//! `protobuf`). OTLP *metrics* are produced out-of-process by an OpenTelemetry +//! Collector scraping this `/metrics` endpoint. + +use std::sync::OnceLock; + +use axum::{Router, http::StatusCode, response::IntoResponse, routing::get}; +use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle}; + +use crate::uptime; + +static HANDLE: OnceLock = OnceLock::new(); + +/// Install the global Prometheus recorder. A global `service=""` +/// label is attached so the generic metric names (`http_requests_total`, …) +/// stay distinct across the different Helios server processes when scraped into +/// one backend. Idempotent across repeated calls within a process: the first +/// call installs the recorder, later calls are no-ops (useful for tests). +/// +/// # Panics +/// +/// Panics only if a *different* global metrics recorder was already installed by +/// something other than this function. +pub fn init(service_name: &str) { + if HANDLE.get().is_some() { + return; + } + let handle = PrometheusBuilder::new() + .add_global_label("service", service_name) + .install_recorder() + .expect("failed to install Prometheus recorder"); + let _ = HANDLE.set(handle); +} + +/// A state-free [`Router`] exposing `GET /metrics`. Merge it into each server's +/// router with `router.merge(helios_observability::metrics::router())`. +pub fn router() -> Router { + Router::new().route("/metrics", get(render)) +} + +/// Render the Prometheus exposition text. The `uptime_seconds` gauge is set +/// immediately before rendering because the pull exporter has no scrape +/// callback. +async fn render() -> impl IntoResponse { + metrics::gauge!("uptime_seconds").set(uptime::uptime_seconds()); + match HANDLE.get() { + Some(handle) => handle.render().into_response(), + None => ( + StatusCode::SERVICE_UNAVAILABLE, + "metrics recorder not initialized\n", + ) + .into_response(), + } +} diff --git a/crates/observability/src/middleware.rs b/crates/observability/src/middleware.rs new file mode 100644 index 000000000..e940e5f1a --- /dev/null +++ b/crates/observability/src/middleware.rs @@ -0,0 +1,67 @@ +//! Per-request instrumentation middleware. +//! +//! Records request count and latency as Prometheus metrics and opens a tracing +//! span per request. Under the `otel` feature the span is exported as an OTLP +//! trace span by the `tracing-opentelemetry` layer (see [`crate::telemetry`]). +//! +//! Cardinality discipline: the `route` label uses axum's templated +//! [`MatchedPath`] (e.g. `/{resource_type}/{id}`), never the raw URI with +//! concrete IDs. The tenant is recorded as a *span attribute* only (useful for +//! per-tenant latency in traces) and is never a metric label, to avoid +//! unbounded Prometheus series. + +use std::time::Instant; + +use axum::{extract::MatchedPath, extract::Request, middleware::Next, response::Response}; +use tracing::Instrument; + +/// Tower/axum middleware (`axum::middleware::from_fn`) that instruments every +/// request. State-free, so it composes with any server's router. +pub async fn track(req: Request, next: Next) -> Response { + let method = req.method().clone(); + let route = req + .extensions() + .get::() + .map(|m| m.as_str().to_owned()) + .unwrap_or_else(|| "".to_owned()); + let tenant = req + .headers() + .get("x-tenant-id") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_owned(); + + let span = tracing::info_span!( + "http.request", + http.method = %method, + http.route = %route, + tenant = %tenant, + http.status_code = tracing::field::Empty, + ); + + let start = Instant::now(); + let response = next.run(req).instrument(span.clone()).await; + let elapsed = start.elapsed().as_secs_f64(); + let status = response.status().as_u16(); + + span.record("http.status_code", status); + + let method = method.as_str().to_owned(); + let status = status.to_string(); + metrics::counter!( + "http_requests_total", + "method" => method.clone(), + "route" => route.clone(), + "status" => status.clone(), + ) + .increment(1); + metrics::histogram!( + "http_request_duration_seconds", + "method" => method, + "route" => route, + "status" => status, + ) + .record(elapsed); + + response +} diff --git a/crates/observability/src/telemetry.rs b/crates/observability/src/telemetry.rs new file mode 100644 index 000000000..7233bdde8 --- /dev/null +++ b/crates/observability/src/telemetry.rs @@ -0,0 +1,90 @@ +//! Tracing-subscriber initialization, optionally bridged to OTLP. +//! +//! [`init`] replaces the ad-hoc `tracing_subscriber` setup each server used to +//! do itself. It always installs an `fmt` + `EnvFilter` subscriber. When built +//! with the `otel` feature **and** `OTEL_EXPORTER_OTLP_ENDPOINT` is set, it also +//! exports spans over OTLP via `tracing-opentelemetry`. Call [`shutdown`] during +//! graceful shutdown to flush buffered spans. + +use tracing_subscriber::{EnvFilter, fmt, prelude::*}; + +#[cfg(feature = "otel")] +static PROVIDER: std::sync::OnceLock = + std::sync::OnceLock::new(); + +/// Global default log directives when neither `RUST_LOG` nor the cli/env filter +/// is set. Applies `level` globally; deps remain overridable via `RUST_LOG`. +fn default_directives(level: &str) -> String { + level.to_string() +} + +/// Initialize logging/tracing for a server process. `service_name` is used as +/// the OTel resource service name (overridable by `OTEL_SERVICE_NAME`). Call +/// once per process. +pub fn init(service_name: &str, log_level: &str) { + let filter = EnvFilter::try_from_default_env() + .unwrap_or_else(|_| EnvFilter::new(default_directives(log_level))); + + #[cfg(feature = "otel")] + { + if let Some(provider) = build_otlp_tracer(service_name) { + use opentelemetry::trace::TracerProvider as _; + let tracer = provider.tracer("helios"); + tracing_subscriber::registry() + .with(filter) + .with(fmt::layer()) + .with(tracing_opentelemetry::layer().with_tracer(tracer)) + .init(); + let _ = PROVIDER.set(provider); + tracing::info!(service = service_name, "OTLP trace export enabled"); + return; + } + } + + tracing_subscriber::registry() + .with(filter) + .with(fmt::layer()) + .init(); + let _ = service_name; +} + +/// Flush and shut down the OTLP exporter, if one was configured. A no-op when +/// the `otel` feature is disabled or OTLP export was not configured. +pub fn shutdown() { + #[cfg(feature = "otel")] + if let Some(provider) = PROVIDER.get() { + let _ = provider.shutdown(); + } +} + +/// Build an OTLP-exporting tracer provider, or `None` when +/// `OTEL_EXPORTER_OTLP_ENDPOINT` is not set (export disabled). +#[cfg(feature = "otel")] +fn build_otlp_tracer(service_name: &str) -> Option { + use opentelemetry_otlp::SpanExporter; + use opentelemetry_sdk::{Resource, trace::SdkTracerProvider}; + + if std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").is_err() { + return None; + } + + let exporter = match SpanExporter::builder().with_tonic().build() { + Ok(exporter) => exporter, + Err(err) => { + tracing::warn!(error = %err, "failed to build OTLP span exporter; tracing export disabled"); + return None; + } + }; + + let service_name = + std::env::var("OTEL_SERVICE_NAME").unwrap_or_else(|_| service_name.to_string()); + let resource = Resource::builder().with_service_name(service_name).build(); + + let provider = SdkTracerProvider::builder() + .with_batch_exporter(exporter) + .with_resource(resource) + .build(); + + opentelemetry::global::set_tracer_provider(provider.clone()); + Some(provider) +} diff --git a/crates/observability/src/uptime.rs b/crates/observability/src/uptime.rs new file mode 100644 index 000000000..d8b5c1cdb --- /dev/null +++ b/crates/observability/src/uptime.rs @@ -0,0 +1,45 @@ +//! Process uptime tracking. +//! +//! [`init`] is called once at server startup to record the process start time. +//! The accessors are cheap and lock-free, suitable for calling on every +//! `/health` or `/metrics` request. + +use std::sync::OnceLock; +use std::time::Instant; + +static START_INSTANT: OnceLock = OnceLock::new(); +static STARTED_AT: OnceLock> = OnceLock::new(); + +/// Record the process start time. Idempotent — the first call wins, so calling +/// it more than once (e.g. from tests) is harmless. +pub fn init() { + let _ = START_INSTANT.set(Instant::now()); + let _ = STARTED_AT.set(chrono::Utc::now()); +} + +/// Seconds elapsed since [`init`] was called. Returns `0.0` if `init` was never +/// called (e.g. in a unit test that exercises a handler directly). +pub fn uptime_seconds() -> f64 { + START_INSTANT + .get() + .map(|start| start.elapsed().as_secs_f64()) + .unwrap_or(0.0) +} + +/// RFC 3339 timestamp of when the process started, or an empty string if +/// [`init`] was never called. +pub fn started_at_rfc3339() -> String { + STARTED_AT.get().map(|t| t.to_rfc3339()).unwrap_or_default() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn uptime_is_non_negative_after_init() { + init(); + assert!(uptime_seconds() >= 0.0); + assert!(!started_at_rfc3339().is_empty()); + } +} diff --git a/crates/rest/Cargo.toml b/crates/rest/Cargo.toml index 8af58a6f1..bcc8469aa 100644 --- a/crates/rest/Cargo.toml +++ b/crates/rest/Cargo.toml @@ -36,6 +36,9 @@ redis = ["helios-auth/redis"] # Subscriptions subscriptions = ["dep:helios-subscriptions"] +# OpenTelemetry OTLP trace export (forwards to helios-observability) +otel = ["helios-observability/otel"] + [dependencies] # Core dependencies helios-fhir = { path = "../fhir", version = "0.1.47", default-features = false } @@ -44,6 +47,7 @@ helios-serde = { path = "../serde", version = "0.1.47", optional = true, default helios-auth = { path = "../auth", version = "0.1.47" } helios-audit = { path = "../audit", version = "0.1.47", default-features = false } helios-subscriptions = { path = "../subscriptions", version = "0.1.47", optional = true, default-features = false } +helios-observability = { path = "../observability", version = "0.1.47" } # Async runtime tokio = { version = "1", features = ["full"] } diff --git a/crates/rest/src/handlers/health.rs b/crates/rest/src/handlers/health.rs index 01731e6bd..3e3606828 100644 --- a/crates/rest/src/handlers/health.rs +++ b/crates/rest/src/handlers/health.rs @@ -41,6 +41,8 @@ where "service": "hfs", "version": env!("CARGO_PKG_VERSION"), "backend": backend_name, + "uptime_seconds": helios_observability::uptime::uptime_seconds(), + "started_at": helios_observability::uptime::started_at_rfc3339(), "timestamp": chrono::Utc::now().to_rfc3339() }); diff --git a/crates/sof/Cargo.toml b/crates/sof/Cargo.toml index c857ba9b9..f3001168e 100644 --- a/crates/sof/Cargo.toml +++ b/crates/sof/Cargo.toml @@ -25,10 +25,14 @@ R4B = ["helios-fhir/R4B", "helios-fhirpath/R4B"] R5 = ["helios-fhir/R5", "helios-fhirpath/R5"] R6 = ["helios-fhir/R6", "helios-fhirpath/R6"] +# OpenTelemetry OTLP trace export (heavy deps; opt-in for production images) +otel = ["helios-observability/otel"] + [dependencies] clap = { version = "4.0", features = ["derive", "env"] } tokio = { version = "1.0", features = ["full"] } helios-fhir = { path = "../fhir", version = "0.1.47", default-features = false } +helios-observability = { path = "../observability", version = "0.1.47" } helios-fhirpath = { path = "../fhirpath", version = "0.1.47", default-features = false } helios-fhirpath-support = { path = "../fhirpath-support", version = "0.1.47" } serde = { workspace = true } diff --git a/crates/sof/src/handlers.rs b/crates/sof/src/handlers.rs index 29d8979a8..7f9b14caf 100644 --- a/crates/sof/src/handlers.rs +++ b/crates/sof/src/handlers.rs @@ -883,6 +883,8 @@ pub async fn health_check() -> impl IntoResponse { "status": "ok", "service": "sof-server", "version": env!("CARGO_PKG_VERSION"), + "uptime_seconds": helios_observability::uptime::uptime_seconds(), + "started_at": helios_observability::uptime::started_at_rfc3339(), "timestamp": chrono::Utc::now().to_rfc3339() })) } diff --git a/crates/sof/src/server.rs b/crates/sof/src/server.rs index 4564e7056..9be685970 100644 --- a/crates/sof/src/server.rs +++ b/crates/sof/src/server.rs @@ -158,16 +158,11 @@ async fn main() -> Result<(), Box> { // Parse command line arguments first to get log level let config = parse_args(); - // Initialize tracing subscriber for logging with configured level - let filter = format!( - "sof_server={},tower_http={}", - config.log_level, config.log_level - ); - tracing_subscriber::fmt() - .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| filter.into()), - ) - .init(); + // Initialize observability: logging/tracing (+ optional OTLP), uptime, and + // the Prometheus metrics recorder. + helios_observability::uptime::init(); + helios_observability::telemetry::init("sof-server", &config.log_level); + helios_observability::metrics::init("sof-server"); // Propagate SOF_TERMINOLOGY_SERVER to FHIRPATH_TERMINOLOGY_SERVER so that any // FHIRPath evaluation (memberOf, subsumes, etc.) delegates terminology @@ -337,6 +332,13 @@ fn create_app_with_config(config: &ServerConfig) -> Router { // Add tracing app = app.layer(TraceLayer::new_for_http()); + // Observability: `/metrics` (state-free) + per-request metrics/trace span. + app = app + .merge(helios_observability::metrics::router()) + .layer(axum::middleware::from_fn( + helios_observability::middleware::track, + )); + app }