From be9328d702e7e1d8228adbc28a0439d067a497ae Mon Sep 17 00:00:00 2001 From: Cody Littley Date: Tue, 19 May 2026 13:26:58 -0500 Subject: [PATCH] Convert littdb to use otel --- .../db_engine/litt/littbuilder/build_utils.go | 59 +--- sei-db/db_engine/litt/littbuilder/db_impl.go | 37 +- sei-db/db_engine/litt/littdb_config.go | 17 +- .../db_engine/litt/metrics/littdb_metrics.go | 329 ++++++++---------- sei-db/db_engine/litt/util/cache_metrics.go | 119 ++++--- 5 files changed, 258 insertions(+), 303 deletions(-) diff --git a/sei-db/db_engine/litt/littbuilder/build_utils.go b/sei-db/db_engine/litt/littbuilder/build_utils.go index c97e9cc2f2..a9abc18309 100644 --- a/sei-db/db_engine/litt/littbuilder/build_utils.go +++ b/sei-db/db_engine/litt/littbuilder/build_utils.go @@ -1,17 +1,13 @@ package littbuilder import ( + "context" "fmt" "log/slog" - "net/http" "os" "path" - "strings" - "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/collectors" - "github.com/prometheus/client_golang/prometheus/promhttp" + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/dbcache" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable" @@ -243,45 +239,26 @@ func buildLogger(config *litt.Config) *slog.Logger { return slog.Default() } -// buildMetrics creates a new metrics object based on the configuration. If the returned server is not nil, -// then it is the responsibility of the caller to eventually call server.Shutdown(). -func buildMetrics(config *litt.Config, logger *slog.Logger) (*metrics.LittDBMetrics, *http.Server) { +// buildMetrics creates a new metrics object backed by the global OTel +// MeterProvider. When MetricsEnabled is true, this configures the global +// provider with a Prometheus exporter and starts an HTTP server on +// MetricsPort that serves /metrics. The returned shutdown function flushes +// the provider; it is the responsibility of the caller to invoke it during +// teardown. +func buildMetrics(config *litt.Config, logger *slog.Logger) (*metrics.LittDBMetrics, func(context.Context) error) { if !config.MetricsEnabled { return nil, nil } - var registry *prometheus.Registry - var server *http.Server - - if config.MetricsEnabled { - if config.MetricsRegistry == nil { - registry = prometheus.NewRegistry() - registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) - registry.MustRegister(collectors.NewGoCollector()) - - logger.Info("Starting metrics server", "port", config.MetricsPort) - addr := fmt.Sprintf(":%d", config.MetricsPort) - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.HandlerFor( - registry, - promhttp.HandlerOpts{}, - )) - server = &http.Server{ - Addr: addr, - Handler: mux, - ReadHeaderTimeout: 10 * time.Second, - } - - go func() { - err := server.ListenAndServe() - if err != nil && !strings.Contains(err.Error(), "http: Server closed") { - logger.Error("metrics server error", "error", err) - } - }() - } else { - registry = config.MetricsRegistry - } + reg, shutdown, err := commonmetrics.SetupOtelPrometheus() + if err != nil { + logger.Error("failed to set up OTel Prometheus exporter", "error", err) + return nil, nil } - return metrics.NewLittDBMetrics(registry, config.MetricsNamespace), server + addr := fmt.Sprintf(":%d", config.MetricsPort) + logger.Info("Starting metrics server", "port", config.MetricsPort) + commonmetrics.StartMetricsServer(config.CTX, reg, addr) + + return metrics.NewLittDBMetrics(), shutdown } diff --git a/sei-db/db_engine/litt/littbuilder/db_impl.go b/sei-db/db_engine/litt/littbuilder/db_impl.go index 949aa1de53..4dc2809f1a 100644 --- a/sei-db/db_engine/litt/littbuilder/db_impl.go +++ b/sei-db/db_engine/litt/littbuilder/db_impl.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log/slog" - "net/http" "sync" "sync/atomic" "time" @@ -53,8 +52,8 @@ type db struct { // Metrics for the database. metrics *metrics.LittDBMetrics - // The HTTP server for metrics. nil if metrics are disabled or if an external party is managing the server. - metricsServer *http.Server + // Shuts down the OTel MeterProvider configured by buildMetrics. nil if metrics are disabled. + metricsShutdown func(context.Context) error // A function that releases file locks. releaseLocks func() @@ -125,9 +124,9 @@ func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, e } var dbMetrics *metrics.LittDBMetrics - var metricsServer *http.Server + var metricsShutdown func(context.Context) error if config.MetricsEnabled { - dbMetrics, metricsServer = buildMetrics(config, config.Logger) + dbMetrics, metricsShutdown = buildMetrics(config, config.Logger) } if config.SnapshotDirectory != "" { @@ -136,16 +135,16 @@ func NewDBUnsafe(config *litt.Config, tableBuilder TableBuilderFunc) (litt.DB, e } database := &db{ - ctx: config.CTX, - logger: config.Logger, - clock: config.Clock, - ttl: config.TTL, - gcPeriod: config.GCPeriod, - tableBuilder: tableBuilder, - tables: make(map[string]litt.ManagedTable), - metrics: dbMetrics, - metricsServer: metricsServer, - releaseLocks: releaseLocks, + ctx: config.CTX, + logger: config.Logger, + clock: config.Clock, + ttl: config.TTL, + gcPeriod: config.GCPeriod, + tableBuilder: tableBuilder, + tables: make(map[string]litt.ManagedTable), + metrics: dbMetrics, + metricsShutdown: metricsShutdown, + releaseLocks: releaseLocks, } if config.MetricsEnabled { @@ -281,11 +280,13 @@ func (d *db) Destroy() error { // gatherMetrics is a method that periodically collects metrics. func (d *db) gatherMetrics(interval time.Duration) { - if d.metricsServer != nil { + if d.metricsShutdown != nil { defer func() { - err := d.metricsServer.Close() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + err := d.metricsShutdown(shutdownCtx) if err != nil { - d.logger.Error("error closing metrics server", "error", err) + d.logger.Error("error shutting down metrics provider", "error", err) } }() } diff --git a/sei-db/db_engine/litt/littdb_config.go b/sei-db/db_engine/litt/littdb_config.go index 4c5594c4de..ec2e29d0cd 100644 --- a/sei-db/db_engine/litt/littdb_config.go +++ b/sei-db/db_engine/litt/littdb_config.go @@ -7,7 +7,6 @@ import ( "math" "time" - "github.com/prometheus/client_golang/prometheus" "github.com/sei-protocol/sei-chain/sei-db/common/unit" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/disktable/keymap" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/util" @@ -117,17 +116,12 @@ type Config struct { // than keymap.MemKeymapType, performing this check may be very expensive. By default, this is false. DoubleWriteProtection bool - // If enabled, collect DB metrics and export them to prometheus. By default, this is false. + // If enabled, collect DB metrics and export them via the global OTel MeterProvider. By default, this is false. + // When enabled, the database configures a Prometheus exporter on the global provider and serves /metrics on + // MetricsPort. MetricsEnabled bool - // The namespace to use for metrics. If empty, the default namespace "litt" is used. - MetricsNamespace string - - // The prometheus registry to use for metrics. If nil and metrics are enabled, a new registry is created. - MetricsRegistry *prometheus.Registry - - // The port to use for the metrics server. Ignored if MetricsEnabled is false or MetricsRegistry is not nil. - // The default is 9101. + // The port to use for the metrics server. Ignored if MetricsEnabled is false. The default is 9101. MetricsPort int // The interval at which various DB metrics are updated. The default is 1 second. @@ -194,7 +188,6 @@ func DefaultConfigNoPaths() *Config { Fsync: true, DoubleWriteProtection: false, MetricsEnabled: false, - MetricsNamespace: "litt", MetricsPort: 9101, MetricsUpdateInterval: time.Second, PurgeLocks: false, @@ -258,7 +251,7 @@ func (c *Config) SanityCheck() error { if c.GCPeriod == 0 { return fmt.Errorf("gc period must be at least 1") } - if (c.MetricsEnabled || c.MetricsRegistry != nil) && c.MetricsUpdateInterval == 0 { + if c.MetricsEnabled && c.MetricsUpdateInterval == 0 { return fmt.Errorf("metrics update interval must be at least 1 if metrics are enabled") } diff --git a/sei-db/db_engine/litt/metrics/littdb_metrics.go b/sei-db/db_engine/litt/metrics/littdb_metrics.go index 6e41b6c44a..c7d1aa468c 100644 --- a/sei-db/db_engine/litt/metrics/littdb_metrics.go +++ b/sei-db/db_engine/litt/metrics/littdb_metrics.go @@ -1,10 +1,14 @@ package metrics import ( + "context" "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt" "github.com/sei-protocol/sei-chain/sei-db/db_engine/litt/util" ) @@ -22,56 +26,64 @@ import ( // - segment creation rate // - used/unused segment space (useful for detecting shard assignment issues) -// LittDBMetrics encapsulates metrics for a LittDB. +const littMeterName = "litt" + +// LittDBMetrics encapsulates metrics for a LittDB. Metrics are exported via +// whatever exporter is configured on the global OTel MeterProvider (e.g. +// Prometheus, OTLP). The caller is responsible for setting up the provider +// before calling NewLittDBMetrics (see commonmetrics.SetupOtelPrometheus). +// +// Per-table observations are tagged with a "table" attribute. A nil +// LittDBMetrics acts as a no-op for all Report* methods. type LittDBMetrics struct { // The size of individual tables in the database. - tableSizeInBytes *prometheus.GaugeVec + tableSizeInBytes metric.Int64Gauge // The number of keys in individual tables in the database. - tableKeyCount *prometheus.GaugeVec + tableKeyCount metric.Int64Gauge // The number of bytes read from disk since startup. - bytesReadCounter *prometheus.CounterVec + bytesReadCounter metric.Int64Counter // The number of keys read from disk since startup. - keysReadCounter *prometheus.CounterVec + keysReadCounter metric.Int64Counter // The number of cache hits since startup. - cacheHitCounter *prometheus.CounterVec + cacheHitCounter metric.Int64Counter // The number of cache misses since startup. - cacheMissCounter *prometheus.CounterVec + cacheMissCounter metric.Int64Counter // Reports on the read latency of the database. This metric includes both cache hits and cache misses. - readLatency *prometheus.SummaryVec + readLatency metric.Float64Histogram // Reports on the write latency of the database, but only measures the time to read a value when a // cache miss occurs. - cacheMissLatency *prometheus.SummaryVec + cacheMissLatency metric.Float64Histogram // The number of bytes written to disk since startup. Only includes values, not metadata. - bytesWrittenCounter *prometheus.CounterVec + bytesWrittenCounter metric.Int64Counter // The number of keys written to disk since startup. - keysWrittenCounter *prometheus.CounterVec + keysWrittenCounter metric.Int64Counter // Reports on the write latency of the database. - writeLatency *prometheus.SummaryVec + writeLatency metric.Float64Histogram // The number of times a flush operation has been performed. - flushCount *prometheus.CounterVec + flushCount metric.Int64Counter // Reports on the latency of a flush operation. - flushLatency *prometheus.SummaryVec + flushLatency metric.Float64Histogram // Reports on the latency of a flushing segment files. This is a subset of the time spent during a flush operation. - segmentFlushLatency *prometheus.SummaryVec + segmentFlushLatency metric.Float64Histogram // Reports on the latency of a keymap flush operation. This is a subset of the time spent during a flush operation. - keymapFlushLatency *prometheus.SummaryVec + keymapFlushLatency metric.Float64Histogram - // The latency of garbage collection operations.1 - garbageCollectionLatency *prometheus.SummaryVec + // The latency of garbage collection operations. + garbageCollectionLatency metric.Float64Histogram // Metrics for the write cache. writeCacheMetrics *util.CacheMetrics @@ -80,179 +92,124 @@ type LittDBMetrics struct { readCacheMetrics *util.CacheMetrics } -// NewLittDBMetrics creates a new LittDBMetrics instance. -func NewLittDBMetrics(registry *prometheus.Registry, namespace string) *LittDBMetrics { - if registry == nil { - return nil - } - - objectives := map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001} - - tableSizeInBytes := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "table_size_bytes", - Help: "The size of individual tables in the database in bytes.", - }, - []string{"table"}, - ) - - tableKeyCount := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "table_key_count", - Help: "The number of keys in individual tables in the database.", - }, - []string{"table"}, +// NewLittDBMetrics creates a new LittDBMetrics instance backed by the global +// OTel MeterProvider. The caller must configure a MeterProvider with a +// Prometheus or other exporter before calling this (e.g. via +// commonmetrics.SetupOtelPrometheus). +func NewLittDBMetrics() *LittDBMetrics { + meter := otel.Meter(littMeterName) + + tableSizeInBytes, _ := meter.Int64Gauge( + "litt_table_size_bytes", + metric.WithDescription("The size of individual tables in the database in bytes."), + metric.WithUnit("By"), ) - bytesReadCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "bytes_read", - Help: "The number of bytes read from disk since startup.", - }, - []string{"table"}, + tableKeyCount, _ := meter.Int64Gauge( + "litt_table_key_count", + metric.WithDescription("The number of keys in individual tables in the database."), + metric.WithUnit("{count}"), ) - keysReadCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "keys_read", - Help: "The number of keys read from disk since startup.", - }, - []string{"table"}, + bytesReadCounter, _ := meter.Int64Counter( + "litt_bytes_read", + metric.WithDescription("The number of bytes read from disk since startup."), + metric.WithUnit("By"), ) - cacheHitCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "cache_hits", - Help: "The number of cache hits since startup.", - }, - []string{"table"}, + keysReadCounter, _ := meter.Int64Counter( + "litt_keys_read", + metric.WithDescription("The number of keys read from disk since startup."), + metric.WithUnit("{count}"), ) - cacheMissCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "cache_misses", - Help: "The number of cache misses since startup.", - }, - []string{"table"}, + cacheHitCounter, _ := meter.Int64Counter( + "litt_cache_hits", + metric.WithDescription("The number of cache hits since startup."), + metric.WithUnit("{count}"), ) - readLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "read_latency_ms", - Help: "Reports on the read latency of the database. " + - "This metric includes both cache hits and cache misses.", - Objectives: objectives, - }, - []string{"table"}, + cacheMissCounter, _ := meter.Int64Counter( + "litt_cache_misses", + metric.WithDescription("The number of cache misses since startup."), + metric.WithUnit("{count}"), ) - cacheMissLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "cache_miss_latency_ms", - Help: "Reports on the write latency of the database, " + - "but only measures the time to read a value when a cache miss occurs.", - Objectives: objectives, - }, - []string{"table"}, + readLatency, _ := meter.Float64Histogram( + "litt_read_latency_seconds", + metric.WithDescription( + "Reports on the read latency of the database. "+ + "This metric includes both cache hits and cache misses."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - bytesWrittenCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "bytes_written", - Help: "The number of bytes written to disk since startup. Only includes values, not metadata.", - }, - []string{"table"}, + cacheMissLatency, _ := meter.Float64Histogram( + "litt_cache_miss_latency_seconds", + metric.WithDescription( + "Reports on the read latency of the database, "+ + "but only measures the time to read a value when a cache miss occurs."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - keysWrittenCounter := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "keys_written", - Help: "The number of keys written to disk since startup.", - }, - []string{"table"}, + bytesWrittenCounter, _ := meter.Int64Counter( + "litt_bytes_written", + metric.WithDescription("The number of bytes written to disk since startup. Only includes values, not metadata."), + metric.WithUnit("By"), ) - writeLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "write_latency_ms", - Help: "Reports on the write latency of the database.", - Objectives: objectives, - }, - []string{"table"}, + keysWrittenCounter, _ := meter.Int64Counter( + "litt_keys_written", + metric.WithDescription("The number of keys written to disk since startup."), + metric.WithUnit("{count}"), ) - flushCount := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "flush_count", - Help: "The number of times a flush operation has been performed.", - }, - []string{"table"}, + writeLatency, _ := meter.Float64Histogram( + "litt_write_latency_seconds", + metric.WithDescription("Reports on the write latency of the database."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - flushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "flush_latency_ms", - Help: "Reports on the latency of a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + flushCount, _ := meter.Int64Counter( + "litt_flush_count", + metric.WithDescription("The number of times a flush operation has been performed."), + metric.WithUnit("{count}"), ) - segmentFlushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "segment_flush_latency_ms", - Help: "Reports on segment flush latency. This is a subset of the time spent during a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + flushLatency, _ := meter.Float64Histogram( + "litt_flush_latency_seconds", + metric.WithDescription("Reports on the latency of a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - keymapFlushLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "keymap_flush_latency_ms", - Help: "Reports on the latency of a keymap flush operation. " + - "This is a subset of the time spent during a flush operation.", - Objectives: objectives, - }, - []string{"table"}, + segmentFlushLatency, _ := meter.Float64Histogram( + "litt_segment_flush_latency_seconds", + metric.WithDescription("Reports on segment flush latency. This is a subset of the time spent during a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - garbageCollectionLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "garbage_collection_latency_ms", - Help: "Reports on the latency of garbage collection operations.", - Objectives: objectives, - }, - []string{"table"}, + keymapFlushLatency, _ := meter.Float64Histogram( + "litt_keymap_flush_latency_seconds", + metric.WithDescription( + "Reports on the latency of a keymap flush operation. "+ + "This is a subset of the time spent during a flush operation."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - writeCacheMetrics := util.NewCacheMetrics( - registry, - namespace, - "chunk_write", + garbageCollectionLatency, _ := meter.Float64Histogram( + "litt_garbage_collection_latency_seconds", + metric.WithDescription("Reports on the latency of garbage collection operations."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) - readCacheMetrics := util.NewCacheMetrics( - registry, - namespace, - "chunk_read", - ) + writeCacheMetrics := util.NewCacheMetrics("chunk_write") + readCacheMetrics := util.NewCacheMetrics("chunk_read") return &LittDBMetrics{ tableSizeInBytes: tableSizeInBytes, @@ -276,6 +233,13 @@ func NewLittDBMetrics(registry *prometheus.Registry, namespace string) *LittDBMe } } +// tableAttr returns the OTel measurement option that tags an observation with +// the given table name. Allocated per call rather than cached because callers +// pass arbitrary table names; for hot-path call sites consider caching upstream. +func tableAttr(tableName string) metric.MeasurementOption { + return metric.WithAttributes(attribute.String("table", tableName)) +} + // CollectPeriodicMetrics is a method that is periodically called to collect metrics. Tables are not permitted to be // added or dropped while this method is running. func (m *LittDBMetrics) CollectPeriodicMetrics(tables map[string]litt.ManagedTable) { @@ -283,14 +247,16 @@ func (m *LittDBMetrics) CollectPeriodicMetrics(tables map[string]litt.ManagedTab return } + ctx := context.Background() for _, table := range tables { tableName := table.Name() + attrs := tableAttr(tableName) tableSize := table.Size() - m.tableSizeInBytes.WithLabelValues(tableName).Set(float64(tableSize)) + m.tableSizeInBytes.Record(ctx, int64(tableSize), attrs) //nolint:gosec // table size fits int64 tableKeyCount := table.KeyCount() - m.tableKeyCount.WithLabelValues(tableName).Set(float64(tableKeyCount)) + m.tableKeyCount.Record(ctx, int64(tableKeyCount), attrs) //nolint:gosec // key count fits int64 } } @@ -305,15 +271,18 @@ func (m *LittDBMetrics) ReportReadOperation( return } - m.bytesReadCounter.WithLabelValues(tableName).Add(float64(dataSize)) - m.keysReadCounter.WithLabelValues(tableName).Inc() - m.readLatency.WithLabelValues(tableName).Observe(latency.Seconds()) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.bytesReadCounter.Add(ctx, int64(dataSize), attrs) //nolint:gosec // data size fits int64 + m.keysReadCounter.Add(ctx, 1, attrs) + m.readLatency.Record(ctx, latency.Seconds(), attrs) if cacheHit { - m.cacheHitCounter.WithLabelValues(tableName).Inc() + m.cacheHitCounter.Add(ctx, 1, attrs) } else { - m.cacheMissCounter.WithLabelValues(tableName).Inc() - m.cacheMissLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.cacheMissCounter.Add(ctx, 1, attrs) + m.cacheMissLatency.Record(ctx, latency.Seconds(), attrs) } } @@ -328,9 +297,12 @@ func (m *LittDBMetrics) ReportWriteOperation( return } - m.bytesWrittenCounter.WithLabelValues(tableName).Add(float64(dataSize)) - m.keysWrittenCounter.WithLabelValues(tableName).Add(float64(batchSize)) - m.writeLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.bytesWrittenCounter.Add(ctx, int64(dataSize), attrs) //nolint:gosec // data size fits int64 + m.keysWrittenCounter.Add(ctx, int64(batchSize), attrs) //nolint:gosec // batch size fits int64 + m.writeLatency.Record(ctx, latency.Seconds(), attrs) } // ReportFlushOperation reports the results of a flush operation. @@ -339,8 +311,11 @@ func (m *LittDBMetrics) ReportFlushOperation(tableName string, latency time.Dura return } - m.flushCount.WithLabelValues(tableName).Inc() - m.flushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + ctx := context.Background() + attrs := tableAttr(tableName) + + m.flushCount.Add(ctx, 1, attrs) + m.flushLatency.Record(ctx, latency.Seconds(), attrs) } // ReportSegmentFlushLatency reports the amount of time taken to flush value files. @@ -349,7 +324,7 @@ func (m *LittDBMetrics) ReportSegmentFlushLatency(tableName string, latency time return } - m.segmentFlushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.segmentFlushLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } // ReportKeymapFlushLatency reports the amount of time taken to flush the keymap. @@ -358,7 +333,7 @@ func (m *LittDBMetrics) ReportKeymapFlushLatency(tableName string, latency time. return } - m.keymapFlushLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.keymapFlushLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } // ReportGarbageCollectionLatency reports the latency of a garbage collection operation. @@ -367,7 +342,7 @@ func (m *LittDBMetrics) ReportGarbageCollectionLatency(tableName string, latency return } - m.garbageCollectionLatency.WithLabelValues(tableName).Observe(util.ToMilliseconds(latency)) + m.garbageCollectionLatency.Record(context.Background(), latency.Seconds(), tableAttr(tableName)) } func (m *LittDBMetrics) GetWriteCacheMetrics() *util.CacheMetrics { diff --git a/sei-db/db_engine/litt/util/cache_metrics.go b/sei-db/db_engine/litt/util/cache_metrics.go index 572f8e3327..e6f92ca620 100644 --- a/sei-db/db_engine/litt/util/cache_metrics.go +++ b/sei-db/db_engine/litt/util/cache_metrics.go @@ -1,74 +1,81 @@ package util import ( + "context" "time" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + commonmetrics "github.com/sei-protocol/sei-chain/sei-db/common/metrics" ) -// CacheMetrics is a struct that holds metrics for a cache. A nil CacheMetrics instance acts as a no-op. +const cacheMeterName = "litt" + +// CacheMetrics is a struct that holds OTel metrics for a cache. A nil +// CacheMetrics instance acts as a no-op for all report* methods. +// +// Multiple CacheMetrics instances may be created for the same process; each +// receives references to the same underlying instruments because OTel +// instrument registration is idempotent. The "cache" attribute (set at +// construction time) distinguishes series in the exporter +// (e.g. litt_chunk_cache_keys_added{cache="chunk_read"}). type CacheMetrics struct { - keyCount *prometheus.GaugeVec - weight *prometheus.GaugeVec - keysAdded *prometheus.CounterVec - weightAdded *prometheus.CounterVec - evictionLatency *prometheus.SummaryVec + // Pre-computed attribute option reused on every recording to avoid + // per-call allocations on the hot path. + attrs metric.MeasurementOption + + keyCount metric.Int64Gauge + weight metric.Int64Gauge + keysAdded metric.Int64Counter + weightAdded metric.Int64Counter + evictionLatency metric.Float64Histogram } -// NewCacheMetrics creates a new CacheMetrics instance. If the registry is nil, it returns nil. -// The cacheName does not need to include the suffix "_cache" as this is added automatically. -func NewCacheMetrics(registry *prometheus.Registry, namespace string, cacheName string) *CacheMetrics { - if registry == nil { - return nil - } - - evictionLatency := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: cacheName + "_cache_eviction_latency_ms", - Help: "Reports on the eviction latency of the cache.", - }, - []string{}, +// NewCacheMetrics creates a new CacheMetrics that records via the global OTel +// MeterProvider. The cacheName is attached as the "cache" attribute on every +// observation, allowing multiple cache instances to be distinguished in the +// exporter (for example "chunk_read" vs "chunk_write"). +// +// The caller must have configured a MeterProvider before calling this (e.g. +// commonmetrics.SetupOtelPrometheus). +func NewCacheMetrics(cacheName string) *CacheMetrics { + meter := otel.Meter(cacheMeterName) + + keyCount, _ := meter.Int64Gauge( + "litt_chunk_cache_key_count", + metric.WithDescription("Reports on the number of keys in the cache."), + metric.WithUnit("{count}"), ) - keyCount := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: cacheName + "_cache_key_count", - Help: "Reports on the number of keys in the cache", - }, - []string{}, + weight, _ := meter.Int64Gauge( + "litt_chunk_cache_weight_bytes", + metric.WithDescription("Reports on the weight of the cache in bytes."), + metric.WithUnit("By"), ) - weight := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: cacheName + "_cache_weight", - Help: "Reports on the weight of the cache", - }, - []string{}, + keysAdded, _ := meter.Int64Counter( + "litt_chunk_cache_keys_added", + metric.WithDescription("Reports on the number of keys added to the cache."), + metric.WithUnit("{count}"), ) - keysAdded := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: cacheName + "_cache_keys_added", - Help: "Reports on the number of keys added to the cache", - }, - []string{}, + weightAdded, _ := meter.Int64Counter( + "litt_chunk_cache_weight_added_bytes", + metric.WithDescription("Reports on the weight of the entries added to the cache."), + metric.WithUnit("By"), ) - weightAdded := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: cacheName + "_cache_weight_added", - Help: "Reports on the weight of the entries added to the cache", - }, - []string{}, + evictionLatency, _ := meter.Float64Histogram( + "litt_chunk_cache_eviction_latency_seconds", + metric.WithDescription("Reports on the eviction latency of the cache."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(commonmetrics.LatencyBuckets...), ) return &CacheMetrics{ + attrs: metric.WithAttributes(attribute.String("cache", cacheName)), keyCount: keyCount, weight: weight, keysAdded: keysAdded, @@ -83,8 +90,9 @@ func (m *CacheMetrics) reportInsertion(weight uint64) { return } - m.keysAdded.WithLabelValues().Inc() - m.weightAdded.WithLabelValues().Add(float64(weight)) + ctx := context.Background() + m.keysAdded.Add(ctx, 1, m.attrs) + m.weightAdded.Add(ctx, int64(weight), m.attrs) //nolint:gosec // weight fits int64 } // reportEviction is used to report an entry being evicted from the cache. @@ -93,7 +101,7 @@ func (m *CacheMetrics) reportEviction(age time.Duration) { return } - m.evictionLatency.WithLabelValues().Observe(ToMilliseconds(age)) + m.evictionLatency.Record(context.Background(), age.Seconds(), m.attrs) } // reportCurrentSize is used to report the current size/weight of the cache. @@ -102,6 +110,7 @@ func (m *CacheMetrics) reportCurrentSize(size int, weight uint64) { return } - m.keyCount.WithLabelValues().Set(float64(size)) - m.weight.WithLabelValues().Set(float64(weight)) + ctx := context.Background() + m.keyCount.Record(ctx, int64(size), m.attrs) //nolint:gosec // size fits int64 + m.weight.Record(ctx, int64(weight), m.attrs) //nolint:gosec // weight fits int64 }