From d679a4c6b7950729935f27949db34038f276f6d5 Mon Sep 17 00:00:00 2001 From: Pavan Kumar Date: Thu, 21 May 2026 01:01:56 +0530 Subject: [PATCH] fix(metrics): bind meter to our provider directly, bypass single-call global MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to vcon-dev/vcon-server#178. The fork-safe init landed, but SignOz queries still showed only one worker's value (peak inflight 128 across 4 worker processes instead of the expected ~512). Root cause: the user-provided OTel resource (``host.name``, ``service.instance.id``) never reached the metrics backend. OpenTelemetry Python's ``metrics.set_meter_provider()`` is single-call. When ``opentelemetry-instrumentation`` (auto-instrumentation) is active in the process, it registers a default MeterProvider early in startup — before this module's lazy ``_init_otel_metrics()`` runs. Our subsequent ``set_meter_provider(our_provider)`` call is silently ignored, and ``metrics.get_meter(__name__)`` returns the global proxy bound to the auto-instrumentation's provider — whose resource we don't control. The visible CH symptom: ``resource_attrs`` on every conserver.* metric only contains ``service.name`` + ``telemetry.sdk.*`` + ``telemetry.auto.version``; ``host.name`` and ``service.instance.id`` that this module sets are dropped. All four worker processes collapse onto a single fingerprint, last-write-wins. Fix: bind ``meter`` to OUR provider directly via ``provider.get_meter(__name__)``, never touching the global. Our provider has the correct resource and its own export pipeline to the OTel collector. Auto-instrumentation metrics continue to flow through their own pipeline independently — both arrive at the collector. Tests updated to assert ``set_meter_provider`` is NOT called and the ``meter`` global is bound to the provider's own meter, not the global proxy. Co-Authored-By: Claude Opus 4.7 (1M context) --- common/lib/metrics.py | 41 ++++++++++++++++++++---- common/tests/test_otel_fork_safe_init.py | 24 +++++++++++--- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/common/lib/metrics.py b/common/lib/metrics.py index 1e34af9..9e9446d 100644 --- a/common/lib/metrics.py +++ b/common/lib/metrics.py @@ -1,3 +1,4 @@ +import atexit import logging import os import socket @@ -94,12 +95,40 @@ def _init_otel_metrics(): metric_readers=[reader], ) - # OTel Python's metrics.set_meter_provider may have already been - # called by the parent process. Calling it again in this child - # process replaces the global with our fork-local provider, - # which is what we want here. - metrics.set_meter_provider(provider) - meter = metrics.get_meter(__name__) + # Bind ``meter`` to OUR provider directly, NOT via the global + # ``metrics.get_meter(...)`` API. + # + # Why: OTel Python's ``set_meter_provider()`` is single-call. If + # auto-instrumentation (``opentelemetry-instrumentation``) is also + # active in this process, it has typically registered its own + # MeterProvider before our lazy init runs. Subsequent + # ``set_meter_provider`` calls are silently ignored, and + # ``metrics.get_meter(__name__)`` returns the global proxy bound + # to the auto-instrumentation's provider — whose resource we don't + # control. The visible symptom: our user-provided resource + # attributes (``host.name``, ``service.instance.id``) never reach + # the metrics backend, so per-process series collapse onto one + # fingerprint at the storage layer. + # + # Binding directly via ``provider.get_meter(__name__)`` sidesteps + # the global entirely. Our ``meter`` exports through OUR provider's + # pipeline (with the correct resource); auto-instrumentation + # metrics continue to flow through their own pipeline. Both end up + # at the OTel collector independently. + meter = provider.get_meter(__name__) + + # Flush our provider's metric reader on process exit. The + # auto-instrumentation MeterProvider registers its own atexit + # hook against the global; ours doesn't (we never set it + # globally), so without this, ~5s of in-buffer metrics get + # dropped each time a worker exits cleanly (e.g. during a + # rolling deploy). Worker forks inherit the parent's atexit + # registrations; each child's own ``_init_otel_metrics`` then + # adds its OWN hook for its OWN provider. The inherited + # parent-provider hook is harmless in the child — the gRPC + # state it holds was copied at fork time and shutdown() on it + # is a no-op against a duplicated state. + atexit.register(provider.shutdown) # Clear cached instrument handles inherited from the parent — # they point at the parent's MeterProvider and exporter, neither diff --git a/common/tests/test_otel_fork_safe_init.py b/common/tests/test_otel_fork_safe_init.py index ad4b171..e27a9c5 100644 --- a/common/tests/test_otel_fork_safe_init.py +++ b/common/tests/test_otel_fork_safe_init.py @@ -32,15 +32,26 @@ def test_first_call_in_a_process_initializes(self): with patch("lib.metrics.OTLPMetricExporter") as mock_exp, \ patch("lib.metrics.MeterProvider") as mock_mp, \ patch("lib.metrics.metrics.set_meter_provider") as mock_set, \ - patch("lib.metrics.metrics.get_meter", return_value=MagicMock()), \ + patch("lib.metrics.atexit.register") as mock_atexit, \ patch.object(metrics, "OTEL_EXPORTER_OTLP_ENDPOINT", "http://fake:4317"): metrics._init_otel_metrics() - # MeterProvider was built, set globally, and we recorded the pid. + # MeterProvider was built and we recorded the pid. assert mock_mp.called - assert mock_set.called + # We must NOT touch the global meter provider — auto-instrumentation + # may have already set it, and ``set_meter_provider`` is single-call. + # See the comment in ``_init_otel_metrics`` for the rationale. + assert not mock_set.called + # Meter must come from OUR provider, not the global proxy. + provider_instance = mock_mp.return_value + assert provider_instance.get_meter.called + assert metrics.meter is provider_instance.get_meter.return_value assert metrics._otel_initialized_pid == os.getpid() + # Provider's shutdown is registered as an atexit hook so the + # PeriodicExportingMetricReader flushes its buffer on process + # exit — otherwise rolling deploys drop the last ~5s of metrics. + mock_atexit.assert_called_once_with(provider_instance.shutdown) def test_second_call_in_same_pid_is_a_no_op(self): from lib import metrics @@ -72,13 +83,14 @@ def test_different_pid_triggers_reinit(self): with patch("lib.metrics.OTLPMetricExporter") as mock_exp, \ patch("lib.metrics.MeterProvider") as mock_mp, \ patch("lib.metrics.metrics.set_meter_provider") as mock_set, \ - patch("lib.metrics.metrics.get_meter", return_value=MagicMock()), \ patch.object(metrics, "OTEL_EXPORTER_OTLP_ENDPOINT", "http://fake:4317"): metrics._init_otel_metrics() # Fresh provider built in this process. assert mock_mp.called + # Same single-call discipline — never touch the global. + assert not mock_set.called # Resource passed to MeterProvider contains a service.instance.id # built from host + this pid. kwargs = mock_mp.call_args.kwargs @@ -87,6 +99,10 @@ def test_different_pid_triggers_reinit(self): assert attrs["service.instance.id"] == f"{metrics.host_name}-pid-{os.getpid()}" assert attrs["host.name"] == metrics.host_name + # Meter is bound to the new provider, not the global. + provider_instance = mock_mp.return_value + assert metrics.meter is provider_instance.get_meter.return_value + # Instrument caches inherited from the "parent" were cleared so # they get rebuilt against the new MeterProvider. assert metrics.counter_metrics == {}