From d679a4c6b7950729935f27949db34038f276f6d5 Mon Sep 17 00:00:00 2001
From: Pavan Kumar <pavanputhra@gmail.com>
Date: Thu, 21 May 2026 01:01:56 +0530
Subject: [PATCH] fix(metrics): bind meter to our provider directly, bypass
 single-call global
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to vcon-dev/vcon-server#178. The fork-safe init landed, but
SignOz queries still showed only one worker's value (peak inflight 128
across 4 worker processes instead of the expected ~512). Root cause:
the user-provided OTel resource (``host.name``, ``service.instance.id``)
never reached the metrics backend.

OpenTelemetry Python's ``metrics.set_meter_provider()`` is single-call.
When ``opentelemetry-instrumentation`` (auto-instrumentation) is active
in the process, it registers a default MeterProvider early in startup —
before this module's lazy ``_init_otel_metrics()`` runs. Our subsequent
``set_meter_provider(our_provider)`` call is silently ignored, and
``metrics.get_meter(__name__)`` returns the global proxy bound to the
auto-instrumentation's provider — whose resource we don't control.

The visible CH symptom: ``resource_attrs`` on every conserver.*
metric only contains ``service.name`` + ``telemetry.sdk.*`` +
``telemetry.auto.version``; ``host.name`` and ``service.instance.id``
that this module sets are dropped. All four worker processes collapse
onto a single fingerprint, last-write-wins.

Fix: bind ``meter`` to OUR provider directly via
``provider.get_meter(__name__)``, never touching the global. Our
provider has the correct resource and its own export pipeline to
the OTel collector. Auto-instrumentation metrics continue to flow
through their own pipeline independently — both arrive at the
collector.

Tests updated to assert ``set_meter_provider`` is NOT called and the
``meter`` global is bound to the provider's own meter, not the global
proxy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 common/lib/metrics.py                    | 41 ++++++++++++++++++++----
 common/tests/test_otel_fork_safe_init.py | 24 +++++++++++---
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/common/lib/metrics.py b/common/lib/metrics.py
index 1e34af9..9e9446d 100644
--- a/common/lib/metrics.py
+++ b/common/lib/metrics.py
@@ -1,3 +1,4 @@
+import atexit
 import logging
 import os
 import socket
@@ -94,12 +95,40 @@ def _init_otel_metrics():
             metric_readers=[reader],
         )
 
-        # OTel Python's metrics.set_meter_provider may have already been
-        # called by the parent process. Calling it again in this child
-        # process replaces the global with our fork-local provider,
-        # which is what we want here.
-        metrics.set_meter_provider(provider)
-        meter = metrics.get_meter(__name__)
+        # Bind ``meter`` to OUR provider directly, NOT via the global
+        # ``metrics.get_meter(...)`` API.
+        #
+        # Why: OTel Python's ``set_meter_provider()`` is single-call. If
+        # auto-instrumentation (``opentelemetry-instrumentation``) is also
+        # active in this process, it has typically registered its own
+        # MeterProvider before our lazy init runs. Subsequent
+        # ``set_meter_provider`` calls are silently ignored, and
+        # ``metrics.get_meter(__name__)`` returns the global proxy bound
+        # to the auto-instrumentation's provider — whose resource we don't
+        # control. The visible symptom: our user-provided resource
+        # attributes (``host.name``, ``service.instance.id``) never reach
+        # the metrics backend, so per-process series collapse onto one
+        # fingerprint at the storage layer.
+        #
+        # Binding directly via ``provider.get_meter(__name__)`` sidesteps
+        # the global entirely. Our ``meter`` exports through OUR provider's
+        # pipeline (with the correct resource); auto-instrumentation
+        # metrics continue to flow through their own pipeline. Both end up
+        # at the OTel collector independently.
+        meter = provider.get_meter(__name__)
+
+        # Flush our provider's metric reader on process exit. The
+        # auto-instrumentation MeterProvider registers its own atexit
+        # hook against the global; ours doesn't (we never set it
+        # globally), so without this, ~5s of in-buffer metrics get
+        # dropped each time a worker exits cleanly (e.g. during a
+        # rolling deploy). Worker forks inherit the parent's atexit
+        # registrations; each child's own ``_init_otel_metrics`` then
+        # adds its OWN hook for its OWN provider. The inherited
+        # parent-provider hook is harmless in the child — the gRPC
+        # state it holds was copied at fork time and shutdown() on it
+        # is a no-op against a duplicated state.
+        atexit.register(provider.shutdown)
 
         # Clear cached instrument handles inherited from the parent —
         # they point at the parent's MeterProvider and exporter, neither
diff --git a/common/tests/test_otel_fork_safe_init.py b/common/tests/test_otel_fork_safe_init.py
index ad4b171..e27a9c5 100644
--- a/common/tests/test_otel_fork_safe_init.py
+++ b/common/tests/test_otel_fork_safe_init.py
@@ -32,15 +32,26 @@ def test_first_call_in_a_process_initializes(self):
         with patch("lib.metrics.OTLPMetricExporter") as mock_exp, \
              patch("lib.metrics.MeterProvider") as mock_mp, \
              patch("lib.metrics.metrics.set_meter_provider") as mock_set, \
-             patch("lib.metrics.metrics.get_meter", return_value=MagicMock()), \
+             patch("lib.metrics.atexit.register") as mock_atexit, \
              patch.object(metrics, "OTEL_EXPORTER_OTLP_ENDPOINT", "http://fake:4317"):
 
             metrics._init_otel_metrics()
 
-        # MeterProvider was built, set globally, and we recorded the pid.
+        # MeterProvider was built and we recorded the pid.
         assert mock_mp.called
-        assert mock_set.called
+        # We must NOT touch the global meter provider — auto-instrumentation
+        # may have already set it, and ``set_meter_provider`` is single-call.
+        # See the comment in ``_init_otel_metrics`` for the rationale.
+        assert not mock_set.called
+        # Meter must come from OUR provider, not the global proxy.
+        provider_instance = mock_mp.return_value
+        assert provider_instance.get_meter.called
+        assert metrics.meter is provider_instance.get_meter.return_value
         assert metrics._otel_initialized_pid == os.getpid()
+        # Provider's shutdown is registered as an atexit hook so the
+        # PeriodicExportingMetricReader flushes its buffer on process
+        # exit — otherwise rolling deploys drop the last ~5s of metrics.
+        mock_atexit.assert_called_once_with(provider_instance.shutdown)
 
     def test_second_call_in_same_pid_is_a_no_op(self):
         from lib import metrics
@@ -72,13 +83,14 @@ def test_different_pid_triggers_reinit(self):
         with patch("lib.metrics.OTLPMetricExporter") as mock_exp, \
              patch("lib.metrics.MeterProvider") as mock_mp, \
              patch("lib.metrics.metrics.set_meter_provider") as mock_set, \
-             patch("lib.metrics.metrics.get_meter", return_value=MagicMock()), \
              patch.object(metrics, "OTEL_EXPORTER_OTLP_ENDPOINT", "http://fake:4317"):
 
             metrics._init_otel_metrics()
 
         # Fresh provider built in this process.
         assert mock_mp.called
+        # Same single-call discipline — never touch the global.
+        assert not mock_set.called
         # Resource passed to MeterProvider contains a service.instance.id
         # built from host + this pid.
         kwargs = mock_mp.call_args.kwargs
@@ -87,6 +99,10 @@ def test_different_pid_triggers_reinit(self):
         assert attrs["service.instance.id"] == f"{metrics.host_name}-pid-{os.getpid()}"
         assert attrs["host.name"] == metrics.host_name
 
+        # Meter is bound to the new provider, not the global.
+        provider_instance = mock_mp.return_value
+        assert metrics.meter is provider_instance.get_meter.return_value
+
         # Instrument caches inherited from the "parent" were cleared so
         # they get rebuilt against the new MeterProvider.
         assert metrics.counter_metrics == {}