diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 000000000..eb3940d40 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,51 @@ +# Datadog APM instrumentation for Agentex agents + +## Use exactly one ddtrace auto-instrumentation entry point per process + +The SGP platform enables Datadog **single-step instrumentation (SSI)** on every +agentex pod (`admission.datadoghq.com/enabled: "true"` in the sgp helm charts). +SSI injects ddtrace and auto-instruments FastAPI for you. Agent code and +deployment config must therefore **not add a second entry point**: + +- Do **not** call `ddtrace.patch_all()` or `import ddtrace.auto` in your agent. +- Do **not** wrap the launch command with `ddtrace-run` (the ACP server is started + as a plain `uvicorn project.acp:acp`). + +Two entry points layer two ddtrace copies whose de-dupe guards don't cross-recognize +each other, so ddtrace wraps the route handler twice. + +**Local / non-SSI environments:** without the platform admission controller there is +no SSI, so FastAPI is not auto-instrumented. If you want APM locally, run the server +under `ddtrace-run` — but only in that environment, and never alongside SSI. + +**Pin your tracing libraries.** Floor-only pins (`ddtrace>=...`, `fastapi>=...`, +`starlette>=...`) let each image rebuild pull whatever is latest, which can drift away +from the platform-injected ddtrace version and is a latent source of cross-copy +instrumentation mismatches. Pin ddtrace (and fastapi/starlette) to known-good versions. + +## Symptom of a double-instrumentation regression + +ddtrace's `resource_name` / `http.route` doubles the router prefix — +`POST /api/api`, `GET /healthz/healthz` — coexisting with the correct single-prefix +names. The **served URL is unaffected** (`http.url`, `http.path_group`, +`http.inferred_route` stay single), so traffic works but monitors and dashboards +filtered on `resource_name:post_/api` silently miss the doubled fraction. + +The ACP server logs a `WARNING` at startup when it detects more than one ddtrace +wrap on its route handler (`_warn_if_double_instrumented` in `base_acp_server.py`). +The check is best-effort — it reads ddtrace/wrapt internals and silently no-ops if +those shift, so treat it as a heads-up, not a hard gate. If you see it, remove the +in-app `patch_all()` / `ddtrace-run` and rely on SSI. + +## Emitting custom spans + +You don't need `patch_all()` to add your own spans. With SSI active, +`ddtrace.tracer` is the injected tracer — `with tracer.trace(...)` and +`span.set_tag(...)` work as-is. For Scale Groundplane spans use `adk.tracing`. + +## Restoring monitor coverage after a doubling incident + +`http.path_group` and `http.inferred_route` are derived from the request URL, not +from the doubled tracer resource, so they stay single. Monitors and dashboards +should filter on `@http.path_group:/api` (or `@http.inferred_route:/api`) rather +than `resource_name:post_/api` — this is robust whether or not the doubling recurs. diff --git a/src/agentex/lib/sdk/fastacp/base/base_acp_server.py b/src/agentex/lib/sdk/fastacp/base/base_acp_server.py index b0b1c3685..837e20273 100644 --- a/src/agentex/lib/sdk/fastacp/base/base_acp_server.py +++ b/src/agentex/lib/sdk/fastacp/base/base_acp_server.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys import uuid import asyncio import inspect @@ -44,6 +45,55 @@ task_message_update_adapter = TypeAdapter(TaskMessageUpdate) +def _count_ddtrace_route_handler_wraps() -> int: + """Best-effort count of ddtrace ``traced_handler`` wrappers on the route handler. + >1 means double-instrumentation; 0 if ddtrace is not active. Never raises.""" + if "ddtrace" not in sys.modules: + return 0 + try: + import fastapi.routing + import starlette.routing + + def walk(handle: Any) -> int: + count, seen, obj = 0, set(), handle + while obj is not None and id(obj) not in seen: + seen.add(id(obj)) + wrapper = getattr(obj, "_self_wrapper", None) + if ( + wrapper is not None + and "ddtrace" in (getattr(wrapper, "__module__", "") or "") + and "traced_handler" in (getattr(wrapper, "__name__", "") or "") + ): + count += 1 + obj = getattr(obj, "__wrapped__", None) + return count + + return max( + walk(fastapi.routing.APIRoute.handle), + walk(starlette.routing.Route.handle), + ) + except Exception: + return 0 + + +def _warn_if_double_instrumented() -> None: + """Warn at startup if ddtrace wrapped the ACP route handler more than once.""" + wraps = _count_ddtrace_route_handler_wraps() + if wraps > 1: + logger.warning( + "ddtrace is instrumenting the ACP route handler %d times (expected 1). This " + "doubles the APM resource_name/http.route (e.g. 'POST /api/api', 'GET " + "/healthz/healthz') while the served path stays single, so monitors and " + "dashboards filtered on the single-prefix resource_name silently miss the " + "doubled traffic. Cause: more than one ddtrace auto-instrumentation entry " + "point in this process — typically a manual ddtrace.patch_all()/ddtrace-run in " + "agent code alongside Datadog single-step instrumentation " + "(admission.datadoghq.com/enabled). Use exactly one: when the platform injects " + "single-step instrumentation, remove the in-app patch_all()/ddtrace-run.", + wraps, + ) + + class RequestIDMiddleware: """Pure ASGI middleware to set request IDs without buffering streaming responses.""" @@ -102,6 +152,7 @@ def _setup_handlers(self): def get_lifespan_function(self): @asynccontextmanager async def lifespan_context(app: FastAPI): # noqa: ARG001 + _warn_if_double_instrumented() env_vars = EnvironmentVariables.refresh() if env_vars.AGENTEX_BASE_URL: await register_agent(env_vars, agent_card=self._agent_card)