From 926598e76249de49a089824d8f95fe0b9e00abbc Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Wed, 25 Mar 2026 16:48:36 -0500
Subject: [PATCH 1/6] feat!: split track_metrics_of into sync and async
 (track_metrics_of_async) variants

feat: add optional graph_key to all LDAIConfigTracker track_* methods for graph correlation
feat: add track_tool_call/track_tool_calls to LDAIConfigTracker
feat: add graph_key property to AIGraphTracker
feat: make AIGraphTracker.track_total_tokens accept Optional[TokenUsage], skip when None or total <= 0
feat: add LangChainHelper.get_tool_calls_from_response and sum_token_usage_from_messages
feat: extract OpenAIHelper.get_ai_usage_from_response; delegate get_ai_metrics_from_response to it
refactor: remove node-scoped methods from AIGraphTracker (track_node_invocation, track_tool_call, track_node_judge_response)
refactor: use time.time_ns() for sub-millisecond precision in duration calculations
---
 .../src/ldai_langchain/langchain_helper.py    |  38 +++
 packages/sdk/server-ai/README.md              |   6 +-
 .../sdk/server-ai/src/ldai/judge/__init__.py  |   2 +-
 .../sdk/server-ai/src/ldai/managed_model.py   |   2 +-
 .../src/ldai/providers/runner_factory.py      |   2 +-
 packages/sdk/server-ai/src/ldai/tracker.py    | 259 ++++++++++--------
 packages/sdk/server-ai/tests/test_judge.py    |  16 +-
 packages/sdk/server-ai/tests/test_tracker.py  | 153 ++++++++++-
 8 files changed, 346 insertions(+), 132 deletions(-)

diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
index 5061a1b..35eb396 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
@@ -115,3 +115,41 @@ def get_ai_metrics_from_response(response: Any) -> LDAIMetrics:
     :return: LDAIMetrics with success status and token usage
     """
     return LDAIMetrics(success=True, usage=get_ai_usage_from_response(response))
+
+
+def get_tool_calls_from_response(response: Any) -> List[str]:
+    """
+    Collect the names of the tool calls carried by a LangChain response.
+
+    Entries whose ``name`` is missing or falsy are skipped; the rest are
+    converted with ``str`` and returned in their original order.
+
+    :param response: The response from the LangChain model
+    :return: List of tool names in order, or empty list if none
+    """
+    tool_calls = getattr(response, 'tool_calls', None)
+    if not tool_calls:
+        return []
+    return [str(name) for name in (call.get('name') for call in tool_calls) if name]
+
+
+def sum_token_usage_from_messages(messages: List[Any]) -> Optional[TokenUsage]:
+    """
+    Aggregate the token usage reported by each message in ``messages``.
+
+    Usage is read with ``get_ai_usage_from_response``; messages for which it
+    returns ``None`` contribute nothing to the totals.
+
+    :param messages: List of message objects (e.g. from a graph state)
+    :return: Aggregated TokenUsage, or None when every summed counter is zero
+    """
+    totals = {"total": 0, "input": 0, "output": 0}
+    for message in messages:
+        usage = get_ai_usage_from_response(message)
+        if usage is not None:
+            totals["input"] += usage.input
+            totals["output"] += usage.output
+            totals["total"] += usage.total
+    if not any(totals.values()):
+        return None
+    return TokenUsage(**totals)
diff --git a/packages/sdk/server-ai/README.md b/packages/sdk/server-ai/README.md
index 3aa7bae..3720dc6 100644
--- a/packages/sdk/server-ai/README.md
+++ b/packages/sdk/server-ai/README.md
@@ -150,8 +150,8 @@ async def main():
     # Create LangChain model from configuration
     llm = await LangChainProvider.create_langchain_model(ai_config)
     
-    # Use with tracking
-    response = await ai_config.tracker.track_metrics_of(
+    # Use with tracking (sync invoke)
+    response = ai_config.tracker.track_metrics_of(
         lambda: llm.invoke(messages),
         lambda result: LangChainProvider.get_ai_metrics_from_response(result)
     )
@@ -190,7 +190,7 @@ async def main():
             temperature=ai_config.model.get_parameter('temperature') if ai_config.model else 0.5,
         )
     
-    result = await ai_config.tracker.track_metrics_of(
+    result = await ai_config.tracker.track_metrics_of_async(
         call_custom_provider,
         map_custom_provider_metrics
     )
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index b364fb0..a842db6 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -71,7 +71,7 @@ async def evaluate(
             messages = self._construct_evaluation_messages(input_text, output_text)
             assert self._evaluation_response_structure is not None
 
-            response = await self._ai_config_tracker.track_metrics_of(
+            response = await self._ai_config_tracker.track_metrics_of_async(
                 lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
                 lambda result: result.metrics,
             )
diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py
index 28bab2f..c1ef021 100644
--- a/packages/sdk/server-ai/src/ldai/managed_model.py
+++ b/packages/sdk/server-ai/src/ldai/managed_model.py
@@ -48,7 +48,7 @@ async def invoke(self, prompt: str) -> ModelResponse:
         config_messages = self._ai_config.messages or []
         all_messages = config_messages + self._messages
 
-        response = await self._tracker.track_metrics_of(
+        response = await self._tracker.track_metrics_of_async(
             lambda: self._model_runner.invoke_model(all_messages),
             lambda result: result.metrics,
         )
diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py
index 3612bac..190704e 100644
--- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py
+++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py
@@ -77,7 +77,7 @@ def _with_fallback(
                     continue
                 result = fn(provider_factory)
                 if result is not None:
-                    log.debug(f"Successfully created capability using provider '{provider_type}'")
+                    log.debug(f"Successfully invoked create function with provider '{provider_type}'")
                     return result
             except Exception as exc:
                 log.warning(f"Provider '{provider_type}' failed: {exc}")
diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py
index d4f0912..e50f0e4 100644
--- a/packages/sdk/server-ai/src/ldai/tracker.py
+++ b/packages/sdk/server-ai/src/ldai/tracker.py
@@ -1,7 +1,7 @@
 import time
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional
 
 from ldclient import Context, LDClient
 
@@ -98,46 +98,54 @@ def __init__(
         self._context = context
         self._summary = LDAIMetricSummary()
 
-    def __get_track_data(self):
+    def __get_track_data(self, graph_key: Optional[str] = None) -> dict:
         """
         Get tracking data for events.
 
+        :param graph_key: When set, include ``graphKey`` in the payload.
         :return: Dictionary containing variation and config keys.
         """
-        return {
+        data = {
             "variationKey": self._variation_key,
             "configKey": self._config_key,
             "version": self._version,
             "modelName": self._model_name,
             "providerName": self._provider_name,
         }
+        if graph_key is not None:
+            return {**data, "graphKey": graph_key}
+        return data
 
-    def track_duration(self, duration: int) -> None:
+    def track_duration(self, duration: int, *, graph_key: Optional[str] = None) -> None:
         """
         Manually track the duration of an AI operation.
 
         :param duration: Duration in milliseconds.
+        :param graph_key: When set, include ``graphKey`` in the event payload (e.g. config-level metrics inside a graph).
         """
         self._summary._duration = duration
         self._ld_client.track(
-            "$ld:ai:duration:total", self._context, self.__get_track_data(), duration
+            "$ld:ai:duration:total", self._context, self.__get_track_data(graph_key), duration
         )
 
-    def track_time_to_first_token(self, time_to_first_token: int) -> None:
+    def track_time_to_first_token(
+        self, time_to_first_token: int, *, graph_key: Optional[str] = None
+    ) -> None:
         """
         Manually track the time to first token of an AI operation.
 
         :param time_to_first_token: Time to first token in milliseconds.
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         self._summary._time_to_first_token = time_to_first_token
         self._ld_client.track(
             "$ld:ai:tokens:ttf",
             self._context,
-            self.__get_track_data(),
+            self.__get_track_data(graph_key),
             time_to_first_token,
         )
 
-    def track_duration_of(self, func):
+    def track_duration_of(self, func, *, graph_key: Optional[str] = None):
         """
         Automatically track the duration of an AI operation.
 
@@ -145,21 +153,43 @@ def track_duration_of(self, func):
         track the duration. The exception will be re-thrown.
 
         :param func: Function to track (synchronous only).
+        :param graph_key: When set, passed through to :meth:`track_duration`.
         :return: Result of the tracked function.
         """
-        start_time = time.time()
+        start_ns = time.time_ns()
         try:
             result = func()
         finally:
-            end_time = time.time()
-            duration = int((end_time - start_time) * 1000)  # duration in milliseconds
-            self.track_duration(duration)
+            duration = (time.time_ns() - start_ns) // 1_000_000  # duration in milliseconds
+            self.track_duration(duration, graph_key=graph_key)
 
         return result
 
-    async def track_metrics_of(self, func, metrics_extractor):
+    def _track_from_metrics_extractor(
+        self,
+        result: Any,
+        metrics_extractor: Callable[[Any], Any],
+        *,
+        graph_key: Optional[str] = None,
+    ) -> Any:
+        """Track success/error and token usage from ``metrics_extractor(result)``; return ``result``."""
+        extracted = metrics_extractor(result)
+        report_outcome = self.track_success if extracted.success else self.track_error
+        report_outcome(graph_key=graph_key)
+        token_usage = extracted.usage
+        if token_usage:
+            self.track_tokens(token_usage, graph_key=graph_key)
+        return result
+
+    def track_metrics_of(
+        self,
+        func: Callable[[], Any],
+        metrics_extractor: Callable[[Any], Any],
+        *,
+        graph_key: Optional[str] = None,
+    ) -> Any:
         """
-        Track metrics for a generic AI operation.
+        Track metrics for a synchronous AI operation.
 
         This function will track the duration of the operation, extract metrics using the provided
         metrics extractor function, and track success or error status accordingly.
@@ -168,47 +198,59 @@ async def track_metrics_of(self, func, metrics_extractor):
         In the case the provided function throws, this function will record the duration and an error.
         A failed operation will not have any token usage data.
 
-        :param func: Async function which executes the operation
+        For async operations, use :meth:`track_metrics_of_async`.
+
+        :param func: Synchronous callable that runs the operation
         :param metrics_extractor: Function that extracts LDAIMetrics from the operation result
+        :param graph_key: When set, include ``graphKey`` on emitted config-level events.
         :return: The result of the operation
         """
-        start_time = time.time()
-        result = None
+        start_ns = time.time_ns()
         try:
-            result = await func()
+            result = func()
         except Exception as err:
-            end_time = time.time()
-            duration = int((end_time - start_time) * 1000)
-            self.track_duration(duration)
-            self.track_error()
+            duration = (time.time_ns() - start_ns) // 1_000_000
+            self.track_duration(duration, graph_key=graph_key)
+            self.track_error(graph_key=graph_key)
             raise err
 
-        # Track duration after successful call
-        end_time = time.time()
-        duration = int((end_time - start_time) * 1000)
-        self.track_duration(duration)
+        duration = (time.time_ns() - start_ns) // 1_000_000
+        self.track_duration(duration, graph_key=graph_key)
+        return self._track_from_metrics_extractor(result, metrics_extractor, graph_key=graph_key)
 
-        # Extract metrics after successful AI call
-        from ldai.providers.types import LDAIMetrics
-        metrics = metrics_extractor(result)
+    async def track_metrics_of_async(
+        self, func, metrics_extractor, *, graph_key: Optional[str] = None
+    ):
+        """
+        Track metrics for an async AI operation (``func`` is awaited).
 
-        # Track success/error based on metrics
-        if metrics.success:
-            self.track_success()
-        else:
-            self.track_error()
+        Same event semantics as :meth:`track_metrics_of`.
 
-        # Track token usage if available
-        if metrics.usage:
-            self.track_tokens(metrics.usage)
+        :param func: Async callable or zero-arg callable that returns an awaitable when called
+        :param metrics_extractor: Function that extracts LDAIMetrics from the operation result
+        :param graph_key: When set, include ``graphKey`` on emitted config-level events.
+        :return: The result of the operation
+        """
+        start_ns = time.time_ns()
+        result = None
+        try:
+            result = await func()
+        except Exception as err:
+            duration = (time.time_ns() - start_ns) // 1_000_000
+            self.track_duration(duration, graph_key=graph_key)
+            self.track_error(graph_key=graph_key)
+            raise err
 
-        return result
+        duration = (time.time_ns() - start_ns) // 1_000_000
+        self.track_duration(duration, graph_key=graph_key)
+        return self._track_from_metrics_extractor(result, metrics_extractor, graph_key=graph_key)
 
-    def track_eval_scores(self, scores: Dict[str, Any]) -> None:
+    def track_eval_scores(self, scores: Dict[str, Any], *, graph_key: Optional[str] = None) -> None:
         """
         Track evaluation scores for multiple metrics.
 
         :param scores: Dictionary mapping metric keys to their evaluation scores (EvalScore objects)
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         from ldai.providers.types import EvalScore
 
@@ -218,22 +260,23 @@ def track_eval_scores(self, scores: Dict[str, Any]) -> None:
                 self._ld_client.track(
                     metric_key,
                     self._context,
-                    self.__get_track_data(),
+                    self.__get_track_data(graph_key=graph_key),
                     eval_score.score
                 )
 
-    def track_judge_response(self, judge_response: Any) -> None:
+    def track_judge_response(self, judge_response: Any, *, graph_key: Optional[str] = None) -> None:
         """
         Track a judge response, including evaluation scores with judge config key.
 
         :param judge_response: JudgeResponse object containing evals and success status
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         from ldai.providers.types import EvalScore, JudgeResponse
 
         if isinstance(judge_response, JudgeResponse):
             # Track evaluation scores with judge config key included in metadata
             if judge_response.evals:
-                track_data = self.__get_track_data()
+                track_data = self.__get_track_data(graph_key=graph_key)
                 if judge_response.judge_config_key:
                     track_data = {**track_data, 'judgeConfigKey': judge_response.judge_config_key}
 
@@ -246,44 +289,49 @@ def track_judge_response(self, judge_response: Any) -> None:
                             eval_score.score
                         )
 
-    def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None:
+    def track_feedback(self, feedback: Dict[str, FeedbackKind], *, graph_key: Optional[str] = None) -> None:
         """
         Track user feedback for an AI operation.
 
         :param feedback: Dictionary containing feedback kind.
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         self._summary._feedback = feedback
         if feedback["kind"] == FeedbackKind.Positive:
             self._ld_client.track(
                 "$ld:ai:feedback:user:positive",
                 self._context,
-                self.__get_track_data(),
+                self.__get_track_data(graph_key=graph_key),
                 1,
             )
         elif feedback["kind"] == FeedbackKind.Negative:
             self._ld_client.track(
                 "$ld:ai:feedback:user:negative",
                 self._context,
-                self.__get_track_data(),
+                self.__get_track_data(graph_key=graph_key),
                 1,
             )
 
-    def track_success(self) -> None:
+    def track_success(self, *, graph_key: Optional[str] = None) -> None:
         """
         Track a successful AI generation.
+
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         self._summary._success = True
         self._ld_client.track(
-            "$ld:ai:generation:success", self._context, self.__get_track_data(), 1
+            "$ld:ai:generation:success", self._context, self.__get_track_data(graph_key=graph_key), 1
         )
 
-    def track_error(self) -> None:
+    def track_error(self, *, graph_key: Optional[str] = None) -> None:
         """
         Track an unsuccessful AI generation attempt.
+
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         self._summary._success = False
         self._ld_client.track(
-            "$ld:ai:generation:error", self._context, self.__get_track_data(), 1
+            "$ld:ai:generation:error", self._context, self.__get_track_data(graph_key=graph_key), 1
         )
 
     def track_openai_metrics(self, func):
@@ -303,18 +351,16 @@ def track_openai_metrics(self, func):
         :param func: Function to track.
         :return: Result of the tracked function.
         """
-        start_time = time.time()
+        start_ns = time.time_ns()
         try:
             result = func()
-            end_time = time.time()
-            duration = int((end_time - start_time) * 1000)
+            duration = (time.time_ns() - start_ns) // 1_000_000
             self.track_duration(duration)
             self.track_success()
             if hasattr(result, "usage") and hasattr(result.usage, "to_dict"):
                 self.track_tokens(_openai_to_token_usage(result.usage.to_dict()))
         except Exception:
-            end_time = time.time()
-            duration = int((end_time - start_time) * 1000)
+            duration = (time.time_ns() - start_ns) // 1_000_000
             self.track_duration(duration)
             self.track_error()
             raise
@@ -343,35 +389,64 @@ def track_bedrock_converse_metrics(self, res: dict) -> dict:
             self.track_tokens(_bedrock_to_token_usage(res["usage"]))
         return res
 
-    def track_tokens(self, tokens: TokenUsage) -> None:
+    def track_tokens(self, tokens: TokenUsage, *, graph_key: Optional[str] = None) -> None:
         """
         Track token usage metrics.
 
         :param tokens: Token usage data from either custom, OpenAI, or Bedrock sources.
+        :param graph_key: When set, include ``graphKey`` in the event payload.
         """
         self._summary._usage = tokens
+        td = self.__get_track_data(graph_key=graph_key)
         if tokens.total > 0:
             self._ld_client.track(
                 "$ld:ai:tokens:total",
                 self._context,
-                self.__get_track_data(),
+                td,
                 tokens.total,
             )
         if tokens.input > 0:
             self._ld_client.track(
                 "$ld:ai:tokens:input",
                 self._context,
-                self.__get_track_data(),
+                td,
                 tokens.input,
             )
         if tokens.output > 0:
             self._ld_client.track(
                 "$ld:ai:tokens:output",
                 self._context,
-                self.__get_track_data(),
+                td,
                 tokens.output,
             )
 
+    def track_tool_call(self, tool_key: str, *, graph_key: Optional[str] = None) -> None:
+        """
+        Emit one ``$ld:ai:tool_call`` event for a tool invoked under this configuration.
+
+        The payload is the track data for ``graph_key`` plus ``toolKey``; the metric value is 1.
+
+        :param tool_key: Identifier of the tool that was invoked.
+        :param graph_key: When set, include ``graphKey`` in the event payload.
+        """
+        base_payload = self.__get_track_data(graph_key=graph_key)
+        payload = dict(base_payload, toolKey=tool_key)
+        self._ld_client.track(
+            "$ld:ai:tool_call", self._context, payload, 1
+        )
+
+    def track_tool_calls(self, tool_keys: Iterable[str], *, graph_key: Optional[str] = None) -> None:
+        """
+        Track multiple tool invocations for this configuration, one event per key.
+
+        :param tool_keys: Tool identifiers (e.g. from a model response); a bare ``str`` is one key.
+        :param graph_key: When set, include ``graphKey`` on each event.
+        """
+        # A str is itself an Iterable[str]; wrap it so it is not tracked character by character.
+        keys = [tool_keys] if isinstance(tool_keys, str) else tool_keys
+        for tool_key in keys:
+            self.track_tool_call(tool_key, graph_key=graph_key)
+
     def get_summary(self) -> LDAIMetricSummary:
         """
         Get the current summary of AI metrics.
@@ -437,6 +512,11 @@ def __init__(
         self._version = version
         self._context = context
 
+    @property
+    def graph_key(self) -> str:
+        """Read-only access to the ``_graph_key`` value this tracker holds for tracking payloads."""
+        return self._graph_key
+
     def __get_track_data(self):
         """
         Get tracking data for events.
@@ -485,12 +565,14 @@ def track_latency(self, duration: int) -> None:
             duration,
         )
 
-    def track_total_tokens(self, tokens: TokenUsage) -> None:
+    def track_total_tokens(self, tokens: Optional[TokenUsage] = None) -> None:
         """
         Track aggregated token usage across the entire graph invocation.
 
-        :param tokens: Token usage data.
+        :param tokens: Token usage data, or ``None`` when usage is unknown.
         """
+        if tokens is None or tokens.total <= 0:
+            return
         self._ld_client.track(
             "$ld:ai:graph:total_tokens",
             self._context,
@@ -535,63 +617,6 @@ def track_judge_response(self, response: Any) -> None:
                             eval_score.score,
                         )
 
-    def track_node_invocation(self, config_key: str) -> None:
-        """
-        Track when a node is invoked during graph execution.
-
-        :param config_key: The configuration key of the node being invoked.
-        """
-        track_data = {**self.__get_track_data(), "configKey": config_key}
-        self._ld_client.track(
-            "$ld:ai:graph:node_invocation",
-            self._context,
-            track_data,
-            1,
-        )
-
-    def track_tool_call(self, config_key: str, tool_key: str) -> None:
-        """
-        Track tool calls made by nodes during graph execution.
-
-        :param config_key: The configuration key of the node making the tool call.
-        :param tool_key: The key of the tool being called.
-        """
-        track_data = {
-            **self.__get_track_data(),
-            "configKey": config_key,
-            "toolKey": tool_key,
-        }
-        self._ld_client.track(
-            "$ld:ai:graph:tool_call",
-            self._context,
-            track_data,
-            1,
-        )
-
-    def track_node_judge_response(self, config_key: str, response: Any) -> None:
-        """
-        Track judge responses for a specific node.
-
-        :param config_key: The configuration key of the node being evaluated.
-        :param response: JudgeResponse object containing evals and success status.
-        """
-        from ldai.providers.types import EvalScore, JudgeResponse
-
-        if isinstance(response, JudgeResponse):
-            if response.evals:
-                track_data = {**self.__get_track_data(), "configKey": config_key}
-                if response.judge_config_key:
-                    track_data = {**track_data, "judgeConfigKey": response.judge_config_key}
-
-                for metric_key, eval_score in response.evals.items():
-                    if isinstance(eval_score, EvalScore):
-                        self._ld_client.track(
-                            metric_key,
-                            self._context,
-                            track_data,
-                            eval_score.score,
-                        )
-
     def track_redirect(self, source_key: str, redirected_target: str) -> None:
         """
         Track when a node redirects to a different target than originally specified.
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
index 8326b72..e61ac4a 100644
--- a/packages/sdk/server-ai/tests/test_judge.py
+++ b/packages/sdk/server-ai/tests/test_judge.py
@@ -156,7 +156,7 @@ async def test_evaluate_success_with_valid_response(
         )
         
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -183,7 +183,7 @@ async def test_evaluate_success_with_evaluation_response_shape(
             metrics=LDAIMetrics(success=True),
         )
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
 
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         result = await judge.evaluate("What is feature flagging?", "Feature flagging is...")
@@ -206,7 +206,7 @@ async def test_evaluate_handles_missing_evaluation_in_response(
         )
         
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -231,7 +231,7 @@ async def test_evaluate_handles_invalid_score(
         )
         
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -253,7 +253,7 @@ async def test_evaluate_handles_missing_reasoning(
         )
         
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -269,7 +269,7 @@ async def test_evaluate_handles_exception(
     ):
         """Evaluate should handle exceptions gracefully."""
         mock_runner.invoke_structured_model.side_effect = Exception("Provider error")
-        tracker.track_metrics_of = AsyncMock(side_effect=Exception("Provider error"))
+        tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error"))
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -311,7 +311,7 @@ async def test_evaluate_messages_calls_evaluate(
         )
         
         mock_runner.invoke_structured_model.return_value = mock_response
-        tracker.track_metrics_of = AsyncMock(return_value=mock_response)
+        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
         
         judge = Judge(judge_config_with_key, tracker, mock_runner)
         
@@ -328,7 +328,7 @@ async def test_evaluate_messages_calls_evaluate(
         
         assert result is not None
         assert result.success is True
-        assert tracker.track_metrics_of.called
+        assert tracker.track_metrics_of_async.called
 
 
 class TestEvaluationSchemaBuilder:
diff --git a/packages/sdk/server-ai/tests/test_tracker.py b/packages/sdk/server-ai/tests/test_tracker.py
index 57f13fd..5fea61a 100644
--- a/packages/sdk/server-ai/tests/test_tracker.py
+++ b/packages/sdk/server-ai/tests/test_tracker.py
@@ -5,7 +5,8 @@
 from ldclient import Config, Context, LDClient
 from ldclient.integrations.test_data import TestData
 
-from ldai.tracker import FeedbackKind, LDAIConfigTracker, TokenUsage
+from ldai.providers.types import LDAIMetrics
+from ldai.tracker import AIGraphTracker, FeedbackKind, LDAIConfigTracker, TokenUsage
 
 
 @pytest.fixture
@@ -440,3 +441,153 @@ def test_error_overwrites_success(client: LDClient):
     client.track.assert_has_calls(calls)  # type: ignore
 
     assert tracker.get_summary().success is False
+
+
+def _base_td() -> dict:
+    return {
+        "variationKey": "variation-key",
+        "configKey": "config-key",
+        "version": 3,
+        "modelName": "fakeModel",
+        "providerName": "fakeProvider",
+    }
+
+
+def test_config_tracker_includes_graph_key_when_provided(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    expected = {**_base_td(), "graphKey": "my-graph"}
+    tracker.track_success(graph_key="my-graph")
+    client.track.assert_called_with("$ld:ai:generation:success", context, expected, 1)  # type: ignore
+
+
+def test_config_tracker_track_tokens_with_graph_key(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    tokens = TokenUsage(10, 4, 6)
+    expected = {**_base_td(), "graphKey": "g1"}
+    tracker.track_tokens(tokens, graph_key="g1")
+    client.track.assert_any_call("$ld:ai:tokens:total", context, expected, 10)  # type: ignore
+
+
+def test_config_tracker_track_feedback_with_graph_key(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    expected = {**_base_td(), "graphKey": "gx"}
+    tracker.track_feedback({"kind": FeedbackKind.Positive}, graph_key="gx")
+    client.track.assert_called_with(  # type: ignore
+        "$ld:ai:feedback:user:positive", context, expected, 1
+    )
+
+
+def test_config_tracker_track_tool_call(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    expected = {**_base_td(), "toolKey": "search"}
+    tracker.track_tool_call("search")
+    client.track.assert_called_with("$ld:ai:tool_call", context, expected, 1)  # type: ignore
+
+
+def test_config_tracker_track_tool_call_with_graph_key(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    expected = {**_base_td(), "graphKey": "my-graph", "toolKey": "calc"}
+    tracker.track_tool_call("calc", graph_key="my-graph")
+    client.track.assert_called_with("$ld:ai:tool_call", context, expected, 1)  # type: ignore
+
+
+def test_config_tracker_track_tool_calls(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+    tracker.track_tool_calls(["a", "b"], graph_key="g")
+    assert client.track.call_count == 2  # type: ignore
+    client.track.assert_any_call(  # type: ignore
+        "$ld:ai:tool_call",
+        context,
+        {**_base_td(), "graphKey": "g", "toolKey": "a"},
+        1,
+    )
+    client.track.assert_any_call(  # type: ignore
+        "$ld:ai:tool_call",
+        context,
+        {**_base_td(), "graphKey": "g", "toolKey": "b"},
+        1,
+    )
+
+
+def test_config_tracker_track_metrics_of(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+
+    def fn():
+        return "done"
+
+    def extract(r):
+        return LDAIMetrics(success=True, usage=TokenUsage(5, 2, 3))
+
+    out = tracker.track_metrics_of(fn, extract)
+    assert out == "done"
+    calls = client.track.mock_calls  # type: ignore
+    assert any(c.args[0] == "$ld:ai:generation:success" for c in calls)
+    assert any(c.args[0] == "$ld:ai:tokens:total" and c.args[3] == 5 for c in calls)
+
+
+@pytest.mark.asyncio
+async def test_config_tracker_track_metrics_of_async_passes_graph_key(client: LDClient):
+    context = Context.create("user-key")
+    tracker = LDAIConfigTracker(
+        client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context
+    )
+
+    async def fn():
+        return "ok"
+
+    def extract(r):
+        return LDAIMetrics(success=True, usage=TokenUsage(5, 2, 3))
+
+    await tracker.track_metrics_of_async(fn, extract, graph_key="gg")
+    gk_td = {**_base_td(), "graphKey": "gg"}
+    calls = client.track.mock_calls  # type: ignore
+    assert any(
+        c.args[0] == "$ld:ai:generation:success" and c.args[2] == gk_td for c in calls
+    )
+
+
+def test_ai_graph_tracker_graph_key_property(client: LDClient):
+    context = Context.create("user-key")
+    g = AIGraphTracker(client, "variation-key", "graph-key", 2, context)
+    assert g.graph_key == "graph-key"
+
+
+def test_ai_graph_tracker_track_total_tokens_skips_none_and_nonpositive(client: LDClient):
+    context = Context.create("user-key")
+    g = AIGraphTracker(client, "variation-key", "graph-key", 2, context)
+    g.track_total_tokens(None)
+    g.track_total_tokens(TokenUsage(0, 0, 0))
+    client.track.assert_not_called()  # type: ignore
+
+
+def test_ai_graph_tracker_track_total_tokens_tracks_when_positive(client: LDClient):
+    context = Context.create("user-key")
+    g = AIGraphTracker(client, "variation-key", "graph-key", 2, context)
+    g.track_total_tokens(TokenUsage(42, 30, 12))
+    client.track.assert_called_with(  # type: ignore
+        "$ld:ai:graph:total_tokens",
+        context,
+        {"variationKey": "variation-key", "graphKey": "graph-key", "version": 2},
+        42,
+    )

From 4aca10e34943aff72877d9cec2c9687af2923918 Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Wed, 25 Mar 2026 17:02:15 -0500
Subject: [PATCH 2/6] refactor: use time.perf_counter_ns() instead of
 time.time_ns() for duration measurement

perf_counter_ns is monotonic and designed for elapsed-time measurement; time.time_ns
reflects wall-clock time and can go backward due to NTP or clock adjustments.
---
 packages/sdk/server-ai/src/ldai/tracker.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py
index e50f0e4..3accc1f 100644
--- a/packages/sdk/server-ai/src/ldai/tracker.py
+++ b/packages/sdk/server-ai/src/ldai/tracker.py
@@ -156,11 +156,11 @@ def track_duration_of(self, func, *, graph_key: Optional[str] = None):
         :param graph_key: When set, passed through to :meth:`track_duration`.
         :return: Result of the tracked function.
         """
-        start_ns = time.time_ns()
+        start_ns = time.perf_counter_ns()
         try:
             result = func()
         finally:
-            duration = (time.time_ns() - start_ns) // 1_000_000  # duration in milliseconds
+            duration = (time.perf_counter_ns() - start_ns) // 1_000_000  # duration in milliseconds
             self.track_duration(duration, graph_key=graph_key)
 
         return result
@@ -205,16 +205,16 @@ def track_metrics_of(
         :param graph_key: When set, include ``graphKey`` on emitted config-level events.
         :return: The result of the operation
         """
-        start_ns = time.time_ns()
+        start_ns = time.perf_counter_ns()
         try:
             result = func()
         except Exception as err:
-            duration = (time.time_ns() - start_ns) // 1_000_000
+            duration = (time.perf_counter_ns() - start_ns) // 1_000_000
             self.track_duration(duration, graph_key=graph_key)
             self.track_error(graph_key=graph_key)
             raise err
 
-        duration = (time.time_ns() - start_ns) // 1_000_000
+        duration = (time.perf_counter_ns() - start_ns) // 1_000_000
         self.track_duration(duration, graph_key=graph_key)
         return self._track_from_metrics_extractor(result, metrics_extractor, graph_key=graph_key)
 
@@ -231,17 +231,17 @@ async def track_metrics_of_async(
         :param graph_key: When set, include ``graphKey`` on emitted config-level events.
         :return: The result of the operation
         """
-        start_ns = time.time_ns()
+        start_ns = time.perf_counter_ns()
         result = None
         try:
             result = await func()
         except Exception as err:
-            duration = (time.time_ns() - start_ns) // 1_000_000
+            duration = (time.perf_counter_ns() - start_ns) // 1_000_000
             self.track_duration(duration, graph_key=graph_key)
             self.track_error(graph_key=graph_key)
             raise err
 
-        duration = (time.time_ns() - start_ns) // 1_000_000
+        duration = (time.perf_counter_ns() - start_ns) // 1_000_000
         self.track_duration(duration, graph_key=graph_key)
         return self._track_from_metrics_extractor(result, metrics_extractor, graph_key=graph_key)
 
@@ -351,16 +351,16 @@ def track_openai_metrics(self, func):
         :param func: Function to track.
         :return: Result of the tracked function.
         """
-        start_ns = time.time_ns()
+        start_ns = time.perf_counter_ns()
         try:
             result = func()
-            duration = (time.time_ns() - start_ns) // 1_000_000
+            duration = (time.perf_counter_ns() - start_ns) // 1_000_000
             self.track_duration(duration)
             self.track_success()
             if hasattr(result, "usage") and hasattr(result.usage, "to_dict"):
                 self.track_tokens(_openai_to_token_usage(result.usage.to_dict()))
         except Exception:
-            duration = (time.time_ns() - start_ns) // 1_000_000
+            duration = (time.perf_counter_ns() - start_ns) // 1_000_000
             self.track_duration(duration)
             self.track_error()
             raise

From a183f12086def01ccade1aa7ce0f6aa0d9a27797 Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Thu, 26 Mar 2026 08:16:59 -0500
Subject: [PATCH 3/6] docs: update LangChain README example to use
 track_metrics_of_async

---
 packages/ai-providers/server-ai-langchain/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ai-providers/server-ai-langchain/README.md b/packages/ai-providers/server-ai-langchain/README.md
index a58dc00..be7125e 100644
--- a/packages/ai-providers/server-ai-langchain/README.md
+++ b/packages/ai-providers/server-ai-langchain/README.md
@@ -138,7 +138,7 @@ provider = await LangChainProvider.create(config)
 async def invoke():
     return await provider.invoke_model(messages)
 
-response = await config.tracker.track_metrics_of(
+response = await config.tracker.track_metrics_of_async(
     invoke,
     lambda r: r.metrics
 )

From 91cd3005b6caf0ef360f461e5016b70b01292dee Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Thu, 26 Mar 2026 12:55:46 -0500
Subject: [PATCH 4/6] fix: mutate track data dict in place and guard tool_calls
 with isinstance check

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../server-ai-langchain/src/ldai_langchain/langchain_helper.py  | 2 +-
 packages/sdk/server-ai/src/ldai/tracker.py                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
index 35eb396..e160061 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_helper.py
@@ -125,7 +125,7 @@ def get_tool_calls_from_response(response: Any) -> List[str]:
     :return: List of tool names in order, or empty list if none
     """
     names: List[str] = []
-    if hasattr(response, 'tool_calls') and response.tool_calls:
+    if hasattr(response, 'tool_calls') and isinstance(response.tool_calls, list):
         for tc in response.tool_calls:
             n = tc.get('name')
             if n:
diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py
index 3accc1f..6fbc46d 100644
--- a/packages/sdk/server-ai/src/ldai/tracker.py
+++ b/packages/sdk/server-ai/src/ldai/tracker.py
@@ -113,7 +113,7 @@ def __get_track_data(self, graph_key: Optional[str] = None) -> dict:
             "providerName": self._provider_name,
         }
         if graph_key is not None:
-            return {**data, "graphKey": graph_key}
+            data['graphKey'] = graph_key
         return data
 
     def track_duration(self, duration: int, *, graph_key: Optional[str] = None) -> None:

From d331775a269e1dfa48f446a5d60f819428c0430d Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Thu, 26 Mar 2026 12:59:08 -0500
Subject: [PATCH 5/6] fix: export get_tool_calls_from_response and
 sum_token_usage_from_messages; add tests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../src/ldai_langchain/__init__.py            |  4 +
 .../tests/test_langchain_provider.py          | 86 ++++++++++++++++++-
 2 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/__init__.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/__init__.py
index 2b88026..cb455e5 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/__init__.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/__init__.py
@@ -3,7 +3,9 @@
     create_langchain_model,
     get_ai_metrics_from_response,
     get_ai_usage_from_response,
+    get_tool_calls_from_response,
     map_provider,
+    sum_token_usage_from_messages,
 )
 from ldai_langchain.langchain_model_runner import LangChainModelRunner
 from ldai_langchain.langchain_runner_factory import LangChainRunnerFactory
@@ -18,5 +20,7 @@
     'create_langchain_model',
     'get_ai_metrics_from_response',
     'get_ai_usage_from_response',
+    'get_tool_calls_from_response',
     'map_provider',
+    'sum_token_usage_from_messages',
 ]
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
index b78fde8..9ce4e88 100644
--- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
+++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
@@ -7,7 +7,15 @@
 
 from ldai import LDMessage
 
-from ldai_langchain import LangChainModelRunner, LangChainRunnerFactory, convert_messages_to_langchain, get_ai_metrics_from_response, map_provider
+from ldai_langchain import (
+    LangChainModelRunner,
+    LangChainRunnerFactory,
+    convert_messages_to_langchain,
+    get_ai_metrics_from_response,
+    get_tool_calls_from_response,
+    map_provider,
+    sum_token_usage_from_messages,
+)
 
 
 class TestConvertMessages:
@@ -237,6 +245,82 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err
         assert result.metrics.usage is None
 
 
+class TestGetToolCallsFromResponse:
+    """Tests for get_tool_calls_from_response."""
+
+    def test_returns_tool_call_names_in_order(self):
+        """Should return tool call names from response.tool_calls."""
+        mock_response = MagicMock()
+        mock_response.tool_calls = [
+            {'name': 'search', 'args': {}},
+            {'name': 'calculator', 'args': {}},
+        ]
+        assert get_tool_calls_from_response(mock_response) == ['search', 'calculator']
+
+    def test_returns_empty_list_when_tool_calls_is_empty(self):
+        """Should return empty list when tool_calls is an empty list."""
+        mock_response = MagicMock()
+        mock_response.tool_calls = []
+        assert get_tool_calls_from_response(mock_response) == []
+
+    def test_returns_empty_list_when_no_tool_calls_attribute(self):
+        """Should return empty list when response has no tool_calls attribute."""
+        mock_response = MagicMock(spec=[])
+        assert get_tool_calls_from_response(mock_response) == []
+
+    def test_returns_empty_list_when_tool_calls_is_not_a_list(self):
+        """Should return empty list when tool_calls is not a list."""
+        mock_response = MagicMock()
+        mock_response.tool_calls = 'not-a-list'
+        assert get_tool_calls_from_response(mock_response) == []
+
+    def test_skips_tool_calls_without_name(self):
+        """Should skip tool calls that have no name."""
+        mock_response = MagicMock()
+        mock_response.tool_calls = [{'args': {}}, {'name': 'search', 'args': {}}]
+        assert get_tool_calls_from_response(mock_response) == ['search']
+
+
+class TestSumTokenUsageFromMessages:
+    """Tests for sum_token_usage_from_messages."""
+
+    def test_sums_usage_across_messages(self):
+        """Should sum token usage from all messages."""
+        msg1 = AIMessage(content='a')
+        msg1.usage_metadata = {'total_tokens': 10, 'input_tokens': 6, 'output_tokens': 4}
+        msg2 = AIMessage(content='b')
+        msg2.usage_metadata = {'total_tokens': 20, 'input_tokens': 12, 'output_tokens': 8}
+
+        result = sum_token_usage_from_messages([msg1, msg2])
+
+        assert result is not None
+        assert result.total == 30
+        assert result.input == 18
+        assert result.output == 12
+
+    def test_returns_none_when_no_usage_on_any_message(self):
+        """Should return None when no message has usage metadata."""
+        msg = AIMessage(content='hello')
+        assert sum_token_usage_from_messages([msg]) is None
+
+    def test_returns_none_for_empty_list(self):
+        """Should return None for an empty message list."""
+        assert sum_token_usage_from_messages([]) is None
+
+    def test_skips_messages_without_usage(self):
+        """Should skip messages that have no usage and sum the rest."""
+        msg1 = AIMessage(content='a')
+        msg2 = AIMessage(content='b')
+        msg2.usage_metadata = {'total_tokens': 5, 'input_tokens': 3, 'output_tokens': 2}
+
+        result = sum_token_usage_from_messages([msg1, msg2])
+
+        assert result is not None
+        assert result.total == 5
+        assert result.input == 3
+        assert result.output == 2
+
+
 class TestGetLlm:
     """Tests for LangChainModelRunner.get_llm."""
 

From cbcb5aea139bf6d3bdf63b3aa377c773d2b472c0 Mon Sep 17 00:00:00 2001
From: jsonbailey <jbailey@launchdarkly.com>
Date: Thu, 26 Mar 2026 13:16:12 -0500
Subject: [PATCH 6/6] fix: wrap long docstring line in tracker.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/sdk/server-ai/src/ldai/tracker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py
index 6fbc46d..c84365a 100644
--- a/packages/sdk/server-ai/src/ldai/tracker.py
+++ b/packages/sdk/server-ai/src/ldai/tracker.py
@@ -121,7 +121,8 @@ def track_duration(self, duration: int, *, graph_key: Optional[str] = None) -> N
         Manually track the duration of an AI operation.
 
         :param duration: Duration in milliseconds.
-        :param graph_key: When set, include ``graphKey`` in the event payload (e.g. config-level metrics inside a graph).
+        :param graph_key: When set, include ``graphKey`` in the event payload
+            (e.g. config-level metrics inside a graph).
         """
         self._summary._duration = duration
         self._ld_client.track(