11from __future__ import annotations
22
33import os
4+ import time
45import asyncio
56from enum import Enum
67from dataclasses import dataclass
78
89from agentex .types .span import Span
910from agentex .lib .utils .logging import make_logger
11+ from agentex .lib .core .observability import tracing_metrics_recording as _metrics
1012from agentex .lib .core .tracing .processors .tracing_processor_interface import (
1113 AsyncTracingProcessor ,
1214)
@@ -67,6 +69,7 @@ class _SpanQueueItem:
6769 event_type : SpanEventType
6870 span : Span
6971 processors : list [AsyncTracingProcessor ]
72+ enqueued_at : float | None = None
7073 # Number of times this item has already been dispatched. Used to bound
7174 # re-enqueue on transient failures.
7275 attempts : int = 0
@@ -134,6 +137,12 @@ def _record_drop(self, count: int, reason: str) -> None:
134137 if count <= 0 :
135138 return
136139 self ._dropped_spans += count
140+ if "shutdown" in reason :
141+ for _ in range (count ):
142+ _metrics .record_span_dropped ("shutdown" )
143+ elif "queue full" in reason :
144+ for _ in range (count ):
145+ _metrics .record_span_dropped ("queue_full" )
137146 # Warn on the first drop and then sparsely, so a drop storm is visible
138147 # without flooding the log.
139148 if self ._dropped_spans == count or self ._dropped_spans % 100 < count :
@@ -155,7 +164,15 @@ def enqueue(
155164 return
156165 self ._ensure_drain_running ()
157166 try :
158- self ._queue .put_nowait (_SpanQueueItem (event_type = event_type , span = span , processors = processors ))
167+ self ._queue .put_nowait (
168+ _SpanQueueItem (
169+ event_type = event_type ,
170+ span = span ,
171+ processors = processors ,
172+ enqueued_at = _metrics .monotonic_if_enabled (),
173+ )
174+ )
175+ _metrics .record_span_enqueued (event_type .value )
159176 except asyncio .QueueFull :
160177 self ._record_drop (1 , "queue full" )
161178
@@ -197,16 +214,33 @@ async def _drain_loop(self) -> None:
197214 break
198215
199216 try :
217+ _metrics .record_batch_coalesced (
218+ queue_depth = self ._queue .qsize () + len (batch ),
219+ batch_items = batch ,
220+ )
221+
200222 # Separate START and END events. Processing all STARTs before
201223 # ENDs ensures that on_span_start completes before on_span_end
202224 # for any span whose both events land in the same batch.
203225 starts = [i for i in batch if i .event_type == SpanEventType .START ]
204226 ends = [i for i in batch if i .event_type == SpanEventType .END ]
205227
206228 if starts :
229+ phase_start = time .perf_counter ()
207230 await self ._process_items (starts )
231+ _metrics .record_batch_phase (
232+ phase = "start" ,
233+ size = len (starts ),
234+ duration_ms = (time .perf_counter () - phase_start ) * 1000.0 ,
235+ )
208236 if ends :
237+ phase_start = time .perf_counter ()
209238 await self ._process_items (ends )
239+ _metrics .record_batch_phase (
240+ phase = "end" ,
241+ size = len (ends ),
242+ duration_ms = (time .perf_counter () - phase_start ) * 1000.0 ,
243+ )
210244 finally :
211245 for _ in batch :
212246 self ._queue .task_done ()
@@ -265,6 +299,12 @@ def _handle_failure(
265299 exhausted = len (items ) - len (retriable )
266300 if exhausted :
267301 self ._record_drop (exhausted , f"{ type (p ).__name__ } retries exhausted during { event_type .value } " )
302+ _metrics .record_export_failure (
303+ processor = p ,
304+ event_type = event_type .value ,
305+ span_count = exhausted ,
306+ exc = exc ,
307+ )
268308 for item in retriable :
269309 self ._reenqueue (item , p )
270310 if retriable :
@@ -285,6 +325,12 @@ def _handle_failure(
285325 len (items ),
286326 event_type .value ,
287327 )
328+ _metrics .record_export_failure (
329+ processor = p ,
330+ event_type = event_type .value ,
331+ span_count = len (items ),
332+ exc = exc ,
333+ )
288334
289335 def _reenqueue (self , item : _SpanQueueItem , p : AsyncTracingProcessor ) -> None :
290336 """Put a single failed item back on the queue, scoped to the processor
@@ -295,6 +341,7 @@ def _reenqueue(self, item: _SpanQueueItem, p: AsyncTracingProcessor) -> None:
295341 event_type = item .event_type ,
296342 span = item .span ,
297343 processors = [p ],
344+ enqueued_at = item .enqueued_at ,
298345 attempts = item .attempts + 1 ,
299346 )
300347 )
@@ -312,9 +359,11 @@ async def shutdown(self, timeout: float = 30.0) -> None:
312359 try :
313360 await asyncio .wait_for (self ._queue .join (), timeout = timeout )
314361 except asyncio .TimeoutError :
362+ remaining = self ._queue .qsize ()
315363 logger .warning (
316- "Span queue shutdown timed out after %.1fs with %d items remaining" , timeout , self . _queue . qsize ()
364+ "Span queue shutdown timed out after %.1fs with %d items remaining" , timeout , remaining
317365 )
366+ _metrics .record_shutdown_timeout (remaining_items = remaining )
318367 if self ._drain_task is not None and not self ._drain_task .done ():
319368 self ._drain_task .cancel ()
320369 try :
0 commit comments