diff --git a/.claude/worktrees/agent-a2dfcea2 b/.claude/worktrees/agent-a2dfcea2 new file mode 160000 index 00000000000..fc4b1a36cee --- /dev/null +++ b/.claude/worktrees/agent-a2dfcea2 @@ -0,0 +1 @@ +Subproject commit fc4b1a36ceef9c610441436e2003a0d31f94aeee diff --git a/.claude/worktrees/agent-adf53b58 b/.claude/worktrees/agent-adf53b58 new file mode 160000 index 00000000000..4666c89336e --- /dev/null +++ b/.claude/worktrees/agent-adf53b58 @@ -0,0 +1 @@ +Subproject commit 4666c89336ea288846835fcb0cbbf3698504c841 diff --git a/.claude/worktrees/master-bench b/.claude/worktrees/master-bench new file mode 160000 index 00000000000..26f1bcb84a5 --- /dev/null +++ b/.claude/worktrees/master-bench @@ -0,0 +1 @@ +Subproject commit 26f1bcb84a5b85ed43b9bc72cb4fc5dab57d6bbe diff --git a/.claude/worktrees/v1.62.0-bench b/.claude/worktrees/v1.62.0-bench new file mode 160000 index 00000000000..16c6a5fd1fe --- /dev/null +++ b/.claude/worktrees/v1.62.0-bench @@ -0,0 +1 @@ +Subproject commit 16c6a5fd1fed71cef6c35f69122f19f5ee242757 diff --git a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java index b72812bec07..55929c73f51 100644 --- a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java +++ b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java @@ -139,8 +139,7 @@ protected long getFeaturesDiscoveryMinDelayMillis() { private synchronized void discoverIfOutdated(final long maxElapsedMs) { final long now = System.currentTimeMillis(); - final State previous = discoveryState; - final long elapsed = now - previous.lastTimeDiscovered; + final long elapsed = now - discoveryState.lastTimeDiscovered; if (elapsed > maxElapsedMs) { final State newState = new State(); doDiscovery(newState); diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index ff4b3cd218d..15b4367fd54 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -78,6 +78,7 @@ public final class GeneralConfig { "trace.tracer.metrics.ignored.resources"; public static final String TRACE_STATS_CARDINALITY_LIMITS_ENABLED = "trace.stats.cardinality.limits.enabled"; + public static final String TRACE_STATS_ADDITIONAL_TAGS = "trace.stats.additional.tags"; public static final String AZURE_APP_SERVICES = "azure.app.services"; public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure"; diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java new file mode 100644 index 00000000000..7feffea5472 --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -0,0 +1,103 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; +import static java.util.concurrent.TimeUnit.SECONDS; + +import datadog.trace.api.WellKnownTags; +import datadog.trace.core.CoreSpan; +import de.thetaphi.forbiddenapis.SuppressForbidden; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Regression benchmark for the additional-tags hot path; {@code limitsEnabled=true} is slower here + * because it records more (sentinel collapses), not because limiting is expensive. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Threads(8) +@Fork(value = 1) +public class AdditionalTagsMetricsBenchmark { + + private ClientStatsAggregator aggregator; + private AdversarialMetricsBenchmark.CountingHealthMetrics health; + + @Param({"false", "true"}) + public boolean limitsEnabled; + + @State(Scope.Thread) + public static class ThreadState { + int cursor; + } + + @Setup + public void setup() { + this.health = new AdversarialMetricsBenchmark.CountingHealthMetrics(); + AdditionalTagsSchema additionalTagsSchema = + AdditionalTagsSchema.from( + new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), limitsEnabled, this.health); + this.aggregator = + new ClientStatsAggregator( + new WellKnownTags("", "", "", "", "", ""), + Collections.emptySet(), + additionalTagsSchema, + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + Collections.singleton("peer.hostname"), Collections.emptySet()), + this.health, + new ClientStatsAggregatorBenchmark.NullSink(), + 2048, + 2048, + false); + this.aggregator.start(); + } + + @TearDown + @SuppressForbidden + public void tearDown() { + aggregator.close(); + System.err.println("[ADDITIONAL-TAGS] counters (across all threads, single fork):"); + System.err.println(" onStatsInboxFull = " + health.inboxFull.sum()); + System.err.println(" onStatsAggregateDropped = " + health.aggregateDropped.sum()); + } + + @Benchmark + public void publish(ThreadState ts, Blackhole blackhole) { + int idx = ts.cursor++; + ThreadLocalRandom rng = ThreadLocalRandom.current(); + + int scrambled = idx * 0x9E3779B1; + String region = "region-" + ((scrambled >>> 4) & 0xFFFF); + String tenant = "tenant-" + ((scrambled >>> 16) & 0xFFFF); + long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL); + + SimpleSpan span = + new SimpleSpan("svc", "op", "res", "web", true, true, false, 0, durationNanos, 200); + span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); + span.setTag("region", region); + span.setTag("tenant_id", tenant); + + List> trace = Collections.singletonList(span); + blackhole.consume(aggregator.publish(trace)); + } +} diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java index 01ba90097a4..65d2ad49e9b 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java @@ -75,6 +75,7 @@ public void setup() { new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), + AdditionalTagsSchema.EMPTY, new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java index b9d72eaf3ab..2c71101956a 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java @@ -42,6 +42,7 @@ public class ClientStatsAggregatorBenchmark { new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), + AdditionalTagsSchema.EMPTY, featuresDiscovery, HealthMetrics.NO_OP, new NullSink(), diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java index 0453b8888db..901d5a96381 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java @@ -62,6 +62,7 @@ public class ClientStatsAggregatorDDSpanBenchmark { new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), + AdditionalTagsSchema.EMPTY, featuresDiscovery, HealthMetrics.NO_OP, new ClientStatsAggregatorBenchmark.NullSink(), diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java index 8fc45b4acd8..839657f16e5 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java @@ -65,6 +65,7 @@ public void setup() { new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), + AdditionalTagsSchema.EMPTY, new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java index 90c1f088935..f34028f41da 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java @@ -61,6 +61,7 @@ public void setup() { new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), + AdditionalTagsSchema.EMPTY, new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java new file mode 100644 index 00000000000..5e23efd9ce7 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java @@ -0,0 +1,118 @@ +package datadog.trace.common.metrics; + +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Immutable schema of configured span-derived primary tag keys; built once at aggregator + * construction. + */ +final class AdditionalTagsSchema { + + private static final Logger log = LoggerFactory.getLogger(AdditionalTagsSchema.class); + + // Backend pipeline supports ~4 primary tag dimensions by default; drop overflow at startup. + static final int MAX_ADDITIONAL_TAG_KEYS = 10; + + /** Singleton empty schema returned when no additional tags are configured. */ + static final AdditionalTagsSchema EMPTY = + new AdditionalTagsSchema(new String[0], new TagCardinalityHandler[0], HealthMetrics.NO_OP); + + final String[] names; + + /** Per-key handlers providing UTF8 caching and per-cycle cardinality limiting. */ + private final TagCardinalityHandler[] handlers; + + private final HealthMetrics healthMetrics; + + private AdditionalTagsSchema( + String[] names, TagCardinalityHandler[] handlers, HealthMetrics healthMetrics) { + this.names = names; + this.handlers = handlers; + this.healthMetrics = healthMetrics; + } + + /** Test convenience: uses {@link HealthMetrics#NO_OP} and limits enabled. */ + static AdditionalTagsSchema from(Set configured) { + return from(configured, true, HealthMetrics.NO_OP); + } + + static AdditionalTagsSchema from( + Set configured, boolean useBlockedSentinel, HealthMetrics healthMetrics) { + if (configured == null || configured.isEmpty()) { + return EMPTY; + } + List valid = new ArrayList<>(); + for (String key : configured) { + if (key == null || key.isEmpty()) { + log.warn("Ignoring empty additional metric tag key"); + continue; + } + if (key.contains(":")) { + log.warn("Ignoring additional metric tag key '{}': keys must not contain ':'", key); + continue; + } + valid.add(key); + } + if (valid.isEmpty()) { + return EMPTY; + } + Collections.sort(valid); + // Dedup (sort brings duplicates adjacent) + List deduped = new ArrayList<>(valid.size()); + String prev = null; + for (String key : valid) { + if (!key.equals(prev)) { + deduped.add(key); + prev = key; + } + } + if (deduped.size() > MAX_ADDITIONAL_TAG_KEYS) { + log.warn( + "Configured additional metric tag keys ({}) exceeds the supported limit of {}; " + + "dropping extra keys: {}", + deduped.size(), + MAX_ADDITIONAL_TAG_KEYS, + deduped.subList(MAX_ADDITIONAL_TAG_KEYS, deduped.size())); + deduped = deduped.subList(0, MAX_ADDITIONAL_TAG_KEYS); + } + String[] namesArr = deduped.toArray(new String[0]); + TagCardinalityHandler[] handlersArr = new TagCardinalityHandler[namesArr.length]; + for (int i = 0; i < namesArr.length; i++) { + handlersArr[i] = + new TagCardinalityHandler( + namesArr[i], MetricCardinalityLimits.ADDITIONAL_TAG_VALUE, useBlockedSentinel); + } + return new AdditionalTagsSchema(namesArr, handlersArr, healthMetrics); + } + + int size() { + return names.length; + } + + String name(int i) { + return names[i]; + } + + UTF8BytesString register(int i, String value) { + return handlers[i].register(value); + } + + void resetHandlers() { + for (int i = 0; i < handlers.length; i++) { + long blocked = handlers[i].reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for additional metric tag '{}'; further values will be reported as blocked_by_tracer", + names[i]); + healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); + } + } + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index b71a95e1aae..43318f697be 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -1,66 +1,25 @@ package datadog.trace.common.metrics; import datadog.metrics.api.Histogram; -import datadog.trace.api.Config; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; -import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.LongHashingUtils; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLongArray; import javax.annotation.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Aggregator hashtable entry: UTF8 label fields + counter/histogram state; hashing runs after - * canonicalization so overflow values collapse to a shared sentinel bucket rather than fragmenting. - * Not thread-safe — all mutation is on the aggregator thread. Tests must call {@link - * #resetCardinalityHandlers()} in setup to avoid cross-test handler pollution (handlers are - * static); tests using {@link AdditionalTagsSchema} must also call {@link - * AdditionalTagsSchema#resetHandlers()} on the schema instance. - */ -final class AggregateEntry extends Hashtable.Entry { - private static final Logger log = LoggerFactory.getLogger(AggregateEntry.class); +/** Aggregator hashtable entry: UTF8 label fields + counter/histogram state. */ +final class AggregateEntry extends Hashtable.Entry { static final long ERROR_TAG = 0x8000000000000000L; static final long TOP_LEVEL_TAG = 0x4000000000000000L; private static final UTF8BytesString[] EMPTY_TAGS = new UTF8BytesString[0]; - // Frozen at first AggregateEntry class-load; construct handlers with explicit useBlockedSentinel - // args in tests rather than trying to flip this via Config. - static final boolean LIMITS_ENABLED = Config.get().isTraceStatsCardinalityLimitsEnabled(); - - // Per-field cardinality handlers. Limits live on MetricCardinalityLimits -- see that class for - // per-field rationale. - static final PropertyCardinalityHandler RESOURCE_HANDLER = - new PropertyCardinalityHandler("resource", MetricCardinalityLimits.RESOURCE, LIMITS_ENABLED); - static final PropertyCardinalityHandler SERVICE_HANDLER = - new PropertyCardinalityHandler("service", MetricCardinalityLimits.SERVICE, LIMITS_ENABLED); - static final PropertyCardinalityHandler OPERATION_HANDLER = - new PropertyCardinalityHandler( - "operation", MetricCardinalityLimits.OPERATION, LIMITS_ENABLED); - static final PropertyCardinalityHandler SERVICE_SOURCE_HANDLER = - new PropertyCardinalityHandler( - "service_source", MetricCardinalityLimits.SERVICE_SOURCE, LIMITS_ENABLED); - static final PropertyCardinalityHandler TYPE_HANDLER = - new PropertyCardinalityHandler("type", MetricCardinalityLimits.TYPE, LIMITS_ENABLED); - static final PropertyCardinalityHandler SPAN_KIND_HANDLER = - new PropertyCardinalityHandler( - "span_kind", MetricCardinalityLimits.SPAN_KIND, LIMITS_ENABLED); - static final PropertyCardinalityHandler HTTP_METHOD_HANDLER = - new PropertyCardinalityHandler( - "http_method", MetricCardinalityLimits.HTTP_METHOD, LIMITS_ENABLED); - static final PropertyCardinalityHandler HTTP_ENDPOINT_HANDLER = - new PropertyCardinalityHandler( - "http_endpoint", MetricCardinalityLimits.HTTP_ENDPOINT, LIMITS_ENABLED); - static final PropertyCardinalityHandler GRPC_STATUS_CODE_HANDLER = - new PropertyCardinalityHandler( - "grpc_status_code", MetricCardinalityLimits.GRPC_STATUS_CODE, LIMITS_ENABLED); - final UTF8BytesString resource; final UTF8BytesString service; final UTF8BytesString operationName; @@ -78,15 +37,13 @@ final class AggregateEntry extends Hashtable.Entry { final boolean traceRoot; final List peerTags; - // Mutable aggregate state -- single-thread (aggregator) writer. - private final Histogram okLatencies = Histogram.newHistogram(); + // Schema-ordered "key:value" strings; "key:" prefix makes packing unambiguous without null slots. + final UTF8BytesString[] additionalTags; - /** - * Lazily allocated on the first recorded error. Most entries never see an error and keep this - * null for life; {@link SerializingMetricWriter} writes a cached empty-histogram form when null - * to keep the wire payload identical. Once allocated, it survives {@link #clear()} (cleared, not - * nulled) since an entry that errored once tends to error again. - */ + // Recording state. Mutated only on the aggregator thread. Not thread-safe. + private final Histogram okLatencies; + + // Null until first error; SerializingMetricWriter writes empty histogram form when null. @Nullable private Histogram errorLatencies; private int errorCount; @@ -109,7 +66,8 @@ private AggregateEntry( short httpStatusCode, boolean synthetic, boolean traceRoot, - List peerTags) { + List peerTags, + UTF8BytesString[] additionalTags) { super(keyHash); this.resource = resource; this.service = service; @@ -124,86 +82,8 @@ private AggregateEntry( this.synthetic = synthetic; this.traceRoot = traceRoot; this.peerTags = peerTags; - } - - /** - * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link - * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. - */ - AggregateEntry recordOneDuration(long tagAndDuration) { - ++hitCount; - if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { - tagAndDuration ^= TOP_LEVEL_TAG; - ++topLevelCount; - } - if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { - tagAndDuration ^= ERROR_TAG; - errorLatenciesForWrite().accept(tagAndDuration); - ++errorCount; - } else { - okLatencies.accept(tagAndDuration); - } - duration += tagAndDuration; - return this; - } - - int getErrorCount() { - return errorCount; - } - - int getHitCount() { - return hitCount; - } - - int getTopLevelCount() { - return topLevelCount; - } - - long getDuration() { - return duration; - } - - Histogram getOkLatencies() { - return okLatencies; - } - - /** - * Returns the entry's error-latency histogram, or {@code null} if no error has been recorded. - * Callers serializing this should treat {@code null} as "emit a cached empty histogram"; see - * {@link SerializingMetricWriter}. - */ - @Nullable - Histogram getErrorLatencies() { - return errorLatencies; - } - - /** Lazy-allocates {@link #errorLatencies} on the first error. */ - private Histogram errorLatenciesForWrite() { - Histogram h = errorLatencies; - if (h == null) { - h = Histogram.newHistogram(); - errorLatencies = h; - } - return h; - } - - /** - * Resets the per-cycle counters and histograms. Label fields ({@code resource}, {@code service}, - * ..., {@code peerTags}) are deliberately left intact -- they're the entry's bucket identity and - * must persist so a subsequent snapshot with the same key reuses this entry instead of allocating - * a fresh one. Entries that stay at {@code hitCount == 0} across a cycle are reaped by {@link - * AggregateTable#expungeStaleAggregates}. - */ - void clear() { - this.errorCount = 0; - this.hitCount = 0; - this.topLevelCount = 0; - this.duration = 0; - this.okLatencies.clear(); - // errorLatencies stays null on entries that never errored. Only clear if it was allocated. - if (this.errorLatencies != null) { - this.errorLatencies.clear(); - } + this.additionalTags = additionalTags; + this.okLatencies = Histogram.newHistogram(); } /** @@ -252,7 +132,9 @@ static AggregateEntry of( synthetic, traceRoot, peerTagsArr, - peerTagsArr.length); + peerTagsArr.length, + EMPTY_TAGS, + 0); return new AggregateEntry( keyHash, resourceUtf, @@ -267,38 +149,8 @@ static AggregateEntry of( (short) httpStatusCode, synthetic, traceRoot, - peerTagsList); - } - - /** - * Resets the static per-field cardinality handlers. Does not cover {@link AdditionalTagsSchema}. - */ - static void resetCardinalityHandlers() { - resetCardinalityHandlers(HealthMetrics.NO_OP); - } - - static void resetCardinalityHandlers(HealthMetrics healthMetrics) { - reportIfBlocked(healthMetrics, RESOURCE_HANDLER); - reportIfBlocked(healthMetrics, SERVICE_HANDLER); - reportIfBlocked(healthMetrics, OPERATION_HANDLER); - reportIfBlocked(healthMetrics, SERVICE_SOURCE_HANDLER); - reportIfBlocked(healthMetrics, TYPE_HANDLER); - reportIfBlocked(healthMetrics, SPAN_KIND_HANDLER); - reportIfBlocked(healthMetrics, HTTP_METHOD_HANDLER); - reportIfBlocked(healthMetrics, HTTP_ENDPOINT_HANDLER); - reportIfBlocked(healthMetrics, GRPC_STATUS_CODE_HANDLER); - PeerTagSchema.INTERNAL.resetHandlers(healthMetrics); - } - - private static void reportIfBlocked( - HealthMetrics healthMetrics, PropertyCardinalityHandler handler) { - long blocked = handler.reset(); - if (blocked > 0) { - log.warn( - "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer", - handler.name); - healthMetrics.onTagCardinalityBlocked(handler.statsDTag(), blocked); - } + peerTagsList, + EMPTY_TAGS); } /** @@ -326,7 +178,9 @@ static long hashOf( boolean synthetic, boolean traceRoot, UTF8BytesString[] peerTags, - int peerTagCount) { + int peerTagCount, + UTF8BytesString[] additionalTags, + int additionalTagCount) { long h = 0; h = LongHashingUtils.addToHash(h, resource); h = LongHashingUtils.addToHash(h, service); @@ -340,6 +194,11 @@ static long hashOf( for (int i = 0; i < peerTagCount; i++) { h = LongHashingUtils.addToHash(h, peerTags[i]); } + // Additional tags are packed compactly in schema order (alphabetical by key); each carries its + // "key:" prefix so the packed form is unambiguous without positional null slots. + for (int i = 0; i < additionalTagCount; i++) { + h = LongHashingUtils.addToHash(h, additionalTags[i]); + } h = LongHashingUtils.addToHash(h, httpStatusCode); h = LongHashingUtils.addToHash(h, synthetic); h = LongHashingUtils.addToHash(h, traceRoot); @@ -431,9 +290,162 @@ List getPeerTags() { return peerTags; } - // Production AggregateEntry intentionally has no equals/hashCode override -- AggregateTable - // bucketing uses keyHash + Canonical.matches and never invokes Object.equals. For tests that - // need value-equality (Spock argument matchers), use AggregateEntryTestUtils in src/test. + /** + * @return the packed additional-tag values this entry recorded, as canonical {@code "key:value"} + * UTF8BytesStrings in schema (alphabetical-by-key) order. Only tags the span actually set are + * present (no null slots), so the length is the count of present tags -- empty when the span + * set none or no additional tags are configured. + */ + UTF8BytesString[] getAdditionalTags() { + return additionalTags; + } + + // ----- recording state accessors ----- + + int getHitCount() { + return hitCount; + } + + int getErrorCount() { + return errorCount; + } + + int getTopLevelCount() { + return topLevelCount; + } + + long getDuration() { + return duration; + } + + Histogram getOkLatencies() { + return okLatencies; + } + + /** + * Returns the entry's error latency histogram, or {@code null} if no error has been recorded yet. + * Callers should treat null as "serialize as an empty histogram" (see {@link + * SerializingMetricWriter}). + */ + Histogram getErrorLatencies() { + return errorLatencies; + } + + /** Lazy-allocates {@link #errorLatencies} on the first error. */ + private Histogram errorLatenciesForWrite() { + Histogram h = errorLatencies; + if (h == null) { + h = Histogram.newHistogram(); + errorLatencies = h; + } + return h; + } + + /** + * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link + * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. + */ + @SuppressFBWarnings( + value = "AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", + justification = + "Single-writer by design: recording counters are mutated only on the aggregator thread" + + " (see class javadoc); no cross-thread atomicity guarantee is needed.") + AggregateEntry recordOneDuration(long tagAndDuration) { + ++hitCount; + if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + tagAndDuration ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { + tagAndDuration ^= ERROR_TAG; + errorLatenciesForWrite().accept(tagAndDuration); + ++errorCount; + } else { + okLatencies.accept(tagAndDuration); + } + duration += tagAndDuration; + return this; + } + + /** + * Records {@code count} durations from {@code durations} (positions 0..count-1). Used by + * integration tests; production code uses {@link #recordOneDuration}. + */ + @SuppressFBWarnings( + value = "AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", + justification = + "Single-writer by design: recording counters are mutated only on the aggregator thread" + + " (see class javadoc); no cross-thread atomicity guarantee is needed.") + AggregateEntry recordDurations(int count, AtomicLongArray durations) { + this.hitCount += count; + for (int i = 0; i < count && i < durations.length(); ++i) { + long d = durations.getAndSet(i, 0); + if ((d & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + d ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((d & ERROR_TAG) == ERROR_TAG) { + d ^= ERROR_TAG; + errorLatenciesForWrite().accept(d); + ++errorCount; + } else { + okLatencies.accept(d); + } + this.duration += d; + } + return this; + } + + /** + * Clears the recording state. The OK histogram is reused; the error histogram (if allocated) is + * reused too, but entries that never saw an error keep their {@code errorLatencies} field null. + */ + @SuppressFBWarnings( + value = {"AT_NONATOMIC_64BIT_PRIMITIVE", "AT_STALE_THREAD_WRITE_OF_PRIMITIVE"}, + justification = + "Single-writer by design: recording counters are reset only on the aggregator thread" + + " (see class javadoc); no cross-thread visibility guarantee is needed.") + void clearAggregate() { + this.errorCount = 0; + this.hitCount = 0; + this.topLevelCount = 0; + this.duration = 0; + this.okLatencies.clear(); + if (this.errorLatencies != null) { + this.errorLatencies.clear(); + } + } + + /** + * Equality on the 13 label fields (not on the recording counters). Used only by test mock + * matchers; the {@link Hashtable} does its own bucketing via {@link #keyHash} + {@link + * Canonical#matches} and never calls {@code equals}. + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof AggregateEntry)) return false; + AggregateEntry that = (AggregateEntry) o; + return httpStatusCode == that.httpStatusCode + && synthetic == that.synthetic + && traceRoot == that.traceRoot + && Objects.equals(resource, that.resource) + && Objects.equals(service, that.service) + && Objects.equals(operationName, that.operationName) + && Objects.equals(serviceSource, that.serviceSource) + && Objects.equals(type, that.type) + && Objects.equals(spanKind, that.spanKind) + && Objects.equals(peerTags, that.peerTags) + && Arrays.equals(additionalTags, that.additionalTags) + && Objects.equals(httpMethod, that.httpMethod) + && Objects.equals(httpEndpoint, that.httpEndpoint) + && Objects.equals(grpcStatusCode, that.grpcStatusCode); + } + + @Override + public int hashCode() { + return (int) keyHash; + } /** * Reusable scratch buffer for canonicalizing a {@link SpanSnapshot} into UTF8 fields, computing @@ -467,39 +479,69 @@ static final class Canonical { int peerTagsSize = 0; + /** Schema + per-key blocked sentinels for additional metric tags. Immutable. */ + final AdditionalTagsSchema additionalTagsSchema; + + /** Per-field property cardinality handlers; owned by the enclosing {@link AggregateTable}. */ + final PropertyHandlers handlers; + + /** + * Reusable scratch for canonicalized additional-tag values, sized to the schema. Present values + * are packed at the front in schema order (alphabetical by key); {@link #additionalTagsSize} + * gives the count. Each entry is a {@code "key:value"} UTF8BytesString, so packing loses no + * information -- the key prefix disambiguates which key a value belongs to. Mirrors the {@code + * peerTagsBuffer + peerTagsSize} pattern. {@link #createEntry} copies the populated prefix into + * the new entry. + */ + final UTF8BytesString[] additionalTagsBuffer; + + int additionalTagsSize; + long keyHash; + Canonical(AdditionalTagsSchema additionalTagsSchema, PropertyHandlers handlers) { + this.additionalTagsSchema = additionalTagsSchema; + this.handlers = handlers; + this.additionalTagsBuffer = new UTF8BytesString[additionalTagsSchema.size()]; + } + /** Canonicalize all fields from {@code s} through the handlers into this buffer. */ void populate(SpanSnapshot s) { - this.resource = RESOURCE_HANDLER.register(s.resourceName); - this.service = SERVICE_HANDLER.register(s.serviceName); - this.operationName = OPERATION_HANDLER.register(s.operationName); - this.serviceSource = SERVICE_SOURCE_HANDLER.register(s.serviceNameSource); - this.type = TYPE_HANDLER.register(s.spanType); - this.spanKind = SPAN_KIND_HANDLER.register(s.spanKind); - this.httpMethod = HTTP_METHOD_HANDLER.register(s.httpMethod); - this.httpEndpoint = HTTP_ENDPOINT_HANDLER.register(s.httpEndpoint); - this.grpcStatusCode = GRPC_STATUS_CODE_HANDLER.register(s.grpcStatusCode); + this.resource = handlers.resource.register(s.resourceName); + this.service = handlers.service.register(s.serviceName); + this.operationName = handlers.operation.register(s.operationName); + this.serviceSource = handlers.serviceSource.register(s.serviceNameSource); + this.type = handlers.type.register(s.spanType); + this.spanKind = handlers.spanKind.register(s.spanKind); + this.httpMethod = handlers.httpMethod.register(s.httpMethod); + this.httpEndpoint = handlers.httpEndpoint.register(s.httpEndpoint); + this.grpcStatusCode = handlers.grpcStatusCode.register(s.grpcStatusCode); this.httpStatusCode = s.httpStatusCode; this.synthetic = s.synthetic; this.traceRoot = s.traceRoot; populatePeerTags(s.peerTagSchema, s.peerTagValues); - this.keyHash = - hashOf( - resource, - service, - operationName, - serviceSource, - type, - spanKind, - httpMethod, - httpEndpoint, - grpcStatusCode, - httpStatusCode, - synthetic, - traceRoot, - peerTagsBuffer != null ? peerTagsBuffer : EMPTY_TAGS, - peerTagsSize); + populateAdditionalTags(s.additionalTagValues); + this.keyHash = computeKeyHash(); + } + + private long computeKeyHash() { + return hashOf( + resource, + service, + operationName, + serviceSource, + type, + spanKind, + httpMethod, + httpEndpoint, + grpcStatusCode, + httpStatusCode, + synthetic, + traceRoot, + peerTagsBuffer, + peerTagsSize, + additionalTagsBuffer, + additionalTagsSize); } /** @@ -509,7 +551,7 @@ void populate(SpanSnapshot s) { * Producer-side {@code capturePeerTagValues} produces sparse-null arrays, so the skip pays off * whenever a span carries only a subset of the configured peer tags. */ - private void populatePeerTags(PeerTagSchema schema, String[] values) { + private void populatePeerTags(@Nullable PeerTagSchema schema, @Nullable String[] values) { peerTagsSize = 0; if (schema == null || values == null) { return; @@ -527,6 +569,27 @@ private void populatePeerTags(PeerTagSchema schema, String[] values) { } } + /** + * Packs canonical {@code "key:value"} UTF8BytesStrings for each present slot of {@code values} + * into the front of {@link #additionalTagsBuffer} (schema order), via {@link + * AdditionalTagsSchema#register}, and sets {@link #additionalTagsSize}. The handler returns the + * per-key blocked sentinel when the per-cycle value budget is exhausted. + */ + private void populateAdditionalTags(@Nullable String[] values) { + additionalTagsSize = 0; + int n = additionalTagsBuffer.length; + if (n == 0 || values == null) { + return; + } + for (int i = 0; i < n; i++) { + String v = values[i]; + if (v == null) { + continue; + } + additionalTagsBuffer[additionalTagsSize++] = additionalTagsSchema.register(i, v); + } + } + /** * Whether this canonicalized snapshot matches the given entry. Compares UTF8 fields via * content-equality (so an entry surviving a handler reset still matches a freshly-canonicalized @@ -550,7 +613,22 @@ boolean matches(AggregateEntry e) { && peerTagsEqual(peerTagsBuffer, peerTagsSize, e.peerTags) && httpStatusCode == e.httpStatusCode && synthetic == e.synthetic - && traceRoot == e.traceRoot; + && traceRoot == e.traceRoot + && additionalTagsEqual(additionalTagsBuffer, additionalTagsSize, e.additionalTags); + } + + /** Compact compare: first {@code aSize} slots of {@code a} against the entry's packed array. */ + private static boolean additionalTagsEqual( + UTF8BytesString[] a, int aSize, UTF8BytesString[] b) { + if (aSize != b.length) { + return false; + } + for (int i = 0; i < aSize; i++) { + if (!a[i].equals(b[i])) { + return false; + } + } + return true; } private static boolean peerTagsEqual(UTF8BytesString[] a, int aSize, List b) { @@ -580,6 +658,10 @@ AggregateEntry createEntry() { } else { snapshottedPeerTags = Arrays.asList(Arrays.copyOf(peerTagsBuffer, n)); } + UTF8BytesString[] snapshottedAdditionalTags = + additionalTagsSize == 0 + ? EMPTY_TAGS + : Arrays.copyOf(additionalTagsBuffer, additionalTagsSize); return new AggregateEntry( keyHash, resource, @@ -594,7 +676,8 @@ AggregateEntry createEntry() { httpStatusCode, synthetic, traceRoot, - snapshottedPeerTags); + snapshottedPeerTags, + snapshottedAdditionalTags); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index 71e96022ba7..b9d2ef3942e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -1,5 +1,7 @@ package datadog.trace.common.metrics; +import datadog.trace.api.Config; +import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.Hashtable.MutatingTableIterator; import datadog.trace.util.Hashtable.Support; @@ -18,8 +20,8 @@ * *

Not thread-safe. The aggregator thread is the sole writer of both this table and its * contained {@link AggregateEntry} state. Any cross-thread request that needs to mutate -- e.g. - * {@link ConflatingMetricsAggregator#disable()} -- must funnel onto the aggregator thread via the - * inbox (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- + * {@link ClientStatsAggregator#disable()} -- must funnel onto the aggregator thread via the inbox + * (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- * enforced; nothing here checks the calling thread at runtime, so a wrong-thread call would corrupt * bucket chains silently. */ @@ -27,7 +29,7 @@ final class AggregateTable { private final Hashtable.Entry[] buckets; private final int maxAggregates; - private final AggregateEntry.Canonical canonical = new AggregateEntry.Canonical(); + private final AggregateEntry.Canonical canonical; private int size; /** @@ -38,8 +40,28 @@ final class AggregateTable { private int evictCursor; AggregateTable(int maxAggregates) { + this(maxAggregates, AdditionalTagsSchema.EMPTY); + } + + AggregateTable(int maxAggregates, AdditionalTagsSchema additionalTagsSchema) { + this( + maxAggregates, + additionalTagsSchema, + // Read fresh from Config rather than the frozen MetricCardinalityLimits.ENABLED so tests + // can flip the flag via injectSysConfig before constructing the table. In production the + // two are identical (Config is immutable for the process lifetime). + new PropertyHandlers(Config.get().isTraceStatsCardinalityLimitsEnabled())); + } + + AggregateTable( + int maxAggregates, AdditionalTagsSchema additionalTagsSchema, PropertyHandlers handlers) { this.buckets = Support.create(maxAggregates, Support.MAX_RATIO); this.maxAggregates = maxAggregates; + this.canonical = new AggregateEntry.Canonical(additionalTagsSchema, handlers); + } + + void resetHandlers(HealthMetrics healthMetrics) { + canonical.handlers.reset(healthMetrics); } int size() { @@ -65,6 +87,7 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { return candidate; } } + // Miss path. if (size >= maxAggregates && !evictOneStale()) { return null; } @@ -84,11 +107,11 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { * onStatsAggregateDropped}) rather than evicting an established one. Cap is sized to the * steady-state working set, so eviction is rare in the common case. * - *

How often this fires depends on {@link AggregateEntry#LIMITS_ENABLED}. With limits enabled, - * over-cap values for a given field collapse into a shared {@code blocked_by_tracer} bucket, so - * the table itself rarely reaches {@code maxAggregates}. With limits disabled (the default), - * over-cap values flow to distinct buckets and {@code maxAggregates} becomes the load-bearing - * backstop -- the cursor-resumed scan was added specifically for this regime. + *

How often this fires depends on {@link MetricCardinalityLimits#ENABLED}. With limits + * enabled, over-cap values for a given field collapse into a shared {@code blocked_by_tracer} + * bucket, so the table itself rarely reaches {@code maxAggregates}. With limits disabled (the + * default), over-cap values flow to distinct buckets and {@code maxAggregates} becomes the + * load-bearing backstop -- the cursor-resumed scan was added specifically for this regime. */ private boolean evictOneStale() { // Two passes -- [cursor, length) then [0, cursor) -- using the half-open-range iterator. The diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 8a33d3f1ea7..afdfcd614e5 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -49,6 +49,7 @@ final class Aggregator implements Runnable { long reportingInterval, TimeUnit reportingIntervalTimeUnit, HealthMetrics healthMetrics, + AdditionalTagsSchema additionalTagsSchema, Runnable onReportCycle) { this( writer, @@ -58,6 +59,7 @@ final class Aggregator implements Runnable { reportingIntervalTimeUnit, DEFAULT_SLEEP_MILLIS, healthMetrics, + additionalTagsSchema, onReportCycle); } @@ -69,16 +71,21 @@ final class Aggregator implements Runnable { TimeUnit reportingIntervalTimeUnit, long sleepMillis, HealthMetrics healthMetrics, + AdditionalTagsSchema additionalTagsSchema, Runnable onReportCycle) { this.writer = writer; this.inbox = inbox; - this.aggregates = new AggregateTable(maxAggregates); + this.aggregates = new AggregateTable(maxAggregates, additionalTagsSchema); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; this.healthMetrics = healthMetrics; this.onReportCycle = onReportCycle; } + void resetPropertyHandlers(HealthMetrics healthMetrics) { + aggregates.resetHandlers(healthMetrics); + } + @Override public void run() { Thread currentThread = Thread.currentThread(); @@ -174,7 +181,7 @@ private void report(long when, SignalItem signal) { writer, (w, entry) -> { w.add(entry); - entry.clear(); + entry.clearAggregate(); }); // note that this may do IO and block writer.finishBucket(); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 1861f9555b6..dfdcc060e90 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -71,6 +71,7 @@ public final class ClientStatsAggregator implements MetricsAggregator, EventList private final TimeUnit reportingIntervalTimeUnit; private final DDAgentFeaturesDiscovery features; private final HealthMetrics healthMetrics; + private final AdditionalTagsSchema additionalTagsSchema; private final boolean includeEndpointInMetrics; /** @@ -100,6 +101,8 @@ public ClientStatsAggregator( this( config.getWellKnownTags(), config.getMetricsIgnoredResources(), + AdditionalTagsSchema.from( + config.getTraceStatsAdditionalTags(), MetricCardinalityLimits.ENABLED, healthMetrics), sharedCommunicationObjects.featuresDiscovery(config), healthMetrics, new OkHttpSink( @@ -117,6 +120,7 @@ public ClientStatsAggregator( ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, Sink sink, @@ -126,6 +130,7 @@ public ClientStatsAggregator( this( wellKnownTags, ignoredResources, + additionalTagsSchema, features, healthMetric, sink, @@ -139,6 +144,7 @@ public ClientStatsAggregator( ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, Sink sink, @@ -149,6 +155,7 @@ public ClientStatsAggregator( boolean includeEndpointInMetrics) { this( ignoredResources, + additionalTagsSchema, features, healthMetric, sink, @@ -160,6 +167,7 @@ public ClientStatsAggregator( includeEndpointInMetrics); } + /** Test-only: defaults to no additional tags schema. */ ClientStatsAggregator( Set ignoredResources, DDAgentFeaturesDiscovery features, @@ -171,7 +179,34 @@ public ClientStatsAggregator( long reportingInterval, TimeUnit timeUnit, boolean includeEndpointInMetrics) { + this( + ignoredResources, + AdditionalTagsSchema.EMPTY, + features, + healthMetric, + sink, + metricWriter, + maxAggregates, + queueSize, + reportingInterval, + timeUnit, + includeEndpointInMetrics); + } + + ClientStatsAggregator( + Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, + DDAgentFeaturesDiscovery features, + HealthMetrics healthMetric, + Sink sink, + MetricWriter metricWriter, + int maxAggregates, + int queueSize, + long reportingInterval, + TimeUnit timeUnit, + boolean includeEndpointInMetrics) { this.ignoredResources = ignoredResources; + this.additionalTagsSchema = additionalTagsSchema; this.includeEndpointInMetrics = includeEndpointInMetrics; this.inbox = Queues.mpscArrayQueue(queueSize); this.features = features; @@ -185,6 +220,7 @@ public ClientStatsAggregator( reportingInterval, timeUnit, healthMetric, + additionalTagsSchema, this::resetCardinalityHandlers); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; @@ -344,6 +380,8 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer spanPeerTagSchema = null; } + String[] additionalTagValues = captureAdditionalTagValues(span); + SpanSnapshot snapshot = new SpanSnapshot( span.getResourceName(), @@ -360,6 +398,7 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer httpMethod, httpEndpoint, grpcStatusCode, + additionalTagValues, tagAndDuration); if (!inbox.offer(snapshot)) { healthMetrics.onStatsInboxFull(); @@ -368,6 +407,16 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer return error; } + /** + * Captures the span's additional-metric-tag values into a {@code String[]} parallel to {@code + * additionalTagsSchema.names}. Returns {@code null} when no additional tags are configured or + * none of the configured keys are set on the span. Raw values only -- length cap and + * canonicalization run on the aggregator thread. + */ + private String[] captureAdditionalTagValues(CoreSpan span) { + return captureTagValues(span, additionalTagsSchema.names); + } + /** * One-time producer-side bootstrap of {@link #cachedPeerTagSchema}. Synchronized double-check * guards against two producers racing on the very first publish; after this returns, {@code @@ -405,18 +454,18 @@ private PeerTagSchema buildPeerTagSchema() { /** * Single reset hook invoked on the aggregator thread at the end of each report cycle. Reconciles * the cached peer-tag schema against the latest feature discovery, then resets all cardinality - * state in lockstep: the static property handlers + {@code PeerTagSchema.INTERNAL} (via {@link - * AggregateEntry#resetCardinalityHandlers()}) and the cached peer-tag schema (with whatever - * reconciliation just produced). New handlers added anywhere in this pipeline should be reset - * from here. + * state in lockstep: the property handlers, both peer-tag schemas, and the additional tags + * schema. New handlers added anywhere in this pipeline should be reset from here. */ private void resetCardinalityHandlers() { reconcilePeerTagSchema(); - AggregateEntry.resetCardinalityHandlers(healthMetrics); + aggregator.resetPropertyHandlers(healthMetrics); + PeerTagSchema.INTERNAL.resetHandlers(healthMetrics); PeerTagSchema schema = cachedPeerTagSchema; if (schema != null) { schema.resetHandlers(healthMetrics); } + additionalTagsSchema.resetHandlers(); } /** @@ -473,7 +522,10 @@ private static PeerTagSchema peerTagSchemaFor(CoreSpan span, PeerTagSchema pe * Returns {@code null} when none of the configured peer tags are set on the span. */ private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema schema) { - String[] names = schema.names; + return captureTagValues(span, schema.names); + } + + private static String[] captureTagValues(CoreSpan span, String[] names) { int n = names.length; String[] values = null; for (int i = 0; i < n; i++) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java index f7d91343d4b..f42a37ebbfd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java @@ -1,5 +1,7 @@ package datadog.trace.common.metrics; +import datadog.trace.api.Config; + /** * Per-field caps on the number of distinct values canonicalized per reporting cycle. Overflow * values collapse to a {@code blocked_by_tracer} sentinel so they merge into one aggregate row @@ -12,6 +14,14 @@ final class MetricCardinalityLimits { private MetricCardinalityLimits() {} + /** + * Master switch for cardinality limiting, frozen at class-load from {@code Config}. When {@code + * false}, handlers still cap their working-set size but emit freshly-allocated values instead of + * the {@code blocked_by_tracer} sentinel. Tests construct handlers with an explicit {@code + * useBlockedSentinel} arg rather than flipping this. + */ + static final boolean ENABLED = Config.get().isTraceStatsCardinalityLimitsEnabled(); + /** * Distinct {@code resource.name} values per cycle. Highest-cardinality field by far: DB-query * obfuscations, HTTP route templates, custom resources. Typical service: 30-200 unique. @@ -70,4 +80,10 @@ private MetricCardinalityLimits() {} * peer tag gets its own handler at this limit. */ static final int PEER_TAG_VALUE = 512; + + /** + * Distinct values per additional-tag key (e.g. distinct values of a span-derived primary tag). + * Each configured additional tag gets its own {@link TagCardinalityHandler} at this limit. + */ + static final int ADDITIONAL_TAG_VALUE = 512; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index af7e1bfd6b5..5c23f14efc1 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -83,7 +83,7 @@ private PeerTagSchema(String[] names, String state) { for (int i = 0; i < names.length; i++) { this.handlers[i] = new TagCardinalityHandler( - names[i], MetricCardinalityLimits.PEER_TAG_VALUE, AggregateEntry.LIMITS_ENABLED); + names[i], MetricCardinalityLimits.PEER_TAG_VALUE, MetricCardinalityLimits.ENABLED); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java new file mode 100644 index 00000000000..76d1bf87860 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java @@ -0,0 +1,75 @@ +package datadog.trace.common.metrics; + +import datadog.trace.core.monitor.HealthMetrics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Bundles the nine per-field property cardinality handlers; owned by {@link ClientStatsAggregator}. + */ +final class PropertyHandlers { + + private static final Logger log = LoggerFactory.getLogger(PropertyHandlers.class); + + final PropertyCardinalityHandler resource; + final PropertyCardinalityHandler service; + final PropertyCardinalityHandler operation; + final PropertyCardinalityHandler serviceSource; + final PropertyCardinalityHandler type; + final PropertyCardinalityHandler spanKind; + final PropertyCardinalityHandler httpMethod; + final PropertyCardinalityHandler httpEndpoint; + final PropertyCardinalityHandler grpcStatusCode; + + private final PropertyCardinalityHandler[] handlers; + + PropertyHandlers(boolean limitsEnabled) { + this.resource = + new PropertyCardinalityHandler("resource", MetricCardinalityLimits.RESOURCE, limitsEnabled); + this.service = + new PropertyCardinalityHandler("service", MetricCardinalityLimits.SERVICE, limitsEnabled); + this.operation = + new PropertyCardinalityHandler( + "operation", MetricCardinalityLimits.OPERATION, limitsEnabled); + this.serviceSource = + new PropertyCardinalityHandler( + "service_source", MetricCardinalityLimits.SERVICE_SOURCE, limitsEnabled); + this.type = new PropertyCardinalityHandler("type", MetricCardinalityLimits.TYPE, limitsEnabled); + this.spanKind = + new PropertyCardinalityHandler( + "span_kind", MetricCardinalityLimits.SPAN_KIND, limitsEnabled); + this.httpMethod = + new PropertyCardinalityHandler( + "http_method", MetricCardinalityLimits.HTTP_METHOD, limitsEnabled); + this.httpEndpoint = + new PropertyCardinalityHandler( + "http_endpoint", MetricCardinalityLimits.HTTP_ENDPOINT, limitsEnabled); + this.grpcStatusCode = + new PropertyCardinalityHandler( + "grpc_status_code", MetricCardinalityLimits.GRPC_STATUS_CODE, limitsEnabled); + this.handlers = + new PropertyCardinalityHandler[] { + resource, + service, + operation, + serviceSource, + type, + spanKind, + httpMethod, + httpEndpoint, + grpcStatusCode + }; + } + + void reset(HealthMetrics healthMetrics) { + for (PropertyCardinalityHandler h : handlers) { + long blocked = h.reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer", + h.name); + healthMetrics.onTagCardinalityBlocked(h.statsDTag(), blocked); + } + } + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java index 622a4a14cb0..fc9822afec2 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java @@ -43,6 +43,7 @@ public final class SerializingMetricWriter implements MetricWriter { private static final byte[] IS_TRACE_ROOT = "IsTraceRoot".getBytes(ISO_8859_1); private static final byte[] SPAN_KIND = "SpanKind".getBytes(ISO_8859_1); private static final byte[] PEER_TAGS = "PeerTags".getBytes(ISO_8859_1); + private static final byte[] ADDITIONAL_METRIC_TAGS = "AdditionalMetricTags".getBytes(ISO_8859_1); private static final byte[] HTTP_METHOD = "HTTPMethod".getBytes(ISO_8859_1); private static final byte[] HTTP_ENDPOINT = "HTTPEndpoint".getBytes(ISO_8859_1); private static final byte[] GRPC_STATUS_CODE = "GRPCStatusCode".getBytes(ISO_8859_1); @@ -157,12 +158,15 @@ public void add(AggregateEntry entry) { final boolean hasHttpEndpoint = entry.hasHttpEndpoint(); final boolean hasServiceSource = entry.hasServiceSource(); final boolean hasGrpcStatusCode = entry.hasGrpcStatusCode(); + final UTF8BytesString[] additionalTags = entry.getAdditionalTags(); + final boolean hasAdditionalTags = additionalTags.length > 0; final int mapSize = 15 + (hasServiceSource ? 1 : 0) + (hasHttpMethod ? 1 : 0) + (hasHttpEndpoint ? 1 : 0) - + (hasGrpcStatusCode ? 1 : 0); + + (hasGrpcStatusCode ? 1 : 0) + + (hasAdditionalTags ? 1 : 0); writer.startMap(mapSize); @@ -198,6 +202,17 @@ public void add(AggregateEntry entry) { writer.writeUTF8(peerTag); } + // Emit AdditionalMetricTags as a packed array of pre-built "key:value" UTF8BytesStrings, in + // schema (alphabetical-by-key) order. The field is omitted entirely when the entry carries no + // additional tags, so spans that set none pay zero payload overhead. + if (hasAdditionalTags) { + writer.writeUTF8(ADDITIONAL_METRIC_TAGS); + writer.startArray(additionalTags.length); + for (UTF8BytesString slot : additionalTags) { + writer.writeUTF8(slot); + } + } + if (hasServiceSource) { writer.writeUTF8(SERVICE_SOURCE); writer.writeUTF8(entry.getServiceSource()); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 8bbc6a29edb..da7a338cd70 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -40,6 +40,14 @@ final class SpanSnapshot implements InboxItem { @Nullable final String httpEndpoint; @Nullable final String grpcStatusCode; + /** + * Additional metric tag values captured from the span, parallel to {@code + * additionalTagsSchema.names}. A {@code null} entry means the span didn't have that tag set. + * {@code null} (the whole array) when no additional tags are configured or none were set on the + * span. Length cap is applied on the aggregator thread; the producer carries raw values only. + */ + @Nullable final String[] additionalTagValues; + /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ final long tagAndDuration; @@ -58,6 +66,7 @@ final class SpanSnapshot implements InboxItem { @Nullable String httpMethod, @Nullable String httpEndpoint, @Nullable String grpcStatusCode, + @Nullable String[] additionalTagValues, long tagAndDuration) { this.resourceName = resourceName; this.serviceName = serviceName; @@ -73,6 +82,7 @@ final class SpanSnapshot implements InboxItem { this.httpMethod = httpMethod; this.httpEndpoint = httpEndpoint; this.grpcStatusCode = grpcStatusCode; + this.additionalTagValues = additionalTagValues; this.tagAndDuration = tagAndDuration; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java index 8ec3b756b72..2ce852c13c0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java @@ -1,6 +1,7 @@ package datadog.trace.common.metrics; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.Arrays; /** @@ -72,6 +73,11 @@ final class TagCardinalityHandler { * miss-with-budget) the prior-cycle table; mixing with the upper half ({@code h ^ (h >>> 16)}) * keeps inputs sharing a low-bit pattern off the same probe chain. */ + @SuppressFBWarnings( + value = "ES_COMPARING_PARAMETER_STRING_WITH_EQ", + justification = + "Intentional identity fast-path: the reference check short-circuits the .equals() call" + + " when the stored key and probe value are the same instance.") UTF8BytesString register(String value) { if (value == null) { return UTF8BytesString.EMPTY; diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy index ff7aba89b80..734d0a92e61 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy @@ -38,6 +38,7 @@ class ClientStatsAggregatorTest extends DDSpecification { ClientStatsAggregator aggregator = new ClientStatsAggregator( wellKnownTags, empty, + AdditionalTagsSchema.EMPTY, features, HealthMetrics.NO_OP, sink, @@ -68,6 +69,7 @@ class ClientStatsAggregatorTest extends DDSpecification { ClientStatsAggregator aggregator = new ClientStatsAggregator( wellKnownTags, [ignoredResourceName].toSet(), + AdditionalTagsSchema.EMPTY, features, HealthMetrics.NO_OP, sink, @@ -1697,6 +1699,52 @@ class ClientStatsAggregatorTest extends DDSpecification { aggregator.close() } + def "cardinality limits reset between report cycles"() { + setup: + injectSysConfig("trace.stats.cardinality.limits.enabled", "true") + List cycle1Entries = [] + List cycle2Entries = [] + CountDownLatch latch1 = new CountDownLatch(1) + CountDownLatch latch2 = new CountDownLatch(1) + MetricWriter writer = Mock(MetricWriter) + Sink sink = Stub(Sink) + DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) + features.supportsMetrics() >> true + features.peerTags() >> [] + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, + features, HealthMetrics.NO_OP, sink, writer, 256, queueSize, reportingInterval, SECONDS, false) + aggregator.start() + + when: "publish SERVICE+1 distinct services to fill and overflow the cardinality budget" + for (int i = 0; i <= MetricCardinalityLimits.SERVICE; i++) { + aggregator.publish([new SimpleSpan("svc-$i", "op", "resource", "web", false, true, false, 0, 100, HTTP_OK)]) + } + aggregator.report() + latch1.await(2, SECONDS) + + then: "the overflow service maps to the blocked_by_tracer sentinel" + 1 * writer.startBucket(MetricCardinalityLimits.SERVICE + 1, _, _) + (1.._) * writer.add(_) >> { AggregateEntry e -> cycle1Entries << e } + 1 * writer.finishBucket() >> { latch1.countDown() } + cycle1Entries.count { it.getService().toString() == "blocked_by_tracer" } == 1 + + when: "publish the overflow service in the next cycle after the cardinality reset" + aggregator.publish([ + new SimpleSpan("svc-${MetricCardinalityLimits.SERVICE}", "op", "resource", "web", false, true, false, 0, 100, HTTP_OK) + ]) + aggregator.report() + latch2.await(2, SECONDS) + + then: "after reset the overflow service name is accepted as a real entry" + 1 * writer.startBucket(1, _, _) + 1 * writer.add(_) >> { AggregateEntry e -> cycle2Entries << e } + 1 * writer.finishBucket() >> { latch2.countDown() } + cycle2Entries[0].getService().toString() == "svc-${MetricCardinalityLimits.SERVICE}" + + cleanup: + aggregator.close() + } + def reportAndWaitUntilEmpty(ClientStatsAggregator aggregator) { waitUntilEmpty(aggregator) aggregator.report() diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy index 86a91c23b3f..fb5bc2ed561 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy @@ -40,6 +40,7 @@ class FootprintForkedTest extends DDSpecification { ClientStatsAggregator aggregator = new ClientStatsAggregator( new WellKnownTags("runtimeid","hostname", "env", "service", "version","language"), [].toSet() as Set, + AdditionalTagsSchema.EMPTY, features, HealthMetrics.NO_OP, sink, diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java new file mode 100644 index 00000000000..6029a188149 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java @@ -0,0 +1,67 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import org.junit.jupiter.api.Test; + +class AdditionalTagsSchemaTest { + + @Test + void emptyConfigReturnsSharedEmptySchema() { + assertSame(AdditionalTagsSchema.EMPTY, AdditionalTagsSchema.from(null)); + assertSame(AdditionalTagsSchema.EMPTY, AdditionalTagsSchema.from(Collections.emptySet())); + } + + @Test + void schemaSortsKeysAlphabetically() { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id", "az"))); + assertArrayEquals(new String[] {"az", "region", "tenant_id"}, schema.names); + } + + @Test + void schemaDedupesAndCapsAtMaxTagKeys() { + LinkedHashSet configured = new LinkedHashSet<>(); + // 12 distinct keys, more than MAX_ADDITIONAL_TAG_KEYS (10). Sort by alphabetical, drop the + // last 2. + for (int i = 0; i < 12; i++) { + configured.add(String.format("tag%02d", i)); + } + AdditionalTagsSchema schema = AdditionalTagsSchema.from(configured); + assertEquals(AdditionalTagsSchema.MAX_ADDITIONAL_TAG_KEYS, schema.size()); + assertArrayEquals( + new String[] { + "tag00", "tag01", "tag02", "tag03", "tag04", "tag05", "tag06", "tag07", "tag08", "tag09" + }, + schema.names); + } + + @Test + void rejectsEmptyAndColonContainingKeys() { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from( + new LinkedHashSet<>(Arrays.asList("region", "", "bad:key", "tenant_id"))); + // Empty key and "bad:key" are dropped; only the two valid keys remain. + assertArrayEquals(new String[] {"region", "tenant_id"}, schema.names); + } + + @Test + void allInvalidKeysReturnsEmptySchema() { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("", "also:bad"))); + assertSame(AdditionalTagsSchema.EMPTY, schema); + } + + @Test + void emptySchemaHasZeroSize() { + AdditionalTagsSchema schema = AdditionalTagsSchema.EMPTY; + assertEquals(0, schema.size()); + assertTrue(schema.names.length == 0); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java index 7d1aa023a69..a48e7d94d3e 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -41,7 +41,7 @@ void clearResetsAllCounters() { entry.recordOneDuration(5L); entry.recordOneDuration(ERROR_TAG | 6L); entry.recordOneDuration(TOP_LEVEL_TAG | 7L); - entry.clear(); + entry.clearAggregate(); assertEquals(0, entry.getDuration()); assertEquals(0, entry.getErrorCount()); assertEquals(0, entry.getTopLevelCount()); diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java new file mode 100644 index 00000000000..20402a662c5 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java @@ -0,0 +1,84 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +class AggregateTableAdditionalTagsTest { + + @BeforeAll + static void initAgentMeter() { + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + } + + @Test + void distinctAdditionalTagValuesYieldDistinctEntries() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema); + + AggregateEntry usEast = table.findOrInsert(snapshot(schema, "us-east-1")); + AggregateEntry euWest = table.findOrInsert(snapshot(schema, "eu-west-1")); + + assertNotNull(usEast); + assertNotNull(euWest); + assertNotSame(usEast, euWest); + assertEquals(2, table.size()); + } + + @Test + void sameAdditionalTagValuesShareEntry() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema); + + AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); + AggregateEntry second = table.findOrInsert(snapshot(schema, "us-east-1")); + + assertSame(first, second); + assertEquals(1, table.size()); + } + + // ---------- helpers ---------- + + private static AdditionalTagsSchema schemaFor(String... names) { + return AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList(names))); + } + + private static AggregateTable newTable(AdditionalTagsSchema schema) { + return new AggregateTable(256, schema); + } + + private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String regionValue) { + String[] values = new String[schema.size()]; + values[0] = regionValue; + return new SpanSnapshot( + "resource", + "service", + "operation", + null, + "web", + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + values, + 0L); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 8694892ea84..74360bbdee3 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -8,18 +8,21 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; import datadog.metrics.agent.AgentMeter; import datadog.metrics.api.statsd.StatsDClient; import datadog.metrics.impl.DDSketchHistograms; import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; class AggregateTableTest { @@ -32,15 +35,6 @@ static void initAgentMeter() { monitoring.newTimer("test.init"); } - @BeforeEach - void resetCardinalityHandlers() { - // AggregateEntry's property handlers are static and accumulate state across tests. Some tests - // in this class (e.g. backToBackEvictionsAllSucceed) drive 40 distinct services, which exceeds - // MetricCardinalityLimits.SERVICE (32) and leaves later tests seeing a shared "blocked" - // canonical for "a"/"b"/"c"-style services -- collapsing distinct snapshots into one entry. - AggregateEntry.resetCardinalityHandlers(); - } - @Test void insertOnMissReturnsNewAggregate() { AggregateTable table = new AggregateTable(8); @@ -292,6 +286,7 @@ private static SpanSnapshot nullServiceKindSnapshot(String service, String spanK null, null, null, + null, 0L); } @@ -312,9 +307,38 @@ private static SpanSnapshot nullableSnapshot( null, null, null, + null, 0L); } + @Test + void resetHandlersClearsBlockedCountsAndRefreshesCapacity() { + // Use limits-enabled handlers injected via the 3-arg constructor to test resetHandlers() + // without relying on the Config flag being set. + PropertyHandlers handlers = new PropertyHandlers(true); + AggregateTable table = new AggregateTable(512, AdditionalTagsSchema.EMPTY, handlers); + + // Fill the service cardinality budget and push one value over the limit. + for (int i = 0; i < MetricCardinalityLimits.SERVICE; i++) { + table.findOrInsert(snapshot("svc-" + i, "op", "client")); + } + AggregateEntry blocked = table.findOrInsert(snapshot("svc-overflow", "op", "client")); + // All overflow services map to the same sentinel bucket. + AggregateEntry blocked2 = table.findOrInsert(snapshot("svc-overflow-2", "op", "client")); + assertSame(blocked, blocked2); + + HealthMetrics metrics = mock(HealthMetrics.class); + table.resetHandlers(metrics); + + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:service"}, 2L); + verifyNoMoreInteractions(metrics); + + // After reset, a new service name should land in a fresh bucket, not the sentinel. + AggregateEntry afterReset = table.findOrInsert(snapshot("svc-new", "op", "client")); + assertNotSame(blocked, afterReset); + assertEquals("svc-new", afterReset.getService().toString()); + } + // ---------- helpers ---------- private static SpanSnapshot snapshot(String service, String operation, String spanKind) { @@ -368,6 +392,7 @@ SpanSnapshot build() { null, null, null, + null, tagAndDuration); } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java index 7c15b8a49e5..1661e13507a 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java @@ -217,8 +217,6 @@ void tagOverLimitWithSentinelDisabledReturnsFreshUtf8() { @Test void tagOverLimitWithSentinelDisabledNeverSubstitutesBlockedSentinel() { - // The sentinel should never materialize in disabled mode -- over-cap values carry their real - // "tag:value" content rather than the blocked sentinel. TagCardinalityHandler h = new TagCardinalityHandler("peer.service", 1, false); h.register("svc-1"); UTF8BytesString overCap = h.register("svc-2"); diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PropertyHandlersTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PropertyHandlersTest.java new file mode 100644 index 00000000000..25142008eff --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PropertyHandlersTest.java @@ -0,0 +1,93 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import datadog.trace.core.monitor.HealthMetrics; +import org.junit.jupiter.api.Test; + +class PropertyHandlersTest { + + @Test + void resetReportsBlockedCountForExhaustedHandler() { + PropertyHandlers handlers = new PropertyHandlers(true); + // Exhaust span_kind (limit = 8) and record 2 blocked values. + for (int i = 0; i < MetricCardinalityLimits.SPAN_KIND; i++) { + handlers.spanKind.register("kind-" + i); + } + handlers.spanKind.register("overflow-1"); + handlers.spanKind.register("overflow-2"); + + HealthMetrics metrics = mock(HealthMetrics.class); + handlers.reset(metrics); + + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:span_kind"}, 2L); + verifyNoMoreInteractions(metrics); + } + + @Test + void resetReportsBlockedCountForAllNineHandlers() { + PropertyHandlers handlers = new PropertyHandlers(true); + exhaustAndBlock(handlers.resource, MetricCardinalityLimits.RESOURCE); + exhaustAndBlock(handlers.service, MetricCardinalityLimits.SERVICE); + exhaustAndBlock(handlers.operation, MetricCardinalityLimits.OPERATION); + exhaustAndBlock(handlers.serviceSource, MetricCardinalityLimits.SERVICE_SOURCE); + exhaustAndBlock(handlers.type, MetricCardinalityLimits.TYPE); + exhaustAndBlock(handlers.spanKind, MetricCardinalityLimits.SPAN_KIND); + exhaustAndBlock(handlers.httpMethod, MetricCardinalityLimits.HTTP_METHOD); + exhaustAndBlock(handlers.httpEndpoint, MetricCardinalityLimits.HTTP_ENDPOINT); + exhaustAndBlock(handlers.grpcStatusCode, MetricCardinalityLimits.GRPC_STATUS_CODE); + + HealthMetrics metrics = mock(HealthMetrics.class); + handlers.reset(metrics); + + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:resource"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:service"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:operation"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:service_source"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:type"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:span_kind"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:http_method"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:http_endpoint"}, 1L); + verify(metrics).onTagCardinalityBlocked(new String[] {"tag:grpc_status_code"}, 1L); + verifyNoMoreInteractions(metrics); + } + + @Test + void resetRefreshesCapacityForNextCycle() { + PropertyHandlers handlers = new PropertyHandlers(true); + for (int i = 0; i < MetricCardinalityLimits.SPAN_KIND; i++) { + handlers.spanKind.register("kind-" + i); + } + assertEquals("blocked_by_tracer", handlers.spanKind.register("overflow").toString()); + + handlers.reset(HealthMetrics.NO_OP); + + // Overflow value should now be accepted as a real value. + assertNotEquals("blocked_by_tracer", handlers.spanKind.register("overflow").toString()); + assertEquals("overflow", handlers.spanKind.register("overflow").toString()); + } + + @Test + void resetWithNoBlockedValuesDoesNotCallHealthMetrics() { + PropertyHandlers handlers = new PropertyHandlers(true); + handlers.resource.register("r1"); + handlers.service.register("svc"); + + HealthMetrics metrics = mock(HealthMetrics.class); + handlers.reset(metrics); + + verifyNoMoreInteractions(metrics); + } + + /** Fills {@code handler} to its cardinality limit then registers one more to block it. */ + private static void exhaustAndBlock(PropertyCardinalityHandler handler, int limit) { + for (int i = 0; i < limit; i++) { + handler.register("value-" + i); + } + handler.register("overflow"); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java new file mode 100644 index 00000000000..717ffeb7284 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java @@ -0,0 +1,220 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.Histograms; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.api.WellKnownTags; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.msgpack.core.MessagePack; +import org.msgpack.core.MessageUnpacker; + +class SerializingMetricWriterAdditionalTagsTest { + + @BeforeAll + static void initAgentMeter() { + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + Histograms.register(DDSketchHistograms.FACTORY); + } + + @Test + void additionalMetricTagsEmittedWhenSet() throws Exception { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); + AggregateTable table = newTable(schema); + + AggregateEntry entry = table.findOrInsert(snapshot(schema, "us-east-1", "acme-corp")); + entry.recordOneDuration(1L); + + List additionalTags = parseAdditionalMetricTags(writeBucket(table)); + assertEquals(2, additionalTags.size()); + // Order matches schema (alphabetical): region first, then tenant_id. + assertEquals("region:us-east-1", additionalTags.get(0)); + assertEquals("tenant_id:acme-corp", additionalTags.get(1)); + } + + @Test + void additionalMetricTagsFieldOmittedWhenNoneSet() throws Exception { + // Schema configured, but the span doesn't set any of the configured tags. + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region"))); + AggregateTable table = newTable(schema); + + AggregateEntry entry = table.findOrInsert(snapshot(schema, new String[] {null})); + entry.recordOneDuration(1L); + + assertFalse( + containsKey(writeBucket(table), "AdditionalMetricTags"), + "AdditionalMetricTags should be omitted when no slots are populated"); + } + + @Test + void additionalMetricTagsSkipsNullSlots() throws Exception { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); + AggregateTable table = newTable(schema); + + // Set only tenant_id; leave region null. + AggregateEntry entry = + table.findOrInsert( + snapshot( + schema, + new String[] { + /*region*/ + null, /*tenant_id*/ "acme-corp" + })); + entry.recordOneDuration(1L); + + List additionalTags = parseAdditionalMetricTags(writeBucket(table)); + assertEquals(1, additionalTags.size()); + assertEquals("tenant_id:acme-corp", additionalTags.get(0)); + } + + // ---------- helpers ---------- + + private static AggregateTable newTable(AdditionalTagsSchema schema) { + return new AggregateTable(64, schema); + } + + private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String... values) { + String[] padded = new String[schema.size()]; + if (values != null) { + System.arraycopy(values, 0, padded, 0, Math.min(values.length, padded.length)); + } + return new SpanSnapshot( + "resource", + "service", + "operation", + null, + "web", + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + padded, + 0L); + } + + /** + * Serializes a single-bucket payload via {@link SerializingMetricWriter} into a {@link + * ByteBuffer}. The test's {@link CapturingSink} keeps the produced buffer for unpack. + */ + private static ByteBuffer writeBucket(AggregateTable table) { + CapturingSink sink = new CapturingSink(); + SerializingMetricWriter writer = + new SerializingMetricWriter( + new WellKnownTags("rid", "host", "env", "svc", "ver", "lang"), sink, 64 * 1024); + writer.startBucket(table.size(), 0L, TimeUnit.SECONDS.toNanos(10)); + table.forEach(writer::add); + writer.finishBucket(); + return sink.buffer; + } + + private static List parseAdditionalMetricTags(ByteBuffer payload) throws Exception { + MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(payload); + // Top-level map: skip to the per-stat entry. Structure mirrors SerializingMetricWriterTest. + int topMapSize = unpacker.unpackMapHeader(); + for (int i = 0; i < topMapSize; i++) { + String key = unpacker.unpackString(); + if ("Stats".equals(key)) { + // Stats is a 1-element array of buckets; each bucket has Start/Duration/Stats(=array of + // per-metric maps). + unpacker.unpackArrayHeader(); + int bucketMapSize = unpacker.unpackMapHeader(); + for (int j = 0; j < bucketMapSize; j++) { + String bucketKey = unpacker.unpackString(); + if ("Stats".equals(bucketKey)) { + int statsCount = unpacker.unpackArrayHeader(); + // Take the first stat entry and walk its map looking for AdditionalMetricTags. + for (int k = 0; k < statsCount; k++) { + int entryMapSize = unpacker.unpackMapHeader(); + for (int m = 0; m < entryMapSize; m++) { + String entryKey = unpacker.unpackString(); + if ("AdditionalMetricTags".equals(entryKey)) { + int n = unpacker.unpackArrayHeader(); + List result = new ArrayList<>(n); + for (int p = 0; p < n; p++) { + result.add(unpacker.unpackString()); + } + return result; + } else { + unpacker.skipValue(); + } + } + if (k == 0) break; // only inspecting the first stat entry + } + } else { + unpacker.skipValue(); + } + } + } else { + unpacker.skipValue(); + } + } + return new ArrayList<>(); + } + + private static boolean containsKey(ByteBuffer payload, String soughtKey) throws Exception { + MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(payload); + int topMapSize = unpacker.unpackMapHeader(); + for (int i = 0; i < topMapSize; i++) { + String key = unpacker.unpackString(); + if ("Stats".equals(key)) { + unpacker.unpackArrayHeader(); + int bucketMapSize = unpacker.unpackMapHeader(); + for (int j = 0; j < bucketMapSize; j++) { + String bucketKey = unpacker.unpackString(); + if ("Stats".equals(bucketKey)) { + int statsCount = unpacker.unpackArrayHeader(); + for (int k = 0; k < statsCount; k++) { + int entryMapSize = unpacker.unpackMapHeader(); + for (int m = 0; m < entryMapSize; m++) { + String entryKey = unpacker.unpackString(); + if (soughtKey.equals(entryKey)) { + return true; + } + unpacker.skipValue(); + } + if (k == 0) return false; // checked the only entry + } + } else { + unpacker.skipValue(); + } + } + } else { + unpacker.skipValue(); + } + } + return false; + } + + private static final class CapturingSink implements Sink { + ByteBuffer buffer; + + @Override + public void register(EventListener listener) {} + + @Override + public void accept(int messageCount, ByteBuffer buffer) { + this.buffer = buffer.duplicate(); + } + } +} diff --git a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy deleted file mode 100644 index 4c4ee81b276..00000000000 --- a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy +++ /dev/null @@ -1,79 +0,0 @@ -import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT -import static datadog.trace.common.metrics.EventListener.EventType.OK -import static java.util.concurrent.TimeUnit.SECONDS - -import datadog.communication.http.OkHttpUtils -import datadog.metrics.api.Histograms -import datadog.metrics.impl.DDSketchHistograms -import datadog.trace.api.Config -import datadog.trace.api.WellKnownTags -import datadog.trace.common.metrics.AggregateEntry -import datadog.trace.common.metrics.EventListener -import datadog.trace.common.metrics.OkHttpSink -import datadog.trace.common.metrics.PeerTagSchema -import datadog.trace.common.metrics.SerializingMetricWriter -import datadog.trace.common.metrics.SpanSnapshot -import java.util.concurrent.CopyOnWriteArrayList -import java.util.concurrent.CountDownLatch -import okhttp3.HttpUrl - -class MetricsIntegrationTest extends AbstractTraceAgentTest { - - def setupSpec() { - // Initialize metrics-lib histograms to register the DDSketch implementation - Histograms.register(DDSketchHistograms.FACTORY) - } - - def "send metrics to trace agent should notify with OK event"() { - setup: - def latch = new CountDownLatch(1) - def listener = new BlockingListener(latch) - def agentUrl = Config.get().getAgentUrl() - OkHttpSink sink = new OkHttpSink(OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), agentUrl, V06_METRICS_ENDPOINT, true, false, [:]) - sink.register(listener) - - when: - SerializingMetricWriter writer = new SerializingMetricWriter( - new WellKnownTags("runtimeid","hostname", "env", "service", "version","language"), - sink - ) - writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)) - // Build entries via the production AggregateEntry.forSnapshot(snap, keyHash) path -- same - // construction as AggregateTable.findOrInsert. Both entries use one peer tag (grault:quux) - // -> schema names=["grault"], values=["quux"]. - PeerTagSchema schema = PeerTagSchema.testSchema(["grault"] as String[]) - SpanSnapshot snap1 = new SpanSnapshot( - "resource1", "service1", "operation1", null, "sql", (short) 0, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry1 = new AggregateEntry(snap1, AggregateEntry.hashOf(snap1)) - [2, 1, 2, 250, 4].each { entry1.recordOneDuration(it as long) } - writer.add(entry1) - SpanSnapshot snap2 = new SpanSnapshot( - "resource2", "service2", "operation2", null, "web", (short) 200, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry2 = new AggregateEntry(snap2, AggregateEntry.hashOf(snap2)) - [1, 1, 200, 2, 3, 4, 5, 6, 7, 8].each { entry2.recordOneDuration(it as long) } - writer.add(entry2) - writer.finishBucket() - - then: - latch.await(5, SECONDS) - listener.events.size() == 1 && listener.events[0] == OK - } - - static class BlockingListener implements EventListener { - - List events = new CopyOnWriteArrayList<>() - final CountDownLatch latch - - BlockingListener(CountDownLatch latch) { - this.latch = latch - } - - @Override - void onEvent(EventType eventType, String message) { - events.add(eventType) - latch.countDown() - } - } -} diff --git a/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java new file mode 100644 index 00000000000..2f6b96cb7a9 --- /dev/null +++ b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java @@ -0,0 +1,188 @@ +package datadog.trace.common.metrics; + +import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT; +import static datadog.trace.junit.utils.config.WithConfigExtension.injectSysConfig; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.communication.http.OkHttpUtils; +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.api.Config; +import datadog.trace.api.ConfigDefaults; +import datadog.trace.api.WellKnownTags; +import datadog.trace.api.config.TracerConfig; +import datadog.trace.junit.utils.config.WithConfigExtension; +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import okhttp3.HttpUrl; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.startupcheck.MinimumDurationRunningStartupCheckStrategy; + +@ExtendWith(WithConfigExtension.class) +class MetricsIntegrationTest { + + // CI runs an agent container alongside the build (reached via CI_AGENT_HOST); when building + // locally we start one ourselves with testcontainers. + private static final boolean RUNNING_IN_CI = "true".equals(System.getenv("CI")); + private static GenericContainer agentContainer; + + @BeforeAll + static void setupSpec() { + // recordOneDuration -> Histogram.accept needs the metrics meter / histogram factory registered. + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + + if (!RUNNING_IN_CI) { + Map env = new HashMap<>(); + env.put("DD_APM_ENABLED", "true"); + env.put("DD_BIND_HOST", "0.0.0.0"); + env.put("DD_API_KEY", "invalid_key_but_this_is_fine"); + env.put("DD_HOSTNAME", "doesnotexist"); + env.put("DD_LOGS_STDOUT", "yes"); + agentContainer = + new GenericContainer<>("datadog/agent:7.40.1") + .withEnv(env) + .withExposedPorts(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT) + .withStartupTimeout(Duration.ofSeconds(120)) + // Sleep for a bit so the agent's rate_by_service response is populated -- mirrors the + // race-condition workaround from the original Spock base. + .withStartupCheckStrategy( + new MinimumDurationRunningStartupCheckStrategy(Duration.ofSeconds(10))); + agentContainer.start(); + } + } + + @AfterAll + static void cleanupSpec() { + if (agentContainer != null) { + agentContainer.stop(); + } + } + + @BeforeEach + void setup() { + injectSysConfig(TracerConfig.AGENT_HOST, agentHost()); + injectSysConfig(TracerConfig.TRACE_AGENT_PORT, agentPort()); + } + + private static String agentHost() { + return agentContainer != null ? agentContainer.getHost() : System.getenv("CI_AGENT_HOST"); + } + + private static String agentPort() { + return agentContainer != null + ? String.valueOf(agentContainer.getMappedPort(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT)) + : String.valueOf(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT); + } + + @Test + void sendMetricsToTraceAgentShouldNotifyWithOkEvent() throws InterruptedException { + // setup + CountDownLatch latch = new CountDownLatch(1); + BlockingListener listener = new BlockingListener(latch); + String agentUrl = Config.get().getAgentUrl(); + OkHttpSink sink = + new OkHttpSink( + OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), + agentUrl, + V06_METRICS_ENDPOINT, + true, + false, + Collections.emptyMap()); + sink.register(listener); + + // when + SerializingMetricWriter writer = + new SerializingMetricWriter( + new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language"), + sink); + writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)); + // Build entries through the production AggregateTable.findOrInsert path (canonicalizes the + // snapshot and creates/looks up the entry). Both entries use one peer tag (grault:quux) and no + // additional tags -> schema names=["grault"], values=["quux"]. + AggregateTable table = new AggregateTable(8); + PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"grault"}); + SpanSnapshot snap1 = + new SpanSnapshot( + "resource1", + "service1", + "operation1", + null, + "sql", + (short) 0, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + null, + 0L); + AggregateEntry entry1 = table.findOrInsert(snap1); + for (long duration : new long[] {2, 1, 2, 250, 4}) { + entry1.recordOneDuration(duration); + } + writer.add(entry1); + SpanSnapshot snap2 = + new SpanSnapshot( + "resource2", + "service2", + "operation2", + null, + "web", + (short) 200, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + null, + 0L); + AggregateEntry entry2 = table.findOrInsert(snap2); + for (long duration : new long[] {1, 1, 200, 2, 3, 4, 5, 6, 7, 8}) { + entry2.recordOneDuration(duration); + } + writer.add(entry2); + writer.finishBucket(); + + // then + assertTrue(latch.await(5, SECONDS)); + assertEquals(1, listener.events.size()); + assertEquals(EventListener.EventType.OK, listener.events.get(0)); + } + + static class BlockingListener implements EventListener { + final List events = new CopyOnWriteArrayList<>(); + final CountDownLatch latch; + + BlockingListener(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void onEvent(EventListener.EventType eventType, String message) { + events.add(eventType); + latch.countDown(); + } + } +} diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 907b6db47d8..93db73fd57c 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -421,6 +421,7 @@ import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_PENDING; import static datadog.trace.api.config.GeneralConfig.TRACE_DEBUG; import static datadog.trace.api.config.GeneralConfig.TRACE_LOG_LEVEL; +import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_CARDINALITY_LIMITS_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_COMPUTATION_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_COMPUTATION_IGNORE_AGENT_VERSION; @@ -5202,6 +5203,10 @@ public Set getMetricsIgnoredResources() { return tryMakeImmutableSet(configProvider.getList(TRACER_METRICS_IGNORED_RESOURCES)); } + public Set getTraceStatsAdditionalTags() { + return tryMakeImmutableSet(configProvider.getList(TRACE_STATS_ADDITIONAL_TAGS)); + } + public String getEnv() { // intentionally not thread safe if (env == null) { diff --git a/metadata/supported-configurations.json b/metadata/supported-configurations.json index 0fa56b676f9..8400d54cd63 100644 --- a/metadata/supported-configurations.json +++ b/metadata/supported-configurations.json @@ -10641,6 +10641,14 @@ "aliases": [] } ], + "DD_TRACE_STATS_ADDITIONAL_TAGS": [ + { + "version": "A", + "type": "list", + "default": null, + "aliases": [] + } + ], "DD_TRACE_STATUS404DECORATOR_ENABLED": [ { "version": "A",