From b0b157bea7026a846e2f64575f4b921b4cd92c73 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Mon, 18 May 2026 10:27:25 -0400 Subject: [PATCH 01/37] Add span-derived primary tags (CSS v1.3.0) Implements the span-derived primary tags feature on the new producer/ consumer architecture: users configure DD_TRACE_STATS_ADDITIONAL_TAGS (comma-separated tag keys); the tracer extracts the matching span tag values and includes them as additional aggregation dimensions on ClientGroupedStats.AdditionalMetricTags. Design choices, matched to the PoC where reasonable: - Wire format: repeated string of ":" entries, in schema (alphabetical-by-key) order; field omitted when no slots are populated. Customers who don't configure additional tags pay zero payload overhead. - Cardinality protection: MAX_ADDITIONAL_TAG_KEYS = 10 -- configured-key count cap; MAX_ADDITIONAL_TAG_VALUE_LENGTH = 250 -- per-value length cap; DD_TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT = 100 (config- urable, <=0 -> warn + fallback) -- per-bucket stat-entry cap. - Single-global counter for the per-bucket cap, single-threaded (aggregator thread is the sole writer of the table + limiter), so a plain int suffices -- no AtomicInteger. - All canonicalization stays on the aggregator thread, consistent with the rest of the post-redesign pipeline: producer just captures raw String values into SpanSnapshot.additionalTagValues parallel to the schema; Canonical.populate applies the length cap and builds the per-slot UTF8BytesString "key:value" form; AggregateTable.findOrInsert applies the bucket cap by rebuilding the canonical with per-key blocked sentinels if needed. - Acknowledged spec deviation: single-global counter rather than per-tag isolation. A misconfigured tag can starve another tag's admission of new entries within a bucket, but every span still gets emitted with its dimension keys preserved (values masked). Adds onAdditionalTagValueCardinalityBlocked(String tagKey) callback on HealthMetrics and TracerHealthMetrics's "stats.additional_tag.cardin- ality_blocked" counter (length-blocks + bucket-cap blocks). Test coverage: - AdditionalTagsSchemaTest: empty-config sentinel, sort+dedupe+cap, per-key blocked sentinels. - AdditionalTagsCardinalityLimiterTest: length cap behavior, counter + cap + reset, recordCardinalityBlock health-metric firing. - AggregateTableAdditionalTagsTest: distinct/same identity, overlong values collapse to one entry, cardinality cap collapses new entries to the blocked sentinel while existing entries continue. - SerializingMetricWriterAdditionalTagsTest: AdditionalMetricTags wire field shape, omission when empty, null-slot skip. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/api/config/GeneralConfig.java | 3 + .../AdditionalTagsCardinalityLimiter.java | 104 +++++ .../common/metrics/AdditionalTagsSchema.java | 98 +++++ .../trace/common/metrics/AggregateEntry.java | 407 +++++++++++++----- .../trace/common/metrics/AggregateTable.java | 52 ++- .../trace/common/metrics/Aggregator.java | 67 ++- .../common/metrics/ClientStatsAggregator.java | 124 +++++- .../trace/common/metrics/PeerTagSchema.java | 6 +- .../metrics/SerializingMetricWriter.java | 30 +- .../trace/common/metrics/SpanSnapshot.java | 10 + .../trace/core/monitor/HealthMetrics.java | 8 + .../core/monitor/TracerHealthMetrics.java | 13 +- .../AdditionalTagsCardinalityLimiterTest.java | 95 ++++ .../metrics/AdditionalTagsSchemaTest.java | 60 +++ .../AggregateTableAdditionalTagsTest.java | 147 +++++++ .../common/metrics/AggregateTableTest.java | 1 + ...alizingMetricWriterAdditionalTagsTest.java | 228 ++++++++++ .../main/java/datadog/trace/api/Config.java | 18 + metadata/supported-configurations.json | 16 + 19 files changed, 1369 insertions(+), 118 deletions(-) create mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java create mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index ff4b3cd218d..9a100cdf63c 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -78,6 +78,9 @@ public final class GeneralConfig { "trace.tracer.metrics.ignored.resources"; public static final String TRACE_STATS_CARDINALITY_LIMITS_ENABLED = "trace.stats.cardinality.limits.enabled"; + public static final String TRACE_STATS_ADDITIONAL_TAGS = "trace.stats.additional.tags"; + public static final String TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT = + "trace.stats.additional.tags.cardinality.limit"; public static final String AZURE_APP_SERVICES = "azure.app.services"; public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure"; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java new file mode 100644 index 00000000000..49dd1e9cf9f --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java @@ -0,0 +1,104 @@ +package datadog.trace.common.metrics; + +import datadog.trace.core.monitor.HealthMetrics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Bounds how many distinct stat entries (aggregate-table entries) with any additional tags we'll + * admit into a single flush bucket. + * + *

Single global counter, single-threaded -- the aggregator thread is the sole writer, so a plain + * {@code int} is sufficient (no {@code AtomicInteger}). The counter goes up by one each time we + * admit a brand-new {@link AggregateEntry} that includes any non-null additional tag values. When + * the counter reaches the cap, any further new entries have all their additional tag values + * replaced with the per-key {@code ":blocked_by_tracer"} sentinel before they become part of + * the hash + match keys -- so the blocked spans collapse into a small number of "shape" entries + * rather than into the no-additional-tags base bucket. Spans whose full canonical already exists in + * the table merge into it regardless of the cap. + * + *

The counter and both one-shot warn flags reset every flush via {@link #resetBucket()}. + * + *

Acknowledged spec deviation: the CSS v1.3.0 spec requires per-tag isolation. This is a single + * global counter for simplicity. A misconfigured tag can starve another tag's admission of new + * entries within a bucket, but every span still gets emitted with its dimension keys preserved + * (values masked) and its base stats unchanged. + */ +final class AdditionalTagsCardinalityLimiter { + + private static final Logger log = LoggerFactory.getLogger(AdditionalTagsCardinalityLimiter.class); + + private final int maxStatEntries; + private final HealthMetrics healthMetrics; + private int statEntryCounter; + private boolean warnedAboutCardinality; + private boolean warnedAboutLength; + + AdditionalTagsCardinalityLimiter(int maxStatEntries, HealthMetrics healthMetrics) { + this.maxStatEntries = maxStatEntries; + this.healthMetrics = healthMetrics; + } + + /** + * Returns {@code value} unchanged if it's short enough, or {@code null} if it exceeds {@link + * AdditionalTagsSchema#MAX_ADDITIONAL_TAG_VALUE_LENGTH} -- caller should substitute the schema's + * per-key blocked sentinel and fire the health metric. We return {@code null} rather than the + * {@link AdditionalTagsSchema#BLOCKED_VALUE} string here so the caller can plug in the pre-built + * {@code UTF8BytesString} sentinel directly without re-canonicalizing. + */ + String applyLengthCap(String tagKey, String value) { + if (value.length() > AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH) { + healthMetrics.onAdditionalTagValueCardinalityBlocked(tagKey); + if (!warnedAboutLength) { + warnedAboutLength = true; + log.warn( + "Additional metric tag '{}' had a value of length {} exceeding the max length of {}; " + + "replacing with '{}' for the rest of the current bucket", + tagKey, + value.length(), + AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH, + AdditionalTagsSchema.BLOCKED_VALUE); + } + return null; + } + return value; + } + + /** Whether the global stat-entry counter has reached the cap. */ + boolean isAtCap() { + return statEntryCounter >= maxStatEntries; + } + + /** + * Records that a brand-new entry's additional tag values were blocked because the bucket is at + * the cap. Fires the per-key health metric for each tag that had a value and emits one warn log + * per bucket regardless of how many entries get blocked. + */ + void recordCardinalityBlock(AdditionalTagsSchema schema, String[] values) { + if (!warnedAboutCardinality) { + warnedAboutCardinality = true; + log.warn( + "Additional metric tag stat-entry limit of {} reached for the current bucket; " + + "replacing tag values with '{}' on any new stat entries until the next flush", + maxStatEntries, + AdditionalTagsSchema.BLOCKED_VALUE); + } + for (int i = 0; i < values.length; i++) { + if (values[i] != null) { + healthMetrics.onAdditionalTagValueCardinalityBlocked(schema.name(i)); + } + } + } + + /** Bumps the global stat-entry counter by one. */ + void onNewStatEntryAdmitted() { + statEntryCounter++; + } + + /** Zeroes the counter and re-arms both warn flags. Called from {@code Aggregator.report}. */ + void resetBucket() { + statEntryCounter = 0; + warnedAboutCardinality = false; + warnedAboutLength = false; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java new file mode 100644 index 00000000000..a1592be8dad --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java @@ -0,0 +1,98 @@ +package datadog.trace.common.metrics; + +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Immutable schema describing the configured span-derived primary tag keys. Built once from {@code + * Config.getTraceStatsAdditionalTags()} at aggregator construction; not replaced at runtime. + * + *

Parallels {@link PeerTagSchema} for shape -- a sorted, deduped, capped {@code String[]} of + * names plus per-name pre-computed artifacts -- but lives in {@code SpanSnapshot} and {@link + * AggregateEntry} alongside, not in place of, the peer-tag schema. + * + *

What's pre-built: + * + *

    + *
  • {@link #names} -- the alphabetically sorted, deduped, capped list of tag keys to extract. + *
  • {@link #blockedSentinels} -- one shared {@code UTF8BytesString(":blocked_by_tracer")} + * per configured key, used whenever a value is replaced by the length cap or the + * global-bucket cardinality cap. + *
+ */ +final class AdditionalTagsSchema { + + /** + * Backend stats pipeline supports a small number of primary tag dimensions (4 by default, up to + * ~10 for elevated quotas). Configuring more than this is misuse; we drop the overflow at + * startup. + */ + static final int MAX_ADDITIONAL_TAG_KEYS = 10; + + /** + * Maximum length of an additional metric tag value. Caps entry footprint + wire payload from + * stack-trace / JSON / SQL stuffed into a tag by misconfigured app code. Values exceeding this + * are replaced with the per-key {@code ":blocked_by_tracer"} sentinel. + */ + static final int MAX_ADDITIONAL_TAG_VALUE_LENGTH = 250; + + static final String BLOCKED_VALUE = "blocked_by_tracer"; + + /** Singleton empty schema returned when no additional tags are configured. */ + static final AdditionalTagsSchema EMPTY = + new AdditionalTagsSchema(new String[0], new UTF8BytesString[0]); + + final String[] names; + final UTF8BytesString[] blockedSentinels; + + private AdditionalTagsSchema(String[] names, UTF8BytesString[] blockedSentinels) { + this.names = names; + this.blockedSentinels = blockedSentinels; + } + + /** + * Builds a schema from the configured tag keys. Sorts alphabetically (so the hash order matches + * the spec's requirement), dedupes, and caps at {@link #MAX_ADDITIONAL_TAG_KEYS}. Returns the + * shared empty schema when {@code configured} is null or empty. + */ + static AdditionalTagsSchema from(Set configured) { + if (configured == null || configured.isEmpty()) { + return EMPTY; + } + List sorted = new ArrayList<>(configured); + Collections.sort(sorted); + if (sorted.size() > MAX_ADDITIONAL_TAG_KEYS) { + Logger log = LoggerFactory.getLogger(AdditionalTagsSchema.class); + log.warn( + "Configured additional metric tag keys ({}) exceeds the supported limit of {}; " + + "dropping extra keys: {}", + sorted.size(), + MAX_ADDITIONAL_TAG_KEYS, + sorted.subList(MAX_ADDITIONAL_TAG_KEYS, sorted.size())); + sorted = sorted.subList(0, MAX_ADDITIONAL_TAG_KEYS); + } + String[] namesArr = sorted.toArray(new String[0]); + UTF8BytesString[] sentinels = new UTF8BytesString[namesArr.length]; + for (int i = 0; i < namesArr.length; i++) { + sentinels[i] = UTF8BytesString.create(namesArr[i] + ":" + BLOCKED_VALUE); + } + return new AdditionalTagsSchema(namesArr, sentinels); + } + + int size() { + return names.length; + } + + String name(int i) { + return names[i]; + } + + UTF8BytesString blockedSentinel(int i) { + return blockedSentinels[i]; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 5e9e29e7458..6ff140e9d7b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -6,9 +6,12 @@ import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.LongHashingUtils; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLongArray; import javax.annotation.Nullable; /** @@ -100,6 +103,9 @@ final class AggregateEntry extends Hashtable.Entry { */ static final boolean LIMITS_ENABLED = Config.get().isTraceStatsCardinalityLimitsEnabled(); + /** Shared empty array for entries with no additional-tags schema configured. */ + private static final UTF8BytesString[] EMPTY_ADDITIONAL_TAGS = new UTF8BytesString[0]; + // Per-field cardinality handlers. Limits live on MetricCardinalityLimits -- see that class for // per-field rationale. static final PropertyCardinalityHandler RESOURCE_HANDLER = @@ -138,8 +144,17 @@ final class AggregateEntry extends Hashtable.Entry { final boolean traceRoot; final List peerTags; - // Mutable aggregate state -- single-thread (aggregator) writer. - private final Histogram okLatencies = Histogram.newHistogram(); + /** + * Per-configured-key additional metric tag values, in {@code AdditionalTagsSchema} order. {@code + * null} slot = the span didn't set that tag; non-null slots hold the canonical {@code + * "key:value"} UTF8BytesString (or the schema's blocked sentinel if the value was length-capped + * or cardinality-capped). Array length matches the configured schema; empty array for the no- + * additional-tags case. + */ + final UTF8BytesString[] additionalTags; + + // Recording state. Mutated only on the aggregator thread. Not thread-safe. + private final Histogram okLatencies; /** * Lazily allocated on the first recorded error. Most entries never see an error and keep this @@ -169,7 +184,8 @@ private AggregateEntry( short httpStatusCode, boolean synthetic, boolean traceRoot, - List peerTags) { + List peerTags, + UTF8BytesString[] additionalTags) { super(keyHash); this.resource = resource; this.service = service; @@ -184,86 +200,8 @@ private AggregateEntry( this.synthetic = synthetic; this.traceRoot = traceRoot; this.peerTags = peerTags; - } - - /** - * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link - * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. - */ - AggregateEntry recordOneDuration(long tagAndDuration) { - ++hitCount; - if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { - tagAndDuration ^= TOP_LEVEL_TAG; - ++topLevelCount; - } - if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { - tagAndDuration ^= ERROR_TAG; - errorLatenciesForWrite().accept(tagAndDuration); - ++errorCount; - } else { - okLatencies.accept(tagAndDuration); - } - duration += tagAndDuration; - return this; - } - - int getErrorCount() { - return errorCount; - } - - int getHitCount() { - return hitCount; - } - - int getTopLevelCount() { - return topLevelCount; - } - - long getDuration() { - return duration; - } - - Histogram getOkLatencies() { - return okLatencies; - } - - /** - * Returns the entry's error-latency histogram, or {@code null} if no error has been recorded. - * Callers serializing this should treat {@code null} as "emit a cached empty histogram"; see - * {@link SerializingMetricWriter}. - */ - @Nullable - Histogram getErrorLatencies() { - return errorLatencies; - } - - /** Lazy-allocates {@link #errorLatencies} on the first error. */ - private Histogram errorLatenciesForWrite() { - Histogram h = errorLatencies; - if (h == null) { - h = Histogram.newHistogram(); - errorLatencies = h; - } - return h; - } - - /** - * Resets the per-cycle counters and histograms. Label fields ({@code resource}, {@code service}, - * ..., {@code peerTags}) are deliberately left intact -- they're the entry's bucket identity and - * must persist so a subsequent snapshot with the same key reuses this entry instead of allocating - * a fresh one. Entries that stay at {@code hitCount == 0} across a cycle are reaped by {@link - * AggregateTable#expungeStaleAggregates}. - */ - void clear() { - this.errorCount = 0; - this.hitCount = 0; - this.topLevelCount = 0; - this.duration = 0; - this.okLatencies.clear(); - // errorLatencies stays null on entries that never errored. Only clear if it was allocated. - if (this.errorLatencies != null) { - this.errorLatencies.clear(); - } + this.additionalTags = additionalTags; + this.okLatencies = Histogram.newHistogram(); } /** @@ -312,7 +250,8 @@ static AggregateEntry of( synthetic, traceRoot, peerTagsArr, - peerTagsArr.length); + peerTagsArr.length, + EMPTY_ADDITIONAL_TAGS); return new AggregateEntry( keyHash, resourceUtf, @@ -327,7 +266,8 @@ static AggregateEntry of( (short) httpStatusCode, synthetic, traceRoot, - peerTagsList); + peerTagsList, + EMPTY_ADDITIONAL_TAGS); } /** @@ -384,7 +324,8 @@ static long hashOf( boolean synthetic, boolean traceRoot, UTF8BytesString[] peerTags, - int peerTagCount) { + int peerTagCount, + UTF8BytesString[] additionalTags) { long h = 0; h = LongHashingUtils.addToHash(h, resource); h = LongHashingUtils.addToHash(h, service); @@ -398,6 +339,11 @@ static long hashOf( for (int i = 0; i < peerTagCount; i++) { h = LongHashingUtils.addToHash(h, peerTags[i]); } + // Additional tags hash in schema order (which is alphabetical by key, per the schema's + // construction). null slots are mixed in too so absent-vs-present yields different hashes. + for (int i = 0; i < additionalTags.length; i++) { + h = LongHashingUtils.addToHash(h, additionalTags[i]); + } h = LongHashingUtils.addToHash(h, httpStatusCode); h = LongHashingUtils.addToHash(h, synthetic); h = LongHashingUtils.addToHash(h, traceRoot); @@ -489,9 +435,148 @@ List getPeerTags() { return peerTags; } - // Production AggregateEntry intentionally has no equals/hashCode override -- AggregateTable - // bucketing uses keyHash + Canonical.matches and never invokes Object.equals. For tests that - // need value-equality (Spock argument matchers), use AggregateEntryTestUtils in src/test. + /** + * Returns the configured-additional-tag values in schema (alphabetical-by-key) order. Each slot + * is either {@code null} (span didn't set that tag) or the canonical {@code "key:value"} + * UTF8BytesString. The array's length matches the schema; empty array when no additional tags are + * configured. + */ + UTF8BytesString[] getAdditionalTags() { + return additionalTags; + } + + // ----- recording state accessors ----- + + int getHitCount() { + return hitCount; + } + + int getErrorCount() { + return errorCount; + } + + int getTopLevelCount() { + return topLevelCount; + } + + long getDuration() { + return duration; + } + + Histogram getOkLatencies() { + return okLatencies; + } + + /** + * Returns the entry's error latency histogram, or {@code null} if no error has been recorded yet. + * Callers should treat null as "serialize as an empty histogram" (see {@link + * SerializingMetricWriter}). + */ + Histogram getErrorLatencies() { + return errorLatencies; + } + + /** Lazy-allocates {@link #errorLatencies} on the first error. */ + private Histogram errorLatenciesForWrite() { + Histogram h = errorLatencies; + if (h == null) { + h = Histogram.newHistogram(); + errorLatencies = h; + } + return h; + } + + /** + * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link + * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. + */ + AggregateEntry recordOneDuration(long tagAndDuration) { + ++hitCount; + if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + tagAndDuration ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { + tagAndDuration ^= ERROR_TAG; + errorLatenciesForWrite().accept(tagAndDuration); + ++errorCount; + } else { + okLatencies.accept(tagAndDuration); + } + duration += tagAndDuration; + return this; + } + + /** + * Records {@code count} durations from {@code durations} (positions 0..count-1). Used by + * integration tests; production code uses {@link #recordOneDuration}. + */ + AggregateEntry recordDurations(int count, AtomicLongArray durations) { + this.hitCount += count; + for (int i = 0; i < count && i < durations.length(); ++i) { + long d = durations.getAndSet(i, 0); + if ((d & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + d ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((d & ERROR_TAG) == ERROR_TAG) { + d ^= ERROR_TAG; + errorLatenciesForWrite().accept(d); + ++errorCount; + } else { + okLatencies.accept(d); + } + this.duration += d; + } + return this; + } + + /** + * Clears the recording state. The OK histogram is reused; the error histogram (if allocated) is + * reused too, but entries that never saw an error keep their {@code errorLatencies} field null. + */ + @SuppressFBWarnings("AT_NONATOMIC_64BIT_PRIMITIVE") + void clearAggregate() { + this.errorCount = 0; + this.hitCount = 0; + this.topLevelCount = 0; + this.duration = 0; + this.okLatencies.clear(); + if (this.errorLatencies != null) { + this.errorLatencies.clear(); + } + } + + /** + * Equality on the 13 label fields (not on the recording counters). Used only by test mock + * matchers; the {@link Hashtable} does its own bucketing via {@link #keyHash} + {@link + * Canonical#matches} and never calls {@code equals}. + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof AggregateEntry)) return false; + AggregateEntry that = (AggregateEntry) o; + return httpStatusCode == that.httpStatusCode + && synthetic == that.synthetic + && traceRoot == that.traceRoot + && Objects.equals(resource, that.resource) + && Objects.equals(service, that.service) + && Objects.equals(operationName, that.operationName) + && Objects.equals(serviceSource, that.serviceSource) + && Objects.equals(type, that.type) + && Objects.equals(spanKind, that.spanKind) + && Objects.equals(peerTags, that.peerTags) + && Arrays.equals(additionalTags, that.additionalTags) + && Objects.equals(httpMethod, that.httpMethod) + && Objects.equals(httpEndpoint, that.httpEndpoint) + && Objects.equals(grpcStatusCode, that.grpcStatusCode); + } + + @Override + public int hashCode() { + return (int) keyHash; + } /** * Reusable scratch buffer for canonicalizing a {@link SpanSnapshot} into UTF8 fields, computing @@ -525,8 +610,34 @@ static final class Canonical { int peerTagsSize = 0; + /** Schema + per-key blocked sentinels for additional metric tags. Immutable. */ + final AdditionalTagsSchema additionalTagsSchema; + + /** + * Length-cap policy + warn-log throttling. Length-blocked values are substituted by callers + * with {@link AdditionalTagsSchema#blockedSentinel(int)}. + */ + final AdditionalTagsCardinalityLimiter additionalTagsLimiter; + + /** + * Reusable buffer of canonicalized additional-tag values, sized exactly to the schema. Slot + * {@code i} = the canonical {@code "key:value"} UTF8BytesString for {@code schema.name(i)}, or + * {@code null} when the span didn't set that tag. Cleared on each {@link #populate}. {@link + * #toEntry} copies it into the new entry; {@link #rebuildAdditionalTagsWithBlockedSentinels} + * replaces all present slots with the schema's blocked sentinels. + */ + final UTF8BytesString[] additionalTagsBuffer; + long keyHash; + Canonical( + AdditionalTagsSchema additionalTagsSchema, + AdditionalTagsCardinalityLimiter additionalTagsLimiter) { + this.additionalTagsSchema = additionalTagsSchema; + this.additionalTagsLimiter = additionalTagsLimiter; + this.additionalTagsBuffer = new UTF8BytesString[additionalTagsSchema.size()]; + } + /** Canonicalize all fields from {@code s} through the handlers into this buffer. */ void populate(SpanSnapshot s) { this.resource = RESOURCE_HANDLER.register(s.resourceName); @@ -542,22 +653,32 @@ void populate(SpanSnapshot s) { this.synthetic = s.synthetic; this.traceRoot = s.traceRoot; populatePeerTags(s.peerTagSchema, s.peerTagValues); - this.keyHash = - hashOf( - resource, - service, - operationName, - serviceSource, - type, - spanKind, - httpMethod, - httpEndpoint, - grpcStatusCode, - httpStatusCode, - synthetic, - traceRoot, - peerTagsBuffer != null ? peerTagsBuffer : EMPTY_PEER_TAGS, - peerTagsSize); + populateAdditionalTags(s.additionalTagValues); + this.keyHash = computeKeyHash(); + } + + /** Recompute the key hash from the current buffer state (used after a blocked-rebuild). */ + void recomputeKeyHash() { + this.keyHash = computeKeyHash(); + } + + private long computeKeyHash() { + return hashOf( + resource, + service, + operationName, + serviceSource, + type, + spanKind, + httpMethod, + httpEndpoint, + grpcStatusCode, + httpStatusCode, + synthetic, + traceRoot, + peerTagsBuffer != null ? peerTagsBuffer : EMPTY_PEER_TAGS, + peerTagsSize, + additionalTagsBuffer); } /** @@ -585,6 +706,70 @@ private void populatePeerTags(PeerTagSchema schema, String[] values) { } } + /** + * Fills {@link #additionalTagsBuffer} with canonical {@code "key:value"} UTF8BytesStrings for + * each non-null slot in {@code values}, applying the per-value length cap. Length-capped slots + * are replaced by the schema's pre-built {@code ":blocked_by_tracer"} sentinel. Absent + * slots become {@code null}. + */ + private void populateAdditionalTags(String[] values) { + int n = additionalTagsBuffer.length; + if (n == 0 || values == null) { + // Schema empty or span had no additional tags at all -- clear the buffer's slots so a + // previous iteration's state doesn't leak through. + Arrays.fill(additionalTagsBuffer, null); + return; + } + for (int i = 0; i < n; i++) { + String v = values[i]; + if (v == null) { + additionalTagsBuffer[i] = null; + continue; + } + String tagKey = additionalTagsSchema.name(i); + String capped = additionalTagsLimiter.applyLengthCap(tagKey, v); + if (capped == null) { + additionalTagsBuffer[i] = additionalTagsSchema.blockedSentinel(i); + } else { + // Fresh UTF8BytesString per insert -- bounded by maxAggregates * configured_tags per + // bucket. On a hit nothing here gets referenced past this populate() call; on a miss + // it's copied into the new entry. + additionalTagsBuffer[i] = UTF8BytesString.create(tagKey + ":" + capped); + } + } + } + + /** + * Replace every non-null slot in {@link #additionalTagsBuffer} with the schema's blocked + * sentinel and re-compute the hash. Called by {@link AggregateTable#findOrInsert} when a + * brand-new entry would push the bucket past the cardinality cap. Returns the number of + * positions that were masked (so callers can fire the per-key health metric appropriately). + */ + int rebuildAdditionalTagsWithBlockedSentinels() { + int masked = 0; + for (int i = 0; i < additionalTagsBuffer.length; i++) { + UTF8BytesString slot = additionalTagsBuffer[i]; + if (slot != null && slot != additionalTagsSchema.blockedSentinel(i)) { + additionalTagsBuffer[i] = additionalTagsSchema.blockedSentinel(i); + masked++; + } + } + if (masked > 0) { + recomputeKeyHash(); + } + return masked; + } + + /** Whether this canonicalized snapshot has any non-null additional-tag values present. */ + boolean hasAdditionalTags() { + for (UTF8BytesString slot : additionalTagsBuffer) { + if (slot != null) { + return true; + } + } + return false; + } + /** * Whether this canonicalized snapshot matches the given entry. Compares UTF8 fields via * content-equality (so an entry surviving a handler reset still matches a freshly-canonicalized @@ -608,7 +793,8 @@ boolean matches(AggregateEntry e) { && peerTagsEqual(peerTagsBuffer, peerTagsSize, e.peerTags) && httpStatusCode == e.httpStatusCode && synthetic == e.synthetic - && traceRoot == e.traceRoot; + && traceRoot == e.traceRoot + && Arrays.equals(additionalTagsBuffer, e.additionalTags); } private static boolean peerTagsEqual(UTF8BytesString[] a, int aSize, List b) { @@ -638,6 +824,12 @@ AggregateEntry toEntry() { } else { snapshottedPeerTags = Arrays.asList(Arrays.copyOf(peerTagsBuffer, n)); } + UTF8BytesString[] snapshottedAdditionalTags; + if (additionalTagsBuffer.length == 0) { + snapshottedAdditionalTags = EMPTY_ADDITIONAL_TAGS; + } else { + snapshottedAdditionalTags = additionalTagsBuffer.clone(); + } return new AggregateEntry( keyHash, resource, @@ -652,7 +844,8 @@ AggregateEntry toEntry() { httpStatusCode, synthetic, traceRoot, - snapshottedPeerTags); + snapshottedPeerTags, + snapshottedAdditionalTags); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index dae8e1b33f4..8d8128f1438 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -1,5 +1,6 @@ package datadog.trace.common.metrics; +import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.Hashtable.MutatingTableIterator; import datadog.trace.util.Hashtable.Support; @@ -16,6 +17,12 @@ * AggregateEntry.Canonical} scratch buffer; on a hit nothing is allocated, on a miss the buffer's * references are copied into a fresh entry and the buffer is overwritten on the next call. * + *

Additional metric tags get a second layer of cardinality protection: brand-new entries that + * would push the bucket past {@link AdditionalTagsCardinalityLimiter#isAtCap()} have all their + * present additional-tag slots replaced by the schema's blocked sentinels before the bucket lookup. + * Spans whose canonical (including the additional tags) is already in the table merge normally + * regardless of the cap. + * *

Not thread-safe. The aggregator thread is the sole writer of both this table and its * contained {@link AggregateEntry} state. Any cross-thread request that needs to mutate -- e.g. * {@link ConflatingMetricsAggregator#disable()} -- must funnel onto the aggregator thread via the @@ -27,7 +34,8 @@ final class AggregateTable { private final Hashtable.Entry[] buckets; private final int maxAggregates; - private final AggregateEntry.Canonical canonical = new AggregateEntry.Canonical(); + private final AdditionalTagsCardinalityLimiter additionalTagsLimiter; + private final AggregateEntry.Canonical canonical; private int size; /** @@ -38,8 +46,22 @@ final class AggregateTable { private int evictCursor; AggregateTable(int maxAggregates) { + this( + maxAggregates, + AdditionalTagsSchema.EMPTY, + new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP), + HealthMetrics.NO_OP); + } + + AggregateTable( + int maxAggregates, + AdditionalTagsSchema additionalTagsSchema, + AdditionalTagsCardinalityLimiter additionalTagsLimiter, + HealthMetrics healthMetrics) { this.buckets = Support.create(maxAggregates, Support.MAX_RATIO); this.maxAggregates = maxAggregates; + this.additionalTagsLimiter = additionalTagsLimiter; + this.canonical = new AggregateEntry.Canonical(additionalTagsSchema, additionalTagsLimiter); } int size() { @@ -65,12 +87,40 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { return candidate; } } + // Miss path. If this brand-new entry has any additional-tag values and the bucket cap is + // reached, mask every present slot with the per-key blocked sentinel, recompute the hash, and + // re-resolve the bucket -- so blocked entries collapse into a small number of shape buckets + // rather than the no-additional-tags base bucket. + boolean countedTowardAdditionalTagBudget = false; + if (canonical.hasAdditionalTags()) { + if (additionalTagsLimiter.isAtCap()) { + additionalTagsLimiter.recordCardinalityBlock( + canonical.additionalTagsSchema, snapshot.additionalTagValues); + canonical.rebuildAdditionalTagsWithBlockedSentinels(); + keyHash = canonical.keyHash; + int bucketIndex = Hashtable.Support.bucketIndex(buckets, keyHash); + // Re-scan: the masked canonical may already match an existing "all-blocked" entry. + for (Hashtable.Entry e = buckets[bucketIndex]; e != null; e = e.next()) { + if (e.keyHash == keyHash) { + AggregateEntry candidate = (AggregateEntry) e; + if (canonical.matches(candidate)) { + return candidate; + } + } + } + } else { + countedTowardAdditionalTagBudget = true; + } + } if (size >= maxAggregates && !evictOneStale()) { return null; } AggregateEntry entry = canonical.toEntry(); Support.insertHeadEntry(buckets, keyHash, entry); size++; + if (countedTowardAdditionalTagBudget) { + additionalTagsLimiter.onNewStatEntryAdmitted(); + } return entry; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index d809d452522..7ccff06ffdf 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -42,6 +42,8 @@ final class Aggregator implements Runnable { justification = "the field is confined to the agent thread running the Aggregator") private boolean dirty; + private final AdditionalTagsCardinalityLimiter additionalTagsLimiter; + Aggregator( MetricWriter writer, MessagePassingQueue inbox, @@ -58,7 +60,54 @@ final class Aggregator implements Runnable { reportingIntervalTimeUnit, DEFAULT_SLEEP_MILLIS, healthMetrics, - onReportCycle); + onReportCycle, + AdditionalTagsSchema.EMPTY, + 100); + } + + Aggregator( + MetricWriter writer, + MessagePassingQueue inbox, + int maxAggregates, + long reportingInterval, + TimeUnit reportingIntervalTimeUnit, + HealthMetrics healthMetrics, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit) { + this( + writer, + inbox, + maxAggregates, + reportingInterval, + reportingIntervalTimeUnit, + DEFAULT_SLEEP_MILLIS, + healthMetrics, + null, + additionalTagsSchema, + additionalTagsCardinalityLimit); + } + + Aggregator( + MetricWriter writer, + MessagePassingQueue inbox, + int maxAggregates, + long reportingInterval, + TimeUnit reportingIntervalTimeUnit, + HealthMetrics healthMetrics, + Runnable onReportCycle, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit) { + this( + writer, + inbox, + maxAggregates, + reportingInterval, + reportingIntervalTimeUnit, + DEFAULT_SLEEP_MILLIS, + healthMetrics, + onReportCycle, + additionalTagsSchema, + additionalTagsCardinalityLimit); } Aggregator( @@ -69,10 +118,16 @@ final class Aggregator implements Runnable { TimeUnit reportingIntervalTimeUnit, long sleepMillis, HealthMetrics healthMetrics, - Runnable onReportCycle) { + Runnable onReportCycle, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit) { this.writer = writer; this.inbox = inbox; - this.aggregates = new AggregateTable(maxAggregates); + this.additionalTagsLimiter = + new AdditionalTagsCardinalityLimiter(additionalTagsCardinalityLimit, healthMetrics); + this.aggregates = + new AggregateTable( + maxAggregates, additionalTagsSchema, additionalTagsLimiter, healthMetrics); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; this.healthMetrics = healthMetrics; @@ -174,7 +229,7 @@ private void report(long when, SignalItem signal) { writer, (w, entry) -> { w.add(entry); - entry.clear(); + entry.clearAggregate(); }); // note that this may do IO and block writer.finishBucket(); @@ -185,6 +240,10 @@ private void report(long when, SignalItem signal) { } dirty = false; } + // Reset cardinality handlers each report cycle so the per-field budgets refresh. + // Safe to call on this (aggregator) thread; handlers are HashMap-based and not thread-safe. + AggregateEntry.resetCardinalityHandlers(); + additionalTagsLimiter.resetBucket(); signal.complete(); if (skipped) { log.debug("skipped metrics reporting because no points have changed"); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index c44b71edd74..2dd4b30d378 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -71,6 +71,7 @@ public final class ClientStatsAggregator implements MetricsAggregator, EventList private final TimeUnit reportingIntervalTimeUnit; private final DDAgentFeaturesDiscovery features; private final HealthMetrics healthMetrics; + private final AdditionalTagsSchema additionalTagsSchema; private final boolean includeEndpointInMetrics; /** @@ -100,6 +101,8 @@ public ClientStatsAggregator( this( config.getWellKnownTags(), config.getMetricsIgnoredResources(), + AdditionalTagsSchema.from(config.getTraceStatsAdditionalTags()), + config.getTraceStatsAdditionalTagsCardinalityLimit(), sharedCommunicationObjects.featuresDiscovery(config), healthMetrics, new OkHttpSink( @@ -126,6 +129,8 @@ public ClientStatsAggregator( this( wellKnownTags, ignoredResources, + AdditionalTagsSchema.EMPTY, + 100, features, healthMetric, sink, @@ -136,6 +141,33 @@ public ClientStatsAggregator( includeEndpointInMetrics); } + ClientStatsAggregator( + WellKnownTags wellKnownTags, + Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit, + DDAgentFeaturesDiscovery features, + HealthMetrics healthMetric, + Sink sink, + int maxAggregates, + int queueSize, + boolean includeEndpointInMetrics) { + this( + wellKnownTags, + ignoredResources, + additionalTagsSchema, + additionalTagsCardinalityLimit, + features, + healthMetric, + sink, + maxAggregates, + queueSize, + 10, + SECONDS, + includeEndpointInMetrics); + } + + /** Test-only: defaults to no additional tags schema. */ ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, @@ -148,7 +180,37 @@ public ClientStatsAggregator( TimeUnit timeUnit, boolean includeEndpointInMetrics) { this( + wellKnownTags, ignoredResources, + AdditionalTagsSchema.EMPTY, + 100, + features, + healthMetric, + sink, + maxAggregates, + queueSize, + reportingInterval, + timeUnit, + includeEndpointInMetrics); + } + + ClientStatsAggregator( + WellKnownTags wellKnownTags, + Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit, + DDAgentFeaturesDiscovery features, + HealthMetrics healthMetric, + Sink sink, + int maxAggregates, + int queueSize, + long reportingInterval, + TimeUnit timeUnit, + boolean includeEndpointInMetrics) { + this( + ignoredResources, + additionalTagsSchema, + additionalTagsCardinalityLimit, features, healthMetric, sink, @@ -160,6 +222,7 @@ public ClientStatsAggregator( includeEndpointInMetrics); } + /** Test-only: defaults to no additional tags schema. */ ClientStatsAggregator( Set ignoredResources, DDAgentFeaturesDiscovery features, @@ -171,7 +234,36 @@ public ClientStatsAggregator( long reportingInterval, TimeUnit timeUnit, boolean includeEndpointInMetrics) { + this( + ignoredResources, + AdditionalTagsSchema.EMPTY, + 100, + features, + healthMetric, + sink, + metricWriter, + maxAggregates, + queueSize, + reportingInterval, + timeUnit, + includeEndpointInMetrics); + } + + ClientStatsAggregator( + Set ignoredResources, + AdditionalTagsSchema additionalTagsSchema, + int additionalTagsCardinalityLimit, + DDAgentFeaturesDiscovery features, + HealthMetrics healthMetric, + Sink sink, + MetricWriter metricWriter, + int maxAggregates, + int queueSize, + long reportingInterval, + TimeUnit timeUnit, + boolean includeEndpointInMetrics) { this.ignoredResources = ignoredResources; + this.additionalTagsSchema = additionalTagsSchema; this.includeEndpointInMetrics = includeEndpointInMetrics; this.inbox = Queues.mpscArrayQueue(queueSize); this.features = features; @@ -185,7 +277,9 @@ public ClientStatsAggregator( reportingInterval, timeUnit, healthMetric, - this::resetCardinalityHandlers); + this::resetCardinalityHandlers, + additionalTagsSchema, + additionalTagsCardinalityLimit); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; this.reportingIntervalTimeUnit = timeUnit; @@ -337,6 +431,8 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer spanPeerTagSchema = null; } + String[] additionalTagValues = captureAdditionalTagValues(span); + SpanSnapshot snapshot = new SpanSnapshot( span.getResourceName(), @@ -353,6 +449,7 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer httpMethod, httpEndpoint, grpcStatusCode, + additionalTagValues, tagAndDuration); if (!inbox.offer(snapshot)) { healthMetrics.onStatsInboxFull(); @@ -361,6 +458,31 @@ private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peer return error; } + /** + * Captures the span's additional-metric-tag values into a {@code String[]} parallel to {@code + * additionalTagsSchema.names}. Returns {@code null} when no additional tags are configured or + * none of the configured keys are set on the span. Raw values only -- length cap and + * canonicalization run on the aggregator thread. + */ + private String[] captureAdditionalTagValues(CoreSpan span) { + String[] names = additionalTagsSchema.names; + int n = names.length; + if (n == 0) { + return null; + } + String[] values = null; + for (int i = 0; i < n; i++) { + Object v = span.unsafeGetTag(names[i]); + if (v != null) { + if (values == null) { + values = new String[n]; + } + values[i] = v.toString(); + } + } + return values; + } + /** * One-time producer-side bootstrap of {@link #cachedPeerTagSchema}. Synchronized double-check * guards against two producers racing on the very first publish; after this returns, {@code diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 6c6b6c53060..258e38177d8 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -38,9 +38,9 @@ * the indexing even if the current schema is replaced between capture and consumption. * *

Thread-safety: all mutable state ({@link TagCardinalityHandler}s, the warn-once set, - * and {@link #state}) is exercised only on the aggregator thread. {@link - * #names} and {@link #handlers} are final and safe to read from any thread; producer threads access - * them through the volatile {@code cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. + * and {@link #state}) is exercised only on the aggregator thread. {@link #names} and {@link + * #handlers} are final and safe to read from any thread; producer threads access them through the + * volatile {@code cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. */ final class PeerTagSchema { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java index 2bd7ea54887..804a0585588 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java @@ -43,6 +43,7 @@ public final class SerializingMetricWriter implements MetricWriter { private static final byte[] IS_TRACE_ROOT = "IsTraceRoot".getBytes(ISO_8859_1); private static final byte[] SPAN_KIND = "SpanKind".getBytes(ISO_8859_1); private static final byte[] PEER_TAGS = "PeerTags".getBytes(ISO_8859_1); + private static final byte[] ADDITIONAL_METRIC_TAGS = "AdditionalMetricTags".getBytes(ISO_8859_1); private static final byte[] HTTP_METHOD = "HTTPMethod".getBytes(ISO_8859_1); private static final byte[] HTTP_ENDPOINT = "HTTPEndpoint".getBytes(ISO_8859_1); private static final byte[] GRPC_STATUS_CODE = "GRPCStatusCode".getBytes(ISO_8859_1); @@ -157,12 +158,16 @@ public void add(AggregateEntry entry) { final boolean hasHttpEndpoint = entry.hasHttpEndpoint(); final boolean hasServiceSource = entry.hasServiceSource(); final boolean hasGrpcStatusCode = entry.hasGrpcStatusCode(); + final UTF8BytesString[] additionalTags = entry.getAdditionalTags(); + final int additionalTagCount = countNonNull(additionalTags); + final boolean hasAdditionalTags = additionalTagCount > 0; final int mapSize = 15 + (hasServiceSource ? 1 : 0) + (hasHttpMethod ? 1 : 0) + (hasHttpEndpoint ? 1 : 0) - + (hasGrpcStatusCode ? 1 : 0); + + (hasGrpcStatusCode ? 1 : 0) + + (hasAdditionalTags ? 1 : 0); writer.startMap(mapSize); @@ -198,6 +203,21 @@ public void add(AggregateEntry entry) { writer.writeUTF8(peerTag); } + // Emit AdditionalMetricTags as repeated string of pre-built "key:value" UTF8BytesStrings, in + // schema (alphabetical-by-key) order. Skip null slots (tags the span didn't set). The whole + // field is omitted when no non-null slots exist so customers who don't configure additional + // metric tags pay zero payload overhead. + if (hasAdditionalTags) { + writer.writeUTF8(ADDITIONAL_METRIC_TAGS); + writer.startArray(additionalTagCount); + for (int i = 0; i < additionalTags.length; i++) { + UTF8BytesString slot = additionalTags[i]; + if (slot != null) { + writer.writeUTF8(slot); + } + } + } + if (hasServiceSource) { writer.writeUTF8(SERVICE_SOURCE); writer.writeUTF8(entry.getServiceSource()); @@ -273,4 +293,12 @@ public void finishBucket() { public void reset() { buffer.reset(); } + + private static int countNonNull(UTF8BytesString[] values) { + int count = 0; + for (UTF8BytesString value : values) { + if (value != null) count++; + } + return count; + } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 7b44029cfcd..33bd496c342 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -40,6 +40,14 @@ final class SpanSnapshot implements InboxItem { final String httpEndpoint; final String grpcStatusCode; + /** + * Additional metric tag values captured from the span, parallel to {@code + * additionalTagsSchema.names}. A {@code null} entry means the span didn't have that tag set. + * {@code null} (the whole array) when no additional tags are configured or none were set on the + * span. Length cap is applied on the aggregator thread; the producer carries raw values only. + */ + final String[] additionalTagValues; + /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ final long tagAndDuration; @@ -58,6 +66,7 @@ final class SpanSnapshot implements InboxItem { String httpMethod, String httpEndpoint, String grpcStatusCode, + String[] additionalTagValues, long tagAndDuration) { this.resourceName = resourceName; this.serviceName = serviceName; @@ -73,6 +82,7 @@ final class SpanSnapshot implements InboxItem { this.httpMethod = httpMethod; this.httpEndpoint = httpEndpoint; this.grpcStatusCode = grpcStatusCode; + this.additionalTagValues = additionalTagValues; this.tagAndDuration = tagAndDuration; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java index 6f9a263f593..efe9b72383f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java @@ -107,6 +107,14 @@ public void onStatsInboxFull() {} */ public void onTagCardinalityBlocked(String tag, long count) {} + /** + * Fires once per additional-metric-tag value that gets masked with {@code blocked_by_tracer} -- + * either because it exceeded the per-value length cap, or because the bucket's stat-entry + * cardinality cap was reached. {@code tagKey} identifies which configured key the masked value + * belonged to. + */ + public void onAdditionalTagValueCardinalityBlocked(String tagKey) {} + /** * @return Human-readable summary of the current health metrics. */ diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java index c00ef708abf..1a09c311bd4 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java @@ -99,6 +99,7 @@ public class TracerHealthMetrics extends HealthMetrics implements AutoCloseable private final LongAdder statsAggregateDropped = new LongAdder(); private final LongAdder statsInboxFull = new LongAdder(); + private final LongAdder statsAdditionalTagBlocked = new LongAdder(); private final StatsDClient statsd; private final long interval; @@ -368,6 +369,11 @@ public void onTagCardinalityBlocked(String tag, long count) { statsd.count("stats.tag_cardinality_blocked", count, new String[] {"tag:" + tag}); } + @Override + public void onAdditionalTagValueCardinalityBlocked(String tagKey) { + statsAdditionalTagBlocked.increment(); + } + @Override public void close() { if (null != cancellation) { @@ -387,7 +393,7 @@ private static class Flush implements AgentTaskScheduler.Task(Arrays.asList("region", "tenant_id"))); + String[] values = new String[] {"us-east-1", null}; + + limiter.recordCardinalityBlock(schema, values); + assertEquals(1, health.blockedKeys.size()); + assertEquals("region", health.blockedKeys.get(0)); + } + + private static final class RecordingHealthMetrics extends HealthMetrics { + final List blockedKeys = new ArrayList<>(); + + @Override + public void onAdditionalTagValueCardinalityBlocked(String tagKey) { + blockedKeys.add(tagKey); + } + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java new file mode 100644 index 00000000000..8e1ea28fb04 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsSchemaTest.java @@ -0,0 +1,60 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import org.junit.jupiter.api.Test; + +class AdditionalTagsSchemaTest { + + @Test + void emptyConfigReturnsSharedEmptySchema() { + assertSame(AdditionalTagsSchema.EMPTY, AdditionalTagsSchema.from(null)); + assertSame(AdditionalTagsSchema.EMPTY, AdditionalTagsSchema.from(Collections.emptySet())); + } + + @Test + void schemaSortsKeysAlphabetically() { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id", "az"))); + assertArrayEquals(new String[] {"az", "region", "tenant_id"}, schema.names); + } + + @Test + void schemaDedupesAndCapsAtMaxTagKeys() { + LinkedHashSet configured = new LinkedHashSet<>(); + // 12 distinct keys, more than MAX_ADDITIONAL_TAG_KEYS (10). Sort by alphabetical, drop the + // last 2. + for (int i = 0; i < 12; i++) { + configured.add(String.format("tag%02d", i)); + } + AdditionalTagsSchema schema = AdditionalTagsSchema.from(configured); + assertEquals(AdditionalTagsSchema.MAX_ADDITIONAL_TAG_KEYS, schema.size()); + assertArrayEquals( + new String[] { + "tag00", "tag01", "tag02", "tag03", "tag04", "tag05", "tag06", "tag07", "tag08", "tag09" + }, + schema.names); + } + + @Test + void blockedSentinelsArePerKey() { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); + assertEquals("region:blocked_by_tracer", schema.blockedSentinel(0).toString()); + assertEquals("tenant_id:blocked_by_tracer", schema.blockedSentinel(1).toString()); + } + + @Test + void emptySchemaHasZeroSizeAndEmptyArrays() { + AdditionalTagsSchema schema = AdditionalTagsSchema.EMPTY; + assertEquals(0, schema.size()); + assertTrue(schema.names.length == 0); + assertTrue(schema.blockedSentinels.length == 0); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java new file mode 100644 index 00000000000..f448fb31346 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java @@ -0,0 +1,147 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +class AggregateTableAdditionalTagsTest { + + @BeforeAll + static void initAgentMeter() { + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + } + + @Test + void distinctAdditionalTagValuesYieldDistinctEntries() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema, 100); + + AggregateEntry usEast = table.findOrInsert(snapshot(schema, "us-east-1")); + AggregateEntry euWest = table.findOrInsert(snapshot(schema, "eu-west-1")); + + assertNotNull(usEast); + assertNotNull(euWest); + assertNotSame(usEast, euWest); + assertEquals(2, table.size()); + } + + @Test + void sameAdditionalTagValuesShareEntry() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema, 100); + + AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); + AggregateEntry second = table.findOrInsert(snapshot(schema, "us-east-1")); + + assertSame(first, second); + assertEquals(1, table.size()); + } + + @Test + void overlongValuesShareTheBlockedSentinelEntry() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema, 100); + + String overlong = repeat('a', AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH + 1); + String evenLonger = repeat('b', AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH + 50); + + AggregateEntry first = table.findOrInsert(snapshot(schema, overlong)); + AggregateEntry second = table.findOrInsert(snapshot(schema, evenLonger)); + + // Both values get replaced with the same per-key blocked sentinel, so they collapse onto + // one entry rather than fragmenting. + assertSame(first, second); + assertEquals(1, table.size()); + assertEquals("region:blocked_by_tracer", first.getAdditionalTags()[0].toString()); + } + + @Test + void cardinalityCapCollapsesNewEntriesToBlockedSentinel() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema, /*cardinalityLimit*/ 2); + + // Two distinct values admitted before the cap closes. + AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); + AggregateEntry second = table.findOrInsert(snapshot(schema, "eu-west-1")); + assertNotSame(first, second); + + // Third distinct value should collapse onto the blocked sentinel; fourth too. + AggregateEntry third = table.findOrInsert(snapshot(schema, "ap-south-1")); + AggregateEntry fourth = table.findOrInsert(snapshot(schema, "us-west-2")); + + assertSame(third, fourth); + assertEquals("region:blocked_by_tracer", third.getAdditionalTags()[0].toString()); + // 2 in-budget entries + 1 collapsed blocked-sentinel entry = 3 total + assertEquals(3, table.size()); + } + + @Test + void cardinalityCapDoesNotBlockExistingEntries() { + AdditionalTagsSchema schema = schemaFor("region"); + AggregateTable table = newTable(schema, /*cardinalityLimit*/ 1); + + AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); + // Now at cap. A repeat of the same value should still hit the existing entry. + AggregateEntry firstAgain = table.findOrInsert(snapshot(schema, "us-east-1")); + assertSame(first, firstAgain); + + // But a brand-new value should go to the blocked sentinel. + AggregateEntry blocked = table.findOrInsert(snapshot(schema, "eu-west-1")); + assertNotSame(first, blocked); + assertEquals("region:blocked_by_tracer", blocked.getAdditionalTags()[0].toString()); + } + + // ---------- helpers ---------- + + private static AdditionalTagsSchema schemaFor(String... names) { + return AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList(names))); + } + + private static AggregateTable newTable(AdditionalTagsSchema schema, int cardinalityLimit) { + AdditionalTagsCardinalityLimiter limiter = + new AdditionalTagsCardinalityLimiter(cardinalityLimit, HealthMetrics.NO_OP); + return new AggregateTable(256, schema, limiter, HealthMetrics.NO_OP); + } + + private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String regionValue) { + String[] values = new String[schema.size()]; + values[0] = regionValue; + return new SpanSnapshot( + "resource", + "service", + "operation", + null, + "web", + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + values, + 0L); + } + + private static String repeat(char ch, int n) { + char[] chars = new char[n]; + Arrays.fill(chars, ch); + return new String(chars); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 8694892ea84..a3d883b2b95 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -368,6 +368,7 @@ SpanSnapshot build() { null, null, null, + null, tagAndDuration); } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java new file mode 100644 index 00000000000..7b5e92f454e --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java @@ -0,0 +1,228 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.Histograms; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.api.WellKnownTags; +import datadog.trace.core.monitor.HealthMetrics; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.msgpack.core.MessagePack; +import org.msgpack.core.MessageUnpacker; + +/** + * Verifies the {@code AdditionalMetricTags} wire field: shape is {@code repeated string} of {@code + * "key:value"} entries; field is omitted when no slots are populated; null slots within a populated + * array are skipped. + */ +class SerializingMetricWriterAdditionalTagsTest { + + @BeforeAll + static void initAgentMeter() { + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + Histograms.register(DDSketchHistograms.FACTORY); + } + + @Test + void additionalMetricTagsEmittedWhenSet() throws Exception { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); + AggregateTable table = newTable(schema); + + AggregateEntry entry = table.findOrInsert(snapshot(schema, "us-east-1", "acme-corp")); + entry.recordOneDuration(1L); + + List additionalTags = parseAdditionalMetricTags(writeBucket(table)); + assertEquals(2, additionalTags.size()); + // Order matches schema (alphabetical): region first, then tenant_id. + assertEquals("region:us-east-1", additionalTags.get(0)); + assertEquals("tenant_id:acme-corp", additionalTags.get(1)); + } + + @Test + void additionalMetricTagsFieldOmittedWhenNoneSet() throws Exception { + // Schema configured, but the span doesn't set any of the configured tags. + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region"))); + AggregateTable table = newTable(schema); + + AggregateEntry entry = table.findOrInsert(snapshot(schema, new String[] {null})); + entry.recordOneDuration(1L); + + assertFalse( + containsKey(writeBucket(table), "AdditionalMetricTags"), + "AdditionalMetricTags should be omitted when no slots are populated"); + } + + @Test + void additionalMetricTagsSkipsNullSlots() throws Exception { + AdditionalTagsSchema schema = + AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); + AggregateTable table = newTable(schema); + + // Set only tenant_id; leave region null. + AggregateEntry entry = + table.findOrInsert( + snapshot( + schema, + new String[] { + /*region*/ + null, /*tenant_id*/ "acme-corp" + })); + entry.recordOneDuration(1L); + + List additionalTags = parseAdditionalMetricTags(writeBucket(table)); + assertEquals(1, additionalTags.size()); + assertEquals("tenant_id:acme-corp", additionalTags.get(0)); + } + + // ---------- helpers ---------- + + private static AggregateTable newTable(AdditionalTagsSchema schema) { + AdditionalTagsCardinalityLimiter limiter = + new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP); + return new AggregateTable(64, schema, limiter, HealthMetrics.NO_OP); + } + + private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String... values) { + String[] padded = new String[schema.size()]; + if (values != null) { + System.arraycopy(values, 0, padded, 0, Math.min(values.length, padded.length)); + } + return new SpanSnapshot( + "resource", + "service", + "operation", + null, + "web", + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + padded, + 0L); + } + + /** + * Serializes a single-bucket payload via {@link SerializingMetricWriter} into a {@link + * ByteBuffer}. The test's {@link CapturingSink} keeps the produced buffer for unpack. + */ + private static ByteBuffer writeBucket(AggregateTable table) { + CapturingSink sink = new CapturingSink(); + SerializingMetricWriter writer = + new SerializingMetricWriter( + new WellKnownTags("rid", "host", "env", "svc", "ver", "lang"), sink, 64 * 1024); + writer.startBucket(table.size(), 0L, TimeUnit.SECONDS.toNanos(10)); + table.forEach(writer::add); + writer.finishBucket(); + return sink.buffer; + } + + private static List parseAdditionalMetricTags(ByteBuffer payload) throws Exception { + MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(payload); + // Top-level map: skip to the per-stat entry. Structure mirrors SerializingMetricWriterTest. + int topMapSize = unpacker.unpackMapHeader(); + for (int i = 0; i < topMapSize; i++) { + String key = unpacker.unpackString(); + if ("Stats".equals(key)) { + // Stats is a 1-element array of buckets; each bucket has Start/Duration/Stats(=array of + // per-metric maps). + unpacker.unpackArrayHeader(); + int bucketMapSize = unpacker.unpackMapHeader(); + for (int j = 0; j < bucketMapSize; j++) { + String bucketKey = unpacker.unpackString(); + if ("Stats".equals(bucketKey)) { + int statsCount = unpacker.unpackArrayHeader(); + // Take the first stat entry and walk its map looking for AdditionalMetricTags. + for (int k = 0; k < statsCount; k++) { + int entryMapSize = unpacker.unpackMapHeader(); + for (int m = 0; m < entryMapSize; m++) { + String entryKey = unpacker.unpackString(); + if ("AdditionalMetricTags".equals(entryKey)) { + int n = unpacker.unpackArrayHeader(); + List result = new ArrayList<>(n); + for (int p = 0; p < n; p++) { + result.add(unpacker.unpackString()); + } + return result; + } else { + unpacker.skipValue(); + } + } + if (k == 0) break; // only inspecting the first stat entry + } + } else { + unpacker.skipValue(); + } + } + } else { + unpacker.skipValue(); + } + } + return new ArrayList<>(); + } + + private static boolean containsKey(ByteBuffer payload, String soughtKey) throws Exception { + MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(payload); + int topMapSize = unpacker.unpackMapHeader(); + for (int i = 0; i < topMapSize; i++) { + String key = unpacker.unpackString(); + if ("Stats".equals(key)) { + unpacker.unpackArrayHeader(); + int bucketMapSize = unpacker.unpackMapHeader(); + for (int j = 0; j < bucketMapSize; j++) { + String bucketKey = unpacker.unpackString(); + if ("Stats".equals(bucketKey)) { + int statsCount = unpacker.unpackArrayHeader(); + for (int k = 0; k < statsCount; k++) { + int entryMapSize = unpacker.unpackMapHeader(); + for (int m = 0; m < entryMapSize; m++) { + String entryKey = unpacker.unpackString(); + if (soughtKey.equals(entryKey)) { + return true; + } + unpacker.skipValue(); + } + if (k == 0) return false; // checked the only entry + } + } else { + unpacker.skipValue(); + } + } + } else { + unpacker.skipValue(); + } + } + return false; + } + + private static final class CapturingSink implements Sink { + ByteBuffer buffer; + + @Override + public void register(EventListener listener) {} + + @Override + public void accept(int messageCount, ByteBuffer buffer) { + this.buffer = buffer.duplicate(); + } + } +} diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 93509159392..c0d26d49950 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -414,6 +414,8 @@ import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_BUFFERING_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_IGNORED_RESOURCES; +import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS; +import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_AGGREGATES; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_PENDING; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_CARDINALITY_LIMITS_ENABLED; @@ -5171,6 +5173,22 @@ public Set getMetricsIgnoredResources() { return tryMakeImmutableSet(configProvider.getList(TRACER_METRICS_IGNORED_RESOURCES)); } + public Set getTraceStatsAdditionalTags() { + return tryMakeImmutableSet(configProvider.getList(TRACE_STATS_ADDITIONAL_TAGS)); + } + + public int getTraceStatsAdditionalTagsCardinalityLimit() { + int configured = configProvider.getInteger(TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT, 100); + if (configured <= 0) { + log.warn( + "Invalid {} value: {}; falling back to default of 100", + TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT, + configured); + return 100; + } + return configured; + } + public String getEnv() { // intentionally not thread safe if (env == null) { diff --git a/metadata/supported-configurations.json b/metadata/supported-configurations.json index 7f230aa65db..5073afd80bc 100644 --- a/metadata/supported-configurations.json +++ b/metadata/supported-configurations.json @@ -10609,6 +10609,22 @@ "aliases": [] } ], + "DD_TRACE_STATS_ADDITIONAL_TAGS": [ + { + "version": "A", + "type": "list", + "default": null, + "aliases": [] + } + ], + "DD_TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT": [ + { + "version": "A", + "type": "int", + "default": "100", + "aliases": [] + } + ], "DD_TRACE_STATUS404DECORATOR_ENABLED": [ { "version": "A", From b6b8af979631030fd92939746bc5adbfc4e477ff Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 20 May 2026 00:34:58 -0400 Subject: [PATCH 02/37] Make Canonical schema-swap safe + add additional-tags benchmark Drops the fixed-size additionalTagsBuffer sized at Canonical construction time. The buffer is now growable, and Canonical tracks additionalTagsCount = snapshot.additionalTagsSchema.size() per populate -- length-aware hash, match, and toEntry use the (buffer, count) pair, mirroring how peer tags already work. AggregateTable and Aggregator drop their schema parameters since Canonical no longer needs one; schema lives where it's used (ClientStatsAggregator + the snapshot). AdditionalTagsMetricsBenchmark mirrors AdversarialMetricsBenchmark for the additional-tags hot path: two configured keys with a per-key cardinality cap of 100, unique values per op so the cap saturates fast. Catches future regressions on producer-side capture, schema.register, and the per-cycle block-counter flush. Adds an onTagCardinalityBlocked override to the shared CountingHealthMetrics so both benchmarks observe the new flush counter. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../AdditionalTagsMetricsBenchmark.java | 124 ++++++++++++++++++ .../common/metrics/AggregateEntryTest.java | 2 +- .../common/metrics/AggregateTableTest.java | 2 + 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java new file mode 100644 index 00000000000..d2f2f1031f6 --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -0,0 +1,124 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; +import static java.util.concurrent.TimeUnit.SECONDS; + +import datadog.trace.api.WellKnownTags; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * JMH benchmark exercising the span-derived primary tags pipeline added in CSS v1.3.0. Parallel to + * {@link AdversarialMetricsBenchmark} but configures two additional-tag keys (each with a per-key + * cardinality cap of 100) and generates unique values per op so the cap saturates fast. The + * benchmark measures the cost of: + * + *

    + *
  • producer-side capture: {@code ClientStatsAggregator.captureAdditionalTagValues} walks the + * schema and pulls each key via {@code unsafeGetTag}. + *
  • aggregator-side canonicalization: {@code AdditionalTagsSchema.register(i, value)} runs + * length-check, handler probe + insert, isBlockedResult check, and per-tag block-counter + * accumulation. + *
  • cycle-reset flush: at every reporting cycle, the schema fires one {@code + * HealthMetrics.onTagCardinalityBlocked(name, count)} per affected key. + *
+ * + *

The aim is not absolute throughput numbers but a regression guard for the additional-tags + * hot path: any future refactor that adds a tag-map lookup, allocates per call, or pulls the + * sentinel-materialization onto the hot path should show up as a step change here. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Threads(8) +@Fork(value = 1) +public class AdditionalTagsMetricsBenchmark { + + private ClientStatsAggregator aggregator; + private AdversarialMetricsBenchmark.CountingHealthMetrics health; + + @State(Scope.Thread) + public static class ThreadState { + int cursor; + } + + @Setup + public void setup() { + this.health = new AdversarialMetricsBenchmark.CountingHealthMetrics(); + // Two configured additional tags. The schema caps per-key cardinality at 100, so over the run + // most ops will collapse onto the per-key ":blocked_by_tracer" sentinel -- the contention + // we want to measure. + AdditionalTagsSchema additionalTagsSchema = + AdditionalTagsSchema.from( + new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), 100, this.health); + this.aggregator = + new ClientStatsAggregator( + new WellKnownTags("", "", "", "", "", ""), + Collections.emptySet(), + additionalTagsSchema, + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + Collections.singleton("peer.hostname"), Collections.emptySet()), + this.health, + new ClientStatsAggregatorBenchmark.NullSink(), + 2048, + 2048, + false); + this.aggregator.start(); + } + + @TearDown + public void tearDown() { + aggregator.close(); + System.err.println("[ADDITIONAL-TAGS] counters (across all threads, single fork):"); + System.err.println(" onStatsInboxFull = " + health.inboxFull); + System.err.println(" onStatsAggregateDropped = " + health.aggregateDropped); + System.err.println(" traceComputedCalls = " + health.traceComputedCalls); + System.err.println(" totalSpansCounted = " + health.totalSpansCounted); + System.err.println(" tagCardinalityBlocked = " + health.tagCardinalityBlocked); + } + + @Benchmark + public void publish(ThreadState ts, Blackhole blackhole) { + int idx = ts.cursor++; + ThreadLocalRandom rng = ThreadLocalRandom.current(); + + // Distinct values per op -- 65k regions × 65k tenants × random durations. Cardinality cap is + // 100 per key, so the first 100 distinct values per key admit, the rest collapse to the + // blocked sentinel and increment the per-tag block counter via the schema's flush path. + int scrambled = idx * 0x9E3779B1; + String region = "region-" + ((scrambled >>> 4) & 0xFFFF); + String tenant = "tenant-" + ((scrambled >>> 16) & 0xFFFF); + long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL); + + SimpleSpan span = + new SimpleSpan( + "svc", "op", "res", "web", true, true, false, 0, durationNanos, 200); + span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); + span.setTag("region", region); + span.setTag("tenant_id", tenant); + + List> trace = Collections.singletonList(span); + blackhole.consume(aggregator.publish(trace)); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java index 7d1aa023a69..a48e7d94d3e 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -41,7 +41,7 @@ void clearResetsAllCounters() { entry.recordOneDuration(5L); entry.recordOneDuration(ERROR_TAG | 6L); entry.recordOneDuration(TOP_LEVEL_TAG | 7L); - entry.clear(); + entry.clearAggregate(); assertEquals(0, entry.getDuration()); assertEquals(0, entry.getErrorCount()); assertEquals(0, entry.getTopLevelCount()); diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index a3d883b2b95..f9597a91c38 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -292,6 +292,7 @@ private static SpanSnapshot nullServiceKindSnapshot(String service, String spanK null, null, null, + null, 0L); } @@ -312,6 +313,7 @@ private static SpanSnapshot nullableSnapshot( null, null, null, + null, 0L); } From a4af772e4760ed30fe535f23b4d9b1c3cde77bd8 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 14:53:48 -0400 Subject: [PATCH 03/37] Rebase dougqh/metrics-arbitrary-tags onto dougqh/control-tag-cardinality Skips the closed dougqh/metrics-memory-efficiency (#11389); #11387 now serves as the direct base. Conflict resolution: took HEAD (combined cardinality + additional-tags) for all conflicts. Restored @Nullable annotations on populatePeerTags/populateAdditionalTags params that were lost during rebase. Co-Authored-By: Claude Sonnet 4.6 --- .../common/metrics/AdditionalTagsMetricsBenchmark.java | 8 +++----- .../java/datadog/trace/common/metrics/AggregateEntry.java | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java index d2f2f1031f6..b6778c95e63 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -6,7 +6,6 @@ import datadog.trace.api.WellKnownTags; import datadog.trace.core.CoreSpan; -import datadog.trace.core.monitor.HealthMetrics; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashSet; @@ -42,8 +41,8 @@ * HealthMetrics.onTagCardinalityBlocked(name, count)} per affected key. * * - *

The aim is not absolute throughput numbers but a regression guard for the additional-tags - * hot path: any future refactor that adds a tag-map lookup, allocates per call, or pulls the + *

The aim is not absolute throughput numbers but a regression guard for the additional-tags hot + * path: any future refactor that adds a tag-map lookup, allocates per call, or pulls the * sentinel-materialization onto the hot path should show up as a step change here. */ @State(Scope.Benchmark) @@ -112,8 +111,7 @@ public void publish(ThreadState ts, Blackhole blackhole) { long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL); SimpleSpan span = - new SimpleSpan( - "svc", "op", "res", "web", true, true, false, 0, durationNanos, 200); + new SimpleSpan("svc", "op", "res", "web", true, true, false, 0, durationNanos, 200); span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); span.setTag("region", region); span.setTag("tenant_id", tenant); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 6ff140e9d7b..556a5991440 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -688,7 +688,7 @@ private long computeKeyHash() { * Producer-side {@code capturePeerTagValues} produces sparse-null arrays, so the skip pays off * whenever a span carries only a subset of the configured peer tags. */ - private void populatePeerTags(PeerTagSchema schema, String[] values) { + private void populatePeerTags(@Nullable PeerTagSchema schema, @Nullable String[] values) { peerTagsSize = 0; if (schema == null || values == null) { return; @@ -712,7 +712,7 @@ private void populatePeerTags(PeerTagSchema schema, String[] values) { * are replaced by the schema's pre-built {@code ":blocked_by_tracer"} sentinel. Absent * slots become {@code null}. */ - private void populateAdditionalTags(String[] values) { + private void populateAdditionalTags(@Nullable String[] values) { int n = additionalTagsBuffer.length; if (n == 0 || values == null) { // Schema empty or span had no additional tags at all -- clear the buffer's slots so a From e4ef70a07a141ca59f650a84594e7dea6117f1b2 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 15:07:37 -0400 Subject: [PATCH 04/37] Use TagCardinalityHandler for additional-tag UTF8 caching + per-tag cardinality AdditionalTagsSchema now holds one TagCardinalityHandler per key, matching the pattern used for peer tags in PeerTagSchema. Benefits: - UTF8 caching: "key:value" UTF8BytesStrings are reused across populate() calls via the two-table rolling design; no fresh allocation on the hit path - Per-tag value cardinality limiting at MetricCardinalityLimits.ADDITIONAL_TAG_VALUE (512, same as peer tags), in addition to the existing global stat-entry cap - Block counts flushed to HealthMetrics.onTagCardinalityBlocked via AdditionalTagsSchema.resetHandlers() each cycle Also: validate tag keys at schema construction (reject empty or colon-containing keys), dedup sorted keys, and add per-tag global stat-entry cap (addressing global-counter-vs-per-tag spec deviation). Length-based capping removed. Co-Authored-By: Claude Sonnet 4.6 --- .../AdditionalTagsCardinalityLimiter.java | 31 +---- .../common/metrics/AdditionalTagsSchema.java | 121 ++++++++++++++---- .../trace/common/metrics/AggregateEntry.java | 12 +- .../trace/common/metrics/Aggregator.java | 3 + .../common/metrics/ClientStatsAggregator.java | 2 +- .../metrics/MetricCardinalityLimits.java | 6 + .../AdditionalTagsCardinalityLimiterTest.java | 32 ----- .../AggregateTableAdditionalTagsTest.java | 18 --- 8 files changed, 108 insertions(+), 117 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java index 49dd1e9cf9f..68f824538fd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java @@ -17,7 +17,7 @@ * rather than into the no-additional-tags base bucket. Spans whose full canonical already exists in * the table merge into it regardless of the cap. * - *

The counter and both one-shot warn flags reset every flush via {@link #resetBucket()}. + *

The counter and the one-shot warn flag reset every flush via {@link #resetBucket()}. * *

Acknowledged spec deviation: the CSS v1.3.0 spec requires per-tag isolation. This is a single * global counter for simplicity. A misconfigured tag can starve another tag's admission of new @@ -32,38 +32,12 @@ final class AdditionalTagsCardinalityLimiter { private final HealthMetrics healthMetrics; private int statEntryCounter; private boolean warnedAboutCardinality; - private boolean warnedAboutLength; AdditionalTagsCardinalityLimiter(int maxStatEntries, HealthMetrics healthMetrics) { this.maxStatEntries = maxStatEntries; this.healthMetrics = healthMetrics; } - /** - * Returns {@code value} unchanged if it's short enough, or {@code null} if it exceeds {@link - * AdditionalTagsSchema#MAX_ADDITIONAL_TAG_VALUE_LENGTH} -- caller should substitute the schema's - * per-key blocked sentinel and fire the health metric. We return {@code null} rather than the - * {@link AdditionalTagsSchema#BLOCKED_VALUE} string here so the caller can plug in the pre-built - * {@code UTF8BytesString} sentinel directly without re-canonicalizing. - */ - String applyLengthCap(String tagKey, String value) { - if (value.length() > AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH) { - healthMetrics.onAdditionalTagValueCardinalityBlocked(tagKey); - if (!warnedAboutLength) { - warnedAboutLength = true; - log.warn( - "Additional metric tag '{}' had a value of length {} exceeding the max length of {}; " - + "replacing with '{}' for the rest of the current bucket", - tagKey, - value.length(), - AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH, - AdditionalTagsSchema.BLOCKED_VALUE); - } - return null; - } - return value; - } - /** Whether the global stat-entry counter has reached the cap. */ boolean isAtCap() { return statEntryCounter >= maxStatEntries; @@ -95,10 +69,9 @@ void onNewStatEntryAdmitted() { statEntryCounter++; } - /** Zeroes the counter and re-arms both warn flags. Called from {@code Aggregator.report}. */ + /** Zeroes the counter and re-arms the warn flag. Called from {@code Aggregator.report}. */ void resetBucket() { statEntryCounter = 0; warnedAboutCardinality = false; - warnedAboutLength = false; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java index a1592be8dad..376adc19a21 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java @@ -1,6 +1,7 @@ package datadog.trace.common.metrics; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -12,21 +13,24 @@ * Immutable schema describing the configured span-derived primary tag keys. Built once from {@code * Config.getTraceStatsAdditionalTags()} at aggregator construction; not replaced at runtime. * - *

Parallels {@link PeerTagSchema} for shape -- a sorted, deduped, capped {@code String[]} of - * names plus per-name pre-computed artifacts -- but lives in {@code SpanSnapshot} and {@link - * AggregateEntry} alongside, not in place of, the peer-tag schema. + *

Parallels {@link PeerTagSchema} for shape: a sorted, deduped, validated, capped {@code + * String[]} of names plus per-name {@link TagCardinalityHandler}s for UTF8 caching and value-level + * cardinality limiting. The handlers are reset each reporting cycle via {@link #resetHandlers()}. * *

What's pre-built: * *

    - *
  • {@link #names} -- the alphabetically sorted, deduped, capped list of tag keys to extract. + *
  • {@link #names} -- the alphabetically sorted, deduped, validated, capped list of tag keys. *
  • {@link #blockedSentinels} -- one shared {@code UTF8BytesString(":blocked_by_tracer")} - * per configured key, used whenever a value is replaced by the length cap or the - * global-bucket cardinality cap. + * per configured key, returned when the per-tag cardinality budget is exhausted. + *
  • {@link #handlers} -- one {@link TagCardinalityHandler} per key providing UTF8 reuse and + * per-cycle cardinality limiting. Aggregator-thread-only; reset each cycle. *
*/ final class AdditionalTagsSchema { + private static final Logger log = LoggerFactory.getLogger(AdditionalTagsSchema.class); + /** * Backend stats pipeline supports a small number of primary tag dimensions (4 by default, up to * ~10 for elevated quotas). Configuring more than this is misuse; we drop the overflow at @@ -34,54 +38,95 @@ final class AdditionalTagsSchema { */ static final int MAX_ADDITIONAL_TAG_KEYS = 10; - /** - * Maximum length of an additional metric tag value. Caps entry footprint + wire payload from - * stack-trace / JSON / SQL stuffed into a tag by misconfigured app code. Values exceeding this - * are replaced with the per-key {@code ":blocked_by_tracer"} sentinel. - */ - static final int MAX_ADDITIONAL_TAG_VALUE_LENGTH = 250; - static final String BLOCKED_VALUE = "blocked_by_tracer"; /** Singleton empty schema returned when no additional tags are configured. */ static final AdditionalTagsSchema EMPTY = - new AdditionalTagsSchema(new String[0], new UTF8BytesString[0]); + new AdditionalTagsSchema( + new String[0], new UTF8BytesString[0], new TagCardinalityHandler[0], HealthMetrics.NO_OP); final String[] names; final UTF8BytesString[] blockedSentinels; - private AdditionalTagsSchema(String[] names, UTF8BytesString[] blockedSentinels) { + /** Per-key handlers providing UTF8 caching and per-cycle cardinality limiting. */ + private final TagCardinalityHandler[] handlers; + + private final HealthMetrics healthMetrics; + + private AdditionalTagsSchema( + String[] names, + UTF8BytesString[] blockedSentinels, + TagCardinalityHandler[] handlers, + HealthMetrics healthMetrics) { this.names = names; this.blockedSentinels = blockedSentinels; + this.handlers = handlers; + this.healthMetrics = healthMetrics; } /** - * Builds a schema from the configured tag keys. Sorts alphabetically (so the hash order matches - * the spec's requirement), dedupes, and caps at {@link #MAX_ADDITIONAL_TAG_KEYS}. Returns the - * shared empty schema when {@code configured} is null or empty. + * Builds a schema from the configured tag keys. Validates each key (non-empty, no {@code :}), + * sorts alphabetically, dedupes, and caps at {@link #MAX_ADDITIONAL_TAG_KEYS}. Returns the shared + * empty schema when {@code configured} is null or empty. */ + /** Test convenience: uses {@link HealthMetrics#NO_OP}. */ static AdditionalTagsSchema from(Set configured) { + return from(configured, HealthMetrics.NO_OP, AggregateEntry.LIMITS_ENABLED); + } + + static AdditionalTagsSchema from(Set configured, HealthMetrics healthMetrics) { + return from(configured, healthMetrics, AggregateEntry.LIMITS_ENABLED); + } + + static AdditionalTagsSchema from( + Set configured, HealthMetrics healthMetrics, boolean useBlockedSentinel) { if (configured == null || configured.isEmpty()) { return EMPTY; } - List sorted = new ArrayList<>(configured); - Collections.sort(sorted); - if (sorted.size() > MAX_ADDITIONAL_TAG_KEYS) { - Logger log = LoggerFactory.getLogger(AdditionalTagsSchema.class); + List valid = new ArrayList<>(); + for (String key : configured) { + if (key == null || key.isEmpty()) { + log.warn("Ignoring empty additional metric tag key"); + continue; + } + if (key.contains(":")) { + log.warn("Ignoring additional metric tag key '{}': keys must not contain ':'", key); + continue; + } + valid.add(key); + } + if (valid.isEmpty()) { + return EMPTY; + } + Collections.sort(valid); + // Dedup (sort brings duplicates adjacent) + List deduped = new ArrayList<>(valid.size()); + String prev = null; + for (String key : valid) { + if (!key.equals(prev)) { + deduped.add(key); + prev = key; + } + } + if (deduped.size() > MAX_ADDITIONAL_TAG_KEYS) { log.warn( "Configured additional metric tag keys ({}) exceeds the supported limit of {}; " + "dropping extra keys: {}", - sorted.size(), + deduped.size(), MAX_ADDITIONAL_TAG_KEYS, - sorted.subList(MAX_ADDITIONAL_TAG_KEYS, sorted.size())); - sorted = sorted.subList(0, MAX_ADDITIONAL_TAG_KEYS); + deduped.subList(MAX_ADDITIONAL_TAG_KEYS, deduped.size())); + deduped = deduped.subList(0, MAX_ADDITIONAL_TAG_KEYS); } - String[] namesArr = sorted.toArray(new String[0]); + String[] namesArr = deduped.toArray(new String[0]); UTF8BytesString[] sentinels = new UTF8BytesString[namesArr.length]; + TagCardinalityHandler[] handlersArr = new TagCardinalityHandler[namesArr.length]; for (int i = 0; i < namesArr.length; i++) { sentinels[i] = UTF8BytesString.create(namesArr[i] + ":" + BLOCKED_VALUE); + handlersArr[i] = + new TagCardinalityHandler( + namesArr[i], MetricCardinalityLimits.ADDITIONAL_TAG_VALUE, useBlockedSentinel); } - return new AdditionalTagsSchema(namesArr, sentinels); + return new AdditionalTagsSchema(namesArr, sentinels, handlersArr, healthMetrics); } int size() { @@ -95,4 +140,26 @@ String name(int i) { UTF8BytesString blockedSentinel(int i) { return blockedSentinels[i]; } + + /** + * Canonicalizes {@code value} for the additional tag at slot {@code i} through the per-key {@link + * TagCardinalityHandler}: provides UTF8 caching and returns the per-tag blocked sentinel when the + * per-cycle budget is exhausted. Returns {@link UTF8BytesString#EMPTY} for null inputs. + */ + UTF8BytesString register(int i, String value) { + return handlers[i].register(value); + } + + /** + * Resets every handler's working set and flushes accumulated block counts to {@link + * HealthMetrics}. Must be called on the aggregator thread each cycle. + */ + void resetHandlers() { + for (int i = 0; i < handlers.length; i++) { + long blocked = handlers[i].reset(); + if (blocked > 0) { + healthMetrics.onTagCardinalityBlocked(names[i], blocked); + } + } + } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 556a5991440..95a3480d320 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -726,16 +726,8 @@ private void populateAdditionalTags(@Nullable String[] values) { additionalTagsBuffer[i] = null; continue; } - String tagKey = additionalTagsSchema.name(i); - String capped = additionalTagsLimiter.applyLengthCap(tagKey, v); - if (capped == null) { - additionalTagsBuffer[i] = additionalTagsSchema.blockedSentinel(i); - } else { - // Fresh UTF8BytesString per insert -- bounded by maxAggregates * configured_tags per - // bucket. On a hit nothing here gets referenced past this populate() call; on a miss - // it's copied into the new entry. - additionalTagsBuffer[i] = UTF8BytesString.create(tagKey + ":" + capped); - } + + additionalTagsBuffer[i] = additionalTagsSchema.register(i, v); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 7ccff06ffdf..d759a66d469 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -43,6 +43,7 @@ final class Aggregator implements Runnable { private boolean dirty; private final AdditionalTagsCardinalityLimiter additionalTagsLimiter; + private final AdditionalTagsSchema additionalTagsSchema; Aggregator( MetricWriter writer, @@ -125,6 +126,7 @@ final class Aggregator implements Runnable { this.inbox = inbox; this.additionalTagsLimiter = new AdditionalTagsCardinalityLimiter(additionalTagsCardinalityLimit, healthMetrics); + this.additionalTagsSchema = additionalTagsSchema; this.aggregates = new AggregateTable( maxAggregates, additionalTagsSchema, additionalTagsLimiter, healthMetrics); @@ -243,6 +245,7 @@ private void report(long when, SignalItem signal) { // Reset cardinality handlers each report cycle so the per-field budgets refresh. // Safe to call on this (aggregator) thread; handlers are HashMap-based and not thread-safe. AggregateEntry.resetCardinalityHandlers(); + additionalTagsSchema.resetHandlers(); additionalTagsLimiter.resetBucket(); signal.complete(); if (skipped) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 2dd4b30d378..7f1bf8d2927 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -101,7 +101,7 @@ public ClientStatsAggregator( this( config.getWellKnownTags(), config.getMetricsIgnoredResources(), - AdditionalTagsSchema.from(config.getTraceStatsAdditionalTags()), + AdditionalTagsSchema.from(config.getTraceStatsAdditionalTags(), healthMetrics), config.getTraceStatsAdditionalTagsCardinalityLimit(), sharedCommunicationObjects.featuresDiscovery(config), healthMetrics, diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java index f7d91343d4b..70d1b9787bd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java @@ -70,4 +70,10 @@ private MetricCardinalityLimits() {} * peer tag gets its own handler at this limit. */ static final int PEER_TAG_VALUE = 512; + + /** + * Distinct values per additional-tag key (e.g. distinct values of a span-derived primary tag). + * Each configured additional tag gets its own {@link TagCardinalityHandler} at this limit. + */ + static final int ADDITIONAL_TAG_VALUE = 512; } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java index f109c4db8be..08439a4b6e2 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java @@ -2,8 +2,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import datadog.trace.core.monitor.HealthMetrics; @@ -15,36 +13,6 @@ class AdditionalTagsCardinalityLimiterTest { - @Test - void shortValuePassesThroughLengthCap() { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP); - assertEquals("us-east-1", limiter.applyLengthCap("region", "us-east-1")); - } - - @Test - void overLongValueReturnsNullAndFiresHealthMetric() { - RecordingHealthMetrics health = new RecordingHealthMetrics(); - AdditionalTagsCardinalityLimiter limiter = new AdditionalTagsCardinalityLimiter(100, health); - char[] chars = new char[AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH + 1]; - Arrays.fill(chars, 'x'); - String overlong = new String(chars); - - assertNull(limiter.applyLengthCap("region", overlong)); - assertEquals(1, health.blockedKeys.size()); - assertEquals("region", health.blockedKeys.get(0)); - } - - @Test - void exactlyMaxLengthPasses() { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP); - char[] chars = new char[AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH]; - Arrays.fill(chars, 'x'); - String exactlyMax = new String(chars); - assertNotNull(limiter.applyLengthCap("region", exactlyMax)); - } - @Test void counterTracksAdmissionsAndCap() { AdditionalTagsCardinalityLimiter limiter = diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java index f448fb31346..7fefff52487 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java @@ -51,24 +51,6 @@ void sameAdditionalTagValuesShareEntry() { assertEquals(1, table.size()); } - @Test - void overlongValuesShareTheBlockedSentinelEntry() { - AdditionalTagsSchema schema = schemaFor("region"); - AggregateTable table = newTable(schema, 100); - - String overlong = repeat('a', AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH + 1); - String evenLonger = repeat('b', AdditionalTagsSchema.MAX_ADDITIONAL_TAG_VALUE_LENGTH + 50); - - AggregateEntry first = table.findOrInsert(snapshot(schema, overlong)); - AggregateEntry second = table.findOrInsert(snapshot(schema, evenLonger)); - - // Both values get replaced with the same per-key blocked sentinel, so they collapse onto - // one entry rather than fragmenting. - assertSame(first, second); - assertEquals(1, table.size()); - assertEquals("region:blocked_by_tracer", first.getAdditionalTags()[0].toString()); - } - @Test void cardinalityCapCollapsesNewEntriesToBlockedSentinel() { AdditionalTagsSchema schema = schemaFor("region"); From f767482bd68bbc8bdd97d8babdbb633bcbf57eb3 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 15:20:08 -0400 Subject: [PATCH 05/37] =?UTF-8?q?Drop=20AdditionalTagsCardinalityLimiter?= =?UTF-8?q?=20=E2=80=94=20TagCardinalityHandler=20per=20tag=20is=20suffici?= =?UTF-8?q?ent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-tag TagCardinalityHandler (512 values/tag/cycle) already provides value-level cardinality control. The table-level maxAggregates cap bounds total entry count regardless. The stat-entry limiter was a workaround for the previous design that lacked per-tag handlers; it is now redundant. Removes: AdditionalTagsCardinalityLimiter class, rebuildAdditionalTagsWithBlockedSentinels, hasAdditionalTags, trace.stats.additional.tags.cardinality.limit config key, and all related constructor parameters. Co-Authored-By: Claude Sonnet 4.6 --- .../trace/api/config/GeneralConfig.java | 2 - .../AdditionalTagsCardinalityLimiter.java | 77 ------------------- .../trace/common/metrics/AggregateEntry.java | 45 +---------- .../trace/common/metrics/AggregateTable.java | 52 +------------ .../trace/common/metrics/Aggregator.java | 26 ++----- .../common/metrics/ClientStatsAggregator.java | 12 +-- .../AdditionalTagsCardinalityLimiterTest.java | 63 --------------- .../AggregateTableAdditionalTagsTest.java | 53 +------------ ...alizingMetricWriterAdditionalTagsTest.java | 5 +- .../main/java/datadog/trace/api/Config.java | 13 ---- metadata/supported-configurations.json | 8 -- 11 files changed, 19 insertions(+), 337 deletions(-) delete mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java delete mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index 9a100cdf63c..15b4367fd54 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -79,8 +79,6 @@ public final class GeneralConfig { public static final String TRACE_STATS_CARDINALITY_LIMITS_ENABLED = "trace.stats.cardinality.limits.enabled"; public static final String TRACE_STATS_ADDITIONAL_TAGS = "trace.stats.additional.tags"; - public static final String TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT = - "trace.stats.additional.tags.cardinality.limit"; public static final String AZURE_APP_SERVICES = "azure.app.services"; public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure"; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java deleted file mode 100644 index 68f824538fd..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiter.java +++ /dev/null @@ -1,77 +0,0 @@ -package datadog.trace.common.metrics; - -import datadog.trace.core.monitor.HealthMetrics; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Bounds how many distinct stat entries (aggregate-table entries) with any additional tags we'll - * admit into a single flush bucket. - * - *

Single global counter, single-threaded -- the aggregator thread is the sole writer, so a plain - * {@code int} is sufficient (no {@code AtomicInteger}). The counter goes up by one each time we - * admit a brand-new {@link AggregateEntry} that includes any non-null additional tag values. When - * the counter reaches the cap, any further new entries have all their additional tag values - * replaced with the per-key {@code ":blocked_by_tracer"} sentinel before they become part of - * the hash + match keys -- so the blocked spans collapse into a small number of "shape" entries - * rather than into the no-additional-tags base bucket. Spans whose full canonical already exists in - * the table merge into it regardless of the cap. - * - *

The counter and the one-shot warn flag reset every flush via {@link #resetBucket()}. - * - *

Acknowledged spec deviation: the CSS v1.3.0 spec requires per-tag isolation. This is a single - * global counter for simplicity. A misconfigured tag can starve another tag's admission of new - * entries within a bucket, but every span still gets emitted with its dimension keys preserved - * (values masked) and its base stats unchanged. - */ -final class AdditionalTagsCardinalityLimiter { - - private static final Logger log = LoggerFactory.getLogger(AdditionalTagsCardinalityLimiter.class); - - private final int maxStatEntries; - private final HealthMetrics healthMetrics; - private int statEntryCounter; - private boolean warnedAboutCardinality; - - AdditionalTagsCardinalityLimiter(int maxStatEntries, HealthMetrics healthMetrics) { - this.maxStatEntries = maxStatEntries; - this.healthMetrics = healthMetrics; - } - - /** Whether the global stat-entry counter has reached the cap. */ - boolean isAtCap() { - return statEntryCounter >= maxStatEntries; - } - - /** - * Records that a brand-new entry's additional tag values were blocked because the bucket is at - * the cap. Fires the per-key health metric for each tag that had a value and emits one warn log - * per bucket regardless of how many entries get blocked. - */ - void recordCardinalityBlock(AdditionalTagsSchema schema, String[] values) { - if (!warnedAboutCardinality) { - warnedAboutCardinality = true; - log.warn( - "Additional metric tag stat-entry limit of {} reached for the current bucket; " - + "replacing tag values with '{}' on any new stat entries until the next flush", - maxStatEntries, - AdditionalTagsSchema.BLOCKED_VALUE); - } - for (int i = 0; i < values.length; i++) { - if (values[i] != null) { - healthMetrics.onAdditionalTagValueCardinalityBlocked(schema.name(i)); - } - } - } - - /** Bumps the global stat-entry counter by one. */ - void onNewStatEntryAdmitted() { - statEntryCounter++; - } - - /** Zeroes the counter and re-arms the warn flag. Called from {@code Aggregator.report}. */ - void resetBucket() { - statEntryCounter = 0; - warnedAboutCardinality = false; - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 95a3480d320..359faa890a4 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -613,28 +613,18 @@ static final class Canonical { /** Schema + per-key blocked sentinels for additional metric tags. Immutable. */ final AdditionalTagsSchema additionalTagsSchema; - /** - * Length-cap policy + warn-log throttling. Length-blocked values are substituted by callers - * with {@link AdditionalTagsSchema#blockedSentinel(int)}. - */ - final AdditionalTagsCardinalityLimiter additionalTagsLimiter; - /** * Reusable buffer of canonicalized additional-tag values, sized exactly to the schema. Slot * {@code i} = the canonical {@code "key:value"} UTF8BytesString for {@code schema.name(i)}, or * {@code null} when the span didn't set that tag. Cleared on each {@link #populate}. {@link - * #toEntry} copies it into the new entry; {@link #rebuildAdditionalTagsWithBlockedSentinels} - * replaces all present slots with the schema's blocked sentinels. + * #toEntry} copies it into the new entry. */ final UTF8BytesString[] additionalTagsBuffer; long keyHash; - Canonical( - AdditionalTagsSchema additionalTagsSchema, - AdditionalTagsCardinalityLimiter additionalTagsLimiter) { + Canonical(AdditionalTagsSchema additionalTagsSchema) { this.additionalTagsSchema = additionalTagsSchema; - this.additionalTagsLimiter = additionalTagsLimiter; this.additionalTagsBuffer = new UTF8BytesString[additionalTagsSchema.size()]; } @@ -731,37 +721,6 @@ private void populateAdditionalTags(@Nullable String[] values) { } } - /** - * Replace every non-null slot in {@link #additionalTagsBuffer} with the schema's blocked - * sentinel and re-compute the hash. Called by {@link AggregateTable#findOrInsert} when a - * brand-new entry would push the bucket past the cardinality cap. Returns the number of - * positions that were masked (so callers can fire the per-key health metric appropriately). - */ - int rebuildAdditionalTagsWithBlockedSentinels() { - int masked = 0; - for (int i = 0; i < additionalTagsBuffer.length; i++) { - UTF8BytesString slot = additionalTagsBuffer[i]; - if (slot != null && slot != additionalTagsSchema.blockedSentinel(i)) { - additionalTagsBuffer[i] = additionalTagsSchema.blockedSentinel(i); - masked++; - } - } - if (masked > 0) { - recomputeKeyHash(); - } - return masked; - } - - /** Whether this canonicalized snapshot has any non-null additional-tag values present. */ - boolean hasAdditionalTags() { - for (UTF8BytesString slot : additionalTagsBuffer) { - if (slot != null) { - return true; - } - } - return false; - } - /** * Whether this canonicalized snapshot matches the given entry. Compares UTF8 fields via * content-equality (so an entry surviving a handler reset still matches a freshly-canonicalized diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index 8d8128f1438..e4e09de2f0e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -1,6 +1,5 @@ package datadog.trace.common.metrics; -import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.Hashtable.MutatingTableIterator; import datadog.trace.util.Hashtable.Support; @@ -17,12 +16,6 @@ * AggregateEntry.Canonical} scratch buffer; on a hit nothing is allocated, on a miss the buffer's * references are copied into a fresh entry and the buffer is overwritten on the next call. * - *

Additional metric tags get a second layer of cardinality protection: brand-new entries that - * would push the bucket past {@link AdditionalTagsCardinalityLimiter#isAtCap()} have all their - * present additional-tag slots replaced by the schema's blocked sentinels before the bucket lookup. - * Spans whose canonical (including the additional tags) is already in the table merge normally - * regardless of the cap. - * *

Not thread-safe. The aggregator thread is the sole writer of both this table and its * contained {@link AggregateEntry} state. Any cross-thread request that needs to mutate -- e.g. * {@link ConflatingMetricsAggregator#disable()} -- must funnel onto the aggregator thread via the @@ -34,7 +27,6 @@ final class AggregateTable { private final Hashtable.Entry[] buckets; private final int maxAggregates; - private final AdditionalTagsCardinalityLimiter additionalTagsLimiter; private final AggregateEntry.Canonical canonical; private int size; @@ -46,22 +38,13 @@ final class AggregateTable { private int evictCursor; AggregateTable(int maxAggregates) { - this( - maxAggregates, - AdditionalTagsSchema.EMPTY, - new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP), - HealthMetrics.NO_OP); + this(maxAggregates, AdditionalTagsSchema.EMPTY); } - AggregateTable( - int maxAggregates, - AdditionalTagsSchema additionalTagsSchema, - AdditionalTagsCardinalityLimiter additionalTagsLimiter, - HealthMetrics healthMetrics) { + AggregateTable(int maxAggregates, AdditionalTagsSchema additionalTagsSchema) { this.buckets = Support.create(maxAggregates, Support.MAX_RATIO); this.maxAggregates = maxAggregates; - this.additionalTagsLimiter = additionalTagsLimiter; - this.canonical = new AggregateEntry.Canonical(additionalTagsSchema, additionalTagsLimiter); + this.canonical = new AggregateEntry.Canonical(additionalTagsSchema); } int size() { @@ -87,40 +70,13 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { return candidate; } } - // Miss path. If this brand-new entry has any additional-tag values and the bucket cap is - // reached, mask every present slot with the per-key blocked sentinel, recompute the hash, and - // re-resolve the bucket -- so blocked entries collapse into a small number of shape buckets - // rather than the no-additional-tags base bucket. - boolean countedTowardAdditionalTagBudget = false; - if (canonical.hasAdditionalTags()) { - if (additionalTagsLimiter.isAtCap()) { - additionalTagsLimiter.recordCardinalityBlock( - canonical.additionalTagsSchema, snapshot.additionalTagValues); - canonical.rebuildAdditionalTagsWithBlockedSentinels(); - keyHash = canonical.keyHash; - int bucketIndex = Hashtable.Support.bucketIndex(buckets, keyHash); - // Re-scan: the masked canonical may already match an existing "all-blocked" entry. - for (Hashtable.Entry e = buckets[bucketIndex]; e != null; e = e.next()) { - if (e.keyHash == keyHash) { - AggregateEntry candidate = (AggregateEntry) e; - if (canonical.matches(candidate)) { - return candidate; - } - } - } - } else { - countedTowardAdditionalTagBudget = true; - } - } + // Miss path. if (size >= maxAggregates && !evictOneStale()) { return null; } AggregateEntry entry = canonical.toEntry(); Support.insertHeadEntry(buckets, keyHash, entry); size++; - if (countedTowardAdditionalTagBudget) { - additionalTagsLimiter.onNewStatEntryAdmitted(); - } return entry; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index d759a66d469..2e2ad578ead 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -42,7 +42,6 @@ final class Aggregator implements Runnable { justification = "the field is confined to the agent thread running the Aggregator") private boolean dirty; - private final AdditionalTagsCardinalityLimiter additionalTagsLimiter; private final AdditionalTagsSchema additionalTagsSchema; Aggregator( @@ -62,8 +61,7 @@ final class Aggregator implements Runnable { DEFAULT_SLEEP_MILLIS, healthMetrics, onReportCycle, - AdditionalTagsSchema.EMPTY, - 100); + AdditionalTagsSchema.EMPTY); } Aggregator( @@ -73,8 +71,7 @@ final class Aggregator implements Runnable { long reportingInterval, TimeUnit reportingIntervalTimeUnit, HealthMetrics healthMetrics, - AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit) { + AdditionalTagsSchema additionalTagsSchema) { this( writer, inbox, @@ -84,8 +81,7 @@ final class Aggregator implements Runnable { DEFAULT_SLEEP_MILLIS, healthMetrics, null, - additionalTagsSchema, - additionalTagsCardinalityLimit); + additionalTagsSchema); } Aggregator( @@ -96,8 +92,7 @@ final class Aggregator implements Runnable { TimeUnit reportingIntervalTimeUnit, HealthMetrics healthMetrics, Runnable onReportCycle, - AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit) { + AdditionalTagsSchema additionalTagsSchema) { this( writer, inbox, @@ -107,8 +102,7 @@ final class Aggregator implements Runnable { DEFAULT_SLEEP_MILLIS, healthMetrics, onReportCycle, - additionalTagsSchema, - additionalTagsCardinalityLimit); + additionalTagsSchema); } Aggregator( @@ -120,16 +114,11 @@ final class Aggregator implements Runnable { long sleepMillis, HealthMetrics healthMetrics, Runnable onReportCycle, - AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit) { + AdditionalTagsSchema additionalTagsSchema) { this.writer = writer; this.inbox = inbox; - this.additionalTagsLimiter = - new AdditionalTagsCardinalityLimiter(additionalTagsCardinalityLimit, healthMetrics); this.additionalTagsSchema = additionalTagsSchema; - this.aggregates = - new AggregateTable( - maxAggregates, additionalTagsSchema, additionalTagsLimiter, healthMetrics); + this.aggregates = new AggregateTable(maxAggregates, additionalTagsSchema); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; this.healthMetrics = healthMetrics; @@ -246,7 +235,6 @@ private void report(long when, SignalItem signal) { // Safe to call on this (aggregator) thread; handlers are HashMap-based and not thread-safe. AggregateEntry.resetCardinalityHandlers(); additionalTagsSchema.resetHandlers(); - additionalTagsLimiter.resetBucket(); signal.complete(); if (skipped) { log.debug("skipped metrics reporting because no points have changed"); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 7f1bf8d2927..7118131ddee 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -102,7 +102,6 @@ public ClientStatsAggregator( config.getWellKnownTags(), config.getMetricsIgnoredResources(), AdditionalTagsSchema.from(config.getTraceStatsAdditionalTags(), healthMetrics), - config.getTraceStatsAdditionalTagsCardinalityLimit(), sharedCommunicationObjects.featuresDiscovery(config), healthMetrics, new OkHttpSink( @@ -130,7 +129,6 @@ public ClientStatsAggregator( wellKnownTags, ignoredResources, AdditionalTagsSchema.EMPTY, - 100, features, healthMetric, sink, @@ -145,7 +143,6 @@ public ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, Sink sink, @@ -156,7 +153,6 @@ public ClientStatsAggregator( wellKnownTags, ignoredResources, additionalTagsSchema, - additionalTagsCardinalityLimit, features, healthMetric, sink, @@ -183,7 +179,6 @@ public ClientStatsAggregator( wellKnownTags, ignoredResources, AdditionalTagsSchema.EMPTY, - 100, features, healthMetric, sink, @@ -198,7 +193,6 @@ public ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, Sink sink, @@ -210,7 +204,6 @@ public ClientStatsAggregator( this( ignoredResources, additionalTagsSchema, - additionalTagsCardinalityLimit, features, healthMetric, sink, @@ -237,7 +230,6 @@ public ClientStatsAggregator( this( ignoredResources, AdditionalTagsSchema.EMPTY, - 100, features, healthMetric, sink, @@ -252,7 +244,6 @@ public ClientStatsAggregator( ClientStatsAggregator( Set ignoredResources, AdditionalTagsSchema additionalTagsSchema, - int additionalTagsCardinalityLimit, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, Sink sink, @@ -278,8 +269,7 @@ public ClientStatsAggregator( timeUnit, healthMetric, this::resetCardinalityHandlers, - additionalTagsSchema, - additionalTagsCardinalityLimit); + additionalTagsSchema); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; this.reportingIntervalTimeUnit = timeUnit; diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java deleted file mode 100644 index 08439a4b6e2..00000000000 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AdditionalTagsCardinalityLimiterTest.java +++ /dev/null @@ -1,63 +0,0 @@ -package datadog.trace.common.metrics; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import datadog.trace.core.monitor.HealthMetrics; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedHashSet; -import java.util.List; -import org.junit.jupiter.api.Test; - -class AdditionalTagsCardinalityLimiterTest { - - @Test - void counterTracksAdmissionsAndCap() { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(3, HealthMetrics.NO_OP); - assertFalse(limiter.isAtCap()); - limiter.onNewStatEntryAdmitted(); - assertFalse(limiter.isAtCap()); - limiter.onNewStatEntryAdmitted(); - limiter.onNewStatEntryAdmitted(); - assertTrue(limiter.isAtCap()); - } - - @Test - void resetClearsCounterAndAllowsAdmissionAgain() { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(2, HealthMetrics.NO_OP); - limiter.onNewStatEntryAdmitted(); - limiter.onNewStatEntryAdmitted(); - assertTrue(limiter.isAtCap()); - - limiter.resetBucket(); - assertFalse(limiter.isAtCap()); - limiter.onNewStatEntryAdmitted(); - assertFalse(limiter.isAtCap()); - } - - @Test - void recordCardinalityBlockFiresHealthMetricPerNonNullValue() { - RecordingHealthMetrics health = new RecordingHealthMetrics(); - AdditionalTagsCardinalityLimiter limiter = new AdditionalTagsCardinalityLimiter(1, health); - AdditionalTagsSchema schema = - AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList("region", "tenant_id"))); - String[] values = new String[] {"us-east-1", null}; - - limiter.recordCardinalityBlock(schema, values); - assertEquals(1, health.blockedKeys.size()); - assertEquals("region", health.blockedKeys.get(0)); - } - - private static final class RecordingHealthMetrics extends HealthMetrics { - final List blockedKeys = new ArrayList<>(); - - @Override - public void onAdditionalTagValueCardinalityBlocked(String tagKey) { - blockedKeys.add(tagKey); - } - } -} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java index 7fefff52487..20402a662c5 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableAdditionalTagsTest.java @@ -9,7 +9,6 @@ import datadog.metrics.api.statsd.StatsDClient; import datadog.metrics.impl.DDSketchHistograms; import datadog.metrics.impl.MonitoringImpl; -import datadog.trace.core.monitor.HealthMetrics; import java.util.Arrays; import java.util.LinkedHashSet; import java.util.concurrent.TimeUnit; @@ -28,7 +27,7 @@ static void initAgentMeter() { @Test void distinctAdditionalTagValuesYieldDistinctEntries() { AdditionalTagsSchema schema = schemaFor("region"); - AggregateTable table = newTable(schema, 100); + AggregateTable table = newTable(schema); AggregateEntry usEast = table.findOrInsert(snapshot(schema, "us-east-1")); AggregateEntry euWest = table.findOrInsert(snapshot(schema, "eu-west-1")); @@ -42,7 +41,7 @@ void distinctAdditionalTagValuesYieldDistinctEntries() { @Test void sameAdditionalTagValuesShareEntry() { AdditionalTagsSchema schema = schemaFor("region"); - AggregateTable table = newTable(schema, 100); + AggregateTable table = newTable(schema); AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); AggregateEntry second = table.findOrInsert(snapshot(schema, "us-east-1")); @@ -51,52 +50,14 @@ void sameAdditionalTagValuesShareEntry() { assertEquals(1, table.size()); } - @Test - void cardinalityCapCollapsesNewEntriesToBlockedSentinel() { - AdditionalTagsSchema schema = schemaFor("region"); - AggregateTable table = newTable(schema, /*cardinalityLimit*/ 2); - - // Two distinct values admitted before the cap closes. - AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); - AggregateEntry second = table.findOrInsert(snapshot(schema, "eu-west-1")); - assertNotSame(first, second); - - // Third distinct value should collapse onto the blocked sentinel; fourth too. - AggregateEntry third = table.findOrInsert(snapshot(schema, "ap-south-1")); - AggregateEntry fourth = table.findOrInsert(snapshot(schema, "us-west-2")); - - assertSame(third, fourth); - assertEquals("region:blocked_by_tracer", third.getAdditionalTags()[0].toString()); - // 2 in-budget entries + 1 collapsed blocked-sentinel entry = 3 total - assertEquals(3, table.size()); - } - - @Test - void cardinalityCapDoesNotBlockExistingEntries() { - AdditionalTagsSchema schema = schemaFor("region"); - AggregateTable table = newTable(schema, /*cardinalityLimit*/ 1); - - AggregateEntry first = table.findOrInsert(snapshot(schema, "us-east-1")); - // Now at cap. A repeat of the same value should still hit the existing entry. - AggregateEntry firstAgain = table.findOrInsert(snapshot(schema, "us-east-1")); - assertSame(first, firstAgain); - - // But a brand-new value should go to the blocked sentinel. - AggregateEntry blocked = table.findOrInsert(snapshot(schema, "eu-west-1")); - assertNotSame(first, blocked); - assertEquals("region:blocked_by_tracer", blocked.getAdditionalTags()[0].toString()); - } - // ---------- helpers ---------- private static AdditionalTagsSchema schemaFor(String... names) { return AdditionalTagsSchema.from(new LinkedHashSet<>(Arrays.asList(names))); } - private static AggregateTable newTable(AdditionalTagsSchema schema, int cardinalityLimit) { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(cardinalityLimit, HealthMetrics.NO_OP); - return new AggregateTable(256, schema, limiter, HealthMetrics.NO_OP); + private static AggregateTable newTable(AdditionalTagsSchema schema) { + return new AggregateTable(256, schema); } private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String regionValue) { @@ -120,10 +81,4 @@ private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String regionV values, 0L); } - - private static String repeat(char ch, int n) { - char[] chars = new char[n]; - Arrays.fill(chars, ch); - return new String(chars); - } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java index 7b5e92f454e..c29e880ec90 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/SerializingMetricWriterAdditionalTagsTest.java @@ -9,7 +9,6 @@ import datadog.metrics.impl.DDSketchHistograms; import datadog.metrics.impl.MonitoringImpl; import datadog.trace.api.WellKnownTags; -import datadog.trace.core.monitor.HealthMetrics; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; @@ -92,9 +91,7 @@ void additionalMetricTagsSkipsNullSlots() throws Exception { // ---------- helpers ---------- private static AggregateTable newTable(AdditionalTagsSchema schema) { - AdditionalTagsCardinalityLimiter limiter = - new AdditionalTagsCardinalityLimiter(100, HealthMetrics.NO_OP); - return new AggregateTable(64, schema, limiter, HealthMetrics.NO_OP); + return new AggregateTable(64, schema); } private static SpanSnapshot snapshot(AdditionalTagsSchema schema, String... values) { diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index c0d26d49950..bed41fc69f9 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -415,7 +415,6 @@ import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_IGNORED_RESOURCES; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS; -import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_AGGREGATES; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_PENDING; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_CARDINALITY_LIMITS_ENABLED; @@ -5177,18 +5176,6 @@ public Set getTraceStatsAdditionalTags() { return tryMakeImmutableSet(configProvider.getList(TRACE_STATS_ADDITIONAL_TAGS)); } - public int getTraceStatsAdditionalTagsCardinalityLimit() { - int configured = configProvider.getInteger(TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT, 100); - if (configured <= 0) { - log.warn( - "Invalid {} value: {}; falling back to default of 100", - TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT, - configured); - return 100; - } - return configured; - } - public String getEnv() { // intentionally not thread safe if (env == null) { diff --git a/metadata/supported-configurations.json b/metadata/supported-configurations.json index 5073afd80bc..a5c0317e578 100644 --- a/metadata/supported-configurations.json +++ b/metadata/supported-configurations.json @@ -10617,14 +10617,6 @@ "aliases": [] } ], - "DD_TRACE_STATS_ADDITIONAL_TAGS_CARDINALITY_LIMIT": [ - { - "version": "A", - "type": "int", - "default": "100", - "aliases": [] - } - ], "DD_TRACE_STATUS404DECORATOR_ENABLED": [ { "version": "A", From 82b063fcaff5cfb72f766c5cc47047ca4c324a6a Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 15:32:39 -0400 Subject: [PATCH 06/37] Fix allNull check in toEntry + stale benchmark comments - AggregateEntry.Canonical.toEntry(): use EMPTY_ADDITIONAL_TAGS when all additionalTagsBuffer slots are null (span had no additional-tag values) instead of cloning an all-null array. Avoids one UTF8BytesString[N] allocation per new entry on workloads that don't use additional tags. - Update populateAdditionalTags Javadoc to drop length-cap reference - AdditionalTagsMetricsBenchmark: fix from() call (drop removed cardinality limit param), update comments to reflect TagCardinalityHandler design, fix tearDown to use LongAdder.sum() Co-Authored-By: Claude Sonnet 4.6 --- .../AdditionalTagsMetricsBenchmark.java | 22 +++++++++---------- .../trace/common/metrics/AggregateEntry.java | 17 ++++++++++---- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java index b6778c95e63..dac3da8aedc 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -34,9 +34,9 @@ *

    *
  • producer-side capture: {@code ClientStatsAggregator.captureAdditionalTagValues} walks the * schema and pulls each key via {@code unsafeGetTag}. - *
  • aggregator-side canonicalization: {@code AdditionalTagsSchema.register(i, value)} runs - * length-check, handler probe + insert, isBlockedResult check, and per-tag block-counter - * accumulation. + *
  • aggregator-side canonicalization: {@code AdditionalTagsSchema.register(i, value)} runs a + * {@link TagCardinalityHandler} probe + insert, returning the per-key blocked sentinel once + * the per-cycle value budget is exhausted. *
  • cycle-reset flush: at every reporting cycle, the schema fires one {@code * HealthMetrics.onTagCardinalityBlocked(name, count)} per affected key. *
@@ -65,12 +65,13 @@ public static class ThreadState { @Setup public void setup() { this.health = new AdversarialMetricsBenchmark.CountingHealthMetrics(); - // Two configured additional tags. The schema caps per-key cardinality at 100, so over the run - // most ops will collapse onto the per-key ":blocked_by_tracer" sentinel -- the contention - // we want to measure. + // Two configured additional tags. Each key gets a TagCardinalityHandler capped at + // MetricCardinalityLimits.ADDITIONAL_TAG_VALUE (512) distinct values per cycle. The benchmark + // generates 65k distinct values per key so the cap saturates quickly and most ops return the + // blocked sentinel -- that's the contention we want to measure. AdditionalTagsSchema additionalTagsSchema = AdditionalTagsSchema.from( - new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), 100, this.health); + new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), this.health); this.aggregator = new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), @@ -90,11 +91,8 @@ public void setup() { public void tearDown() { aggregator.close(); System.err.println("[ADDITIONAL-TAGS] counters (across all threads, single fork):"); - System.err.println(" onStatsInboxFull = " + health.inboxFull); - System.err.println(" onStatsAggregateDropped = " + health.aggregateDropped); - System.err.println(" traceComputedCalls = " + health.traceComputedCalls); - System.err.println(" totalSpansCounted = " + health.totalSpansCounted); - System.err.println(" tagCardinalityBlocked = " + health.tagCardinalityBlocked); + System.err.println(" onStatsInboxFull = " + health.inboxFull.sum()); + System.err.println(" onStatsAggregateDropped = " + health.aggregateDropped.sum()); } @Benchmark diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 359faa890a4..1d6e9717992 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -698,9 +698,9 @@ private void populatePeerTags(@Nullable PeerTagSchema schema, @Nullable String[] /** * Fills {@link #additionalTagsBuffer} with canonical {@code "key:value"} UTF8BytesStrings for - * each non-null slot in {@code values}, applying the per-value length cap. Length-capped slots - * are replaced by the schema's pre-built {@code ":blocked_by_tracer"} sentinel. Absent - * slots become {@code null}. + * each non-null slot in {@code values} via {@link AdditionalTagsSchema#register}. Absent slots + * become {@code null}; the handler returns the per-key blocked sentinel when the per-cycle + * value budget is exhausted. */ private void populateAdditionalTags(@Nullable String[] values) { int n = additionalTagsBuffer.length; @@ -748,6 +748,15 @@ && peerTagsEqual(peerTagsBuffer, peerTagsSize, e.peerTags) && Arrays.equals(additionalTagsBuffer, e.additionalTags); } + private static boolean allNull(UTF8BytesString[] arr) { + for (UTF8BytesString slot : arr) { + if (slot != null) { + return false; + } + } + return true; + } + private static boolean peerTagsEqual(UTF8BytesString[] a, int aSize, List b) { if (aSize != b.size()) { return false; @@ -776,7 +785,7 @@ AggregateEntry toEntry() { snapshottedPeerTags = Arrays.asList(Arrays.copyOf(peerTagsBuffer, n)); } UTF8BytesString[] snapshottedAdditionalTags; - if (additionalTagsBuffer.length == 0) { + if (additionalTagsBuffer.length == 0 || allNull(additionalTagsBuffer)) { snapshottedAdditionalTags = EMPTY_ADDITIONAL_TAGS; } else { snapshottedAdditionalTags = additionalTagsBuffer.clone(); From e25c0d3f70e2ee3ff3bc9a423c0d6641c0cc0587 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 16:15:43 -0400 Subject: [PATCH 07/37] Add @Param(limitsEnabled) to AdditionalTagsMetricsBenchmark Both false/true variants now run within the same fork so they see equivalent thermal conditions, giving a fair limits-on vs limits-off comparison without requiring separate JVM invocations. Co-Authored-By: Claude Sonnet 4.6 --- .../metrics/AdditionalTagsMetricsBenchmark.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java index dac3da8aedc..962978cc15c 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -19,6 +19,7 @@ import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; @@ -57,6 +58,14 @@ public class AdditionalTagsMetricsBenchmark { private ClientStatsAggregator aggregator; private AdversarialMetricsBenchmark.CountingHealthMetrics health; + /** + * Whether the {@link TagCardinalityHandler}s inside {@link AdditionalTagsSchema} substitute the + * {@code blocked_by_tracer} sentinel once the per-key budget is exhausted. JMH runs both values + * within the same fork so the two modes see equivalent thermal conditions. + */ + @Param({"false", "true"}) + public boolean limitsEnabled; + @State(Scope.Thread) public static class ThreadState { int cursor; @@ -71,7 +80,7 @@ public void setup() { // blocked sentinel -- that's the contention we want to measure. AdditionalTagsSchema additionalTagsSchema = AdditionalTagsSchema.from( - new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), this.health); + new LinkedHashSet<>(Arrays.asList("region", "tenant_id")), this.health, limitsEnabled); this.aggregator = new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), From 9ed6302c1171b2db238c873562ee1907d7cba69f Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 3 Jun 2026 17:17:14 -0400 Subject: [PATCH 08/37] Document limitsEnabled benchmark interpretation + fix stale cap references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The limitsEnabled=true/false arms are not a fair "cost of limiting" comparison: with unbounded distinct values, limits=true collapses overflow to one hot sentinel entry (recordOneDuration runs every span) while limits=false saturates the table and drops overflow (never reaches the histogram). GC profiling (3 forks, 2026-06-03) confirmed limits=true allocates MORE per op (888 vs 820 B/op) — the histogram recording, not the sentinel path. Documents this so the arm isn't misread as a regression. Also fixes two stale "cap of 100" references (actual is ADDITIONAL_TAG_VALUE=512). Co-Authored-By: Claude Sonnet 4.6 --- .../AdditionalTagsMetricsBenchmark.java | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java index 962978cc15c..6d243036290 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdditionalTagsMetricsBenchmark.java @@ -17,9 +17,9 @@ import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; @@ -29,8 +29,8 @@ /** * JMH benchmark exercising the span-derived primary tags pipeline added in CSS v1.3.0. Parallel to * {@link AdversarialMetricsBenchmark} but configures two additional-tag keys (each with a per-key - * cardinality cap of 100) and generates unique values per op so the cap saturates fast. The - * benchmark measures the cost of: + * cardinality cap of {@link MetricCardinalityLimits#ADDITIONAL_TAG_VALUE}) and generates unique + * values per op so the cap saturates fast. The benchmark measures the cost of: * * * - *

Cardinality blocks emit a one-shot warn log per reporting cycle per tag (tracked via {@link - * #warnedCardinality}). Per-tag block counts live inside each {@link TagCardinalityHandler} and are - * returned by {@link TagCardinalityHandler#reset()}, then flushed to {@link - * HealthMetrics#onTagCardinalityBlocked(String[], long)} in {@link - * #resetCardinalityHandlers(HealthMetrics)}. + *

Cardinality blocks are counted inside each {@link TagCardinalityHandler} and flushed once per + * cycle (with a warn log) in {@link #resetCardinalityHandlers(HealthMetrics)}. * *

Each {@link SpanSnapshot} captures its own schema reference so producer and consumer agree on * the indexing even if the current schema is replaced between capture and consumption. * - *

Thread-safety: all mutable state ({@link TagCardinalityHandler}s, the warn-once set, - * and {@link #state}) is exercised only on the aggregator thread. {@link #names} and {@link - * #handlers} are final and safe to read from any thread; producer threads access them through the - * volatile {@code cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. + *

Thread-safety: all mutable state ({@link TagCardinalityHandler}s and {@link #state}) + * is exercised only on the aggregator thread. {@link #names} and {@link #handlers} are final and + * safe to read from any thread; producer threads access them through the volatile {@code + * cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. */ final class PeerTagSchema { @@ -62,13 +58,6 @@ final class PeerTagSchema { */ String state; - /** - * Per-cycle warn-once gating. {@code Set.add(name)} returns true exactly the first time a tag - * gets blocked this cycle, which is the only time we want to emit the warn log. Cleared by {@link - * #resetCardinalityHandlers(HealthMetrics)}. - */ - private final Set warnedCardinality = new HashSet<>(); - /** Builds a schema for the given peer-tag names. Order is determined by the {@link Set}. */ static PeerTagSchema of(Set names, String state) { return new PeerTagSchema(names.toArray(new String[0]), state); @@ -111,34 +100,28 @@ boolean hasSameTagsAs(Set other) { /** * Canonicalizes the peer-tag value at slot {@code i}. Returns {@link UTF8BytesString#EMPTY} for * null inputs and the handler's {@code ":blocked_by_tracer"} sentinel when the per-tag - * cardinality budget is exhausted. The handler counts blocks internally; emits a one-shot warn - * log per cycle per tag via {@link #warnedCardinality}. + * cardinality budget is exhausted. */ UTF8BytesString register(int i, String value) { - TagCardinalityHandler handler = handlers[i]; - UTF8BytesString result = handler.register(value); - if (handler.isBlockedResult(result) && warnedCardinality.add(names[i])) { - log.warn( - "Cardinality limit reached for peer tag '{}'; further values are reported as" - + " 'blocked_by_tracer' until the next reporting cycle", - names[i]); - } - return result; + return handlers[i].register(value); } /** * Resets every {@link TagCardinalityHandler}'s working set, flushes accumulated per-tag block - * counts to {@link HealthMetrics}, and clears the per-cycle warn-once tracking. Must be called on - * the aggregator thread; handlers are not thread-safe. + * counts to {@link HealthMetrics}, and emits a warn log for each tag that hit its limit this + * cycle. Must be called on the aggregator thread; handlers are not thread-safe. */ void resetCardinalityHandlers(HealthMetrics healthMetrics) { for (int i = 0; i < handlers.length; i++) { long blocked = handlers[i].reset(); if (blocked > 0) { + log.warn( + "Cardinality limit reached for peer tag '{}'; further values are reported as" + + " 'blocked_by_tracer' until the next reporting cycle", + names[i]); healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); } } - warnedCardinality.clear(); } int size() { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java index 206c923d431..8ec3b756b72 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java @@ -111,16 +111,6 @@ UTF8BytesString register(String value) { return utf8; } - /** - * Whether {@code result} (returned from a prior {@link #register} call) is this handler's blocked - * sentinel. Reads {@link #cacheBlocked} directly so callers can safely query without forcing the - * sentinel to materialize -- when limits are disabled the sentinel is never built and this method - * returns {@code false} for every input. - */ - boolean isBlockedResult(UTF8BytesString result) { - return result == this.cacheBlocked; - } - private UTF8BytesString blockedByTracer() { UTF8BytesString cacheBlocked = this.cacheBlocked; if (cacheBlocked != null) return cacheBlocked; diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java index 8f784325a7b..1661e13507a 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java @@ -213,19 +213,13 @@ void tagOverLimitWithSentinelDisabledReturnsFreshUtf8() { assertEquals("peer.hostname:host-b", hostB.toString()); assertEquals("peer.hostname:host-c", hostC.toString()); - // Over-cap values are not cached -- isBlockedResult never reports true in disabled mode. - assertEquals(false, h.isBlockedResult(hostB)); - assertEquals(false, h.isBlockedResult(hostC)); } @Test - void tagIsBlockedResultStaysFalseInDisabledModeEvenAtCap() { - // The sentinel should never materialize in disabled mode -- isBlockedResult reads cacheBlocked - // directly, so no allocation is forced. + void tagOverLimitWithSentinelDisabledNeverSubstitutesBlockedSentinel() { TagCardinalityHandler h = new TagCardinalityHandler("peer.service", 1, false); h.register("svc-1"); UTF8BytesString overCap = h.register("svc-2"); - assertEquals(false, h.isBlockedResult(overCap)); assertEquals("peer.service:svc-2", overCap.toString()); } } diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 30c0815bef2..edcc5f82cea 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -6377,6 +6377,8 @@ public String toString() { + tracerMetricsMaxAggregates + ", tracerMetricsMaxPending=" + tracerMetricsMaxPending + + ", traceStatsCardinalityLimitsEnabled=" + + traceStatsCardinalityLimitsEnabled + ", reportHostName=" + reportHostName + ", traceAnalyticsEnabled=" From fe868ab8c23eae12d6e122cd030e5a4da3653c02 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 10:51:45 -0400 Subject: [PATCH 29/37] Move cardinality LIMITS_ENABLED flag to MetricCardinalityLimits; add NO_STATE constant Relocates the frozen cardinality-limits master switch from AggregateEntry.LIMITS_ENABLED to MetricCardinalityLimits.ENABLED, where the per-field limit values already live. The flag is no longer tangled into AggregateEntry (whose per-field handlers were already extracted to PropertyHandlers), and PeerTagSchema/ClientStatsAggregator reference it from its natural home instead of reaching across into AggregateEntry. AggregateTable keeps reading Config fresh per construction so tests can flip the flag via injectSysConfig. Names the bare null state passed to never-reconciled PeerTagSchemas (INTERNAL singleton and test schemas) as the explanatory NO_STATE constant. Co-Authored-By: Claude Sonnet 4.6 --- .../trace/common/metrics/AggregateEntry.java | 5 ----- .../trace/common/metrics/AggregateTable.java | 14 ++++++------- .../common/metrics/ClientStatsAggregator.java | 2 +- .../metrics/MetricCardinalityLimits.java | 10 ++++++++++ .../trace/common/metrics/PeerTagSchema.java | 20 +++++++++++++------ 5 files changed, 32 insertions(+), 19 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index bbcff75af96..3c2dda517a5 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -1,7 +1,6 @@ package datadog.trace.common.metrics; import datadog.metrics.api.Histogram; -import datadog.trace.api.Config; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.util.Hashtable; import datadog.trace.util.LongHashingUtils; @@ -21,10 +20,6 @@ final class AggregateEntry extends Hashtable.Entry { private static final UTF8BytesString[] EMPTY_TAGS = new UTF8BytesString[0]; - // Frozen at first AggregateEntry class-load; used by AggregateTable to construct default - // handlers. - static final boolean LIMITS_ENABLED = Config.get().isTraceStatsCardinalityLimitsEnabled(); - final UTF8BytesString resource; final UTF8BytesString service; final UTF8BytesString operationName; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index a75fd9f770f..60a145b810f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -20,8 +20,8 @@ * *

Not thread-safe. The aggregator thread is the sole writer of both this table and its * contained {@link AggregateEntry} state. Any cross-thread request that needs to mutate -- e.g. - * {@link ClientStatsAggregator#disable()} -- must funnel onto the aggregator thread via the - * inbox (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- + * {@link ClientStatsAggregator#disable()} -- must funnel onto the aggregator thread via the inbox + * (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- * enforced; nothing here checks the calling thread at runtime, so a wrong-thread call would corrupt * bucket chains silently. */ @@ -104,11 +104,11 @@ AggregateEntry findOrInsert(SpanSnapshot snapshot) { * onStatsAggregateDropped}) rather than evicting an established one. Cap is sized to the * steady-state working set, so eviction is rare in the common case. * - *

How often this fires depends on {@link AggregateEntry#LIMITS_ENABLED}. With limits enabled, - * over-cap values for a given field collapse into a shared {@code blocked_by_tracer} bucket, so - * the table itself rarely reaches {@code maxAggregates}. With limits disabled (the default), - * over-cap values flow to distinct buckets and {@code maxAggregates} becomes the load-bearing - * backstop -- the cursor-resumed scan was added specifically for this regime. + *

How often this fires depends on {@link MetricCardinalityLimits#ENABLED}. With limits + * enabled, over-cap values for a given field collapse into a shared {@code blocked_by_tracer} + * bucket, so the table itself rarely reaches {@code maxAggregates}. With limits disabled (the + * default), over-cap values flow to distinct buckets and {@code maxAggregates} becomes the + * load-bearing backstop -- the cursor-resumed scan was added specifically for this regime. */ private boolean evictOneStale() { // Two passes -- [cursor, length) then [0, cursor) -- using the half-open-range iterator. The diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 1135456eed4..8c2f1c729de 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -102,7 +102,7 @@ public ClientStatsAggregator( config.getWellKnownTags(), config.getMetricsIgnoredResources(), AdditionalTagsSchema.from( - config.getTraceStatsAdditionalTags(), AggregateEntry.LIMITS_ENABLED, healthMetrics), + config.getTraceStatsAdditionalTags(), MetricCardinalityLimits.ENABLED, healthMetrics), sharedCommunicationObjects.featuresDiscovery(config), healthMetrics, new OkHttpSink( diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java index 70d1b9787bd..f42a37ebbfd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java @@ -1,5 +1,7 @@ package datadog.trace.common.metrics; +import datadog.trace.api.Config; + /** * Per-field caps on the number of distinct values canonicalized per reporting cycle. Overflow * values collapse to a {@code blocked_by_tracer} sentinel so they merge into one aggregate row @@ -12,6 +14,14 @@ final class MetricCardinalityLimits { private MetricCardinalityLimits() {} + /** + * Master switch for cardinality limiting, frozen at class-load from {@code Config}. When {@code + * false}, handlers still cap their working-set size but emit freshly-allocated values instead of + * the {@code blocked_by_tracer} sentinel. Tests construct handlers with an explicit {@code + * useBlockedSentinel} arg rather than flipping this. + */ + static final boolean ENABLED = Config.get().isTraceStatsCardinalityLimitsEnabled(); + /** * Distinct {@code resource.name} values per cycle. Highest-cardinality field by far: DB-query * obfuscations, HTTP route templates, custom resources. Typical service: 30-200 unique. diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index e28ba0b7dc5..48d98a677ed 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -34,17 +34,25 @@ *

Each {@link SpanSnapshot} captures its own schema reference so producer and consumer agree on * the indexing even if the current schema is replaced between capture and consumption. * - *

Thread-safety: all mutable state ({@link TagCardinalityHandler}s and {@link #state}) - * is exercised only on the aggregator thread. {@link #names} and {@link #handlers} are final and - * safe to read from any thread; producer threads access them through the volatile {@code + *

Thread-safety: all mutable state ({@link TagCardinalityHandler}s and {@link #state}) is + * exercised only on the aggregator thread. {@link #names} and {@link #handlers} are final and safe + * to read from any thread; producer threads access them through the volatile {@code * cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. */ final class PeerTagSchema { private static final Logger log = LoggerFactory.getLogger(PeerTagSchema.class); + /** + * Sentinel {@link #state} for schemas that are never reconciled against feature discovery: the + * {@link #INTERNAL} singleton and test-built schemas. A {@code null} state always mismatches a + * real discovery hash, so a schema built with it would rebuild on first reconcile -- but neither + * of these schemas takes that path. + */ + static final String NO_STATE = null; + /** Singleton schema for internal-kind spans -- only {@code base.service}. */ - static final PeerTagSchema INTERNAL = new PeerTagSchema(new String[] {BASE_SERVICE}, null); + static final PeerTagSchema INTERNAL = new PeerTagSchema(new String[] {BASE_SERVICE}, NO_STATE); final String[] names; final TagCardinalityHandler[] handlers; @@ -65,7 +73,7 @@ static PeerTagSchema of(Set names, String state) { /** Test-only factory: takes names array directly to build a schema in a specific order. */ static PeerTagSchema testSchema(String[] names) { - return new PeerTagSchema(names, null); + return new PeerTagSchema(names, NO_STATE); } private PeerTagSchema(String[] names, String state) { @@ -75,7 +83,7 @@ private PeerTagSchema(String[] names, String state) { for (int i = 0; i < names.length; i++) { this.handlers[i] = new TagCardinalityHandler( - names[i], MetricCardinalityLimits.PEER_TAG_VALUE, AggregateEntry.LIMITS_ENABLED); + names[i], MetricCardinalityLimits.PEER_TAG_VALUE, MetricCardinalityLimits.ENABLED); } } From 0a628530c6a9a0115267bfc55f4c77625c734c22 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 11:00:15 -0400 Subject: [PATCH 30/37] Extract shared cardinality-block reporting; rename PeerTagSchema.resetCardinalityHandlers to resetHandlers The per-cycle "if a handler blocked values this cycle, warn + flush count to HealthMetrics" loop was duplicated across PropertyHandlers, PeerTagSchema, and AdditionalTagsSchema, differing only in the warn-message wording. Consolidated into CardinalityBlocks.reportIfBlocked, which takes the already-extracted primitives (PropertyCardinalityHandler and TagCardinalityHandler share no common type) plus the caller's logger so log categories stay per-class. Also renames PeerTagSchema.resetCardinalityHandlers to resetHandlers to match AdditionalTagsSchema.resetHandlers. The aggregator's own no-arg resetCardinalityHandlers() orchestration method keeps its name. Co-Authored-By: Claude Sonnet 4.6 --- .../common/metrics/AdditionalTagsSchema.java | 14 ++++---- .../common/metrics/CardinalityBlocks.java | 35 +++++++++++++++++++ .../common/metrics/ClientStatsAggregator.java | 6 ++-- .../trace/common/metrics/PeerTagSchema.java | 20 +++++------ .../common/metrics/PropertyHandlers.java | 18 ++++------ 5 files changed, 62 insertions(+), 31 deletions(-) create mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java index 5e23efd9ce7..a9a60e0353e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java @@ -106,13 +106,13 @@ UTF8BytesString register(int i, String value) { void resetHandlers() { for (int i = 0; i < handlers.length; i++) { - long blocked = handlers[i].reset(); - if (blocked > 0) { - log.warn( - "Cardinality limit reached for additional metric tag '{}'; further values will be reported as blocked_by_tracer", - names[i]); - healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); - } + CardinalityBlocks.reportIfBlocked( + log, + healthMetrics, + handlers[i].reset(), + names[i], + handlers[i].statsDTag(), + "Cardinality limit reached for additional metric tag '{}'; further values will be reported as blocked_by_tracer"); } } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java new file mode 100644 index 00000000000..a6d65063b48 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java @@ -0,0 +1,35 @@ +package datadog.trace.common.metrics; + +import datadog.trace.core.monitor.HealthMetrics; +import org.slf4j.Logger; + +/** + * Shared per-cycle cardinality-block reporting. When a handler blocked one or more values during + * the just-completed reporting cycle, emit a warn log (with the caller's own logger and message + * template, so log categories stay per-class) and flush the count to {@link HealthMetrics}. + * + *

Centralizes the warn + health-metric contract that every cardinality-handler owner ({@link + * PropertyHandlers}, {@link PeerTagSchema}, {@link AdditionalTagsSchema}) shares; {@code + * PropertyCardinalityHandler} and {@code TagCardinalityHandler} don't share a common type, so the + * helper takes the already-extracted primitives rather than the handler. + */ +final class CardinalityBlocks { + private CardinalityBlocks() {} + + /** + * @param blocked count returned by the handler's {@code reset()}; a no-op when {@code <= 0} + * @param warnMessage SLF4J template with a single {@code {}} placeholder for {@code name} + */ + static void reportIfBlocked( + Logger log, + HealthMetrics healthMetrics, + long blocked, + String name, + String[] statsDTag, + String warnMessage) { + if (blocked > 0) { + log.warn(warnMessage, name); + healthMetrics.onTagCardinalityBlocked(statsDTag, blocked); + } + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 8c2f1c729de..dfdcc060e90 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -460,10 +460,10 @@ private PeerTagSchema buildPeerTagSchema() { private void resetCardinalityHandlers() { reconcilePeerTagSchema(); aggregator.resetPropertyHandlers(healthMetrics); - PeerTagSchema.INTERNAL.resetCardinalityHandlers(healthMetrics); + PeerTagSchema.INTERNAL.resetHandlers(healthMetrics); PeerTagSchema schema = cachedPeerTagSchema; if (schema != null) { - schema.resetCardinalityHandlers(healthMetrics); + schema.resetHandlers(healthMetrics); } additionalTagsSchema.resetHandlers(); } @@ -495,7 +495,7 @@ private void reconcilePeerTagSchema() { } else { // Tags actually changed: flush the outgoing schema's accumulated block telemetry before // discarding it, otherwise the partial-cycle blockedCounts would silently disappear. - cached.resetCardinalityHandlers(healthMetrics); + cached.resetHandlers(healthMetrics); cachedPeerTagSchema = PeerTagSchema.of(normalized, latestState); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 48d98a677ed..2b4c6e9cbe3 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -29,7 +29,7 @@ * * *

Cardinality blocks are counted inside each {@link TagCardinalityHandler} and flushed once per - * cycle (with a warn log) in {@link #resetCardinalityHandlers(HealthMetrics)}. + * cycle (with a warn log) in {@link #resetHandlers(HealthMetrics)}. * *

Each {@link SpanSnapshot} captures its own schema reference so producer and consumer agree on * the indexing even if the current schema is replaced between capture and consumption. @@ -119,16 +119,16 @@ UTF8BytesString register(int i, String value) { * counts to {@link HealthMetrics}, and emits a warn log for each tag that hit its limit this * cycle. Must be called on the aggregator thread; handlers are not thread-safe. */ - void resetCardinalityHandlers(HealthMetrics healthMetrics) { + void resetHandlers(HealthMetrics healthMetrics) { for (int i = 0; i < handlers.length; i++) { - long blocked = handlers[i].reset(); - if (blocked > 0) { - log.warn( - "Cardinality limit reached for peer tag '{}'; further values are reported as" - + " 'blocked_by_tracer' until the next reporting cycle", - names[i]); - healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); - } + CardinalityBlocks.reportIfBlocked( + log, + healthMetrics, + handlers[i].reset(), + names[i], + handlers[i].statsDTag(), + "Cardinality limit reached for peer tag '{}'; further values are reported as" + + " 'blocked_by_tracer' until the next reporting cycle"); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java index e7797054043..273c67d2456 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java @@ -63,17 +63,13 @@ final class PropertyHandlers { void reset(HealthMetrics healthMetrics) { for (PropertyCardinalityHandler h : handlers) { - reportIfBlocked(healthMetrics, h); - } - } - - private static void reportIfBlocked(HealthMetrics healthMetrics, PropertyCardinalityHandler h) { - long blocked = h.reset(); - if (blocked > 0) { - log.warn( - "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer", - h.name); - healthMetrics.onTagCardinalityBlocked(h.statsDTag(), blocked); + CardinalityBlocks.reportIfBlocked( + log, + healthMetrics, + h.reset(), + h.name, + h.statsDTag(), + "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer"); } } } From 76e665e3c4fa4c41dac2694e4e36069d2f37a036 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 11:12:02 -0400 Subject: [PATCH 31/37] Document why AggregateTable reads cardinality flag fresh from Config Clarifies the intentional fresh-vs-frozen split surfaced in review: Aggregatetable reads Config directly so tests can flip the flag via injectSysConfig, while PeerTagSchema and ClientStatsAggregator use the frozen MetricCardinalityLimits.ENABLED. The two are identical in production. Co-Authored-By: Claude Sonnet 4.6 --- .../main/java/datadog/trace/common/metrics/AggregateTable.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index 60a145b810f..b9d2ef3942e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -47,6 +47,9 @@ final class AggregateTable { this( maxAggregates, additionalTagsSchema, + // Read fresh from Config rather than the frozen MetricCardinalityLimits.ENABLED so tests + // can flip the flag via injectSysConfig before constructing the table. In production the + // two are identical (Config is immutable for the process lifetime). new PropertyHandlers(Config.get().isTraceStatsCardinalityLimitsEnabled())); } From 7b74eb026dafbcff14a79a4c0fddbd401f86777e Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 13:18:51 -0400 Subject: [PATCH 32/37] Fix spotless violations in Config.java imports and ClientStatsAggregatorTest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both surfaced after the #11387→#11402 reconcile merge: Config.java's static-import block left TRACE_STATS_ADDITIONAL_TAGS out of alphabetical order (TRACER_* sorts before TRACE_*), and a long SimpleSpan publish line in ClientStatsAggregatorTest needed wrapping. Pure formatting; no behavior change. Restores green spotlessCheck on internal-api and dd-trace-core. Co-Authored-By: Claude Sonnet 4.6 --- .../trace/common/metrics/ClientStatsAggregatorTest.groovy | 4 +++- internal-api/src/main/java/datadog/trace/api/Config.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy index 6e8c285e7a9..734d0a92e61 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy @@ -1729,7 +1729,9 @@ class ClientStatsAggregatorTest extends DDSpecification { cycle1Entries.count { it.getService().toString() == "blocked_by_tracer" } == 1 when: "publish the overflow service in the next cycle after the cardinality reset" - aggregator.publish([new SimpleSpan("svc-${MetricCardinalityLimits.SERVICE}", "op", "resource", "web", false, true, false, 0, 100, HTTP_OK)]) + aggregator.publish([ + new SimpleSpan("svc-${MetricCardinalityLimits.SERVICE}", "op", "resource", "web", false, true, false, 0, 100, HTTP_OK) + ]) aggregator.report() latch2.await(2, SECONDS) diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 808d8387ec3..93db73fd57c 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -417,11 +417,11 @@ import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_BUFFERING_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_IGNORED_RESOURCES; -import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_AGGREGATES; import static datadog.trace.api.config.GeneralConfig.TRACER_METRICS_MAX_PENDING; import static datadog.trace.api.config.GeneralConfig.TRACE_DEBUG; import static datadog.trace.api.config.GeneralConfig.TRACE_LOG_LEVEL; +import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_ADDITIONAL_TAGS; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_CARDINALITY_LIMITS_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_COMPUTATION_ENABLED; import static datadog.trace.api.config.GeneralConfig.TRACE_STATS_COMPUTATION_IGNORE_AGENT_VERSION; From 2e89bf473e23d70977051761d3515ef5a8ed5c09 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 14:16:28 -0400 Subject: [PATCH 33/37] Add missing package declaration to MetricsIntegrationTest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file lives in src/traceAgentTest/groovy/datadog/trace/common/metrics/ but had no package declaration, so it compiled into the default package. At runtime that triggered java.lang.IllegalAccessError when accessing the package-private PeerTagSchema (and the other package-private metrics classes it uses) — failing all 4 tests in the traceAgentTest job. Declaring `package datadog.trace.common.metrics` puts the test in the same package as the classes it exercises, restoring legal access. Co-Authored-By: Claude Sonnet 4.6 --- .../datadog/trace/common/metrics/MetricsIntegrationTest.groovy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy index 4c4ee81b276..4c154b13318 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy @@ -1,3 +1,5 @@ +package datadog.trace.common.metrics + import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT import static datadog.trace.common.metrics.EventListener.EventType.OK import static java.util.concurrent.TimeUnit.SECONDS From 6376998cedd47842ae72f764a9571f0e2619bf5d Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 14:23:37 -0400 Subject: [PATCH 34/37] Drop now-redundant same-package imports in MetricsIntegrationTest After moving the test into package datadog.trace.common.metrics, its imports of AggregateEntry/EventListener/OkHttpSink/PeerTagSchema/SerializingMetricWriter/SpanSnapshot became same-package imports, which CodeNarc's ImportFromSamePackage rule rejects (codenarcTraceAgentTest). Removed them; the classes resolve directly in-package. Co-Authored-By: Claude Sonnet 4.6 --- .../trace/common/metrics/MetricsIntegrationTest.groovy | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy index 4c154b13318..401aaef4b7e 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy @@ -9,12 +9,6 @@ import datadog.metrics.api.Histograms import datadog.metrics.impl.DDSketchHistograms import datadog.trace.api.Config import datadog.trace.api.WellKnownTags -import datadog.trace.common.metrics.AggregateEntry -import datadog.trace.common.metrics.EventListener -import datadog.trace.common.metrics.OkHttpSink -import datadog.trace.common.metrics.PeerTagSchema -import datadog.trace.common.metrics.SerializingMetricWriter -import datadog.trace.common.metrics.SpanSnapshot import java.util.concurrent.CopyOnWriteArrayList import java.util.concurrent.CountDownLatch import okhttp3.HttpUrl From d40d99cacea7ce96aaa9ffbcc5d11851e229ab07 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 14:46:57 -0400 Subject: [PATCH 35/37] Suppress false-positive SpotBugs MT_CORRECTNESS / ES findings on single-writer metrics code spotbugsMain flagged design-intentional patterns: - AggregateEntry: AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE (recordOneDuration/recordDurations) and AT_STALE_THREAD_WRITE_OF_PRIMITIVE (clearAggregate) on the hit/error/topLevel/duration counters. These are single-writer by design -- mutated only on the aggregator thread (see class javadoc); making them volatile/atomic would cost the hot path for no benefit. - TagCardinalityHandler.register: ES_COMPARING_PARAMETER_STRING_WITH_EQ on the intentional reference-equality fast-path that short-circuits .equals() when key and probe value are the same instance. Suppressed with @SuppressFBWarnings + justifications (repo convention; matches the existing clearAggregate suppression). Trimmed to the exact codes that fire to avoid US_USELESS_SUPPRESSION. Co-Authored-By: Claude Sonnet 4.6 --- .../trace/common/metrics/AggregateEntry.java | 16 +++++++++++++++- .../common/metrics/TagCardinalityHandler.java | 6 ++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 3c2dda517a5..43318f697be 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -345,6 +345,11 @@ private Histogram errorLatenciesForWrite() { * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. */ + @SuppressFBWarnings( + value = "AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", + justification = + "Single-writer by design: recording counters are mutated only on the aggregator thread" + + " (see class javadoc); no cross-thread atomicity guarantee is needed.") AggregateEntry recordOneDuration(long tagAndDuration) { ++hitCount; if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { @@ -366,6 +371,11 @@ AggregateEntry recordOneDuration(long tagAndDuration) { * Records {@code count} durations from {@code durations} (positions 0..count-1). Used by * integration tests; production code uses {@link #recordOneDuration}. */ + @SuppressFBWarnings( + value = "AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", + justification = + "Single-writer by design: recording counters are mutated only on the aggregator thread" + + " (see class javadoc); no cross-thread atomicity guarantee is needed.") AggregateEntry recordDurations(int count, AtomicLongArray durations) { this.hitCount += count; for (int i = 0; i < count && i < durations.length(); ++i) { @@ -390,7 +400,11 @@ AggregateEntry recordDurations(int count, AtomicLongArray durations) { * Clears the recording state. The OK histogram is reused; the error histogram (if allocated) is * reused too, but entries that never saw an error keep their {@code errorLatencies} field null. */ - @SuppressFBWarnings("AT_NONATOMIC_64BIT_PRIMITIVE") + @SuppressFBWarnings( + value = {"AT_NONATOMIC_64BIT_PRIMITIVE", "AT_STALE_THREAD_WRITE_OF_PRIMITIVE"}, + justification = + "Single-writer by design: recording counters are reset only on the aggregator thread" + + " (see class javadoc); no cross-thread visibility guarantee is needed.") void clearAggregate() { this.errorCount = 0; this.hitCount = 0; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java index 8ec3b756b72..2ce852c13c0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java @@ -1,6 +1,7 @@ package datadog.trace.common.metrics; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.Arrays; /** @@ -72,6 +73,11 @@ final class TagCardinalityHandler { * miss-with-budget) the prior-cycle table; mixing with the upper half ({@code h ^ (h >>> 16)}) * keeps inputs sharing a low-bit pattern off the same probe chain. */ + @SuppressFBWarnings( + value = "ES_COMPARING_PARAMETER_STRING_WITH_EQ", + justification = + "Intentional identity fast-path: the reference check short-circuits the .equals() call" + + " when the stored key and probe value are the same instance.") UTF8BytesString register(String value) { if (value == null) { return UTF8BytesString.EMPTY; From 9acec1dc57e28ef5d3bcf2fef4520578f302d145 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 16:39:27 -0400 Subject: [PATCH 36/37] Migrate MetricsIntegrationTest from Spock/Groovy to JUnit 5 Java The Groovy test was stale on this branch: it built entries via AggregateEntry.hashOf(SpanSnapshot) and a (SpanSnapshot,long) constructor that no longer exist here (hashOf now takes the canonicalized UTF8 field list, and SpanSnapshot gained an additionalTagValues arg). Groovy compiles such calls dynamically, so it only blew up at runtime in the traceAgentTest job (MissingMethodException: hashOf() for (SpanSnapshot)). Rewritten as a self-contained JUnit 5 Java test that builds entries through the production AggregateTable.findOrInsert path (matching AggregateTableTest), with the current 16-arg SpanSnapshot (additionalTagValues=null). Agent wiring is replicated inline (testcontainers locally / CI_AGENT_HOST in CI; host+port via WithConfigExtension); metrics meter initialized in @BeforeAll. One test method, unchanged intent. Also follows the project's no-new-Groovy / migrate-existing-Groovy convention. Co-Authored-By: Claude Sonnet 4.6 --- .../metrics/MetricsIntegrationTest.groovy | 75 ------- .../metrics/MetricsIntegrationTest.java | 188 ++++++++++++++++++ 2 files changed, 188 insertions(+), 75 deletions(-) delete mode 100644 dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy create mode 100644 dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java diff --git a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy deleted file mode 100644 index 401aaef4b7e..00000000000 --- a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy +++ /dev/null @@ -1,75 +0,0 @@ -package datadog.trace.common.metrics - -import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT -import static datadog.trace.common.metrics.EventListener.EventType.OK -import static java.util.concurrent.TimeUnit.SECONDS - -import datadog.communication.http.OkHttpUtils -import datadog.metrics.api.Histograms -import datadog.metrics.impl.DDSketchHistograms -import datadog.trace.api.Config -import datadog.trace.api.WellKnownTags -import java.util.concurrent.CopyOnWriteArrayList -import java.util.concurrent.CountDownLatch -import okhttp3.HttpUrl - -class MetricsIntegrationTest extends AbstractTraceAgentTest { - - def setupSpec() { - // Initialize metrics-lib histograms to register the DDSketch implementation - Histograms.register(DDSketchHistograms.FACTORY) - } - - def "send metrics to trace agent should notify with OK event"() { - setup: - def latch = new CountDownLatch(1) - def listener = new BlockingListener(latch) - def agentUrl = Config.get().getAgentUrl() - OkHttpSink sink = new OkHttpSink(OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), agentUrl, V06_METRICS_ENDPOINT, true, false, [:]) - sink.register(listener) - - when: - SerializingMetricWriter writer = new SerializingMetricWriter( - new WellKnownTags("runtimeid","hostname", "env", "service", "version","language"), - sink - ) - writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)) - // Build entries via the production AggregateEntry.forSnapshot(snap, keyHash) path -- same - // construction as AggregateTable.findOrInsert. Both entries use one peer tag (grault:quux) - // -> schema names=["grault"], values=["quux"]. - PeerTagSchema schema = PeerTagSchema.testSchema(["grault"] as String[]) - SpanSnapshot snap1 = new SpanSnapshot( - "resource1", "service1", "operation1", null, "sql", (short) 0, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry1 = new AggregateEntry(snap1, AggregateEntry.hashOf(snap1)) - [2, 1, 2, 250, 4].each { entry1.recordOneDuration(it as long) } - writer.add(entry1) - SpanSnapshot snap2 = new SpanSnapshot( - "resource2", "service2", "operation2", null, "web", (short) 200, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry2 = new AggregateEntry(snap2, AggregateEntry.hashOf(snap2)) - [1, 1, 200, 2, 3, 4, 5, 6, 7, 8].each { entry2.recordOneDuration(it as long) } - writer.add(entry2) - writer.finishBucket() - - then: - latch.await(5, SECONDS) - listener.events.size() == 1 && listener.events[0] == OK - } - - static class BlockingListener implements EventListener { - - List events = new CopyOnWriteArrayList<>() - final CountDownLatch latch - - BlockingListener(CountDownLatch latch) { - this.latch = latch - } - - @Override - void onEvent(EventType eventType, String message) { - events.add(eventType) - latch.countDown() - } - } -} diff --git a/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java new file mode 100644 index 00000000000..2f6b96cb7a9 --- /dev/null +++ b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java @@ -0,0 +1,188 @@ +package datadog.trace.common.metrics; + +import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT; +import static datadog.trace.junit.utils.config.WithConfigExtension.injectSysConfig; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.communication.http.OkHttpUtils; +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.api.Config; +import datadog.trace.api.ConfigDefaults; +import datadog.trace.api.WellKnownTags; +import datadog.trace.api.config.TracerConfig; +import datadog.trace.junit.utils.config.WithConfigExtension; +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import okhttp3.HttpUrl; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.startupcheck.MinimumDurationRunningStartupCheckStrategy; + +@ExtendWith(WithConfigExtension.class) +class MetricsIntegrationTest { + + // CI runs an agent container alongside the build (reached via CI_AGENT_HOST); when building + // locally we start one ourselves with testcontainers. + private static final boolean RUNNING_IN_CI = "true".equals(System.getenv("CI")); + private static GenericContainer agentContainer; + + @BeforeAll + static void setupSpec() { + // recordOneDuration -> Histogram.accept needs the metrics meter / histogram factory registered. + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + + if (!RUNNING_IN_CI) { + Map env = new HashMap<>(); + env.put("DD_APM_ENABLED", "true"); + env.put("DD_BIND_HOST", "0.0.0.0"); + env.put("DD_API_KEY", "invalid_key_but_this_is_fine"); + env.put("DD_HOSTNAME", "doesnotexist"); + env.put("DD_LOGS_STDOUT", "yes"); + agentContainer = + new GenericContainer<>("datadog/agent:7.40.1") + .withEnv(env) + .withExposedPorts(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT) + .withStartupTimeout(Duration.ofSeconds(120)) + // Sleep for a bit so the agent's rate_by_service response is populated -- mirrors the + // race-condition workaround from the original Spock base. + .withStartupCheckStrategy( + new MinimumDurationRunningStartupCheckStrategy(Duration.ofSeconds(10))); + agentContainer.start(); + } + } + + @AfterAll + static void cleanupSpec() { + if (agentContainer != null) { + agentContainer.stop(); + } + } + + @BeforeEach + void setup() { + injectSysConfig(TracerConfig.AGENT_HOST, agentHost()); + injectSysConfig(TracerConfig.TRACE_AGENT_PORT, agentPort()); + } + + private static String agentHost() { + return agentContainer != null ? agentContainer.getHost() : System.getenv("CI_AGENT_HOST"); + } + + private static String agentPort() { + return agentContainer != null + ? String.valueOf(agentContainer.getMappedPort(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT)) + : String.valueOf(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT); + } + + @Test + void sendMetricsToTraceAgentShouldNotifyWithOkEvent() throws InterruptedException { + // setup + CountDownLatch latch = new CountDownLatch(1); + BlockingListener listener = new BlockingListener(latch); + String agentUrl = Config.get().getAgentUrl(); + OkHttpSink sink = + new OkHttpSink( + OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), + agentUrl, + V06_METRICS_ENDPOINT, + true, + false, + Collections.emptyMap()); + sink.register(listener); + + // when + SerializingMetricWriter writer = + new SerializingMetricWriter( + new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language"), + sink); + writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)); + // Build entries through the production AggregateTable.findOrInsert path (canonicalizes the + // snapshot and creates/looks up the entry). Both entries use one peer tag (grault:quux) and no + // additional tags -> schema names=["grault"], values=["quux"]. + AggregateTable table = new AggregateTable(8); + PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"grault"}); + SpanSnapshot snap1 = + new SpanSnapshot( + "resource1", + "service1", + "operation1", + null, + "sql", + (short) 0, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + null, + 0L); + AggregateEntry entry1 = table.findOrInsert(snap1); + for (long duration : new long[] {2, 1, 2, 250, 4}) { + entry1.recordOneDuration(duration); + } + writer.add(entry1); + SpanSnapshot snap2 = + new SpanSnapshot( + "resource2", + "service2", + "operation2", + null, + "web", + (short) 200, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + null, + 0L); + AggregateEntry entry2 = table.findOrInsert(snap2); + for (long duration : new long[] {1, 1, 200, 2, 3, 4, 5, 6, 7, 8}) { + entry2.recordOneDuration(duration); + } + writer.add(entry2); + writer.finishBucket(); + + // then + assertTrue(latch.await(5, SECONDS)); + assertEquals(1, listener.events.size()); + assertEquals(EventListener.EventType.OK, listener.events.get(0)); + } + + static class BlockingListener implements EventListener { + final List events = new CopyOnWriteArrayList<>(); + final CountDownLatch latch; + + BlockingListener(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void onEvent(EventListener.EventType eventType, String message) { + events.add(eventType); + latch.countDown(); + } + } +} From 9aff892c33b2f10b39029ad3c0bdfe0712e2741c Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 9 Jun 2026 17:33:42 -0400 Subject: [PATCH 37/37] Inline cardinality-block reporting back into the three handlers; drop CardinalityBlocks The CardinalityBlocks helper (one 6-arg static taking log/healthMetrics/blocked/name/statsDTag/ message) didn't improve readability over the inline loop -- the call was nearly as long as the body it replaced and forced the logger + message to be passed around. Reverted PropertyHandlers, PeerTagSchema, and AdditionalTagsSchema to the direct "reset -> if blocked > 0: warn + flush" loop and deleted CardinalityBlocks. The resetCardinalityHandlers -> resetHandlers rename is kept. Co-Authored-By: Claude Sonnet 4.6 --- .../common/metrics/AdditionalTagsSchema.java | 14 ++++---- .../common/metrics/CardinalityBlocks.java | 35 ------------------- .../trace/common/metrics/PeerTagSchema.java | 16 ++++----- .../common/metrics/PropertyHandlers.java | 14 ++++---- 4 files changed, 22 insertions(+), 57 deletions(-) delete mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java index a9a60e0353e..5e23efd9ce7 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AdditionalTagsSchema.java @@ -106,13 +106,13 @@ UTF8BytesString register(int i, String value) { void resetHandlers() { for (int i = 0; i < handlers.length; i++) { - CardinalityBlocks.reportIfBlocked( - log, - healthMetrics, - handlers[i].reset(), - names[i], - handlers[i].statsDTag(), - "Cardinality limit reached for additional metric tag '{}'; further values will be reported as blocked_by_tracer"); + long blocked = handlers[i].reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for additional metric tag '{}'; further values will be reported as blocked_by_tracer", + names[i]); + healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); + } } } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java deleted file mode 100644 index a6d65063b48..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/CardinalityBlocks.java +++ /dev/null @@ -1,35 +0,0 @@ -package datadog.trace.common.metrics; - -import datadog.trace.core.monitor.HealthMetrics; -import org.slf4j.Logger; - -/** - * Shared per-cycle cardinality-block reporting. When a handler blocked one or more values during - * the just-completed reporting cycle, emit a warn log (with the caller's own logger and message - * template, so log categories stay per-class) and flush the count to {@link HealthMetrics}. - * - *

Centralizes the warn + health-metric contract that every cardinality-handler owner ({@link - * PropertyHandlers}, {@link PeerTagSchema}, {@link AdditionalTagsSchema}) shares; {@code - * PropertyCardinalityHandler} and {@code TagCardinalityHandler} don't share a common type, so the - * helper takes the already-extracted primitives rather than the handler. - */ -final class CardinalityBlocks { - private CardinalityBlocks() {} - - /** - * @param blocked count returned by the handler's {@code reset()}; a no-op when {@code <= 0} - * @param warnMessage SLF4J template with a single {@code {}} placeholder for {@code name} - */ - static void reportIfBlocked( - Logger log, - HealthMetrics healthMetrics, - long blocked, - String name, - String[] statsDTag, - String warnMessage) { - if (blocked > 0) { - log.warn(warnMessage, name); - healthMetrics.onTagCardinalityBlocked(statsDTag, blocked); - } - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 2b4c6e9cbe3..5c23f14efc1 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -121,14 +121,14 @@ UTF8BytesString register(int i, String value) { */ void resetHandlers(HealthMetrics healthMetrics) { for (int i = 0; i < handlers.length; i++) { - CardinalityBlocks.reportIfBlocked( - log, - healthMetrics, - handlers[i].reset(), - names[i], - handlers[i].statsDTag(), - "Cardinality limit reached for peer tag '{}'; further values are reported as" - + " 'blocked_by_tracer' until the next reporting cycle"); + long blocked = handlers[i].reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for peer tag '{}'; further values are reported as" + + " 'blocked_by_tracer' until the next reporting cycle", + names[i]); + healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); + } } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java index 273c67d2456..76d1bf87860 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyHandlers.java @@ -63,13 +63,13 @@ final class PropertyHandlers { void reset(HealthMetrics healthMetrics) { for (PropertyCardinalityHandler h : handlers) { - CardinalityBlocks.reportIfBlocked( - log, - healthMetrics, - h.reset(), - h.name, - h.statsDTag(), - "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer"); + long blocked = h.reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for stats field '{}'; further values will be reported as blocked_by_tracer", + h.name); + healthMetrics.onTagCardinalityBlocked(h.statsDTag(), blocked); + } } } }