Skip to content

Commit a577934

Browse files
authored
fix(spanner): derive built-in metrics project from database client (#13262)
Fixes #13240 ### Problem The built-in-metrics Cloud Monitoring exporter can initialize before any DatabaseClient exists, so the OpenTelemetry Resource may carry the host/GKE project instead of the Spanner project — metrics get attributed to the wrong project. A single resource-level project also misattributes metrics for multi-project clients (RPCs to databases in other projects). ### Solution Route each metric by its point-level project_id instead of the resource project: 1. HeaderInterceptor adds project_id to its existing cached built-in attribute map (cache-miss only, only when parsed from the resource prefix and not the undefined-project sentinel). No new parsing or per-RPC work. 2. The exporter ignores the resource-level project_id, derives each TimeSeries' project from the point, groups by project, and calls createServiceTimeSeries once per project (still batched at the 200 quota limit). 3. Fallback for points with no usable project is meter-scoped: grpc-java/grpc-gcp core metrics use a fallback project (first DatabaseClient/BatchClient project, set in initializeBuiltInMetrics); Spanner/GAX DB-scoped metrics are skipped rather than exported under an unrelated project. No fallback yet → skip. #### Notes / trade-offs - gRPC/core metrics for multi-project clients are attributed to the first client's project (they carry no point project). - gRPC/core metrics before the first data/batch client, or in admin-only processes, are dropped. - Per-project export-failure logging (once per project, with PermissionDeniedException setup guidance + project id; cleared on a successful export).
1 parent a6a7941 commit a577934

11 files changed

Lines changed: 439 additions & 90 deletions

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/BuiltInMetricsConstant.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ public class BuiltInMetricsConstant {
144144
ImmutableList.of(EEF_FALLBACK_COUNT_NAME, EEF_CALL_STATUS_NAME);
145145

146146
public static final String SPANNER_RESOURCE_TYPE = "spanner_instance_client";
147+
public static final String UNDEFINED_PROJECT_ID = "undefined-project";
147148

148149
public static final AttributeKey<String> PROJECT_ID_KEY = AttributeKey.stringKey("project_id");
149150
public static final AttributeKey<String> INSTANCE_ID_KEY = AttributeKey.stringKey("instance_id");

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/BuiltInMetricsProvider.java

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import com.google.cloud.opentelemetry.detection.AttributeKeys;
3333
import com.google.cloud.opentelemetry.detection.DetectedPlatform;
3434
import com.google.cloud.opentelemetry.detection.GCPPlatformDetector;
35+
import com.google.common.annotations.VisibleForTesting;
3536
import com.google.common.base.Strings;
3637
import com.google.common.hash.HashFunction;
3738
import com.google.common.hash.Hashing;
@@ -75,10 +76,13 @@ final class BuiltInMetricsProvider {
7576
private static final String default_location = "global";
7677

7778
private OpenTelemetry openTelemetry;
79+
private String projectId;
80+
private boolean mismatchedProjectIdLogged;
81+
private Thread shutdownHook;
7882

7983
private BuiltInMetricsProvider() {}
8084

81-
OpenTelemetry getOrCreateOpenTelemetry(
85+
synchronized OpenTelemetry getOrCreateOpenTelemetry(
8286
String projectId,
8387
@Nullable Credentials credentials,
8488
@Nullable String monitoringHost,
@@ -88,12 +92,13 @@ OpenTelemetry getOrCreateOpenTelemetry(
8892
SdkMeterProviderBuilder sdkMeterProviderBuilder = SdkMeterProvider.builder();
8993
BuiltInMetricsView.registerBuiltinMetrics(
9094
SpannerCloudMonitoringExporter.create(
91-
projectId, credentials, monitoringHost, universeDomain),
95+
this::getProjectId, credentials, monitoringHost, universeDomain),
9296
sdkMeterProviderBuilder);
9397
sdkMeterProviderBuilder.setResource(Resource.create(createResourceAttributes(projectId)));
9498
SdkMeterProvider sdkMeterProvider = sdkMeterProviderBuilder.build();
9599
this.openTelemetry = OpenTelemetrySdk.builder().setMeterProvider(sdkMeterProvider).build();
96-
Runtime.getRuntime().addShutdownHook(new Thread(sdkMeterProvider::close));
100+
this.shutdownHook = new Thread(sdkMeterProvider::close);
101+
Runtime.getRuntime().addShutdownHook(this.shutdownHook);
97102
}
98103
return this.openTelemetry;
99104
} catch (IOException ex) {
@@ -106,6 +111,47 @@ OpenTelemetry getOrCreateOpenTelemetry(
106111
}
107112
}
108113

114+
synchronized void setProjectIdIfAbsent(String projectId) {
115+
if (this.projectId == null) {
116+
this.projectId = projectId;
117+
} else if (!this.projectId.equals(projectId) && !mismatchedProjectIdLogged) {
118+
mismatchedProjectIdLogged = true;
119+
logger.log(
120+
Level.WARNING,
121+
"Built-in metrics fallback project is already initialized to project {0}. Non-Spanner"
122+
+ " metrics without project information will be exported using that project instead"
123+
+ " of project {1}.",
124+
new Object[] {this.projectId, projectId});
125+
}
126+
}
127+
128+
@Nullable
129+
synchronized OpenTelemetry getOpenTelemetry() {
130+
return this.openTelemetry;
131+
}
132+
133+
synchronized String getProjectId() {
134+
return this.projectId;
135+
}
136+
137+
@VisibleForTesting
138+
synchronized void reset() {
139+
if (this.openTelemetry instanceof OpenTelemetrySdk) {
140+
((OpenTelemetrySdk) this.openTelemetry).getSdkMeterProvider().close();
141+
}
142+
if (this.shutdownHook != null) {
143+
try {
144+
Runtime.getRuntime().removeShutdownHook(this.shutdownHook);
145+
} catch (IllegalStateException ignored) {
146+
// The JVM is already shutting down.
147+
}
148+
}
149+
this.openTelemetry = null;
150+
this.projectId = null;
151+
this.mismatchedProjectIdLogged = false;
152+
this.shutdownHook = null;
153+
}
154+
109155
// TODO: Remove when
110156
// https://github.com/GoogleCloudPlatform/opentelemetry-operations-java/issues/421
111157
// has been fixed.

java-spanner/google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerCloudMonitoringExporter.java

Lines changed: 75 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,15 @@
4242
import io.opentelemetry.sdk.metrics.data.AggregationTemporality;
4343
import io.opentelemetry.sdk.metrics.data.MetricData;
4444
import io.opentelemetry.sdk.metrics.export.MetricExporter;
45-
import io.opentelemetry.sdk.resources.Resource;
4645
import java.io.IOException;
4746
import java.time.Duration;
4847
import java.util.ArrayList;
4948
import java.util.Collection;
5049
import java.util.List;
51-
import java.util.concurrent.atomic.AtomicBoolean;
50+
import java.util.Map;
51+
import java.util.Set;
52+
import java.util.concurrent.ConcurrentHashMap;
53+
import java.util.function.Supplier;
5254
import java.util.logging.Level;
5355
import java.util.logging.Logger;
5456
import java.util.stream.Collectors;
@@ -69,13 +71,12 @@ class SpannerCloudMonitoringExporter implements MetricExporter {
6971
// This the quota limit from Cloud Monitoring. More details in
7072
// https://cloud.google.com/monitoring/quotas#custom_metrics_quotas.
7173
private static final int EXPORT_BATCH_SIZE_LIMIT = 200;
72-
private final AtomicBoolean spannerExportFailureLogged = new AtomicBoolean(false);
73-
private final AtomicBoolean lastExportSkippedData = new AtomicBoolean(false);
74+
private final Set<String> spannerExportFailureLoggedProjects = ConcurrentHashMap.newKeySet();
7475
private final MetricServiceClient client;
75-
private final String spannerProjectId;
76+
private final Supplier<String> fallbackProjectIdSupplier;
7677

7778
static SpannerCloudMonitoringExporter create(
78-
String projectId,
79+
Supplier<String> fallbackProjectIdSupplier,
7980
@Nullable Credentials credentials,
8081
@Nullable String monitoringHost,
8182
String universeDomain)
@@ -114,13 +115,19 @@ static SpannerCloudMonitoringExporter create(
114115
settingsBuilder.createServiceTimeSeriesSettings().setSimpleTimeoutNoRetriesDuration(timeout);
115116

116117
return new SpannerCloudMonitoringExporter(
117-
projectId, MetricServiceClient.create(settingsBuilder.build()));
118+
fallbackProjectIdSupplier, MetricServiceClient.create(settingsBuilder.build()));
118119
}
119120

120121
@VisibleForTesting
121-
SpannerCloudMonitoringExporter(String projectId, MetricServiceClient client) {
122+
SpannerCloudMonitoringExporter(MetricServiceClient client) {
123+
this(() -> null, client);
124+
}
125+
126+
@VisibleForTesting
127+
SpannerCloudMonitoringExporter(
128+
Supplier<String> fallbackProjectIdSupplier, MetricServiceClient client) {
122129
this.client = client;
123-
this.spannerProjectId = projectId;
130+
this.fallbackProjectIdSupplier = fallbackProjectIdSupplier;
124131
}
125132

126133
@Override
@@ -140,37 +147,16 @@ MetricServiceClient getMetricServiceClient() {
140147

141148
/** Export client built in metrics */
142149
private CompletableResultCode exportSpannerClientMetrics(Collection<MetricData> collection) {
143-
// Filter spanner metrics. Only include metrics that contain a valid project.
144-
List<MetricData> spannerMetricData = collection.stream().collect(Collectors.toList());
145-
146-
// Log warnings for metrics that will be skipped.
147-
boolean mustFilter = false;
148-
if (spannerMetricData.stream()
149-
.map(metricData -> metricData.getResource())
150-
.anyMatch(this::shouldSkipPointDataDueToProjectId)) {
151-
logger.log(
152-
Level.WARNING, "Some metric data contain a different projectId. These will be skipped.");
153-
mustFilter = true;
154-
}
155-
156-
if (mustFilter) {
157-
spannerMetricData =
158-
spannerMetricData.stream()
159-
.filter(this::shouldSkipMetricData)
160-
.collect(Collectors.toList());
161-
}
162-
lastExportSkippedData.set(mustFilter);
163-
164150
// Skips exporting if there's none
165-
if (spannerMetricData.isEmpty()) {
151+
if (collection.isEmpty()) {
166152
return CompletableResultCode.ofSuccess();
167153
}
168154

169155
List<TimeSeries> spannerTimeSeries;
170156
try {
171157
spannerTimeSeries =
172158
SpannerCloudMonitoringExporterUtils.convertToSpannerTimeSeries(
173-
spannerMetricData, this.spannerProjectId);
159+
collection, fallbackProjectIdSupplier.get());
174160
} catch (Throwable e) {
175161
logger.log(
176162
Level.WARNING,
@@ -179,37 +165,60 @@ private CompletableResultCode exportSpannerClientMetrics(Collection<MetricData>
179165
return CompletableResultCode.ofFailure();
180166
}
181167

182-
ProjectName projectName = ProjectName.of(spannerProjectId);
168+
if (spannerTimeSeries.isEmpty()) {
169+
return CompletableResultCode.ofSuccess();
170+
}
171+
172+
Map<String, List<TimeSeries>> timeSeriesByProject =
173+
spannerTimeSeries.stream()
174+
.collect(
175+
Collectors.groupingBy(
176+
timeSeries ->
177+
timeSeries
178+
.getResource()
179+
.getLabelsMap()
180+
.get(BuiltInMetricsConstant.PROJECT_ID_KEY.getKey())));
181+
182+
List<ApiFuture<List<Empty>>> futures = new ArrayList<>();
183+
for (Map.Entry<String, List<TimeSeries>> entry : timeSeriesByProject.entrySet()) {
184+
ProjectName projectName = ProjectName.of(entry.getKey());
185+
ApiFuture<List<Empty>> future = exportTimeSeriesInBatch(projectName, entry.getValue());
186+
ApiFutures.addCallback(
187+
future,
188+
new ApiFutureCallback<List<Empty>>() {
189+
@Override
190+
public void onFailure(Throwable throwable) {
191+
logExportFailure(throwable, projectName);
192+
}
193+
194+
@Override
195+
public void onSuccess(List<Empty> ignored) {
196+
spannerExportFailureLoggedProjects.remove(projectName.getProject());
197+
}
198+
},
199+
MoreExecutors.directExecutor());
200+
futures.add(future);
201+
}
183202

184-
ApiFuture<List<Empty>> futureList = exportTimeSeriesInBatch(projectName, spannerTimeSeries);
203+
ApiFuture<List<List<Empty>>> groupedFuture = ApiFutures.allAsList(futures);
204+
ApiFuture<List<Empty>> futureList =
205+
ApiFutures.transform(
206+
groupedFuture,
207+
groupedResults ->
208+
groupedResults.stream().flatMap(List::stream).collect(Collectors.toList()),
209+
MoreExecutors.directExecutor());
185210

186211
CompletableResultCode spannerExportCode = new CompletableResultCode();
187212
ApiFutures.addCallback(
188213
futureList,
189214
new ApiFutureCallback<List<Empty>>() {
190215
@Override
191216
public void onFailure(Throwable throwable) {
192-
if (spannerExportFailureLogged.compareAndSet(false, true)) {
193-
String msg = "createServiceTimeSeries request failed for spanner metrics.";
194-
if (throwable instanceof PermissionDeniedException) {
195-
// TODO: Add the link of public documentation when available in the log message.
196-
msg +=
197-
String.format(
198-
" Need monitoring metric writer permission on project=%s. Follow"
199-
+ " https://cloud.google.com/spanner/docs/view-manage-client-side-metrics#access-client-side-metrics"
200-
+ " to set up permissions",
201-
projectName.getProject());
202-
}
203-
logger.log(Level.WARNING, msg, throwable);
204-
}
205217
spannerExportCode.fail();
206218
}
207219

208220
@Override
209221
public void onSuccess(List<Empty> empty) {
210-
// When an export succeeded reset the export failure flag to false so if there's a
211-
// transient failure it'll be logged.
212-
spannerExportFailureLogged.set(false);
213222
spannerExportCode.succeed();
214223
}
215224
},
@@ -218,16 +227,22 @@ public void onSuccess(List<Empty> empty) {
218227
return spannerExportCode;
219228
}
220229

221-
private boolean shouldSkipMetricData(MetricData metricData) {
222-
return shouldSkipPointDataDueToProjectId(metricData.getResource());
223-
}
224-
225-
private boolean shouldSkipPointDataDueToProjectId(Resource resource) {
226-
return !spannerProjectId.equals(SpannerCloudMonitoringExporterUtils.getProjectId(resource));
227-
}
228-
229-
boolean lastExportSkippedData() {
230-
return this.lastExportSkippedData.get();
230+
private void logExportFailure(Throwable throwable, ProjectName projectName) {
231+
if (spannerExportFailureLoggedProjects.add(projectName.getProject())) {
232+
String msg = "createServiceTimeSeries request failed for spanner metrics.";
233+
if (throwable instanceof PermissionDeniedException) {
234+
msg +=
235+
String.format(
236+
" Need monitoring metric writer permission on project=%s. Follow"
237+
+ " https://cloud.google.com/spanner/docs/view-manage-client-side-metrics"
238+
+ "#access-client-side-metrics"
239+
+ " to set up permissions",
240+
projectName.getProject());
241+
} else {
242+
msg += String.format(" project=%s.", projectName.getProject());
243+
}
244+
logger.log(Level.WARNING, msg, throwable);
245+
}
231246
}
232247

233248
private ApiFuture<List<Empty>> exportTimeSeriesInBatch(

0 commit comments

Comments
 (0)