From bfcbefc50fcdce985c70b1df7ea10c7c396aa804 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 09:22:11 +1100
Subject: [PATCH 01/23] Run k6 via Docker and remove remote import in k6 script

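Run k6 load tests with the grafana/k6 Docker image instead of requiring
a local k6 binary. Mount the workspace at /workspace (read-only), mount
the host temp directory at /results for the summary export, and remap
the script path and ./-relative env values (PROTO_DIR etc.) to container
paths. Use --network host so the container can reach the target.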

Replace remote randomString import with built-in crypto.randomUUID()
in the k6 script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolkit/src/core/k6.js | 44 ++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)
diff --git a/toolkit/src/core/k6.js b/toolkit/src/core/k6.js
index 67110bf..25c10dc 100644
--- a/toolkit/src/core/k6.js
+++ b/toolkit/src/core/k6.js
@@ -1,33 +1,57 @@
-import { join } from 'node:path';
+import { join, resolve, relative } from 'node:path';
 import { tmpdir } from 'node:os';
 import { randomUUID } from 'node:crypto';
 import { readFile, unlink } from 'node:fs/promises';
 import { exec } from '../util/exec.js';
 import { getLogger } from '../util/logger.js';
 
+const K6_IMAGE = 'grafana/k6:latest';
+
 export async function runK6(scriptPath, options = {}) {
   const log = getLogger();
   const { vus = 50, duration = '30s', env = {} } = options;
 
-  const summaryFile = join(tmpdir(), `benchmark-k6-${randomUUID()}.json`);
+  const workDir = resolve('.');
+  const summaryName = `benchmark-k6-${randomUUID()}.json`;
+  const summaryDir = tmpdir();
+  const summaryFile = join(summaryDir, summaryName);
+
+  const toContainerPath = (hostPath) => {
+    const abs = resolve(hostPath);
+    return `/workspace/${relative(workDir, abs)}`;
+  };
+
+  const dockerArgs = [
+    'run',
+    '--rm',
+    '--network',
+    'host',
+    '-v',
+    `${workDir}:/workspace:ro`,
+    '-v',
+    `${summaryDir}:/results`,
+  ];
+
+  for (const [key, value] of Object.entries(env)) {
+    const mapped = value.startsWith('./') ? toContainerPath(value) : value;
+    dockerArgs.push('-e', `${key}=${mapped}`);
+  }
 
-  const args = [
+  dockerArgs.push(
+    K6_IMAGE,
     'run',
     '--summary-export',
-    summaryFile,
+    `/results/${summaryName}`,
     '--vus',
     String(vus),
     '--duration',
     duration,
-    scriptPath,
-  ];
+    toContainerPath(scriptPath)
+  );
 
   log.info({ scriptPath, vus, duration, msg: 'running k6 load test' });
 
-  await exec('k6', args, {
-    stdio: 'inherit',
-    env: { ...process.env, ...env },
-  });
+  await exec('docker', dockerArgs, { stdio: 'inherit' });
 
   try {
     const raw = await readFile(summaryFile, 'utf-8');

From 0d7d58f07a994e3da17bb426381268f863b84020 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 09:33:10 +1100
Subject: [PATCH 02/23] Enable gRPC reflection and use k6 reflect mode

Add connectrpc.com/grpcreflect to the server so k6 can discover
services at runtime via reflect:true. Remove proto file loading
from the k6 script and PROTO_DIR from benchmark config. Update
agent and scaffold command accordingly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .claude/agents/connect-rpc-go.md                      |  9 +++++++++
 .claude/commands/scaffold-implementation.md           |  2 +-
 benchmark.config.json                                 |  3 +--
 .../content-api/_shared/protobuf/k6/content-api.js    | 11 +----------
 .../connect-rpc/cmd/server/setup_gateway.go           |  7 +++++++
 projects/content-api/connect-rpc/go.mod               |  3 ++-
 projects/content-api/connect-rpc/go.sum               |  2 ++
 7 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/.claude/agents/connect-rpc-go.md b/.claude/agents/connect-rpc-go.md
index f72c51f..b4575cb 100644
--- a/.claude/agents/connect-rpc-go.md
+++ b/.claude/agents/connect-rpc-go.md
@@ -983,6 +983,7 @@ package main
 
 import (
     "connectrpc.com/connect"
+    "connectrpc.com/grpcreflect"
 
     contentv1connect "<module>/gen/proto/content/v1/contentv1connect"
     contentapi "<module>/internal/api/content"
@@ -1002,6 +1003,13 @@ func setupGateway(cfg *config.Config, domains *Domains) connectapp.App {
     )
     application.Handle(path, h)
 
+    // gRPC server reflection — enables k6 reflect:true and tools like grpcurl
+    reflector := grpcreflect.NewStaticReflector(
+        contentv1connect.ContentServiceName,
+    )
+    application.Handle(grpcreflect.NewHandlerV1(reflector))
+    application.Handle(grpcreflect.NewHandlerV1Alpha(reflector))
+
     return application
 }
 ```
@@ -1139,6 +1147,7 @@ services:
 |---|---|
 | Go | 1.25.6 |
 | connectrpc.com/connect | v1.19.1 |
+| connectrpc.com/grpcreflect | latest stable |
 | connectrpc.com/validate | latest stable |
 | google.golang.org/protobuf | v1.36.11 |
 | github.com/jackc/pgx/v5 | latest stable |
diff --git a/.claude/commands/scaffold-implementation.md b/.claude/commands/scaffold-implementation.md
index 010a29f..72f75a8 100644
--- a/.claude/commands/scaffold-implementation.md
+++ b/.claude/commands/scaffold-implementation.md
@@ -123,7 +123,7 @@ Add a target entry to `benchmark.config.json`:
 
 For protobuf targets, adjust:
 - `protocol`: `"grpc"`
-- `k6.env`: use `GRPC_HOST` (set to `localhost:8080`) and `PROTO_DIR` (e.g., `./projects/<project>/_shared/protobuf`) instead of `BASE_URL`
+- `k6.env`: use `GRPC_HOST` (set to `localhost:8080`) instead of `BASE_URL`
 
 ## Step 7 — Verify
 
diff --git a/benchmark.config.json b/benchmark.config.json
index 99a6820..cf3896f 100644
--- a/benchmark.config.json
+++ b/benchmark.config.json
@@ -18,8 +18,7 @@
         "vus": 50,
         "duration": "30s",
         "env": {
-          "GRPC_HOST": "localhost:8080",
-          "PROTO_DIR": "./projects/content-api/_shared/protobuf"
+          "GRPC_HOST": "localhost:8080"
         }
       },
       "tags": {
diff --git a/projects/content-api/_shared/protobuf/k6/content-api.js b/projects/content-api/_shared/protobuf/k6/content-api.js
index 9ff1a9d..04399bd 100644
--- a/projects/content-api/_shared/protobuf/k6/content-api.js
+++ b/projects/content-api/_shared/protobuf/k6/content-api.js
@@ -2,18 +2,9 @@ import grpc from 'k6/net/grpc';
 import { check, group, sleep } from 'k6';
 
 const GRPC_HOST = __ENV.GRPC_HOST || 'localhost:8080';
-const PROTO_DIR = __ENV.PROTO_DIR || '';
 
 const client = new grpc.Client();
 
-if (PROTO_DIR) {
-  client.load(
-    [PROTO_DIR],
-    'content/v1/content_service.proto',
-    'content/v1/content_model.proto',
-  );
-}
-
 export const options = {
   thresholds: {
     grpc_req_duration: ['p(95)<500'],
@@ -21,7 +12,7 @@ export const options = {
 };
 
 export default function () {
-  client.connect(GRPC_HOST, { plaintext: true });
+  client.connect(GRPC_HOST, { plaintext: true, reflect: true });
 
   let contentId;
 
diff --git a/projects/content-api/connect-rpc/cmd/server/setup_gateway.go b/projects/content-api/connect-rpc/cmd/server/setup_gateway.go
index 960d227..86ad610 100644
--- a/projects/content-api/connect-rpc/cmd/server/setup_gateway.go
+++ b/projects/content-api/connect-rpc/cmd/server/setup_gateway.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"connectrpc.com/connect"
+	"connectrpc.com/grpcreflect"
 
 	contentv1connect "content-api-connect-rpc/gen/proto/content/v1/contentv1connect"
 	contentapi "content-api-connect-rpc/internal/api/content"
@@ -21,5 +22,11 @@ func setupGateway(cfg *config.Config, domains *Domains) connectapp.App {
 	)
 	application.Handle(path, h)
 
+	reflector := grpcreflect.NewStaticReflector(
+		contentv1connect.ContentServiceName,
+	)
+	application.Handle(grpcreflect.NewHandlerV1(reflector))
+	application.Handle(grpcreflect.NewHandlerV1Alpha(reflector))
+
 	return application
 }
diff --git a/projects/content-api/connect-rpc/go.mod b/projects/content-api/connect-rpc/go.mod
index f903c8c..68bcc57 100644
--- a/projects/content-api/connect-rpc/go.mod
+++ b/projects/content-api/connect-rpc/go.mod
@@ -3,7 +3,9 @@ module content-api-connect-rpc
 go 1.25.0
 
 require (
+	buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.9-20250912141014-52f32327d4b0.1
 	connectrpc.com/connect v1.19.1
+	connectrpc.com/grpcreflect v1.3.0
 	connectrpc.com/validate v0.6.0
 	github.com/gofrs/uuid/v5 v5.4.0
 	github.com/jackc/pgx/v5 v5.8.0
@@ -17,7 +19,6 @@ require (
 )
 
 require (
-	buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.9-20250912141014-52f32327d4b0.1 // indirect
 	buf.build/go/protovalidate v1.0.0 // indirect
 	cel.dev/expr v0.24.0 // indirect
 	github.com/antlr4-go/antlr/v4 v4.13.1 // indirect
diff --git a/projects/content-api/connect-rpc/go.sum b/projects/content-api/connect-rpc/go.sum
index ab1322c..fe5ca72 100644
--- a/projects/content-api/connect-rpc/go.sum
+++ b/projects/content-api/connect-rpc/go.sum
@@ -6,6 +6,8 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
 cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
 connectrpc.com/connect v1.19.1 h1:R5M57z05+90EfEvCY1b7hBxDVOUl45PrtXtAV2fOC14=
 connectrpc.com/connect v1.19.1/go.mod h1:tN20fjdGlewnSFeZxLKb0xwIZ6ozc3OQs2hTXy4du9w=
+connectrpc.com/grpcreflect v1.3.0 h1:Y4V+ACf8/vOb1XOc251Qun7jMB75gCUNw6llvB9csXc=
+connectrpc.com/grpcreflect v1.3.0/go.mod h1:nfloOtCS8VUQOQ1+GTdFzVg2CJo4ZGaat8JIovCtDYs=
 connectrpc.com/validate v0.6.0 h1:DcrgDKt2ZScrUs/d/mh9itD2yeEa0UbBBa+i0mwzx+4=
 connectrpc.com/validate v0.6.0/go.mod h1:ihrpI+8gVbLH1fvVWJL1I3j0CfWnF8P/90LsmluRiZs=
 github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=

From 4879a6ca10bfae4dfdb11491fac4e4151fa33246 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 09:47:57 +1100
Subject: [PATCH 03/23] Fix k6 FieldMask serialization for gRPC reflection mode

Use the comma-separated string format for google.protobuf.FieldMask, as
required by the protobuf JSON encoding spec. This fixes the serialization
error seen in reflection mode.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 projects/content-api/_shared/protobuf/k6/content-api.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/content-api/_shared/protobuf/k6/content-api.js b/projects/content-api/_shared/protobuf/k6/content-api.js
index 04399bd..7a8be0d 100644
--- a/projects/content-api/_shared/protobuf/k6/content-api.js
+++ b/projects/content-api/_shared/protobuf/k6/content-api.js
@@ -67,7 +67,7 @@ export default function () {
         title: `Updated ${crypto.randomUUID()}`,
         status: 'CONTENT_STATUS_PUBLISHED',
       },
-      updateMask: { paths: ['title', 'status'] },
+      updateMask: 'title,status',
     });
 
     check(res, {

From ee6e4353ff18e2bfdc6927817ce556fa5c7d979f Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:15:43 +1100
Subject: [PATCH 04/23] Fix toolkit result collection: exec return, gRPC
 metrics, versions

- Return execa result from exec() so version collection works
- Parse gRPC metrics (grpc_req_duration) alongside HTTP metrics
- Stop reading the .values wrapper, which is absent from k6's summary-export format
- Use protocol-agnostic field names (reqs, reqDuration, reqFailedRate)
- Get k6 version from Docker image instead of local binary
- Update all consumers (compare, formatter, run, loadtest commands)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolkit/src/cli/commands/loadtest.js |  6 ++---
 toolkit/src/cli/commands/run.js      |  4 +--
 toolkit/src/metrics/collector.js     |  2 +-
 toolkit/src/metrics/k6-parser.js     | 39 ++++++++++++++--------------
 toolkit/src/publish/formatter.js     | 38 +++++++++++++--------------
 toolkit/src/report/compare.js        | 12 ++++-----
 toolkit/src/util/exec.js             |  2 +-
 7 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/toolkit/src/cli/commands/loadtest.js b/toolkit/src/cli/commands/loadtest.js
index aa29eb3..c4cddc6 100644
--- a/toolkit/src/cli/commands/loadtest.js
+++ b/toolkit/src/cli/commands/loadtest.js
@@ -31,9 +31,9 @@ export function loadtestCommand() {
 
       log.info({
         target: targetName,
-        reqsPerSec: summary.httpReqsPerSec.toFixed(1),
-        p95: summary.httpReqDuration.p95.toFixed(1),
-        errorRate: summary.httpReqFailed.toFixed(4),
+        reqsPerSec: summary.reqsPerSec.toFixed(1),
+        p95: summary.reqDuration.p95.toFixed(1),
+        errorRate: summary.reqFailedRate.toFixed(4),
         msg: 'load test completed',
       });
 
diff --git a/toolkit/src/cli/commands/run.js b/toolkit/src/cli/commands/run.js
index 983edac..9959577 100644
--- a/toolkit/src/cli/commands/run.js
+++ b/toolkit/src/cli/commands/run.js
@@ -81,8 +81,8 @@ export function runCommand() {
             summary,
           };
           log.info({
-            reqsPerSec: summary.httpReqsPerSec.toFixed(1),
-            p95: summary.httpReqDuration.p95.toFixed(1),
+            reqsPerSec: summary.reqsPerSec.toFixed(1),
+            p95: summary.reqDuration.p95.toFixed(1),
             msg: 'loadtest completed',
           });
         }
diff --git a/toolkit/src/metrics/collector.js b/toolkit/src/metrics/collector.js
index 0608878..b8a2a7e 100644
--- a/toolkit/src/metrics/collector.js
+++ b/toolkit/src/metrics/collector.js
@@ -14,7 +14,7 @@ async function collectEnvironment() {
   const [dockerVersion, nodeVersion, k6Version] = await Promise.all([
     getVersion('docker', ['--version']),
     getVersion('node', ['--version']),
-    getVersion('k6', ['version']),
+    getVersion('docker', ['run', '--rm', 'grafana/k6:latest', 'version']),
   ]);
 
   return {
diff --git a/toolkit/src/metrics/k6-parser.js b/toolkit/src/metrics/k6-parser.js
index 055a7ff..3494c66 100644
--- a/toolkit/src/metrics/k6-parser.js
+++ b/toolkit/src/metrics/k6-parser.js
@@ -1,31 +1,32 @@
 export function parseK6Summary(raw) {
   const metrics = raw.metrics || {};
 
-  const httpReqDuration = metrics.http_req_duration?.values || {};
-  const httpReqs = metrics.http_reqs?.values || {};
-  const httpReqFailed = metrics.http_req_failed?.values || {};
-  const checks = metrics.checks?.values || {};
-  const iterations = metrics.iterations?.values || {};
-  const dataReceived = metrics.data_received?.values || {};
-  const dataSent = metrics.data_sent?.values || {};
+  // Support both HTTP and gRPC protocols
+  const reqDuration = metrics.http_req_duration || metrics.grpc_req_duration || {};
+  const reqs = metrics.http_reqs || {};
+  const reqFailed = metrics.http_req_failed || {};
+  const checks = metrics.checks || {};
+  const iterations = metrics.iterations || {};
+  const dataReceived = metrics.data_received || {};
+  const dataSent = metrics.data_sent || {};
 
   return {
-    httpReqs: httpReqs.count ?? 0,
-    httpReqsPerSec: httpReqs.rate ?? 0,
-    httpReqDuration: {
-      avg: httpReqDuration.avg ?? 0,
-      min: httpReqDuration.min ?? 0,
-      med: httpReqDuration.med ?? 0,
-      max: httpReqDuration.max ?? 0,
-      p90: httpReqDuration['p(90)'] ?? 0,
-      p95: httpReqDuration['p(95)'] ?? 0,
-      p99: httpReqDuration['p(99)'] ?? 0,
+    reqs: reqs.count ?? iterations.count ?? 0,
+    reqsPerSec: reqs.rate ?? iterations.rate ?? 0,
+    reqDuration: {
+      avg: reqDuration.avg ?? 0,
+      min: reqDuration.min ?? 0,
+      med: reqDuration.med ?? 0,
+      max: reqDuration.max ?? 0,
+      p90: reqDuration['p(90)'] ?? 0,
+      p95: reqDuration['p(95)'] ?? 0,
+      p99: reqDuration['p(99)'] ?? 0,
     },
-    httpReqFailed: httpReqFailed.rate ?? 0,
+    reqFailedRate: reqFailed.rate ?? 0,
     iterations: iterations.count ?? 0,
     iterationsPerSec: iterations.rate ?? 0,
     dataReceived: dataReceived.count ?? 0,
     dataSent: dataSent.count ?? 0,
-    checksPassRate: checks.rate ?? 0,
+    checksPassRate: checks.value ?? 0,
   };
 }
diff --git a/toolkit/src/publish/formatter.js b/toolkit/src/publish/formatter.js
index f22e4cf..a670ac1 100644
--- a/toolkit/src/publish/formatter.js
+++ b/toolkit/src/publish/formatter.js
@@ -81,67 +81,67 @@ export function formatAsOtlpMetrics(results) {
     const { summary } = results.metrics.loadtest;
 
     metrics.push(
-      makeGauge('benchmark.http_reqs', '1', summary.httpReqs, timeUnixNano, metricAttributes),
+      makeGauge('benchmark.reqs', '1', summary.reqs, timeUnixNano, metricAttributes),
       makeGauge(
-        'benchmark.http_reqs_per_sec',
+        'benchmark.reqs_per_sec',
         '1/s',
-        summary.httpReqsPerSec,
+        summary.reqsPerSec,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req_failed_rate',
+        'benchmark.req_failed_rate',
         '1',
-        summary.httpReqFailed,
+        summary.reqFailedRate,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.avg',
+        'benchmark.req.duration.avg',
         'ms',
-        summary.httpReqDuration.avg,
+        summary.reqDuration.avg,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.min',
+        'benchmark.req.duration.min',
         'ms',
-        summary.httpReqDuration.min,
+        summary.reqDuration.min,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.med',
+        'benchmark.req.duration.med',
         'ms',
-        summary.httpReqDuration.med,
+        summary.reqDuration.med,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.max',
+        'benchmark.req.duration.max',
         'ms',
-        summary.httpReqDuration.max,
+        summary.reqDuration.max,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.p90',
+        'benchmark.req.duration.p90',
         'ms',
-        summary.httpReqDuration.p90,
+        summary.reqDuration.p90,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.p95',
+        'benchmark.req.duration.p95',
         'ms',
-        summary.httpReqDuration.p95,
+        summary.reqDuration.p95,
         timeUnixNano,
         metricAttributes
       ),
       makeGauge(
-        'benchmark.http_req.duration.p99',
+        'benchmark.req.duration.p99',
         'ms',
-        summary.httpReqDuration.p99,
+        summary.reqDuration.p99,
         timeUnixNano,
         metricAttributes
       ),
diff --git a/toolkit/src/report/compare.js b/toolkit/src/report/compare.js
index eb9b01d..d3a4343 100644
--- a/toolkit/src/report/compare.js
+++ b/toolkit/src/report/compare.js
@@ -28,12 +28,12 @@ export function compareResults(results) {
 
     if (r.metrics.loadtest) {
       const s = r.metrics.loadtest.summary;
-      row.reqsPerSec = s.httpReqsPerSec;
-      row.avgMs = s.httpReqDuration.avg;
-      row.p90Ms = s.httpReqDuration.p90;
-      row.p95Ms = s.httpReqDuration.p95;
-      row.p99Ms = s.httpReqDuration.p99;
-      row.errorRate = s.httpReqFailed;
+      row.reqsPerSec = s.reqsPerSec;
+      row.avgMs = s.reqDuration.avg;
+      row.p90Ms = s.reqDuration.p90;
+      row.p95Ms = s.reqDuration.p95;
+      row.p99Ms = s.reqDuration.p99;
+      row.errorRate = s.reqFailedRate;
     }
 
     return row;
diff --git a/toolkit/src/util/exec.js b/toolkit/src/util/exec.js
index 06f982c..751e698 100644
--- a/toolkit/src/util/exec.js
+++ b/toolkit/src/util/exec.js
@@ -5,7 +5,7 @@ export async function exec(command, args = [], options = {}) {
   const log = getLogger();
   log.debug({ command, args, cwd: options.cwd, msg: 'executing command' });
 
-  await execa(command, args, {
+  return await execa(command, args, {
     stdio: options.stdio ?? 'pipe',
     cwd: options.cwd,
     env: options.env,

From 4bfd9900cdfb4ab9d16d2ad5b6b5bd5f6e610fa1 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:21:41 +1100
Subject: [PATCH 05/23] Add .env support for Grafana credentials and update
 README

- Add dotenv to auto-load .env file on CLI startup
- Create .env.example with Grafana Cloud variables
- Update README: add Grafana setup section, remove local k6 prerequisite,
  update metric names to protocol-agnostic format

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example             |  4 +++
 README.md                | 64 ++++++++++++++++++++++++++--------------
 package-lock.json        | 21 +++++++++++--
 package.json             |  3 ++
 toolkit/bin/benchmark.js |  1 +
 5 files changed, 68 insertions(+), 25 deletions(-)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..a10e830
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,4 @@
+# Grafana Cloud OTLP credentials
+# Find these at: Grafana Cloud portal > My Account > OpenTelemetry (OTLP)
+GRAFANA_INSTANCE_ID=
+GRAFANA_API_KEY=
diff --git a/README.md b/README.md
index 19d1939..c9635b9 100644
--- a/README.md
+++ b/README.md
@@ -6,16 +6,36 @@ API benchmark platform for comparing backend service implementations. Define you
 
 - [Node.js](https://nodejs.org/) >= 22
 - [Docker](https://docs.docker.com/get-docker/) with Compose v2
-- [k6](https://grafana.com/docs/k6/latest/set-up/install-k6/) for load testing
 - [Claude Code](https://docs.anthropic.com/en/docs/claude-code) for scaffolding implementations
 - A [Grafana Cloud](https://grafana.com/products/cloud/) account (optional, for publishing results)
 
+k6 load tests run inside Docker (`grafana/k6:latest`) — no local k6 installation required.
+
 ## Setup
 
 ```bash
 npm install
 ```
 
+### Grafana Cloud (optional)
+
+To publish benchmark results to Grafana Cloud, create a `.env` file at the repository root:
+
+```bash
+cp .env.example .env
+```
+
+Then fill in your credentials:
+
+```env
+GRAFANA_INSTANCE_ID=123456
+GRAFANA_API_KEY=glc_eyJ...
+```
+
+You can find these values in your Grafana Cloud portal under **My Account** > **Grafana Cloud** > **OpenTelemetry (OTLP)**. The instance ID is the numeric identifier and the API key is a Cloud Access Policy token with `metrics:write` scope.
+
+These variables are interpolated into `benchmark.config.json` at load time wherever `${VAR_NAME}` syntax is used.
+
 ## Quick start
 
 ### 1. Define an API
@@ -153,14 +173,14 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
 
 ### Environment variables
 
-Grafana Cloud credentials are resolved from environment variables. Create a `.env` file or export them in your shell:
+Grafana Cloud credentials are resolved from environment variables at config load time. The toolkit automatically loads a `.env` file from the repository root (see [Setup](#grafana-cloud-optional)).
 
-```bash
-export GRAFANA_INSTANCE_ID=your-instance-id
-export GRAFANA_API_KEY=your-api-key
-```
+| Variable               | Description                                        |
+| ---------------------- | -------------------------------------------------- |
+| `GRAFANA_INSTANCE_ID`  | Grafana Cloud instance ID (numeric)                |
+| `GRAFANA_API_KEY`      | Cloud Access Policy token with `metrics:write`     |
 
-Values in the config using `${VAR_NAME}` syntax are interpolated from the environment at load time.
+Values in `benchmark.config.json` using `${VAR_NAME}` syntax are interpolated from `process.env`.
 
 ## Commands
 
@@ -265,19 +285,19 @@ Time from `docker compose up -d` until the service health check passes. Health i
 
 ### Load test (k6)
 
-Parsed from k6's `--summary-export` JSON output:
+Parsed from k6's `--summary-export` JSON output. Supports both HTTP (`http_req_duration`) and gRPC (`grpc_req_duration`) protocols:
 
-| Metric                | Description                        |
-| --------------------- | ---------------------------------- |
-| `httpReqs`            | Total HTTP requests                |
-| `httpReqsPerSec`      | Throughput (requests/second)       |
-| `httpReqDuration.avg` | Average response time (ms)         |
-| `httpReqDuration.med` | Median / p50 response time (ms)    |
-| `httpReqDuration.p90` | 90th percentile response time (ms) |
-| `httpReqDuration.p95` | 95th percentile response time (ms) |
-| `httpReqDuration.p99` | 99th percentile response time (ms) |
-| `httpReqFailed`       | Error rate (0.0 - 1.0)             |
-| `checksPassRate`      | k6 check pass rate (0.0 - 1.0)     |
+| Metric             | Description                        |
+| ------------------ | ---------------------------------- |
+| `reqs`             | Total requests (iterations)        |
+| `reqsPerSec`       | Throughput (requests/second)       |
+| `reqDuration.avg`  | Average response time (ms)         |
+| `reqDuration.med`  | Median / p50 response time (ms)    |
+| `reqDuration.p90`  | 90th percentile response time (ms) |
+| `reqDuration.p95`  | 95th percentile response time (ms) |
+| `reqDuration.p99`  | 99th percentile response time (ms) |
+| `reqFailedRate`    | Error rate (0.0 - 1.0)             |
+| `checksPassRate`   | k6 check pass rate (0.0 - 1.0)     |
 
 ## Grafana Cloud publishing
 
@@ -287,9 +307,9 @@ Metrics appear in Grafana with the `benchmark.*` prefix:
 
 - `benchmark.build.duration`
 - `benchmark.deploy.duration`
-- `benchmark.http_reqs_per_sec`
-- `benchmark.http_req.duration.{avg,med,p90,p95,p99}`
-- `benchmark.http_req_failed_rate`
+- `benchmark.reqs_per_sec`
+- `benchmark.req.duration.{avg,med,p90,p95,p99}`
+- `benchmark.req_failed_rate`
 - `benchmark.checks_pass_rate`
 
 ## Custom k6 scripts
diff --git a/package-lock.json b/package-lock.json
index a92b1ec..3aa54ee 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,6 +11,9 @@
       "workspaces": [
         "toolkit"
       ],
+      "dependencies": {
+        "dotenv": "^17.3.1"
+      },
       "devDependencies": {
         "@anthropic-ai/claude-code": "^2.1.63"
       }
@@ -771,9 +774,9 @@
       "license": "MIT"
     },
     "node_modules/dotenv": {
-      "version": "16.6.1",
-      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
-      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+      "version": "17.3.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.3.1.tgz",
+      "integrity": "sha512-IO8C/dzEb6O3F9/twg6ZLXz164a2fhTnEWb95H23Dm4OuN+92NmEAlTrupP9VW6Jm3sO26tQlqyvyi4CsnY9GA==",
       "license": "BSD-2-Clause",
       "engines": {
         "node": ">=12"
@@ -1931,6 +1934,18 @@
       "engines": {
         "node": ">=22.0.0"
       }
+    },
+    "toolkit/node_modules/dotenv": {
+      "version": "16.6.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
+      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://dotenvx.com"
+      }
     }
   }
 }
diff --git a/package.json b/package.json
index ceb20fe..56fa489 100644
--- a/package.json
+++ b/package.json
@@ -27,5 +27,8 @@
   },
   "devDependencies": {
     "@anthropic-ai/claude-code": "^2.1.63"
+  },
+  "dependencies": {
+    "dotenv": "^17.3.1"
   }
 }
diff --git a/toolkit/bin/benchmark.js b/toolkit/bin/benchmark.js
index fd023ef..028cb9e 100755
--- a/toolkit/bin/benchmark.js
+++ b/toolkit/bin/benchmark.js
@@ -1,5 +1,6 @@
 #!/usr/bin/env node
 
+import 'dotenv/config';
 import process from 'node:process';
 import { createProgram } from '../src/cli/index.js';
 

From 8fa476649ca127f8d45837aeaf0e8abb87f4b00c Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:30:42 +1100
Subject: [PATCH 06/23] Pin grafana/k6 Docker image to v1.6.1

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 README.md                        | 2 +-
 toolkit/src/core/k6.js           | 2 +-
 toolkit/src/metrics/collector.js | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c9635b9..5c487b8 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ API benchmark platform for comparing backend service implementations. Define you
 - [Claude Code](https://docs.anthropic.com/en/docs/claude-code) for scaffolding implementations
 - A [Grafana Cloud](https://grafana.com/products/cloud/) account (optional, for publishing results)
 
-k6 load tests run inside Docker (`grafana/k6:latest`) — no local k6 installation required.
+k6 load tests run inside Docker (`grafana/k6:1.6.1`) — no local k6 installation required.
 
 ## Setup
 
diff --git a/toolkit/src/core/k6.js b/toolkit/src/core/k6.js
index 25c10dc..d54a71a 100644
--- a/toolkit/src/core/k6.js
+++ b/toolkit/src/core/k6.js
@@ -5,7 +5,7 @@ import { readFile, unlink } from 'node:fs/promises';
 import { exec } from '../util/exec.js';
 import { getLogger } from '../util/logger.js';
 
-const K6_IMAGE = 'grafana/k6:latest';
+const K6_IMAGE = 'grafana/k6:1.6.1';
 
 export async function runK6(scriptPath, options = {}) {
   const log = getLogger();
diff --git a/toolkit/src/metrics/collector.js b/toolkit/src/metrics/collector.js
index b8a2a7e..fc31f07 100644
--- a/toolkit/src/metrics/collector.js
+++ b/toolkit/src/metrics/collector.js
@@ -14,7 +14,7 @@ async function collectEnvironment() {
   const [dockerVersion, nodeVersion, k6Version] = await Promise.all([
     getVersion('docker', ['--version']),
     getVersion('node', ['--version']),
-    getVersion('docker', ['run', '--rm', 'grafana/k6:latest', 'version']),
+    getVersion('docker', ['run', '--rm', 'grafana/k6:1.6.1', 'version']),
   ]);
 
   return {

From d942e650c9cdae781e28d0621acf87624d3dafbc Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:32:37 +1100
Subject: [PATCH 07/23] Clarify Grafana Cloud credential instructions in
 .env.example and README

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example | 4 +++-
 README.md    | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.env.example b/.env.example
index a10e830..747dcfb 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,6 @@
 # Grafana Cloud OTLP credentials
-# Find these at: Grafana Cloud portal > My Account > OpenTelemetry (OTLP)
+# Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
+# Instance ID: numeric ID shown on the OTLP configuration page
+# API Key: generate a Cloud Access Policy token with metrics:write scope
 GRAFANA_INSTANCE_ID=
 GRAFANA_API_KEY=
diff --git a/README.md b/README.md
index 5c487b8..ba224da 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ GRAFANA_INSTANCE_ID=123456
 GRAFANA_API_KEY=glc_eyJ...
 ```
 
-You can find these values in your Grafana Cloud portal under **My Account** > **Grafana Cloud** > **OpenTelemetry (OTLP)**. The instance ID is the numeric identifier and the API key is a Cloud Access Policy token with `metrics:write` scope.
+To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**. The instance ID is the numeric identifier shown on the configuration page. For the API key, click **Generate now** to create a Cloud Access Policy token with `metrics:write` scope.
 
 These variables are interpolated into `benchmark.config.json` at load time wherever `${VAR_NAME}` syntax is used.
 

From 3f3a0cb48185d627163629bb18f2d17a5b65b965 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:39:03 +1100
Subject: [PATCH 08/23] Simplify Grafana config to endpoint + token

Replace instanceId/apiKey with a single base64 token that Grafana Cloud
generates directly on the OTLP configuration page. Also make the
endpoint configurable via env var.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                         |  7 +++----
 README.md                            | 22 ++++++++++++----------
 benchmark.config.json                |  5 ++---
 toolkit/src/config/schema.js         |  3 +--
 toolkit/src/publish/grafana-cloud.js |  9 ++++-----
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/.env.example b/.env.example
index 747dcfb..a325ce3 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,5 @@
 # Grafana Cloud OTLP credentials
 # Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
-# Instance ID: numeric ID shown on the OTLP configuration page
-# API Key: generate a Cloud Access Policy token with metrics:write scope
-GRAFANA_INSTANCE_ID=
-GRAFANA_API_KEY=
+# Copy the generated environment variable values from the configuration page
+GRAFANA_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
+GRAFANA_API_TOKEN=
diff --git a/README.md b/README.md
index ba224da..ed354a5 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,14 @@ cp .env.example .env
 Then fill in your credentials:
 
 ```env
-GRAFANA_INSTANCE_ID=123456
-GRAFANA_API_KEY=glc_eyJ...
+GRAFANA_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
+GRAFANA_API_TOKEN=<base64-encoded token>
 ```
 
-To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**. The instance ID is the numeric identifier shown on the configuration page. For the API key, click **Generate now** to create a Cloud Access Policy token with `metrics:write` scope.
+To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**:
+
+1. Copy the **OTLP endpoint** URL
+2. Generate an API token — the page will show `OTEL_EXPORTER_OTLP_HEADERS` containing `Authorization=Basic <token>`. Copy the base64 token value after `Basic `.
 
 These variables are interpolated into `benchmark.config.json` at load time wherever `${VAR_NAME}` syntax is used.
 
@@ -150,9 +153,8 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
     }
   },
   "grafana": {
-    "endpoint": "https://otlp-gateway-prod-us-central-0.grafana.net/otlp",
-    "instanceId": "${GRAFANA_INSTANCE_ID}",
-    "apiKey": "${GRAFANA_API_KEY}"
+    "endpoint": "${GRAFANA_OTLP_ENDPOINT}",
+    "token": "${GRAFANA_API_TOKEN}"
   },
   "output": { "dir": "./results" }
 }
@@ -175,10 +177,10 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
 
 Grafana Cloud credentials are resolved from environment variables at config load time. The toolkit automatically loads a `.env` file from the repository root (see [Setup](#grafana-cloud-optional)).
 
-| Variable               | Description                                        |
-| ---------------------- | -------------------------------------------------- |
-| `GRAFANA_INSTANCE_ID`  | Grafana Cloud instance ID (numeric)                |
-| `GRAFANA_API_KEY`      | Cloud Access Policy token with `metrics:write`     |
+| Variable               | Description                                                    |
+| ---------------------- | -------------------------------------------------------------- |
+| `GRAFANA_OTLP_ENDPOINT`| Grafana Cloud OTLP gateway URL                                |
+| `GRAFANA_API_TOKEN`    | Base64-encoded token from `OTEL_EXPORTER_OTLP_HEADERS`        |
 
 Values in `benchmark.config.json` using `${VAR_NAME}` syntax are interpolated from `process.env`.
 
diff --git a/benchmark.config.json b/benchmark.config.json
index cf3896f..fe7ae0c 100644
--- a/benchmark.config.json
+++ b/benchmark.config.json
@@ -41,9 +41,8 @@
     }
   },
   "grafana": {
-    "endpoint": "https://otlp-gateway-prod-us-central-0.grafana.net/otlp",
-    "instanceId": "${GRAFANA_INSTANCE_ID}",
-    "apiKey": "${GRAFANA_API_KEY}"
+    "endpoint": "${GRAFANA_OTLP_ENDPOINT}",
+    "token": "${GRAFANA_API_TOKEN}"
   },
   "output": {
     "dir": "./results"
diff --git a/toolkit/src/config/schema.js b/toolkit/src/config/schema.js
index 391ed41..0c201f9 100644
--- a/toolkit/src/config/schema.js
+++ b/toolkit/src/config/schema.js
@@ -33,8 +33,7 @@ const targetSchema = z.object({
 
 const grafanaSchema = z.object({
   endpoint: z.string().url(),
-  instanceId: z.string(),
-  apiKey: z.string(),
+  token: z.string(),
 });
 
 export const configSchema = z.object({
diff --git a/toolkit/src/publish/grafana-cloud.js b/toolkit/src/publish/grafana-cloud.js
index 7c314eb..25166ed 100644
--- a/toolkit/src/publish/grafana-cloud.js
+++ b/toolkit/src/publish/grafana-cloud.js
@@ -8,17 +8,16 @@ export async function publishToGrafanaCloud(results, config) {
     throw new Error('grafana config is required for publishing');
   }
 
-  const { endpoint, instanceId, apiKey } = config.grafana;
+  const { endpoint, token } = config.grafana;
 
-  if (!instanceId || !apiKey) {
+  if (!token) {
     throw new Error(
-      'grafana instanceId and apiKey are required. Set GRAFANA_INSTANCE_ID and GRAFANA_API_KEY environment variables.'
+      'grafana token is required. Set GRAFANA_API_TOKEN environment variable.'
     );
   }
 
   const body = formatAsOtlpMetrics(results);
   const url = `${endpoint}/v1/metrics`;
-  const auth = Buffer.from(`${instanceId}:${apiKey}`).toString('base64');
 
   log.debug({
     url,
@@ -35,7 +34,7 @@ export async function publishToGrafanaCloud(results, config) {
         method: 'POST',
         headers: {
           'Content-Type': 'application/json',
-          Authorization: `Basic ${auth}`,
+          Authorization: `Basic ${token}`,
         },
         body: JSON.stringify(body),
       });

From 5e5b24078ad897ef258f356626e7e8078bec636c Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:41:01 +1100
Subject: [PATCH 09/23] Use standard OTel env vars for Grafana Cloud
 credentials

Align on OTEL_EXPORTER_OTLP_ENDPOINT and OTEL_EXPORTER_OTLP_HEADERS
which are the exact env vars Grafana Cloud generates on the OTLP
configuration page. Parse the headers string at publish time.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                         |  6 +++---
 README.md                            | 21 +++++++++------------
 benchmark.config.json                |  4 ++--
 toolkit/src/config/schema.js         |  2 +-
 toolkit/src/publish/grafana-cloud.js | 20 +++++++++++++-------
 5 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/.env.example b/.env.example
index a325ce3..a795ff1 100644
--- a/.env.example
+++ b/.env.example
@@ -1,5 +1,5 @@
 # Grafana Cloud OTLP credentials
 # Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
-# Copy the generated environment variable values from the configuration page
-GRAFANA_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
-GRAFANA_API_TOKEN=
+# Generate an API token and copy the two environment variables shown
+OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
+OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic <token>
diff --git a/README.md b/README.md
index ed354a5..16775e5 100644
--- a/README.md
+++ b/README.md
@@ -28,14 +28,11 @@ cp .env.example .env
 Then fill in your credentials:
 
 ```env
-GRAFANA_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
-GRAFANA_API_TOKEN=<base64-encoded token>
+OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
+OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic <token>
 ```
 
-To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**:
-
-1. Copy the **OTLP endpoint** URL
-2. Generate an API token — the page will show `OTEL_EXPORTER_OTLP_HEADERS` containing `Authorization=Basic <token>`. Copy the base64 token value after `Basic `.
+To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**. Generate an API token and copy the two environment variables shown on the page.
 
 These variables are interpolated into `benchmark.config.json` at load time wherever `${VAR_NAME}` syntax is used.
 
@@ -153,8 +150,8 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
     }
   },
   "grafana": {
-    "endpoint": "${GRAFANA_OTLP_ENDPOINT}",
-    "token": "${GRAFANA_API_TOKEN}"
+    "endpoint": "${OTEL_EXPORTER_OTLP_ENDPOINT}",
+    "headers": "${OTEL_EXPORTER_OTLP_HEADERS}"
   },
   "output": { "dir": "./results" }
 }
@@ -177,10 +174,10 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
 
 Grafana Cloud credentials are resolved from environment variables at config load time. The toolkit automatically loads a `.env` file from the repository root (see [Setup](#grafana-cloud-optional)).
 
-| Variable               | Description                                                    |
-| ---------------------- | -------------------------------------------------------------- |
-| `GRAFANA_OTLP_ENDPOINT`| Grafana Cloud OTLP gateway URL                                |
-| `GRAFANA_API_TOKEN`    | Base64-encoded token from `OTEL_EXPORTER_OTLP_HEADERS`        |
+| Variable                       | Description                                  |
+| ------------------------------ | -------------------------------------------- |
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | Grafana Cloud OTLP gateway URL               |
+| `OTEL_EXPORTER_OTLP_HEADERS`  | Auth header (`Authorization=Basic <token>`)  |
 
 Values in `benchmark.config.json` using `${VAR_NAME}` syntax are interpolated from `process.env`.
 
diff --git a/benchmark.config.json b/benchmark.config.json
index fe7ae0c..693fd27 100644
--- a/benchmark.config.json
+++ b/benchmark.config.json
@@ -41,8 +41,8 @@
     }
   },
   "grafana": {
-    "endpoint": "${GRAFANA_OTLP_ENDPOINT}",
-    "token": "${GRAFANA_API_TOKEN}"
+    "endpoint": "${OTEL_EXPORTER_OTLP_ENDPOINT}",
+    "headers": "${OTEL_EXPORTER_OTLP_HEADERS}"
   },
   "output": {
     "dir": "./results"
diff --git a/toolkit/src/config/schema.js b/toolkit/src/config/schema.js
index 0c201f9..a4e7043 100644
--- a/toolkit/src/config/schema.js
+++ b/toolkit/src/config/schema.js
@@ -33,7 +33,7 @@ const targetSchema = z.object({
 
 const grafanaSchema = z.object({
   endpoint: z.string().url(),
-  token: z.string(),
+  headers: z.string(),
 });
 
 export const configSchema = z.object({
diff --git a/toolkit/src/publish/grafana-cloud.js b/toolkit/src/publish/grafana-cloud.js
index 25166ed..9ecc26b 100644
--- a/toolkit/src/publish/grafana-cloud.js
+++ b/toolkit/src/publish/grafana-cloud.js
@@ -8,17 +8,26 @@ export async function publishToGrafanaCloud(results, config) {
     throw new Error('grafana config is required for publishing');
   }
 
-  const { endpoint, token } = config.grafana;
+  const { endpoint, headers } = config.grafana;
 
-  if (!token) {
+  if (!headers) {
     throw new Error(
-      'grafana token is required. Set GRAFANA_API_TOKEN environment variable.'
+      'grafana headers are required. Set OTEL_EXPORTER_OTLP_HEADERS environment variable.'
     );
   }
 
   const body = formatAsOtlpMetrics(results);
   const url = `${endpoint}/v1/metrics`;
 
+  // Parse OTLP headers format: "Key=Value,Key2=Value2"
+  const parsedHeaders = { 'Content-Type': 'application/json' };
+  for (const entry of headers.split(',')) {
+    const idx = entry.indexOf('=');
+    if (idx > 0) {
+      parsedHeaders[entry.slice(0, idx).trim()] = entry.slice(idx + 1).trim();
+    }
+  }
+
   log.debug({
     url,
     metricsCount: body.resourceMetrics[0].scopeMetrics[0].metrics.length,
@@ -32,10 +41,7 @@ export async function publishToGrafanaCloud(results, config) {
     try {
       const response = await fetch(url, {
         method: 'POST',
-        headers: {
-          'Content-Type': 'application/json',
-          Authorization: `Basic ${token}`,
-        },
+        headers: parsedHeaders,
         body: JSON.stringify(body),
       });
 

From a1d104885bd3756e5da7cd7ca3fb9feb06e850d0 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:46:22 +1100
Subject: [PATCH 10/23] Support multiple files in publish command

Accept variadic arguments so results can be batch-published without
needing to re-run benchmarks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 README.md                           |  8 ++++++--
 toolkit/src/cli/commands/publish.js | 20 ++++++++++++--------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 16775e5..735d02a 100644
--- a/README.md
+++ b/README.md
@@ -231,12 +231,16 @@ npm run benchmark -- loadtest my-api
 npm run benchmark -- loadtest my-api --k6-vus 100 --k6-duration 1m
 ```
 
-### `benchmark publish <results>`
+### `benchmark publish <results...>`
 
-Publishes a results JSON file to Grafana Cloud.
+Publishes one or more results JSON files to Grafana Cloud. Accepts multiple files so you can batch-publish results from previous runs.
 
 ```bash
+# publish a single result
 npm run benchmark -- publish results/my-api-2026-03-04T12-00-00-000Z.json
+
+# publish multiple results at once
+npm run benchmark -- publish results/connect-rpc-*.json results/spring-boot-*.json
 ```
 
 ### `benchmark compare <targets...>`
diff --git a/toolkit/src/cli/commands/publish.js b/toolkit/src/cli/commands/publish.js
index 4126ff8..30e265f 100644
--- a/toolkit/src/cli/commands/publish.js
+++ b/toolkit/src/cli/commands/publish.js
@@ -8,18 +8,22 @@ import { getLogger } from '../../util/logger.js';
 export function publishCommand() {
   return new Command('publish')
     .description('publish benchmark results to Grafana Cloud')
-    .argument('<results>', 'path to results JSON file')
-    .action(async (resultsPath, options, command) => {
+    .argument('<results...>', 'path(s) to results JSON file(s)')
+    .action(async (resultsPaths, options, command) => {
       const log = getLogger();
       const globalOpts = command.parent.opts();
       const config = await loadConfig(globalOpts.config);
 
-      const absolutePath = resolve(resultsPath);
-      const content = await readFile(absolutePath, 'utf-8');
-      const results = JSON.parse(content);
+      for (const resultsPath of resultsPaths) {
+        const absolutePath = resolve(resultsPath);
+        const content = await readFile(absolutePath, 'utf-8');
+        const results = JSON.parse(content);
 
-      log.info({ file: absolutePath, target: results.target, msg: 'publishing results' });
-      await publishToGrafanaCloud(results, config);
-      log.info({ msg: 'publish completed' });
+        log.info({ file: absolutePath, target: results.target, msg: 'publishing results' });
+        await publishToGrafanaCloud(results, config);
+        log.info({ target: results.target, msg: 'published' });
+      }
+
+      log.info({ count: resultsPaths.length, msg: 'publish completed' });
     });
 }

From 49f2b8e9d99c5cb067f7576720cc99ae1fe3cd93 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:55:13 +1100
Subject: [PATCH 11/23] Replace custom OTLP publisher with OpenTelemetry JS SDK

Use @opentelemetry/exporter-metrics-otlp-http, which natively reads the
OTEL_EXPORTER_OTLP_ENDPOINT and OTEL_EXPORTER_OTLP_HEADERS environment
variables. Remove the custom header parsing and the OTLP formatter, and
drop the grafana config section from benchmark.config.json.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                         |   2 +-
 README.md                            |  14 +-
 benchmark.config.json                |   4 -
 package-lock.json                    | 268 ++++++++++++++++++++++++++-
 package.json                         |   4 +
 toolkit/src/cli/commands/publish.js  |  10 +-
 toolkit/src/cli/commands/run.js      |   5 +-
 toolkit/src/config/schema.js         |   6 -
 toolkit/src/publish/formatter.js     | 179 ------------------
 toolkit/src/publish/grafana-cloud.js |  76 --------
 toolkit/src/publish/otlp.js          |  72 +++++++
 11 files changed, 352 insertions(+), 288 deletions(-)
 delete mode 100644 toolkit/src/publish/formatter.js
 delete mode 100644 toolkit/src/publish/grafana-cloud.js
 create mode 100644 toolkit/src/publish/otlp.js

diff --git a/.env.example b/.env.example
index a795ff1..386edc4 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,4 @@
-# Grafana Cloud OTLP credentials
+# Grafana Cloud OTLP credentials (standard OpenTelemetry env vars)
 # Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
 # Generate an API token and copy the two environment variables shown
 OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
diff --git a/README.md b/README.md
index 735d02a..009c525 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ To publish benchmark results to Grafana Cloud, create a `.env` file at the repos
 cp .env.example .env
 ```
 
-Then fill in your credentials:
+Then fill in the standard OpenTelemetry env vars:
 
 ```env
 OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
@@ -34,7 +34,7 @@ OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic <token>
 
 To find these values, sign in to [Grafana Cloud](https://grafana.com), open your stack, and go to **Connections** > **OpenTelemetry (OTLP)**. Generate an API token and copy the two environment variables shown on the page.
 
-These variables are interpolated into `benchmark.config.json` at load time wherever `${VAR_NAME}` syntax is used.
+The toolkit uses the official [OpenTelemetry JS SDK](https://opentelemetry.io/docs/languages/js/) to export metrics, so it reads these env vars natively.
 
 ## Quick start
 
@@ -149,10 +149,6 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
       "timeoutMs": 120000
     }
   },
-  "grafana": {
-    "endpoint": "${OTEL_EXPORTER_OTLP_ENDPOINT}",
-    "headers": "${OTEL_EXPORTER_OTLP_HEADERS}"
-  },
   "output": { "dir": "./results" }
 }
 ```
@@ -179,8 +175,6 @@ Grafana Cloud credentials are resolved from environment variables at config load
 | `OTEL_EXPORTER_OTLP_ENDPOINT` | Grafana Cloud OTLP gateway URL               |
 | `OTEL_EXPORTER_OTLP_HEADERS`  | Auth header (`Authorization=Basic <token>`)  |
 
-Values in `benchmark.config.json` using `${VAR_NAME}` syntax are interpolated from `process.env`.
-
 ## Commands
 
 ### `benchmark run <target>`
@@ -304,7 +298,7 @@ Parsed from k6's `--summary-export` JSON output. Supports both HTTP (`http_req_d
 
 ## Grafana Cloud publishing
 
-Results are pushed to Grafana Cloud using the [OTLP/HTTP JSON](https://opentelemetry.io/docs/specs/otlp/) protocol. Each metric is sent as a gauge data point labeled with the target name, tag, and environment info.
+Results are pushed to Grafana Cloud using the [OpenTelemetry JS SDK](https://opentelemetry.io/docs/languages/js/) via OTLP/HTTP. Each metric is sent as a gauge data point with the target name, tag, and environment info as resource attributes. The SDK reads `OTEL_EXPORTER_OTLP_ENDPOINT` and `OTEL_EXPORTER_OTLP_HEADERS` from the environment (loaded via `.env`).
 
 Metrics appear in Grafana with the `benchmark.*` prefix:
 
@@ -395,7 +389,7 @@ toolkit/                       # the benchmark CLI toolkit
     config/                    # zod schema, loader, defaults
     core/                      # docker, k6, timer, health check
     metrics/                   # result collector, k6 parser
-    publish/                   # OTLP formatter, Grafana Cloud client
+    publish/                   # OpenTelemetry OTLP metrics export
     report/                    # comparison logic, terminal table, JSON writer
   k6/scripts/                  # bundled k6 test scripts
 projects/                      # benchmark target projects
diff --git a/benchmark.config.json b/benchmark.config.json
index 693fd27..e10938c 100644
--- a/benchmark.config.json
+++ b/benchmark.config.json
@@ -40,10 +40,6 @@
       "timeoutMs": 120000
     }
   },
-  "grafana": {
-    "endpoint": "${OTEL_EXPORTER_OTLP_ENDPOINT}",
-    "headers": "${OTEL_EXPORTER_OTLP_HEADERS}"
-  },
   "output": {
     "dir": "./results"
   }
diff --git a/package-lock.json b/package-lock.json
index 3aa54ee..5350e74 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,6 +12,10 @@
         "toolkit"
       ],
       "dependencies": {
+        "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
+        "@opentelemetry/resources": "^2.6.0",
+        "@opentelemetry/sdk-metrics": "^2.6.0",
+        "@opentelemetry/semantic-conventions": "^1.40.0",
         "dotenv": "^17.3.1"
       },
       "devDependencies": {
@@ -556,12 +560,244 @@
       "resolved": "toolkit",
       "link": true
     },
+    "node_modules/@opentelemetry/api": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
+      "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/api-logs": {
+      "version": "0.213.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.213.0.tgz",
+      "integrity": "sha512-zRM5/Qj6G84Ej3F1yt33xBVY/3tnMxtL1fiDIxYbDWYaZ/eudVw3/PBiZ8G7JwUxXxjW8gU4g6LnOyfGKYHYgw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      },
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/core": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.6.0.tgz",
+      "integrity": "sha512-HLM1v2cbZ4TgYN6KEOj+Bbj8rAKriOdkF9Ed3tG25FoprSiQl7kYc+RRT6fUZGOvx0oMi5U67GoFdT+XUn8zEg==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/exporter-metrics-otlp-http": {
+      "version": "0.213.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-metrics-otlp-http/-/exporter-metrics-otlp-http-0.213.0.tgz",
+      "integrity": "sha512-yw3fTIw4KQIRXC/ZyYQq5gtA3Ogfdfz/g5HVgleobQAcjUUE8Nj3spGMx8iQPp+S+u6/js7BixufRkXhzLmpJA==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/otlp-exporter-base": "0.213.0",
+        "@opentelemetry/otlp-transformer": "0.213.0",
+        "@opentelemetry/resources": "2.6.0",
+        "@opentelemetry/sdk-metrics": "2.6.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-exporter-base": {
+      "version": "0.213.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.213.0.tgz",
+      "integrity": "sha512-MegxAP1/n09Ob2dQvY5NBDVjAFkZRuKtWKxYev1R2M8hrsgXzQGkaMgoEKeUOyQ0FUyYcO29UOnYdQWmWa0PXg==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/otlp-transformer": "0.213.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-transformer": {
+      "version": "0.213.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.213.0.tgz",
+      "integrity": "sha512-RSuAlxFFPjeK4d5Y6ps8L2WhaQI6CXWllIjvo5nkAlBpmq2XdYWEBGiAbOF4nDs8CX4QblJDv5BbMUft3sEfDw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.213.0",
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/resources": "2.6.0",
+        "@opentelemetry/sdk-logs": "0.213.0",
+        "@opentelemetry/sdk-metrics": "2.6.0",
+        "@opentelemetry/sdk-trace-base": "2.6.0",
+        "protobufjs": "^7.0.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/resources": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.6.0.tgz",
+      "integrity": "sha512-D4y/+OGe3JSuYUCBxtH5T9DSAWNcvCb/nQWIga8HNtXTVPQn59j0nTBAgaAXxUVBDl40mG3Tc76b46wPlZaiJQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-logs": {
+      "version": "0.213.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.213.0.tgz",
+      "integrity": "sha512-00xlU3GZXo3kXKve4DLdrAL0NAFUaZ9appU/mn00S/5kSUdAvyYsORaDUfR04Mp2CLagAOhrzfUvYozY/EZX2g==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.213.0",
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/resources": "2.6.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.4.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-metrics": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.6.0.tgz",
+      "integrity": "sha512-CicxWZxX6z35HR83jl+PLgtFgUrKRQ9LCXyxgenMnz5A1lgYWfAog7VtdOvGkJYyQgMNPhXQwkYrDLujk7z1Iw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/resources": "2.6.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.9.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-trace-base": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.6.0.tgz",
+      "integrity": "sha512-g/OZVkqlxllgFM7qMKqbPV9c1DUPhQ7d4n3pgZFcrnrNft9eJXZM2TNHTPYREJBrtNdRytYyvwjgL5geDKl3EQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.6.0",
+        "@opentelemetry/resources": "2.6.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/semantic-conventions": {
+      "version": "1.40.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.40.0.tgz",
+      "integrity": "sha512-cifvXDhcqMwwTlTK04GBNeIe7yyo28Mfby85QXFe1Yk8nmi36Ab/5UQwptOx84SsoGNRg+EVSjwzfSZMy6pmlw==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=14"
+      }
+    },
     "node_modules/@pinojs/redact": {
       "version": "0.4.0",
       "resolved": "https://registry.npmjs.org/@pinojs/redact/-/redact-0.4.0.tgz",
       "integrity": "sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg==",
       "license": "MIT"
     },
+    "node_modules/@protobufjs/aspromise": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+      "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/base64": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+      "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/codegen": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz",
+      "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/eventemitter": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+      "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/fetch": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+      "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.1",
+        "@protobufjs/inquire": "^1.1.0"
+      }
+    },
+    "node_modules/@protobufjs/float": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+      "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/inquire": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz",
+      "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/path": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+      "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/pool": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+      "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/utf8": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
+      "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==",
+      "license": "BSD-3-Clause"
+    },
     "node_modules/@sec-ant/readable-stream": {
       "version": "0.4.1",
       "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.4.1.tgz",
@@ -605,7 +841,6 @@
       "version": "25.3.5",
       "resolved": "https://registry.npmjs.org/@types/node/-/node-25.3.5.tgz",
       "integrity": "sha512-oX8xrhvpiyRCQkG1MFchB09f+cXftgIXb3a7UUa4Y3wpmZPw5tyZGTLWhlESOLq1Rq6oDlc8npVU2/9xiCuXMA==",
-      "dev": true,
       "license": "MIT",
       "dependencies": {
         "undici-types": "~7.18.0"
@@ -1327,6 +1562,12 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/long": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
+      "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==",
+      "license": "Apache-2.0"
+    },
     "node_modules/minimatch": {
       "version": "10.2.4",
       "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.4.tgz",
@@ -1620,6 +1861,30 @@
       ],
       "license": "MIT"
     },
+    "node_modules/protobufjs": {
+      "version": "7.5.4",
+      "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz",
+      "integrity": "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==",
+      "hasInstallScript": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.2",
+        "@protobufjs/base64": "^1.1.2",
+        "@protobufjs/codegen": "^2.0.4",
+        "@protobufjs/eventemitter": "^1.1.0",
+        "@protobufjs/fetch": "^1.1.0",
+        "@protobufjs/float": "^1.0.2",
+        "@protobufjs/inquire": "^1.1.0",
+        "@protobufjs/path": "^1.1.2",
+        "@protobufjs/pool": "^1.1.0",
+        "@protobufjs/utf8": "^1.1.0",
+        "@types/node": ">=13.7.0",
+        "long": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
     "node_modules/pump": {
       "version": "3.0.4",
       "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz",
@@ -1816,7 +2081,6 @@
       "version": "7.18.2",
       "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz",
       "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==",
-      "dev": true,
       "license": "MIT"
     },
     "node_modules/unicorn-magic": {
diff --git a/package.json b/package.json
index 56fa489..91a0a7f 100644
--- a/package.json
+++ b/package.json
@@ -29,6 +29,10 @@
     "@anthropic-ai/claude-code": "^2.1.63"
   },
   "dependencies": {
+    "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
+    "@opentelemetry/resources": "^2.6.0",
+    "@opentelemetry/sdk-metrics": "^2.6.0",
+    "@opentelemetry/semantic-conventions": "^1.40.0",
     "dotenv": "^17.3.1"
   }
 }
diff --git a/toolkit/src/cli/commands/publish.js b/toolkit/src/cli/commands/publish.js
index 30e265f..ef21448 100644
--- a/toolkit/src/cli/commands/publish.js
+++ b/toolkit/src/cli/commands/publish.js
@@ -1,18 +1,15 @@
 import { Command } from 'commander';
 import { readFile } from 'node:fs/promises';
 import { resolve } from 'node:path';
-import { loadConfig } from '../../config/loader.js';
-import { publishToGrafanaCloud } from '../../publish/grafana-cloud.js';
+import { publishResults } from '../../publish/otlp.js';
 import { getLogger } from '../../util/logger.js';
 
 export function publishCommand() {
   return new Command('publish')
     .description('publish benchmark results to Grafana Cloud')
     .argument('<results...>', 'path(s) to results JSON file(s)')
-    .action(async (resultsPaths, options, command) => {
+    .action(async (resultsPaths) => {
       const log = getLogger();
-      const globalOpts = command.parent.opts();
-      const config = await loadConfig(globalOpts.config);
 
       for (const resultsPath of resultsPaths) {
         const absolutePath = resolve(resultsPath);
@@ -20,8 +17,7 @@ export function publishCommand() {
         const results = JSON.parse(content);
 
         log.info({ file: absolutePath, target: results.target, msg: 'publishing results' });
-        await publishToGrafanaCloud(results, config);
-        log.info({ target: results.target, msg: 'published' });
+        await publishResults(results);
       }
 
       log.info({ count: resultsPaths.length, msg: 'publish completed' });
diff --git a/toolkit/src/cli/commands/run.js b/toolkit/src/cli/commands/run.js
index 9959577..e4b0150 100644
--- a/toolkit/src/cli/commands/run.js
+++ b/toolkit/src/cli/commands/run.js
@@ -101,9 +101,8 @@ export function runCommand() {
 
         // --- Publish (if requested) ---
         if (options.publish) {
-          const { publishToGrafanaCloud } = await import('../../publish/grafana-cloud.js');
-          await publishToGrafanaCloud(results, config);
-          log.info({ msg: 'results published to Grafana Cloud' });
+          const { publishResults } = await import('../../publish/otlp.js');
+          await publishResults(results);
         }
       } finally {
         // --- Cleanup (only if deploy was attempted) ---
diff --git a/toolkit/src/config/schema.js b/toolkit/src/config/schema.js
index a4e7043..dad1942 100644
--- a/toolkit/src/config/schema.js
+++ b/toolkit/src/config/schema.js
@@ -31,11 +31,6 @@ const targetSchema = z.object({
   tags: z.record(z.string()).default({}),
 });
 
-const grafanaSchema = z.object({
-  endpoint: z.string().url(),
-  headers: z.string(),
-});
-
 export const configSchema = z.object({
   targets: z.record(targetSchema),
   defaults: z
@@ -44,7 +39,6 @@ export const configSchema = z.object({
       readinessProbe: readinessProbeSchema.partial().default({}),
     })
     .default({}),
-  grafana: grafanaSchema.optional(),
   output: z
     .object({
       dir: z.string().default('./results'),
diff --git a/toolkit/src/publish/formatter.js b/toolkit/src/publish/formatter.js
deleted file mode 100644
index a670ac1..0000000
--- a/toolkit/src/publish/formatter.js
+++ /dev/null
@@ -1,179 +0,0 @@
-function makeAttribute(key, value) {
-  if (typeof value === 'number') {
-    return { key, value: { intValue: String(Math.round(value)) } };
-  }
-  return { key, value: { stringValue: String(value) } };
-}
-
-function makeGauge(name, unit, value, timeUnixNano, attributes) {
-  const dataPoint = {
-    timeUnixNano: String(timeUnixNano),
-    attributes,
-  };
-
-  if (Number.isInteger(value)) {
-    dataPoint.asInt = String(value);
-  } else {
-    dataPoint.asDouble = value;
-  }
-
-  return {
-    name,
-    unit,
-    gauge: {
-      dataPoints: [dataPoint],
-    },
-  };
-}
-
-export function formatAsOtlpMetrics(results) {
-  const timeUnixNano = BigInt(new Date(results.timestamp).getTime()) * 1_000_000n;
-
-  const resourceAttributes = [
-    makeAttribute('benchmark.target', results.target),
-    makeAttribute('benchmark.tag', results.tag),
-    makeAttribute('host.os', results.environment.os),
-    makeAttribute('host.arch', results.environment.arch),
-  ];
-
-  // Add target tags as resource attributes
-  if (results.metrics.loadtest?.config) {
-    const { vus, duration } = results.metrics.loadtest.config;
-    resourceAttributes.push(makeAttribute('benchmark.k6.vus', vus));
-    resourceAttributes.push(makeAttribute('benchmark.k6.duration', duration));
-  }
-
-  const metricAttributes = [
-    makeAttribute('benchmark.target', results.target),
-    makeAttribute('benchmark.tag', results.tag),
-  ];
-
-  const metrics = [];
-
-  // Build metrics
-  if (results.metrics.build) {
-    metrics.push(
-      makeGauge(
-        'benchmark.build.duration',
-        'ms',
-        results.metrics.build.durationMs,
-        timeUnixNano,
-        metricAttributes
-      )
-    );
-  }
-
-  // Deploy metrics
-  if (results.metrics.deploy) {
-    metrics.push(
-      makeGauge(
-        'benchmark.deploy.duration',
-        'ms',
-        results.metrics.deploy.durationMs,
-        timeUnixNano,
-        metricAttributes
-      )
-    );
-  }
-
-  // Load test metrics
-  if (results.metrics.loadtest) {
-    const { summary } = results.metrics.loadtest;
-
-    metrics.push(
-      makeGauge('benchmark.reqs', '1', summary.reqs, timeUnixNano, metricAttributes),
-      makeGauge(
-        'benchmark.reqs_per_sec',
-        '1/s',
-        summary.reqsPerSec,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req_failed_rate',
-        '1',
-        summary.reqFailedRate,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.avg',
-        'ms',
-        summary.reqDuration.avg,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.min',
-        'ms',
-        summary.reqDuration.min,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.med',
-        'ms',
-        summary.reqDuration.med,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.max',
-        'ms',
-        summary.reqDuration.max,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.p90',
-        'ms',
-        summary.reqDuration.p90,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.p95',
-        'ms',
-        summary.reqDuration.p95,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.req.duration.p99',
-        'ms',
-        summary.reqDuration.p99,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge(
-        'benchmark.checks_pass_rate',
-        '1',
-        summary.checksPassRate,
-        timeUnixNano,
-        metricAttributes
-      ),
-      makeGauge('benchmark.iterations', '1', summary.iterations, timeUnixNano, metricAttributes),
-      makeGauge(
-        'benchmark.iterations_per_sec',
-        '1/s',
-        summary.iterationsPerSec,
-        timeUnixNano,
-        metricAttributes
-      )
-    );
-  }
-
-  return {
-    resourceMetrics: [
-      {
-        resource: { attributes: resourceAttributes },
-        scopeMetrics: [
-          {
-            scope: { name: '@labset/benchmark-toolkit', version: '1.0.0' },
-            metrics,
-          },
-        ],
-      },
-    ],
-  };
-}
diff --git a/toolkit/src/publish/grafana-cloud.js b/toolkit/src/publish/grafana-cloud.js
deleted file mode 100644
index 9ecc26b..0000000
--- a/toolkit/src/publish/grafana-cloud.js
+++ /dev/null
@@ -1,76 +0,0 @@
-import { formatAsOtlpMetrics } from './formatter.js';
-import { getLogger } from '../util/logger.js';
-
-export async function publishToGrafanaCloud(results, config) {
-  const log = getLogger();
-
-  if (!config.grafana) {
-    throw new Error('grafana config is required for publishing');
-  }
-
-  const { endpoint, headers } = config.grafana;
-
-  if (!headers) {
-    throw new Error(
-      'grafana headers are required. Set OTEL_EXPORTER_OTLP_HEADERS environment variable.'
-    );
-  }
-
-  const body = formatAsOtlpMetrics(results);
-  const url = `${endpoint}/v1/metrics`;
-
-  // Parse OTLP headers format: "Key=Value,Key2=Value2"
-  const parsedHeaders = { 'Content-Type': 'application/json' };
-  for (const entry of headers.split(',')) {
-    const idx = entry.indexOf('=');
-    if (idx > 0) {
-      parsedHeaders[entry.slice(0, idx).trim()] = entry.slice(idx + 1).trim();
-    }
-  }
-
-  log.debug({
-    url,
-    metricsCount: body.resourceMetrics[0].scopeMetrics[0].metrics.length,
-    msg: 'publishing to Grafana Cloud',
-  });
-
-  const maxRetries = 3;
-  let lastError;
-
-  for (let attempt = 1; attempt <= maxRetries; attempt++) {
-    try {
-      const response = await fetch(url, {
-        method: 'POST',
-        headers: parsedHeaders,
-        body: JSON.stringify(body),
-      });
-
-      if (response.ok) {
-        log.info({
-          status: response.status,
-          target: results.target,
-          msg: 'published to Grafana Cloud',
-        });
-        return;
-      }
-
-      if (response.status === 429 && attempt < maxRetries) {
-        const retryAfter = parseInt(response.headers.get('retry-after') || '5', 10);
-        log.warn({ retryAfter, attempt, msg: 'rate limited, retrying' });
-        await new Promise((r) => setTimeout(r, retryAfter * 1000));
-        continue;
-      }
-
-      const responseText = await response.text();
-      lastError = new Error(`Grafana Cloud returned ${response.status}: ${responseText}`);
-    } catch (err) {
-      lastError = err;
-      if (attempt < maxRetries) {
-        log.warn({ error: err.message, attempt, msg: 'publish failed, retrying' });
-        await new Promise((r) => setTimeout(r, 2000 * attempt));
-      }
-    }
-  }
-
-  throw lastError;
-}
diff --git a/toolkit/src/publish/otlp.js b/toolkit/src/publish/otlp.js
new file mode 100644
index 0000000..13e1e2a
--- /dev/null
+++ b/toolkit/src/publish/otlp.js
@@ -0,0 +1,72 @@
+import { MeterProvider } from '@opentelemetry/sdk-metrics';
+import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
+import { Resource } from '@opentelemetry/resources';
+import { getLogger } from '../util/logger.js';
+
+export async function publishResults(results) {
+  const log = getLogger();
+
+  if (!process.env.OTEL_EXPORTER_OTLP_ENDPOINT) {
+    throw new Error(
+      'OTEL_EXPORTER_OTLP_ENDPOINT is not set. Configure it in your .env file.'
+    );
+  }
+
+  const resource = new Resource({
+    'service.name': '@labset/benchmark-toolkit',
+    'service.version': '1.0.0',
+    'benchmark.target': results.target,
+    'benchmark.tag': results.tag,
+    'host.os': results.environment.os,
+    'host.arch': results.environment.arch,
+  });
+
+  const exporter = new OTLPMetricExporter();
+  const meterProvider = new MeterProvider({
+    resource,
+    readers: [
+      new (await import('@opentelemetry/sdk-metrics')).PeriodicExportingMetricReader({
+        exporter,
+        exportIntervalMillis: 60_000,
+      }),
+    ],
+  });
+
+  const meter = meterProvider.getMeter('benchmark');
+
+  // Build metrics
+  if (results.metrics.build) {
+    meter.createGauge('benchmark.build.duration', { unit: 'ms' })
+      .record(results.metrics.build.durationMs);
+  }
+
+  // Deploy metrics
+  if (results.metrics.deploy) {
+    meter.createGauge('benchmark.deploy.duration', { unit: 'ms' })
+      .record(results.metrics.deploy.durationMs);
+  }
+
+  // Load test metrics
+  if (results.metrics.loadtest) {
+    const s = results.metrics.loadtest.summary;
+
+    meter.createGauge('benchmark.reqs', { unit: '1' }).record(s.reqs);
+    meter.createGauge('benchmark.reqs_per_sec', { unit: '1/s' }).record(s.reqsPerSec);
+    meter.createGauge('benchmark.req_failed_rate', { unit: '1' }).record(s.reqFailedRate);
+    meter.createGauge('benchmark.req.duration.avg', { unit: 'ms' }).record(s.reqDuration.avg);
+    meter.createGauge('benchmark.req.duration.min', { unit: 'ms' }).record(s.reqDuration.min);
+    meter.createGauge('benchmark.req.duration.med', { unit: 'ms' }).record(s.reqDuration.med);
+    meter.createGauge('benchmark.req.duration.max', { unit: 'ms' }).record(s.reqDuration.max);
+    meter.createGauge('benchmark.req.duration.p90', { unit: 'ms' }).record(s.reqDuration.p90);
+    meter.createGauge('benchmark.req.duration.p95', { unit: 'ms' }).record(s.reqDuration.p95);
+    meter.createGauge('benchmark.req.duration.p99', { unit: 'ms' }).record(s.reqDuration.p99);
+    meter.createGauge('benchmark.checks_pass_rate', { unit: '1' }).record(s.checksPassRate);
+    meter.createGauge('benchmark.iterations', { unit: '1' }).record(s.iterations);
+    meter.createGauge('benchmark.iterations_per_sec', { unit: '1/s' }).record(s.iterationsPerSec);
+  }
+
+  await meterProvider.forceFlush();
+  await meterProvider.shutdown();
+
+  log.info({ target: results.target, msg: 'published to Grafana Cloud' });
+}

From 057fa18f8859728ed3eb3184cc568bd5bab7caac Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 10:57:37 +1100
Subject: [PATCH 12/23] Update README quick start with publish command, clear
 .env.example values

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example | 4 ++--
 README.md    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.env.example b/.env.example
index 386edc4..6f26ade 100644
--- a/.env.example
+++ b/.env.example
@@ -1,5 +1,5 @@
 # Grafana Cloud OTLP credentials (standard OpenTelemetry env vars)
 # Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
 # Generate an API token and copy the two environment variables shown
-OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
-OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic <token>
+OTEL_EXPORTER_OTLP_ENDPOINT=
+OTEL_EXPORTER_OTLP_HEADERS=
diff --git a/README.md b/README.md
index 009c525..d52dce4 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ npm run benchmark -- run content-api/spring-boot
 npm run benchmark -- compare results/content-api-connect-rpc-*.json results/content-api-spring-boot-*.json
 
 # publish results to Grafana Cloud
-npm run benchmark -- run content-api/connect-rpc --publish
+npm run benchmark -- publish results/content-api-connect-rpc-*.json
 ```
 
 ## Configuration

From df455fbf9bf477be54fb40922390a600bda86ca0 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 11:05:37 +1100
Subject: [PATCH 13/23] Fix OTel v2 imports, add devloop scripts, move deps to
 toolkit

- Fix resourceFromAttributes import (OTel resources v2 API change)
- Use static import for PeriodicExportingMetricReader
- Add build/fix scripts to toolkit and root package.json
- Move OTel and dotenv deps from root to toolkit workspace

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 package-lock.json           | 22 +++-------------------
 package.json                | 10 ++--------
 toolkit/package.json        |  7 ++++++-
 toolkit/src/publish/otlp.js | 18 +++++++++---------
 4 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 5350e74..07383f4 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,13 +11,6 @@
       "workspaces": [
         "toolkit"
       ],
-      "dependencies": {
-        "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
-        "@opentelemetry/resources": "^2.6.0",
-        "@opentelemetry/sdk-metrics": "^2.6.0",
-        "@opentelemetry/semantic-conventions": "^1.40.0",
-        "dotenv": "^17.3.1"
-      },
       "devDependencies": {
         "@anthropic-ai/claude-code": "^2.1.63"
       }
@@ -1008,18 +1001,6 @@
       "dev": true,
       "license": "MIT"
     },
-    "node_modules/dotenv": {
-      "version": "17.3.1",
-      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.3.1.tgz",
-      "integrity": "sha512-IO8C/dzEb6O3F9/twg6ZLXz164a2fhTnEWb95H23Dm4OuN+92NmEAlTrupP9VW6Jm3sO26tQlqyvyi4CsnY9GA==",
-      "license": "BSD-2-Clause",
-      "engines": {
-        "node": ">=12"
-      },
-      "funding": {
-        "url": "https://dotenvx.com"
-      }
-    },
     "node_modules/emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
@@ -2175,6 +2156,9 @@
       "version": "1.0.0",
       "license": "Apache-2.0",
       "dependencies": {
+        "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
+        "@opentelemetry/resources": "^2.6.0",
+        "@opentelemetry/sdk-metrics": "^2.6.0",
         "chalk": "^5.4.1",
         "cli-table3": "^0.6.5",
         "commander": "^13.1.0",
diff --git a/package.json b/package.json
index 91a0a7f..3b58914 100644
--- a/package.json
+++ b/package.json
@@ -23,16 +23,10 @@
     "lint:fix": "npm run lint:fix --workspaces --if-present",
     "format": "npm run format --workspaces --if-present",
     "format:check": "npm run format:check --workspaces --if-present",
-    "build": "npm run build --workspaces --if-present"
+    "build": "npm run build --workspaces --if-present",
+    "fix": "npm run fix --workspaces --if-present"
   },
   "devDependencies": {
     "@anthropic-ai/claude-code": "^2.1.63"
-  },
-  "dependencies": {
-    "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
-    "@opentelemetry/resources": "^2.6.0",
-    "@opentelemetry/sdk-metrics": "^2.6.0",
-    "@opentelemetry/semantic-conventions": "^1.40.0",
-    "dotenv": "^17.3.1"
   }
 }
diff --git a/toolkit/package.json b/toolkit/package.json
index 6b756b6..d2d825a 100644
--- a/toolkit/package.json
+++ b/toolkit/package.json
@@ -15,9 +15,14 @@
     "lint": "eslint .",
     "lint:fix": "eslint . --fix",
     "format": "prettier --write .",
-    "format:check": "prettier --check ."
+    "format:check": "prettier --check .",
+    "build": "npm run format:check && npm run lint",
+    "fix": "npm run format && npm run lint:fix"
   },
   "dependencies": {
+    "@opentelemetry/exporter-metrics-otlp-http": "^0.213.0",
+    "@opentelemetry/resources": "^2.6.0",
+    "@opentelemetry/sdk-metrics": "^2.6.0",
     "chalk": "^5.4.1",
     "cli-table3": "^0.6.5",
     "commander": "^13.1.0",
diff --git a/toolkit/src/publish/otlp.js b/toolkit/src/publish/otlp.js
index 13e1e2a..3d031c3 100644
--- a/toolkit/src/publish/otlp.js
+++ b/toolkit/src/publish/otlp.js
@@ -1,18 +1,16 @@
-import { MeterProvider } from '@opentelemetry/sdk-metrics';
+import { MeterProvider, PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
 import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
-import { Resource } from '@opentelemetry/resources';
+import { resourceFromAttributes } from '@opentelemetry/resources';
 import { getLogger } from '../util/logger.js';
 
 export async function publishResults(results) {
   const log = getLogger();
 
   if (!process.env.OTEL_EXPORTER_OTLP_ENDPOINT) {
-    throw new Error(
-      'OTEL_EXPORTER_OTLP_ENDPOINT is not set. Configure it in your .env file.'
-    );
+    throw new Error('OTEL_EXPORTER_OTLP_ENDPOINT is not set. Configure it in your .env file.');
   }
 
-  const resource = new Resource({
+  const resource = resourceFromAttributes({
     'service.name': '@labset/benchmark-toolkit',
     'service.version': '1.0.0',
     'benchmark.target': results.target,
@@ -25,7 +23,7 @@ export async function publishResults(results) {
   const meterProvider = new MeterProvider({
     resource,
     readers: [
-      new (await import('@opentelemetry/sdk-metrics')).PeriodicExportingMetricReader({
+      new PeriodicExportingMetricReader({
         exporter,
         exportIntervalMillis: 60_000,
       }),
@@ -36,13 +34,15 @@ export async function publishResults(results) {
 
   // Build metrics
   if (results.metrics.build) {
-    meter.createGauge('benchmark.build.duration', { unit: 'ms' })
+    meter
+      .createGauge('benchmark.build.duration', { unit: 'ms' })
       .record(results.metrics.build.durationMs);
   }
 
   // Deploy metrics
   if (results.metrics.deploy) {
-    meter.createGauge('benchmark.deploy.duration', { unit: 'ms' })
+    meter
+      .createGauge('benchmark.deploy.duration', { unit: 'ms' })
       .record(results.metrics.deploy.durationMs);
   }
 

From 4a45aa1add130f46d4b5a28a9896cad0520ca578 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 11:20:00 +1100
Subject: [PATCH 14/23] Add Grafana dashboard and fix OTLP metric visibility

- Add importable Grafana dashboard (grafana/benchmark-dashboard.json)
  with overview stats, latency comparison, throughput, and summary table
- Move benchmark attributes from resource to data point attributes so
  they appear as Prometheus labels without requiring promotion
- Remove metric units to avoid Grafana Cloud appending suffixes
- Update README with dashboard import instructions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 README.md                        |  20 +
 grafana/benchmark-dashboard.json | 723 +++++++++++++++++++++++++++++++
 toolkit/src/publish/otlp.js      |  49 ++-
 3 files changed, 772 insertions(+), 20 deletions(-)
 create mode 100644 grafana/benchmark-dashboard.json

diff --git a/README.md b/README.md
index d52dce4..4499052 100644
--- a/README.md
+++ b/README.md
@@ -309,6 +309,25 @@ Metrics appear in Grafana with the `benchmark.*` prefix:
 - `benchmark.req_failed_rate`
 - `benchmark.checks_pass_rate`
 
+### Grafana dashboard
+
+A pre-built dashboard is included at `grafana/benchmark-dashboard.json`. To import it:
+
+1. In Grafana, go to **Dashboards** > **New** > **Import**
+2. Upload `grafana/benchmark-dashboard.json`
+3. Select your Prometheus data source
+4. Click **Import**
+
+The dashboard includes:
+
+- **Overview** — stat panels for throughput, error rate, checks pass rate, build and deploy times
+- **Latency Comparison** — bar chart of avg/med/p90/p95/p99 grouped by target
+- **Throughput & Volume** — bar gauges for requests/sec and iterations/sec
+- **Build & Deploy Comparison** — bar gauges comparing build and deploy times
+- **Summary Table** — all metrics in a sortable table
+
+Use the **Target** and **Tag** dropdowns at the top to filter by implementation and run tag.
+
 ## Custom k6 scripts
 
 The toolkit ships with default k6 scripts for HTTP and gRPC APIs in `toolkit/k6/scripts/`. To use a custom script, set the `k6.script` path in your target config:
@@ -399,6 +418,7 @@ projects/                      # benchmark target projects
 .claude/
   commands/                    # Claude Code slash commands
   agents/                      # architecture agents for code generation
+grafana/                       # Grafana dashboard JSON (importable)
 results/                       # benchmark output (gitignored)
 ```
 
diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
new file mode 100644
index 0000000..fbd340b
--- /dev/null
+++ b/grafana/benchmark-dashboard.json
@@ -0,0 +1,723 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "Prometheus data source for benchmark metrics",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "11.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "stat",
+      "name": "Stat",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "barchart",
+      "name": "Bar chart",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "bargauge",
+      "name": "Bar gauge",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "table",
+      "name": "Table",
+      "version": ""
+    }
+  ],
+  "id": null,
+  "uid": null,
+  "title": "API Benchmark Results",
+  "description": "Compare benchmark results across API implementations — build time, deploy time, throughput, and latency",
+  "tags": ["benchmark", "api", "performance"],
+  "timezone": "browser",
+  "editable": true,
+  "graphTooltip": 1,
+  "time": {
+    "from": "now-7d",
+    "to": "now"
+  },
+  "refresh": "",
+  "schemaVersion": 39,
+  "version": 1,
+  "templating": {
+    "list": [
+      {
+        "name": "datasource",
+        "type": "datasource",
+        "query": "prometheus",
+        "current": {},
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "label": "Data Source"
+      },
+      {
+        "name": "target",
+        "type": "query",
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "query": "label_values(benchmark_reqs_per_sec, benchmark_target)",
+        "current": {},
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "allValue": ".*",
+        "label": "Target",
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "tag",
+        "type": "query",
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "query": "label_values(benchmark_reqs_per_sec{benchmark_target=~\"$target\"}, benchmark_tag)",
+        "current": {},
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "allValue": ".*",
+        "label": "Tag",
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "panels": [
+    {
+      "type": "row",
+      "title": "Overview",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "collapsed": false,
+      "panels": []
+    },
+    {
+      "type": "stat",
+      "title": "Throughput",
+      "description": "Requests per second (higher is better)",
+      "gridPos": { "h": 6, "w": 5, "x": 0, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "red" },
+              { "value": 100, "color": "yellow" },
+              { "value": 500, "color": "green" }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value_and_name",
+        "orientation": "vertical",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Error Rate",
+      "description": "Request failure rate (lower is better)",
+      "gridPos": { "h": 6, "w": 5, "x": 5, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 0.01, "color": "yellow" },
+              { "value": 0.05, "color": "red" }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value_and_name",
+        "orientation": "vertical",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_failed_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Checks Pass Rate",
+      "description": "k6 check pass rate (higher is better)",
+      "gridPos": { "h": 6, "w": 5, "x": 10, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "red" },
+              { "value": 0.95, "color": "yellow" },
+              { "value": 1, "color": "green" }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value_and_name",
+        "orientation": "vertical",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_checks_pass_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Build Duration",
+      "description": "Docker build time in milliseconds (lower is better)",
+      "gridPos": { "h": 6, "w": 5, "x": 15, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 30000, "color": "yellow" },
+              { "value": 120000, "color": "red" }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value_and_name",
+        "orientation": "vertical",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Deploy Duration",
+      "description": "Time to healthy in seconds",
+      "gridPos": { "h": 6, "w": 4, "x": 20, "y": 1 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 10000, "color": "yellow" },
+              { "value": 60000, "color": "red" }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value_and_name",
+        "orientation": "vertical",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Latency Comparison",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
+      "collapsed": false,
+      "panels": []
+    },
+    {
+      "type": "barchart",
+      "title": "Response Time by Percentile",
+      "description": "Latency distribution across targets (lower is better)",
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 8 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "palette-classic" }
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "showValue": "always",
+        "groupWidth": 0.7,
+        "barWidth": 0.9,
+        "stacking": "none",
+        "legend": { "displayMode": "list", "placement": "right" },
+        "tooltip": { "mode": "multi" },
+        "xTickLabelRotation": 0
+      },
+      "transformations": [
+        {
+          "id": "merge",
+          "options": {}
+        }
+      ],
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_avg{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — avg",
+          "refId": "A",
+          "instant": true
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_med{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — med",
+          "refId": "B",
+          "instant": true
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p90{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — p90",
+          "refId": "C",
+          "instant": true
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p95{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — p95",
+          "refId": "D",
+          "instant": true
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p99{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — p99",
+          "refId": "E",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Throughput & Volume",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 },
+      "collapsed": false,
+      "panels": []
+    },
+    {
+      "type": "bargauge",
+      "title": "Requests per Second",
+      "description": "Throughput comparison across targets (higher is better)",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "red" },
+              { "value": 100, "color": "yellow" },
+              { "value": 500, "color": "green" }
+            ]
+          },
+          "min": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "bargauge",
+      "title": "Iterations per Second",
+      "description": "Iteration throughput across targets",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "red" },
+              { "value": 100, "color": "yellow" },
+              { "value": 500, "color": "green" }
+            ]
+          },
+          "min": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_iterations_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Build & Deploy Comparison",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
+      "collapsed": false,
+      "panels": []
+    },
+    {
+      "type": "bargauge",
+      "title": "Build Time",
+      "description": "Docker build duration (lower is better)",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 30000, "color": "yellow" },
+              { "value": 120000, "color": "red" }
+            ]
+          },
+          "min": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "bargauge",
+      "title": "Deploy Time",
+      "description": "Time from docker compose up to healthy (lower is better)",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 10000, "color": "yellow" },
+              { "value": 60000, "color": "red" }
+            ]
+          },
+          "min": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "refId": "A",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Summary Table",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 },
+      "collapsed": false,
+      "panels": []
+    },
+    {
+      "type": "table",
+      "title": "All Metrics",
+      "description": "Full comparison table of all benchmark metrics",
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 37 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "benchmark_target" },
+            "properties": [{ "id": "displayName", "value": "Target" }]
+          },
+          {
+            "matcher": { "id": "byName", "options": "benchmark_tag" },
+            "properties": [{ "id": "displayName", "value": "Tag" }]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #reqs_per_sec" },
+            "properties": [
+              { "id": "displayName", "value": "Req/s" },
+              { "id": "unit", "value": "reqps" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #avg" },
+            "properties": [
+              { "id": "displayName", "value": "Avg (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #med" },
+            "properties": [
+              { "id": "displayName", "value": "Med (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #p90" },
+            "properties": [
+              { "id": "displayName", "value": "p90 (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #p95" },
+            "properties": [
+              { "id": "displayName", "value": "p95 (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #p99" },
+            "properties": [
+              { "id": "displayName", "value": "p99 (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #error_rate" },
+            "properties": [
+              { "id": "displayName", "value": "Error Rate" },
+              { "id": "unit", "value": "percentunit" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #build" },
+            "properties": [
+              { "id": "displayName", "value": "Build (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #deploy" },
+            "properties": [
+              { "id": "displayName", "value": "Deploy (ms)" },
+              { "id": "unit", "value": "ms" }
+            ]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{ "displayName": "Req/s", "desc": true }],
+        "footer": { "show": false }
+      },
+      "transformations": [
+        {
+          "id": "merge",
+          "options": {}
+        },
+        {
+          "id": "filterFieldsByName",
+          "options": {
+            "include": {
+              "pattern": "benchmark_target|benchmark_tag|Value.*"
+            }
+          }
+        }
+      ],
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "reqs_per_sec",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_avg{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "avg",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_med{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "med",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p90{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "p90",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p95{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "p95",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_duration_p99{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "p99",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_req_failed_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "error_rate",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "build",
+          "instant": true,
+          "format": "table"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "",
+          "refId": "deploy",
+          "instant": true,
+          "format": "table"
+        }
+      ]
+    }
+  ]
+}
diff --git a/toolkit/src/publish/otlp.js b/toolkit/src/publish/otlp.js
index 3d031c3..cc15c4e 100644
--- a/toolkit/src/publish/otlp.js
+++ b/toolkit/src/publish/otlp.js
@@ -13,11 +13,17 @@ export async function publishResults(results) {
   const resource = resourceFromAttributes({
     'service.name': '@labset/benchmark-toolkit',
     'service.version': '1.0.0',
+  });
+
+  // Data point attributes become Prometheus labels directly.
+  // Resource attributes only appear as labels if promoted by Grafana Cloud,
+  // and custom attributes like benchmark.target are not in the default promoted list.
+  const attributes = {
     'benchmark.target': results.target,
     'benchmark.tag': results.tag,
     'host.os': results.environment.os,
     'host.arch': results.environment.arch,
-  });
+  };
 
   const exporter = new OTLPMetricExporter();
   const meterProvider = new MeterProvider({
@@ -32,37 +38,40 @@ export async function publishResults(results) {
 
   const meter = meterProvider.getMeter('benchmark');
 
+  // Helper to record a gauge with shared attributes.
+  // Units are omitted to avoid Grafana Cloud appending suffixes
+  // (e.g. _milliseconds, _ratio) which vary by configuration.
+  const gauge = (name, value) => {
+    meter.createGauge(name).record(value, attributes);
+  };
+
   // Build metrics
   if (results.metrics.build) {
-    meter
-      .createGauge('benchmark.build.duration', { unit: 'ms' })
-      .record(results.metrics.build.durationMs);
+    gauge('benchmark.build.duration', results.metrics.build.durationMs);
   }
 
   // Deploy metrics
   if (results.metrics.deploy) {
-    meter
-      .createGauge('benchmark.deploy.duration', { unit: 'ms' })
-      .record(results.metrics.deploy.durationMs);
+    gauge('benchmark.deploy.duration', results.metrics.deploy.durationMs);
   }
 
   // Load test metrics
   if (results.metrics.loadtest) {
     const s = results.metrics.loadtest.summary;
 
-    meter.createGauge('benchmark.reqs', { unit: '1' }).record(s.reqs);
-    meter.createGauge('benchmark.reqs_per_sec', { unit: '1/s' }).record(s.reqsPerSec);
-    meter.createGauge('benchmark.req_failed_rate', { unit: '1' }).record(s.reqFailedRate);
-    meter.createGauge('benchmark.req.duration.avg', { unit: 'ms' }).record(s.reqDuration.avg);
-    meter.createGauge('benchmark.req.duration.min', { unit: 'ms' }).record(s.reqDuration.min);
-    meter.createGauge('benchmark.req.duration.med', { unit: 'ms' }).record(s.reqDuration.med);
-    meter.createGauge('benchmark.req.duration.max', { unit: 'ms' }).record(s.reqDuration.max);
-    meter.createGauge('benchmark.req.duration.p90', { unit: 'ms' }).record(s.reqDuration.p90);
-    meter.createGauge('benchmark.req.duration.p95', { unit: 'ms' }).record(s.reqDuration.p95);
-    meter.createGauge('benchmark.req.duration.p99', { unit: 'ms' }).record(s.reqDuration.p99);
-    meter.createGauge('benchmark.checks_pass_rate', { unit: '1' }).record(s.checksPassRate);
-    meter.createGauge('benchmark.iterations', { unit: '1' }).record(s.iterations);
-    meter.createGauge('benchmark.iterations_per_sec', { unit: '1/s' }).record(s.iterationsPerSec);
+    gauge('benchmark.reqs', s.reqs);
+    gauge('benchmark.reqs_per_sec', s.reqsPerSec);
+    gauge('benchmark.req_failed_rate', s.reqFailedRate);
+    gauge('benchmark.req.duration.avg', s.reqDuration.avg);
+    gauge('benchmark.req.duration.min', s.reqDuration.min);
+    gauge('benchmark.req.duration.med', s.reqDuration.med);
+    gauge('benchmark.req.duration.max', s.reqDuration.max);
+    gauge('benchmark.req.duration.p90', s.reqDuration.p90);
+    gauge('benchmark.req.duration.p95', s.reqDuration.p95);
+    gauge('benchmark.req.duration.p99', s.reqDuration.p99);
+    gauge('benchmark.checks_pass_rate', s.checksPassRate);
+    gauge('benchmark.iterations', s.iterations);
+    gauge('benchmark.iterations_per_sec', s.iterationsPerSec);
   }
 
   await meterProvider.forceFlush();

From 4787b20f510a1e5f52cb0e2f45002ae9163f7acb Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 11:28:22 +1100
Subject: [PATCH 15/23] Fix stat panels to use instant queries for gauge data

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 grafana/benchmark-dashboard.json | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index fbd340b..c0247f9 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -144,7 +144,8 @@
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
           "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A"
+          "refId": "A",
+          "instant": true
         }
       ]
     },
@@ -180,7 +181,8 @@
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_failed_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
           "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A"
+          "refId": "A",
+          "instant": true
         }
       ]
     },
@@ -216,7 +218,8 @@
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_checks_pass_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
           "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A"
+          "refId": "A",
+          "instant": true
         }
       ]
     },
@@ -252,7 +255,8 @@
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
           "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A"
+          "refId": "A",
+          "instant": true
         }
       ]
     },
@@ -288,7 +292,8 @@
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
           "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A"
+          "refId": "A",
+          "instant": true
         }
       ]
     },

From 05ef7ce998ca11b980fa29b38c2a0fa0e7d04f85 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 12:42:16 +1100
Subject: [PATCH 16/23] Address PR review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix k6 Docker networking: use --network host on Linux, --add-host
  host.docker.internal on macOS/Windows for Docker Desktop compatibility
- Use k6 image tag as version instead of running a container to detect it
- Restore .values accessor in k6 summary parser (--summary-export format)
- Fix checksPassRate to use checks.values.rate instead of checks.value
- Fix dashboard panel descriptions: "in seconds" → "in milliseconds"
- Update README: clarify OTLP credentials are read at publish time

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 README.md                        |  2 +-
 grafana/benchmark-dashboard.json |  4 ++--
 toolkit/src/core/k6.js           |  9 +++++++--
 toolkit/src/metrics/collector.js |  4 ++--
 toolkit/src/metrics/k6-parser.js | 19 ++++++++++---------
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 4499052..0e0f82d 100644
--- a/README.md
+++ b/README.md
@@ -168,7 +168,7 @@ All targets are defined in `benchmark.config.json` at the repo root. Each target
 
 ### Environment variables
 
-Grafana Cloud credentials are resolved from environment variables at config load time. The toolkit automatically loads a `.env` file from the repository root (see [Setup](#grafana-cloud-optional)).
+Grafana Cloud credentials are read from the standard `OTEL_EXPORTER_OTLP_*` environment variables when publishing results. The CLI automatically loads a `.env` file from the repository root (see [Setup](#grafana-cloud-optional)).
 
 | Variable                       | Description                                  |
 | ------------------------------ | -------------------------------------------- |
diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index c0247f9..697a1f5 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -226,7 +226,7 @@
     {
       "type": "stat",
       "title": "Build Duration",
-      "description": "Docker build time in seconds",
+      "description": "Docker build time in milliseconds",
       "gridPos": { "h": 6, "w": 5, "x": 15, "y": 1 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
@@ -263,7 +263,7 @@
     {
       "type": "stat",
       "title": "Deploy Duration",
-      "description": "Time to healthy in seconds",
+      "description": "Time from deploy to healthy in milliseconds",
       "gridPos": { "h": 6, "w": 4, "x": 20, "y": 1 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
diff --git a/toolkit/src/core/k6.js b/toolkit/src/core/k6.js
index d54a71a..32634c9 100644
--- a/toolkit/src/core/k6.js
+++ b/toolkit/src/core/k6.js
@@ -24,14 +24,19 @@ export async function runK6(scriptPath, options = {}) {
   const dockerArgs = [
     'run',
     '--rm',
-    '--network',
-    'host',
     '-v',
     `${workDir}:/workspace:ro`,
     '-v',
     `${summaryDir}:/results`,
   ];
 
+  if (process.platform === 'linux') {
+    dockerArgs.push('--network', 'host');
+  } else {
+    // Docker Desktop (macOS/Windows): use host.docker.internal to reach host ports
+    dockerArgs.push('--add-host', 'host.docker.internal:host-gateway');
+  }
+
   for (const [key, value] of Object.entries(env)) {
     const mapped = value.startsWith('./') ? toContainerPath(value) : value;
     dockerArgs.push('-e', `${key}=${mapped}`);
diff --git a/toolkit/src/metrics/collector.js b/toolkit/src/metrics/collector.js
index fc31f07..48e1ee9 100644
--- a/toolkit/src/metrics/collector.js
+++ b/toolkit/src/metrics/collector.js
@@ -11,11 +11,11 @@ async function getVersion(command, args) {
 }
 
 async function collectEnvironment() {
-  const [dockerVersion, nodeVersion, k6Version] = await Promise.all([
+  const [dockerVersion, nodeVersion] = await Promise.all([
     getVersion('docker', ['--version']),
     getVersion('node', ['--version']),
-    getVersion('docker', ['run', '--rm', 'grafana/k6:1.6.1', 'version']),
   ]);
+  const k6Version = 'grafana/k6:1.6.1';
 
   return {
     os: platform(),
diff --git a/toolkit/src/metrics/k6-parser.js b/toolkit/src/metrics/k6-parser.js
index 3494c66..8149625 100644
--- a/toolkit/src/metrics/k6-parser.js
+++ b/toolkit/src/metrics/k6-parser.js
@@ -1,14 +1,15 @@
 export function parseK6Summary(raw) {
   const metrics = raw.metrics || {};
 
-  // Support both HTTP and gRPC protocols
-  const reqDuration = metrics.http_req_duration || metrics.grpc_req_duration || {};
-  const reqs = metrics.http_reqs || {};
-  const reqFailed = metrics.http_req_failed || {};
-  const checks = metrics.checks || {};
-  const iterations = metrics.iterations || {};
-  const dataReceived = metrics.data_received || {};
-  const dataSent = metrics.data_sent || {};
+  // Support both HTTP and gRPC protocols.
+  // k6 --summary-export nests metric data under a .values object.
+  const reqDuration = metrics.http_req_duration?.values || metrics.grpc_req_duration?.values || {};
+  const reqs = metrics.http_reqs?.values || {};
+  const reqFailed = metrics.http_req_failed?.values || {};
+  const checks = metrics.checks?.values || {};
+  const iterations = metrics.iterations?.values || {};
+  const dataReceived = metrics.data_received?.values || {};
+  const dataSent = metrics.data_sent?.values || {};
 
   return {
     reqs: reqs.count ?? iterations.count ?? 0,
@@ -27,6 +28,6 @@ export function parseK6Summary(raw) {
     iterationsPerSec: iterations.rate ?? 0,
     dataReceived: dataReceived.count ?? 0,
     dataSent: dataSent.count ?? 0,
-    checksPassRate: checks.value ?? 0,
+    checksPassRate: checks.rate ?? 0,
   };
 }

From 2b5a404a03a1a6c853ddac6241cdca93106b0c4b Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 12:52:43 +1100
Subject: [PATCH 17/23] Add per-stage Docker build metrics

- Parse BuildKit --progress=plain output to extract per-stage durations
  (e.g. generate, builder, runtime)
- Publish per-stage metrics as benchmark.build.stage.duration with
  benchmark.build.stage label for Grafana filtering
- Add stage columns to comparison table
- Add Build Stage Breakdown panel to Grafana dashboard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 grafana/benchmark-dashboard.json    | 44 +++++++++++++++++++++++++++--
 toolkit/src/cli/commands/build.js   |  9 ++++--
 toolkit/src/cli/commands/run.js     |  8 ++++--
 toolkit/src/core/docker.js          |  3 +-
 toolkit/src/metrics/build-parser.js | 35 +++++++++++++++++++++++
 toolkit/src/publish/otlp.js         |  9 ++++++
 toolkit/src/report/compare.js       | 27 ++++++++++++++++--
 toolkit/src/report/table.js         | 30 +++++++++++---------
 8 files changed, 139 insertions(+), 26 deletions(-)
 create mode 100644 toolkit/src/metrics/build-parser.js

diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index 697a1f5..fc96d1c 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -543,16 +543,56 @@
     },
     {
       "type": "row",
-      "title": "Summary Table",
+      "title": "Build Stage Breakdown",
       "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 },
       "collapsed": false,
       "panels": []
     },
+    {
+      "type": "barchart",
+      "title": "Build Stage Duration",
+      "description": "Time spent in each Dockerfile build stage (lower is better)",
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 37 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "palette-classic" }
+        },
+        "overrides": []
+      },
+      "options": {
+        "orientation": "horizontal",
+        "showValue": "always",
+        "groupWidth": 0.7,
+        "barWidth": 0.9,
+        "stacking": "none",
+        "legend": { "displayMode": "list", "placement": "right" },
+        "tooltip": { "mode": "multi" },
+        "xTickLabelRotation": 0
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "benchmark_build_stage_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
+          "legendFormat": "{{benchmark_target}} — {{benchmark_build_stage}}",
+          "refId": "A",
+          "instant": true
+        }
+      ]
+    },
+    {
+      "type": "row",
+      "title": "Summary Table",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 },
+      "collapsed": false,
+      "panels": []
+    },
     {
       "type": "table",
       "title": "All Metrics",
       "description": "Full comparison table of all benchmark metrics",
-      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 37 },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {},
diff --git a/toolkit/src/cli/commands/build.js b/toolkit/src/cli/commands/build.js
index 884d643..f01384b 100644
--- a/toolkit/src/cli/commands/build.js
+++ b/toolkit/src/cli/commands/build.js
@@ -2,6 +2,7 @@ import { Command } from 'commander';
 import { resolve } from 'node:path';
 import { loadConfig, resolveTarget } from '../../config/loader.js';
 import { composeBuild } from '../../core/docker.js';
+import { parseBuildStages } from '../../metrics/build-parser.js';
 import { startTimer } from '../../core/timer.js';
 import { getLogger } from '../../util/logger.js';
 
@@ -19,14 +20,16 @@ export function buildCommand() {
       const projectDir = resolve(target.path);
       const timer = startTimer();
 
-      await composeBuild(projectDir, {
+      const result = await composeBuild(projectDir, {
         composeFile: target.composeFile,
         noCache: !options.cache,
       });
 
       const { durationMs, durationSec } = timer.stop();
-      log.info({ target: targetName, durationMs, durationSec, msg: 'build completed' });
+      const stages = parseBuildStages(result.stderr);
 
-      return { durationMs, cached: options.cache };
+      log.info({ target: targetName, durationMs, durationSec, stages, msg: 'build completed' });
+
+      return { durationMs, cached: options.cache, stages };
     });
 }
diff --git a/toolkit/src/cli/commands/run.js b/toolkit/src/cli/commands/run.js
index e4b0150..f02f20e 100644
--- a/toolkit/src/cli/commands/run.js
+++ b/toolkit/src/cli/commands/run.js
@@ -5,6 +5,7 @@ import { composeBuild, composeUp, composeDown } from '../../core/docker.js';
 import { waitForHealthy } from '../../core/health.js';
 import { runK6 } from '../../core/k6.js';
 import { startTimer } from '../../core/timer.js';
+import { parseBuildStages } from '../../metrics/build-parser.js';
 import { parseK6Summary } from '../../metrics/k6-parser.js';
 import { collectResults } from '../../metrics/collector.js';
 import { writeResults } from '../../report/json.js';
@@ -38,13 +39,14 @@ export function runCommand() {
         if (!options.skipBuild) {
           log.info({ target: targetName, msg: 'phase: build' });
           const timer = startTimer();
-          await composeBuild(projectDir, {
+          const buildOutput = await composeBuild(projectDir, {
             composeFile: target.composeFile,
             noCache: !options.cache,
           });
           const { durationMs } = timer.stop();
-          buildResult = { durationMs, cached: options.cache };
-          log.info({ durationMs, msg: 'build completed' });
+          const stages = parseBuildStages(buildOutput.stderr);
+          buildResult = { durationMs, cached: options.cache, stages };
+          log.info({ durationMs, stages, msg: 'build completed' });
         }
 
         // --- Deploy phase ---
diff --git a/toolkit/src/core/docker.js b/toolkit/src/core/docker.js
index 41bd3d4..7deb9e8 100644
--- a/toolkit/src/core/docker.js
+++ b/toolkit/src/core/docker.js
@@ -9,13 +9,12 @@ export async function composeBuild(projectDir, options = {}) {
   const log = getLogger();
   const { composeFile = 'docker-compose.yml', noCache = true } = options;
 
-  const args = composeArgs(composeFile, ['build']);
+  const args = composeArgs(composeFile, ['build', '--progress=plain']);
   if (noCache) args.push('--no-cache');
 
   log.info({ projectDir, noCache, msg: 'building docker compose project' });
   return exec('docker', ['compose', ...args], {
     cwd: projectDir,
-    stdio: 'inherit',
   });
 }
 
diff --git a/toolkit/src/metrics/build-parser.js b/toolkit/src/metrics/build-parser.js
new file mode 100644
index 0000000..78e5a9f
--- /dev/null
+++ b/toolkit/src/metrics/build-parser.js
@@ -0,0 +1,35 @@
+/**
+ * Parse Docker BuildKit --progress=plain output to extract per-stage durations.
+ *
+ * BuildKit output format:
+ *   #5 [generate 2/8] RUN apk add --no-cache git
+ *   #5 DONE 1.2s
+ *
+ * Returns an object mapping stage names (excluding `internal`) to the summed DONE
+ * time of their steps in milliseconds, e.g. { generate: 5200, builder: 12300, runtime: 800 }
+ */
+export function parseBuildStages(output) {
+  const stepStage = new Map();
+  const stageDurations = {};
+
+  const stagePattern = /#(\d+) \[(\S+)\s+\d+\/\d+\]/;
+  const donePattern = /#(\d+) DONE (\d+\.?\d*)s/;
+
+  for (const line of output.split('\n')) {
+    const stageMatch = line.match(stagePattern);
+    if (stageMatch) {
+      stepStage.set(stageMatch[1], stageMatch[2]);
+    }
+
+    const doneMatch = line.match(donePattern);
+    if (doneMatch) {
+      const stage = stepStage.get(doneMatch[1]);
+      if (stage && stage !== 'internal') {
+        const durationMs = Math.round(parseFloat(doneMatch[2]) * 1000);
+        stageDurations[stage] = (stageDurations[stage] ?? 0) + durationMs;
+      }
+    }
+  }
+
+  return stageDurations;
+}
diff --git a/toolkit/src/publish/otlp.js b/toolkit/src/publish/otlp.js
index cc15c4e..5469186 100644
--- a/toolkit/src/publish/otlp.js
+++ b/toolkit/src/publish/otlp.js
@@ -48,6 +48,15 @@ export async function publishResults(results) {
   // Build metrics
   if (results.metrics.build) {
     gauge('benchmark.build.duration', results.metrics.build.durationMs);
+
+    // Per-stage build durations
+    const stages = results.metrics.build.stages;
+    if (stages) {
+      const stageGauge = meter.createGauge('benchmark.build.stage.duration');
+      for (const [stage, durationMs] of Object.entries(stages)) {
+        stageGauge.record(durationMs, { ...attributes, 'benchmark.build.stage': stage });
+      }
+    }
   }
 
   // Deploy metrics
diff --git a/toolkit/src/report/compare.js b/toolkit/src/report/compare.js
index d3a4343..7a387d9 100644
--- a/toolkit/src/report/compare.js
+++ b/toolkit/src/report/compare.js
@@ -20,6 +20,9 @@ export function compareResults(results) {
 
     if (r.metrics.build) {
       row.buildMs = r.metrics.build.durationMs;
+      if (r.metrics.build.stages) {
+        row.stages = r.metrics.build.stages;
+      }
     }
 
     if (r.metrics.deploy) {
@@ -39,13 +42,31 @@ export function compareResults(results) {
     return row;
   });
 
+  // Collect all stage names across results
+  const stageNames = [...new Set(rows.flatMap((r) => Object.keys(r.stages ?? {})))].sort();
+
   // Find best values for highlighting
   const best = {};
-  const numericKeys = ['buildMs', 'deployMs', 'avgMs', 'p90Ms', 'p95Ms', 'p99Ms', 'errorRate'];
+  const numericKeys = [
+    'buildMs',
+    'deployMs',
+    'avgMs',
+    'p90Ms',
+    'p95Ms',
+    'p99Ms',
+    'errorRate',
+    ...stageNames.map((s) => `stage:${s}`),
+  ];
   const higherIsBetter = ['reqsPerSec'];
 
   for (const key of numericKeys) {
-    const values = rows.map((r) => r[key]).filter((v) => v !== undefined);
+    let values;
+    if (key.startsWith('stage:')) {
+      const stage = key.slice(6);
+      values = rows.map((r) => r.stages?.[stage]).filter((v) => v !== undefined);
+    } else {
+      values = rows.map((r) => r[key]).filter((v) => v !== undefined);
+    }
     if (values.length > 0) best[key] = Math.min(...values);
   }
 
@@ -54,5 +75,5 @@ export function compareResults(results) {
     if (values.length > 0) best[key] = Math.max(...values);
   }
 
-  return { rows, best };
+  return { rows, best, stageNames };
 }
diff --git a/toolkit/src/report/table.js b/toolkit/src/report/table.js
index 3795c35..cf89fd5 100644
--- a/toolkit/src/report/table.js
+++ b/toolkit/src/report/table.js
@@ -12,20 +12,23 @@ function highlight(value, bestValue) {
   return value === bestValue ? chalk.green(formatted) : formatted;
 }
 
-export function renderComparisonTable({ rows, best }) {
+export function renderComparisonTable({ rows, best, stageNames = [] }) {
+  const head = [
+    chalk.bold('Target'),
+    chalk.bold('Tag'),
+    chalk.bold('Build (ms)'),
+    ...stageNames.map((s) => chalk.bold(`${s} (ms)`)),
+    chalk.bold('Deploy (ms)'),
+    chalk.bold('Reqs/s'),
+    chalk.bold('Avg (ms)'),
+    chalk.bold('p90 (ms)'),
+    chalk.bold('p95 (ms)'),
+    chalk.bold('p99 (ms)'),
+    chalk.bold('Error %'),
+  ];
+
   const table = new Table({
-    head: [
-      chalk.bold('Target'),
-      chalk.bold('Tag'),
-      chalk.bold('Build (ms)'),
-      chalk.bold('Deploy (ms)'),
-      chalk.bold('Reqs/s'),
-      chalk.bold('Avg (ms)'),
-      chalk.bold('p90 (ms)'),
-      chalk.bold('p95 (ms)'),
-      chalk.bold('p99 (ms)'),
-      chalk.bold('Error %'),
-    ],
+    head,
     style: { head: [], border: [] },
   });
 
@@ -34,6 +37,7 @@ export function renderComparisonTable({ rows, best }) {
       row.target,
       row.tag ?? '-',
       highlight(row.buildMs, best.buildMs),
+      ...stageNames.map((s) => highlight(row.stages?.[s], best[`stage:${s}`])),
       highlight(row.deployMs, best.deployMs),
       highlight(row.reqsPerSec, best.reqsPerSec),
       highlight(row.avgMs, best.avgMs),

From 754c9e65f21987322894a91675d35bc997fe99de Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 12:55:40 +1100
Subject: [PATCH 18/23] Fix k6 connectivity on macOS and add health check
 before loadtest

- Restore --network host on all platforms (works on Docker Desktop)
- Rewrite localhost/127.0.0.1 to host.docker.internal in k6 env vars on
  non-Linux so the k6 container can reach services published on the host
- Add health check to standalone loadtest command to prevent running
  k6 against a service that is not ready

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolkit/src/cli/commands/loadtest.js |  4 ++++
 toolkit/src/core/k6.js               | 19 +++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/toolkit/src/cli/commands/loadtest.js b/toolkit/src/cli/commands/loadtest.js
index c4cddc6..7a389b8 100644
--- a/toolkit/src/cli/commands/loadtest.js
+++ b/toolkit/src/cli/commands/loadtest.js
@@ -1,6 +1,7 @@
 import { Command } from 'commander';
 import { resolve } from 'node:path';
 import { loadConfig, resolveTarget } from '../../config/loader.js';
+import { waitForHealthy } from '../../core/health.js';
 import { runK6 } from '../../core/k6.js';
 import { parseK6Summary } from '../../metrics/k6-parser.js';
 import { getLogger } from '../../util/logger.js';
@@ -17,6 +18,9 @@ export function loadtestCommand() {
       const config = await loadConfig(globalOpts.config);
       const target = resolveTarget(config, targetName);
 
+      // Verify the service is healthy before starting the load test
+      await waitForHealthy(target);
+
       const scriptPath = resolve(target.k6.script);
       const vus = options.k6Vus ? parseInt(options.k6Vus, 10) : target.k6.vus;
       const duration = options.k6Duration ?? target.k6.duration;
diff --git a/toolkit/src/core/k6.js b/toolkit/src/core/k6.js
index 32634c9..14612ab 100644
--- a/toolkit/src/core/k6.js
+++ b/toolkit/src/core/k6.js
@@ -21,24 +21,27 @@ export async function runK6(scriptPath, options = {}) {
     return `/workspace/${relative(workDir, abs)}`;
   };
 
+  const isLinux = process.platform === 'linux';
+
   const dockerArgs = [
     'run',
     '--rm',
+    '--network',
+    'host',
     '-v',
     `${workDir}:/workspace:ro`,
     '-v',
     `${summaryDir}:/results`,
   ];
 
-  if (process.platform === 'linux') {
-    dockerArgs.push('--network', 'host');
-  } else {
-    // Docker Desktop (macOS/Windows): use host.docker.internal to reach host ports
-    dockerArgs.push('--add-host', 'host.docker.internal:host-gateway');
-  }
-
   for (const [key, value] of Object.entries(env)) {
-    const mapped = value.startsWith('./') ? toContainerPath(value) : value;
+    let mapped = value.startsWith('./') ? toContainerPath(value) : value;
+    // Docker Desktop (macOS/Windows): --network host still works, but localhost
+    // inside the container resolves to the Docker VM, not the host machine.
+    // Rewrite localhost/127.0.0.1 to host.docker.internal so k6 can reach published ports.
+    if (!isLinux) {
+      mapped = mapped.replace(/localhost|127\.0\.0\.1/g, 'host.docker.internal');
+    }
     dockerArgs.push('-e', `${key}=${mapped}`);
   }
 

From 24241acd9fd6c0d63976f21bee066b06fc98d1ab Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 13:01:47 +1100
Subject: [PATCH 19/23] Simplify Grafana dashboard for legibility

Reduce from 12 panels to 6 by removing redundant stat panels,
duplicate bar gauges, and the summary table. Use consistent
bar gauge and bar chart panels with clear labels and
palette-classic-by-name coloring for easy target comparison.

Layout:
- Performance: Throughput + Error Rate (bar gauges)
- Latency: percentile distribution (bar chart)
- Build & Deploy: build time + deploy time (bar gauges)
- Build Stages: per-stage breakdown (bar chart)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 grafana/benchmark-dashboard.json | 593 ++++---------------------------
 1 file changed, 74 insertions(+), 519 deletions(-)

diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index fc96d1c..cf270ae 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -10,55 +10,20 @@
     }
   ],
   "__requires": [
-    {
-      "type": "grafana",
-      "id": "grafana",
-      "name": "Grafana",
-      "version": "11.0.0"
-    },
-    {
-      "type": "datasource",
-      "id": "prometheus",
-      "name": "Prometheus",
-      "version": "1.0.0"
-    },
-    {
-      "type": "panel",
-      "id": "stat",
-      "name": "Stat",
-      "version": ""
-    },
-    {
-      "type": "panel",
-      "id": "barchart",
-      "name": "Bar chart",
-      "version": ""
-    },
-    {
-      "type": "panel",
-      "id": "bargauge",
-      "name": "Bar gauge",
-      "version": ""
-    },
-    {
-      "type": "panel",
-      "id": "table",
-      "name": "Table",
-      "version": ""
-    }
+    { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "11.0.0" },
+    { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" },
+    { "type": "panel", "id": "barchart", "name": "Bar chart", "version": "" },
+    { "type": "panel", "id": "bargauge", "name": "Bar gauge", "version": "" }
   ],
   "id": null,
   "uid": null,
   "title": "API Benchmark Results",
-  "description": "Compare benchmark results across API implementations — build time, deploy time, throughput, and latency",
-  "tags": ["benchmark", "api", "performance"],
+  "description": "Compare benchmark results across API implementations",
+  "tags": ["benchmark"],
   "timezone": "browser",
   "editable": true,
   "graphTooltip": 1,
-  "time": {
-    "from": "now-7d",
-    "to": "now"
-  },
+  "time": { "from": "now-7d", "to": "now" },
   "refresh": "",
   "schemaVersion": 39,
   "version": 1,
@@ -107,53 +72,49 @@
   "panels": [
     {
       "type": "row",
-      "title": "Overview",
+      "title": "Performance",
       "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
       "collapsed": false,
       "panels": []
     },
     {
-      "type": "stat",
-      "title": "Throughput",
-      "description": "Requests per second (higher is better)",
-      "gridPos": { "h": 6, "w": 5, "x": 0, "y": 1 },
+      "type": "bargauge",
+      "title": "Throughput (req/s)",
+      "description": "Higher is better",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
           "unit": "reqps",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "red" },
-              { "value": 100, "color": "yellow" },
-              { "value": 500, "color": "green" }
-            ]
-          }
+          "color": { "mode": "palette-classic-by-name" },
+          "min": 0
         },
         "overrides": []
       },
       "options": {
-        "graphMode": "none",
-        "textMode": "value_and_name",
-        "orientation": "vertical",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color",
+        "namePlacement": "left",
+        "sizing": "auto"
       },
       "targets": [
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "legendFormat": "{{benchmark_target}}",
           "refId": "A",
           "instant": true
         }
       ]
     },
     {
-      "type": "stat",
+      "type": "bargauge",
       "title": "Error Rate",
-      "description": "Request failure rate (lower is better)",
-      "gridPos": { "h": 6, "w": 5, "x": 5, "y": 1 },
+      "description": "Lower is better",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -166,132 +127,26 @@
               { "value": 0.01, "color": "yellow" },
               { "value": 0.05, "color": "red" }
             ]
-          }
+          },
+          "min": 0,
+          "max": 1
         },
         "overrides": []
       },
       "options": {
-        "graphMode": "none",
-        "textMode": "value_and_name",
-        "orientation": "vertical",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true,
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "valueMode": "color",
+        "namePlacement": "left",
+        "sizing": "auto"
       },
       "targets": [
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_failed_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A",
-          "instant": true
-        }
-      ]
-    },
-    {
-      "type": "stat",
-      "title": "Checks Pass Rate",
-      "description": "k6 check pass rate (higher is better)",
-      "gridPos": { "h": 6, "w": 5, "x": 10, "y": 1 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percentunit",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "red" },
-              { "value": 0.95, "color": "yellow" },
-              { "value": 1, "color": "green" }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "graphMode": "none",
-        "textMode": "value_and_name",
-        "orientation": "vertical",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_checks_pass_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A",
-          "instant": true
-        }
-      ]
-    },
-    {
-      "type": "stat",
-      "title": "Build Duration",
-      "description": "Docker build time in milliseconds",
-      "gridPos": { "h": 6, "w": 5, "x": 15, "y": 1 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "green" },
-              { "value": 30000, "color": "yellow" },
-              { "value": 120000, "color": "red" }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "graphMode": "none",
-        "textMode": "value_and_name",
-        "orientation": "vertical",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A",
-          "instant": true
-        }
-      ]
-    },
-    {
-      "type": "stat",
-      "title": "Deploy Duration",
-      "description": "Time from deploy to healthy in milliseconds",
-      "gridPos": { "h": 6, "w": 4, "x": 20, "y": 1 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "green" },
-              { "value": 10000, "color": "yellow" },
-              { "value": 60000, "color": "red" }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "graphMode": "none",
-        "textMode": "value_and_name",
-        "orientation": "vertical",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "legendFormat": "{{benchmark_target}}",
           "refId": "A",
           "instant": true
         }
@@ -299,16 +154,16 @@
     },
     {
       "type": "row",
-      "title": "Latency Comparison",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
+      "title": "Latency",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 },
       "collapsed": false,
       "panels": []
     },
     {
       "type": "barchart",
-      "title": "Response Time by Percentile",
-      "description": "Latency distribution across targets (lower is better)",
-      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 8 },
+      "title": "Response Time by Percentile (ms)",
+      "description": "Lower is better",
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -318,169 +173,63 @@
         "overrides": []
       },
       "options": {
-        "orientation": "horizontal",
+        "orientation": "vertical",
         "showValue": "always",
         "groupWidth": 0.7,
-        "barWidth": 0.9,
+        "barWidth": 0.8,
         "stacking": "none",
-        "legend": { "displayMode": "list", "placement": "right" },
+        "legend": { "displayMode": "list", "placement": "bottom" },
         "tooltip": { "mode": "multi" },
         "xTickLabelRotation": 0
       },
-      "transformations": [
-        {
-          "id": "merge",
-          "options": {}
-        }
-      ],
       "targets": [
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_duration_avg{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — avg",
+          "legendFormat": "{{benchmark_target}} avg",
           "refId": "A",
           "instant": true
         },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_med{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — med",
-          "refId": "B",
-          "instant": true
-        },
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_duration_p90{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — p90",
-          "refId": "C",
+          "legendFormat": "{{benchmark_target}} p90",
+          "refId": "B",
           "instant": true
         },
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_duration_p95{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — p95",
-          "refId": "D",
+          "legendFormat": "{{benchmark_target}} p95",
+          "refId": "C",
           "instant": true
         },
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_req_duration_p99{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — p99",
-          "refId": "E",
-          "instant": true
-        }
-      ]
-    },
-    {
-      "type": "row",
-      "title": "Throughput & Volume",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 },
-      "collapsed": false,
-      "panels": []
-    },
-    {
-      "type": "bargauge",
-      "title": "Requests per Second",
-      "description": "Throughput comparison across targets (higher is better)",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "reqps",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "red" },
-              { "value": 100, "color": "yellow" },
-              { "value": 500, "color": "green" }
-            ]
-          },
-          "min": 0
-        },
-        "overrides": []
-      },
-      "options": {
-        "orientation": "horizontal",
-        "displayMode": "gradient",
-        "showUnfilled": true,
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "valueMode": "color"
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A",
-          "instant": true
-        }
-      ]
-    },
-    {
-      "type": "bargauge",
-      "title": "Iterations per Second",
-      "description": "Iteration throughput across targets",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ipm",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "red" },
-              { "value": 100, "color": "yellow" },
-              { "value": 500, "color": "green" }
-            ]
-          },
-          "min": 0
-        },
-        "overrides": []
-      },
-      "options": {
-        "orientation": "horizontal",
-        "displayMode": "gradient",
-        "showUnfilled": true,
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "valueMode": "color"
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_iterations_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
-          "refId": "A",
+          "legendFormat": "{{benchmark_target}} p99",
+          "refId": "D",
           "instant": true
         }
       ]
     },
     {
       "type": "row",
-      "title": "Build & Deploy Comparison",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
+      "title": "Build & Deploy",
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
       "collapsed": false,
       "panels": []
     },
     {
       "type": "bargauge",
       "title": "Build Time",
-      "description": "Docker build duration (lower is better)",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 },
+      "description": "Total Docker build duration (lower is better)",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
           "unit": "ms",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "green" },
-              { "value": 30000, "color": "yellow" },
-              { "value": 120000, "color": "red" }
-            ]
-          },
+          "color": { "mode": "palette-classic-by-name" },
           "min": 0
         },
         "overrides": []
@@ -490,13 +239,15 @@
         "displayMode": "gradient",
         "showUnfilled": true,
         "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "valueMode": "color"
+        "valueMode": "color",
+        "namePlacement": "left",
+        "sizing": "auto"
       },
       "targets": [
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "legendFormat": "{{benchmark_target}}",
           "refId": "A",
           "instant": true
         }
@@ -505,21 +256,13 @@
     {
       "type": "bargauge",
       "title": "Deploy Time",
-      "description": "Time from docker compose up to healthy (lower is better)",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 },
+      "description": "Time to healthy (lower is better)",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
           "unit": "ms",
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "value": null, "color": "green" },
-              { "value": 10000, "color": "yellow" },
-              { "value": 60000, "color": "red" }
-            ]
-          },
+          "color": { "mode": "palette-classic-by-name" },
           "min": 0
         },
         "overrides": []
@@ -529,30 +272,25 @@
         "displayMode": "gradient",
         "showUnfilled": true,
         "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "valueMode": "color"
+        "valueMode": "color",
+        "namePlacement": "left",
+        "sizing": "auto"
       },
       "targets": [
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} [{{benchmark_tag}}]",
+          "legendFormat": "{{benchmark_target}}",
           "refId": "A",
           "instant": true
         }
       ]
     },
-    {
-      "type": "row",
-      "title": "Build Stage Breakdown",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 },
-      "collapsed": false,
-      "panels": []
-    },
     {
       "type": "barchart",
-      "title": "Build Stage Duration",
-      "description": "Time spent in each Dockerfile build stage (lower is better)",
-      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 37 },
+      "title": "Build Stage Breakdown (ms)",
+      "description": "Time spent per Dockerfile stage (lower is better)",
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 29 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -562,12 +300,12 @@
         "overrides": []
       },
       "options": {
-        "orientation": "horizontal",
+        "orientation": "vertical",
         "showValue": "always",
         "groupWidth": 0.7,
-        "barWidth": 0.9,
+        "barWidth": 0.8,
         "stacking": "none",
-        "legend": { "displayMode": "list", "placement": "right" },
+        "legend": { "displayMode": "list", "placement": "bottom" },
         "tooltip": { "mode": "multi" },
         "xTickLabelRotation": 0
       },
@@ -575,194 +313,11 @@
         {
           "datasource": { "type": "prometheus", "uid": "${datasource}" },
           "expr": "benchmark_build_stage_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} — {{benchmark_build_stage}}",
+          "legendFormat": "{{benchmark_target}} {{benchmark_build_stage}}",
           "refId": "A",
           "instant": true
         }
       ]
-    },
-    {
-      "type": "row",
-      "title": "Summary Table",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 },
-      "collapsed": false,
-      "panels": []
-    },
-    {
-      "type": "table",
-      "title": "All Metrics",
-      "description": "Full comparison table of all benchmark metrics",
-      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {},
-        "overrides": [
-          {
-            "matcher": { "id": "byName", "options": "benchmark_target" },
-            "properties": [{ "id": "displayName", "value": "Target" }]
-          },
-          {
-            "matcher": { "id": "byName", "options": "benchmark_tag" },
-            "properties": [{ "id": "displayName", "value": "Tag" }]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #reqs_per_sec" },
-            "properties": [
-              { "id": "displayName", "value": "Req/s" },
-              { "id": "unit", "value": "reqps" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #avg" },
-            "properties": [
-              { "id": "displayName", "value": "Avg (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #med" },
-            "properties": [
-              { "id": "displayName", "value": "Med (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #p90" },
-            "properties": [
-              { "id": "displayName", "value": "p90 (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #p95" },
-            "properties": [
-              { "id": "displayName", "value": "p95 (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #p99" },
-            "properties": [
-              { "id": "displayName", "value": "p99 (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #error_rate" },
-            "properties": [
-              { "id": "displayName", "value": "Error Rate" },
-              { "id": "unit", "value": "percentunit" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #build" },
-            "properties": [
-              { "id": "displayName", "value": "Build (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #deploy" },
-            "properties": [
-              { "id": "displayName", "value": "Deploy (ms)" },
-              { "id": "unit", "value": "ms" }
-            ]
-          }
-        ]
-      },
-      "options": {
-        "showHeader": true,
-        "sortBy": [{ "displayName": "Req/s", "desc": true }],
-        "footer": { "show": false }
-      },
-      "transformations": [
-        {
-          "id": "merge",
-          "options": {}
-        },
-        {
-          "id": "filterFieldsByName",
-          "options": {
-            "include": {
-              "pattern": "benchmark_target|benchmark_tag|Value.*"
-            }
-          }
-        }
-      ],
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_reqs_per_sec{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "reqs_per_sec",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_avg{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "avg",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_med{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "med",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_p90{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "p90",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_p95{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "p95",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_duration_p99{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "p99",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_req_failed_rate{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "error_rate",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_build_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "build",
-          "instant": true,
-          "format": "table"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_deploy_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "",
-          "refId": "deploy",
-          "instant": true,
-          "format": "table"
-        }
-      ]
     }
   ]
 }

From ae3fcf00903f001a95a5c3f89be908d62df105a6 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 13:03:27 +1100
Subject: [PATCH 20/23] Fix build stage parsing to check both stdout and stderr

BuildKit progress output may go to stdout or stderr depending on
Docker version and compose configuration. Parse both to reliably
extract per-stage durations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolkit/src/cli/commands/build.js   | 2 +-
 toolkit/src/cli/commands/run.js     | 2 +-
 toolkit/src/metrics/build-parser.js | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/toolkit/src/cli/commands/build.js b/toolkit/src/cli/commands/build.js
index f01384b..6120dc8 100644
--- a/toolkit/src/cli/commands/build.js
+++ b/toolkit/src/cli/commands/build.js
@@ -26,7 +26,7 @@ export function buildCommand() {
       });
 
       const { durationMs, durationSec } = timer.stop();
-      const stages = parseBuildStages(result.stderr);
+      const stages = parseBuildStages(result.stdout, result.stderr);
 
       log.info({ target: targetName, durationMs, durationSec, stages, msg: 'build completed' });
 
diff --git a/toolkit/src/cli/commands/run.js b/toolkit/src/cli/commands/run.js
index f02f20e..e51234d 100644
--- a/toolkit/src/cli/commands/run.js
+++ b/toolkit/src/cli/commands/run.js
@@ -44,7 +44,7 @@ export function runCommand() {
             noCache: !options.cache,
           });
           const { durationMs } = timer.stop();
-          const stages = parseBuildStages(buildOutput.stderr);
+          const stages = parseBuildStages(buildOutput.stdout, buildOutput.stderr);
           buildResult = { durationMs, cached: options.cache, stages };
           log.info({ durationMs, stages, msg: 'build completed' });
         }
diff --git a/toolkit/src/metrics/build-parser.js b/toolkit/src/metrics/build-parser.js
index 78e5a9f..855e7ec 100644
--- a/toolkit/src/metrics/build-parser.js
+++ b/toolkit/src/metrics/build-parser.js
@@ -8,7 +8,8 @@
  * Returns an object mapping stage names to total duration in milliseconds,
  * e.g. { generate: 5200, builder: 12300, runtime: 800 }
  */
-export function parseBuildStages(output) {
+export function parseBuildStages(...outputs) {
+  const output = outputs.filter(Boolean).join('\n');
   const stepStage = new Map();
   const stageDurations = {};
 

From 51ec81332e4f1040c99e8a767334af94d11670eb Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 13:24:15 +1100
Subject: [PATCH 21/23] Replace custom build stage parsing with BuildKit OTLP
 traces

BuildKit natively exports per-stage build traces via the same OTEL env
vars already configured for metrics publishing. This removes the custom
--progress=plain output parser, which was unreliable, and delegates
build stage visibility to Grafana Cloud Tempo.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example                        |  4 ++++
 grafana/benchmark-dashboard.json    | 33 --------------------------
 toolkit/src/cli/commands/build.js   |  8 +++----
 toolkit/src/cli/commands/run.js     |  8 +++----
 toolkit/src/metrics/build-parser.js | 36 -----------------------------
 toolkit/src/publish/otlp.js         | 11 +--------
 toolkit/src/report/compare.js       | 27 +++-------------------
 toolkit/src/report/table.js         |  4 +---
 8 files changed, 15 insertions(+), 116 deletions(-)
 delete mode 100644 toolkit/src/metrics/build-parser.js

diff --git a/.env.example b/.env.example
index 6f26ade..5aadb66 100644
--- a/.env.example
+++ b/.env.example
@@ -1,5 +1,9 @@
 # Grafana Cloud OTLP credentials (standard OpenTelemetry env vars)
 # Find these at: Grafana Cloud portal > Your Stack > Connections > OpenTelemetry (OTLP)
 # Generate an API token and copy the two environment variables shown
+#
+# These env vars serve two purposes:
+# 1. Publishing benchmark metrics (build/deploy/loadtest) to Grafana Cloud
+# 2. BuildKit automatically picks them up and sends per-stage build traces to Tempo
 OTEL_EXPORTER_OTLP_ENDPOINT=
 OTEL_EXPORTER_OTLP_HEADERS=
diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index cf270ae..9a2400e 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -285,39 +285,6 @@
           "instant": true
         }
       ]
-    },
-    {
-      "type": "barchart",
-      "title": "Build Stage Breakdown (ms)",
-      "description": "Time spent per Dockerfile stage (lower is better)",
-      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 29 },
-      "datasource": { "type": "prometheus", "uid": "${datasource}" },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "color": { "mode": "palette-classic" }
-        },
-        "overrides": []
-      },
-      "options": {
-        "orientation": "vertical",
-        "showValue": "always",
-        "groupWidth": 0.7,
-        "barWidth": 0.8,
-        "stacking": "none",
-        "legend": { "displayMode": "list", "placement": "bottom" },
-        "tooltip": { "mode": "multi" },
-        "xTickLabelRotation": 0
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${datasource}" },
-          "expr": "benchmark_build_stage_duration{benchmark_target=~\"$target\", benchmark_tag=~\"$tag\"}",
-          "legendFormat": "{{benchmark_target}} {{benchmark_build_stage}}",
-          "refId": "A",
-          "instant": true
-        }
-      ]
     }
   ]
 }
diff --git a/toolkit/src/cli/commands/build.js b/toolkit/src/cli/commands/build.js
index 6120dc8..bba779e 100644
--- a/toolkit/src/cli/commands/build.js
+++ b/toolkit/src/cli/commands/build.js
@@ -2,7 +2,6 @@ import { Command } from 'commander';
 import { resolve } from 'node:path';
 import { loadConfig, resolveTarget } from '../../config/loader.js';
 import { composeBuild } from '../../core/docker.js';
-import { parseBuildStages } from '../../metrics/build-parser.js';
 import { startTimer } from '../../core/timer.js';
 import { getLogger } from '../../util/logger.js';
 
@@ -20,16 +19,15 @@ export function buildCommand() {
       const projectDir = resolve(target.path);
       const timer = startTimer();
 
-      const result = await composeBuild(projectDir, {
+      await composeBuild(projectDir, {
         composeFile: target.composeFile,
         noCache: !options.cache,
       });
 
       const { durationMs, durationSec } = timer.stop();
-      const stages = parseBuildStages(result.stdout, result.stderr);
 
-      log.info({ target: targetName, durationMs, durationSec, stages, msg: 'build completed' });
+      log.info({ target: targetName, durationMs, durationSec, msg: 'build completed' });
 
-      return { durationMs, cached: options.cache, stages };
+      return { durationMs, cached: options.cache };
     });
 }
diff --git a/toolkit/src/cli/commands/run.js b/toolkit/src/cli/commands/run.js
index e51234d..e4b0150 100644
--- a/toolkit/src/cli/commands/run.js
+++ b/toolkit/src/cli/commands/run.js
@@ -5,7 +5,6 @@ import { composeBuild, composeUp, composeDown } from '../../core/docker.js';
 import { waitForHealthy } from '../../core/health.js';
 import { runK6 } from '../../core/k6.js';
 import { startTimer } from '../../core/timer.js';
-import { parseBuildStages } from '../../metrics/build-parser.js';
 import { parseK6Summary } from '../../metrics/k6-parser.js';
 import { collectResults } from '../../metrics/collector.js';
 import { writeResults } from '../../report/json.js';
@@ -39,14 +38,13 @@ export function runCommand() {
         if (!options.skipBuild) {
           log.info({ target: targetName, msg: 'phase: build' });
           const timer = startTimer();
-          const buildOutput = await composeBuild(projectDir, {
+          await composeBuild(projectDir, {
             composeFile: target.composeFile,
             noCache: !options.cache,
           });
           const { durationMs } = timer.stop();
-          const stages = parseBuildStages(buildOutput.stdout, buildOutput.stderr);
-          buildResult = { durationMs, cached: options.cache, stages };
-          log.info({ durationMs, stages, msg: 'build completed' });
+          buildResult = { durationMs, cached: options.cache };
+          log.info({ durationMs, msg: 'build completed' });
         }
 
         // --- Deploy phase ---
diff --git a/toolkit/src/metrics/build-parser.js b/toolkit/src/metrics/build-parser.js
deleted file mode 100644
index 855e7ec..0000000
--- a/toolkit/src/metrics/build-parser.js
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Parse Docker BuildKit --progress=plain output to extract per-stage durations.
- *
- * BuildKit output format:
- *   #5 [generate 2/8] RUN apk add --no-cache git
- *   #5 DONE 1.2s
- *
- * Returns an object mapping stage names to total duration in milliseconds,
- * e.g. { generate: 5200, builder: 12300, runtime: 800 }
- */
-export function parseBuildStages(...outputs) {
-  const output = outputs.filter(Boolean).join('\n');
-  const stepStage = new Map();
-  const stageDurations = {};
-
-  const stagePattern = /#(\d+) \[(\S+)\s+\d+\/\d+\]/;
-  const donePattern = /#(\d+) DONE (\d+\.?\d*)s/;
-
-  for (const line of output.split('\n')) {
-    const stageMatch = line.match(stagePattern);
-    if (stageMatch) {
-      stepStage.set(stageMatch[1], stageMatch[2]);
-    }
-
-    const doneMatch = line.match(donePattern);
-    if (doneMatch) {
-      const stage = stepStage.get(doneMatch[1]);
-      if (stage && stage !== 'internal') {
-        const durationMs = Math.round(parseFloat(doneMatch[2]) * 1000);
-        stageDurations[stage] = (stageDurations[stage] ?? 0) + durationMs;
-      }
-    }
-  }
-
-  return stageDurations;
-}
diff --git a/toolkit/src/publish/otlp.js b/toolkit/src/publish/otlp.js
index 5469186..e8eba6c 100644
--- a/toolkit/src/publish/otlp.js
+++ b/toolkit/src/publish/otlp.js
@@ -45,18 +45,9 @@ export async function publishResults(results) {
     meter.createGauge(name).record(value, attributes);
   };
 
-  // Build metrics
+  // Build metrics (per-stage durations are reported via BuildKit OTLP traces)
   if (results.metrics.build) {
     gauge('benchmark.build.duration', results.metrics.build.durationMs);
-
-    // Per-stage build durations
-    const stages = results.metrics.build.stages;
-    if (stages) {
-      const stageGauge = meter.createGauge('benchmark.build.stage.duration');
-      for (const [stage, durationMs] of Object.entries(stages)) {
-        stageGauge.record(durationMs, { ...attributes, 'benchmark.build.stage': stage });
-      }
-    }
   }
 
   // Deploy metrics
diff --git a/toolkit/src/report/compare.js b/toolkit/src/report/compare.js
index 7a387d9..d3a4343 100644
--- a/toolkit/src/report/compare.js
+++ b/toolkit/src/report/compare.js
@@ -20,9 +20,6 @@ export function compareResults(results) {
 
     if (r.metrics.build) {
       row.buildMs = r.metrics.build.durationMs;
-      if (r.metrics.build.stages) {
-        row.stages = r.metrics.build.stages;
-      }
     }
 
     if (r.metrics.deploy) {
@@ -42,31 +39,13 @@ export function compareResults(results) {
     return row;
   });
 
-  // Collect all stage names across results
-  const stageNames = [...new Set(rows.flatMap((r) => Object.keys(r.stages ?? {})))].sort();
-
   // Find best values for highlighting
   const best = {};
-  const numericKeys = [
-    'buildMs',
-    'deployMs',
-    'avgMs',
-    'p90Ms',
-    'p95Ms',
-    'p99Ms',
-    'errorRate',
-    ...stageNames.map((s) => `stage:${s}`),
-  ];
+  const numericKeys = ['buildMs', 'deployMs', 'avgMs', 'p90Ms', 'p95Ms', 'p99Ms', 'errorRate'];
   const higherIsBetter = ['reqsPerSec'];
 
   for (const key of numericKeys) {
-    let values;
-    if (key.startsWith('stage:')) {
-      const stage = key.slice(6);
-      values = rows.map((r) => r.stages?.[stage]).filter((v) => v !== undefined);
-    } else {
-      values = rows.map((r) => r[key]).filter((v) => v !== undefined);
-    }
+    const values = rows.map((r) => r[key]).filter((v) => v !== undefined);
     if (values.length > 0) best[key] = Math.min(...values);
   }
 
@@ -75,5 +54,5 @@ export function compareResults(results) {
     if (values.length > 0) best[key] = Math.max(...values);
   }
 
-  return { rows, best, stageNames };
+  return { rows, best };
 }
diff --git a/toolkit/src/report/table.js b/toolkit/src/report/table.js
index cf89fd5..bb113f3 100644
--- a/toolkit/src/report/table.js
+++ b/toolkit/src/report/table.js
@@ -12,12 +12,11 @@ function highlight(value, bestValue) {
   return value === bestValue ? chalk.green(formatted) : formatted;
 }
 
-export function renderComparisonTable({ rows, best, stageNames = [] }) {
+export function renderComparisonTable({ rows, best }) {
   const head = [
     chalk.bold('Target'),
     chalk.bold('Tag'),
     chalk.bold('Build (ms)'),
-    ...stageNames.map((s) => chalk.bold(`${s} (ms)`)),
     chalk.bold('Deploy (ms)'),
     chalk.bold('Reqs/s'),
     chalk.bold('Avg (ms)'),
@@ -37,7 +36,6 @@ export function renderComparisonTable({ rows, best, stageNames = [] }) {
       row.target,
       row.tag ?? '-',
       highlight(row.buildMs, best.buildMs),
-      ...stageNames.map((s) => highlight(row.stages?.[s], best[`stage:${s}`])),
       highlight(row.deployMs, best.deployMs),
       highlight(row.reqsPerSec, best.reqsPerSec),
       highlight(row.avgMs, best.avgMs),

From 146e348792c8acb2b027da8a2aa5e7814ed5d1f8 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 14:07:14 +1100
Subject: [PATCH 22/23] Fix k6 summary parser to match k6 1.6.1 export format

k6 1.6.1 --summary-export puts metric values directly on each metric
object (e.g. metrics.grpc_req_duration.avg) without a .values wrapper.
Also fix checksPassRate to read checks.value instead of checks.rate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolkit/src/metrics/k6-parser.js | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/toolkit/src/metrics/k6-parser.js b/toolkit/src/metrics/k6-parser.js
index 8149625..4504a4c 100644
--- a/toolkit/src/metrics/k6-parser.js
+++ b/toolkit/src/metrics/k6-parser.js
@@ -2,14 +2,13 @@ export function parseK6Summary(raw) {
   const metrics = raw.metrics || {};
 
   // Support both HTTP and gRPC protocols.
-  // k6 --summary-export nests metric data under a .values object.
-  const reqDuration = metrics.http_req_duration?.values || metrics.grpc_req_duration?.values || {};
-  const reqs = metrics.http_reqs?.values || {};
-  const reqFailed = metrics.http_req_failed?.values || {};
-  const checks = metrics.checks?.values || {};
-  const iterations = metrics.iterations?.values || {};
-  const dataReceived = metrics.data_received?.values || {};
-  const dataSent = metrics.data_sent?.values || {};
+  const reqDuration = metrics.http_req_duration || metrics.grpc_req_duration || {};
+  const reqs = metrics.http_reqs || {};
+  const reqFailed = metrics.http_req_failed || {};
+  const checks = metrics.checks || {};
+  const iterations = metrics.iterations || {};
+  const dataReceived = metrics.data_received || {};
+  const dataSent = metrics.data_sent || {};
 
   return {
     reqs: reqs.count ?? iterations.count ?? 0,
@@ -28,6 +27,6 @@ export function parseK6Summary(raw) {
     iterationsPerSec: iterations.rate ?? 0,
     dataReceived: dataReceived.count ?? 0,
     dataSent: dataSent.count ?? 0,
-    checksPassRate: checks.rate ?? 0,
+    checksPassRate: checks.value ?? 0,
   };
 }

From 68db4ecf550f919872f486e5e9b13362cfbc05e6 Mon Sep 17 00:00:00 2001
From: Hasnae <hasnae@labset.org>
Date: Sun, 8 Mar 2026 14:13:14 +1100
Subject: [PATCH 23/23] Simplify Grafana dashboard layout

Remove row separator panels and tighten grid positions for a cleaner
5-panel layout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 grafana/benchmark-dashboard.json | 31 +++++--------------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/grafana/benchmark-dashboard.json b/grafana/benchmark-dashboard.json
index 9a2400e..abd8d04 100644
--- a/grafana/benchmark-dashboard.json
+++ b/grafana/benchmark-dashboard.json
@@ -70,18 +70,11 @@
     ]
   },
   "panels": [
-    {
-      "type": "row",
-      "title": "Performance",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
-      "collapsed": false,
-      "panels": []
-    },
     {
       "type": "bargauge",
       "title": "Throughput (req/s)",
       "description": "Higher is better",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -114,7 +107,7 @@
       "type": "bargauge",
       "title": "Error Rate",
       "description": "Lower is better",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -152,18 +145,11 @@
         }
       ]
     },
-    {
-      "type": "row",
-      "title": "Latency",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 },
-      "collapsed": false,
-      "panels": []
-    },
     {
       "type": "barchart",
       "title": "Response Time by Percentile (ms)",
       "description": "Lower is better",
-      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 8 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -213,18 +199,11 @@
         }
       ]
     },
-    {
-      "type": "row",
-      "title": "Build & Deploy",
-      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
-      "collapsed": false,
-      "panels": []
-    },
     {
       "type": "bargauge",
       "title": "Build Time",
       "description": "Total Docker build duration (lower is better)",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {
@@ -257,7 +236,7 @@
       "type": "bargauge",
       "title": "Deploy Time",
       "description": "Time to healthy (lower is better)",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
         "defaults": {