From 29ec1105af7f5c6968c6823d2f3f9b90c1960736 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Sun, 28 Jun 2026 09:22:43 -0700
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt                |   3 +
 .../webgpu/test/native/test_dynamic_shape.cpp | 466 ++++++++++++++++++
 .../test_dynamic_shape_export.py              | 365 ++++++++++++++
 3 files changed, 834 insertions(+)
 create mode 100644 backends/webgpu/test/native/test_dynamic_shape.cpp
 create mode 100644 backends/webgpu/test/ops/dynamic_shape/test_dynamic_shape_export.py
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index c3d9387942a..15a1e7bfd10 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -160,6 +160,9 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(
     webgpu_update_cache_test test/native/test_update_cache.cpp
   )
+  add_webgpu_native_test(
+    webgpu_dynamic_shape_test test/native/test_dynamic_shape.cpp
+  )
 
   # Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) +
   # its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
diff --git a/backends/webgpu/test/native/test_dynamic_shape.cpp b/backends/webgpu/test/native/test_dynamic_shape.cpp
new file mode 100644
index 00000000000..a001f6944ee
--- /dev/null
+++ b/backends/webgpu/test/native/test_dynamic_shape.cpp
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Native test for dynamic tensor shapes (Option 2). One graph is built at the
+// upper-bound seq-len MAXS and run at several live S; the output must match the
+// torch golden at each S (allocate-at-max + per-op resize hooks + output-EValue
+// resize). Cases:
+//   A  dyn_rms at S=MAXS                  -> golden (static-equivalent)
+//   B  dyn_rms at S < MAXS (64, 8, 1)     -> golden (resize shrinks dispatch)
+//   C  ONE loaded graph reused across S   -> all golden (buffers never moved =>
+//                                            bind groups stayed valid)
+//   D  static_rms (no dynamic dim)        -> golden (static path unchanged)
+//   F  dyn_rms_chain (rms(rms(x))) at 3 S -> golden (resize CASCADE, DD-4)
+//   G  dyn_residual  H  dyn_rmsmul  I  dyn_linear  J  sdpa_dyn  K  emb_dyn
+//   L  rope_dyn
+// .pte + goldens from test/ops/dynamic_shape/test_dynamic_shape_export.py.
+
+#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+constexpr int kHidden = 64;
+
+// Loads a raw f32 file; {} if unreadable. Whole floats only, so no overrun.
+std::vector<float> read_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  const std::streamoff n = f ? std::streamoff(f.tellg()) : -1;
+  if (n <= 0) {
+    return {};
+  }
+  std::vector<float> v(static_cast<size_t>(n) / sizeof(float));
+  f.seekg(0).read(reinterpret_cast<char*>(v.data()), v.size() * sizeof(float));
+  return v;
+}
+
+// Max |a[i]-b[i]|. Returns 1e30 on size mismatch or empty input, and also when
+// any element is NaN: std::fmax(m, NaN) == m would let a NaN output pass.
+float max_err(const std::vector<float>& a, const std::vector<float>& b) {
+  float m = (a.size() != b.size() || a.empty()) ? 1e30f : 0.0f;
+  for (size_t i = 0; m < 1e30f && i < a.size(); i++) {
+    const float d = std::fabs(a[i] - b[i]);
+    m = std::isnan(d) ? 1e30f : std::fmax(m, d);
+  }
+  return m;
+}
+
+// Run one forward of a [1,1,S,kHidden] input through `m`; return the output.
+std::vector<float>
+run_s(Module& m, const std::string& dir, const std::string& prefix, int s) {
+  auto input =
+      read_bin(dir + "/" + prefix + ".S" + std::to_string(s) + ".input.bin");
+  if (input.empty()) {
+    printf("  MISSING input %s.S%d\n", prefix.c_str(), s);
+    return {};
+  }
+  auto t = make_tensor_ptr({1, 1, s, kHidden}, std::move(input));
+  auto r = m.forward({EValue(t)});
+  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
+    printf("  forward FAILED (S=%d, err=%d)\n", s, r.ok() ? 0 : (int)r.error());
+    return {};
+  }
+  const auto& out = r.get()[0].toTensor();
+  const float* d = out.const_data_ptr<float>();
+  const size_t numel = static_cast<size_t>(s) * kHidden;
+  // Output EValue must have been resized to the live shape.
+  if (out.numel() != static_cast<ssize_t>(numel)) {
+    printf(
+        "  WRONG output numel: got %zd want %zu (S=%d)\n",
+        (ssize_t)out.numel(),
+        numel,
+        s);
+    return {};
+  }
+  return std::vector<float>(d, d + numel);
+}
+
+bool check_s(
+    Module& m,
+    const std::string& dir,
+    const std::string& prefix,
+    int s,
+    bool& ok) {
+  auto got = run_s(m, dir, prefix, s);
+  auto golden =
+      read_bin(dir + "/" + prefix + ".S" + std::to_string(s) + ".golden.bin");
+  float e = max_err(got, golden);
+  bool pass = !got.empty() && e < 1e-3f;
+  printf(
+      "  %s S=%-3d max_err=%e -> %s\n",
+      prefix.c_str(),
+      s,
+      e,
+      pass ? "PASS" : "FAIL");
+  if (!pass) {
+    printf("    got.size=%zu golden.size=%zu\n", got.size(), golden.size());
+    for (size_t i = 0; i < 4 && i < got.size() && i < golden.size(); i++) {
+      printf("    [%zu] got=%.6f golden=%.6f\n", i, got[i], golden[i]);
+    }
+  }
+  ok = ok && pass;
+  return pass;
+}
+
+// Dynamic quantized linear: input [M, kLinK] -> output [M, kLinN].
+constexpr int kLinK = 64;
+constexpr int kLinN = 128;
+void check_linear(const std::string& dir, int m_rows, bool& ok) {
+  Module m(dir + "/dyn_linear.pte");
+  if (m.load_forward() != Error::Ok) {
+    printf("  FAIL load dyn_linear.pte\n");
+    ok = false;
+    return;
+  }
+  const std::string b = dir + "/dyn_linear.S" + std::to_string(m_rows) + ".";
+  auto input = read_bin(b + "input.bin");
+  auto golden = read_bin(b + "golden.bin");
+  if (input.empty()) {
+    printf("  MISSING dyn_linear.S%d\n", m_rows);
+    ok = false;
+    return;
+  }
+  auto t = make_tensor_ptr({m_rows, kLinK}, std::move(input));
+  auto r = m.forward({EValue(t)});
+  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
+    printf("  linear M=%d forward FAILED\n", m_rows);
+    ok = false;
+    return;
+  }
+  const auto& out = r.get()[0].toTensor();
+  const size_t numel = static_cast<size_t>(m_rows) * kLinN;
+  const bool sized = out.numel() == static_cast<ssize_t>(numel); // as in run_s
+  const float* d = out.const_data_ptr<float>();
+  const std::vector<float> got(d, d + (sized ? numel : 0)); // no over-read
+  const float e = max_err(got, golden);
+  // 4-bit quant: looser tol (the kernel mirrors the dequant-matmul reference).
+  const bool pass = sized && e < 5e-3f;
+  printf(
+      "  dyn_linear M=%-3d max_err=%e -> %s\n",
+      m_rows,
+      e,
+      pass ? "PASS" : "FAIL");
+  ok = ok && pass;
+}
+
+// Dynamic SDPA (GQA prefill, input_pos=0): q[1,s,hq,d] k/v[1,s,hkv,d]
+// caches[1,cmax,hkv,d]; attn output [1,s,hq,d] selected by shape (3 outputs).
+constexpr int kSdHq = 8, kSdHkv = 2, kSdD = 16, kSdCmax = 64;
+void check_sdpa(const std::string& dir, int s, bool& ok) {
+  Module m(dir + "/sdpa_dyn.pte");
+  Error le = m.load_forward();
+  if (le == Error::DelegateInvalidCompatibility) {
+    // PENDING op coverage: dynamic-S SDPA build throws err 48 until registered.
+    printf("  PENDING sdpa_dyn S=%d (op coverage, err %d)\n", s, (int)le);
+    return;
+  }
+  if (le != Error::Ok) {
+    printf("  sdpa_dyn S=%d load FAILED (err %d)\n", s, (int)le);
+    ok = false;
+    return;
+  }
+  const std::string b = dir + "/sdpa_dyn.S" + std::to_string(s) + ".";
+  auto q = read_bin(b + "q.bin");
+  auto k = read_bin(b + "k.bin");
+  auto v = read_bin(b + "v.bin");
+  auto kc = read_bin(b + "kc.bin");
+  auto vc = read_bin(b + "vc.bin");
+  auto golden = read_bin(b + "golden.bin");
+  if (q.empty() || golden.empty()) {
+    printf("  MISSING sdpa_dyn.S%d\n", s);
+    ok = false;
+    return;
+  }
+  auto tq = make_tensor_ptr({1, s, kSdHq, kSdD}, std::move(q));
+  auto tk = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(k));
+  auto tv = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(v));
+  auto tkc = make_tensor_ptr({1, kSdCmax, kSdHkv, kSdD}, std::move(kc));
+  auto tvc = make_tensor_ptr({1, kSdCmax, kSdHkv, kSdD}, std::move(vc));
+  auto r =
+      m.forward({EValue(tq), EValue(tk), EValue(tv), EValue(tkc), EValue(tvc)});
+  if (!r.ok()) {
+    printf("  sdpa S=%d forward FAILED (err=%d)\n", s, (int)r.error());
+    ok = false;
+    return;
+  }
+  // Select the attn output by full shape [1,s,hq,d] (never numel).
+  const float* attn = nullptr;
+  size_t numel = static_cast<size_t>(s) * kSdHq * kSdD;
+  for (size_t i = 0; i < r.get().size(); i++) {
+    if (!r.get()[i].isTensor()) {
+      continue;
+    }
+    const auto& t = r.get()[i].toTensor();
+    if (t.dim() == 4 && t.size(1) == s && t.size(2) == kSdHq &&
+        t.size(3) == kSdD) {
+      attn = t.const_data_ptr<float>();
+      break;
+    }
+  }
+  if (attn == nullptr) {
+    printf(
+        "  sdpa S=%d: no attn output of shape [1,%d,%d,%d]\n",
+        s,
+        s,
+        kSdHq,
+        kSdD);
+    ok = false;
+    return;
+  }
+  std::vector<float> got(attn, attn + numel);
+  float e = max_err(got, golden);
+  bool pass = e < 2e-3f; // SDPA tol (abs 1e-4 / rel 1e-3 family)
+  printf("  sdpa_dyn S=%-3d max_err=%e -> %s\n", s, e, pass ? "PASS" : "FAIL");
+  ok = ok && pass;
+}
+
+// Dynamic embedding: int64 token ids [N] -> [N, kEmbDim] fp32. The int64 host
+// input exercises copy_inputs' int64->int32 narrow path under dynamic shapes.
+constexpr int kEmbDim = 64;
+void check_embedding(const std::string& dir, int n, bool& ok) {
+  Module m(dir + "/emb_dyn.pte");
+  if (m.load_forward() != Error::Ok) {
+    printf("  FAIL load emb_dyn.pte\n");
+    ok = false;
+    return;
+  }
+  const std::string b = dir + "/emb_dyn.S" + std::to_string(n) + ".";
+  std::ifstream f(b + "idx.bin", std::ios::binary);
+  // Read exactly n ids so a short or ragged idx.bin cannot overrun `idx`.
+  std::vector<int64_t> idx(static_cast<size_t>(n));
+  const auto nb = static_cast<std::streamsize>(idx.size() * sizeof(int64_t));
+  if (!f.read(reinterpret_cast<char*>(idx.data()), nb)) {
+    printf("  MISSING emb_dyn.S%d\n", n);
+    ok = false;
+    return;
+  }
+  auto golden = read_bin(b + "golden.bin");
+  auto t = make_tensor_ptr({n}, std::move(idx)); // int64 (Long) host input
+  auto r = m.forward({EValue(t)});
+  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
+    printf(
+        "  emb N=%d forward FAILED (err=%d)\n", n, r.ok() ? 0 : (int)r.error());
+    ok = false;
+    return;
+  }
+  const auto& out = r.get()[0].toTensor();
+  const size_t numel = static_cast<size_t>(n) * kEmbDim;
+  const bool sized = out.numel() == static_cast<ssize_t>(numel); // as in run_s
+  const float* d = out.const_data_ptr<float>();
+  const std::vector<float> got(d, d + (sized ? numel : 0)); // no over-read
+  const float e = max_err(got, golden);
+  const bool pass = sized && e < 5e-3f;
+  printf("  emb_dyn N=%-3d max_err=%e -> %s\n", n, e, pass ? "PASS" : "FAIL");
+  ok = ok && pass;
+}
+
+// Dynamic RoPE: xq[1,s,nh,hd] xk[1,s,nkv,hd] freqs[s,hd/2] -> xq_out/xk_out
+// (2 outputs, selected by head count nh != nkv).
+constexpr int kRopeNH = 8, kRopeNKV = 2, kRopeHD = 64;
+void check_rope(const std::string& dir, int s, bool& ok) {
+  Module m(dir + "/rope_dyn.pte");
+  if (m.load_forward() != Error::Ok) {
+    printf("  FAIL load rope_dyn.pte\n");
+    ok = false;
+    return;
+  }
+  const std::string b = dir + "/rope_dyn.S" + std::to_string(s) + ".";
+  auto xq = read_bin(b + "xq.bin");
+  auto xk = read_bin(b + "xk.bin");
+  auto fc = read_bin(b + "fc.bin");
+  auto fs = read_bin(b + "fs.bin");
+  auto gq = read_bin(b + "gq.bin");
+  auto gk = read_bin(b + "gk.bin");
+  if (xq.empty() || gq.empty()) {
+    printf("  MISSING rope_dyn.S%d\n", s);
+    ok = false;
+    return;
+  }
+  auto txq = make_tensor_ptr({1, s, kRopeNH, kRopeHD}, std::move(xq));
+  auto txk = make_tensor_ptr({1, s, kRopeNKV, kRopeHD}, std::move(xk));
+  auto tfc = make_tensor_ptr({s, kRopeHD / 2}, std::move(fc));
+  auto tfs = make_tensor_ptr({s, kRopeHD / 2}, std::move(fs));
+  auto r = m.forward({EValue(txq), EValue(txk), EValue(tfc), EValue(tfs)});
+  if (!r.ok()) {
+    printf("  rope S=%d forward FAILED (err=%d)\n", s, (int)r.error());
+    ok = false;
+    return;
+  }
+  // Select xq_out (nh heads) and xk_out (nkv heads) by shape.
+  const float *oq = nullptr, *okp = nullptr;
+  for (size_t i = 0; i < r.get().size(); i++) {
+    if (!r.get()[i].isTensor()) {
+      continue;
+    }
+    const auto& t = r.get()[i].toTensor();
+    if (t.dim() == 4 && t.size(1) == s && t.size(3) == kRopeHD) {
+      if (t.size(2) == kRopeNH) {
+        oq = t.const_data_ptr<float>();
+      } else if (t.size(2) == kRopeNKV) {
+        okp = t.const_data_ptr<float>();
+      }
+    }
+  }
+  if (oq == nullptr || okp == nullptr) {
+    printf("  rope S=%d: missing xq_out/xk_out by shape\n", s);
+    ok = false;
+    return;
+  }
+  std::vector<float> gotq(oq, oq + static_cast<size_t>(s) * kRopeNH * kRopeHD);
+  std::vector<float> gotk(
+      okp, okp + static_cast<size_t>(s) * kRopeNKV * kRopeHD);
+  float e = std::fmax(max_err(gotq, gq), max_err(gotk, gk));
+  bool pass = e < 1e-3f;
+  printf("  rope_dyn S=%-3d max_err=%e -> %s\n", s, e, pass ? "PASS" : "FAIL");
+  ok = ok && pass;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  std::string dir = "/tmp/dynamic_shape";
+  if (argc > 1) {
+    dir = argv[1];
+  }
+  if (const char* env = std::getenv("WEBGPU_DYNAMIC_SHAPE_DIR")) {
+    dir = env;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    printf("SKIP: %s\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native); dir: %s\n", dir.c_str());
+
+  bool ok = true;
+
+  // Cases A + B: single dynamic rms_norm at S = MAXS .. 1 (fresh module each).
+  printf("\n--- A/B: dynamic rms_norm at several S (fresh load each) ---\n");
+  for (int s : {128, 64, 8, 1}) {
+    Module m(dir + "/dyn_rms.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load dyn_rms.pte\n");
+      ok = false;
+      break;
+    }
+    check_s(m, dir, "dyn_rms", s, ok);
+  }
+
+  // Case C: ONE loaded graph reused across S (buffers must not move).
+  printf("\n--- C: one graph reused across S (bind groups stay valid) ---\n");
+  {
+    Module m(dir + "/dyn_rms.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load dyn_rms.pte\n");
+      ok = false;
+    } else {
+      for (int s : {128, 1, 64, 8, 128}) {
+        check_s(m, dir, "dyn_rms", s, ok);
+      }
+    }
+  }
+
+  // Case D: static rms_norm (no dynamic dim) — regression.
+  printf("\n--- D: static rms_norm (static path unchanged) ---\n");
+  {
+    Module m(dir + "/static_rms.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load static_rms.pte\n");
+      ok = false;
+    } else {
+      check_s(m, dir, "static_rms", 8, ok);
+    }
+  }
+
+  // Case F: 2-op chain rms(rms(x)) — resize cascade.
+  printf("\n--- F: rms(rms(x)) cascade at several S ---\n");
+  for (int s : {128, 16, 1}) {
+    Module m(dir + "/dyn_rms_chain.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load dyn_rms_chain.pte\n");
+      ok = false;
+      break;
+    }
+    check_s(m, dir, "dyn_rms_chain", s, ok);
+  }
+
+  // Case G: rms(x)+x residual — cross-op (rms -> add) cascade.
+  printf("\n--- G: rms(x)+x residual (rms->add cascade) at several S ---\n");
+  for (int s : {128, 32, 1}) {
+    Module m(dir + "/dyn_residual.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load dyn_residual.pte\n");
+      ok = false;
+      break;
+    }
+    check_s(m, dir, "dyn_residual", s, ok);
+  }
+
+  // Case H: rms(x)*x — exercises the mul op resize.
+  printf("\n--- H: rms(x)*x (mul op) at several S ---\n");
+  for (int s : {128, 32, 1}) {
+    Module m(dir + "/dyn_rmsmul.pte");
+    if (m.load_forward() != Error::Ok) {
+      printf("  FAIL load dyn_rmsmul.pte\n");
+      ok = false;
+      break;
+    }
+    check_s(m, dir, "dyn_rmsmul", s, ok);
+  }
+
+  // Case I: dynamic 4-bit quantized linear (prefill GEMM) at several M.
+  printf("\n--- I: dynamic linear_q4gsw [M,64]->[M,128] at several M ---\n");
+  for (int mrows : {128, 32, 1}) {
+    check_linear(dir, mrows, ok);
+  }
+
+  // Case J: dynamic SDPA (GQA prefill) at several seq-len S.
+  printf("\n--- J: dynamic sdpa_with_kv_cache (prefill) at several S ---\n");
+  for (int s : {64, 16, 1}) {
+    check_sdpa(dir, s, ok);
+  }
+
+  // Case K: dynamic embedding (int64 token ids) at several token counts.
+  printf("\n--- K: dynamic embedding_q4gsw (int64 ids) at several N ---\n");
+  for (int n : {16, 8, 1}) {
+    check_embedding(dir, n, ok);
+  }
+
+  // Case L: dynamic RoPE (two outputs) at several seq-len S.
+  printf("\n--- L: dynamic apply_rotary_emb at several S ---\n");
+  for (int s : {16, 8, 1}) {
+    check_rope(dir, s, ok);
+  }
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (!ok) {
+    printf("\ndynamic_shape tests FAILED\n");
+    return 1;
+  }
+  printf("\nAll dynamic_shape tests passed\n");
+  return 0;
+}
diff --git a/backends/webgpu/test/ops/dynamic_shape/test_dynamic_shape_export.py b/backends/webgpu/test/ops/dynamic_shape/test_dynamic_shape_export.py
new file mode 100644
index 00000000000..cc694534c2d
--- /dev/null
+++ b/backends/webgpu/test/ops/dynamic_shape/test_dynamic_shape_export.py
@@ -0,0 +1,365 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Dynamic tensor-shape (Option 2) export tests via VulkanPartitioner.
+
+Exports ONE graph built at the upper-bound seq-len MAXS that the native runtime
+(`test/native/test_dynamic_shape.cpp`) then runs at several live S, asserting the
+output matches the torch golden and that the static path is unchanged. Numerics
+are checked in the native test; this verifies the dynamic export side + writes
+goldens.
+"""
+
+import os
+import unittest
+
+import torch
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+MAXS = 128  # upper bound for the dynamic seq-len dim (within the 1D dispatch cap)
+HIDDEN = 64
+
+
+def _rms(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+    x_f32 = x.to(torch.float32)
+    var = x_f32.pow(2).mean(dim=-1, keepdim=True)
+    return (x_f32 * torch.rsqrt(var + eps)) * weight
+
+
+class RmsNormModule(torch.nn.Module):
+    def __init__(self, hidden: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(
+            torch.linspace(0.5, 1.5, hidden, dtype=torch.float32)
+        )
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _rms(x, self.weight, self.eps)
+
+
+class RmsChainModule(torch.nn.Module):
+    """rms(rms(x)) — two ops; exercises the resize-cascade (DD-4)."""
+
+    def __init__(self, hidden: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.w1 = torch.nn.Parameter(
+            torch.linspace(0.5, 1.5, hidden, dtype=torch.float32)
+        )
+        self.w2 = torch.nn.Parameter(
+            torch.linspace(1.5, 0.5, hidden, dtype=torch.float32)
+        )
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _rms(_rms(x, self.w1, self.eps), self.w2, self.eps)
+
+
+class RmsResidualModule(torch.nn.Module):
+    """rms(x) + x — rms op feeding an add op; proves the cross-op resize cascade."""
+
+    def __init__(self, hidden: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.w = torch.nn.Parameter(
+            torch.linspace(0.5, 1.5, hidden, dtype=torch.float32)
+        )
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _rms(x, self.w, self.eps) + x
+
+
+class RmsMulModule(torch.nn.Module):
+    """rms(x) * x — exercises the mul op (two same-shape dynamic operands)."""
+
+    def __init__(self, hidden: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.w = torch.nn.Parameter(
+            torch.linspace(0.5, 1.5, hidden, dtype=torch.float32)
+        )
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _rms(x, self.w, self.eps) * x
+
+
+def _ramp(shape) -> torch.Tensor:
+    n = 1
+    for d in shape:
+        n *= d
+    return torch.linspace(-1.0, 1.0, n, dtype=torch.float32).reshape(shape)
+
+
+def _export(model, example_inputs, dynamic_shapes, path: str) -> None:
+    ep = torch.export.export(model, example_inputs, dynamic_shapes=dynamic_shapes)
+    et = to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]).to_executorch()
+    found = any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+    assert found, f"Expected VulkanBackend delegate in {path}"
+    with open(path, "wb") as f:
+        f.write(et.buffer)
+    print(f"Exported {path}")
+
+
+def _write_goldens(model, prefix: str, out_dir: str, s_values) -> None:
+    for s in s_values:
+        x = _ramp((1, 1, s, HIDDEN))
+        with torch.no_grad():
+            g = model(x)
+        x.detach().numpy().astype("<f4").tofile(
+            os.path.join(out_dir, f"{prefix}.S{s}.input.bin")
+        )
+        g.detach().numpy().astype("<f4").tofile(
+            os.path.join(out_dir, f"{prefix}.S{s}.golden.bin")
+        )
+        print(f"  golden {prefix} S={s}")
+
+
+def export_dynamic_shape_cases(out_dir: str) -> None:
+    """Write the dynamic + static .pte's and per-S goldens for the native test."""
+    os.makedirs(out_dir, exist_ok=True)
+    s_dim = torch.export.Dim("s", min=1, max=MAXS)
+
+    # 1) Single dynamic rms_norm, graph built at S=MAXS (upper bound).
+    rms = RmsNormModule(HIDDEN)
+    _export(
+        rms,
+        (_ramp((1, 1, MAXS, HIDDEN)),),
+        {"x": {2: s_dim}},
+        os.path.join(out_dir, "dyn_rms.pte"),
+    )
+    _write_goldens(rms, "dyn_rms", out_dir, [MAXS, 64, 8, 1])
+
+    # 2) Two-op chain (cascade): rms(rms(x)).
+    chain = RmsChainModule(HIDDEN)
+    _export(
+        chain,
+        (_ramp((1, 1, MAXS, HIDDEN)),),
+        {"x": {2: s_dim}},
+        os.path.join(out_dir, "dyn_rms_chain.pte"),
+    )
+    _write_goldens(chain, "dyn_rms_chain", out_dir, [MAXS, 16, 1])
+
+    # 2b) rms(x)+x residual — cross-op (rms->add) cascade.
+    resid = RmsResidualModule(HIDDEN)
+    _export(
+        resid,
+        (_ramp((1, 1, MAXS, HIDDEN)),),
+        {"x": {2: s_dim}},
+        os.path.join(out_dir, "dyn_residual.pte"),
+    )
+    _write_goldens(resid, "dyn_residual", out_dir, [MAXS, 32, 1])
+
+    # 2c) rms(x)*x — exercises the mul op resize.
+    rmsmul = RmsMulModule(HIDDEN)
+    _export(
+        rmsmul,
+        (_ramp((1, 1, MAXS, HIDDEN)),),
+        {"x": {2: s_dim}},
+        os.path.join(out_dir, "dyn_rmsmul.pte"),
+    )
+    _write_goldens(rmsmul, "dyn_rmsmul", out_dir, [MAXS, 32, 1])
+
+    # 2d) 4-bit quantized linear with a DYNAMIC rows (M) dim — prefill GEMM.
+    _export_dynamic_linear(out_dir)
+
+    # 2e) Fused SDPA with a DYNAMIC seq-len S (prefill, input_pos=0).
+    _export_dynamic_sdpa(out_dir)
+
+    # 2f) 4-bit embedding with a DYNAMIC token count (int64 indices).
+    _export_dynamic_embedding(out_dir)
+
+    # 2g) Interleaved RoPE with a DYNAMIC seq-len S (two outputs xq/xk).
+    _export_dynamic_rope(out_dir)
+
+    # 3) Static rms_norm (no dynamic dim) — regression: must stay byte-identical.
+    static = RmsNormModule(HIDDEN)
+    _export(
+        static,
+        (_ramp((1, 1, 8, HIDDEN)),),
+        None,
+        os.path.join(out_dir, "static_rms.pte"),
+    )
+    _write_goldens(static, "static_rms", out_dir, [8])
+
+
+# Quantized linear: K x N weight, dynamic rows M; input [M, K], output [M, N].
+LIN_K = 64
+LIN_N = 128
+LIN_GROUP = 32
+LIN_MAXM = 128
+
+
+def _export_dynamic_linear(out_dir: str) -> None:
+    from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import (
+        _fp64_golden,
+        _make_quantized_model,
+    )
+
+    model = _make_quantized_model(LIN_K, LIN_N, LIN_GROUP)
+    x = _ramp((LIN_MAXM, LIN_K))
+    m_dim = torch.export.Dim("m", min=1, max=LIN_MAXM)
+    ep = torch.export.export(model, (x,), dynamic_shapes=({0: m_dim},))
+    et = to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]).to_executorch()
+    assert any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    ), "linear_q4gsw not delegated"
+    with open(os.path.join(out_dir, "dyn_linear.pte"), "wb") as f:
+        f.write(et.buffer)
+    print("Exported dyn_linear.pte")
+    for m in [LIN_MAXM, 32, 1]:
+        xm = _ramp((m, LIN_K))
+        g = _fp64_golden(model, xm).astype("<f4")  # [m, N]
+        xm.detach().numpy().astype("<f4").tofile(
+            os.path.join(out_dir, f"dyn_linear.S{m}.input.bin")
+        )
+        g.tofile(os.path.join(out_dir, f"dyn_linear.S{m}.golden.bin"))
+        print(f"  golden dyn_linear M={m}")
+
+
+# Dynamic SDPA: GQA prefill (input_pos=0), q/k/v seq-len dynamic.
+SD_HQ = 8
+SD_HKV = 2
+SD_D = 16
+SD_CMAX = 64
+SD_MAXS = 64
+
+
+def _export_dynamic_sdpa(out_dir: str) -> None:
+    from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import (
+        _det_inputs,
+        _golden,
+        SdpaConfig,
+        SdpaModule,
+    )
+
+    def cfg(s: int) -> "SdpaConfig":
+        return SdpaConfig("dyn", SD_HQ, SD_HKV, SD_D, s, SD_CMAX, 0)
+
+    model = SdpaModule(0)
+    q, k, v, kc, vc = _det_inputs(cfg(SD_MAXS))
+    s_dim = torch.export.Dim("s", min=1, max=SD_MAXS)
+    ds = ({1: s_dim}, {1: s_dim}, {1: s_dim}, None, None)
+    ep = torch.export.export(model, (q, k, v, kc, vc), dynamic_shapes=ds)
+    et = to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]).to_executorch()
+    assert any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    ), "sdpa not delegated"
+    with open(os.path.join(out_dir, "sdpa_dyn.pte"), "wb") as f:
+        f.write(et.buffer)
+    print("Exported sdpa_dyn.pte")
+    for s in [SD_MAXS, 16, 1]:
+        c = cfg(s)
+        q, k, v, kc, vc = _det_inputs(c)
+        g = _golden(c, q, k, v, kc, vc)
+        for name, t in [("q", q), ("k", k), ("v", v), ("kc", kc), ("vc", vc), ("golden", g)]:
+            t.detach().cpu().numpy().astype("<f4").tofile(
+                os.path.join(out_dir, f"sdpa_dyn.S{s}.{name}.bin")
+            )
+        print(f"  golden sdpa_dyn S={s} (golden shape {tuple(g.shape)})")
+
+
+# Dynamic embedding: int64 token ids (dynamic count) -> [N, EMBED] fp32.
+EMB_VOCAB = 64
+EMB_DIM = 64
+EMB_GROUP = 32
+EMB_MAXN = 16
+
+
+def _export_dynamic_embedding(out_dir: str) -> None:
+    from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import (
+        _make_quantized_model,
+        _quant_params,
+        Shape,
+    )
+
+    shape = Shape("dyn", EMB_VOCAB, EMB_DIM, EMB_GROUP, list(range(EMB_MAXN)))
+    qm = _make_quantized_model(shape)
+    idx_max = torch.arange(EMB_MAXN, dtype=torch.long)
+    n_dim = torch.export.Dim("n", min=1, max=EMB_MAXN)
+    ep = torch.export.export(qm, (idx_max,), dynamic_shapes=({0: n_dim},))
+    et = to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]).to_executorch()
+    assert any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    ), "embedding_q4gsw not delegated"
+    with open(os.path.join(out_dir, "emb_dyn.pte"), "wb") as f:
+        f.write(et.buffer)
+    print("Exported emb_dyn.pte")
+    weight, scales, group_size = _quant_params(qm)
+    for n in [EMB_MAXN, 8, 1]:
+        idx = (torch.arange(n, dtype=torch.long) * 7) % EMB_VOCAB
+        g = torch.ops.et_vk.embedding_q4gsw.default(
+            weight, scales, group_size, idx, False
+        )
+        idx.detach().numpy().astype("<i8").tofile(
+            os.path.join(out_dir, f"emb_dyn.S{n}.idx.bin")
+        )
+        g.detach().numpy().astype("<f4").tofile(
+            os.path.join(out_dir, f"emb_dyn.S{n}.golden.bin")
+        )
+        print(f"  golden emb_dyn N={n} (shape {tuple(g.shape)})")
+
+
+# Dynamic RoPE: xq/xk + freqs all share a dynamic seq-len S.
+ROPE_NH = 8
+ROPE_NKV = 2
+ROPE_HD = 64
+ROPE_MAXS = 16
+
+
+def _export_dynamic_rope(out_dir: str) -> None:
+    from executorch.backends.webgpu.test.ops.rope.test_rope import (
+        _golden,
+        _inputs,
+        Shape,
+    )
+    from executorch.examples.models.llama.rope import RotaryEmbedding
+
+    xq, xk, fc, fs = _inputs(Shape("dyn", 1, ROPE_MAXS, ROPE_NH, ROPE_NKV, ROPE_HD))
+    s_dim = torch.export.Dim("s", min=1, max=ROPE_MAXS)
+    ds = ({1: s_dim}, {1: s_dim}, {0: s_dim}, {0: s_dim})
+    ep = torch.export.export(RotaryEmbedding().eval(), (xq, xk, fc, fs), dynamic_shapes=ds)
+    et = to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]).to_executorch()
+    assert any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    ), "apply_rotary_emb not delegated"
+    with open(os.path.join(out_dir, "rope_dyn.pte"), "wb") as f:
+        f.write(et.buffer)
+    print("Exported rope_dyn.pte")
+    for s in [ROPE_MAXS, 8, 1]:
+        xq, xk, fc, fs = _inputs(Shape("dyn", 1, s, ROPE_NH, ROPE_NKV, ROPE_HD))
+        gq, gk = _golden(xq, xk, fc, fs)
+        for name, t in [("xq", xq), ("xk", xk), ("fc", fc), ("fs", fs), ("gq", gq), ("gk", gk)]:
+            t.detach().cpu().numpy().astype("<f4").tofile(
+                os.path.join(out_dir, f"rope_dyn.S{s}.{name}.bin")
+            )
+        print(f"  golden rope_dyn S={s}")
+
+
+class TestDynamicShapeExport(unittest.TestCase):
+    def test_export_dynamic_rms(self) -> None:
+        import tempfile
+
+        with tempfile.TemporaryDirectory() as d:
+            export_dynamic_shape_cases(d)
+            self.assertTrue(os.path.exists(os.path.join(d, "dyn_rms.pte")))
+            self.assertTrue(os.path.exists(os.path.join(d, "dyn_rms.S1.golden.bin")))
+
+
+if __name__ == "__main__":
+    unittest.main()

From c0e55b55d44cc9a97d50bf3610934136574325e6 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 2 Jul 2026 16:00:01 -0700
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt                |  10 +-
 .../webgpu/test/native/test_dynamic_shape.cpp | 538 +++++++-----------
 2 files changed, 210 insertions(+), 338 deletions(-)

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 527e9cb57a8..072bcf43b8b 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -161,9 +161,6 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
   add_webgpu_native_test(
     webgpu_update_cache_test test/native/test_update_cache.cpp
   )
-  add_webgpu_native_test(
-    webgpu_dynamic_shape_test test/native/test_dynamic_shape.cpp
-  )
 
   # Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) +
   # its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
@@ -197,6 +194,13 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
     )
     target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
     set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
+
+    # Dynamic-shape integration test: a gtest binary with its own main() that
+    # brings up the device once (like webgpu_op_test).
+    add_webgpu_native_test(
+      webgpu_dynamic_shape_test test/native/test_dynamic_shape.cpp
+    )
+    target_link_libraries(webgpu_dynamic_shape_test PRIVATE GTest::gtest)
   endif()
   add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
 endif()
diff --git a/backends/webgpu/test/native/test_dynamic_shape.cpp b/backends/webgpu/test/native/test_dynamic_shape.cpp
index f97b87a00fe..ec58135e666 100644
--- a/backends/webgpu/test/native/test_dynamic_shape.cpp
+++ b/backends/webgpu/test/native/test_dynamic_shape.cpp
@@ -10,26 +10,30 @@
 // upper-bound seq-len MAXS and run at several live S; the output must match the
 // torch golden at each S (allocate-at-max + per-op resize hooks + output-EValue
 // resize). Cases:
-//   A  dyn_rms at S=MAXS                       -> golden (static-equivalent)
-//   B  dyn_rms at S < MAXS (64, 8, 1)          -> golden (resize shrinks
-//   dispatch) C  ONE loaded graph reused across S        -> all golden (buffers
-//   never moved
-//                                                 => bind groups stayed valid)
-//   D  static_rms (no dynamic dim)             -> golden (static path
-//   unchanged) F  dyn_rms_chain (rms(rms(x))) at 3 S      -> golden (resize
-//   CASCADE, DD-4)
-//   G rms+residual  H rms*x  I dyn_linear  J sdpa_dyn  K emb_dyn  L rope_dyn
-//   M dyn_sigmoid  N dyn_select (select_copy(0,-1), dynamic S)
+//   A  dyn_rms at S=MAXS                  -> golden (static-equivalent)
+//   B  dyn_rms at S < MAXS (64, 8, 1)     -> golden (resize shrinks dispatch)
+//   C  ONE loaded graph reused across S   -> all golden (buffers never moved =>
+//                                            bind groups stayed valid)
+//   D  static_rms (no dynamic dim)        -> golden (static path unchanged)
+//   F  dyn_rms_chain (rms(rms(x))) at 3 S -> golden (resize CASCADE, DD-4)
+//   G  rms+residual  H rms*x  I dyn_linear  J sdpa_dyn  K emb_dyn  L rope_dyn
+//   M  dyn_sigmoid   N dyn_select (select_copy(0,-1), dynamic S)
 // .pte + goldens from test/ops/dynamic_shape/test_dynamic_shape_export.py.
+//
+// Artifacts dir: $WEBGPU_DYNAMIC_SHAPE_DIR, else argv[1], else
+// /tmp/dynamic_shape.
 
-#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
 #include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
 
+#include <gtest/gtest.h>
+
 #include <cmath>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <exception>
 #include <fstream>
 #include <string>
 #include <vector>
@@ -42,6 +46,9 @@ namespace {
 
 constexpr int kHidden = 64;
 
+// Artifacts directory; set from env/argv in main() before RUN_ALL_TESTS().
+std::string g_dir; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
+
 std::vector<float> read_bin(const std::string& path) {
   std::ifstream f(path, std::ios::binary | std::ios::ate);
   if (!f) {
@@ -68,136 +75,72 @@ float max_err(const std::vector<float>& a, const std::vector<float>& b) {
   return m;
 }
 
-// Run one forward of a [1,1,S,kHidden] input through `m`; return the output.
-std::vector<float>
-run_s(Module& m, const std::string& dir, const std::string& prefix, int s) {
-  auto input =
-      read_bin(dir + "/" + prefix + ".S" + std::to_string(s) + ".input.bin");
-  if (input.empty()) {
-    printf("  MISSING input %s.S%d\n", prefix.c_str(), s);
-    return {};
-  }
-  if (input.size() != static_cast<size_t>(s) * kHidden) {
-    printf("  WRONG input size %s.S%d\n", prefix.c_str(), s);
-    return {};
-  }
+// Run a [1,1,S,kHidden] input through `m` and compare output 0 to the golden.
+// Shared by every single-input case of that shape (A-D, F-H, M).
+void check_s(Module& m, const std::string& prefix, int s) {
+  const std::string base = g_dir + "/" + prefix + ".S" + std::to_string(s);
+  auto input = read_bin(base + ".input.bin");
+  ASSERT_FALSE(input.empty()) << "missing input: " << prefix << ".S" << s;
+  ASSERT_EQ(input.size(), static_cast<size_t>(s) * kHidden)
+      << "wrong input size: " << prefix << ".S" << s;
   auto t = make_tensor_ptr({1, 1, s, kHidden}, std::move(input));
   auto r = m.forward({EValue(t)});
-  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
-    printf("  forward FAILED (S=%d, err=%d)\n", s, r.ok() ? 0 : (int)r.error());
-    return {};
-  }
+  ASSERT_TRUE(r.ok() && !r.get().empty() && r.get()[0].isTensor())
+      << prefix << " S=" << s
+      << " forward failed (err=" << (r.ok() ? 0 : (int)r.error()) << ")";
   const auto& out = r.get()[0].toTensor();
-  const float* d = out.const_data_ptr<float>();
   const size_t numel = static_cast<size_t>(s) * kHidden;
   // Output EValue must have been resized to the live shape.
-  if (out.numel() != static_cast<ssize_t>(numel)) {
-    printf(
-        "  WRONG output numel: got %zd want %zu (S=%d)\n",
-        (ssize_t)out.numel(),
-        numel,
-        s);
-    return {};
-  }
-  return std::vector<float>(d, d + numel);
-}
-
-bool check_s(
-    Module& m,
-    const std::string& dir,
-    const std::string& prefix,
-    int s,
-    bool& ok) {
-  auto got = run_s(m, dir, prefix, s);
-  auto golden =
-      read_bin(dir + "/" + prefix + ".S" + std::to_string(s) + ".golden.bin");
-  float e = max_err(got, golden);
-  bool pass = !got.empty() && e < 1e-3f;
-  printf(
-      "  %s S=%-3d max_err=%e -> %s\n",
-      prefix.c_str(),
-      s,
-      e,
-      pass ? "PASS" : "FAIL");
-  if (!pass) {
-    printf("    got.size=%zu golden.size=%zu\n", got.size(), golden.size());
-    for (size_t i = 0; i < 4 && i < got.size() && i < golden.size(); i++) {
-      printf("    [%zu] got=%.6f golden=%.6f\n", i, got[i], golden[i]);
-    }
-  }
-  ok = ok && pass;
-  return pass;
+  ASSERT_EQ(static_cast<size_t>(out.numel()), numel)
+      << prefix << " S=" << s << " output numel mismatch";
+  const float* d = out.const_data_ptr<float>();
+  std::vector<float> got(d, d + numel);
+  auto golden = read_bin(base + ".golden.bin");
+  const float e = max_err(got, golden);
+  EXPECT_LT(e, 1e-3f) << prefix << " S=" << s << " max_err=" << e
+                      << " (got.size=" << got.size()
+                      << " golden.size=" << golden.size() << ")";
 }
 
-// Dynamic quantized linear: input [M, lin_k] -> output [M, lin_n].
+// Dynamic quantized linear: input [M, kLinK] -> output [M, kLinN].
 constexpr int kLinK = 64;
 constexpr int kLinN = 128;
-void check_linear(const std::string& dir, int m_rows, bool& ok) {
-  Module m(dir + "/dyn_linear.pte");
-  if (m.load_forward() != Error::Ok) {
-    printf("  FAIL load dyn_linear.pte\n");
-    ok = false;
-    return;
-  }
-  auto input =
-      read_bin(dir + "/dyn_linear.S" + std::to_string(m_rows) + ".input.bin");
-  auto golden =
-      read_bin(dir + "/dyn_linear.S" + std::to_string(m_rows) + ".golden.bin");
-  if (input.empty()) {
-    printf("  MISSING dyn_linear.S%d\n", m_rows);
-    ok = false;
-    return;
-  }
+void check_linear(int m_rows) {
+  Module m(g_dir + "/dyn_linear.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_linear.pte";
+  const std::string base = g_dir + "/dyn_linear.S" + std::to_string(m_rows);
+  auto input = read_bin(base + ".input.bin");
+  auto golden = read_bin(base + ".golden.bin");
+  ASSERT_FALSE(input.empty()) << "missing dyn_linear.S" << m_rows;
   auto t = make_tensor_ptr({m_rows, kLinK}, std::move(input));
   auto r = m.forward({EValue(t)});
-  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
-    printf("  linear M=%d forward FAILED\n", m_rows);
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(r.ok() && !r.get().empty() && r.get()[0].isTensor())
+      << "dyn_linear M=" << m_rows << " forward failed";
   const auto& out = r.get()[0].toTensor();
   const size_t numel = static_cast<size_t>(m_rows) * kLinN;
+  ASSERT_EQ(static_cast<size_t>(out.numel()), numel)
+      << "dyn_linear M=" << m_rows << " output numel mismatch";
   std::vector<float> got(
       out.const_data_ptr<float>(), out.const_data_ptr<float>() + numel);
-  float e = max_err(got, golden);
+  const float e = max_err(got, golden);
   // 4-bit quant: looser tol (the kernel mirrors the dequant-matmul reference).
-  bool pass = out.numel() == static_cast<ssize_t>(numel) && e < 5e-3f;
-  printf(
-      "  dyn_linear M=%-3d max_err=%e -> %s\n",
-      m_rows,
-      e,
-      pass ? "PASS" : "FAIL");
-  ok = ok && pass;
+  EXPECT_LT(e, 5e-3f) << "dyn_linear M=" << m_rows << " max_err=" << e;
 }
 
 // Dynamic SDPA (GQA prefill, input_pos=0): q[1,s,hq,d] k/v[1,s,hkv,d]
 // caches[1,cmax,hkv,d]; attn output [1,s,hq,d] selected by shape (3 outputs).
 constexpr int kSdHq = 8, kSdHkv = 2, kSdD = 16, kSdCmax = 64;
-void check_sdpa(const std::string& dir, int s, bool& ok) {
-  Module m(dir + "/sdpa_dyn.pte");
-  Error le = m.load_forward();
-  if (le == Error::DelegateInvalidCompatibility) {
-    // PENDING op coverage: dynamic-S SDPA build throws err 48 until registered.
-    printf("  PENDING sdpa_dyn S=%d (op coverage, err %d)\n", s, (int)le);
-    return;
-  }
-  if (le != Error::Ok) {
-    printf("  sdpa_dyn S=%d load FAILED (err %d)\n", s, (int)le);
-    ok = false;
-    return;
-  }
-  const std::string b = dir + "/sdpa_dyn.S" + std::to_string(s) + ".";
+void check_sdpa(int s) {
+  Module m(g_dir + "/sdpa_dyn.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "sdpa_dyn S=" << s << " load";
+  const std::string b = g_dir + "/sdpa_dyn.S" + std::to_string(s) + ".";
   auto q = read_bin(b + "q.bin");
   auto k = read_bin(b + "k.bin");
   auto v = read_bin(b + "v.bin");
   auto kc = read_bin(b + "kc.bin");
   auto vc = read_bin(b + "vc.bin");
   auto golden = read_bin(b + "golden.bin");
-  if (q.empty() || golden.empty()) {
-    printf("  MISSING sdpa_dyn.S%d\n", s);
-    ok = false;
-    return;
-  }
+  ASSERT_FALSE(q.empty() || golden.empty()) << "missing sdpa_dyn.S" << s;
   auto tq = make_tensor_ptr({1, s, kSdHq, kSdD}, std::move(q));
   auto tk = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(k));
   auto tv = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(v));
@@ -205,14 +148,11 @@ void check_sdpa(const std::string& dir, int s, bool& ok) {
   auto tvc = make_tensor_ptr({1, kSdCmax, kSdHkv, kSdD}, std::move(vc));
   auto r =
       m.forward({EValue(tq), EValue(tk), EValue(tv), EValue(tkc), EValue(tvc)});
-  if (!r.ok()) {
-    printf("  sdpa S=%d forward FAILED (err=%d)\n", s, (int)r.error());
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(r.ok()) << "sdpa S=" << s
+                      << " forward failed (err=" << (int)r.error() << ")";
   // Select the attn output by full shape [1,s,hq,d] (never numel).
   const float* attn = nullptr;
-  size_t numel = static_cast<size_t>(s) * kSdHq * kSdD;
+  const size_t numel = static_cast<size_t>(s) * kSdHq * kSdD;
   for (size_t i = 0; i < r.get().size(); i++) {
     if (!r.get()[i].isTensor()) {
       continue;
@@ -224,105 +164,66 @@ void check_sdpa(const std::string& dir, int s, bool& ok) {
       break;
     }
   }
-  if (attn == nullptr) {
-    printf(
-        "  sdpa S=%d: no attn output of shape [1,%d,%d,%d]\n",
-        s,
-        s,
-        kSdHq,
-        kSdD);
-    ok = false;
-    return;
-  }
+  ASSERT_NE(attn, nullptr) << "sdpa S=" << s << ": no attn output of shape [1,"
+                           << s << "," << kSdHq << "," << kSdD << "]";
   std::vector<float> got(attn, attn + numel);
-  float e = max_err(got, golden);
-  bool pass = e < 2e-3f; // SDPA tol (abs 1e-4 / rel 1e-3 family)
-  printf("  sdpa_dyn S=%-3d max_err=%e -> %s\n", s, e, pass ? "PASS" : "FAIL");
-  ok = ok && pass;
+  const float e = max_err(got, golden);
+  EXPECT_LT(e, 2e-3f) << "sdpa_dyn S=" << s << " max_err=" << e;
 }
 
 // Dynamic embedding: int64 token ids [N] -> [N, kEmbDim] fp32. The int64 host
 // input exercises copy_inputs' int64->int32 narrow path under dynamic shapes.
 constexpr int kEmbDim = 64;
-void check_embedding(const std::string& dir, int n, bool& ok) {
-  Module m(dir + "/emb_dyn.pte");
-  if (m.load_forward() != Error::Ok) {
-    printf("  FAIL load emb_dyn.pte\n");
-    ok = false;
-    return;
-  }
-  const std::string b = dir + "/emb_dyn.S" + std::to_string(n) + ".";
+void check_embedding(int n) {
+  Module m(g_dir + "/emb_dyn.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load emb_dyn.pte";
+  const std::string b = g_dir + "/emb_dyn.S" + std::to_string(n) + ".";
   std::ifstream f(b + "idx.bin", std::ios::binary | std::ios::ate);
-  if (!f) {
-    printf("  MISSING emb_dyn.S%d\n", n);
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(f.good()) << "missing emb_dyn.S" << n;
   const std::streamsize nb = f.tellg();
-  if (nb < 0) {
-    printf("  MISSING emb_dyn.S%d\n", n);
-    ok = false;
-    return;
-  }
+  ASSERT_EQ(nb, n * std::streamsize{sizeof(int64_t)}) << "bad emb_dyn.S" << n;
   f.seekg(0);
   std::vector<int64_t> idx(static_cast<size_t>(nb) / sizeof(int64_t));
   f.read(reinterpret_cast<char*>(idx.data()), nb);
-  if (idx.size() != static_cast<size_t>(n)) {
-    printf("  WRONG emb_dyn idx size S%d\n", n);
-    ok = false;
-    return;
-  }
+  ASSERT_EQ(idx.size(), static_cast<size_t>(n))
+      << "wrong emb_dyn idx size S" << n;
   auto golden = read_bin(b + "golden.bin");
   auto t = make_tensor_ptr({n}, std::move(idx)); // int64 (Long) host input
   auto r = m.forward({EValue(t)});
-  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
-    printf(
-        "  emb N=%d forward FAILED (err=%d)\n", n, r.ok() ? 0 : (int)r.error());
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(r.ok() && !r.get().empty() && r.get()[0].isTensor())
+      << "emb N=" << n
+      << " forward failed (err=" << (r.ok() ? 0 : (int)r.error()) << ")";
   const auto& out = r.get()[0].toTensor();
   const size_t numel = static_cast<size_t>(n) * kEmbDim;
+  ASSERT_EQ(static_cast<size_t>(out.numel()), numel)
+      << "emb N=" << n << " output numel mismatch";
   std::vector<float> got(
       out.const_data_ptr<float>(), out.const_data_ptr<float>() + numel);
-  float e = max_err(got, golden);
-  bool pass = out.numel() == static_cast<ssize_t>(numel) && e < 5e-3f;
-  printf("  emb_dyn N=%-3d max_err=%e -> %s\n", n, e, pass ? "PASS" : "FAIL");
-  ok = ok && pass;
+  const float e = max_err(got, golden);
+  EXPECT_LT(e, 5e-3f) << "emb_dyn N=" << n << " max_err=" << e;
 }
 
 // Dynamic RoPE: xq[1,s,nh,hd] xk[1,s,nkv,hd] freqs[s,hd/2] -> xq_out/xk_out
 // (2 outputs, selected by head count nh != nkv).
 constexpr int kRopeNH = 8, kRopeNKV = 2, kRopeHD = 64;
-void check_rope(const std::string& dir, int s, bool& ok) {
-  Module m(dir + "/rope_dyn.pte");
-  if (m.load_forward() != Error::Ok) {
-    printf("  FAIL load rope_dyn.pte\n");
-    ok = false;
-    return;
-  }
-  const std::string b = dir + "/rope_dyn.S" + std::to_string(s) + ".";
+void check_rope(int s) {
+  Module m(g_dir + "/rope_dyn.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load rope_dyn.pte";
+  const std::string b = g_dir + "/rope_dyn.S" + std::to_string(s) + ".";
   auto xq = read_bin(b + "xq.bin");
   auto xk = read_bin(b + "xk.bin");
   auto fc = read_bin(b + "fc.bin");
   auto fs = read_bin(b + "fs.bin");
   auto gq = read_bin(b + "gq.bin");
   auto gk = read_bin(b + "gk.bin");
-  if (xq.empty() || gq.empty()) {
-    printf("  MISSING rope_dyn.S%d\n", s);
-    ok = false;
-    return;
-  }
+  ASSERT_FALSE(xq.empty() || gq.empty()) << "missing rope_dyn.S" << s;
   auto txq = make_tensor_ptr({1, s, kRopeNH, kRopeHD}, std::move(xq));
   auto txk = make_tensor_ptr({1, s, kRopeNKV, kRopeHD}, std::move(xk));
   auto tfc = make_tensor_ptr({s, kRopeHD / 2}, std::move(fc));
   auto tfs = make_tensor_ptr({s, kRopeHD / 2}, std::move(fs));
   auto r = m.forward({EValue(txq), EValue(txk), EValue(tfc), EValue(tfs)});
-  if (!r.ok()) {
-    printf("  rope S=%d forward FAILED (err=%d)\n", s, (int)r.error());
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(r.ok()) << "rope S=" << s
+                      << " forward failed (err=" << (int)r.error() << ")";
   // Select xq_out (nh heads) and xk_out (nkv heads) by shape.
   const float *oq = nullptr, *okp = nullptr;
   for (size_t i = 0; i < r.get().size(); i++) {
@@ -338,207 +239,174 @@ void check_rope(const std::string& dir, int s, bool& ok) {
       }
     }
   }
-  if (oq == nullptr || okp == nullptr) {
-    printf("  rope S=%d: missing xq_out/xk_out by shape\n", s);
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(oq != nullptr && okp != nullptr)
+      << "rope S=" << s << ": missing xq_out/xk_out by shape";
   std::vector<float> gotq(oq, oq + static_cast<size_t>(s) * kRopeNH * kRopeHD);
   std::vector<float> gotk(
       okp, okp + static_cast<size_t>(s) * kRopeNKV * kRopeHD);
-  float e = std::fmax(max_err(gotq, gq), max_err(gotk, gk));
-  bool pass = e < 1e-3f;
-  printf("  rope_dyn S=%-3d max_err=%e -> %s\n", s, e, pass ? "PASS" : "FAIL");
-  ok = ok && pass;
+  const float e = std::fmax(max_err(gotq, gq), max_err(gotk, gk));
+  EXPECT_LT(e, 1e-3f) << "rope_dyn S=" << s << " max_err=" << e;
 }
 
 // Dynamic select_copy(0,-1): input [2,1,S,kHidden] -> output [1,S,kHidden]. The
 // negative index resolves against the (static) leading dim live; the dynamic S
 // flows to the output, so the resize hook recomputes its dispatch each S.
 constexpr int kSelLead = 2;
-void check_select(const std::string& dir, int s, bool& ok) {
-  Module m(dir + "/dyn_select.pte");
-  if (m.load_forward() != Error::Ok) {
-    printf("  FAIL load dyn_select.pte\n");
-    ok = false;
-    return;
-  }
-  auto input =
-      read_bin(dir + "/dyn_select.S" + std::to_string(s) + ".input.bin");
-  auto golden =
-      read_bin(dir + "/dyn_select.S" + std::to_string(s) + ".golden.bin");
-  if (input.empty() || golden.empty()) {
-    printf("  MISSING dyn_select.S%d\n", s);
-    ok = false;
-    return;
-  }
+void check_select(int s) {
+  Module m(g_dir + "/dyn_select.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_select.pte";
+  const std::string base = g_dir + "/dyn_select.S" + std::to_string(s);
+  auto input = read_bin(base + ".input.bin");
+  auto golden = read_bin(base + ".golden.bin");
+  ASSERT_FALSE(input.empty() || golden.empty()) << "missing dyn_select.S" << s;
   auto t = make_tensor_ptr({kSelLead, 1, s, kHidden}, std::move(input));
   auto r = m.forward({EValue(t)});
-  if (!r.ok() || r.get().empty() || !r.get()[0].isTensor()) {
-    printf(
-        "  select S=%d forward FAILED (err=%d)\n",
-        s,
-        r.ok() ? 0 : (int)r.error());
-    ok = false;
-    return;
-  }
+  ASSERT_TRUE(r.ok() && !r.get().empty() && r.get()[0].isTensor())
+      << "select S=" << s
+      << " forward failed (err=" << (r.ok() ? 0 : (int)r.error()) << ")";
   const auto& out = r.get()[0].toTensor();
   const size_t numel = static_cast<size_t>(s) * kHidden;
+  ASSERT_EQ(static_cast<size_t>(out.numel()), numel)
+      << "select S=" << s << " output numel mismatch";
   std::vector<float> got(
       out.const_data_ptr<float>(), out.const_data_ptr<float>() + numel);
-  float e = max_err(got, golden);
-  bool pass = out.numel() == static_cast<ssize_t>(numel) && e < 1e-3f;
-  printf(
-      "  dyn_select S=%-3d max_err=%e -> %s\n", s, e, pass ? "PASS" : "FAIL");
-  ok = ok && pass;
+  const float e = max_err(got, golden);
+  EXPECT_LT(e, 1e-3f) << "dyn_select S=" << s << " max_err=" << e;
 }
 
 } // namespace
 
-int main(int argc, char** argv) {
-  std::string dir = "/tmp/dynamic_shape";
-  if (argc > 1) {
-    dir = argv[1];
-  }
-  if (const char* env = std::getenv("WEBGPU_DYNAMIC_SHAPE_DIR")) {
-    dir = env;
-  }
-
-  WebGPUContext ctx;
-  try {
-    ctx = create_webgpu_context();
-  } catch (const std::exception& e) {
-    printf("SKIP: %s\n", e.what());
-    return 0;
-  }
-  set_default_webgpu_context(&ctx);
-  printf("WebGPU device acquired (native); dir: %s\n", dir.c_str());
-
-  bool ok = true;
-
-  // Cases A + B: single dynamic rms_norm at S = MAXS .. 1 (fresh module each).
-  printf("\n--- A/B: dynamic rms_norm at several S (fresh load each) ---\n");
+// A + B: single dynamic rms_norm at S = MAXS .. 1 (fresh module load each S).
+TEST(DynamicShape, RmsNormFreshLoad) {
   for (int s : {128, 64, 8, 1}) {
-    Module m(dir + "/dyn_rms.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_rms.pte\n");
-      ok = false;
-      break;
-    }
-    check_s(m, dir, "dyn_rms", s, ok);
+    Module m(g_dir + "/dyn_rms.pte");
+    ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_rms.pte";
+    check_s(m, "dyn_rms", s);
   }
+}
 
-  // Case C: ONE loaded graph reused across S (buffers must not move).
-  printf("\n--- C: one graph reused across S (bind groups stay valid) ---\n");
-  {
-    Module m(dir + "/dyn_rms.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_rms.pte\n");
-      ok = false;
-    } else {
-      for (int s : {128, 1, 64, 8, 128}) {
-        check_s(m, dir, "dyn_rms", s, ok);
-      }
-    }
+// C: ONE loaded graph reused across S (buffers must not move => bind groups
+// stay valid).
+TEST(DynamicShape, RmsNormReusedGraph) {
+  Module m(g_dir + "/dyn_rms.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_rms.pte";
+  for (int s : {128, 1, 64, 8, 128}) {
+    check_s(m, "dyn_rms", s);
   }
+}
 
-  // Case D: static rms_norm (no dynamic dim) — regression.
-  printf("\n--- D: static rms_norm (static path unchanged) ---\n");
-  {
-    Module m(dir + "/static_rms.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load static_rms.pte\n");
-      ok = false;
-    } else {
-      check_s(m, dir, "static_rms", 8, ok);
-    }
-  }
+// D: static rms_norm (no dynamic dim) — regression that the static path is
+// unchanged.
+TEST(DynamicShape, StaticRmsNorm) {
+  Module m(g_dir + "/static_rms.pte");
+  ASSERT_EQ(m.load_forward(), Error::Ok) << "load static_rms.pte";
+  check_s(m, "static_rms", 8);
+}
 
-  // Case F: 2-op chain rms(rms(x)) — resize cascade.
-  printf("\n--- F: rms(rms(x)) cascade at several S ---\n");
+// F: 2-op chain rms(rms(x)) — resize cascade.
+TEST(DynamicShape, RmsChainCascade) {
   for (int s : {128, 16, 1}) {
-    Module m(dir + "/dyn_rms_chain.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_rms_chain.pte\n");
-      ok = false;
-      break;
-    }
-    check_s(m, dir, "dyn_rms_chain", s, ok);
+    Module m(g_dir + "/dyn_rms_chain.pte");
+    ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_rms_chain.pte";
+    check_s(m, "dyn_rms_chain", s);
   }
+}
 
-  // Case G: rms(x)+x residual — cross-op (rms -> add) cascade.
-  printf("\n--- G: rms(x)+x residual (rms->add cascade) at several S ---\n");
+// G: rms(x)+x residual — cross-op (rms -> add) cascade.
+TEST(DynamicShape, RmsResidualCascade) {
   for (int s : {128, 32, 1}) {
-    Module m(dir + "/dyn_residual.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_residual.pte\n");
-      ok = false;
-      break;
-    }
-    check_s(m, dir, "dyn_residual", s, ok);
+    Module m(g_dir + "/dyn_residual.pte");
+    ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_residual.pte";
+    check_s(m, "dyn_residual", s);
   }
+}
 
-  // Case H: rms(x)*x — exercises the mul op resize.
-  printf("\n--- H: rms(x)*x (mul op) at several S ---\n");
+// H: rms(x)*x — exercises the mul op resize.
+TEST(DynamicShape, RmsMul) {
   for (int s : {128, 32, 1}) {
-    Module m(dir + "/dyn_rmsmul.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_rmsmul.pte\n");
-      ok = false;
-      break;
-    }
-    check_s(m, dir, "dyn_rmsmul", s, ok);
+    Module m(g_dir + "/dyn_rmsmul.pte");
+    ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_rmsmul.pte";
+    check_s(m, "dyn_rmsmul", s);
   }
+}
 
-  // Case I: dynamic 4-bit quantized linear (prefill GEMM) at several M.
-  printf("\n--- I: dynamic linear_q4gsw [M,64]->[M,128] at several M ---\n");
-  for (int mrows : {128, 32, 1}) {
-    check_linear(dir, mrows, ok);
+// I: dynamic 4-bit quantized linear (prefill GEMM) at several M.
+TEST(DynamicShape, QuantizedLinear) {
+  for (int m_rows : {128, 32, 1}) {
+    check_linear(m_rows);
   }
+}
 
-  // Case J: dynamic SDPA (GQA prefill) at several seq-len S.
-  printf("\n--- J: dynamic sdpa_with_kv_cache (prefill) at several S ---\n");
+// J: dynamic SDPA (GQA prefill) at several seq-len S. The whole case is
+// skipped while op coverage is pending: until the op is registered, loading
+// the dynamic-S graph fails with Error::DelegateInvalidCompatibility (48).
+TEST(DynamicShape, Sdpa) {
+  {
+    Module probe(g_dir + "/sdpa_dyn.pte");
+    if (probe.load_forward() == Error::DelegateInvalidCompatibility) {
+      GTEST_SKIP() << "sdpa_dyn pending op coverage (err "
+                   << (int)Error::DelegateInvalidCompatibility << ")";
+    }
+  }
   for (int s : {64, 16, 1}) {
-    check_sdpa(dir, s, ok);
+    check_sdpa(s);
   }
+}
 
-  // Case K: dynamic embedding (int64 token ids) at several token counts.
-  printf("\n--- K: dynamic embedding_q4gsw (int64 ids) at several N ---\n");
+// K: dynamic embedding (int64 token ids) at several token counts.
+TEST(DynamicShape, Embedding) {
   for (int n : {16, 8, 1}) {
-    check_embedding(dir, n, ok);
+    check_embedding(n);
   }
+}
 
-  // Case L: dynamic RoPE (two outputs) at several seq-len S.
-  printf("\n--- L: dynamic apply_rotary_emb at several S ---\n");
+// L: dynamic RoPE (two outputs) at several seq-len S.
+TEST(DynamicShape, Rope) {
   for (int s : {16, 8, 1}) {
-    check_rope(dir, s, ok);
+    check_rope(s);
   }
+}
 
-  // Case M: dynamic sigmoid (elementwise) at several S.
-  printf("\n--- M: dynamic sigmoid at several S ---\n");
+// M: dynamic sigmoid (elementwise) at several S.
+TEST(DynamicShape, Sigmoid) {
   for (int s : {128, 32, 1}) {
-    Module m(dir + "/dyn_sigmoid.pte");
-    if (m.load_forward() != Error::Ok) {
-      printf("  FAIL load dyn_sigmoid.pte\n");
-      ok = false;
-      break;
-    }
-    check_s(m, dir, "dyn_sigmoid", s, ok);
+    Module m(g_dir + "/dyn_sigmoid.pte");
+    ASSERT_EQ(m.load_forward(), Error::Ok) << "load dyn_sigmoid.pte";
+    check_s(m, "dyn_sigmoid", s);
   }
+}
 
-  // Case N: dynamic select_copy(0,-1) at several S.
-  printf("\n--- N: dynamic select_copy(0,-1) at several S ---\n");
+// N: dynamic select_copy(0,-1) at several S.
+TEST(DynamicShape, Select) {
   for (int s : {128, 32, 1}) {
-    check_select(dir, s, ok);
+    check_select(s);
   }
+}
 
-  set_default_webgpu_context(nullptr);
-  destroy_webgpu_context(ctx);
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
 
-  if (!ok) {
-    printf("\ndynamic_shape tests FAILED\n");
-    return 1;
+  // Artifacts dir: env wins, else first positional arg, else default (gtest
+  // flags were already stripped by InitGoogleTest above).
+  g_dir = "/tmp/dynamic_shape";
+  if (argc > 1) {
+    g_dir = argv[1];
   }
-  printf("\nAll dynamic_shape tests passed\n");
-  return 0;
+  if (const char* env = std::getenv("WEBGPU_DYNAMIC_SHAPE_DIR")) {
+    g_dir = env;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    std::printf("SKIP: no WebGPU device (%s)\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+
+  const int rc = RUN_ALL_TESTS();
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+  return rc;
 }

From 766b9c828e3a7df2c89b2f601601ee09f8e43fdc Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 2 Jul 2026 16:13:25 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 backends/webgpu/test/native/test_dynamic_shape.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/backends/webgpu/test/native/test_dynamic_shape.cpp b/backends/webgpu/test/native/test_dynamic_shape.cpp
index ec58135e666..167ce52483a 100644
--- a/backends/webgpu/test/native/test_dynamic_shape.cpp
+++ b/backends/webgpu/test/native/test_dynamic_shape.cpp
@@ -140,7 +140,10 @@ void check_sdpa(int s) {
   auto kc = read_bin(b + "kc.bin");
   auto vc = read_bin(b + "vc.bin");
   auto golden = read_bin(b + "golden.bin");
-  ASSERT_FALSE(q.empty() || golden.empty()) << "missing sdpa_dyn.S" << s;
+  ASSERT_FALSE(
+      q.empty() || k.empty() || v.empty() || kc.empty() || vc.empty() ||
+      golden.empty())
+      << "missing sdpa_dyn.S" << s;
   auto tq = make_tensor_ptr({1, s, kSdHq, kSdD}, std::move(q));
   auto tk = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(k));
   auto tv = make_tensor_ptr({1, s, kSdHkv, kSdD}, std::move(v));
@@ -216,7 +219,10 @@ void check_rope(int s) {
   auto fs = read_bin(b + "fs.bin");
   auto gq = read_bin(b + "gq.bin");
   auto gk = read_bin(b + "gk.bin");
-  ASSERT_FALSE(xq.empty() || gq.empty()) << "missing rope_dyn.S" << s;
+  ASSERT_FALSE(
+      xq.empty() || xk.empty() || fc.empty() || fs.empty() || gq.empty() ||
+      gk.empty())
+      << "missing rope_dyn.S" << s;
   auto txq = make_tensor_ptr({1, s, kRopeNH, kRopeHD}, std::move(xq));
   auto txk = make_tensor_ptr({1, s, kRopeNKV, kRopeHD}, std::move(xk));
   auto tfc = make_tensor_ptr({s, kRopeHD / 2}, std::move(fc));