Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,19 +151,9 @@ function(add_webgpu_native_test test_name test_src)
endfunction()

if(EXECUTORCH_BUILD_WEBGPU_TEST)
add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
add_webgpu_native_test(
webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
)
add_webgpu_native_test(
webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
)
add_webgpu_native_test(
webgpu_update_cache_test test/native/test_update_cache.cpp
)

# Manifest-driven op-test framework: a generic gtest driver (webgpu_op_test) +
# its device-free util unit test. GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
# All WebGPU native tests use GTest (device-dependent ones bring up the device
# in their own main(); the fold unit test is device-free via gtest_main).
# GTest needs -DEXECUTORCH_BUILD_TESTS=ON.
if(NOT TARGET GTest::gtest)
find_package(GTest QUIET)
endif()
Expand Down Expand Up @@ -195,12 +185,28 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)

# Dynamic-shape integration test: a gtest binary with its own main() that
# brings up the device once (like webgpu_op_test).
# Device-dependent native tests: each has its own main() that brings up the
# device once, then RUN_ALL_TESTS(); link GTest::gtest (not gtest_main).
add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
target_link_libraries(webgpu_native_test PRIVATE GTest::gtest)
add_webgpu_native_test(
webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
)
target_link_libraries(webgpu_dispatch_order_test PRIVATE GTest::gtest)
add_webgpu_native_test(
webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
)
target_link_libraries(webgpu_scratch_buffer_test PRIVATE GTest::gtest)
add_webgpu_native_test(
webgpu_update_cache_test test/native/test_update_cache.cpp
)
target_link_libraries(webgpu_update_cache_test PRIVATE GTest::gtest)
add_webgpu_native_test(
webgpu_dynamic_shape_test test/native/test_dynamic_shape.cpp
)
target_link_libraries(webgpu_dynamic_shape_test PRIVATE GTest::gtest)
add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
target_link_libraries(webgpu_index_test PRIVATE GTest::gtest)

# Device-free fold unit test (gtest_main provides main; no device needed).
add_webgpu_native_test(
Expand All @@ -210,5 +216,4 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
webgpu_dispatch_2d_test PRIVATE GTest::gtest GTest::gtest_main
)
endif()
add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
endif()
13 changes: 12 additions & 1 deletion backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ UPDATE_CACHE_DIR="/tmp/update_cache"
UPDATE_CACHE_OK=1
INDEX_DIR="/tmp/index"
INDEX_OK=1
DYNAMIC_SHAPE_DIR="/tmp/dynamic_shape"
DYNAMIC_SHAPE_OK=1
EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
Expand Down Expand Up @@ -111,6 +113,11 @@ from executorch.backends.webgpu.test.ops.index.test_index import export_all_inde
export_all_index_models('${INDEX_DIR}')
" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.dynamic_shape.test_dynamic_shape_export import export_dynamic_shape_cases
export_dynamic_shape_cases('${DYNAMIC_SHAPE_DIR}')
" || { echo "WARN: dynamic_shape export failed; skipping dynamic_shape native test"; DYNAMIC_SHAPE_OK=0; }

# Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
# webgpu_native_test below (precise per-config error), so don't exit/mask here.
$PYTHON_EXECUTABLE -c "
Expand All @@ -132,6 +139,7 @@ rm -rf "${BUILD_DIR}"
cmake \
-DEXECUTORCH_BUILD_WEBGPU=ON \
-DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
-DEXECUTORCH_BUILD_TESTS=ON \
-DDawn_DIR="${Dawn_DIR}" \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
Expand All @@ -143,7 +151,7 @@ cmake \
"${EXECUTORCH_ROOT}"

# ── Build + run every native test target that exists in this tree ────────────
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test webgpu_dispatch_2d_test)
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test webgpu_dynamic_shape_test webgpu_dispatch_2d_test)
BIN_DIR="${BUILD_DIR}/backends/webgpu"

# Which targets are defined depends on which diffs are landed (native_test +
Expand Down Expand Up @@ -211,6 +219,9 @@ fi
if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
"${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
fi
if [[ "${DYNAMIC_SHAPE_OK}" == "1" && -x "${BIN_DIR}/webgpu_dynamic_shape_test" ]]; then
"${BIN_DIR}/webgpu_dynamic_shape_test" "${DYNAMIC_SHAPE_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
# Device-free: pure 2D workgroup-count fold unit test (no .pte, no GPU).
[[ -x "${BIN_DIR}/webgpu_dispatch_2d_test" ]] && "${BIN_DIR}/webgpu_dispatch_2d_test"
Expand Down
154 changes: 74 additions & 80 deletions backends/webgpu/test/native/test_dispatch_order.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <gtest/gtest.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <string>
#include <vector>
Expand All @@ -24,23 +27,8 @@ using namespace executorch::runtime;

namespace {

struct Case {
const char* name;
std::vector<int32_t> sizes;
};

// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain).
const std::vector<Case> kCases = {
{"single", {16, 16}},
{"chain3", {64, 64}},
{"chain5_tiny", {1, 1}},
{"chain5_wide", {7, 896}},
{"chain8", {256, 256}},
{"deep32", {128, 128}},
{"large_chain", {1024, 1024}},
{"het_small", {1, 1, 7, 896}},
{"het_deep", {1, 1, 5, 256}},
};
// Artifacts directory; set from env/argv in main() before RUN_ALL_TESTS().
std::string g_dir; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)

std::vector<float> read_f32_bin(const std::string& path) {
std::ifstream f(path, std::ios::binary | std::ios::ate);
Expand All @@ -59,53 +47,35 @@ std::vector<float> read_f32_bin(const std::string& path) {
return data;
}

bool run_case(const std::string& dir, const Case& tc) {
printf("\n--- dispatch_order[%s] ---\n", tc.name);
const std::string base = dir + "/" + tc.name;
// Runs one exported case (add-chain or rms_norm+add chain). The set of cases
// mirrors _CASES in test_dispatch_order.py.
void run_case(const char* name, const std::vector<int32_t>& sizes) {
const std::string base = g_dir + "/" + name;
std::vector<float> input = read_f32_bin(base + ".input.bin");
std::vector<float> golden = read_f32_bin(base + ".golden.bin");
if (input.empty() || golden.empty()) {
printf("FAIL: could not read input/golden for %s\n", tc.name);
return false;
}
ASSERT_FALSE(input.empty() || golden.empty())
<< "could not read input/golden for " << name;

Module module(base + ".pte");
if (module.load_forward() != Error::Ok) {
printf("FAIL: could not load %s.pte\n", tc.name);
return false;
}
ASSERT_EQ(module.load_forward(), Error::Ok)
<< "could not load " << name << ".pte";

size_t expected = 1;
for (int32_t d : tc.sizes) {
for (int32_t d : sizes) {
expected *= static_cast<size_t>(d);
}
if (input.size() != expected) {
printf(
"FAIL: input numel %zu != expected %zu for %s\n",
input.size(),
expected,
tc.name);
return false;
}
auto x = make_tensor_ptr(tc.sizes, std::vector<float>(input));
ASSERT_EQ(input.size(), expected)
<< "input numel " << input.size() << " != expected " << expected
<< " for " << name;
auto x = make_tensor_ptr(sizes, std::vector<float>(input));
auto result = module.forward({EValue(x)});
if (!result.ok()) {
printf("FAIL: forward failed (error %d)\n", (int)result.error());
return false;
}
ASSERT_TRUE(result.ok()) << "forward failed (error " << (int)result.error()
<< ")";
const auto& outputs = result.get();
if (outputs.empty() || !outputs[0].isTensor()) {
printf("FAIL: no tensor output\n");
return false;
}
ASSERT_TRUE(!outputs.empty() && outputs[0].isTensor()) << "no tensor output";
const auto& out_tensor = outputs[0].toTensor();
if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
printf(
"FAIL: output numel %zu != golden %zu\n",
(size_t)out_tensor.numel(),
golden.size());
return false;
}
ASSERT_EQ(static_cast<size_t>(out_tensor.numel()), golden.size())
<< "output numel " << (size_t)out_tensor.numel() << " != golden "
<< golden.size();
const float* out_data = out_tensor.const_data_ptr<float>();

float max_abs_err = 0.0f;
Expand All @@ -116,52 +86,76 @@ bool run_case(const std::string& dir, const Case& tc) {
const float denom = std::max(std::abs(golden[i]), 1e-6f);
max_rel_err = std::max(max_rel_err, abs_err / denom);
}
printf(
"Max abs error: %e Max rel error: %e (%zu elements)\n",
max_abs_err,
max_rel_err,
golden.size());
// Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens).
if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) {
printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
return false;
}
printf("PASS: dispatch_order[%s]\n", tc.name);
return true;
EXPECT_FALSE(max_abs_err > 1e-3f && max_rel_err > 1e-3f)
<< "dispatch_order[" << name
<< "] exceeds tolerance 1e-3 (max_abs_err=" << max_abs_err
<< " max_rel_err=" << max_rel_err << ", " << golden.size()
<< " elements)";
}

} // namespace

// One gtest case per exported dispatch-order model; keep this list in sync
// with _CASES in test_dispatch_order.py.
//
// The string passed to run_case() is the artifact basename: run_case() reads
// <g_dir>/<name>.input.bin and <g_dir>/<name>.golden.bin and loads
// <g_dir>/<name>.pte. The brace list is the input tensor shape; the product of
// its entries must equal the number of floats in <name>.input.bin, otherwise
// the case fails before forward() is called.

// 2-D input, 16 x 16.
TEST(DispatchOrder, single) {
run_case("single", {16, 16});
}

// 2-D input, 64 x 64.
TEST(DispatchOrder, chain3) {
run_case("chain3", {64, 64});
}

// Smallest shape in the suite: a single element.
TEST(DispatchOrder, chain5_tiny) {
run_case("chain5_tiny", {1, 1});
}

// 2-D input, 7 x 896.
TEST(DispatchOrder, chain5_wide) {
run_case("chain5_wide", {7, 896});
}

// 2-D input, 256 x 256.
TEST(DispatchOrder, chain8) {
run_case("chain8", {256, 256});
}

// 2-D input, 128 x 128.
TEST(DispatchOrder, deep32) {
run_case("deep32", {128, 128});
}

// Largest shape in the suite: 1024 x 1024 elements.
TEST(DispatchOrder, large_chain) {
run_case("large_chain", {1024, 1024});
}

// Rank-4 input, 1 x 1 x 7 x 896.
TEST(DispatchOrder, het_small) {
run_case("het_small", {1, 1, 7, 896});
}

// Rank-4 input, 1 x 1 x 5 x 256.
TEST(DispatchOrder, het_deep) {
run_case("het_deep", {1, 1, 5, 256});
}

int main(int argc, char** argv) {
std::string dir = "/tmp/dispatch_order";
::testing::InitGoogleTest(&argc, argv);

// Artifacts dir: env wins, else first positional arg, else default (gtest
// flags were already stripped by InitGoogleTest above).
g_dir = "/tmp/dispatch_order";
if (argc > 1) {
dir = argv[1];
g_dir = argv[1];
}
if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) {
dir = env;
g_dir = env;
}

WebGPUContext ctx;
try {
ctx = create_webgpu_context();
} catch (const std::exception& e) {
printf("SKIP: %s\n", e.what());
std::printf("SKIP: %s\n", e.what());
return 0;
}
set_default_webgpu_context(&ctx);
printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());

bool ok = true;
for (const auto& tc : kCases) {
ok = run_case(dir, tc) && ok;
}

const int rc = RUN_ALL_TESTS();
set_default_webgpu_context(nullptr);
destroy_webgpu_context(ctx);

if (!ok) {
return 1;
}
printf("\nAll dispatch_order tests passed\n");
return 0;
return rc;
}
Loading
Loading