From 4916383661dccb994c464911c134c32ad2a0fdce Mon Sep 17 00:00:00 2001
From: Ali Afzal <maliafzal@meta.com>
Date: Wed, 10 Jun 2026 14:40:30 -0700
Subject: [PATCH] Add OSS CI to cross-compile and run the Cadence Xtensa
 backend

The upstream executor_runner cannot cross-compile to Xtensa because gflags pulls
in mkdir(2), absent from Xtensa newlib. Add cadence_executor_runner, a
gflags-free ExecuTorch runner for the Cadence Xtensa cores targeting the
Instruction Set Simulator (xt-run): it uses plain argv parsing like the Arm and
NXP backends, loads a .pte via xt-run semi-hosting, runs the first method with
all-ones inputs, and prints outputs. EXECUTORCH_BUILD_CADENCE_RUNNER builds it,
linking cadence_ops_lib transitively (no --whole-archive, which would double-run
static kernel registration); -lidma is linked only for Vision/Fusion-G3 cores,
whose ops reference iDMA and whose LSPs ship libidma, while HiFi4 does not. Also
register op_quantized_depthwise_conv1d_{ncl,nlc}.cpp in the HiFi4 operators
CMakeLists, which codegen references (omitting the sources broke the
cross-compile link).

Add an xtensa-build job to the Cadence Build & Test workflow
(build-cadence-runner.yml), alongside the existing host cpu-build/cpu-test, to
cross-compile the backend for the Xtensa cores. It is a build stage producing a
runner artifact; the ISS test stage follows separately (cf. cpu-build ->
cpu-test). The Xtensa toolchain and core configs are licensed and fetched at
runtime from an auth-gated object store via a short-lived OIDC credential; the
role, region, and store are supplied through CI variables and are not committed.
setup-xtensa-tools.sh downloads and installs the toolchain/core for a backend,
rewrites the vendor params to local paths, and exports the Xtensa env;
build-cadence-xtensa.sh cross-compiles cadence_executor_runner. Separate
hifi-build and vision-build jobs (one per backend, not a matrix) call the
reusable _xtensa_build.yml, which uploads the runner.

fusion_g3 is not built until the upstream fusion_g3 <-> nnlib API skew is fixed
(its runner does not link).
---
 .ci/scripts/build-cadence-xtensa.sh           |  79 +++++++
 .ci/scripts/setup-xtensa-tools.sh             | 164 +++++++++++++++
 .github/workflows/_xtensa_build.yml           |  94 +++++++++
 .github/workflows/build-cadence-runner.yml    |  22 ++
 backends/cadence/CMakeLists.txt               |  35 ++++
 backends/cadence/cadence_executor_runner.cpp  | 198 ++++++++++++++++++
 .../cadence/hifi/operators/CMakeLists.txt     |   2 +
 7 files changed, 594 insertions(+)
 create mode 100755 .ci/scripts/build-cadence-xtensa.sh
 create mode 100755 .ci/scripts/setup-xtensa-tools.sh
 create mode 100644 .github/workflows/_xtensa_build.yml
 create mode 100644 backends/cadence/cadence_executor_runner.cpp
diff --git a/.ci/scripts/build-cadence-xtensa.sh b/.ci/scripts/build-cadence-xtensa.sh
new file mode 100755
index 00000000000..bb406528bea
--- /dev/null
+++ b/.ci/scripts/build-cadence-xtensa.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Cross-compile cadence_executor_runner for a Cadence Xtensa core and (by
+# default) smoke-test it on the Instruction Set Simulator with a trivial model.
+#
+# Requires the Xtensa toolchain env to already be set (run
+# .ci/scripts/setup-xtensa-tools.sh <backend> first): XTENSA_TOOLCHAIN,
+# TOOLCHAIN_VER, XTENSA_SYSTEM, XTENSA_CORE, XTENSAD_LICENSE_FILE,
+# CADENCE_OPT_FLAG, and xt-clang on PATH.
+#
+# Usage:
+#   .ci/scripts/build-cadence-xtensa.sh [--no-run]
+#     --no-run : compile only, skip the ISS smoke test
+
+set -euo pipefail
+# The ISS smoke test runs unless the first argument is exactly --no-run.
+RUN_SMOKE=1
+[[ "${1:-}" == "--no-run" ]] && RUN_SMOKE=0
+# Abort early, naming the variable, if the toolchain environment is missing.
+: "${XTENSA_TOOLCHAIN:?run setup-xtensa-tools.sh first}"
+: "${TOOLCHAIN_VER:?run setup-xtensa-tools.sh first}"
+: "${XTENSA_CORE:?run setup-xtensa-tools.sh first}"
+: "${CADENCE_OPT_FLAG:?run setup-xtensa-tools.sh first}"
+
+NPROC=$(nproc)
+echo "=== building cadence_executor_runner for ${XTENSA_CORE} (${CADENCE_OPT_FLAG}) ==="
+xt-clang --version | head -1
+# Configure from a clean tree; cmake-out is both build dir and install prefix.
+rm -rf cmake-out
+CXXFLAGS="-fno-exceptions -fno-rtti" cmake \
+  -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+  -DCMAKE_INSTALL_PREFIX=cmake-out \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DEXECUTORCH_BUILD_CADENCE=ON \
+  "-D${CADENCE_OPT_FLAG}=ON" \
+  -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \
+  -DEXECUTORCH_BUILD_CADENCE_RUNNER=ON \
+  -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+  -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+  -DEXECUTORCH_ENABLE_LOGGING=ON \
+  -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+  -DEXECUTORCH_BUILD_CPUINFO=OFF \
+  -DEXECUTORCH_USE_DL=OFF \
+  -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \
+  -DEXECUTORCH_BUILD_DEVTOOLS=OFF \
+  -DHAVE_FNMATCH_H=OFF \
+  -DFLATCC_ALLOW_WERROR=OFF \
+  -DPYTHON_EXECUTABLE="$(which python3)" \
+  -Bcmake-out .
+
+cmake --build cmake-out --target cadence_executor_runner -j"${NPROC}"
+# Fail if the expected binary is missing; the `file` report is best-effort only.
+RUNNER="cmake-out/backends/cadence/cadence_executor_runner"
+if [[ ! -f "${RUNNER}" ]]; then
+  echo "ERROR: ${RUNNER} was not produced" >&2
+  exit 1
+fi
+command -v file >/dev/null 2>&1 && file "${RUNNER}" || true
+echo "Build OK: ${RUNNER}"
+
+if [[ "${RUN_SMOKE}" == "0" ]]; then
+  echo "Skipping ISS smoke test (--no-run)."
+  exit 0
+fi
+echo "=== ISS smoke test: export add.pte and run on xt-run --turbo ==="
+python3 -m examples.portable.scripts.export --model_name=add >/dev/null
+LOG=$(mktemp)  # simulator output is captured so success can be grepped for
+trap 'rm -f "${LOG}"' EXIT  # scratch file: remove it on every exit path
+xt-run --turbo "${RUNNER}" --model_path=add.pte 2>&1 | tee "${LOG}"
+if ! grep -q "Model executed successfully" "${LOG}"; then
+  echo "ERROR: ISS smoke test did not report success for ${XTENSA_CORE}" >&2
+  exit 1
+fi
+echo "ISS smoke test passed for ${XTENSA_CORE}."
diff --git a/.ci/scripts/setup-xtensa-tools.sh b/.ci/scripts/setup-xtensa-tools.sh
new file mode 100755
index 00000000000..8510c32c859
--- /dev/null
+++ b/.ci/scripts/setup-xtensa-tools.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Download and install the licensed Cadence Xtensa toolchain + core config for
+# a given backend, then export the environment that
+# backends/cadence/cadence.cmake and xt-run need.
+#
+# The artifacts (host tools, the core tarball, and the bundled license) cannot
+# be hosted publicly, so they are fetched at runtime from an auth-gated object
+# store. The store location is provided by the caller via XTENSA_S3_BUCKET (set
+# from a CI variable); credentials are obtained out of band before this runs.
+#
+# Usage:
+#   XTENSA_S3_BUCKET=<bucket> .ci/scripts/setup-xtensa-tools.sh <backend>
+#     backend = hifi4 | vision | fusion_g3
+#
+# In GitHub Actions this appends the toolchain env to $GITHUB_ENV so later
+# steps inherit it. Run locally to populate a workspace for manual builds.
+#
+# Modeled on .ci/scripts/setup-arm-baremetal-tools.sh.
+
+set -euo pipefail  # NOTE: if this file is sourced, these options and any `exit` below affect the caller's shell
+
+BACKEND="${1:-}"
+if [[ -z "${BACKEND}" ]]; then
+  echo "ERROR: usage: XTENSA_S3_BUCKET=<bucket> $0 <hifi4|vision|fusion_g3>" >&2
+  exit 1
+fi
+
+S3_BUCKET="${XTENSA_S3_BUCKET:-}"
+if [[ -z "${S3_BUCKET}" ]]; then
+  echo "ERROR: XTENSA_S3_BUCKET is not set (provide it from a CI variable)." >&2
+  exit 1
+fi
+# Objects live flat at the bucket root by default; set these to put toolchains
+# and cores under key prefixes instead.
+S3_TOOLCHAIN_PREFIX="${XTENSA_S3_TOOLCHAIN_PREFIX:-}"
+S3_CORE_PREFIX="${XTENSA_S3_CORE_PREFIX:-}"
+
+# Per-backend mapping: core tarball, toolchain tarball, core name, OPT flag.
+# The toolchain's clang major must match the core's codegen plugin:
+#   hifi4 / fusion_g3 cores (RI-2022.10, clang 10) -> RI-2022.9 host tools
+#   vision core           (RJ-2025.5,  clang 15)   -> RJ-2025.5 host tools
+case "${BACKEND}" in
+  hifi4)
+    CORE_NAME="hifi4_ss_spfpu_7_et_ci2"
+    CORE_TARBALL="hifi4_ss_spfpu_7_et_ci2_linux.tgz"
+    TOOLCHAIN_TARBALL="XtensaTools_RI_2022_9_linux.tgz"
+    TOOLCHAIN_VER="RI-2022.9-linux"
+    OPT_FLAG="EXECUTORCH_NNLIB_OPT"
+    ;;
+  fusion_g3)
+    CORE_NAME="XRC_FuG3_TYP_SPVFPU_et_c2"
+    CORE_TARBALL="XRC_FuG3_TYP_SPVFPU_et_c2_linux.tgz"
+    TOOLCHAIN_TARBALL="XtensaTools_RI_2022_9_linux.tgz"
+    TOOLCHAIN_VER="RI-2022.9-linux"
+    OPT_FLAG="EXECUTORCH_FUSION_G3_OPT"
+    ;;
+  vision)
+    CORE_NAME="XRC_Vision_110_AO_et_ci2"
+    CORE_TARBALL="XRC_Vision_110_AO_et_ci2_linux.tgz"
+    TOOLCHAIN_TARBALL="XtensaTools_RJ_2025_5_linux.tgz"
+    TOOLCHAIN_VER="RJ-2025.5-linux"
+    OPT_FLAG="EXECUTORCH_VISION_OPT"
+    ;;
+  *)
+    echo "ERROR: unknown backend '${BACKEND}' (expected hifi4|vision|fusion_g3)" >&2
+    exit 1
+    ;;
+esac
+
+XTENSA_ROOT="${XTENSA_ROOT:-/tmp/xtensa}"
+TOOLS_ROOT="${XTENSA_ROOT}/tools"     # contains <ver>-linux/XtensaTools
+CORES_ROOT="${XTENSA_ROOT}/cores"     # contains <corever>-linux/<core>
+REGISTRY_ROOT="${XTENSA_ROOT}/registry/${CORE_NAME}"  # per-core registry dir; exported below as XTENSA_SYSTEM
+DL_DIR="${XTENSA_ROOT}/download"      # download destination for the tarballs
+mkdir -p "${TOOLS_ROOT}" "${CORES_ROOT}" "${REGISTRY_ROOT}" "${DL_DIR}"
+
+s3_get() {
+  # Copy object key $1 from ${S3_BUCKET} to local path $2, printing errors only.
+  local uri="s3://${S3_BUCKET}/$1" target="$2"
+  echo "Downloading ${uri} ..."
+  aws s3 cp "${uri}" "${target}" --only-show-errors
+}
+
+extract_tgz() {
+  # $1 = .tgz, $2 = dest dir. Vendor tarballs may carry trailing bytes after the
+  # gzip stream, so gzip exits 2 ("trailing garbage ignored") on a good archive;
+  # judge success by tar's status only. errexit is suspended as well, otherwise
+  # a tar failure would abort at the pipeline, before the diagnostic below.
+  local tgz="$1" dest="$2" rc
+  set +e +o pipefail
+  gzip -dc "${tgz}" 2>/dev/null | tar xf - -C "${dest}"
+  rc=${PIPESTATUS[1]}
+  set -e -o pipefail
+  [[ "${rc}" -eq 0 ]] || { echo "ERROR: failed to extract ${tgz} (tar rc=${rc})" >&2; exit 1; }
+}
+
+# 1. Toolchain (host xt-clang/xt-run). Skip re-extract if already present.
+if [[ ! -d "${TOOLS_ROOT}/${TOOLCHAIN_VER}/XtensaTools" ]]; then
+  s3_get "${S3_TOOLCHAIN_PREFIX:+${S3_TOOLCHAIN_PREFIX}/}${TOOLCHAIN_TARBALL}" "${DL_DIR}/${TOOLCHAIN_TARBALL}"
+  extract_tgz "${DL_DIR}/${TOOLCHAIN_TARBALL}" "${TOOLS_ROOT}"
+fi
+TOOLCHAIN_HOME="${TOOLS_ROOT}/${TOOLCHAIN_VER}/XtensaTools"
+if [[ ! -x "${TOOLCHAIN_HOME}/bin/xt-clang" ]]; then
+  echo "ERROR: xt-clang not found at ${TOOLCHAIN_HOME}/bin after extract" >&2
+  exit 1
+fi
+
+# 2. Core config (ISA libs, params, examples, bundled magic-key license); fetched on every run.
+s3_get "${S3_CORE_PREFIX:+${S3_CORE_PREFIX}/}${CORE_TARBALL}" "${DL_DIR}/${CORE_TARBALL}"
+extract_tgz "${DL_DIR}/${CORE_TARBALL}" "${CORES_ROOT}"
+CORE_DIR=$(echo "${CORES_ROOT}"/*/"${CORE_NAME}")  # glob across the <corever>-linux directory level
+if [[ ! -d "${CORE_DIR}" ]]; then
+  echo "ERROR: core dir for ${CORE_NAME} not found under ${CORES_ROOT}" >&2
+  exit 1
+fi
+
+# 3. Build a local Xtensa core registry: copy the params file, replacing the
+#    vendor's XPG-internal build paths (/././home/xpgcust/...) with our
+#    extracted locations. NOTE(review): prefixes go into sed unescaped/unchecked.
+PARAMS_SRC="${CORE_DIR}/config/${CORE_NAME}-params"
+TOOLS_PFX=$(sed -n 's/^install-prefix = //p' "${PARAMS_SRC}" | head -1)
+TOOLSUB_PFX=$(sed -n 's/^xtensa-tools = //p' "${PARAMS_SRC}" | head -1)
+CFG_PFX=$(sed -n 's/^config-prefix = //p' "${PARAMS_SRC}" | head -1)
+sed \
+  -e "s|${TOOLS_PFX}|${TOOLCHAIN_HOME}|g" \
+  -e "s|${TOOLSUB_PFX}|${TOOLCHAIN_HOME}/Tools|g" \
+  -e "s|${CFG_PFX}|${CORE_DIR}|g" \
+  "${PARAMS_SRC}" > "${REGISTRY_ROOT}/${CORE_NAME}-params"
+ln -sf "${CORE_NAME}-params" "${REGISTRY_ROOT}/default-params"
+# License path inside the extracted core; its existence is not checked here.
+LICENSE_FILE="${CORE_DIR}/misc/license.dat"
+
+# 4. Export environment. cadence.cmake reads XTENSA_TOOLCHAIN/TOOLCHAIN_VER;
+#    xt-clang/xt-run read XTENSA_SYSTEM/XTENSA_CORE; xtensad reads
+#    XTENSAD_LICENSE_FILE (the bundled uncounted magic key, no server needed).
+emit() {
+  # Print NAME=value ($1), export it here, and persist it to $GITHUB_ENV if set.
+  local pair="$1"
+  echo "${pair}"
+  export "${pair?}"
+  [[ -z "${GITHUB_ENV:-}" ]] || echo "${pair}" >> "${GITHUB_ENV}"
+}
+echo "=== Xtensa env for backend '${BACKEND}' (core ${CORE_NAME}) ==="
+emit "XTENSA_TOOLCHAIN=${TOOLS_ROOT}"  # parent of <ver>/XtensaTools, not the tools dir itself
+emit "TOOLCHAIN_VER=${TOOLCHAIN_VER}"
+emit "XTENSA_SYSTEM=${REGISTRY_ROOT}"
+emit "XTENSA_CORE=${CORE_NAME}"
+emit "XTENSAD_LICENSE_FILE=${LICENSE_FILE}"
+emit "CADENCE_OPT_FLAG=${OPT_FLAG}"  # name of the CMake option the build script turns ON
+if [[ -n "${GITHUB_PATH:-}" ]]; then
+  echo "${TOOLCHAIN_HOME}/bin" >> "${GITHUB_PATH}"
+fi
+export PATH="${TOOLCHAIN_HOME}/bin:${PATH}"
+# Under `set -o pipefail`, a failing xt-clang/xt-run probe below aborts the script.
+echo "=== sanity ==="
+xt-clang --version 2>&1 | head -1
+xt-run --show-config=cores 2>&1 | sed -n '/available/,/registry/p' | head -6
+echo "Xtensa toolchain ready for ${BACKEND}."
diff --git a/.github/workflows/_xtensa_build.yml b/.github/workflows/_xtensa_build.yml
new file mode 100644
index 00000000000..ac78323aa3e
--- /dev/null
+++ b/.github/workflows/_xtensa_build.yml
@@ -0,0 +1,94 @@
+# Reusable: cross-compile cadence_executor_runner for one Cadence Xtensa core.
+#
+# A native job (not linux_job_v2) because the GitHub OIDC token must be minted on
+# the runner host: the ACTIONS_ID_TOKEN_REQUEST_* vars do not cross into
+# linux_job_v2's docker exec. So the role is assumed on the host, then the build
+# runs inside the CI image via docker run with the creds passed in. Binding the
+# environment also gives the OIDC token the environment claim. The licensed
+# toolchain + core configs are fetched at runtime from an auth-gated store;
+# role/region/store come from CI variables and are not committed.
+name: xtensa-build
+
+on:
+  workflow_call:
+    inputs:
+      backend:
+        description: "Cadence backend to build (hifi4 | vision | fusion_g3)"
+        required: true
+        type: string
+      ref:
+        description: "Git ref to check out"
+        required: false
+        type: string
+        default: ""
+
+jobs:
+  build:
+    name: ${{ inputs.backend }}
+    runs-on: linux.2xlarge
+    environment: cadence  # binds the job to the environment so the OIDC token carries its claim
+    permissions:
+      id-token: write  # required to request the GitHub OIDC token used for role assumption
+      contents: read
+    steps:
+      - name: Checkout executorch
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          ref: ${{ inputs.ref }}  # empty by default; presumably checkout then uses its own default ref - confirm
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ci-image:executorch-ubuntu-22.04-clang12
+
+      - name: Pull docker image
+        run: docker pull "${{ steps.calculate-docker-image.outputs.docker-image }}"
+
+      - name: Assume Cadence artifacts role (host OIDC)
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ vars.CADENCE_CI_AWS_ROLE }}
+          aws-region: ${{ vars.CADENCE_CI_AWS_REGION }}
+
+      - name: Cross-compile cadence_executor_runner
+        env:
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+          BACKEND: ${{ inputs.backend }}
+          XTENSA_S3_BUCKET: ${{ vars.CADENCE_CI_S3_BUCKET }}
+        shell: bash
+        run: |
+          set -eux
+          # OIDC/role assumption already happened on the host above; pass the
+          # resulting AWS creds and the store/backend into the CI image, where
+          # the toolchain download + cross-compile run.
+          docker run --rm \
+            -e BACKEND -e XTENSA_S3_BUCKET \
+            -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_SESSION_TOKEN \
+            -e AWS_DEFAULT_REGION -e AWS_REGION \
+            -v "${GITHUB_WORKSPACE}:/work/executorch" -w /work/executorch \
+            "${DOCKER_IMAGE}" \
+            bash -c '
+              set -exo pipefail
+              eval "$(/opt/conda/bin/conda shell.bash hook)"
+              conda activate "$(conda env list --json | jq -r ".envs | .[-1]")"
+              ./install_requirements.sh > /dev/null
+              pip install --quiet awscli  # provides the aws CLI that setup-xtensa-tools.sh calls
+              # hifi4/fusion_g3 optimized kernels need the foss-xtensa nnlib
+              # sources, which are not vendored in executorch; the cadence
+              # installer clones them. vision has no nnlib dependency.
+              if [ "${BACKEND}" != "vision" ]; then
+                backends/cadence/install_requirements.sh
+              fi
+              source .ci/scripts/setup-xtensa-tools.sh "${BACKEND}"  # sourced so its exports reach the build script
+              .ci/scripts/build-cadence-xtensa.sh --no-run  # compile only; the ISS smoke test is skipped
+              chmod -R a+rX cmake-out  # presumably so the host-side upload step can read the outputs - confirm
+            '
+
+      - name: Upload runner
+        uses: actions/upload-artifact@v4
+        with:
+          name: cadence-xtensa-build-${{ inputs.backend }}
+          path: cmake-out/backends/cadence/cadence_executor_runner
+          if-no-files-found: error
diff --git a/.github/workflows/build-cadence-runner.yml b/.github/workflows/build-cadence-runner.yml
index 6f99958616f..83d0e50d7b1 100644
--- a/.github/workflows/build-cadence-runner.yml
+++ b/.github/workflows/build-cadence-runner.yml
@@ -50,3 +50,25 @@ jobs:
     uses: ./.github/workflows/_test_cadence.yml
     with:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+
+  # Cross-compile cadence_executor_runner for each Cadence Xtensa core, one job
+  # per backend so they show as separate lines (no matrix grouping). Shared logic
+  # lives in _xtensa_build.yml. fusion_g3 is omitted until the upstream fusion_g3
+  # <-> nnlib-FusionG3 API skew is fixed (its runner does not link).
+  hifi-build:
+    permissions:  # granted here so the called workflow's job can request id-token: write
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/_xtensa_build.yml
+    with:
+      backend: hifi4
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # PR head commit on pull_request events, else the triggering SHA
+
+  vision-build:
+    permissions:  # same grant as hifi-build
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/_xtensa_build.yml
+    with:
+      backend: vision
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # same ref selection as hifi-build
diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt
index 271b4806614..f04bda30a69 100644
--- a/backends/cadence/CMakeLists.txt
+++ b/backends/cadence/CMakeLists.txt
@@ -97,3 +97,38 @@ else()
 endif()
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators)
+
+# Cadence executor_runner: cross-compiled ExecuTorch runner for the Xtensa ISS
+# (xt-run / xt-run --turbo). Self-contained, gflags-free argv parser, reads .pte
+# via xt-run semi-hosting.
+#
+# Usage: configure with cmake ... -DEXECUTORCH_BUILD_CADENCE_RUNNER=ON, then run
+# xt-run --turbo cmake-out/backends/cadence/cadence_executor_runner --model_path=add.pte
+if(EXECUTORCH_BUILD_CADENCE_RUNNER)
+  add_executable(cadence_executor_runner cadence_executor_runner.cpp)
+  target_compile_definitions(
+    cadence_executor_runner PRIVATE ET_ENABLE_ENUM_STRINGS=0
+  )
+  target_include_directories(
+    cadence_executor_runner
+    PRIVATE ${_common_include_directories} ${CMAKE_BINARY_DIR}
+            ${CMAKE_BINARY_DIR}/include
+  )
+  # Mirror the upstream executor_runner cadence link list (top-level
+  # CMakeLists.txt: list(APPEND _executor_runner_libs cadence_ops_lib)). Do NOT
+  # add --whole-archive: cadence_ops_lib is also pulled transitively, and
+  # forcing a second copy double-runs its static kernel-registration
+  # initializers and asserts at runtime.
+  target_link_libraries(
+    cadence_executor_runner PRIVATE executorch extension_evalue_util
+                                    extension_runner_util cadence_ops_lib
+  )
+  # Vision and Fusion-G3 ops (e.g. op_softmax) reference iDMA scheduling symbols
+  # and those cores ship libidma in their LSP; HiFi4 and generic cores do not use
+  # iDMA and their LSPs may not provide it, so link it only where it is needed.
+  # NOTE(review): -lidma/-lm are libraries passed as link *options*; confirm order.
+  if(EXECUTORCH_VISION_OPT OR EXECUTORCH_FUSION_G3_OPT)
+    target_link_options(cadence_executor_runner PRIVATE -lidma)
+  endif()
+  target_link_options(cadence_executor_runner PRIVATE -static -lm)
+endif()
diff --git a/backends/cadence/cadence_executor_runner.cpp b/backends/cadence/cadence_executor_runner.cpp
new file mode 100644
index 00000000000..57043cd8667
--- /dev/null
+++ b/backends/cadence/cadence_executor_runner.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ *
+ * ExecuTorch runner for Cadence Xtensa cores, intended to run on the
+ * Xtensa Instruction Set Simulator (xt-run / xt-run --turbo).
+ *
+ * Reads a .pte from the host filesystem via xt-run semi-hosting,
+ * executes the first method with all-ones inputs (via
+ * prepare_input_tensors), and prints the outputs.
+ *
+ * Argument parsing is plain argv inspection — gflags pulls in
+ * mkdir(2), which Xtensa newlib does not declare, breaking
+ * cross-compile. Mirrors the same approach Arm and NXP take in their
+ * embedded runners.
+ *
+ * Usage:
+ *   xt-run --turbo cadence_executor_runner --model_path=add.pte
+ *   xt-run --mem_model --summary cadence_executor_runner --model_path=add.pte
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+// patternlint-disable executorch-cpp-nostdinc
+#include <vector>
+
+#include <executorch/extension/data_loader/buffer_data_loader.h>
+#include <executorch/extension/runner_util/inputs.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+
+namespace {
+
+// 18 KB has historically been enough for the cadence "hello world"
+// models (add, simple MLP). Bump if you hit MemoryAllocator overflow
+// at load_method time.
+constexpr std::size_t kMethodAllocatorBytes = 18 * 1024U;
+uint8_t method_allocator_pool[kMethodAllocatorBytes]; // backing store for main()'s method_allocator
+
+// Returns the value of the first --model_path=<value> argument, or "model.pte".
+const char* parse_model_path(int argc, char** argv) {
+  constexpr char kFlag[] = "--model_path=";
+  constexpr std::size_t kFlagLen = sizeof(kFlag) - 1;
+  for (int i = 1; i < argc; ++i) {
+    if (std::strncmp(argv[i], kFlag, kFlagLen) == 0) {
+      // argv outlives every caller, so point into it instead of copying.
+      return argv[i] + kFlagLen;
+    }
+  }
+  return "model.pte";
+}
+
+// Reads the file at `path` into `*out`; logs and returns false on any failure.
+bool slurp(const char* path, std::vector<uint8_t>* out) {
+  FILE* f = std::fopen(path, "rb");
+  if (!f) {
+    ET_LOG(Error, "fopen('%s') failed", path);
+    return false;
+  }
+  // A failed seek must not let a stale ftell() value pass for the file size.
+  const long sz = std::fseek(f, 0, SEEK_END) == 0 ? std::ftell(f) : -1L;
+  if (sz <= 0 || std::fseek(f, 0, SEEK_SET) != 0) {
+    ET_LOG(Error, "model file '%s' is empty or stat failed", path);
+    std::fclose(f);
+    return false;
+  }
+  out->resize(static_cast<std::size_t>(sz));
+  const std::size_t n = std::fread(out->data(), 1, out->size(), f);
+  std::fclose(f);
+  if (static_cast<long>(n) != sz) {
+    ET_LOG(Error, "fread short on '%s': %zu/%ld", path, n, sz);
+    return false;
+  }
+  ET_LOG(Info, "Loaded %ld bytes from %s", sz, path);
+  return true;
+}
+
+} // namespace
+
+// Loads the .pte given by --model_path, runs its first method on the inputs from
+// prepare_input_tensors(), and logs float outputs. Returns 0 on success, else 1.
+int main(int argc, char** argv) {
+  executorch::runtime::runtime_init();
+  std::vector<uint8_t> model;
+  const char* path = parse_model_path(argc, argv);
+  if (!slurp(path, &model)) {
+    return 1;
+  }
+  executorch::extension::BufferDataLoader loader(model.data(), model.size());
+  Result<executorch::runtime::Program> program =
+      executorch::runtime::Program::load(&loader);
+  if (!program.ok()) {
+    ET_LOG(Error, "Program::load failed: 0x%x", (unsigned int)program.error());
+    return 1;
+  }
+  ET_LOG(Info, "Model buffer loaded, has %zu methods", program->num_methods());
+
+  const auto method_name_result = program->get_method_name(0);
+  ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
+  const char* const method_name = *method_name_result;
+  ET_LOG(Info, "Running method %s", method_name);
+
+  auto method_meta = program->method_meta(method_name);
+  if (!method_meta.ok()) {
+    ET_LOG(
+        Error,
+        "method_meta('%s') failed: 0x%x",
+        method_name,
+        (unsigned int)method_meta.error());
+    return 1;
+  }
+  executorch::runtime::MemoryAllocator method_allocator(
+      sizeof(method_allocator_pool), method_allocator_pool);
+  // One heap buffer per memory-planned buffer; the spans only borrow them.
+  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers;
+  std::vector<executorch::runtime::Span<uint8_t>> planned_spans;
+  const std::size_t num_planned = method_meta->num_memory_planned_buffers();
+  for (std::size_t id = 0; id < num_planned; ++id) {
+    const std::size_t buffer_size = static_cast<std::size_t>(
+        method_meta->memory_planned_buffer_size(id).get());
+    ET_LOG(Info, "Setting up planned buffer %zu, size %zu", id, buffer_size);
+    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
+    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
+  }
+  executorch::runtime::HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+  executorch::runtime::MemoryManager memory_manager(
+      &method_allocator, &planned_memory);
+
+  auto method = program->load_method(method_name, &memory_manager);
+  if (!method.ok()) {
+    ET_LOG(
+        Error,
+        "load_method('%s') failed: 0x%x",
+        method_name,
+        (unsigned int)method.error());
+    return 1;
+  }
+  ET_LOG(Info, "Method loaded.");
+
+  auto cleanup = executorch::extension::prepare_input_tensors(*method);
+  if (!cleanup.ok()) {
+    ET_LOG(
+        Error,
+        "prepare_input_tensors failed: 0x%x",
+        (unsigned int)cleanup.error());
+    return 1;
+  }
+  ET_LOG(Info, "Starting model execution...");
+
+  Error status = method->execute();
+  if (status != Error::Ok) {
+    ET_LOG(
+        Error,
+        "execute() failed for '%s': 0x%x",
+        method_name,
+        (unsigned int)status);
+    return 1;
+  }
+  ET_LOG(Info, "Model executed successfully.");
+
+  std::vector<executorch::runtime::EValue> outputs(method->outputs_size());
+  method->get_outputs(outputs.data(), outputs.size());
+  for (std::size_t i = 0; i < outputs.size(); ++i) {
+    if (!outputs[i].isTensor()) {
+      ET_LOG(Info, "output[%zu]: non-tensor", i);
+      continue;
+    }
+    const auto& t = outputs[i].toTensor();
+    // Only float data may be read through a float pointer (int8 would overrun).
+    if (t.scalar_type() != executorch::aten::ScalarType::Float) {
+      ET_LOG(Info, "output[%zu]: non-float tensor, values not printed", i);
+      continue;
+    }
+    const float* p = t.const_data_ptr<float>();
+    const std::size_t n = t.numel() < 20 ? t.numel() : 20;
+    ET_LOG(Info, "First %zu elements of output %zu:", n, i);
+    for (std::size_t j = 0; j < n; ++j) {
+      ET_LOG(Info, "  %f", p[j]);
+    }
+  }
+  return 0;
+}
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index 2e764541319..b5801f5d488 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -134,6 +134,8 @@ add_library(
   "op_quantized_conv2d_nchw_out.cpp"
   "op_quantized_conv1d_ncl.cpp"
   "op_quantized_conv1d_nlc.cpp"
+  "op_quantized_depthwise_conv1d_ncl.cpp"
+  "op_quantized_depthwise_conv1d_nlc.cpp"
   "op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp"
   "op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp"
   "op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp"