From fb63cef9a6ef296cf0e5b5baf35c0ed7a8e29f8b Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada
Date: Wed, 1 Jul 2026 14:06:29 -0700
Subject: [PATCH] Fix flaky Android NDK download in QNN CI setup

Summary:
QNN CI jobs intermittently fail during environment setup while downloading the Android NDK, with the signature `curl: (92) HTTP/2 stream 0 was not closed cleanly: INTERNAL_ERROR (err 2)`. Because this happens in the shared `setup_android_ndk` step (used by `build-qnn-sdk.sh`, `build-qnn-direct-sdk.sh`, and `setup-qnn-deps.sh`), the failure surfaces on a different test each run but always with the same signature.

The failure is an intermittent HTTP/2 stream reset from `dl.google.com` mid-transfer. Two gaps made it fatal rather than self-healing: the existing `curl --retry 3` never retried it, because curl's default retry set does not include transport error 92 (and `--retry-connrefused` does not cover it either); and `set -ex` then aborted the whole script on the first occurrence.

This change mirrors the download-robustness pattern already used by `install_qnn` and `install_hexagon_sdk` in the same file, and applies it to `setup_android_ndk`:
- `--http1.1` sidesteps the HTTP/2 stream-reset behavior entirely (the standard workaround for this Google CDN error).
- `--retry-all-errors` makes the retry count apply to transport failures such as error 92.
- `--fail` treats HTTP errors as failures instead of writing an error body into the zip.
- The download is wrapped in a 5-attempt loop that removes any partial file and validates the archive with `unzip -tq` before extracting, so a truncated or corrupt download cannot slip through to a confusing `unzip` error. The risky `--continue-at -` resume is dropped.

Differential Revision: D110373581 --- backends/qualcomm/scripts/install_qnn_sdk.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/backends/qualcomm/scripts/install_qnn_sdk.sh b/backends/qualcomm/scripts/install_qnn_sdk.sh index f7e8ccab184..a7a9e3ba5e5 100644 --- a/backends/qualcomm/scripts/install_qnn_sdk.sh +++ b/backends/qualcomm/scripts/install_qnn_sdk.sh @@ -27,7 +27,25 @@ setup_android_ndk() { mkdir -p "${NDK_INSTALL_DIR}" NDK_ZIP="android-ndk-${NDK_VERSION}-linux.zip" - curl --retry 3 --retry-delay 5 --retry-connrefused --continue-at - -Lo "/tmp/${NDK_ZIP}" "https://dl.google.com/android/repository/${NDK_ZIP}" + # dl.google.com intermittently resets HTTP/2 streams mid-transfer + # (curl error 92, INTERNAL_ERROR). Force HTTP/1.1 to avoid it, and use + # --retry-all-errors so the retry count actually applies to such transport + # failures (plain --retry does not retry them). Re-download and re-verify + # the archive on each attempt rather than resuming a possibly-corrupt file. + for attempt in 1 2 3 4 5; do + rm -f "/tmp/${NDK_ZIP}" + curl --fail --http1.1 --retry 3 --retry-delay 5 --retry-connrefused --retry-all-errors \ + -Lo "/tmp/${NDK_ZIP}" "https://dl.google.com/android/repository/${NDK_ZIP}" || true + if unzip -tq "/tmp/${NDK_ZIP}" >/dev/null 2>&1; then + break + fi + if [ "${attempt}" -eq 5 ]; then + echo "Failed to download a valid Android NDK archive after ${attempt} attempts" >&2 + exit 1 + fi + echo "NDK download/verify failed (attempt ${attempt}), retrying..." + sleep 5 + done unzip -q "/tmp/${NDK_ZIP}" -d "${NDK_INSTALL_DIR}" mv "${NDK_INSTALL_DIR}/android-ndk-${NDK_VERSION}" "${NDK_INSTALL_DIR}/ndk"