fsspec · zhixiangli · May 26, 2026 · May 25, 2026
diff --git a/cloudbuild/benchmarks/README.md b/cloudbuild/benchmarks/README.md
@@ -54,4 +54,5 @@ To run this automation in your own GCP project, you need to set up atleast two C
 *   **Configuration File**: `cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml`
 *   **Substitutions**:
     *   `_DATASET_NAME`: The name of the BigQuery dataset to store results (e.g., `gcsfs_benchmarks`).
+    *   `_INFRA_PREFIX`: The prefix used for resources in the benchmarks run (e.g., `gcsfs-perf`). Must match the `_INFRA_PREFIX` value configured on the benchmarks run trigger.
 *   **Trigger Event**: This trigger is typically scheduled to run after the benchmarks pipeline completes, or triggered manually.
diff --git a/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml b/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml
@@ -24,6 +24,7 @@ steps:
         mkdir -p /workspace/.ssh
         ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb
         cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
+        gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=4h  # Register the generated public key with OS Login; --ttl=4h makes the key expire after four hours
     waitFor: ["-"]
 
   # 2. Initialize shared variables.
@@ -38,14 +39,20 @@ steps:
       - |
         SHORT_BUILD_ID=$${BUILD_ID:0:8}
         # Define shared variables
-        SAFE_BRANCH=$(echo "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')
+        SAFE_BRANCH=$(printf '%s' "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')  # printf emits the name verbatim (no echo option parsing) and no trailing newline for tr to turn into '-'
 
         # Error out if the branch is unknown
         if [ "$${SAFE_BRANCH}" = "unknown" ]; then
           echo "ERROR: BRANCH_NAME is not set or evaluates to unknown. Failing the build."
           exit 1
         fi
 
+        # Fail fast, before build_vars.env is written, if _INFRA_PREFIX, _ZONE or _BENCHMARK_TYPE is empty (single-dollar names are presumably Cloud Build substitutions expanded before the shell runs - confirm)
+        if [ -z "${_INFRA_PREFIX}" ] || [ -z "${_ZONE}" ] || [ -z "${_BENCHMARK_TYPE}" ]; then
+          echo "ERROR: One or more required variables (_INFRA_PREFIX, _ZONE, _BENCHMARK_TYPE) are not set."
+          exit 1
+        fi
+
         echo "export BRANCH_NAME=$${SAFE_BRANCH}" >> /workspace/build_vars.env
 
         echo "export RUN_ID=$${BUILD_ID}" >> /workspace/build_vars.env

diff --git a/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml b/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml
@@ -11,6 +11,11 @@ steps:
     args:
       - '-c'
       - |
+        if [ -z "${_DATASET_NAME}" ] || [ -z "${_INFRA_PREFIX}" ]; then  # Stop before the bq calls below when a required substitution is empty
+          echo "ERROR: One or more required variables (_DATASET_NAME, _INFRA_PREFIX) are not set."
+          exit 1
+        fi
+
         bq show ${PROJECT_ID}:${_DATASET_NAME} || \
         bq mk --dataset --location=${LOCATION} ${PROJECT_ID}:${_DATASET_NAME}
 

diff --git a/cloudbuild/benchmarks/ingest.sql b/cloudbuild/benchmarks/ingest.sql
@@ -1,47 +1,57 @@
--- 1. Variable declarations must be at the top
-DECLARE alter_stmt STRING;
+BEGIN
+  -- 1. Variable declarations must be at the top of the BEGIN...END block, before any other statement
+  DECLARE alter_stmt STRING;    -- ALTER TABLE statement built in step 3; NULL when history already has every staging column
+  DECLARE columns_list STRING;  -- comma-separated, backtick-quoted staging column names built in step 5
+  DECLARE insert_query STRING;  -- dynamic INSERT ... SELECT statement assembled and executed in step 5
 
--- 2. Ensure the history table exists with metadata columns
-CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
-(
-  run_date DATE,
-  build_id STRING,
-  run_timestamp TIMESTAMP,
-  source_uri STRING,
-  branch_name STRING
-)
-PARTITION BY run_date;
+  -- 2. Ensure the history table exists with metadata columns (all five are derived from _FILE_NAME in step 5)
+  CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
+  (
+    run_date DATE,  -- 8-digit path segment of the source file, parsed with format %d%m%Y
+    build_id STRING,  -- 36-character hex-and-hyphen path segment of the source file
+    run_timestamp TIMESTAMP,  -- "8 digits-6 digits" path segment, parsed with format %d%m%Y-%H%M%S
+    source_uri STRING,  -- the staging row's _FILE_NAME; step 5 uses it to skip files already ingested
+    branch_name STRING  -- text following "/branch=" up to the next "/" in the source file path
+  )
+  PARTITION BY run_date;
 
--- 3. Dynamically find new columns in staging that are missing from history
-SET alter_stmt = (
-  SELECT
-    CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
-           STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
-  FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
-  WHERE table_name = 'staging'
-    AND column_name NOT IN (
-      SELECT column_name
-      FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
-      WHERE table_name = 'history'
-    )
-);
+  -- 3. Build one ALTER TABLE statement that adds every staging column missing from history, reusing the staging data_type; the result is NULL when no column is missing
+  SET alter_stmt = (
+    SELECT
+      CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
+             STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
+    FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+    WHERE table_name = 'staging'
+      AND column_name NOT IN (
+        SELECT column_name
+        FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+        WHERE table_name = 'history'
+      )
+  );
 
--- 4. Execute the schema update only if new columns were found
-IF alter_stmt IS NOT NULL THEN
-  EXECUTE IMMEDIATE alter_stmt;
-END IF;
+  -- 4. Execute the schema update only if new columns were found (alter_stmt is NULL otherwise)
+  IF alter_stmt IS NOT NULL THEN
+    EXECUTE IMMEDIATE alter_stmt;
+  END IF;
 
--- 5. Perform the idempotent ingestion
-INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history`
-SELECT
-  PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8})/')) as run_date,
-  REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id,
-  PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8}-\d{6})/')) as run_timestamp,
-  _FILE_NAME as source_uri,
-  REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name,
-  *
-FROM `@PROJECT_ID@.@DATASET_NAME@.staging`
-WHERE _FILE_NAME NOT IN (
-  SELECT DISTINCT source_uri
-  FROM `@PROJECT_ID@.@DATASET_NAME@.history`
-);
+  -- 5. Perform the idempotent ingestion using dynamic SQL: the explicit column list keeps INSERT and SELECT aligned by name, and files already present in history are skipped
+  SET columns_list = (
+    SELECT STRING_AGG(CONCAT("`", column_name, "`"), ", " ORDER BY column_name)
+    FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+    WHERE table_name = 'staging'
+  );
+
+  SET insert_query = CONCAT(
+    "INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history` (run_date, build_id, run_timestamp, source_uri, branch_name, ",
+    columns_list,
+    ") SELECT PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8})/')) as run_date, ",
+    "REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id, ",
+    "PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8}-\\d{6})/')) as run_timestamp, ",
+    "_FILE_NAME as source_uri, ",
+    "REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name, ",
+    columns_list,
+    " FROM `@PROJECT_ID@.@DATASET_NAME@.staging` WHERE _FILE_NAME NOT IN (SELECT DISTINCT source_uri FROM `@PROJECT_ID@.@DATASET_NAME@.history` WHERE source_uri IS NOT NULL)"  -- NULLs are filtered out because a single NULL in the list makes NOT IN non-TRUE for every row, which would silently skip all files
+  );
+
+  EXECUTE IMMEDIATE insert_query;
+END;
diff --git a/cloudbuild/e2e-tests-cloudbuild.yaml b/cloudbuild/e2e-tests-cloudbuild.yaml
@@ -35,6 +35,8 @@ steps:
         ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb || { echo "generate-ssh-key" >> /workspace/FAILED; exit 1; }
         # Save the public key content to a file for the cleanup step
         cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
+        # Add key to OS Login with a 1h TTL; on failure write the same FAILED marker as the ssh-keygen call in this step
+        gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=1h || { echo "generate-ssh-key" >> /workspace/FAILED; exit 1; }
     waitFor: ["-"]
     allowFailure: true