NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Whitespace on context and '-' lines is
reconstructed; regenerate against the real tree (git diff) before applying.

diff --git a/cloudbuild/benchmarks/README.md b/cloudbuild/benchmarks/README.md
index 0bff3ffa..31d3a8e0 100644
--- a/cloudbuild/benchmarks/README.md
+++ b/cloudbuild/benchmarks/README.md
@@ -54,4 +54,5 @@ To run this automation in your own GCP project, you need to set up atleast two C
     * **Configuration File**: `cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml`
     * **Substitutions**:
         * `_DATASET_NAME`: The name of the BigQuery dataset to store results (e.g., `gcsfs_benchmarks`).
+        * `_INFRA_PREFIX`: The prefix used for resources in the benchmarks run (e.g., `gcsfs-perf`). Must match the one used in the run trigger.
     * **Trigger Event**: This trigger is typically scheduled to run after the benchmarks pipeline completes, or triggered manually.
diff --git a/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml b/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml
index 51c41a8b..9e107237 100644
--- a/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml
+++ b/cloudbuild/benchmarks/benchmarks-cloudbuild.yaml
@@ -24,6 +24,7 @@ steps:
         mkdir -p /workspace/.ssh
         ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb
         cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
+        gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=4h
     waitFor: ["-"]
 
   # 2. Initialize shared variables.
@@ -38,7 +39,7 @@ steps:
       - |
         SHORT_BUILD_ID=$${BUILD_ID:0:8}
         # Define shared variables
-        SAFE_BRANCH=$(echo "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')
+        SAFE_BRANCH=$(echo -n "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')
 
         # Error out if the branch is unknown
         if [ "$${SAFE_BRANCH}" = "unknown" ]; then
@@ -46,6 +47,12 @@ steps:
           exit 1
         fi
 
+        # Error out if required variables are not set
+        if [ -z "${_INFRA_PREFIX}" ] || [ -z "${_ZONE}" ] || [ -z "${_BENCHMARK_TYPE}" ]; then
+          echo "ERROR: One or more required variables (_INFRA_PREFIX, _ZONE, _BENCHMARK_TYPE) are not set."
+          exit 1
+        fi
+
         echo "export BRANCH_NAME=$${SAFE_BRANCH}" >> /workspace/build_vars.env
         echo "export RUN_ID=$${BUILD_ID}" >> /workspace/build_vars.env
 
diff --git a/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml b/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml
index bf7adc4e..8944c333 100644
--- a/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml
+++ b/cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml
@@ -11,6 +11,11 @@ steps:
     args:
       - '-c'
       - |
+        if [ -z "${_DATASET_NAME}" ] || [ -z "${_INFRA_PREFIX}" ]; then
+          echo "ERROR: One or more required variables (_DATASET_NAME, _INFRA_PREFIX) are not set."
+          exit 1
+        fi
+
         bq show ${PROJECT_ID}:${_DATASET_NAME} || \
           bq mk --dataset --location=${LOCATION} ${PROJECT_ID}:${_DATASET_NAME}
 
diff --git a/cloudbuild/benchmarks/ingest.sql b/cloudbuild/benchmarks/ingest.sql
index bee31001..f6f7dc2c 100644
--- a/cloudbuild/benchmarks/ingest.sql
+++ b/cloudbuild/benchmarks/ingest.sql
@@ -1,47 +1,64 @@
--- 1. Variable declarations must be at the top
-DECLARE alter_stmt STRING;
+BEGIN
+  -- 1. Variable declarations must be at the top
+  DECLARE alter_stmt STRING;
+  DECLARE columns_list STRING;
+  DECLARE insert_query STRING;
 
--- 2. Ensure the history table exists with metadata columns
-CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
-(
-  run_date DATE,
-  build_id STRING,
-  run_timestamp TIMESTAMP,
-  source_uri STRING,
-  branch_name STRING
-)
-PARTITION BY run_date;
+  -- 2. Ensure the history table exists with metadata columns
+  CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
+  (
+    run_date DATE,
+    build_id STRING,
+    run_timestamp TIMESTAMP,
+    source_uri STRING,
+    branch_name STRING
+  )
+  PARTITION BY run_date;
 
--- 3. Dynamically find new columns in staging that are missing from history
-SET alter_stmt = (
-  SELECT
-    CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
-           STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
-  FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
-  WHERE table_name = 'staging'
-    AND column_name NOT IN (
-      SELECT column_name
-      FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
-      WHERE table_name = 'history'
-    )
-);
+  -- 3. Dynamically find new columns in staging that are missing from history
+  SET alter_stmt = (
+    SELECT
+      CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
+             STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
+    FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+    WHERE table_name = 'staging'
+      AND column_name NOT IN (
+        SELECT column_name
+        FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+        WHERE table_name = 'history'
+      )
+  );
 
--- 4. Execute the schema update only if new columns were found
-IF alter_stmt IS NOT NULL THEN
-  EXECUTE IMMEDIATE alter_stmt;
-END IF;
+  -- 4. Execute the schema update only if new columns were found
+  IF alter_stmt IS NOT NULL THEN
+    EXECUTE IMMEDIATE alter_stmt;
+  END IF;
 
--- 5. Perform the idempotent ingestion
-INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history`
-SELECT
-  PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8})/')) as run_date,
-  REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id,
-  PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8}-\d{6})/')) as run_timestamp,
-  _FILE_NAME as source_uri,
-  REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name,
-  *
-FROM `@PROJECT_ID@.@DATASET_NAME@.staging`
-WHERE _FILE_NAME NOT IN (
-  SELECT DISTINCT source_uri
-  FROM `@PROJECT_ID@.@DATASET_NAME@.history`
-);
+  -- 5. Perform the idempotent ingestion using dynamic SQL to match column counts
+  SET columns_list = (
+    SELECT STRING_AGG(CONCAT("`", column_name, "`"), ", " ORDER BY column_name)
+    FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
+    WHERE table_name = 'staging'
+  );
+
+  -- Fail fast with a clear message: STRING_AGG over zero rows is NULL, and a
+  -- NULL argument makes the CONCAT below return NULL instead of a statement.
+  IF columns_list IS NULL THEN
+    RAISE USING MESSAGE = 'No columns found for table staging in @PROJECT_ID@.@DATASET_NAME@; cannot build the ingestion statement.';
+  END IF;
+
+  SET insert_query = CONCAT(
+    "INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history` (run_date, build_id, run_timestamp, source_uri, branch_name, ",
+    columns_list,
+    ") SELECT PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8})/')) as run_date, ",
+    "REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id, ",
+    "PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8}-\\d{6})/')) as run_timestamp, ",
+    "_FILE_NAME as source_uri, ",
+    "REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name, ",
+    columns_list,
+    -- The IS NOT NULL filter keeps a single NULL source_uri in history from making NOT IN reject every file.
+    " FROM `@PROJECT_ID@.@DATASET_NAME@.staging` WHERE _FILE_NAME NOT IN (SELECT DISTINCT source_uri FROM `@PROJECT_ID@.@DATASET_NAME@.history` WHERE source_uri IS NOT NULL)"
+  );
+
+  EXECUTE IMMEDIATE insert_query;
+END;
diff --git a/cloudbuild/e2e-tests-cloudbuild.yaml b/cloudbuild/e2e-tests-cloudbuild.yaml
index 0e0c13b7..a8b6b90a 100644
--- a/cloudbuild/e2e-tests-cloudbuild.yaml
+++ b/cloudbuild/e2e-tests-cloudbuild.yaml
@@ -35,6 +35,8 @@ steps:
         ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb || { echo "generate-ssh-key" >> /workspace/FAILED; exit 1; }
         # Save the public key content to a file for the cleanup step
         cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
+        # Add key to OS Login with TTL; record a failure marker like ssh-keygen above, since allowFailure hides the exit code
+        gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=1h || { echo "generate-ssh-key" >> /workspace/FAILED; exit 1; }
     waitFor: ["-"]
     allowFailure: true
 