Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cloudbuild/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ To run this automation in your own GCP project, you need to set up at least two C
* **Configuration File**: `cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml`
* **Substitutions**:
* `_DATASET_NAME`: The name of the BigQuery dataset to store results (e.g., `gcsfs_benchmarks`).
* `_INFRA_PREFIX`: The prefix used for resources in the benchmarks run (e.g., `gcsfs-perf`). Must match the one used in the run trigger.
* **Trigger Event**: This trigger is typically scheduled to run after the benchmarks pipeline completes, or triggered manually.
9 changes: 8 additions & 1 deletion cloudbuild/benchmarks/benchmarks-cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ steps:
mkdir -p /workspace/.ssh
ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb
cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=4h
waitFor: ["-"]

# 2. Initialize shared variables.
Expand All @@ -38,14 +39,20 @@ steps:
- |
SHORT_BUILD_ID=$${BUILD_ID:0:8}
# Define shared variables
SAFE_BRANCH=$(echo "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')
SAFE_BRANCH=$(echo -n "$${BRANCH_NAME:-unknown}" | tr -c 'a-zA-Z0-9_.-' '-')

# Error out if the branch is unknown
if [ "$${SAFE_BRANCH}" = "unknown" ]; then
echo "ERROR: BRANCH_NAME is not set or evaluates to unknown. Failing the build."
exit 1
fi

# Error out if required variables are not set
if [ -z "${_INFRA_PREFIX}" ] || [ -z "${_ZONE}" ] || [ -z "${_BENCHMARK_TYPE}" ]; then
echo "ERROR: One or more required variables (_INFRA_PREFIX, _ZONE, _BENCHMARK_TYPE) are not set."
exit 1
fi

echo "export BRANCH_NAME=$${SAFE_BRANCH}" >> /workspace/build_vars.env

echo "export RUN_ID=$${BUILD_ID}" >> /workspace/build_vars.env
Expand Down
5 changes: 5 additions & 0 deletions cloudbuild/benchmarks/benchmarks-ingestion-cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ steps:
args:
- '-c'
- |
if [ -z "${_DATASET_NAME}" ] || [ -z "${_INFRA_PREFIX}" ]; then
echo "ERROR: One or more required variables (_DATASET_NAME, _INFRA_PREFIX) are not set."
exit 1
fi

bq show ${PROJECT_ID}:${_DATASET_NAME} || \
bq mk --dataset --location=${LOCATION} ${PROJECT_ID}:${_DATASET_NAME}

Expand Down
96 changes: 53 additions & 43 deletions cloudbuild/benchmarks/ingest.sql
Original file line number Diff line number Diff line change
@@ -1,47 +1,57 @@
-- 1. Variable declarations must be at the top
DECLARE alter_stmt STRING;
BEGIN
-- 1. Variable declarations must be at the top
-- alter_stmt:   dynamically built ALTER TABLE statement (step 3); executed in
--               step 4 only when it is not NULL.
-- columns_list: comma-separated, backtick-quoted list of the staging table's
--               column names (built in step 5).
-- insert_query: the full dynamic INSERT ... SELECT text that step 5 executes.
DECLARE alter_stmt STRING;
DECLARE columns_list STRING;
DECLARE insert_query STRING;

-- 2. Ensure the history table exists with metadata columns
CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
(
run_date DATE,
build_id STRING,
run_timestamp TIMESTAMP,
source_uri STRING,
branch_name STRING
)
PARTITION BY run_date;
-- 2. Ensure the history table exists with metadata columns
-- Only the five metadata columns are declared here; columns that exist in
-- `staging` but not in `history` are added afterwards by the dynamic ALTER
-- TABLE in step 3. The table is partitioned by run_date.
-- NOTE(review): @PROJECT_ID@ / @DATASET_NAME@ look like placeholders that are
-- substituted before this script is submitted -- confirm against the caller.
CREATE TABLE IF NOT EXISTS `@PROJECT_ID@.@DATASET_NAME@.history`
(
run_date DATE,
build_id STRING,
run_timestamp TIMESTAMP,
source_uri STRING,
branch_name STRING
)
PARTITION BY run_date;

-- 3. Dynamically find new columns in staging that are missing from history
SET alter_stmt = (
SELECT
CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'staging'
AND column_name NOT IN (
SELECT column_name
FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'history'
)
);
-- 3. Dynamically find new columns in staging that are missing from history
-- Builds a single statement of the form
--   ALTER TABLE `<history>` ADD COLUMN `<name>` <data_type>, ADD COLUMN ...
-- with one ADD COLUMN clause per staging column whose name does not appear
-- among the history table's columns. Names and types are read from the
-- dataset's INFORMATION_SCHEMA.COLUMNS view.
-- When no staging column is missing, the aggregate has no input rows and
-- alter_stmt is expected to end up NULL, which step 4 checks for.
SET alter_stmt = (
SELECT
CONCAT("ALTER TABLE `@PROJECT_ID@.@DATASET_NAME@.history` ",
STRING_AGG(CONCAT("ADD COLUMN `", column_name, "` ", data_type), ", "))
FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'staging'
AND column_name NOT IN (
SELECT column_name
FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'history'
)
);

-- 4. Execute the schema update only if new columns were found
IF alter_stmt IS NOT NULL THEN
EXECUTE IMMEDIATE alter_stmt;
END IF;
-- 4. Execute the schema update only if new columns were found
-- alter_stmt holds the ALTER TABLE text built in step 3; a NULL value means
-- there is nothing to add, so the dynamic statement is skipped entirely.
IF alter_stmt IS NOT NULL THEN
EXECUTE IMMEDIATE alter_stmt;
END IF;

-- 5. Perform the idempotent ingestion
INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history`
SELECT
PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8})/')) as run_date,
REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id,
PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\d{8}-\d{6})/')) as run_timestamp,
_FILE_NAME as source_uri,
REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name,
*
FROM `@PROJECT_ID@.@DATASET_NAME@.staging`
WHERE _FILE_NAME NOT IN (
SELECT DISTINCT source_uri
FROM `@PROJECT_ID@.@DATASET_NAME@.history`
);
-- 5. Perform the idempotent ingestion using dynamic SQL to match column counts.
-- The INSERT names its target columns explicitly: the five metadata columns
-- derived from _FILE_NAME, followed by every column of `staging`. Naming the
-- columns keeps the statement independent of the physical column order of
-- `history`, which changes whenever step 3 appends new columns.
SET columns_list = (
  SELECT STRING_AGG(CONCAT("`", column_name, "`"), ", " ORDER BY column_name)
  FROM `@PROJECT_ID@.@DATASET_NAME@.INFORMATION_SCHEMA.COLUMNS`
  WHERE table_name = 'staging'
);

-- STRING_AGG over zero rows yields NULL, and CONCAT returns NULL if any
-- argument is NULL, so a missing or column-less staging table would hand a
-- NULL statement to EXECUTE IMMEDIATE. Fail early with a readable message.
IF columns_list IS NULL THEN
  RAISE USING MESSAGE = 'No columns found for the staging table in INFORMATION_SCHEMA.COLUMNS; cannot build the ingestion INSERT.';
END IF;

-- Notes on the generated statement:
--   * "\\d" inside these double-quoted literals becomes "\d" in the generated
--     SQL, where it sits inside a raw string (r'...') and reaches the regex
--     engine as the digit class.
--   * run_date and run_timestamp are parsed from path segments using the
--     day-month-year formats '%d%m%Y' and '%d%m%Y-%H%M%S'.
--   * Idempotency: files whose _FILE_NAME is already recorded as a source_uri
--     in `history` are skipped. NULL source_uri values are filtered out of the
--     subquery because `x NOT IN (..., NULL)` never evaluates to TRUE, so a
--     single NULL row in `history` would silently block all future ingestion.
SET insert_query = CONCAT(
  "INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history` (run_date, build_id, run_timestamp, source_uri, branch_name, ",
  columns_list,
  ") SELECT PARSE_DATE('%d%m%Y', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8})/')) as run_date, ",
  "REGEXP_EXTRACT(_FILE_NAME, r'/([0-9a-fA-F-]{36})/') as build_id, ",
  "PARSE_TIMESTAMP('%d%m%Y-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8}-\\d{6})/')) as run_timestamp, ",
  "_FILE_NAME as source_uri, ",
  "REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name, ",
  columns_list,
  " FROM `@PROJECT_ID@.@DATASET_NAME@.staging` WHERE _FILE_NAME NOT IN (SELECT DISTINCT source_uri FROM `@PROJECT_ID@.@DATASET_NAME@.history` WHERE source_uri IS NOT NULL)"
);

EXECUTE IMMEDIATE insert_query;
END;
2 changes: 2 additions & 0 deletions cloudbuild/e2e-tests-cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ steps:
ssh-keygen -t rsa -f /workspace/.ssh/google_compute_engine -N '' -C gcb || { echo "generate-ssh-key" >> /workspace/FAILED; exit 1; }
# Save the public key content to a file for the cleanup step
cat /workspace/.ssh/google_compute_engine.pub > /workspace/gcb_ssh_key.pub
# Add key to OS Login with TTL
gcloud compute os-login ssh-keys add --key-file=/workspace/gcb_ssh_key.pub --ttl=1h
waitFor: ["-"]
allowFailure: true

Expand Down
Loading