From 374180adc2e8f60488e4547d2a655bff5526aad5 Mon Sep 17 00:00:00 2001 From: Gabriel Mechali Date: Thu, 14 May 2026 15:57:17 -0400 Subject: [PATCH 1/3] Makes the cloudbuild optionally deploy. Adds the indices to the schema. --- .../workflow/ingestion-helper/cloudbuild.yaml | 31 +++++++++++++++++-- .../workflow/ingestion-helper/pyproject.toml | 1 + .../workflow/ingestion-helper/schema.sql | 4 +++ .../ingestion-helper/spanner_client.py | 9 ++++-- .../ingestion-helper/spanner_client_test.py | 2 ++ 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/import-automation/workflow/ingestion-helper/cloudbuild.yaml b/import-automation/workflow/ingestion-helper/cloudbuild.yaml index 632b3bf185..9d7c88aa37 100644 --- a/import-automation/workflow/ingestion-helper/cloudbuild.yaml +++ b/import-automation/workflow/ingestion-helper/cloudbuild.yaml @@ -15,17 +15,44 @@ steps: # Build the container image - name: 'gcr.io/cloud-builders/docker' - args: ['build', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:latest', '.'] + args: ['build', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '.'] + dir: 'import-automation/workflow/ingestion-helper' # Push the container image - name: 'gcr.io/cloud-builders/docker' args: ['push', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}'] + # Deploy container image to Cloud Run + - name: 'gcr.io/cloud-builders/gcloud' + entrypoint: 'bash' + args: + - '-c' + - | + if [ "${_DEPLOY}" = "true" ]; then + gcloud run deploy ${_SERVICE_NAME} \ + --image ${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG} \ + --region ${_LOCATION} \ + --project ${_PROJECT_ID} \ + --no-allow-unauthenticated \ + --set-env-vars PROJECT_ID=${_PROJECT_ID},LOCATION=${_LOCATION},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID},SPANNER_GRAPH_DATABASE_ID=${_SPANNER_GRAPH_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},BQ_DATASET_ID=${_BQ_DATASET_ID} + else + echo "Skipping deployment 
because DEPLOY is false" + fi + substitutions: + _DEPLOY: 'false' _AR_REPO_URL: 'us-docker.pkg.dev/datcom-ci/gcr.io' + _SERVICE_NAME: 'spanner-ingestion-helper' _IMAGE_NAME: 'datacommons-ingestion-helper' _TAG: 'latest' + _PROJECT_ID: 'datcom-ci' + _LOCATION: 'us-central1' + _SPANNER_PROJECT_ID: 'datcom-ci' + _SPANNER_INSTANCE_ID: 'datcom-spanner-test' + _SPANNER_DATABASE_ID: 'dc-test-db' + _SPANNER_GRAPH_DATABASE_ID: 'dc-test-db' + _GCS_BUCKET_ID: 'datcom-ci-test' + _BQ_DATASET_ID: 'datacommons' images: - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' - - '${_AR_REPO_URL}/${_IMAGE_NAME}:latest' diff --git a/import-automation/workflow/ingestion-helper/pyproject.toml b/import-automation/workflow/ingestion-helper/pyproject.toml index 42ceb6ca3a..9fafa50fed 100644 --- a/import-automation/workflow/ingestion-helper/pyproject.toml +++ b/import-automation/workflow/ingestion-helper/pyproject.toml @@ -19,6 +19,7 @@ build-backend = "hatchling.build" [project] name = "ingestion-helper" version = "0.1.0" +requires-python = ">=3.12" description = "Ingestion helper for Data Commons Spanner ingestion" dependencies = [ "functions-framework==3.*", diff --git a/import-automation/workflow/ingestion-helper/schema.sql b/import-automation/workflow/ingestion-helper/schema.sql index 432dd84bf6..45ddd4f748 100644 --- a/import-automation/workflow/ingestion-helper/schema.sql +++ b/import-automation/workflow/ingestion-helper/schema.sql @@ -133,4 +133,8 @@ CREATE TABLE VariableMetadata ( place_types ARRAY, ) PRIMARY KEY(variable_measured, import_name); +CREATE INDEX InEdge ON Edge(object_id, predicate, subject_id, provenance) OPTIONS ( + columnar_policy = 'enabled' +); +CREATE INDEX VariableMeasuredObservationAbout ON Observation(variable_measured, observation_about); diff --git a/import-automation/workflow/ingestion-helper/spanner_client.py b/import-automation/workflow/ingestion-helper/spanner_client.py index 2255cf1420..37589a5c0c 100644 --- 
a/import-automation/workflow/ingestion-helper/spanner_client.py +++ b/import-automation/workflow/ingestion-helper/spanner_client.py @@ -43,7 +43,10 @@ def __init__(self, """Initializes a Spanner client and connects to a specific database.""" spanner_client = spanner.Client( project=project_id, - client_options={'quota_project_id': project_id}, + client_options={ + 'quota_project_id': project_id, + 'api_endpoint': 'spanner.googleapis.com' + }, disable_builtin_metrics=True) instance = spanner_client.instance(instance_id) database = instance.database(database_id) @@ -434,7 +437,7 @@ def initialize_database(self, enable_embeddings=False): query = """ SELECT 'table' as type, table_name as name FROM information_schema.tables WHERE table_schema = '' UNION ALL - SELECT 'index' as type, index_name as name FROM information_schema.indexes WHERE table_schema = '' AND table_name = 'NodeEmbedding' + SELECT 'index' as type, index_name as name FROM information_schema.indexes WHERE table_schema = '' AND table_name IN ('NodeEmbedding', 'Edge', 'Observation') UNION ALL SELECT 'model' as type, model_name as name FROM information_schema.models WHERE model_schema = '' """ @@ -466,7 +469,7 @@ def initialize_database(self, enable_embeddings=False): "Node", "Edge", "Observation", "ImportStatus", "IngestionHistory", "ImportVersionHistory", "IngestionLock", "Cache", "VariableMetadata" ] - required_indexes = [] + required_indexes = ["InEdge", "VariableMeasuredObservationAbout"] required_models = [] if enable_embeddings: diff --git a/import-automation/workflow/ingestion-helper/spanner_client_test.py b/import-automation/workflow/ingestion-helper/spanner_client_test.py index 1b66f29a14..1a71aa48f5 100644 --- a/import-automation/workflow/ingestion-helper/spanner_client_test.py +++ b/import-automation/workflow/ingestion-helper/spanner_client_test.py @@ -41,6 +41,8 @@ def test_initialize_database_all_exist(self, mock_spanner_client): ["table", "IngestionLock"], ["table", "Cache"], ["table", 
"VariableMetadata"], ["index", "NodeEmbeddingIndex"], + ["index", "InEdge"], + ["index", "VariableMeasuredObservationAbout"], ["model", "NodeEmbeddingModel"] ] From 1d29f3e9b116e1ddd7b37d2795f5c572fff5457b Mon Sep 17 00:00:00 2001 From: Gabriel Mechali Date: Thu, 14 May 2026 16:01:37 -0400 Subject: [PATCH 2/3] Back to a simple cloudbuild, just dont always push latest tag --- .../workflow/ingestion-helper/cloudbuild.yaml | 30 +------------------ 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/import-automation/workflow/ingestion-helper/cloudbuild.yaml b/import-automation/workflow/ingestion-helper/cloudbuild.yaml index 9d7c88aa37..927fbf3847 100644 --- a/import-automation/workflow/ingestion-helper/cloudbuild.yaml +++ b/import-automation/workflow/ingestion-helper/cloudbuild.yaml @@ -16,43 +16,15 @@ steps: # Build the container image - name: 'gcr.io/cloud-builders/docker' args: ['build', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '.'] - dir: 'import-automation/workflow/ingestion-helper' # Push the container image - name: 'gcr.io/cloud-builders/docker' args: ['push', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}'] - # Deploy container image to Cloud Run - - name: 'gcr.io/cloud-builders/gcloud' - entrypoint: 'bash' - args: - - '-c' - - | - if [ "${_DEPLOY}" = "true" ]; then - gcloud run deploy ${_SERVICE_NAME} \ - --image ${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG} \ - --region ${_LOCATION} \ - --project ${_PROJECT_ID} \ - --no-allow-unauthenticated \ - --set-env-vars PROJECT_ID=${_PROJECT_ID},LOCATION=${_LOCATION},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID},SPANNER_GRAPH_DATABASE_ID=${_SPANNER_GRAPH_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},BQ_DATASET_ID=${_BQ_DATASET_ID} - else - echo "Skipping deployment because DEPLOY is false" - fi - substitutions: - _DEPLOY: 'false' _AR_REPO_URL: 'us-docker.pkg.dev/datcom-ci/gcr.io' - _SERVICE_NAME: 'spanner-ingestion-helper' 
_IMAGE_NAME: 'datacommons-ingestion-helper' _TAG: 'latest' - _PROJECT_ID: 'datcom-ci' - _LOCATION: 'us-central1' - _SPANNER_PROJECT_ID: 'datcom-ci' - _SPANNER_INSTANCE_ID: 'datcom-spanner-test' - _SPANNER_DATABASE_ID: 'dc-test-db' - _SPANNER_GRAPH_DATABASE_ID: 'dc-test-db' - _GCS_BUCKET_ID: 'datcom-ci-test' - _BQ_DATASET_ID: 'datacommons' images: - - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' + - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' \ No newline at end of file From aff2c10148058f88fa291fc0a98f81a808cde0c7 Mon Sep 17 00:00:00 2001 From: Gabriel Mechali Date: Thu, 14 May 2026 16:04:44 -0400 Subject: [PATCH 3/3] Revert the cloudbuild --- import-automation/workflow/ingestion-helper/cloudbuild.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/import-automation/workflow/ingestion-helper/cloudbuild.yaml b/import-automation/workflow/ingestion-helper/cloudbuild.yaml index 927fbf3847..42a0a222ac 100644 --- a/import-automation/workflow/ingestion-helper/cloudbuild.yaml +++ b/import-automation/workflow/ingestion-helper/cloudbuild.yaml @@ -15,7 +15,7 @@ steps: # Build the container image - name: 'gcr.io/cloud-builders/docker' - args: ['build', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '.'] + args: ['build', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:latest', '.'] # Push the container image - name: 'gcr.io/cloud-builders/docker' @@ -27,4 +27,5 @@ substitutions: _TAG: 'latest' images: - - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' \ No newline at end of file + - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' + - '${_AR_REPO_URL}/${_IMAGE_NAME}:latest' \ No newline at end of file