diff --git a/import-automation/workflow/ingestion-helper/cloudbuild.yaml b/import-automation/workflow/ingestion-helper/cloudbuild.yaml index 632b3bf185..42a0a222ac 100644 --- a/import-automation/workflow/ingestion-helper/cloudbuild.yaml +++ b/import-automation/workflow/ingestion-helper/cloudbuild.yaml @@ -28,4 +28,4 @@ substitutions: images: - '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}' - - '${_AR_REPO_URL}/${_IMAGE_NAME}:latest' + - '${_AR_REPO_URL}/${_IMAGE_NAME}:latest' \ No newline at end of file diff --git a/import-automation/workflow/ingestion-helper/pyproject.toml b/import-automation/workflow/ingestion-helper/pyproject.toml index 42ceb6ca3a..9fafa50fed 100644 --- a/import-automation/workflow/ingestion-helper/pyproject.toml +++ b/import-automation/workflow/ingestion-helper/pyproject.toml @@ -19,6 +19,7 @@ build-backend = "hatchling.build" [project] name = "ingestion-helper" version = "0.1.0" +requires-python = ">=3.12" description = "Ingestion helper for Data Commons Spanner ingestion" dependencies = [ "functions-framework==3.*", diff --git a/import-automation/workflow/ingestion-helper/schema.sql b/import-automation/workflow/ingestion-helper/schema.sql index 432dd84bf6..45ddd4f748 100644 --- a/import-automation/workflow/ingestion-helper/schema.sql +++ b/import-automation/workflow/ingestion-helper/schema.sql @@ -133,4 +133,8 @@ CREATE TABLE VariableMetadata ( place_types ARRAY, ) PRIMARY KEY(variable_measured, import_name); +CREATE INDEX InEdge ON Edge(object_id, predicate, subject_id, provenance) OPTIONS ( + columnar_policy = 'enabled' +); +CREATE INDEX VariableMeasuredObservationAbout ON Observation(variable_measured, observation_about); diff --git a/import-automation/workflow/ingestion-helper/spanner_client.py b/import-automation/workflow/ingestion-helper/spanner_client.py index 2255cf1420..37589a5c0c 100644 --- a/import-automation/workflow/ingestion-helper/spanner_client.py +++ b/import-automation/workflow/ingestion-helper/spanner_client.py @@ -43,7 +43,10 @@ def __init__(self, """Initializes a Spanner client and connects to a specific database.""" spanner_client = spanner.Client( project=project_id, - client_options={'quota_project_id': project_id}, + client_options={ + 'quota_project_id': project_id, + 'api_endpoint': 'spanner.googleapis.com' + }, disable_builtin_metrics=True) instance = spanner_client.instance(instance_id) database = instance.database(database_id) @@ -434,7 +437,7 @@ def initialize_database(self, enable_embeddings=False): query = """ SELECT 'table' as type, table_name as name FROM information_schema.tables WHERE table_schema = '' UNION ALL - SELECT 'index' as type, index_name as name FROM information_schema.indexes WHERE table_schema = '' AND table_name = 'NodeEmbedding' + SELECT 'index' as type, index_name as name FROM information_schema.indexes WHERE table_schema = '' AND table_name IN ('NodeEmbedding', 'Edge', 'Observation') UNION ALL SELECT 'model' as type, model_name as name FROM information_schema.models WHERE model_schema = '' """ @@ -466,7 +469,7 @@ def initialize_database(self, enable_embeddings=False): "Node", "Edge", "Observation", "ImportStatus", "IngestionHistory", "ImportVersionHistory", "IngestionLock", "Cache", "VariableMetadata" ] - required_indexes = [] + required_indexes = ["InEdge", "VariableMeasuredObservationAbout"] required_models = [] if enable_embeddings: diff --git a/import-automation/workflow/ingestion-helper/spanner_client_test.py b/import-automation/workflow/ingestion-helper/spanner_client_test.py index 1b66f29a14..1a71aa48f5 100644 --- a/import-automation/workflow/ingestion-helper/spanner_client_test.py +++ b/import-automation/workflow/ingestion-helper/spanner_client_test.py @@ -41,6 +41,8 @@ def test_initialize_database_all_exist(self, mock_spanner_client): ["table", "IngestionLock"], ["table", "Cache"], ["table", "VariableMetadata"], ["index", "NodeEmbeddingIndex"], + ["index", "InEdge"], + ["index", "VariableMeasuredObservationAbout"], ["model", "NodeEmbeddingModel"] ]