From a1b6385b34aa1e7a9fa1edb5a6fe39f5ce1f34d6 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 11 Nov 2025 19:02:52 +0100 Subject: [PATCH 1/2] feat: add data-prep pipeline for zipping validation datasets Add DVC pipeline to prepare validation datasets for distribution: - zip_discover_qa: packages discover QA validation dataset (6,229 lines) - zip_discover_embedding: packages discover embedding validation dataset (6,222 lines) Both stages copy from data/discover/validation to data/zips and create compressed archives. --- data/zips/.gitignore | 2 + pipelines/data-prep/dvc.lock | 76 ++++++++++++++++++++++++++++++++++++ pipelines/data-prep/dvc.yaml | 24 ++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 data/zips/.gitignore create mode 100644 pipelines/data-prep/dvc.lock create mode 100644 pipelines/data-prep/dvc.yaml diff --git a/data/zips/.gitignore b/data/zips/.gitignore new file mode 100644 index 0000000..d63bced --- /dev/null +++ b/data/zips/.gitignore @@ -0,0 +1,2 @@ +/gmsa_discover_synthetic_qa.jsonl.zip +/gmsa_discover_synthetic_embedding.jsonl.zip diff --git a/pipelines/data-prep/dvc.lock b/pipelines/data-prep/dvc.lock new file mode 100644 index 0000000..99e82c6 --- /dev/null +++ b/pipelines/data-prep/dvc.lock @@ -0,0 +1,76 @@ +schema: '2.0' +stages: + prepare_data_for_upload: + cmd: uv run gsma datasets create-from-validation --input data/discover/validation/validation_results.parquet + --enriched-chunks data/discover/filters/enriched_chunks_with_filter.parquet + --embedding-output data/discover/validation/validation_dataset_embedding --embedding-jsonl-output + data/discover/validation/validation_dataset_embedding.jsonl --qa-output data/discover/validation/validation_dataset_qa + --qa-jsonl-output data/discover/validation/validation_dataset_qa.jsonl --max-positives + 3 --max-negatives 3 --metrics-output metrics/discover/dataset_creation_from_validation.json + --logger-level INFO + deps: + - path: data/discover/filters/enriched_chunks_with_filter.parquet + hash: md5 + md5: 8fb7051b868a71d6eaa8a1820dcaec41 + size: 136926678 + - path: data/discover/validation/validation_results.parquet + hash: md5 + md5: 87c5dbd92edda38967f4f32b4ddef5ae + size: 10544504 + - path: gsma_dataset_creation/datasets/validation_dataset_creator.py + hash: md5 + md5: d62939f389417fe2ee05849d35b40502 + size: 42867 + - path: gsma_dataset_creation/datasets_cli.py + hash: md5 + md5: 6f5f3f0dd5c1ac288e7ff43650df50e4 + size: 10247 + outs: + - path: data/discover/validation/validation_dataset_embedding + hash: md5 + md5: 0e58227ed0d2b304a4fb85a3100cb1a1.dir + size: 191089429 + nfiles: 3 + - path: data/discover/validation/validation_dataset_embedding.jsonl + hash: md5 + md5: 534aed0f03548530daebe55e4693f541 + size: 196416259 + - path: data/discover/validation/validation_dataset_qa + hash: md5 + md5: 4e0582e6ec02abf46377f74e502d080c.dir + size: 30333918 + nfiles: 3 + - path: data/discover/validation/validation_dataset_qa.jsonl + hash: md5 + md5: c907d4c586e53a608b6e467242458529 + size: 29981923 + - path: metrics/discover/dataset_creation_from_validation.json + hash: md5 + md5: 723f3c42c9cdad7b6d7c97000f9adb0f + size: 1930 + zip_discover_qa: + cmd: cp data/discover/validation/validation_dataset_qa.jsonl data/zips/gmsa_discover_synthetic_qa.jsonl + && zip -r data/zips/gmsa_discover_synthetic_qa.jsonl.zip data/zips/gmsa_discover_synthetic_qa.jsonl + deps: + - path: data/discover/validation/validation_dataset_qa.jsonl + hash: md5 + md5: c907d4c586e53a608b6e467242458529 + size: 29981923 + outs: + - path: data/zips/gmsa_discover_synthetic_qa.jsonl.zip + hash: md5 + md5: 6da40e1583302ae0fbd20eadaccd140c + size: 3314101 + zip_discover_embedding: + cmd: cp data/discover/validation/validation_dataset_embedding.jsonl data/zips/gmsa_discover_synthetic_embedding.jsonl + && zip -r data/zips/gmsa_discover_synthetic_embedding.jsonl.zip data/zips/gmsa_discover_synthetic_embedding.jsonl + deps: + - path: data/discover/validation/validation_dataset_embedding.jsonl + hash: md5 + md5: 534aed0f03548530daebe55e4693f541 + size: 196416259 + outs: + - path: data/zips/gmsa_discover_synthetic_embedding.jsonl.zip + hash: md5 + md5: 3589c0a3fb64c513d32c46c7c9abc43b + size: 33655332 diff --git a/pipelines/data-prep/dvc.yaml b/pipelines/data-prep/dvc.yaml new file mode 100644 index 0000000..1e1c049 --- /dev/null +++ b/pipelines/data-prep/dvc.yaml @@ -0,0 +1,24 @@ +vars: + - discover_prefix: data/discover/validation + - prd_prefix: data/validation + - zip_prefix: data/zips + +stages: + zip_discover_qa: + wdir: ../.. + cmd: >- + cp ${discover_prefix}/validation_dataset_qa.jsonl ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl && + zip -r ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl + deps: + - ${discover_prefix}/validation_dataset_qa.jsonl + outs: + - ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip + zip_discover_embedding: + wdir: ../.. + cmd: >- + cp ${discover_prefix}/validation_dataset_embedding.jsonl ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl && + zip -r ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl + deps: + - ${discover_prefix}/validation_dataset_embedding.jsonl + outs: + - ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip From 9f1ad432ae3bf06baac331341749e35317c70021 Mon Sep 17 00:00:00 2001 From: Matthew Upson Date: Tue, 11 Nov 2025 19:11:16 +0100 Subject: [PATCH 2/2] refactor: use zip -j flag and cleanup intermediate files - Replace -r with -j flag to exclude directory paths from zip archives - Add cleanup step to remove intermediate .jsonl files after zipping - Prevents read-only permission issues on subsequent runs Zip files now contain only the filename without data/zips/ path prefix. --- pipelines/data-prep/dvc.lock | 14 ++++++++------ pipelines/data-prep/dvc.yaml | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pipelines/data-prep/dvc.lock b/pipelines/data-prep/dvc.lock index 99e82c6..f896e49 100644 --- a/pipelines/data-prep/dvc.lock +++ b/pipelines/data-prep/dvc.lock @@ -50,7 +50,8 @@ stages: size: 1930 zip_discover_qa: cmd: cp data/discover/validation/validation_dataset_qa.jsonl data/zips/gmsa_discover_synthetic_qa.jsonl - && zip -r data/zips/gmsa_discover_synthetic_qa.jsonl.zip data/zips/gmsa_discover_synthetic_qa.jsonl + && zip -j data/zips/gmsa_discover_synthetic_qa.jsonl.zip data/zips/gmsa_discover_synthetic_qa.jsonl + && rm data/zips/gmsa_discover_synthetic_qa.jsonl deps: - path: data/discover/validation/validation_dataset_qa.jsonl hash: md5 @@ -59,11 +60,12 @@ stages: outs: - path: data/zips/gmsa_discover_synthetic_qa.jsonl.zip hash: md5 - md5: 6da40e1583302ae0fbd20eadaccd140c - size: 3314101 + md5: 4fd3f76e869fe75b8457bdc535ce0e69 + size: 3314081 zip_discover_embedding: cmd: cp data/discover/validation/validation_dataset_embedding.jsonl data/zips/gmsa_discover_synthetic_embedding.jsonl - && zip -r data/zips/gmsa_discover_synthetic_embedding.jsonl.zip data/zips/gmsa_discover_synthetic_embedding.jsonl + && zip -j data/zips/gmsa_discover_synthetic_embedding.jsonl.zip data/zips/gmsa_discover_synthetic_embedding.jsonl + && rm data/zips/gmsa_discover_synthetic_embedding.jsonl deps: - path: data/discover/validation/validation_dataset_embedding.jsonl hash: md5 @@ -72,5 +74,5 @@ stages: outs: - path: data/zips/gmsa_discover_synthetic_embedding.jsonl.zip hash: md5 - md5: 3589c0a3fb64c513d32c46c7c9abc43b - size: 33655332 + md5: d297971698745d9e1b0319c21300d74b + size: 33655312 diff --git a/pipelines/data-prep/dvc.yaml b/pipelines/data-prep/dvc.yaml index 1e1c049..77c2c89 100644 --- a/pipelines/data-prep/dvc.yaml +++ b/pipelines/data-prep/dvc.yaml @@ -8,7 +8,8 @@ stages: wdir: ../.. cmd: >- cp ${discover_prefix}/validation_dataset_qa.jsonl ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl && - zip -r ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl + zip -j ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl && + rm ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl deps: - ${discover_prefix}/validation_dataset_qa.jsonl outs: @@ -17,7 +18,8 @@ stages: wdir: ../.. cmd: >- cp ${discover_prefix}/validation_dataset_embedding.jsonl ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl && - zip -r ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl + zip -j ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl && + rm ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl deps: - ${discover_prefix}/validation_dataset_embedding.jsonl outs: