# Patch overview (commentary before the first "diff --git" header is not part
# of any hunk).
#
# Adds three new files:
#
#   data/zips/.gitignore
#       Ignores the two .zip archives that the pipeline below writes into
#       data/zips.
#
#   pipelines/data-prep/dvc.lock
#       DVC lock file recording the command plus md5/size of every dependency
#       and output for three stages: prepare_data_for_upload, zip_discover_qa
#       and zip_discover_embedding. The two zip stages depend on the .jsonl
#       outputs of prepare_data_for_upload (same paths, same md5 values).
#
#   pipelines/data-prep/dvc.yaml
#       Declares three path-prefix vars and two stages. Each stage sets
#       wdir to ../.. and then:
#         1. copies a validation .jsonl into ${zip_prefix} under a
#            "gmsa_discover_synthetic_*" name,
#         2. zips that copy with `zip -j` (store the file name only, no
#            directory components),
#         3. removes the temporary copy, leaving only the .zip as the output.
#       The expanded commands are the ones recorded under "cmd" in dvc.lock.
#
# NOTE(review): dvc.lock contains a prepare_data_for_upload stage, but the
#   dvc.yaml added by this patch defines only the two zip stages. Confirm
#   whether that stage definition is missing here or the lock entry is stale.
# NOTE(review): the prd_prefix var is declared but not referenced by any
#   stage in this patch. Confirm it is intended for later stages.
# NOTE(review): archive names are spelled "gmsa_..." while the CLI and package
#   are spelled "gsma". Possibly a transposition; confirm before publishing.
#   Renaming would presumably change the archive contents and therefore the
#   md5 values recorded in dvc.lock.
# NOTE(review): leading indentation inside the hunks follows the standard
#   DVC layout for dvc.lock / dvc.yaml; verify against the working tree.
diff --git a/data/zips/.gitignore b/data/zips/.gitignore
new file mode 100644
index 0000000..d63bced
--- /dev/null
+++ b/data/zips/.gitignore
@@ -0,0 +1,2 @@
+/gmsa_discover_synthetic_qa.jsonl.zip
+/gmsa_discover_synthetic_embedding.jsonl.zip
diff --git a/pipelines/data-prep/dvc.lock b/pipelines/data-prep/dvc.lock
new file mode 100644
index 0000000..f896e49
--- /dev/null
+++ b/pipelines/data-prep/dvc.lock
@@ -0,0 +1,78 @@
+schema: '2.0'
+stages:
+  prepare_data_for_upload:
+    cmd: uv run gsma datasets create-from-validation --input data/discover/validation/validation_results.parquet
+      --enriched-chunks data/discover/filters/enriched_chunks_with_filter.parquet
+      --embedding-output data/discover/validation/validation_dataset_embedding --embedding-jsonl-output
+      data/discover/validation/validation_dataset_embedding.jsonl --qa-output data/discover/validation/validation_dataset_qa
+      --qa-jsonl-output data/discover/validation/validation_dataset_qa.jsonl --max-positives
+      3 --max-negatives 3 --metrics-output metrics/discover/dataset_creation_from_validation.json
+      --logger-level INFO
+    deps:
+    - path: data/discover/filters/enriched_chunks_with_filter.parquet
+      hash: md5
+      md5: 8fb7051b868a71d6eaa8a1820dcaec41
+      size: 136926678
+    - path: data/discover/validation/validation_results.parquet
+      hash: md5
+      md5: 87c5dbd92edda38967f4f32b4ddef5ae
+      size: 10544504
+    - path: gsma_dataset_creation/datasets/validation_dataset_creator.py
+      hash: md5
+      md5: d62939f389417fe2ee05849d35b40502
+      size: 42867
+    - path: gsma_dataset_creation/datasets_cli.py
+      hash: md5
+      md5: 6f5f3f0dd5c1ac288e7ff43650df50e4
+      size: 10247
+    outs:
+    - path: data/discover/validation/validation_dataset_embedding
+      hash: md5
+      md5: 0e58227ed0d2b304a4fb85a3100cb1a1.dir
+      size: 191089429
+      nfiles: 3
+    - path: data/discover/validation/validation_dataset_embedding.jsonl
+      hash: md5
+      md5: 534aed0f03548530daebe55e4693f541
+      size: 196416259
+    - path: data/discover/validation/validation_dataset_qa
+      hash: md5
+      md5: 4e0582e6ec02abf46377f74e502d080c.dir
+      size: 30333918
+      nfiles: 3
+    - path: data/discover/validation/validation_dataset_qa.jsonl
+      hash: md5
+      md5: c907d4c586e53a608b6e467242458529
+      size: 29981923
+    - path: metrics/discover/dataset_creation_from_validation.json
+      hash: md5
+      md5: 723f3c42c9cdad7b6d7c97000f9adb0f
+      size: 1930
+  zip_discover_qa:
+    cmd: cp data/discover/validation/validation_dataset_qa.jsonl data/zips/gmsa_discover_synthetic_qa.jsonl
+      && zip -j data/zips/gmsa_discover_synthetic_qa.jsonl.zip data/zips/gmsa_discover_synthetic_qa.jsonl
+      && rm data/zips/gmsa_discover_synthetic_qa.jsonl
+    deps:
+    - path: data/discover/validation/validation_dataset_qa.jsonl
+      hash: md5
+      md5: c907d4c586e53a608b6e467242458529
+      size: 29981923
+    outs:
+    - path: data/zips/gmsa_discover_synthetic_qa.jsonl.zip
+      hash: md5
+      md5: 4fd3f76e869fe75b8457bdc535ce0e69
+      size: 3314081
+  zip_discover_embedding:
+    cmd: cp data/discover/validation/validation_dataset_embedding.jsonl data/zips/gmsa_discover_synthetic_embedding.jsonl
+      && zip -j data/zips/gmsa_discover_synthetic_embedding.jsonl.zip data/zips/gmsa_discover_synthetic_embedding.jsonl
+      && rm data/zips/gmsa_discover_synthetic_embedding.jsonl
+    deps:
+    - path: data/discover/validation/validation_dataset_embedding.jsonl
+      hash: md5
+      md5: 534aed0f03548530daebe55e4693f541
+      size: 196416259
+    outs:
+    - path: data/zips/gmsa_discover_synthetic_embedding.jsonl.zip
+      hash: md5
+      md5: d297971698745d9e1b0319c21300d74b
+      size: 33655312
diff --git a/pipelines/data-prep/dvc.yaml b/pipelines/data-prep/dvc.yaml
new file mode 100644
index 0000000..77c2c89
--- /dev/null
+++ b/pipelines/data-prep/dvc.yaml
@@ -0,0 +1,26 @@
+vars:
+  - discover_prefix: data/discover/validation
+  - prd_prefix: data/validation
+  - zip_prefix: data/zips
+
+stages:
+  zip_discover_qa:
+    wdir: ../..
+    cmd: >-
+      cp ${discover_prefix}/validation_dataset_qa.jsonl ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl &&
+      zip -j ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl &&
+      rm ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl
+    deps:
+      - ${discover_prefix}/validation_dataset_qa.jsonl
+    outs:
+      - ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip
+  zip_discover_embedding:
+    wdir: ../..
+    cmd: >-
+      cp ${discover_prefix}/validation_dataset_embedding.jsonl ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl &&
+      zip -j ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl &&
+      rm ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl
+    deps:
+      - ${discover_prefix}/validation_dataset_embedding.jsonl
+    outs:
+      - ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip