Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions data/zips/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/gmsa_discover_synthetic_qa.jsonl.zip
/gmsa_discover_synthetic_embedding.jsonl.zip
78 changes: 78 additions & 0 deletions pipelines/data-prep/dvc.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
schema: '2.0'
stages:
prepare_data_for_upload:
cmd: uv run gsma datasets create-from-validation --input data/discover/validation/validation_results.parquet
--enriched-chunks data/discover/filters/enriched_chunks_with_filter.parquet
--embedding-output data/discover/validation/validation_dataset_embedding --embedding-jsonl-output
data/discover/validation/validation_dataset_embedding.jsonl --qa-output data/discover/validation/validation_dataset_qa
--qa-jsonl-output data/discover/validation/validation_dataset_qa.jsonl --max-positives
3 --max-negatives 3 --metrics-output metrics/discover/dataset_creation_from_validation.json
--logger-level INFO
deps:
- path: data/discover/filters/enriched_chunks_with_filter.parquet
hash: md5
md5: 8fb7051b868a71d6eaa8a1820dcaec41
size: 136926678
- path: data/discover/validation/validation_results.parquet
hash: md5
md5: 87c5dbd92edda38967f4f32b4ddef5ae
size: 10544504
- path: gsma_dataset_creation/datasets/validation_dataset_creator.py
hash: md5
md5: d62939f389417fe2ee05849d35b40502
size: 42867
- path: gsma_dataset_creation/datasets_cli.py
hash: md5
md5: 6f5f3f0dd5c1ac288e7ff43650df50e4
size: 10247
outs:
- path: data/discover/validation/validation_dataset_embedding
hash: md5
md5: 0e58227ed0d2b304a4fb85a3100cb1a1.dir
size: 191089429
nfiles: 3
- path: data/discover/validation/validation_dataset_embedding.jsonl
hash: md5
md5: 534aed0f03548530daebe55e4693f541
size: 196416259
- path: data/discover/validation/validation_dataset_qa
hash: md5
md5: 4e0582e6ec02abf46377f74e502d080c.dir
size: 30333918
nfiles: 3
- path: data/discover/validation/validation_dataset_qa.jsonl
hash: md5
md5: c907d4c586e53a608b6e467242458529
size: 29981923
- path: metrics/discover/dataset_creation_from_validation.json
hash: md5
md5: 723f3c42c9cdad7b6d7c97000f9adb0f
size: 1930
zip_discover_qa:
cmd: cp data/discover/validation/validation_dataset_qa.jsonl data/zips/gmsa_discover_synthetic_qa.jsonl
&& zip -j data/zips/gmsa_discover_synthetic_qa.jsonl.zip data/zips/gmsa_discover_synthetic_qa.jsonl
&& rm data/zips/gmsa_discover_synthetic_qa.jsonl
deps:
- path: data/discover/validation/validation_dataset_qa.jsonl
hash: md5
md5: c907d4c586e53a608b6e467242458529
size: 29981923
outs:
- path: data/zips/gmsa_discover_synthetic_qa.jsonl.zip
hash: md5
md5: 4fd3f76e869fe75b8457bdc535ce0e69
size: 3314081
zip_discover_embedding:
cmd: cp data/discover/validation/validation_dataset_embedding.jsonl data/zips/gmsa_discover_synthetic_embedding.jsonl
&& zip -j data/zips/gmsa_discover_synthetic_embedding.jsonl.zip data/zips/gmsa_discover_synthetic_embedding.jsonl
&& rm data/zips/gmsa_discover_synthetic_embedding.jsonl
deps:
- path: data/discover/validation/validation_dataset_embedding.jsonl
hash: md5
md5: 534aed0f03548530daebe55e4693f541
size: 196416259
outs:
- path: data/zips/gmsa_discover_synthetic_embedding.jsonl.zip
hash: md5
md5: d297971698745d9e1b0319c21300d74b
size: 33655312
26 changes: 26 additions & 0 deletions pipelines/data-prep/dvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# DVC pipeline that packages the Discover validation JSONL exports as zip
# archives under data/zips.

# Path prefixes substituted into the stages below via ${...} templating.
vars:
  - discover_prefix: data/discover/validation
  # NOTE(review): prd_prefix is not referenced by any stage in this file;
  # confirm whether it is reserved for stages that are still to be added.
  - prd_prefix: data/validation
  - zip_prefix: data/zips

# NOTE(review): the archive names are spelled "gmsa" while the CLI and the
# Python package visible in dvc.lock are spelled "gsma". The names are kept
# as-is because .gitignore and dvc.lock reference them; confirm whether the
# spelling is intentional.
stages:
  # Each stage follows the same three steps:
  #   1. copy the source JSONL to the name it should carry inside the archive,
  #   2. zip that copy with -j so no directory components are stored,
  #   3. delete the temporary copy, leaving only the .zip as the stage output.
  # wdir is two levels above this file's directory, so every path in cmd,
  # deps and outs is written relative to that directory.
  zip_discover_qa:
    wdir: ../..
    cmd: >-
      cp ${discover_prefix}/validation_dataset_qa.jsonl ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl
      && zip -j ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl
      && rm ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl
    deps:
      - ${discover_prefix}/validation_dataset_qa.jsonl
    outs:
      - ${zip_prefix}/gmsa_discover_synthetic_qa.jsonl.zip

  zip_discover_embedding:
    wdir: ../..
    cmd: >-
      cp ${discover_prefix}/validation_dataset_embedding.jsonl ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl
      && zip -j ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl
      && rm ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl
    deps:
      - ${discover_prefix}/validation_dataset_embedding.jsonl
    outs:
      - ${zip_prefix}/gmsa_discover_synthetic_embedding.jsonl.zip