From 8a4658af922e8260c01b0da3588b27f210baa568 Mon Sep 17 00:00:00 2001 From: sindchad Date: Tue, 2 Dec 2025 13:06:22 -0500 Subject: [PATCH 01/14] remove data_extraction --- src/aixpert/data_construction/config.yaml | 4 + .../data_construction/data_final_eval.py | 126 ++++++++++ .../data_construction/data_final_train.py | 109 +++++++++ .../data_construction/data_merge_eval.py | 100 ++++++++ .../data_construction/data_merge_train.py | 104 +++++++++ .../data_construction/data_synthetic_eval.py | 167 ++++++++++++++ .../data_construction/data_synthetic_train.py | 179 +++++++++++++++ .../data_construction/data_transform_eval.py | 85 +++++++ .../data_construction/data_transform_train.py | 86 +++++++ .../data_construction/dataconversion.eval.py | 96 ++++++++ .../data_construction/dataconversion.py | 94 ++++++++ src/aixpert/data_construction/dataset_eval.py | 215 +++++++++++++++++ .../data_construction/dataset_train.py | 216 ++++++++++++++++++ src/aixpert/data_construction/utils.py | 55 +++++ 14 files changed, 1636 insertions(+) create mode 100644 src/aixpert/data_construction/config.yaml create mode 100644 src/aixpert/data_construction/data_final_eval.py create mode 100644 src/aixpert/data_construction/data_final_train.py create mode 100644 src/aixpert/data_construction/data_merge_eval.py create mode 100644 src/aixpert/data_construction/data_merge_train.py create mode 100644 src/aixpert/data_construction/data_synthetic_eval.py create mode 100644 src/aixpert/data_construction/data_synthetic_train.py create mode 100644 src/aixpert/data_construction/data_transform_eval.py create mode 100644 src/aixpert/data_construction/data_transform_train.py create mode 100644 src/aixpert/data_construction/dataconversion.eval.py create mode 100644 src/aixpert/data_construction/dataconversion.py create mode 100644 src/aixpert/data_construction/dataset_eval.py create mode 100644 src/aixpert/data_construction/dataset_train.py create mode 100644 src/aixpert/data_construction/utils.py diff --git 
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSONL file into a list of dictionaries.

    Each non-blank line of the file is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line left by an editor
    or by concatenating files) are skipped instead of aborting the whole load.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The parsed rows, in file order. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # json.loads("\n") raises, so blank lines must be filtered first.
            if not line.strip():
                continue
            rows.append(json.loads(line))
    return rows
def main() -> None:
    """Create the final evaluation dataset by merging all required sources.

    Steps:
      1. Load the synthetic inversions, the transformed Skywork eval rows, the
         full training source, and the rows already written to the train file.
      2. From the training source, collect the rows labelled (h_w, h_l) ==
         (1, 1) and (0, 0) whose (prompt, chosen, rejected) triple does not
         appear in the already-used train rows.
      3. Draw up to 1500 rows from each of those two pools. If a pool holds
         fewer rows, all of them are used and a warning is printed instead of
         letting ``random.sample`` fail with an opaque ``ValueError``.
      4. Concatenate everything, shuffle, and write it to ``OUTPUT_FILE``.

    Sampling and shuffling use the module-level ``random`` state, so the caller
    decides reproducibility by seeding before calling this function.
    """
    per_bucket = 1500  # extra rows requested from each of the two pools

    print("📥 Loading synthetic eval inversions...")
    synthetic = load_jsonl(SYNTHETIC_FILE)
    print("Synthetic:", len(synthetic))

    print("📥 Loading Skywork eval transformed...")
    sky_eval = load_jsonl(SKY_EVAL_FILE)
    print("SkyEval:", len(sky_eval))

    print("📥 Loading Skywork full training source...")
    sky_train = load_jsonl(TRAIN_SOURCE_FILE)

    print("📥 Loading TRAIN used (to exclude)...")
    train_used = load_jsonl(TRAIN_USED_FILE)

    # Tuples are hashable, so the membership test below is a set lookup.
    exclude_set = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used}

    # -----------------------------------------------------------
    # 1. Extract (1,1) and (0,0) pools from training source
    # -----------------------------------------------------------
    hw1_hl1_pool: List[Dict[str, Any]] = []
    hw0_hl0_pool: List[Dict[str, Any]] = []

    for ex in sky_train:
        key = (ex["prompt"], ex["chosen"], ex["rejected"])
        if key in exclude_set:
            continue

        if ex["h_w"] == 1 and ex["h_l"] == 1:
            hw1_hl1_pool.append(ex)
        elif ex["h_w"] == 0 and ex["h_l"] == 0:
            hw0_hl0_pool.append(ex)

    print(f"(1,1) available for eval add: {len(hw1_hl1_pool)}")
    print(f"(0,0) available for eval add: {len(hw0_hl0_pool)}")

    # -----------------------------------------------------------
    # 2. Sample up to 1500 from each bucket
    # -----------------------------------------------------------
    # The (1,1) pool is drawn first, then (0,0), so the random stream is
    # consumed in the same order as before whenever both pools are large enough.
    drawn: List[List[Dict[str, Any]]] = []
    for label, pool in (("(1,1)", hw1_hl1_pool), ("(0,0)", hw0_hl0_pool)):
        take = min(per_bucket, len(pool))
        if take < per_bucket:
            print(
                f"⚠️ {label}: only {len(pool)} unused samples available, "
                f"requested {per_bucket}; using all of them."
            )
        drawn.append(random.sample(pool, take))
    eval_hw1_hl1, eval_hw0_hl0 = drawn

    # -----------------------------------------------------------
    # 3. Merge everything
    # -----------------------------------------------------------
    merged: List[Dict[str, Any]] = []
    merged.extend(synthetic)  # synthetic inversion rows
    merged.extend(sky_eval)  # every transformed Skywork eval row, whatever its labels
    merged.extend(eval_hw1_hl1)  # (1,1) drawn from the training source
    merged.extend(eval_hw0_hl0)  # (0,0) drawn from the training source

    print(f"Total before shuffle: {len(merged)}")

    random.shuffle(merged)

    print(f"💾 Saving → {OUTPUT_FILE}")
    write_jsonl(OUTPUT_FILE, merged)

    print("✅ FINAL EVAL DATASET READY.")
    print("Total eval:", len(merged))
def main() -> None:
    """Generate the balanced training dataset according to bucket size targets.

    Rows from ``INPUT_FILE`` are grouped by their ``(h_w, h_l)`` labels (cast to
    ``int``). For every bucket listed in ``TARGET_COUNTS`` the requested number
    of rows is drawn: without replacement when the bucket is large enough,
    with replacement (after a warning) otherwise. The result is shuffled and
    written to ``OUTPUT_FILE``.

    The random generator is seeded with a fixed value so that repeated runs on
    the same input write the same file.

    Raises:
        ValueError: If a bucket with a positive target contains no rows at all,
            since nothing can be sampled from it even with replacement.
    """
    # Fixed seed: the output of this script should not change between runs.
    random.seed(42)

    print(f"📥 Loading dataset → {INPUT_FILE}")
    data = load_jsonl(INPUT_FILE)

    # One list per required bucket; rows with any other label pair are ignored.
    buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
        key: [] for key in TARGET_COUNTS
    }

    print("🔍 Bucketing samples...")
    for ex in data:
        key = (int(ex["h_w"]), int(ex["h_l"]))
        if key in buckets:
            buckets[key].append(ex)

    final_samples: List[Dict[str, Any]] = []

    for key, req_count in TARGET_COUNTS.items():
        pool = buckets[key]
        available = len(pool)
        print(f"Bucket {key}: available={available}, required={req_count}")

        if available == 0 and req_count > 0:
            # random.choices would fail here with an unhelpful IndexError.
            raise ValueError(
                f"Bucket {key} is empty in {INPUT_FILE}; "
                f"cannot sample {req_count} rows."
            )

        if available < req_count:
            print("⚠️ Not enough samples — sampling WITH replacement.")
            sampled = random.choices(pool, k=req_count)
        else:
            sampled = random.sample(pool, req_count)

        final_samples.extend(sampled)

    print(f"\n🔀 Shuffling {len(final_samples)} samples...")
    random.shuffle(final_samples)

    print(f"💾 Saving → {OUTPUT_FILE}")
    write_jsonl(OUTPUT_FILE, final_samples)

    print("✅ TRAIN balanced dataset created.")
    print("Final count:", len(final_samples))
def main() -> None:
    """Merge Skywork eval data with 400 synthetic inversions.

    Loads the synthetic rows and the transformed Skywork eval rows, groups the
    Skywork rows into the (0,0), (1,1) and (0,1) buckets by ``(h_w, h_l)``,
    concatenates synthetic rows with those three buckets, shuffles, and writes
    the result to ``OUTPUT_FILE``.

    Skywork rows whose labels fall outside those three buckets (for example
    (1,0)) are not written; their number is reported so the omission is
    visible in the run log. The shuffle is seeded with a fixed value so the
    output order is the same on every run.
    """
    print("📥 Loading synthetic eval inversions...")
    synthetic = load_jsonl(SYNTHETIC_FILE)
    print(f"Synthetic eval: {len(synthetic)}")

    print("📥 Loading Skywork eval transformed...")
    sky = load_jsonl(SKYWORK_FILE)
    print(f"Skywork eval: {len(sky)}")

    hw0_hl0: List[Dict[str, Any]] = []
    hw1_hl1: List[Dict[str, Any]] = []
    hw0_hl1: List[Dict[str, Any]] = []
    skipped = 0  # rows that match none of the three buckets

    for ex in sky:
        h_w = ex["h_w"]
        h_l = ex["h_l"]

        if h_w == 0 and h_l == 0:
            hw0_hl0.append(ex)
        elif h_w == 1 and h_l == 1:
            hw1_hl1.append(ex)
        elif h_w == 0 and h_l == 1:
            hw0_hl1.append(ex)
        else:
            skipped += 1

    print(f"(0,0): {len(hw0_hl0)}")
    print(f"(1,1): {len(hw1_hl1)}")
    print(f"(0,1): {len(hw0_hl1)}")
    if skipped:
        print(
            f"⚠️ Skipped {skipped} Skywork eval rows outside the "
            "(0,0)/(1,1)/(0,1) buckets."
        )

    merged = synthetic + hw0_hl0 + hw1_hl1 + hw0_hl1
    print(f"Total merged before shuffle: {len(merged)}")

    # Fixed seed so the written order is reproducible.
    random.seed(42)
    random.shuffle(merged)

    print(f"💾 Saving → {OUTPUT_FILE}")
    write_jsonl(OUTPUT_FILE, merged)

    print("✅ EVAL MERGE DONE.")
def main() -> None:
    """Merge Skywork train data with 10k synthetic hallucinations.

    Loads the synthetic rows and the transformed Skywork training rows, groups
    the Skywork rows into the (0,0), (1,1) and (0,1) buckets by ``(h_w, h_l)``,
    draws up to 10,000 rows from the (0,1) bucket, concatenates
    synthetic + (0,0) + (1,1) + sampled (0,1), shuffles, and writes the result
    to ``OUTPUT_FILE``.

    If the (0,1) bucket holds fewer than 10,000 rows, all of them are used and
    a warning is printed instead of letting ``random.sample`` raise. Skywork
    rows outside the three buckets are not written; their count is reported.
    """
    sample_target = 10000  # number of real (0,1) rows to keep

    print("📥 Loading synthetic inversions...")
    synthetic = load_jsonl(SYNTHETIC_FILE)
    print(f"Synthetic loaded: {len(synthetic)}")

    print("📥 Loading Skywork train transformed...")
    sky = load_jsonl(SKYWORK_FILE)
    print(f"Skywork loaded: {len(sky)}")

    hw0_hl0: List[Dict[str, Any]] = []
    hw1_hl1: List[Dict[str, Any]] = []
    hw0_hl1: List[Dict[str, Any]] = []
    skipped = 0  # rows that match none of the three buckets

    for ex in sky:
        h_w = ex["h_w"]
        h_l = ex["h_l"]

        if h_w == 0 and h_l == 0:
            hw0_hl0.append(ex)
        elif h_w == 1 and h_l == 1:
            hw1_hl1.append(ex)
        elif h_w == 0 and h_l == 1:
            hw0_hl1.append(ex)
        else:
            skipped += 1

    print(f"(0,0): {len(hw0_hl0)}")
    print(f"(1,1): {len(hw1_hl1)}")
    print(f"(0,1): {len(hw0_hl1)}")
    if skipped:
        print(
            f"⚠️ Skipped {skipped} Skywork train rows outside the "
            "(0,0)/(1,1)/(0,1) buckets."
        )

    random.seed(42)
    take = min(sample_target, len(hw0_hl1))
    if take < sample_target:
        print(
            f"⚠️ Only {len(hw0_hl1)} (0,1) rows available, "
            f"requested {sample_target}; using all of them."
        )
    sample01 = random.sample(hw0_hl1, take)
    print(f"Sampled (0,1): {len(sample01)}")

    merged = synthetic + hw0_hl0 + hw1_hl1 + sample01
    print(f"Total merged before shuffle: {len(merged)}")

    random.shuffle(merged)

    print(f"💾 Saving → {OUTPUT_FILE}")
    write_jsonl(OUTPUT_FILE, merged)

    print("✅ TRAIN MERGE DONE.")
+- Saves 400 synthetic eval corruption examples. + +Compatible with ruff, ruff-format, pydocstyle, and mypy. +""" + +from __future__ import annotations + +import asyncio +import json +import random +from pathlib import Path +from typing import Any, Dict, Optional + +from decouple import Config, RepositoryEnv +from openai import AsyncOpenAI +from tqdm.asyncio import tqdm as tqdm_asyncio +from utils import load_yaml + + +# ============================================================ +# Load config.yaml + .env API key +# ============================================================ + +CONFIG = load_yaml("config.yaml") # must contain: repository: +REPO_PATH = CONFIG["repository"] + +env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) +OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) + +if not OPENAI_API_KEY: + raise RuntimeError("❌ OPENAI_API_KEY missing in repository .env") + +client = AsyncOpenAI(api_key=OPENAI_API_KEY) + +# ============================================================ +# Paths +# ============================================================ + +DATA_DIR = Path(__file__).resolve().parent / "data" +INPUT_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl" +OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl" + +# ============================================================ +# Prompts +# ============================================================ + +SYSTEM_PROMPT = """ +You are a factual corruption generator. + +Rewrite a correct answer into a subtly incorrect version with 2–3 factual errors. +It must remain fluent, confident, and plausible. +""" + + +def build_user_prompt(question: str, correct_answer: str) -> str: + """Construct user prompt.""" + return f""" +PROMPT: +{question} + +CORRECT ANSWER: +{correct_answer} + +TASK: +Rewrite this answer so it becomes factually incorrect while still sounding natural. 
async def generate_corruption(
    question: str, answer: str, semaphore: asyncio.Semaphore
) -> Optional[str]:
    """Generate a hallucinated version of the answer.

    Builds the user prompt from ``question`` and ``answer``, sends it together
    with ``SYSTEM_PROMPT`` through the module-level ``client``, and returns the
    stripped reply text. The request is attempted up to five times; an attempt
    counts as failed when the call raises or when the reply carries no text.

    Args:
        question: The original prompt shown to the model.
        answer: The correct answer that should be rewritten.
        semaphore: Limits how many requests run at once; it is held for the
            whole retry loop, including the back-off sleeps.

    Returns:
        The rewritten answer, or ``None`` if every attempt failed.
    """
    max_attempts = 5
    user_prompt = build_user_prompt(question, answer)

    async with semaphore:
        for retry in range(max_attempts):
            try:
                resp = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt},
                    ],
                    temperature=0.8,
                )
                # content can be missing; normalise to "" so it is handled
                # below as an ordinary failed attempt.
                text = (resp.choices[0].message.content or "").strip()
            except Exception as exc:
                # Deliberately broad: any API or transport failure is retried.
                problem: object = exc
            else:
                if text:
                    return text
                # An empty reply must not become a "chosen" response.
                problem = "empty completion content"

            print(f"Retry ({retry}) corruption error:", problem)
            # No point in backing off after the final attempt.
            if retry + 1 < max_attempts:
                await asyncio.sleep(1 + retry * 0.5)

    return None
{len(selected)} items for corruption.") + + semaphore = asyncio.Semaphore(20) + coros = [process_item(item, semaphore) for item in selected] + + print("⚙️ Generating eval corruptions...") + results = await tqdm_asyncio.gather(*coros) + results = [r for r in results if r is not None] + + print(f"💾 Saving {len(results)} examples → {OUTPUT_FILE}") + with OUTPUT_FILE.open("w", encoding="utf-8") as f: + for r in results: + f.write(json.dumps(r, ensure_ascii=False) + "\n") + + print("✅ Eval synthetic corruption generation complete.") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/aixpert/data_construction/data_synthetic_train.py b/src/aixpert/data_construction/data_synthetic_train.py new file mode 100644 index 0000000..b896f9a --- /dev/null +++ b/src/aixpert/data_construction/data_synthetic_train.py @@ -0,0 +1,179 @@ +""" +Generate synthetic corruption (hallucinated) responses for TRAIN split. + +This script: +- Loads clean DPO-ready Skywork transformation for training. +- Selects items where h_w=0 (winner factual) and h_l=1 (loser incorrect). +- Asks GPT-4o-mini to rewrite the factual answer into a subtle hallucination. +- Produces “inversion pairs” where corrupted is chosen and original is rejected. +- Saves up to 10,000 synthetic hallucination samples. + +Fully compatible with ruff, ruff-format, pydocstyle, and mypy. 
+""" + +from __future__ import annotations + +import asyncio +import json +import random +from pathlib import Path +from typing import Any, Dict, List, Optional + +from decouple import Config, RepositoryEnv +from openai import AsyncOpenAI +from tqdm.asyncio import tqdm as tqdm_asyncio +from utils import load_yaml + + +# ============================================================ +# Load config.yaml + .env API key +# ============================================================ + +CONFIG = load_yaml("config.yaml") # must contain: repository: +REPO_PATH = CONFIG["repository"] + +env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) +OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) + +if not OPENAI_API_KEY: + raise RuntimeError("❌ OPENAI_API_KEY missing in .env under repository path.") + +client = AsyncOpenAI(api_key=OPENAI_API_KEY) + +# ============================================================ +# Directories +# ============================================================ + +DATA_DIR = Path(__file__).resolve().parent / "data" +INPUT_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl" +OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl" + +# ============================================================ +# Corruption prompt +# ============================================================ + +SYSTEM_PROMPT = """ +You are a factual corruption generator. + +Given a question and its correct answer, produce an incorrect version +that is fluent, confident, and well-structured but it must contain 2–3 factual inaccuracies. +Avoid nonsense. Keep the tone similar to the original answer. +""" + + +def build_user_prompt(question: str, correct_answer: str) -> str: + """Construct the user prompt for hallucination generation.""" + return f""" +PROMPT: +{question} + +CORRECT ANSWER: +{correct_answer} + +TASK: +Rewrite the answer so that it becomes factually wrong, introducing subtle hallucinations +while sounding confident and coherent. 
async def main() -> None:
    """Generate 10k synthetic corruption pairs and save JSONL output.

    Reads ``INPUT_FILE``, keeps the rows labelled ``h_w == 0`` and
    ``h_l == 1``, picks up to 10,000 of them at random, asks ``process_item``
    for a corrupted counterpart of each (at most 20 requests in flight), and
    writes every successfully generated row to ``OUTPUT_FILE``.

    If fewer than 10,000 eligible rows exist, all of them are used and a
    warning is printed instead of letting ``random.sample`` raise.
    """
    target = 10_000
    print(f"📥 Loading training dataset → {INPUT_FILE}")

    # Use a context manager so the input handle is closed after reading.
    with INPUT_FILE.open("r", encoding="utf-8") as src:
        items: List[Dict[str, Any]] = [json.loads(line) for line in src]

    print("🔍 Selecting factual (0,1) pairs only...")
    clean_pairs = [x for x in items if x["h_w"] == 0 and x["h_l"] == 1]

    print(f"Available factual pairs: {len(clean_pairs)}")
    if len(clean_pairs) < target:
        print(
            f"⚠️ Requested {target} pairs but only {len(clean_pairs)} are "
            "available; using all of them."
        )
    selected = random.sample(clean_pairs, min(target, len(clean_pairs)))
    print(f"🎯 Selected {len(selected)} items for corruption generation.")

    semaphore = asyncio.Semaphore(20)

    tasks = [process_item(item, semaphore) for item in selected]

    print("⚙️ Generating corrupted answers...")
    results = await tqdm_asyncio.gather(*tasks)

    # process_item returns None when generation failed; drop those rows.
    final_rows = [r for r in results if r is not None]

    print(f"💾 Saving {len(final_rows)} synthetic samples → {OUTPUT_FILE}")
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        for row in final_rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print("✅ Synthetic corruption dataset created.")
def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one binary factual-scored eval pair into DPO-ready structure.

    The response selected by ``better_response_id`` becomes ``chosen`` and the
    other one ``rejected``. The flags ``h0``/``h1`` that belong to
    ``response_0``/``response_1`` are carried over as ``h_w`` (flag of the
    chosen response) and ``h_l`` (flag of the rejected response).

    Args:
        item: Row with the keys ``prompt``, ``response_0``, ``response_1``,
            ``better_response_id``, ``h0`` and ``h1``. The id and the two
            flags are cast with ``int``.

    Returns:
        A new dict with ``prompt``, ``chosen``, ``rejected``, ``h_w``, ``h_l``,
        the original ``better_response_id``/``response_0``/``response_1``, and
        ``flipped`` set to ``False``.

    Raises:
        KeyError: If a required key is missing.
        ValueError: If ``better_response_id`` is not 0 or 1 (or cannot be cast
            to ``int``); such a row would otherwise be silently treated as
            "response_1 preferred".
    """
    prompt = item["prompt"]
    r0 = item["response_0"]
    r1 = item["response_1"]
    pref = int(item["better_response_id"])

    if pref not in (0, 1):
        raise ValueError(f"better_response_id must be 0 or 1, got {pref!r}")

    # factual flags
    h0 = int(item["h0"])
    h1 = int(item["h1"])

    if pref == 0:
        chosen, rejected = r0, r1
        h_w, h_l = h0, h1
    else:
        chosen, rejected = r1, r0
        h_w, h_l = h1, h0

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
        "h_w": h_w,
        "h_l": h_l,
        "better_response_id": pref,
        "response_0": r0,
        "response_1": r1,
        "flipped": False,
    }
def transform_dataset() -> None:
    """Load training dataset, apply transformation, and save JSONL output.

    Reads every line of ``INPUT_PATH`` as JSON, converts each row with
    ``process_item``, and writes the converted rows to ``OUTPUT_PATH`` as one
    JSON object per line. Progress and a short summary are printed.
    """
    print(f"📥 Loading training data → {INPUT_PATH}")
    # Use a context manager so the input handle is closed after reading.
    with INPUT_PATH.open("r", encoding="utf-8") as src:
        items = [json.loads(line) for line in src]

    print(f"⚙️ Processing {len(items)} items…")
    transformed: List[Dict[str, Any]] = [process_item(item) for item in tqdm(items)]

    print(f"💾 Saving output → {OUTPUT_PATH}")
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        for obj in transformed:
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    print("\n=======================================")
    print("✔ TRAIN DATASET TRANSFORMATION COMPLETE")
    print("✔ NO SAFE-DPO FLIPS APPLIED")
    print(f"Total items: {len(items)}")
    print("=======================================\n")
def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn prompt/chosen/rejected rows into randomly ordered response pairs.

    For every row a single random draw decides whether the chosen response is
    placed in ``response_0`` (``better_response_id`` 0) or in ``response_1``
    (``better_response_id`` 1). Missing fields default to the empty string.
    """
    pairs: List[Dict[str, Any]] = []

    for row in data:
        preferred = row.get("chosen", "")
        other = row.get("rejected", "")

        # Exactly one draw per row, so a seeded run reproduces the same layout.
        chosen_first = random.random() < 0.5
        ordered = (preferred, other) if chosen_first else (other, preferred)

        pairs.append(
            {
                "prompt": row.get("prompt", ""),
                "response_0": ordered[0],
                "response_1": ordered[1],
                "better_response_id": 0 if chosen_first else 1,
            }
        )

    return pairs
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSONL file into a list of dictionaries.

    Blank and whitespace-only lines are skipped, so a stray empty line in the
    file does not abort the load with a JSON decode error.

    :param path: Location of the UTF-8 encoded JSONL file.
    :return: One parsed object per non-blank line, in file order.
    """
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def main() -> None:
    """Generate preference pairs from the extracted dataset and save them to disk.

    Loads the rows in ``INPUT_FILE``, assigns the chosen/rejected responses to
    ``response_0`` / ``response_1`` via ``create_preference_pairs`` and writes
    the resulting pairs to ``OUT_FILE``.
    """
    print(f"📥 Loading dataset from → {INPUT_FILE}")

    data = load_jsonl(INPUT_FILE)
    print(f"📄 Loaded {len(data)} rows")

    preference_pairs = create_preference_pairs(data)

    write_jsonl(OUT_FILE, preference_pairs)

    print("======================================")
    print(f"✅ DONE! Saved preference pairs → {OUT_FILE}")
    print(f"📦 Total pairs: {len(preference_pairs)}")
    print("======================================")
+ +""" + +from __future__ import annotations + +import asyncio +import json +import os +import re +from pathlib import Path +from typing import Any, Dict, List + +from decouple import Config, RepositoryEnv +from openai import AsyncOpenAI +from tqdm.asyncio import tqdm_asyncio +from utils import load_yaml + + +# ============================================================ +# Load config.yaml + .env API key +# ============================================================ + +CONFIG = load_yaml("config.yaml") # MUST contain: repository: +REPO_PATH = CONFIG["repository"] + +env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) +OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) + +if not OPENAI_API_KEY: + raise RuntimeError("❌ OPENAI_API_KEY not found in .env under repository path.") + +client = AsyncOpenAI(api_key=OPENAI_API_KEY) + + +# ============================================================ +# Configuration & Directories +# ============================================================ + +DATA_DIR = Path(__file__).resolve().parent / "data" +DATA_DIR.mkdir(parents=True, exist_ok=True) + +INPUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl" +OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_eval.jsonl" + +# ============================================================ +# Factual Judge Prompt +# ============================================================ + +BINARY_FACTUAL_JUDGE_PROMPT = """ +System: +You are an extremely strict factuality judge. Your task is to determine whether the assistant’s +response contains ANY of the following: + +• factual errors +• hallucinations +• unsupported claims +• unverifiable statements +• fabricated details +• incorrect technical descriptions +• invented studies, names, or entities +• logical contradictions +• vague or misleading explanations +• partial answers that omit critical facts + +IMPORTANT: +If the response is vague, incomplete, unsupported, or unverifiable, +classify it as [[1]]. 
async def get_factual_flag(
    question: str, answer: str, semaphore: asyncio.Semaphore
) -> int:
    """Evaluate a response for factual correctness (0=factual, 1=hallucinated).

    The question and answer are inserted into ``BINARY_FACTUAL_JUDGE_PROMPT``
    and sent to ``gpt-4o-mini``; the reply is searched for ``[[0]]`` or
    ``[[1]]``. Up to five attempts are made, all while holding ``semaphore``.

    :param question: User prompt shown to the judge.
    :param answer: Assistant response being judged.
    :param semaphore: Bounds how many judge calls run at the same time.
    :return: The parsed verdict; ``1`` when the reply contains no verdict or
        when every attempt raised.
    """
    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
    max_attempts = 5

    async with semaphore:
        for retry in range(max_attempts):
            try:
                response = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                # The SDK types message.content as optional. Treat a missing
                # body as an empty reply rather than raising AttributeError
                # and spending every retry on the same outcome.
                text = (response.choices[0].message.content or "").strip()
                match = re.search(r"\[\[(0|1)\]\]", text)
                if match:
                    return int(match.group(1))

                return 1  # default: hallucinated

            except Exception as exc:
                # Broad on purpose: any API or transport failure is retried.
                print(f"Retry factual-flag ({retry}): {exc}")
                # Backing off after the last attempt would only delay the
                # fallback while still holding the semaphore.
                if retry < max_attempts - 1:
                    await asyncio.sleep(1 + 0.5 * retry)

    return 1
async def process_dataset() -> None:
    """Load test dataset, compute factual flags, resume if needed, and save output.

    Results are appended to ``OUTPUT_FILE`` in the order the judge calls
    finish, which is generally not input order. Resuming therefore matches
    already-written records to input items by content (prompt plus both
    responses) instead of assuming that the first N input items are done.
    """
    from collections import Counter  # local import: only needed for resume

    def _pair_key(record: Dict[str, Any]) -> str:
        """Return a hashable identity for a pair: its prompt and both responses."""
        return json.dumps(
            [
                record.get("prompt"),
                record.get("response_0"),
                record.get("response_1"),
            ],
            ensure_ascii=False,
        )

    def _checkpoint(handle: Any, lines: List[str]) -> None:
        """Write the buffered lines, force them to disk, and empty the buffer."""
        handle.writelines(lines)
        handle.flush()
        os.fsync(handle.fileno())
        lines.clear()

    print(f"📥 Loading test dataset → {INPUT_FILE}")

    with INPUT_FILE.open("r", encoding="utf-8") as f:
        items = [json.loads(line) for line in f if line.strip()]

    # Resume mode: count how often each pair already appears in the output.
    done: Counter[str] = Counter()
    processed_count = 0
    if OUTPUT_FILE.exists():
        print("♻️ Resuming previous run...")
        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    done[_pair_key(json.loads(line))] += 1
                    processed_count += 1
        print(f"Found {processed_count} completed items.")

    # Keep only inputs without a matching output record. The counter handles
    # inputs that legitimately contain the same pair more than once.
    remaining: List[Dict[str, Any]] = []
    for item in items:
        key = _pair_key(item)
        if done[key] > 0:
            done[key] -= 1
        else:
            remaining.append(item)

    semaphore = asyncio.Semaphore(25)

    tasks = [
        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
    ]

    buffer: List[str] = []
    count = processed_count

    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
        try:
            for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
                result = await coro
                buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
                count += 1

                if len(buffer) >= 25:
                    _checkpoint(f, buffer)
                    print(f"Checkpoint saved ({count} items).")
        finally:
            # Flush final buffer, also when a task failed or the run was
            # interrupted, so finished judge calls are not thrown away.
            if buffer:
                _checkpoint(f, buffer)
                print(f"Final checkpoint saved ({count} items).")

    print("✅ Completed test factual evaluation.")
+ +This script loads evaluation preference pairs, evaluates each response using +a strict PKU-style binary factual judge, and saves the resulting dataset with: + + factual_flag_0, factual_flag_1 + h0, h1 (aliases for convenience) + +It supports automatic resume, concurrency throttling, and checkpointing. +Fully compliant with ruff, ruff-format, pydocstyle, and mypy. +""" + +from __future__ import annotations + +import asyncio +import json +import os +import re +from pathlib import Path +from typing import Any, Dict, List + +from decouple import Config, RepositoryEnv +from openai import AsyncOpenAI +from tqdm.asyncio import tqdm_asyncio +from utils import load_yaml # your YAML loader + + +# ============================================================ +# Load OpenAI API key +# ============================================================ + + +YAML_CONFIG = load_yaml("config.yaml") +REPO_PATH = YAML_CONFIG["repository"] + +# Load .env using decouple (the exact method you asked for) +env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) +OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) + +if not OPENAI_API_KEY: + raise ValueError("❌ OPENAI_API_KEY not found in .env file!") + +client = AsyncOpenAI(api_key=OPENAI_API_KEY) + + +# ============================================================ +# Configuration & Directories +# ============================================================ + +DATA_DIR = Path(__file__).resolve().parent / "data" +DATA_DIR.mkdir(parents=True, exist_ok=True) + +INPUT_FILE = DATA_DIR / "skywork_preference_pairs_train.jsonl" +OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_train.jsonl" + +# ============================================================ +# Factual Judge Prompt +# ============================================================ + +BINARY_FACTUAL_JUDGE_PROMPT = """ +System: +You are an extremely strict factuality judge. 
async def get_factual_flag(
    question: str, answer: str, semaphore: asyncio.Semaphore
) -> int:
    """Evaluate a response for factual correctness (0=factual, 1=hallucinated).

    The question and answer are inserted into ``BINARY_FACTUAL_JUDGE_PROMPT``
    and sent to ``gpt-4o-mini``; the reply is searched for ``[[0]]`` or
    ``[[1]]``. Up to five attempts are made, all while holding ``semaphore``.

    :param question: User prompt shown to the judge.
    :param answer: Assistant response being judged.
    :param semaphore: Bounds how many judge calls run at the same time.
    :return: The parsed verdict; ``1`` when the reply contains no verdict or
        when every attempt raised.
    """
    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
    max_attempts = 5

    async with semaphore:
        for retry in range(max_attempts):
            try:
                response = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                # The SDK types message.content as optional. Treat a missing
                # body as an empty reply rather than raising AttributeError
                # and spending every retry on the same outcome.
                text = (response.choices[0].message.content or "").strip()
                match = re.search(r"\[\[(0|1)\]\]", text)
                if match:
                    return int(match.group(1))

                return 1  # default: hallucinated

            except Exception as exc:
                # Broad on purpose: any API or transport failure is retried.
                print(f"Retry factual-flag ({retry}): {exc}")
                # Backing off after the last attempt would only delay the
                # fallback while still holding the semaphore.
                if retry < max_attempts - 1:
                    await asyncio.sleep(1 + 0.5 * retry)

    return 1
async def process_dataset() -> None:
    """Load training dataset, compute factual flags, resume if needed, and save output.

    Results are appended to ``OUTPUT_FILE`` in the order the judge calls
    finish, which is generally not input order. Resuming therefore matches
    already-written records to input items by content (prompt plus both
    responses) instead of assuming that the first N input items are done.
    """
    from collections import Counter  # local import: only needed for resume

    def _pair_key(record: Dict[str, Any]) -> str:
        """Return a hashable identity for a pair: its prompt and both responses."""
        return json.dumps(
            [
                record.get("prompt"),
                record.get("response_0"),
                record.get("response_1"),
            ],
            ensure_ascii=False,
        )

    def _checkpoint(handle: Any, lines: List[str]) -> None:
        """Write the buffered lines, force them to disk, and empty the buffer."""
        handle.writelines(lines)
        handle.flush()
        os.fsync(handle.fileno())
        lines.clear()

    # INPUT_FILE is the train preference-pair file; the old message said "eval".
    print(f"📥 Loading training dataset → {INPUT_FILE}")

    with INPUT_FILE.open("r", encoding="utf-8") as f:
        items = [json.loads(line) for line in f if line.strip()]

    # Resume mode: count how often each pair already appears in the output.
    done: Counter[str] = Counter()
    processed_count = 0
    if OUTPUT_FILE.exists():
        print("♻️ Resuming previous run...")
        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    done[_pair_key(json.loads(line))] += 1
                    processed_count += 1
        print(f"Found {processed_count} completed items.")

    # Keep only inputs without a matching output record. The counter handles
    # inputs that legitimately contain the same pair more than once.
    remaining: List[Dict[str, Any]] = []
    for item in items:
        key = _pair_key(item)
        if done[key] > 0:
            done[key] -= 1
        else:
            remaining.append(item)

    semaphore = asyncio.Semaphore(25)

    tasks = [
        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
    ]

    buffer: List[str] = []
    count = processed_count

    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
        try:
            for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
                result = await coro
                buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
                count += 1

                if len(buffer) >= 25:
                    _checkpoint(f, buffer)
                    print(f"Checkpoint saved ({count} items).")
        finally:
            # Flush remaining, also when a task failed or the run was
            # interrupted, so finished judge calls are not thrown away.
            if buffer:
                _checkpoint(f, buffer)
                print(f"Final checkpoint saved ({count} items).")

    print("✅ Completed factual evaluation.")
def load_yaml(yaml_path: str) -> Dict[str, Any]:
    """Read a YAML file and hand back its contents as a dictionary.

    Any failure while opening or parsing the file is reported on stdout and
    turned into an empty dict instead of an exception.

    :param yaml_path: Location of the YAML file to read.
    :return: The parsed content, or ``{}`` when the document is empty or
        could not be loaded.
    """
    try:
        with open(yaml_path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
            # An empty document parses to a falsy value; normalise it to {}.
            return parsed if parsed else {}
    except Exception as err:
        print(f"YAML load error: {err}")
        return {}
+ """ + env_path = Path(repository_path) / ".env" + + if not env_path.exists(): + print(f"Warning: .env file not found at {env_path}") + return "" + + env = Config(RepositoryEnv(str(env_path))) + return env("OPENAI_API_KEY", default="") + + +__all__ = ["load_yaml", "load_env_api_key"] From da5359c582f3a0d20660a8c2d96a4790c980980b Mon Sep 17 00:00:00 2001 From: sindchad Date: Tue, 2 Dec 2025 13:40:09 -0500 Subject: [PATCH 02/14] data extraction --- .../data_construction/dataextraction.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 src/aixpert/data_construction/dataextraction.py diff --git a/src/aixpert/data_construction/dataextraction.py b/src/aixpert/data_construction/dataextraction.py new file mode 100644 index 0000000..15ea172 --- /dev/null +++ b/src/aixpert/data_construction/dataextraction.py @@ -0,0 +1,136 @@ +""" +Skywork extraction utilities. + +This module extracts prompt/chosen/rejected fields from the Skywork Preference +dataset, removes exact duplicates, and writes the cleaned dataset to JSONL +files. Fully compatible with ruff, mypy, and the AI Engineering template. 
def extract_prompt_from_dialog(dialog: List[Dict[str, Any]]) -> str:
    """
    Return the text of the earliest user turn in a dialog.

    Parameters
    ----------
    dialog : list of dict
        Message objects carrying "role" and "content" keys.

    Returns
    -------
    str
        The stripped, stringified content of the first message whose role is
        'user'; an empty string when there is no such message.
    """
    first_user = next((turn for turn in dialog if turn.get("role") == "user"), None)
    if first_user is None:
        return ""
    return str(first_user.get("content", "")).strip()
def write_jsonl(path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
    """Serialise each dictionary in *rows* as one JSON line in the file at *path*.

    The file is created or truncated and written as UTF-8; non-ASCII
    characters are written verbatim rather than escaped.
    """
    target = str(path)
    with open(target, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
def extract_answer(dialog: List[Dict[str, Any]]) -> str:
    """Return the stripped text of the first assistant turn, or "" if none exists."""
    # Lazy generator: only the first assistant message is ever converted.
    replies = (
        str(turn.get("content", "")).strip()
        for turn in dialog
        if turn.get("role") == "assistant"
    )
    return next(replies, "")
split=f"train[{START}:{END + 1}]", +) + +df = ds.to_pandas() + +df["prompt"] = df["chosen"].apply(extract_prompt) +df["chosen"] = df["chosen"].apply(extract_answer) +df["rejected"] = df["rejected"].apply(extract_answer) + +clean_df = df[["prompt", "chosen", "rejected"]] + +cleaned: List[Dict[str, str]] = [] +removed: List[Dict[str, str]] = [] + +for _, row in clean_df.iterrows(): + chosen = row["chosen"].strip() + rejected = row["rejected"].strip() + + record = { + "prompt": row["prompt"].strip(), + "chosen": chosen, + "rejected": rejected, + } + + if chosen == rejected: + removed.append(record) + else: + cleaned.append(record) + +print(f"🧹 Removed duplicates: {len(removed)}") +print(f"📦 Clean samples: {len(cleaned)}") + +# Save outputs +data_dir = Path(__file__).resolve().parent / "data" +save_jsonl(data_dir / "skywork_extracted_eval.jsonl", cleaned) +save_jsonl(data_dir / "skywork_eval_removed.jsonl", removed) + +print("✅ Saved eval dataset → skywork_eval.jsonl") From 167add7beb8a49051e84fcb68f42caf5eedf0c5f Mon Sep 17 00:00:00 2001 From: sindchad Date: Tue, 2 Dec 2025 13:52:06 -0500 Subject: [PATCH 04/14] data extraction test --- .../data_construction/dataextraction_eval2.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/aixpert/data_construction/dataextraction_eval2.py diff --git a/src/aixpert/data_construction/dataextraction_eval2.py b/src/aixpert/data_construction/dataextraction_eval2.py new file mode 100644 index 0000000..83fb701 --- /dev/null +++ b/src/aixpert/data_construction/dataextraction_eval2.py @@ -0,0 +1,94 @@ +""" +Extract the test slice of the Skywork preference dataset. + +This script extracts rows 81001–81500, removes exact duplicates, +and saves the cleaned dataset into JSONL files under the local data folder. 
def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Create the parent folder if needed and dump *rows* as UTF-8 JSON lines."""
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)

    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
duplicates: {len(removed)}") +print(f"📦 Clean samples: {len(cleaned)}") + +# Save outputs +data_dir = Path(__file__).resolve().parent / "data" +save_jsonl(data_dir / "skywork_extracted_test.jsonl", cleaned) +save_jsonl(data_dir / "skywork_test_removed.jsonl", removed) + +print("✅ Saved test dataset → skywork_test.jsonl") From d80983345fb6a470f2c73e6e307960ae5e90fa7d Mon Sep 17 00:00:00 2001 From: sindchad Date: Tue, 2 Dec 2025 14:25:14 -0500 Subject: [PATCH 05/14] file name change --- .../{dataconversion.eval.py => dataconversion_eval.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/aixpert/data_construction/{dataconversion.eval.py => dataconversion_eval.py} (100%) diff --git a/src/aixpert/data_construction/dataconversion.eval.py b/src/aixpert/data_construction/dataconversion_eval.py similarity index 100% rename from src/aixpert/data_construction/dataconversion.eval.py rename to src/aixpert/data_construction/dataconversion_eval.py From 8cb4603c8f7094381149a64c3063c15dc181b821 Mon Sep 17 00:00:00 2001 From: sindchad Date: Tue, 2 Dec 2025 15:08:08 -0500 Subject: [PATCH 06/14] data flipped --- .../data_construction/data_flipped_eval.py | 60 ++++++++++++++++++ .../data_construction/data_flipped_train.py | 61 +++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 src/aixpert/data_construction/data_flipped_eval.py create mode 100644 src/aixpert/data_construction/data_flipped_train.py diff --git a/src/aixpert/data_construction/data_flipped_eval.py b/src/aixpert/data_construction/data_flipped_eval.py new file mode 100644 index 0000000..20ab90d --- /dev/null +++ b/src/aixpert/data_construction/data_flipped_eval.py @@ -0,0 +1,60 @@ +""" +Flip preference labels for evaluation data. 
def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
    """Swap winner and loser when the sample is labelled (h_w, h_l) = (1, 0).

    The dictionary is modified in place and the same object is returned.
    Samples with any other label combination are returned untouched.
    """
    needs_flip = item.get("h_w") == 1 and item.get("h_l") == 0
    if not needs_flip:
        return item

    item["h_w"] = 0
    item["h_l"] = 1

    # Read both responses before overwriting either one.
    new_chosen, new_rejected = item["rejected"], item["chosen"]
    item["chosen"] = new_chosen
    item["rejected"] = new_rejected
    return item
+ +This script: +- Converts h_w=1,h_l=0 → h_w=0,h_l=1 +- Swaps chosen/rejected +- Writes a flipped version of the dataset +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List + + +# Local data directory (same folder as the script → data/) +DATA_DIR = Path(__file__).resolve().parent / "data" + +INPUT_FILE = DATA_DIR / "train_finallast.jsonl" +OUTPUT_FILE = DATA_DIR / "train_finallast_flipped.jsonl" + + +def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]: + """Flip the sample if (h_w, h_l) = (1, 0).""" + if item.get("h_w") == 1 and item.get("h_l") == 0: + item["h_w"], item["h_l"] = 0, 1 + item["chosen"], item["rejected"] = item["rejected"], item["chosen"] + return item + + +def main() -> None: + """Execute flipping process for training dataset.""" + print("📥 Loading input file:", INPUT_FILE) + + output: List[Dict[str, Any]] = [] + + with INPUT_FILE.open("r", encoding="utf-8") as f: + for raw_line in f: + line = raw_line.strip() + if not line: + continue + item = json.loads(line) + output.append(flip_sample(item)) + + print(f"✅ Processed {len(output)} samples") + + OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) + print("💾 Saving flipped dataset to:", OUTPUT_FILE) + + with OUTPUT_FILE.open("w", encoding="utf-8") as f: + for item in output: + f.write(json.dumps(item, ensure_ascii=False) + "\n") + + print("\n============================================") + print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}") + print("============================================\n") + + +if __name__ == "__main__": + main() From 102f8b2ec73810b54a3343efe71e94a0d621ad1a Mon Sep 17 00:00:00 2001 From: sindchad Date: Thu, 4 Dec 2025 18:45:44 -0500 Subject: [PATCH 07/14] folder updates --- src/aixpert/data_construction/Readme.md | 0 src/aixpert/data_construction/config.yaml | 4 - .../data_construction/config/config.yaml | 61 +++++ .../data_construction/data_final_eval.py | 126 ---------- 
.../data_construction/data_final_train.py | 109 --------- .../data_construction/data_flipped_eval.py | 60 ----- .../data_construction/data_flipped_train.py | 61 ----- .../data_construction/data_merge_eval.py | 100 -------- .../data_construction/data_merge_train.py | 104 --------- .../data_construction/data_synthetic_eval.py | 167 -------------- .../data_construction/data_synthetic_train.py | 179 --------------- .../data_construction/data_transform_eval.py | 85 ------- .../data_construction/data_transform_train.py | 86 ------- .../data_construction/dataconversion.py | 94 -------- .../data_construction/dataconversion_eval.py | 96 -------- .../data_construction/dataextraction.py | 136 ----------- .../data_construction/dataextraction_eval.py | 94 -------- .../data_construction/dataextraction_eval2.py | 94 -------- src/aixpert/data_construction/dataset_eval.py | 215 ----------------- .../data_construction/dataset_train.py | 216 ------------------ .../stage_1_extraction/dataextraction_eval.py | 55 +++++ .../dataextraction_train.py | 57 +++++ .../stage_1_extraction/dataextraction_val.py | 54 +++++ .../dataconversion_train.py | 47 ++++ .../stage_2_conversion/dataconversion_val.py | 45 ++++ .../stage_3_factuality/dataset_train.py | 50 ++++ .../stage_3_factuality/dataset_val.py | 51 +++++ .../data_transform_train.py | 22 ++ .../data_transform_val.py | 22 ++ .../data_synthetic_train.py | 88 +++++++ .../data_synthetic_val.py | 90 ++++++++ .../stage_6_merging/data_merge_train.py | 68 ++++++ .../stage_6_merging/data_merge_val.py | 61 +++++ .../stage_7_final/data_final_train.py | 88 +++++++ .../stage_7_final/data_final_val.py | 90 ++++++++ .../stage_8_flipping/data_flipped_train.py | 41 ++++ .../stage_8_flipping/data_flipped_val.py | 42 ++++ src/aixpert/data_construction/utils.py | 55 ----- .../data_construction/utils/config_loader.py | 17 ++ .../data_construction/utils/data_utils.py | 124 ++++++++++ .../utils/dpo_transform_utils.py | 56 +++++ 
.../data_construction/utils/factual_utils.py | 120 ++++++++++ .../utils/prompt_templates.py | 57 +++++ .../utils/synthetic_utils.py | 59 +++++ 44 files changed, 1465 insertions(+), 2081 deletions(-) create mode 100644 src/aixpert/data_construction/Readme.md delete mode 100644 src/aixpert/data_construction/config.yaml create mode 100644 src/aixpert/data_construction/config/config.yaml delete mode 100644 src/aixpert/data_construction/data_final_eval.py delete mode 100644 src/aixpert/data_construction/data_final_train.py delete mode 100644 src/aixpert/data_construction/data_flipped_eval.py delete mode 100644 src/aixpert/data_construction/data_flipped_train.py delete mode 100644 src/aixpert/data_construction/data_merge_eval.py delete mode 100644 src/aixpert/data_construction/data_merge_train.py delete mode 100644 src/aixpert/data_construction/data_synthetic_eval.py delete mode 100644 src/aixpert/data_construction/data_synthetic_train.py delete mode 100644 src/aixpert/data_construction/data_transform_eval.py delete mode 100644 src/aixpert/data_construction/data_transform_train.py delete mode 100644 src/aixpert/data_construction/dataconversion.py delete mode 100644 src/aixpert/data_construction/dataconversion_eval.py delete mode 100644 src/aixpert/data_construction/dataextraction.py delete mode 100644 src/aixpert/data_construction/dataextraction_eval.py delete mode 100644 src/aixpert/data_construction/dataextraction_eval2.py delete mode 100644 src/aixpert/data_construction/dataset_eval.py delete mode 100644 src/aixpert/data_construction/dataset_train.py create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_eval.py create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_val.py create mode 100644 src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py create mode 100644 
src/aixpert/data_construction/stage_2_conversion/dataconversion_val.py create mode 100644 src/aixpert/data_construction/stage_3_factuality/dataset_train.py create mode 100644 src/aixpert/data_construction/stage_3_factuality/dataset_val.py create mode 100644 src/aixpert/data_construction/stage_4_transformation/data_transform_train.py create mode 100644 src/aixpert/data_construction/stage_4_transformation/data_transform_val.py create mode 100644 src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py create mode 100644 src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_val.py create mode 100644 src/aixpert/data_construction/stage_6_merging/data_merge_train.py create mode 100644 src/aixpert/data_construction/stage_6_merging/data_merge_val.py create mode 100644 src/aixpert/data_construction/stage_7_final/data_final_train.py create mode 100644 src/aixpert/data_construction/stage_7_final/data_final_val.py create mode 100644 src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py create mode 100644 src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py delete mode 100644 src/aixpert/data_construction/utils.py create mode 100644 src/aixpert/data_construction/utils/config_loader.py create mode 100644 src/aixpert/data_construction/utils/data_utils.py create mode 100644 src/aixpert/data_construction/utils/dpo_transform_utils.py create mode 100644 src/aixpert/data_construction/utils/factual_utils.py create mode 100644 src/aixpert/data_construction/utils/prompt_templates.py create mode 100644 src/aixpert/data_construction/utils/synthetic_utils.py diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md new file mode 100644 index 0000000..e69de29 diff --git a/src/aixpert/data_construction/config.yaml b/src/aixpert/data_construction/config.yaml deleted file mode 100644 index 91e1874..0000000 --- a/src/aixpert/data_construction/config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -repository: 
/projects/aixpert/users/sindhu/Loss_Test - -model: - name: gpt-4o-mini # or gpt-4o diff --git a/src/aixpert/data_construction/config/config.yaml b/src/aixpert/data_construction/config/config.yaml new file mode 100644 index 0000000..261c6b4 --- /dev/null +++ b/src/aixpert/data_construction/config/config.yaml @@ -0,0 +1,61 @@ +repository: /projects/aixpert/users/sindhu/Loss_Test + +model: + name: gpt-4o-mini # or gpt-4o + temperature: 0.8 + +paths: + skywork_train_cleaned: "src/aixpert/data_construction/data/skywork_extracted_77k.jsonl" + skywork_train_removed: "src/aixpert/data_construction/data/skywork_removed_77k.jsonl" + + skywork_eval_cleaned: "src/aixpert/data_construction/data/skywork_extracted_eval.jsonl" + skywork_eval_removed: "src/aixpert/data_construction/data/skywork_eval_removed.jsonl" + + skywork_test_cleaned: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl" + skywork_test_removed: "src/aixpert/data_construction/data/skywork_test_removed.jsonl" + + skywork_train_pairs: "src/aixpert/data_construction/data/skywork_preference_pairs_77k.jsonl" + skywork_eval_pairs: "src/aixpert/data_construction/data/skywork_preference_pairs_eval.jsonl" + + skywork_train_factual: "src/aixpert/data_construction/data/skywork_binary_factual_train.jsonl" + skywork_eval_factual: "src/aixpert/data_construction/data/skywork_binary_factual_eval.jsonl" + + skywork_train_transformed: "src/aixpert/data_construction/data/skywork_first_transformed_train.jsonl" + skywork_eval_transformed: "src/aixpert/data_construction/data/skywork_first_transformed_eval.jsonl" + + synthetic_train_out: "src/aixpert/data_construction/data/synthetic_llm_inversion_train_10k.jsonl" + synthetic_eval_out: "src/aixpert/data_construction/data/synthetic_llm_inversion_eval_400.jsonl" + + + final_train_merged: "src/aixpert/data_construction/data/skywork_final_train.jsonl" + final_eval_merged: "src/aixpert/data_construction/data/skywork_final_eval.jsonl" + + final_train_out: 
"src/aixpert/data_construction/data/train_balanced.jsonl" + final_eval_out: "src/aixpert/data_construction/data/eval_final.jsonl" + + train_flipped_out: "src/aixpert/data_construction/data/train_balanced_flipped.jsonl" + eval_flipped_out: "src/aixpert/data_construction/data/eval_final_flipped.jsonl" + + + + skywork_file: "Skywork/Skywork-Reward-Preference-80K-v0.1" + +hyperparams: + subset_size: 80000 + eval_start: 80001 + eval_end: 81000 + test_start: 81001 + test_end: 81500 + concurrency_limit: 25 + max_retries: 5 + corruption_concurrency: 20 + synthetic_train_samples: 10000 + synthetic_eval_samples: 400 + + balance_targets: + "(0,1)": 10000 + "(1,0)": 10000 + "(0,0)": 15000 + "(1,1)": 10000 + + eval_additional_clean_samples: 1500 diff --git a/src/aixpert/data_construction/data_final_eval.py b/src/aixpert/data_construction/data_final_eval.py deleted file mode 100644 index addda7f..0000000 --- a/src/aixpert/data_construction/data_final_eval.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Build the FINAL evaluation dataset (skywork_final_eval.jsonl). 
- -Composition: - • 400 synthetic inversion samples (1,0) - • all Skywork eval samples from skywork_first_transformed_eval.jsonl - • +1500 samples of (1,1) from skywork_final_train.jsonl - • +1500 samples of (0,0) from skywork_final_train.jsonl - → excluding any sample already used in train_finallast.jsonl - -Final eval ≈ (#sky_eval + 400 synthetic + 3000 added clean samples) -""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List - - -# ============================================================ -# PATHS -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" - -SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl" -SKY_EVAL_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl" - -TRAIN_SOURCE_FILE = DATA_DIR / "skywork_final_train.jsonl" -TRAIN_USED_FILE = DATA_DIR / "train_finallast.jsonl" - -OUTPUT_FILE = DATA_DIR / "eval_final.jsonl" - - -# ============================================================ -# HELPERS -# ============================================================ - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load a JSONL file into a list of dictionaries.""" - with path.open("r", encoding="utf-8") as f: - return [json.loads(line) for line in f] - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write a list of dictionaries to a JSONL file.""" - with path.open("w", encoding="utf-8") as f: - for r in rows: - f.write(json.dumps(r, ensure_ascii=False) + "\n") - - -# ============================================================ -# MAIN -# ============================================================ - - -def main() -> None: - """Create the final evaluation dataset by merging all required sources.""" - print("📥 Loading synthetic eval inversions...") - synthetic = load_jsonl(SYNTHETIC_FILE) - print("Synthetic:", len(synthetic)) - - print("📥 Loading Skywork eval 
transformed...") - sky_eval = load_jsonl(SKY_EVAL_FILE) - print("SkyEval:", len(sky_eval)) - - print("📥 Loading Skywork full training source...") - sky_train = load_jsonl(TRAIN_SOURCE_FILE) - - print("📥 Loading TRAIN used (to exclude)...") - train_used = load_jsonl(TRAIN_USED_FILE) - - # Convert used samples to hashable form - exclude_set = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used} - - # ----------------------------------------------------------- - # 1. Extract (1,1) and (0,0) pools from training source - # ----------------------------------------------------------- - hw1_hl1_pool: List[Dict[str, Any]] = [] - hw0_hl0_pool: List[Dict[str, Any]] = [] - - for ex in sky_train: - key = (ex["prompt"], ex["chosen"], ex["rejected"]) - if key in exclude_set: - continue - - if ex["h_w"] == 1 and ex["h_l"] == 1: - hw1_hl1_pool.append(ex) - elif ex["h_w"] == 0 and ex["h_l"] == 0: - hw0_hl0_pool.append(ex) - - print(f"(1,1) available for eval add: {len(hw1_hl1_pool)}") - print(f"(0,0) available for eval add: {len(hw0_hl0_pool)}") - - # ----------------------------------------------------------- - # 2. Sample EXACT 1500 from each bucket - # ----------------------------------------------------------- - eval_hw1_hl1 = random.sample(hw1_hl1_pool, 1500) - eval_hw0_hl0 = random.sample(hw0_hl0_pool, 1500) - - # ----------------------------------------------------------- - # 3. 
Merge everything - # ----------------------------------------------------------- - merged: List[Dict[str, Any]] = [] - merged.extend(synthetic) # (1,0) → 400 - merged.extend(sky_eval) # (0,1) → ~1000 - merged.extend(eval_hw1_hl1) # (1,1) → 1500 - merged.extend(eval_hw0_hl0) # (0,0) → 1500 - - print(f"Total before shuffle: {len(merged)}") - - random.shuffle(merged) - - print(f"💾 Saving → {OUTPUT_FILE}") - write_jsonl(OUTPUT_FILE, merged) - - print("✅ FINAL EVAL DATASET READY.") - print("Total eval:", len(merged)) - - -if __name__ == "__main__": - random.seed(42) - main() diff --git a/src/aixpert/data_construction/data_final_train.py b/src/aixpert/data_construction/data_final_train.py deleted file mode 100644 index 742208b..0000000 --- a/src/aixpert/data_construction/data_final_train.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Balanced sampling for TRAIN dataset. - -This script: -- Loads the merged training dataset. -- Buckets by (h_w, h_l). -- Samples required amounts per bucket (with replacement if needed). -- Shuffles and saves the final balanced training dataset. 
- -Buckets required: - (0,1) → 10,000 - (1,0) → 10,000 - (0,0) → 15,000 - (1,1) → 10,000 -""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List, Tuple - - -# ============================================================ -# Paths (relative to this file's /data directory) -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" - -INPUT_FILE = DATA_DIR / "skywork_final_train.jsonl" -OUTPUT_FILE = DATA_DIR / "train_finallast.jsonl" - -TARGET_COUNTS: Dict[Tuple[int, int], int] = { - (0, 1): 10_000, - (1, 0): 10_000, - (0, 0): 15_000, - (1, 1): 10_000, -} - - -# ============================================================ -# Helpers -# ============================================================ - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load a JSONL file and return its rows as a list of dictionaries.""" - with path.open("r", encoding="utf-8") as f: - return [json.loads(line) for line in f] - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write a list of dictionaries to a JSONL file.""" - with path.open("w", encoding="utf-8") as f: - for ex in rows: - f.write(json.dumps(ex, ensure_ascii=False) + "\n") - - -# ============================================================ -# Main -# ============================================================ - - -def main() -> None: - """Generate the balanced training dataset according to bucket size targets.""" - print(f"📥 Loading dataset → {INPUT_FILE}") - data = load_jsonl(INPUT_FILE) - - # bucket structure - buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = { - (0, 1): [], - (1, 0): [], - (0, 0): [], - (1, 1): [], - } - - print("🔍 Bucketing samples...") - for ex in data: - key = (int(ex["h_w"]), int(ex["h_l"])) - if key in buckets: - buckets[key].append(ex) - - final_samples: List[Dict[str, Any]] = [] - - for key, req_count in 
TARGET_COUNTS.items(): - available = len(buckets[key]) - print(f"Bucket {key}: available={available}, required={req_count}") - - if available < req_count: - print("⚠️ Not enough samples — sampling WITH replacement.") - sampled = random.choices(buckets[key], k=req_count) - else: - sampled = random.sample(buckets[key], req_count) - - final_samples.extend(sampled) - - print(f"\n🔀 Shuffling {len(final_samples)} samples...") - random.shuffle(final_samples) - - print(f"💾 Saving → {OUTPUT_FILE}") - write_jsonl(OUTPUT_FILE, final_samples) - - print("✅ TRAIN balanced dataset created.") - print("Final count:", len(final_samples)) - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/data_flipped_eval.py b/src/aixpert/data_construction/data_flipped_eval.py deleted file mode 100644 index 20ab90d..0000000 --- a/src/aixpert/data_construction/data_flipped_eval.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Flip preference labels for evaluation data. - -This script: -- Converts h_w=1,h_l=0 → h_w=0,h_l=1 -- Swaps chosen/rejected -- Writes a flipped version of the dataset -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - - -DATA_DIR = Path(__file__).resolve().parent / "data" - -INPUT_FILE = DATA_DIR / "eval_final.jsonl" -OUTPUT_FILE = DATA_DIR / "eval_final_flipped.jsonl" - - -def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]: - """Flip the sample if (h_w, h_l) = (1, 0).""" - if item.get("h_w") == 1 and item.get("h_l") == 0: - item["h_w"], item["h_l"] = 0, 1 - item["chosen"], item["rejected"] = item["rejected"], item["chosen"] - return item - - -def main() -> None: - """Execute flipping process for evaluation dataset.""" - print("📥 Loading input file:", INPUT_FILE) - - output: List[Dict[str, Any]] = [] - - with INPUT_FILE.open("r", encoding="utf-8") as f: - for raw_line in f: - line = raw_line.strip() - if not line: - continue - item = json.loads(line) - 
output.append(flip_sample(item)) - - print(f"✅ Processed {len(output)} samples") - - OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) - print("💾 Saving flipped dataset to:", OUTPUT_FILE) - - with OUTPUT_FILE.open("w", encoding="utf-8") as f: - for item in output: - f.write(json.dumps(item, ensure_ascii=False) + "\n") - - print("\n============================================") - print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}") - print("============================================\n") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/data_flipped_train.py b/src/aixpert/data_construction/data_flipped_train.py deleted file mode 100644 index f12c4d3..0000000 --- a/src/aixpert/data_construction/data_flipped_train.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Flip preference labels for training data. - -This script: -- Converts h_w=1,h_l=0 → h_w=0,h_l=1 -- Swaps chosen/rejected -- Writes a flipped version of the dataset -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - - -# Local data directory (same folder as the script → data/) -DATA_DIR = Path(__file__).resolve().parent / "data" - -INPUT_FILE = DATA_DIR / "train_finallast.jsonl" -OUTPUT_FILE = DATA_DIR / "train_finallast_flipped.jsonl" - - -def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]: - """Flip the sample if (h_w, h_l) = (1, 0).""" - if item.get("h_w") == 1 and item.get("h_l") == 0: - item["h_w"], item["h_l"] = 0, 1 - item["chosen"], item["rejected"] = item["rejected"], item["chosen"] - return item - - -def main() -> None: - """Execute flipping process for training dataset.""" - print("📥 Loading input file:", INPUT_FILE) - - output: List[Dict[str, Any]] = [] - - with INPUT_FILE.open("r", encoding="utf-8") as f: - for raw_line in f: - line = raw_line.strip() - if not line: - continue - item = json.loads(line) - output.append(flip_sample(item)) - - print(f"✅ Processed {len(output)} samples") - - 
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) - print("💾 Saving flipped dataset to:", OUTPUT_FILE) - - with OUTPUT_FILE.open("w", encoding="utf-8") as f: - for item in output: - f.write(json.dumps(item, ensure_ascii=False) + "\n") - - print("\n============================================") - print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}") - print("============================================\n") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/data_merge_eval.py b/src/aixpert/data_construction/data_merge_eval.py deleted file mode 100644 index 6cc9100..0000000 --- a/src/aixpert/data_construction/data_merge_eval.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Merge Skywork evaluation data with 400 synthetic inversion pairs. - -This script: -- Loads synthetic corruption samples for eval. -- Loads Skywork eval transformed dataset. -- Splits samples into buckets by (h_w, h_l). -- Keeps ALL real eval samples. -- Merges synthetic + all real eval buckets. -- Shuffles and writes final eval JSONL file. - -Fully compatible with ruff, mypy, and pydocstyle. 
-""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List - - -# ============================================================ -# Paths -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl" -SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl" -OUTPUT_FILE = DATA_DIR / "skywork_final_eval.jsonl" - - -# ============================================================ -# Helpers -# ============================================================ - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load JSONL file into list of dicts.""" - rows: List[Dict[str, Any]] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - rows.append(json.loads(line)) - return rows - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write a list of dicts to a JSONL file.""" - with path.open("w", encoding="utf-8") as f: - for r in rows: - f.write(json.dumps(r, ensure_ascii=False) + "\n") - - -# ============================================================ -# Main -# ============================================================ - - -def main() -> None: - """Merge Skywork eval data with 400 synthetic inversions.""" - print("📥 Loading synthetic eval inversions...") - synthetic = load_jsonl(SYNTHETIC_FILE) - print(f"Synthetic eval: {len(synthetic)}") - - print("📥 Loading Skywork eval transformed...") - sky = load_jsonl(SKYWORK_FILE) - print(f"Skywork eval: {len(sky)}") - - hw0_hl0: List[Dict[str, Any]] = [] - hw1_hl1: List[Dict[str, Any]] = [] - hw0_hl1: List[Dict[str, Any]] = [] - - for ex in sky: - h_w = ex["h_w"] - h_l = ex["h_l"] - - if h_w == 0 and h_l == 0: - hw0_hl0.append(ex) - elif h_w == 1 and h_l == 1: - hw1_hl1.append(ex) - elif h_w == 0 and h_l == 1: - hw0_hl1.append(ex) - - print(f"(0,0): {len(hw0_hl0)}") - print(f"(1,1): 
{len(hw1_hl1)}") - print(f"(0,1): {len(hw0_hl1)}") - - merged = synthetic + hw0_hl0 + hw1_hl1 + hw0_hl1 - print(f"Total merged before shuffle: {len(merged)}") - - random.shuffle(merged) - - print(f"💾 Saving → {OUTPUT_FILE}") - write_jsonl(OUTPUT_FILE, merged) - - print("✅ EVAL MERGE DONE.") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/data_merge_train.py b/src/aixpert/data_construction/data_merge_train.py deleted file mode 100644 index d97b1de..0000000 --- a/src/aixpert/data_construction/data_merge_train.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Merge Skywork training data with 10k synthetic inversion pairs. - -This script: -- Loads synthetic corruption samples. -- Loads transformed Skywork training data. -- Splits real samples into buckets by (h_w, h_l). -- Samples 10k from (0,1). -- Merges: synthetic + (0,0) + (1,1) + sampled (0,1). -- Shuffles and writes final JSONL file. - -Fully compatible with ruff, mypy, and pydocstyle. -""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List - - -# ============================================================ -# Paths -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl" -SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl" -OUTPUT_FILE = DATA_DIR / "skywork_final_train.jsonl" - - -# ============================================================ -# Helpers -# ============================================================ - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load JSONL file into list of dicts.""" - rows: List[Dict[str, Any]] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - rows.append(json.loads(line)) - return rows - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write list of dicts to JSONL 
file.""" - with path.open("w", encoding="utf-8") as f: - for r in rows: - f.write(json.dumps(r, ensure_ascii=False) + "\n") - - -# ============================================================ -# Main -# ============================================================ - - -def main() -> None: - """Merge Skywork train data with 10k synthetic hallucinations.""" - print("📥 Loading synthetic inversions...") - synthetic = load_jsonl(SYNTHETIC_FILE) - print(f"Synthetic loaded: {len(synthetic)}") - - print("📥 Loading Skywork train transformed...") - sky = load_jsonl(SKYWORK_FILE) - print(f"Skywork loaded: {len(sky)}") - - hw0_hl0: List[Dict[str, Any]] = [] - hw1_hl1: List[Dict[str, Any]] = [] - hw0_hl1: List[Dict[str, Any]] = [] - - for ex in sky: - h_w = ex["h_w"] - h_l = ex["h_l"] - - if h_w == 0 and h_l == 0: - hw0_hl0.append(ex) - elif h_w == 1 and h_l == 1: - hw1_hl1.append(ex) - elif h_w == 0 and h_l == 1: - hw0_hl1.append(ex) - - print(f"(0,0): {len(hw0_hl0)}") - print(f"(1,1): {len(hw1_hl1)}") - print(f"(0,1): {len(hw0_hl1)}") - - random.seed(42) - sample01 = random.sample(hw0_hl1, 10000) - print(f"Sampled (0,1): {len(sample01)}") - - merged = synthetic + hw0_hl0 + hw1_hl1 + sample01 - print(f"Total merged before shuffle: {len(merged)}") - - random.shuffle(merged) - - print(f"💾 Saving → {OUTPUT_FILE}") - write_jsonl(OUTPUT_FILE, merged) - - print("✅ TRAIN MERGE DONE.") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/data_synthetic_eval.py b/src/aixpert/data_construction/data_synthetic_eval.py deleted file mode 100644 index 78ec432..0000000 --- a/src/aixpert/data_construction/data_synthetic_eval.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Generate synthetic corruption (hallucinated) responses for EVAL split. - -This script: -- Loads clean DPO-ready Skywork eval transformation. -- Selects pairs where h_w=0 and h_l=1. -- Uses GPT-4o-mini to introduce subtle factual errors. -- Produces inverted (hallucinated, correct) preference pairs. 
-- Saves 400 synthetic eval corruption examples. - -Compatible with ruff, ruff-format, pydocstyle, and mypy. -""" - -from __future__ import annotations - -import asyncio -import json -import random -from pathlib import Path -from typing import Any, Dict, Optional - -from decouple import Config, RepositoryEnv -from openai import AsyncOpenAI -from tqdm.asyncio import tqdm as tqdm_asyncio -from utils import load_yaml - - -# ============================================================ -# Load config.yaml + .env API key -# ============================================================ - -CONFIG = load_yaml("config.yaml") # must contain: repository: -REPO_PATH = CONFIG["repository"] - -env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) -OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) - -if not OPENAI_API_KEY: - raise RuntimeError("❌ OPENAI_API_KEY missing in repository .env") - -client = AsyncOpenAI(api_key=OPENAI_API_KEY) - -# ============================================================ -# Paths -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -INPUT_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl" -OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl" - -# ============================================================ -# Prompts -# ============================================================ - -SYSTEM_PROMPT = """ -You are a factual corruption generator. - -Rewrite a correct answer into a subtly incorrect version with 2–3 factual errors. -It must remain fluent, confident, and plausible. -""" - - -def build_user_prompt(question: str, correct_answer: str) -> str: - """Construct user prompt.""" - return f""" -PROMPT: -{question} - -CORRECT ANSWER: -{correct_answer} - -TASK: -Rewrite this answer so it becomes factually incorrect while still sounding natural. 
-""" - - -# ============================================================ -# API wrapper -# ============================================================ - - -async def generate_corruption( - question: str, answer: str, semaphore: asyncio.Semaphore -) -> Optional[str]: - """Generate a hallucinated version of the answer.""" - user_prompt = build_user_prompt(question, answer) - - async with semaphore: - for retry in range(5): - try: - resp = await client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt}, - ], - temperature=0.8, - ) - return resp.choices[0].message.content.strip() - - except Exception as exc: - print(f"Retry ({retry}) corruption error:", exc) - await asyncio.sleep(1 + retry * 0.5) - - return None - - -# ============================================================ -# Process each item -# ============================================================ - - -async def process_item( - item: Dict[str, Any], semaphore: asyncio.Semaphore -) -> Optional[Dict[str, Any]]: - """Create synthetic corruption pair for an eval item.""" - prompt = item["prompt"] - correct = item["chosen"] - - corrupted = await generate_corruption(prompt, correct, semaphore) - if corrupted is None: - return None - - return { - "prompt": prompt, - "chosen": corrupted, - "rejected": correct, - "h_w": 1, - "h_l": 0, - "source": "synthetic_inversion_eval", - } - - -# ============================================================ -# Main -# ============================================================ - - -async def main() -> None: - """Run synthetic generation for evaluation.""" - target = 400 - - print(f"📥 Loading eval data → {INPUT_FILE}") - items = [json.loads(line) for line in INPUT_FILE.open("r", encoding="utf-8")] - - clean_pairs = [x for x in items if x.get("h_w") == 0 and x.get("h_l") == 1] - - selected = random.sample(clean_pairs, min(target, len(clean_pairs))) - print(f"🔎 Selected 
{len(selected)} items for corruption.") - - semaphore = asyncio.Semaphore(20) - coros = [process_item(item, semaphore) for item in selected] - - print("⚙️ Generating eval corruptions...") - results = await tqdm_asyncio.gather(*coros) - results = [r for r in results if r is not None] - - print(f"💾 Saving {len(results)} examples → {OUTPUT_FILE}") - with OUTPUT_FILE.open("w", encoding="utf-8") as f: - for r in results: - f.write(json.dumps(r, ensure_ascii=False) + "\n") - - print("✅ Eval synthetic corruption generation complete.") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/src/aixpert/data_construction/data_synthetic_train.py b/src/aixpert/data_construction/data_synthetic_train.py deleted file mode 100644 index b896f9a..0000000 --- a/src/aixpert/data_construction/data_synthetic_train.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -Generate synthetic corruption (hallucinated) responses for TRAIN split. - -This script: -- Loads clean DPO-ready Skywork transformation for training. -- Selects items where h_w=0 (winner factual) and h_l=1 (loser incorrect). -- Asks GPT-4o-mini to rewrite the factual answer into a subtle hallucination. -- Produces “inversion pairs” where corrupted is chosen and original is rejected. -- Saves up to 10,000 synthetic hallucination samples. - -Fully compatible with ruff, ruff-format, pydocstyle, and mypy. 
-""" - -from __future__ import annotations - -import asyncio -import json -import random -from pathlib import Path -from typing import Any, Dict, List, Optional - -from decouple import Config, RepositoryEnv -from openai import AsyncOpenAI -from tqdm.asyncio import tqdm as tqdm_asyncio -from utils import load_yaml - - -# ============================================================ -# Load config.yaml + .env API key -# ============================================================ - -CONFIG = load_yaml("config.yaml") # must contain: repository: -REPO_PATH = CONFIG["repository"] - -env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) -OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) - -if not OPENAI_API_KEY: - raise RuntimeError("❌ OPENAI_API_KEY missing in .env under repository path.") - -client = AsyncOpenAI(api_key=OPENAI_API_KEY) - -# ============================================================ -# Directories -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -INPUT_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl" -OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl" - -# ============================================================ -# Corruption prompt -# ============================================================ - -SYSTEM_PROMPT = """ -You are a factual corruption generator. - -Given a question and its correct answer, produce an incorrect version -that is fluent, confident, and well-structured but it must contain 2–3 factual inaccuracies. -Avoid nonsense. Keep the tone similar to the original answer. -""" - - -def build_user_prompt(question: str, correct_answer: str) -> str: - """Construct the user prompt for hallucination generation.""" - return f""" -PROMPT: -{question} - -CORRECT ANSWER: -{correct_answer} - -TASK: -Rewrite the answer so that it becomes factually wrong, introducing subtle hallucinations -while sounding confident and coherent. 
-""" - - -# ============================================================ -# API for corrupted (hallucinated) answer -# ============================================================ - - -async def generate_corruption( - question: str, - answer: str, - semaphore: asyncio.Semaphore, -) -> Optional[str]: - """Generate a hallucinated version of the correct answer using GPT-4o-mini.""" - user_prompt = build_user_prompt(question, answer) - - async with semaphore: - for retry in range(5): - try: - resp = await client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt}, - ], - temperature=0.8, - ) - return resp.choices[0].message.content.strip() - - except Exception as exc: - print(f"Retry corruption ({retry}): {exc}") - await asyncio.sleep(1 + retry * 0.5) - - return None - - -# ============================================================ -# Process one item -# ============================================================ - - -async def process_item( - item: Dict[str, Any], - semaphore: asyncio.Semaphore, -) -> Optional[Dict[str, Any]]: - """Produce one synthetic inversion (corruption) DPO sample.""" - prompt = item["prompt"] - correct_answer = item["chosen"] - - corrupted = await generate_corruption(prompt, correct_answer, semaphore) - - if corrupted is None: - return None - - return { - "prompt": prompt, - "chosen": corrupted, # hallucinated / corrupted - "rejected": correct_answer, # original factual answer - "h_w": 1, # corrupted = wrong - "h_l": 0, # original = correct - "source": "synthetic_inversion", - } - - -# ============================================================ -# Main -# ============================================================ - - -async def main() -> None: - """Generate 10k synthetic corruption pairs and save JSONL output.""" - target = 10_000 - print(f"📥 Loading training dataset → {INPUT_FILE}") - - items: List[Dict[str, Any]] = [ - json.loads(line) 
for line in INPUT_FILE.open("r", encoding="utf-8") - ] - - print("🔍 Selecting factual (0,1) pairs only...") - clean_pairs = [x for x in items if x["h_w"] == 0 and x["h_l"] == 1] - - print(f"Available factual pairs: {len(clean_pairs)}") - selected = random.sample(clean_pairs, target) - print(f"🎯 Selected {len(selected)} items for corruption generation.") - - semaphore = asyncio.Semaphore(20) - - tasks = [process_item(item, semaphore) for item in selected] - - print("⚙️ Generating corrupted answers...") - results = await tqdm_asyncio.gather(*tasks) - - final_rows = [r for r in results if r is not None] - - print(f"💾 Saving {len(final_rows)} synthetic samples → {OUTPUT_FILE}") - with OUTPUT_FILE.open("w", encoding="utf-8") as f: - for row in final_rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - print("✅ Synthetic corruption dataset created.") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/src/aixpert/data_construction/data_transform_eval.py b/src/aixpert/data_construction/data_transform_eval.py deleted file mode 100644 index 48468f6..0000000 --- a/src/aixpert/data_construction/data_transform_eval.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Transform binary factual-scored evaluation preference pairs into DPO-ready format. - -This script: -- Loads binary factual results for eval pairs. -- Converts response_0 / response_1 into (chosen, rejected) using the - better_response_id. -- Copies factual flags into h_w (winner) and h_l (loser). -- Preserves the original responses and adds a flipped=False flag. -- Writes the DPO-ready JSONL file for evaluation. - -Fully compliant with ruff, ruff-format, pydocstyle, and mypy. 
-""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - -from tqdm import tqdm - - -DATA_DIR = Path(__file__).resolve().parent / "data" - -INPUT_PATH = DATA_DIR / "skywork_binary_factual_eval.jsonl" -OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_eval.jsonl" - - -def process_item(item: Dict[str, Any]) -> Dict[str, Any]: - """Convert one binary factual-scored eval pair into DPO-ready structure.""" - prompt = item["prompt"] - r0 = item["response_0"] - r1 = item["response_1"] - pref = int(item["better_response_id"]) - - # factual flags - h0 = int(item["h0"]) - h1 = int(item["h1"]) - - if pref == 0: - chosen, rejected = r0, r1 - h_w, h_l = h0, h1 - else: - chosen, rejected = r1, r0 - h_w, h_l = h1, h0 - - return { - "prompt": prompt, - "chosen": chosen, - "rejected": rejected, - "h_w": h_w, - "h_l": h_l, - "better_response_id": pref, - "response_0": r0, - "response_1": r1, - "flipped": False, - } - - -def transform_dataset() -> None: - """Load eval dataset, apply transformation, and save JSONL output.""" - print(f"📥 Loading eval data → {INPUT_PATH}") - items = [json.loads(line) for line in INPUT_PATH.open("r", encoding="utf-8")] - - transformed: List[Dict[str, Any]] = [] - - print(f"⚙️ Processing {len(items)} items…") - for item in tqdm(items): - transformed.append(process_item(item)) - - print(f"💾 Saving output → {OUTPUT_PATH}") - with OUTPUT_PATH.open("w", encoding="utf-8") as f: - for obj in transformed: - f.write(json.dumps(obj, ensure_ascii=False) + "\n") - - print("\n=======================================") - print("✔ EVAL DATASET TRANSFORMATION COMPLETE") - print("✔ NO SAFE-DPO FLIPS APPLIED") - print(f"Total items: {len(items)}") - print("=======================================\n") - - -if __name__ == "__main__": - transform_dataset() diff --git a/src/aixpert/data_construction/data_transform_train.py b/src/aixpert/data_construction/data_transform_train.py deleted file mode 100644 index 
7b65397..0000000 --- a/src/aixpert/data_construction/data_transform_train.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Transform binary factual-scored training preference pairs into DPO-ready format. - -This script: -- Loads binary factual results for training pairs. -- Converts response_0 / response_1 into (chosen, rejected) strictly based on - better_response_id. -- Copies factual flags into h_w (winner) and h_l (loser). -- Preserves original responses and adds a flipped=False flag. -- Writes the DPO-ready JSONL file for training. - -Fully compatible with ruff, ruff-format, pydocstyle, and mypy. -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - -from tqdm import tqdm - - -DATA_DIR = Path(__file__).resolve().parent / "data" - -INPUT_PATH = DATA_DIR / "skywork_binary_factual_train.jsonl" -OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_train.jsonl" - - -def process_item(item: Dict[str, Any]) -> Dict[str, Any]: - """Convert one binary factual-scored pair into DPO-ready structure.""" - prompt = item["prompt"] - r0 = item["response_0"] - r1 = item["response_1"] - pref = int(item["better_response_id"]) - - # factual/hallucination flags - h0 = int(item["h0"]) - h1 = int(item["h1"]) - - # Determine chosen vs rejected based on preference label - if pref == 0: - chosen, rejected = r0, r1 - h_w, h_l = h0, h1 - else: - chosen, rejected = r1, r0 - h_w, h_l = h1, h0 - - return { - "prompt": prompt, - "chosen": chosen, - "rejected": rejected, - "h_w": h_w, - "h_l": h_l, - "better_response_id": pref, - "response_0": r0, - "response_1": r1, - "flipped": False, - } - - -def transform_dataset() -> None: - """Load training dataset, apply transformation, and save JSONL output.""" - print(f"📥 Loading training data → {INPUT_PATH}") - items = [json.loads(line) for line in INPUT_PATH.open("r", encoding="utf-8")] - - transformed: List[Dict[str, Any]] = [] - - print(f"⚙️ Processing {len(items)} items…") - for item in 
tqdm(items): - transformed.append(process_item(item)) - - print(f"💾 Saving output → {OUTPUT_PATH}") - with OUTPUT_PATH.open("w", encoding="utf-8") as f: - for obj in transformed: - f.write(json.dumps(obj, ensure_ascii=False) + "\n") - - print("\n=======================================") - print("✔ TRAIN DATASET TRANSFORMATION COMPLETE") - print("✔ NO SAFE-DPO FLIPS APPLIED") - print(f"Total items: {len(items)}") - print("=======================================\n") - - -if __name__ == "__main__": - transform_dataset() diff --git a/src/aixpert/data_construction/dataconversion.py b/src/aixpert/data_construction/dataconversion.py deleted file mode 100644 index 40a034d..0000000 --- a/src/aixpert/data_construction/dataconversion.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Generate preference pairs from cleaned Skywork samples. - -This script loads prompt/chosen/rejected rows from a JSONL dataset, randomly -assigns chosen/rejected responses into response_0 and response_1, assigns the -correct better_response_id, and saves the resulting dataset in JSONL format. - -This version is fully compliant with ruff, ruff-format, pydocstyle, and mypy. 
-""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List - - -# ============================================================ -# Configuration -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -DATA_DIR.mkdir(parents=True, exist_ok=True) - -INPUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl" -OUT_FILE = DATA_DIR / "skywork_preference_pairs_77k.jsonl" - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load a JSONL file into a list of dictionaries.""" - rows: List[Dict[str, Any]] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - rows.append(json.loads(line)) - return rows - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write a list of dictionaries to a JSONL file.""" - with path.open("w", encoding="utf-8") as f: - for row in rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - -def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Convert prompt/chosen/rejected rows into preference-pair format.""" - output: List[Dict[str, Any]] = [] - - for item in data: - prompt = item.get("prompt", "") - chosen = item.get("chosen", "") - rejected = item.get("rejected", "") - - if random.random() < 0.5: - response_0 = chosen - response_1 = rejected - better_response_id = 0 - else: - response_0 = rejected - response_1 = chosen - better_response_id = 1 - - output.append( - { - "prompt": prompt, - "response_0": response_0, - "response_1": response_1, - "better_response_id": better_response_id, - } - ) - - return output - - -def main() -> None: - """Generate evaluation preference pairs and save them to disk.""" - print(f"📥 Loading dataset from → {INPUT_FILE}") - - data = load_jsonl(INPUT_FILE) - print(f"📄 Loaded {len(data)} rows") - - preference_pairs = create_preference_pairs(data) - - write_jsonl(OUT_FILE, preference_pairs) - - 
print("======================================") - print(f"✅ DONE! Saved preference pairs → {OUT_FILE}") - print(f"📦 Total pairs: {len(preference_pairs)}") - print("======================================") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/dataconversion_eval.py b/src/aixpert/data_construction/dataconversion_eval.py deleted file mode 100644 index 6f6525d..0000000 --- a/src/aixpert/data_construction/dataconversion_eval.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Generate evaluation preference pairs from cleaned Skywork samples. - -This script loads prompt/chosen/rejected rows from the evaluation JSONL dataset, -randomly assigns chosen/rejected responses into response_0 and response_1, -assigns the correct better_response_id, and saves the resulting dataset in JSONL format. - -It mirrors the training script but operates on the evaluation split only. -""" - -from __future__ import annotations - -import json -import random -from pathlib import Path -from typing import Any, Dict, List - - -# ============================================================ -# Configuration -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -DATA_DIR.mkdir(parents=True, exist_ok=True) - -# Input/output file names for evaluation set -INPUT_FILE = DATA_DIR / "skywork_extracted_eval.jsonl" -OUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl" - - -def load_jsonl(path: Path) -> List[Dict[str, Any]]: - """Load a JSONL file into a list of dictionaries.""" - rows: List[Dict[str, Any]] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - rows.append(json.loads(line)) - return rows - - -def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write a list of dictionaries to a JSONL file.""" - with path.open("w", encoding="utf-8") as f: - for row in rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - -def create_preference_pairs(data: 
List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Convert prompt/chosen/rejected rows into preference-pair format.""" - output: List[Dict[str, Any]] = [] - - for item in data: - prompt = item.get("prompt", "") - chosen = item.get("chosen", "") - rejected = item.get("rejected", "") - - # Random assignment - if random.random() < 0.5: - response_0 = chosen - response_1 = rejected - better_response_id = 0 - else: - response_0 = rejected - response_1 = chosen - better_response_id = 1 - - output.append( - { - "prompt": prompt, - "response_0": response_0, - "response_1": response_1, - "better_response_id": better_response_id, - } - ) - - return output - - -def main() -> None: - """Generate evaluation preference pairs and save them to disk.""" - print(f"📥 Loading evaluation dataset from → {INPUT_FILE}") - - data = load_jsonl(INPUT_FILE) - print(f"📄 Loaded {len(data)} rows") - - preference_pairs = create_preference_pairs(data) - - write_jsonl(OUT_FILE, preference_pairs) - - print("======================================") - print(f"✅ DONE! Saved evaluation preference pairs → {OUT_FILE}") - print(f"📦 Total eval pairs: {len(preference_pairs)}") - print("======================================") - - -if __name__ == "__main__": - main() diff --git a/src/aixpert/data_construction/dataextraction.py b/src/aixpert/data_construction/dataextraction.py deleted file mode 100644 index 15ea172..0000000 --- a/src/aixpert/data_construction/dataextraction.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Skywork extraction utilities. - -This module extracts prompt/chosen/rejected fields from the Skywork Preference -dataset, removes exact duplicates, and writes the cleaned dataset to JSONL -files. Fully compatible with ruff, mypy, and the AI Engineering template. 
-""" - -from __future__ import annotations - -import json -import os -from pathlib import Path -from typing import Any, Dict, List, Union - -import pandas as pd -from datasets import load_dataset - - -# Path to: src/aixpert/data_construction/data/ -DATA_DIR = Path(__file__).resolve().parent / "data" -DATA_DIR.mkdir(parents=True, exist_ok=True) - - -SUBSET_SIZE = 80000 -OUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl" -REMOVED_FILE = DATA_DIR / "skywork_cleaned_77k.jsonl" - -print(f"📥 Loading first {SUBSET_SIZE} samples from Skywork...") - - -# ============================================================ -# Dataset loading -# ============================================================ -ds = load_dataset( - "Skywork/Skywork-Reward-Preference-80K-v0.1", - split=f"train[:{SUBSET_SIZE}]", -) - -df = ds.to_pandas() - - -# ============================================================ -# Extract prompt / chosen / rejected -# ============================================================ -def extract_prompt_from_dialog(dialog: List[Dict[str, Any]]) -> str: - """ - Extract the first user message from a dialog. - - Parameters - ---------- - dialog : list of dict - A list of message objects with "role" and "content" keys. - - Returns - ------- - str - The content of the first message with role 'user', or an empty string. - """ - for msg in dialog: - if msg.get("role") == "user": - return str(msg.get("content", "")).strip() - return "" - - -def extract_answer_from_dialog(dialog: List[Dict[str, Any]]) -> str: - """ - Extract the first assistant message from a dialog. - - Parameters - ---------- - dialog : list of dict - A list of message objects with "role" and "content" keys. - - Returns - ------- - str - The content of the first message with role 'assistant', or an empty string. 
- """ - for msg in dialog: - if msg.get("role") == "assistant": - return str(msg.get("content", "")).strip() - return "" - - -df["prompt"] = df["chosen"].apply(extract_prompt_from_dialog) -df["chosen"] = df["chosen"].apply(extract_answer_from_dialog) -df["rejected"] = df["rejected"].apply(extract_answer_from_dialog) - -clean_df = df[["prompt", "chosen", "rejected"]] - -# ============================================================ -# 🔍 Exact-match removal (chosen == rejected) -# ============================================================ -cleaned: List[Dict[str, str]] = [] -removed: List[Dict[str, str]] = [] - -for _, row in clean_df.iterrows(): - chosen = str(row["chosen"]).strip() - rejected = str(row["rejected"]).strip() - - sample = { - "prompt": str(row["prompt"]).strip(), - "chosen": chosen, - "rejected": rejected, - } - - if chosen == rejected: - removed.append(sample) - else: - cleaned.append(sample) - -print(f"🧹 Removed exact duplicates: {len(removed)}") -print(f"📦 Remaining clean samples: {len(cleaned)}") - -# Ensure output directory exists -os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True) - - -# ============================================================ -# Save output JSONL files -# ============================================================ -def write_jsonl(path: Union[str, Path], rows: List[Dict[str, Any]]) -> None: - """Write a list of dictionaries to a JSONL file.""" - with open(str(path), "w", encoding="utf-8") as f: - for row in rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - -write_jsonl(OUT_FILE, cleaned) -write_jsonl(REMOVED_FILE, removed) - -print(f"✅ Saved cleaned dataset ({len(cleaned)} samples) → {OUT_FILE}") -print(f"🗑️ Saved removed duplicates ({len(removed)} samples) → {REMOVED_FILE}") - -print(pd.DataFrame(cleaned).head()) diff --git a/src/aixpert/data_construction/dataextraction_eval.py b/src/aixpert/data_construction/dataextraction_eval.py deleted file mode 100644 index 1595767..0000000 --- 
a/src/aixpert/data_construction/dataextraction_eval.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Extract the evaluation slice of the Skywork preference dataset. - -This script extracts rows 80001–81000, removes exact duplicates, -and saves the cleaned dataset into JSONL files under the local data folder. -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - -from datasets import load_dataset - - -# ============================================================ -# Helpers -# ============================================================ - - -def extract_prompt(dialog: List[Dict[str, Any]]) -> str: - """Extract the first user message.""" - for msg in dialog: - if msg.get("role") == "user": - return str(msg.get("content", "")).strip() - return "" - - -def extract_answer(dialog: List[Dict[str, Any]]) -> str: - """Extract the first assistant message.""" - for msg in dialog: - if msg.get("role") == "assistant": - return str(msg.get("content", "")).strip() - return "" - - -def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write rows to a JSONL file.""" - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as f: - for row in rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - -# ============================================================ -# Constants for eval split -# ============================================================ - -START = 80001 -END = 81000 # inclusive - -print(f"📥 Loading eval slice: {START} → {END}") - -ds = load_dataset( - "Skywork/Skywork-Reward-Preference-80K-v0.1", - split=f"train[{START}:{END + 1}]", -) - -df = ds.to_pandas() - -df["prompt"] = df["chosen"].apply(extract_prompt) -df["chosen"] = df["chosen"].apply(extract_answer) -df["rejected"] = df["rejected"].apply(extract_answer) - -clean_df = df[["prompt", "chosen", "rejected"]] - -cleaned: List[Dict[str, str]] = [] -removed: List[Dict[str, str]] = [] - -for _, row in 
clean_df.iterrows(): - chosen = row["chosen"].strip() - rejected = row["rejected"].strip() - - record = { - "prompt": row["prompt"].strip(), - "chosen": chosen, - "rejected": rejected, - } - - if chosen == rejected: - removed.append(record) - else: - cleaned.append(record) - -print(f"🧹 Removed duplicates: {len(removed)}") -print(f"📦 Clean samples: {len(cleaned)}") - -# Save outputs -data_dir = Path(__file__).resolve().parent / "data" -save_jsonl(data_dir / "skywork_extracted_eval.jsonl", cleaned) -save_jsonl(data_dir / "skywork_eval_removed.jsonl", removed) - -print("✅ Saved eval dataset → skywork_eval.jsonl") diff --git a/src/aixpert/data_construction/dataextraction_eval2.py b/src/aixpert/data_construction/dataextraction_eval2.py deleted file mode 100644 index 83fb701..0000000 --- a/src/aixpert/data_construction/dataextraction_eval2.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Extract the test slice of the Skywork preference dataset. - -This script extracts rows 81001–81500, removes exact duplicates, -and saves the cleaned dataset into JSONL files under the local data folder. 
-""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any, Dict, List - -from datasets import load_dataset - - -# ============================================================ -# Helpers -# ============================================================ - - -def extract_prompt(dialog: List[Dict[str, Any]]) -> str: - """Extract the first user message.""" - for msg in dialog: - if msg.get("role") == "user": - return str(msg.get("content", "")).strip() - return "" - - -def extract_answer(dialog: List[Dict[str, Any]]) -> str: - """Extract the first assistant message.""" - for msg in dialog: - if msg.get("role") == "assistant": - return str(msg.get("content", "")).strip() - return "" - - -def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: - """Write rows to a JSONL file.""" - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as f: - for row in rows: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - - -# ============================================================ -# Constants for test split -# ============================================================ - -START = 81001 -END = 81500 # inclusive - -print(f"📥 Loading test slice: {START} → {END}") - -ds = load_dataset( - "Skywork/Skywork-Reward-Preference-80K-v0.1", - split=f"train[{START}:{END + 1}]", -) - -df = ds.to_pandas() - -df["prompt"] = df["chosen"].apply(extract_prompt) -df["chosen"] = df["chosen"].apply(extract_answer) -df["rejected"] = df["rejected"].apply(extract_answer) - -clean_df = df[["prompt", "chosen", "rejected"]] - -cleaned: List[Dict[str, str]] = [] -removed: List[Dict[str, str]] = [] - -for _, row in clean_df.iterrows(): - chosen = row["chosen"].strip() - rejected = row["rejected"].strip() - - record = { - "prompt": row["prompt"].strip(), - "chosen": chosen, - "rejected": rejected, - } - - if chosen == rejected: - removed.append(record) - else: - cleaned.append(record) - -print(f"🧹 Removed 
duplicates: {len(removed)}") -print(f"📦 Clean samples: {len(cleaned)}") - -# Save outputs -data_dir = Path(__file__).resolve().parent / "data" -save_jsonl(data_dir / "skywork_extracted_test.jsonl", cleaned) -save_jsonl(data_dir / "skywork_test_removed.jsonl", removed) - -print("✅ Saved test dataset → skywork_test.jsonl") diff --git a/src/aixpert/data_construction/dataset_eval.py b/src/aixpert/data_construction/dataset_eval.py deleted file mode 100644 index faa9d0c..0000000 --- a/src/aixpert/data_construction/dataset_eval.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Generate binary factuality labels (0=factual, 1=hallucinated) for test preference pairs. - -This script loads test preference pairs, evaluates each response using a -strict PKU-style binary factual judge. It then saves the dataset with: - - factual_flag_0, factual_flag_1 - h0, h1 (aliases for convenience) - -It supports automatic resume, concurrency throttling, and checkpointing. -Fully compliant with ruff, ruff-format, pydocstyle, and mypy. 
- -""" - -from __future__ import annotations - -import asyncio -import json -import os -import re -from pathlib import Path -from typing import Any, Dict, List - -from decouple import Config, RepositoryEnv -from openai import AsyncOpenAI -from tqdm.asyncio import tqdm_asyncio -from utils import load_yaml - - -# ============================================================ -# Load config.yaml + .env API key -# ============================================================ - -CONFIG = load_yaml("config.yaml") # MUST contain: repository: -REPO_PATH = CONFIG["repository"] - -env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) -OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) - -if not OPENAI_API_KEY: - raise RuntimeError("❌ OPENAI_API_KEY not found in .env under repository path.") - -client = AsyncOpenAI(api_key=OPENAI_API_KEY) - - -# ============================================================ -# Configuration & Directories -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -DATA_DIR.mkdir(parents=True, exist_ok=True) - -INPUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl" -OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_eval.jsonl" - -# ============================================================ -# Factual Judge Prompt -# ============================================================ - -BINARY_FACTUAL_JUDGE_PROMPT = """ -System: -You are an extremely strict factuality judge. Your task is to determine whether the assistant’s -response contains ANY of the following: - -• factual errors -• hallucinations -• unsupported claims -• unverifiable statements -• fabricated details -• incorrect technical descriptions -• invented studies, names, or entities -• logical contradictions -• vague or misleading explanations -• partial answers that omit critical facts - -IMPORTANT: -If the response is vague, incomplete, unsupported, or unverifiable, -classify it as [[1]]. 
- -Definitions: -[[0]] → Fully factual, precise, verifiable. -[[1]] → ANY issue: vague, incorrect, unverifiable, misleading, fabricated. - -Return ONLY: -[[0]] or [[1]] - -User Question: -{question} - -Assistant Response: -{answer} -""" - - -# ============================================================ -# API Call — Get Binary Factual Flag -# ============================================================ - - -async def get_factual_flag( - question: str, answer: str, semaphore: asyncio.Semaphore -) -> int: - """Evaluate a response for factual correctness (0=factual, 1=hallucinated).""" - prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer) - - async with semaphore: - for retry in range(5): - try: - response = await client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": prompt}], - temperature=0, - ) - - text = response.choices[0].message.content.strip() - match = re.search(r"\[\[(0|1)\]\]", text) - if match: - return int(match.group(1)) - - return 1 # default: hallucinated - - except Exception as exc: - print(f"Retry factual-flag ({retry}): {exc}") - await asyncio.sleep(1 + 0.5 * retry) - - return 1 - - -# ============================================================ -# Process One Item -# ============================================================ - - -async def process_single_item( - item: Dict[str, Any], semaphore: asyncio.Semaphore -) -> Dict[str, Any]: - """Process one preference pair and produce binary factual labels.""" - prompt = item["prompt"] - r0 = item["response_0"] - r1 = item["response_1"] - - f0_task = asyncio.create_task(get_factual_flag(prompt, r0, semaphore)) - f1_task = asyncio.create_task(get_factual_flag(prompt, r1, semaphore)) - - f0 = await f0_task - f1 = await f1_task - - return { - **item, - "factual_flag_0": f0, - "factual_flag_1": f1, - "h0": f0, - "h1": f1, - } - - -# ============================================================ -# Main Async Pipeline -# 
============================================================ - - -async def process_dataset() -> None: - """Load test dataset, compute factual flags, resume if needed, and save output.""" - print(f"📥 Loading test dataset → {INPUT_FILE}") - - with INPUT_FILE.open("r", encoding="utf-8") as f: - items = [json.loads(line) for line in f] - - # Resume mode - processed_count = 0 - if OUTPUT_FILE.exists(): - print("♻️ Resuming previous run...") - with OUTPUT_FILE.open("r", encoding="utf-8") as f: - processed_count = sum(1 for _ in f) - print(f"Found {processed_count} completed items.") - - remaining = items[processed_count:] - semaphore = asyncio.Semaphore(25) - - tasks = [ - asyncio.create_task(process_single_item(item, semaphore)) for item in remaining - ] - - buffer: List[str] = [] - count = processed_count - - with OUTPUT_FILE.open("a", encoding="utf-8") as f: - for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)): - result = await coro - buffer.append(json.dumps(result, ensure_ascii=False) + "\n") - count += 1 - - if len(buffer) >= 25: - f.writelines(buffer) - f.flush() - os.fsync(f.fileno()) - buffer.clear() - print(f"Checkpoint saved ({count} items).") - - # Flush final buffer - if buffer: - f.writelines(buffer) - f.flush() - os.fsync(f.fileno()) - print(f"Final checkpoint saved ({count} items).") - - print("✅ Completed test factual evaluation.") - - -# ============================================================ -# Entry Point -# ============================================================ - -if __name__ == "__main__": - asyncio.run(process_dataset()) diff --git a/src/aixpert/data_construction/dataset_train.py b/src/aixpert/data_construction/dataset_train.py deleted file mode 100644 index f24b773..0000000 --- a/src/aixpert/data_construction/dataset_train.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Generate binary factuality labels for evaluation preference pairs using GPT-4o-mini. 
- -This script loads evaluation preference pairs, evaluates each response using -a strict PKU-style binary factual judge, and saves the resulting dataset with: - - factual_flag_0, factual_flag_1 - h0, h1 (aliases for convenience) - -It supports automatic resume, concurrency throttling, and checkpointing. -Fully compliant with ruff, ruff-format, pydocstyle, and mypy. -""" - -from __future__ import annotations - -import asyncio -import json -import os -import re -from pathlib import Path -from typing import Any, Dict, List - -from decouple import Config, RepositoryEnv -from openai import AsyncOpenAI -from tqdm.asyncio import tqdm_asyncio -from utils import load_yaml # your YAML loader - - -# ============================================================ -# Load OpenAI API key -# ============================================================ - - -YAML_CONFIG = load_yaml("config.yaml") -REPO_PATH = YAML_CONFIG["repository"] - -# Load .env using decouple (the exact method you asked for) -env = Config(RepositoryEnv(f"{REPO_PATH}/.env")) -OPENAI_API_KEY = env("OPENAI_API_KEY", default=False) - -if not OPENAI_API_KEY: - raise ValueError("❌ OPENAI_API_KEY not found in .env file!") - -client = AsyncOpenAI(api_key=OPENAI_API_KEY) - - -# ============================================================ -# Configuration & Directories -# ============================================================ - -DATA_DIR = Path(__file__).resolve().parent / "data" -DATA_DIR.mkdir(parents=True, exist_ok=True) - -INPUT_FILE = DATA_DIR / "skywork_preference_pairs_train.jsonl" -OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_train.jsonl" - -# ============================================================ -# Factual Judge Prompt -# ============================================================ - -BINARY_FACTUAL_JUDGE_PROMPT = """ -System: -You are an extremely strict factuality judge. 
async def get_factual_flag(
    question: str, answer: str, semaphore: asyncio.Semaphore
) -> int:
    """Ask the judge model whether ``answer`` is factual.

    Args:
        question: User prompt the answer responds to.
        answer: Assistant response to be judged.
        semaphore: Limits how many judge requests run at the same time.

    Returns:
        ``0`` when the judge replies ``[[0]]`` (factual), otherwise ``1``.
        ``1`` is also the fallback for an unparseable or empty reply and for
        five consecutive failed requests.
    """
    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
    max_attempts = 5

    async with semaphore:
        for retry in range(max_attempts):
            try:
                response = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                # ``content`` can be None; treat that as an empty reply rather
                # than raising and wasting every retry on the same outcome.
                text = (response.choices[0].message.content or "").strip()
                match = re.search(r"\[\[(0|1)\]\]", text)
                if match:
                    return int(match.group(1))

                return 1  # default: hallucinated

            except Exception as exc:  # best effort: any API failure is retried
                print(f"Retry factual-flag ({retry}): {exc}")
                # No point in backing off after the last attempt.
                if retry < max_attempts - 1:
                    await asyncio.sleep(1 + 0.5 * retry)

    return 1
async def process_dataset() -> None:
    """Label every training pair in ``INPUT_FILE`` and append to ``OUTPUT_FILE``.

    Results are written in completion order, not input order, so a resumed run
    cannot assume that the first N input rows are the N rows already on disk.
    Finished rows are therefore recognised by their
    (prompt, response_0, response_1) content rather than by position.

    Output is flushed and fsynced after every 25 results and once more at the
    end, so an interrupted run loses at most one buffer of work.
    """
    from collections import Counter  # local import keeps this block self-contained

    def row_key(row: Dict[str, Any]) -> str:
        """Return a hashable identity for one preference pair."""
        return json.dumps(
            [row.get("prompt"), row.get("response_0"), row.get("response_1")],
            ensure_ascii=False,
            sort_keys=True,
        )

    # The original message said "eval" although INPUT_FILE is the train split.
    print(f"📥 Loading train dataset → {INPUT_FILE}")

    with INPUT_FILE.open("r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) are skipped instead of crashing.
        items = [json.loads(line) for line in f if line.strip()]

    # Resume mode: count how many times each pair is already on disk.
    already_done = Counter()
    processed_count = 0
    if OUTPUT_FILE.exists():
        print("♻️ Resuming previous run...")
        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    row = json.loads(line)
                except json.JSONDecodeError:
                    # Line cut short by an interrupted run; its item is redone.
                    continue
                already_done[row_key(row)] += 1
                processed_count += 1
        print(f"Found {processed_count} completed items.")

    # Skip exactly as many occurrences of each pair as were already written.
    remaining: List[Dict[str, Any]] = []
    for item in items:
        key = row_key(item)
        if already_done[key] > 0:
            already_done[key] -= 1
        else:
            remaining.append(item)

    semaphore = asyncio.Semaphore(25)

    tasks = [
        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
    ]

    buffer: List[str] = []
    count = processed_count

    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
            result = await coro
            buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
            count += 1

            if len(buffer) >= 25:
                f.writelines(buffer)
                f.flush()
                os.fsync(f.fileno())
                buffer.clear()
                print(f"Checkpoint saved ({count} items).")

        # Flush remaining
        if buffer:
            f.writelines(buffer)
            f.flush()
            os.fsync(f.fileno())
            print(f"Final checkpoint saved ({count} items).")

    print("✅ Completed factual evaluation.")
def main() -> None:
    """Extract the configured test slice of Skywork and write it as JSONL.

    Reads ``test_start``/``test_end`` and the dataset and output locations from
    the config, derives prompt/chosen/rejected columns, separates duplicates
    via ``filter_duplicates`` and saves the kept and removed rows.
    """
    config = load_config()
    hyper = config["hyperparams"]
    locations = config["paths"]

    start = hyper["test_start"]
    end = hyper["test_end"]

    print(f"Extracting test slice {start} → {end}")

    # ``end`` is inclusive in the config, the slice syntax is end-exclusive.
    split_spec = f"train[{start}:{end + 1}]"
    frame = load_dataset(locations["skywork_file"], split=split_spec).to_pandas()

    # Keep a handle on the raw column: the prompt must be derived from it
    # before "chosen" is overwritten with the extracted answer.
    raw_chosen = frame["chosen"]
    frame["prompt"] = raw_chosen.apply(extract_prompt)
    frame["chosen"] = raw_chosen.apply(extract_answer)
    frame["rejected"] = frame["rejected"].apply(extract_answer)

    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
    cleaned, removed = filter_duplicates(records)

    for path_key, payload in (
        ("skywork_test_cleaned", cleaned),
        ("skywork_test_removed", removed),
    ):
        save_jsonl(Path(locations[path_key]), payload)

    print(f"Removed duplicates: {len(removed)}")
    print(f"Clean samples: {len(cleaned)}")
    print("Test extraction completed.")
def main() -> None:
    """Extract the first ``subset_size`` Skywork rows as the training split.

    Derives prompt/chosen/rejected columns, separates duplicates via
    ``filter_duplicates`` and saves the kept and removed rows to the
    configured JSONL paths.
    """
    config = load_config()
    hyper = config["hyperparams"]
    locations = config["paths"]

    subset_size = hyper["subset_size"]

    print(f"Loading first {subset_size} samples from Skywork...")

    dataset = load_dataset(
        locations["skywork_file"],
        split=f"train[:{subset_size}]",
    )
    raw = dataset.to_pandas()

    # All three columns are computed from the raw frame, so the prompt is
    # taken from the unmodified "chosen" text.
    frame = raw.assign(
        prompt=raw["chosen"].apply(extract_prompt),
        chosen=raw["chosen"].apply(extract_answer),
        rejected=raw["rejected"].apply(extract_answer),
    )

    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
    cleaned, removed = filter_duplicates(records)

    save_jsonl(Path(locations["skywork_train_cleaned"]), cleaned)
    save_jsonl(Path(locations["skywork_train_removed"]), removed)

    for message in (
        f"Removed exact duplicates: {len(removed)}",
        f"Clean samples: {len(cleaned)}",
        "Training extraction completed.",
    ):
        print(message)
def main() -> None:
    """Extract the configured eval slice of Skywork and write it as JSONL.

    Reads ``eval_start``/``eval_end`` and the dataset and output locations from
    the config, derives prompt/chosen/rejected columns, separates duplicates
    via ``filter_duplicates`` and saves the kept and removed rows.
    """
    config = load_config()
    hyper = config["hyperparams"]
    locations = config["paths"]

    start, end = hyper["eval_start"], hyper["eval_end"]

    print(f"Extracting eval slice {start} → {end}")

    dataset = load_dataset(
        locations["skywork_file"],
        split=f"train[{start}:{end + 1}]",
    )
    frame = dataset.to_pandas()

    # Order matters: "prompt" is read from "chosen" before it is overwritten.
    derivations = (
        ("prompt", "chosen", extract_prompt),
        ("chosen", "chosen", extract_answer),
        ("rejected", "rejected", extract_answer),
    )
    for target, source, extractor in derivations:
        frame[target] = frame[source].apply(extractor)

    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
    cleaned, removed = filter_duplicates(records)

    save_jsonl(Path(locations["skywork_eval_cleaned"]), cleaned)
    save_jsonl(Path(locations["skywork_eval_removed"]), removed)

    print(f"Removed duplicates: {len(removed)}")
    print(f"Clean samples: {len(cleaned)}")
    print("Eval extraction completed.")
def main() -> None:
    """Build preference pairs from the cleaned training rows and save them.

    Input and output locations come from the ``paths`` section of the config;
    pair construction is delegated to ``create_preference_pairs``.
    """
    locations = load_config()["paths"]

    source = Path(locations["skywork_train_cleaned"])
    destination = Path(locations["skywork_train_pairs"])

    print(f"Loading training dataset → {source}")

    rows = load_jsonl(source)
    print(f"Loaded {len(rows)} rows")

    pairs = create_preference_pairs(rows)
    write_jsonl(destination, pairs)

    banner = "======================================"
    for line in (
        banner,
        f"Training preference pairs saved → {destination}",
        f"Total pairs: {len(pairs)}",
        banner,
    ):
        print(line)
def main() -> None:
    """Build preference pairs from the cleaned eval rows and save them.

    Input and output locations come from the ``paths`` section of the config;
    pair construction is delegated to ``create_preference_pairs``.
    """
    locations = load_config()["paths"]

    source = Path(locations["skywork_eval_cleaned"])
    destination = Path(locations["skywork_eval_pairs"])

    print(f"Loading evaluation dataset → {source}")

    rows = load_jsonl(source)
    print(f"Loaded {len(rows)} rows")

    pairs = create_preference_pairs(rows)
    write_jsonl(destination, pairs)

    banner = "======================================"
    print(banner)
    print(f"Eval preference pairs saved → {destination}")
    print(f"Total eval pairs: {len(pairs)}")
    print(banner)
async def main() -> None:
    """Run the factuality pipeline over the evaluation preference pairs.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is missing or empty in the
            repository's ``.env`` file.
    """
    config = load_config()

    repo_root = config["repository"]
    locations = config["paths"]
    hyper = config["hyperparams"]

    # Load API key
    dotenv = Config(RepositoryEnv(f"{repo_root}/.env"))
    secret = dotenv("OPENAI_API_KEY", default=None)
    if not secret:
        raise RuntimeError("Missing OPENAI_API_KEY in .env")

    judge_client = get_client(secret)

    source = Path(locations["skywork_eval_pairs"])
    destination = Path(locations["skywork_eval_factual"])

    pairs = load_jsonl(source)

    pipeline_args = {
        "client": judge_client,
        "items": pairs,
        "output_file": destination,
        "model": config["model"]["name"],
        "concurrency": hyper["concurrency_limit"],
        "max_retries": hyper["max_retries"],
    }
    await factual_evaluation_pipeline(**pipeline_args)

    print("Completed factual evaluation for evaluation set.")
def main() -> None:
    """Convert the factual-scored training pairs into the DPO-ready file.

    Both locations are read from the ``paths`` section of the config; the
    conversion itself is delegated to ``transform_dataset``.
    """
    locations = load_config()["paths"]

    source, destination = (
        Path(locations[key])
        for key in ("skywork_train_factual", "skywork_train_transformed")
    )

    transform_dataset(source, destination)
async def main() -> None:
    """Generate synthetic hallucination samples for training.

    Loads the transformed training pairs, keeps those with ``h_w == 0`` and
    ``h_l == 1``, samples up to ``synthetic_train_samples`` of them and runs
    ``process_item`` on each concurrently. Items for which ``process_item``
    returns ``None`` or raises are skipped, and the rest are written out.

    The candidate sample is drawn from a fixed-seed generator so repeated runs
    corrupt the same items, matching the seed used by the merge/final stages.
    """
    config = load_config()

    model = config["model"]["name"]
    api_key = config["openai_api_key"]

    target = config["hyperparams"]["synthetic_train_samples"]
    concurrency = config["hyperparams"]["corruption_concurrency"]
    max_retries = config["hyperparams"]["max_retries"]

    input_path = Path(config["paths"]["skywork_train_transformed"])
    output_path = Path(config["paths"]["synthetic_train_out"])

    print(f"Loading transformed training data → {input_path}")
    items = load_jsonl(input_path)

    print("🔍 Selecting (h_w=0, h_l=1) candidates…")
    valid = [x for x in items if x["h_w"] == 0 and x["h_l"] == 1]

    # Private RNG: reproducible selection without touching global random state.
    rng = random.Random(42)
    selected = rng.sample(valid, min(target, len(valid)))
    print(f"Selected {len(selected)} items for corruption.")

    client = AsyncOpenAI(api_key=api_key)
    sem = asyncio.Semaphore(concurrency)

    tasks = [process_item(item, client, sem, model, max_retries) for item in selected]
    # return_exceptions=True: one failing item must not discard the results of
    # every other (already paid for) generation.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    failures = [r for r in results if isinstance(r, BaseException)]
    if failures:
        print(f"Skipped {len(failures)} items that raised, e.g.: {failures[0]!r}")

    final_rows = [
        r for r in results if r is not None and not isinstance(r, BaseException)
    ]

    print(f"Saving {len(final_rows)} synthetic training samples → {output_path}")
    write_jsonl(output_path, final_rows)

    print("Synthetic training corruption generation complete.")
async def process_item(
    item: Dict[str, Any],
    client: AsyncOpenAI,
    sem: asyncio.Semaphore,
    model: str,
    max_retries: int,
) -> Optional[Dict[str, Any]]:
    """Create one synthetic inversion example for the eval split.

    Args:
        item: Source pair; its ``prompt`` and ``chosen`` fields are sent to
            ``generate_corruption``.
        client: OpenAI client forwarded to ``generate_corruption``.
        sem: Concurrency limiter forwarded to ``generate_corruption``.
        model: Model name forwarded to ``generate_corruption``.
        max_retries: Retry budget forwarded to ``generate_corruption``.

    Returns:
        The entry built by ``build_inversion_item`` with ``source`` set to
        ``"synthetic_inversion_eval"``, or ``None`` when no corrupted answer
        was produced.
    """
    request = {
        "client": client,
        "model": model,
        "question": item["prompt"],
        "answer": item["chosen"],
        "semaphore": sem,
        "max_retries": max_retries,
    }
    corrupted = await generate_corruption(**request)

    if corrupted is not None:
        entry = await build_inversion_item(item, corrupted)
        entry["source"] = "synthetic_inversion_eval"
        return entry

    return None
def main() -> None:
    """Combine synthetic inversion pairs with the real training pairs.

    The transformed Skywork rows are split by ``bucket_by_flags`` into the
    (0,0), (1,1) and (0,1) buckets. The first two are kept whole and the (0,1)
    bucket is down-sampled to ``merge_sample_01_train`` rows (default 10000).
    Everything is then shuffled with seed 42 and written out.
    """
    config = load_config()
    locations = config["paths"]
    hyper = config["hyperparams"]

    synthetic_file = Path(locations["synthetic_train_out"])
    skywork_file = Path(locations["skywork_train_transformed"])
    merged_file = Path(locations["final_train_merged"])

    quota_01 = hyper.get("merge_sample_01_train", 10000)

    print(f"📥 Loading synthetic → {synthetic_file}")
    synthetic = load_jsonl(synthetic_file)
    print(f"Synthetic count: {len(synthetic)}")

    print(f"📥 Loading transformed Skywork train → {skywork_file}")
    real_rows = load_jsonl(skywork_file)
    print(f"Skywork transformed count: {len(real_rows)}")

    # Bucket by (h_w, h_l)
    both_factual, both_hallucinated, factual_wins = bucket_by_flags(real_rows)

    for label, bucket in (
        ("(0,0)", both_factual),
        ("(1,1)", both_hallucinated),
        ("(0,1)", factual_wins),
    ):
        print(f"{label}: {len(bucket)}")

    # Seed once so both the subset and the final order are reproducible.
    random.seed(42)
    take = min(quota_01, len(factual_wins))
    sampled_01 = random.sample(factual_wins, take)
    print(f"Sampled (0,1): {len(sampled_01)}")

    merged = synthetic + both_factual + both_hallucinated + sampled_01

    print(f"Total merged before shuffle: {len(merged)}")
    random.shuffle(merged)

    print(f"💾 Saving final merged train → {merged_file}")
    write_jsonl(merged_file, merged)

    print("✅ TRAIN MERGE COMPLETE.\n")
def main() -> None:
    """Merge Skywork eval data with synthetic eval inversion pairs.

    All rows returned by ``bucket_by_flags`` for the transformed eval set are
    kept (no down-sampling) and combined with the synthetic rows. The result is
    shuffled with a fixed seed, as in the train merge script, so the written
    file is reproducible between runs.
    """
    cfg = load_config()
    paths = cfg["paths"]

    synthetic_path = Path(paths["synthetic_eval_out"])
    skywork_transformed_path = Path(paths["skywork_eval_transformed"])
    output_path = Path(paths["final_eval_merged"])

    print(f"📥 Loading synthetic eval → {synthetic_path}")
    synthetic = load_jsonl(synthetic_path)
    print(f"Synthetic eval count: {len(synthetic)}")

    print(f"📥 Loading transformed Skywork eval → {skywork_transformed_path}")
    sky = load_jsonl(skywork_transformed_path)
    print(f"Skywork eval count: {len(sky)}")

    # Split into buckets
    b00, b11, b01 = bucket_by_flags(sky)

    print(f"(0,0): {len(b00)}")
    print(f"(1,1): {len(b11)}")
    print(f"(0,1): {len(b01)}")

    # Eval uses ALL samples (no sampling)
    merged = synthetic + b00 + b11 + b01
    print(f"Total merged before shuffle: {len(merged)}")

    # Same seed as the train merge so the eval file order is deterministic.
    random.seed(42)
    random.shuffle(merged)

    print(f"💾 Saving final merged eval → {output_path}")
    write_jsonl(output_path, merged)

    print("✅ EVAL MERGE COMPLETE.\n")
def _as_flag_pair(key: Any) -> Tuple[int, int]:
    """Coerce a ``balance_targets`` key to an ``(h_w, h_l)`` integer pair.

    Accepts a 2-element tuple/list as well as strings such as ``"0,1"``,
    ``"0 1"`` or ``"(0, 1)"``, since config files rarely carry tuple keys.

    Raises:
        ValueError: If the key does not describe exactly two integers.
    """
    if isinstance(key, str):
        parts: List[Any] = key.strip("()[] ").replace(",", " ").split()
    else:
        parts = list(key)
    if len(parts) != 2:
        raise ValueError(f"Invalid balance target key: {key!r}")
    return int(parts[0]), int(parts[1])


def main() -> None:
    """Balanced sampling for TRAIN dataset.

    Buckets the merged training rows by ``(h_w, h_l)`` and draws the number of
    rows configured in ``balance_targets`` from each bucket. Sampling is
    without replacement when the bucket is large enough and with replacement
    otherwise. An empty or unknown bucket is reported and skipped instead of
    aborting the run. The combined rows are shuffled and written out.
    """
    cfg = load_config()
    paths = cfg["paths"]
    hp = cfg["hyperparams"]

    input_path = Path(paths["skywork_final_train"])
    output_path = Path(paths["final_train_out"])

    # Normalise keys so both tuple keys and config-style string keys work.
    target_counts: Dict[Tuple[int, int], int] = {
        _as_flag_pair(key): int(count) for key, count in hp["balance_targets"].items()
    }

    print(f"Loading → {input_path}")
    data = load_jsonl(input_path)

    # Initialize buckets
    buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
        (0, 1): [],
        (1, 0): [],
        (0, 0): [],
        (1, 1): [],
    }

    print("🔍 Bucketing samples…")
    for ex in data:
        key = (int(ex["h_w"]), int(ex["h_l"]))
        if key in buckets:
            buckets[key].append(ex)

    print("\n=== AVAILABLE PER BUCKET ===")
    for key, rows in buckets.items():
        print(f"{key}: {len(rows)}")

    final_rows: List[Dict[str, Any]] = []

    # Sampling logic
    for key, req_count in target_counts.items():
        pool = buckets.get(key, [])
        available = len(pool)

        print(f"\nBucket {key}: available={available}, required={req_count}")

        if available == 0:
            # random.choices raises IndexError on an empty population.
            if req_count > 0:
                print("⚠️ Bucket is empty; nothing to sample.")
            continue

        if available < req_count:
            print("⚠️ Sampling WITH replacement.")
            sampled = random.choices(pool, k=req_count)
        else:
            sampled = random.sample(pool, req_count)

        final_rows.extend(sampled)

    print(f"\nShuffling {len(final_rows)} rows…")
    random.shuffle(final_rows)

    print(f"Saving → {output_path}")
    write_jsonl(output_path, final_rows)

    print("\nTRAIN balanced dataset ready.")
    print(f"Final count: {len(final_rows)}")
def main() -> None:
    """Build the FINAL evaluation dataset.

    Combines the synthetic eval rows, all transformed Skywork eval rows and up
    to ``eval_additional_clean_samples`` rows each of (1,1) and (0,0) taken
    from the merged training file. Any row whose (prompt, chosen, rejected)
    triple already appears in the balanced training set is excluded from those
    extra pools. The result is shuffled and written out.

    When a pool holds fewer rows than requested, the whole pool is used and a
    warning is printed instead of failing with ``ValueError``.
    """
    cfg = load_config()
    paths = cfg["paths"]
    hp = cfg["hyperparams"]

    synthetic_path = Path(paths["synthetic_eval_out"])
    sky_eval_path = Path(paths["skywork_eval_transformed"])
    train_full_path = Path(paths["final_train_merged"])
    train_used_path = Path(paths["final_train_out"])
    output_path = Path(paths["final_eval_out"])

    add_n = hp["eval_additional_clean_samples"]

    print(f"Loading synthetic eval → {synthetic_path}")
    synthetic = load_jsonl(synthetic_path)

    print(f"Loading Skywork eval transformed → {sky_eval_path}")
    sky_eval = load_jsonl(sky_eval_path)

    print(f"Loading full training dataset → {train_full_path}")
    sky_train = load_jsonl(train_full_path)

    print(f"Loading train-balanced dataset (to exclude) → {train_used_path}")
    train_used = load_jsonl(train_used_path)

    # Set membership keeps the exclusion check O(1) per training row.
    exclude = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used}

    pool_11: List[Dict[str, Any]] = []
    pool_00: List[Dict[str, Any]] = []

    for ex in sky_train:
        key = (ex["prompt"], ex["chosen"], ex["rejected"])
        if key in exclude:
            continue

        if ex["h_w"] == 1 and ex["h_l"] == 1:
            pool_11.append(ex)
        elif ex["h_w"] == 0 and ex["h_l"] == 0:
            pool_00.append(ex)

    print(f"(1,1) pool after exclusion: {len(pool_11)}")
    print(f"(0,0) pool after exclusion: {len(pool_00)}")

    def draw(pool: List[Dict[str, Any]], label: str) -> List[Dict[str, Any]]:
        """Sample ``add_n`` rows, or the whole pool when it is too small."""
        if len(pool) < add_n:
            print(f"⚠️ {label} pool has only {len(pool)} rows (requested {add_n}).")
        return random.sample(pool, min(add_n, len(pool)))

    sample_11 = draw(pool_11, "(1,1)")
    sample_00 = draw(pool_00, "(0,0)")

    merged: List[Dict[str, Any]] = []
    merged.extend(synthetic)
    merged.extend(sky_eval)
    merged.extend(sample_11)
    merged.extend(sample_00)

    print(f"\nTotal before shuffle: {len(merged)}")

    random.shuffle(merged)

    print(f"Saving final eval → {output_path}")
    write_jsonl(output_path, merged)

    print("\nFINAL EVAL DATASET READY.")
    print(f"Final count: {len(merged)}")
def main() -> None:
    """Write a flipped copy of the final training dataset.

    Reads the ``final_train_out`` (input) and ``train_flipped_out`` (output)
    entries of the config's ``paths`` section, runs every loaded row through
    ``flip_sample``, saves the rows as JSONL and prints a short summary.
    """
    path_cfg = load_config()["paths"]
    input_path, output_path = (
        Path(path_cfg["final_train_out"]),
        Path(path_cfg["train_flipped_out"]),
    )

    print(f"Loading → {input_path}")
    records: List[Dict[str, Any]] = load_jsonl(input_path)

    print("Flipping (h_w=1, h_l=0) samples...")
    flipped = []
    for record in records:
        flipped.append(flip_sample(record))

    print(f"Saving flipped dataset → {output_path}")
    write_jsonl(output_path, flipped)

    # Closing summary banner.
    for line in (
        "\n==========================================",
        "TRAIN FLIP COMPLETE",
        f"Total samples processed: {len(flipped)}",
        "==========================================\n",
    ):
        print(line)
def main() -> None:
    """Write a flipped copy of the final evaluation dataset.

    Reads the ``final_eval_out`` (input) and ``eval_flipped_out`` (output)
    entries of the config's ``paths`` section, runs every loaded row through
    ``flip_sample``, saves the rows as JSONL and prints a short summary.
    """
    path_cfg = load_config()["paths"]
    input_path, output_path = (
        Path(path_cfg["final_eval_out"]),
        Path(path_cfg["eval_flipped_out"]),
    )

    print(f"Loading → {input_path}")
    records: List[Dict[str, Any]] = load_jsonl(input_path)

    print("Flipping (h_w=1, h_l=0) samples...")
    flipped = []
    for record in records:
        flipped.append(flip_sample(record))

    print(f"💾 Saving flipped dataset → {output_path}")
    write_jsonl(output_path, flipped)

    # Closing summary banner.
    for line in (
        "\n==========================================",
        "EVAL FLIP COMPLETE",
        f"Total samples processed: {len(flipped)}",
        "==========================================\n",
    ):
        print(line)
def load_config() -> Dict[str, Any]:
    """Load the global YAML configuration into a dictionary.

    The file at the module-level ``CONFIG_PATH`` is opened as UTF-8 and parsed
    with ``yaml.safe_load``.

    :return: The parsed configuration. An empty (or comment-only) file yields
        an empty dict rather than ``None``, so the return annotation holds.
    :raises OSError: If the config file cannot be opened.
    :raises yaml.YAMLError: If the file is not valid YAML.
    """
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for an empty document; normalise that to {} so
    # callers fail with a clear KeyError on a missing section instead of
    # "'NoneType' object is not subscriptable".
    return {} if loaded is None else loaded
def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
    """Return the stripped content of the first ``user`` message.

    :param dialog: Chat messages, each a dict with ``role`` and ``content``.
    :return: The first user message's text, or ``""`` when there is no user
        message or its content is missing/``None``.
    """
    for msg in dialog:
        if msg.get("role") == "user":
            content = msg.get("content")
            # A None content must not be stringified into the text "None".
            return "" if content is None else str(content).strip()
    return ""


def extract_answer(dialog: List[Dict[str, Any]]) -> str:
    """Return the stripped content of the first ``assistant`` message.

    :param dialog: Chat messages, each a dict with ``role`` and ``content``.
    :return: The first assistant reply's text, or ``""`` when there is no
        assistant message or its content is missing/``None``.
    """
    for msg in dialog:
        if msg.get("role") == "assistant":
            content = msg.get("content")
            # A None content must not be stringified into the text "None".
            return "" if content is None else str(content).strip()
    return ""


def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write a list of dictionaries to a JSONL file.

    Kept as an alias of :func:`write_jsonl` so both spellings share a single
    implementation.
    """
    write_jsonl(path, rows)


def filter_duplicates(
    rows: List[Dict[str, str]],
) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Split rows by whether ``chosen`` and ``rejected`` are identical.

    :param rows: Rows carrying ``chosen`` and ``rejected`` keys.
    :return: ``(cleaned, removed)`` where ``cleaned`` holds rows with
        ``chosen != rejected`` and ``removed`` the exact-duplicate rows.
        Input order is preserved within each list.
    """
    cleaned: List[Dict[str, str]] = []
    removed: List[Dict[str, str]] = []

    for row in rows:
        if row["chosen"] == row["rejected"]:
            removed.append(row)
        else:
            cleaned.append(row)

    return cleaned, removed


def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSONL file into a list of dictionaries.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.

    :param path: File to read (UTF-8).
    :return: One parsed object per non-blank line, in file order.
    """
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write a list of dictionaries to a JSONL file.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written as-is (UTF-8).

    :param path: Destination file.
    :param rows: Objects to serialise, one per line.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)


def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert ``(prompt, chosen, rejected)`` rows into preference pairs.

    For each row one ``random.random()`` draw decides whether the chosen
    answer is placed in ``response_0`` or ``response_1``;
    ``better_response_id`` records which slot holds it. Missing keys default
    to ``""``.

    :param data: Rows with ``prompt``, ``chosen`` and ``rejected`` keys.
    :return: One dict per row with ``prompt``, ``response_0``,
        ``response_1`` and ``better_response_id``.
    """
    output: List[Dict[str, Any]] = []

    for item in data:
        prompt = item.get("prompt", "")
        chosen = item.get("chosen", "")
        rejected = item.get("rejected", "")

        if random.random() < 0.5:
            response_0, response_1, better_response_id = chosen, rejected, 0
        else:
            response_0, response_1, better_response_id = rejected, chosen, 1

        output.append(
            {
                "prompt": prompt,
                "response_0": response_0,
                "response_1": response_1,
                "better_response_id": better_response_id,
            }
        )

    return output


def bucket_by_flags(
    items: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split items into ``(0,0)``, ``(1,1)`` and ``(0,1)`` buckets.

    Buckets are keyed on ``(h_w, h_l)``. Rows with any other combination
    (for example ``(1, 0)``) are not returned in any bucket.

    :param items: Rows carrying ``h_w`` and ``h_l`` keys.
    :return: ``(b00, b11, b01)`` in that order.
    """
    b00: List[Dict[str, Any]] = []
    b11: List[Dict[str, Any]] = []
    b01: List[Dict[str, Any]] = []

    for ex in items:
        h_w, h_l = ex["h_w"], ex["h_l"]

        if h_w == 0 and h_l == 0:
            b00.append(ex)
        elif h_w == 1 and h_l == 1:
            b11.append(ex)
        elif h_w == 0 and h_l == 1:
            b01.append(ex)

    return b00, b11, b01


def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
    """Flip a sample if ``(h_w, h_l) == (1, 0)``.

    A matching sample has its flags set to ``(0, 1)`` and its ``chosen`` and
    ``rejected`` values swapped. Any other sample is left untouched.

    NOTE: the dict is modified in place and the same object is returned.

    :param item: Sample to inspect.
    :return: ``item`` itself, flipped when it matched.
    """
    if item.get("h_w") == 1 and item.get("h_l") == 0:
        item["h_w"], item["h_l"] = 0, 1
        item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
    return item
def transform_dataset(input_path: Path, output_path: Path) -> None:
    """Convert a factual-scored JSONL file into DPO-ready rows.

    Every row loaded from ``input_path`` is passed through ``process_item``
    (with a ``tqdm`` progress bar) and the converted rows are written to
    ``output_path``. Progress and a final summary are printed.

    :param input_path: JSONL file of factual-scored items.
    :param output_path: Destination JSONL file for the converted rows.
    """
    print(f"Loading → {input_path}")
    items = load_jsonl(input_path)

    print(f"⚙️ Transforming {len(items)} items…")
    transformed = []
    for record in tqdm(items):
        transformed.append(process_item(record))

    print(f"Saving → {output_path}")
    write_jsonl(output_path, transformed)

    # Closing summary banner.
    for line in (
        "\n=======================================",
        "✔ TRANSFORMATION COMPLETE",
        f"Total items: {len(items)}",
        "=======================================\n",
    ):
        print(line)
def get_client(api_key: str) -> AsyncOpenAI:
    """Return an ``AsyncOpenAI`` client built from ``api_key``."""
    return AsyncOpenAI(api_key=api_key)


async def get_factual_flag(
    client: AsyncOpenAI,
    model: str,
    question: str,
    answer: str,
    semaphore: asyncio.Semaphore,
    max_retries: int,
) -> int:
    """Evaluate factual correctness (0 factual, 1 hallucinated).

    The judge prompt is filled with ``question`` and ``answer`` and sent as a
    single user message at temperature 0. The first ``[[0]]`` / ``[[1]]``
    marker in the reply decides the flag.

    :param client: Chat-completions client used for the request.
    :param model: Judge model name.
    :param question: The user prompt being judged.
    :param answer: The assistant response being judged.
    :param semaphore: Limits concurrent requests; held for the whole retry
        loop, including the back-off sleeps.
    :param max_retries: Maximum number of attempts.
    :return: The parsed flag. ``1`` is also returned when the reply holds no
        marker or when every attempt raised.
    """
    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)

    async with semaphore:
        for retry in range(max_retries):
            try:
                resp = await client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                text = resp.choices[0].message.content.strip()
                match = re.search(r"\[\[(0|1)\]\]", text)
                return int(match.group(1)) if match else 1
            except Exception as exc:
                # Best-effort: report the failure (instead of swallowing it
                # silently), back off linearly, and try again.
                print(f"[Retry {retry}] factual judge call failed: {exc}")
                await asyncio.sleep(1 + retry * 0.5)

    # All attempts failed: fall back to the conservative "hallucinated" flag.
    return 1


async def evaluate_pair(
    client: AsyncOpenAI,
    item: Dict[str, Any],
    model: str,
    sem: asyncio.Semaphore,
    retries: int,
) -> Dict[str, Any]:
    """Compute factual flags for ``response_0`` and ``response_1``.

    Both responses are judged concurrently against the item's ``prompt``.

    :return: A copy of ``item`` extended with ``factual_flag_0``,
        ``factual_flag_1`` and their aliases ``h0`` / ``h1``.
    """
    prompt = item["prompt"]

    f0, f1 = await asyncio.gather(
        get_factual_flag(client, model, prompt, item["response_0"], sem, retries),
        get_factual_flag(client, model, prompt, item["response_1"], sem, retries),
    )

    return {
        **item,
        "factual_flag_0": f0,
        "factual_flag_1": f1,
        "h0": f0,
        "h1": f1,
    }


def _flush_lines(handle: Any, lines: List[str]) -> None:
    """Write buffered lines to ``handle``, force them to disk, empty the buffer."""
    if not lines:
        return
    handle.writelines(lines)
    handle.flush()
    os.fsync(handle.fileno())
    lines.clear()


async def factual_evaluation_pipeline(
    client: AsyncOpenAI,
    items: List[Dict[str, Any]],
    output_file: Path,
    model: str,
    concurrency: int,
    max_retries: int,
) -> None:
    """Run factuality evaluation with resume and checkpoint support.

    Resume works by counting the lines already present in ``output_file`` and
    skipping that many leading entries of ``items``. That is only correct if
    line *i* of the output always corresponds to ``items[i]``, so results are
    written strictly in input order. All requests are still scheduled up
    front and run concurrently (bounded by ``concurrency``); only the
    *writing* waits for the earliest outstanding item.

    NOTE: an output file produced in completion order by an earlier version
    of this function cannot be resumed reliably — delete it and rerun.

    :param client: Chat-completions client.
    :param items: Preference pairs with ``prompt``, ``response_0`` and
        ``response_1``.
    :param output_file: JSONL file to append to (created if missing).
    :param model: Judge model name.
    :param concurrency: Maximum number of simultaneous judge requests.
    :param max_retries: Attempts per judge request.
    """
    processed = 0
    if output_file.exists():
        with output_file.open("r", encoding="utf-8") as f:
            processed = sum(1 for _ in f)

    remaining = items[processed:]
    sem = asyncio.Semaphore(concurrency)

    # Schedule everything now so requests overlap; results are collected in
    # input order below.
    tasks = [
        asyncio.ensure_future(evaluate_pair(client, item, model, sem, max_retries))
        for item in remaining
    ]

    buffer: List[str] = []

    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("a", encoding="utf-8") as f:
        try:
            # The bar advances in input order, so it may briefly lag behind
            # the number of requests that have actually finished.
            with tqdm_asyncio(total=len(tasks)) as progress:
                for task in tasks:
                    out = await task
                    buffer.append(json.dumps(out, ensure_ascii=False) + "\n")
                    progress.update(1)

                    # Checkpoint every 25 results.
                    if len(buffer) >= 25:
                        _flush_lines(f, buffer)
        finally:
            # Whatever is buffered is a valid in-order prefix: persist it even
            # on failure so the next run resumes from the right place.
            _flush_lines(f, buffer)
            for task in tasks:
                task.cancel()  # no-op for tasks that already finished
async def generate_corruption(
    client: AsyncOpenAI,
    model: str,
    question: str,
    answer: str,
    semaphore: asyncio.Semaphore,
    max_retries: int = 5,
    temperature: float = 0.8,
) -> Optional[str]:
    """Ask the model for a corrupted (hallucinated) rewrite of ``answer``.

    The corruption system prompt plus a user prompt built from ``question``
    and ``answer`` are sent through ``client``. Failed attempts are reported
    with ``print`` and retried after a linearly growing pause, all while
    holding ``semaphore``.

    :return: The stripped reply text, or ``None`` once ``max_retries``
        attempts have all raised.
    """
    user_prompt = CORRUPTION_USER_PROMPT.format(question=question, answer=answer)
    chat_messages = [
        {"role": "system", "content": CORRUPTION_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    async with semaphore:
        retry = 0
        while retry < max_retries:
            try:
                resp = await client.chat.completions.create(
                    model=model,
                    messages=chat_messages,
                    temperature=temperature,
                )
                corrupted = resp.choices[0].message.content.strip()
            except Exception as exc:
                print(f"[Retry {retry}] corruption generation failed: {exc}")
                await asyncio.sleep(1 + retry * 0.5)
                retry += 1
            else:
                return corrupted

    return None
item["chosen"], + "h_w": 1, + "h_l": 0, + "source": "synthetic_inversion", + } From 7d3ee5a645af7484992c5cd26fd2f18c2d7b014b Mon Sep 17 00:00:00 2001 From: sindchad Date: Thu, 4 Dec 2025 19:12:16 -0500 Subject: [PATCH 08/14] readme file --- src/aixpert/data_construction/Readme.md | 20 +++++++++++++++++++ .../stage_3_factuality/dataset_val.py | 1 - .../stage_6_merging/data_merge_train.py | 10 ++++------ .../stage_6_merging/data_merge_val.py | 10 ++++------ .../stage_7_final/data_final_train.py | 2 +- .../stage_8_flipping/data_flipped_val.py | 2 +- .../data_construction/utils/data_utils.py | 1 - 7 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md index e69de29..1a94f19 100644 --- a/src/aixpert/data_construction/Readme.md +++ b/src/aixpert/data_construction/Readme.md @@ -0,0 +1,20 @@ +# Skywork → Factual-DPO Data Construction Pipeline + +This repository contains a complete, modular, and type-safe data-construction pipeline for generating **factual-aware DPO datasets** from the **Skywork Reward-Preference-80K** dataset. 
+ +The pipeline supports: +- Direct Preference Optimization (DPO) +- Factual-DPO +- Synthetic hallucination inversion pairs +- Balanced and flipped datasets + +## Configuration + +All configuration is centralized in: + +```bash +src/aixpert/config/config.yaml +``` +Loaded dynamically using: +```python +utils/config_loader.load_config() diff --git a/src/aixpert/data_construction/stage_3_factuality/dataset_val.py b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py index 4e306f6..0aea440 100644 --- a/src/aixpert/data_construction/stage_3_factuality/dataset_val.py +++ b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py @@ -22,7 +22,6 @@ async def main() -> None: paths = cfg["paths"] hp = cfg["hyperparams"] - # Load API key env = Config(RepositoryEnv(f"{repo_path}/.env")) api_key = env("OPENAI_API_KEY", default=None) if not api_key: diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_train.py b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py index 3b76c49..b70f5f2 100644 --- a/src/aixpert/data_construction/stage_6_merging/data_merge_train.py +++ b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py @@ -33,22 +33,20 @@ def main() -> None: sample_size = hp.get("merge_sample_01_train", 10000) - print(f"📥 Loading synthetic → {synthetic_path}") + print(f"Loading synthetic → {synthetic_path}") synthetic = load_jsonl(synthetic_path) print(f"Synthetic count: {len(synthetic)}") - print(f"📥 Loading transformed Skywork train → {skywork_transformed_path}") + print(f"Loading transformed Skywork train → {skywork_transformed_path}") sky = load_jsonl(skywork_transformed_path) print(f"Skywork transformed count: {len(sky)}") - # Bucket by (h_w, h_l) b00, b11, b01 = bucket_by_flags(sky) print(f"(0,0): {len(b00)}") print(f"(1,1): {len(b11)}") print(f"(0,1): {len(b01)}") - # Sample subset of (0,1) random.seed(42) sample_01 = random.sample(b01, min(sample_size, len(b01))) print(f"Sampled (0,1): {len(sample_01)}") 
@@ -58,10 +56,10 @@ def main() -> None: print(f"Total merged before shuffle: {len(merged)}") random.shuffle(merged) - print(f"💾 Saving final merged train → {output_path}") + print(f"Saving final merged train → {output_path}") write_jsonl(output_path, merged) - print("✅ TRAIN MERGE COMPLETE.\n") + print("TRAIN MERGE COMPLETE.\n") if __name__ == "__main__": diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_val.py b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py index 2696788..2e114eb 100644 --- a/src/aixpert/data_construction/stage_6_merging/data_merge_val.py +++ b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py @@ -30,31 +30,29 @@ def main() -> None: skywork_transformed_path = Path(paths["skywork_eval_transformed"]) output_path = Path(paths["final_eval_merged"]) - print(f"📥 Loading synthetic eval → {synthetic_path}") + print(f"Loading synthetic eval → {synthetic_path}") synthetic = load_jsonl(synthetic_path) print(f"Synthetic eval count: {len(synthetic)}") - print(f"📥 Loading transformed Skywork eval → {skywork_transformed_path}") + print(f"Loading transformed Skywork eval → {skywork_transformed_path}") sky = load_jsonl(skywork_transformed_path) print(f"Skywork eval count: {len(sky)}") - # Split into buckets b00, b11, b01 = bucket_by_flags(sky) print(f"(0,0): {len(b00)}") print(f"(1,1): {len(b11)}") print(f"(0,1): {len(b01)}") - # Eval uses ALL samples (no sampling) merged = synthetic + b00 + b11 + b01 print(f"Total merged before shuffle: {len(merged)}") random.shuffle(merged) - print(f"💾 Saving final merged eval → {output_path}") + print(f"Saving final merged eval → {output_path}") write_jsonl(output_path, merged) - print("✅ EVAL MERGE COMPLETE.\n") + print("EVAL MERGE COMPLETE.\n") if __name__ == "__main__": diff --git a/src/aixpert/data_construction/stage_7_final/data_final_train.py b/src/aixpert/data_construction/stage_7_final/data_final_train.py index 15cd7c5..7754d4d 100644 --- 
a/src/aixpert/data_construction/stage_7_final/data_final_train.py +++ b/src/aixpert/data_construction/stage_7_final/data_final_train.py @@ -46,7 +46,7 @@ def main() -> None: (1, 1): [], } - print("🔍 Bucketing samples…") + print("Bucketing samples…") for ex in data: key = (int(ex["h_w"]), int(ex["h_l"])) if key in buckets: diff --git a/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py index 46cd3bc..f34b1ad 100644 --- a/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py +++ b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py @@ -29,7 +29,7 @@ def main() -> None: print("Flipping (h_w=1, h_l=0) samples...") flipped = [flip_sample(item) for item in items] - print(f"💾 Saving flipped dataset → {output_path}") + print(f"Saving flipped dataset → {output_path}") write_jsonl(output_path, flipped) print("\n==========================================") diff --git a/src/aixpert/data_construction/utils/data_utils.py b/src/aixpert/data_construction/utils/data_utils.py index b70c9eb..d4268b2 100644 --- a/src/aixpert/data_construction/utils/data_utils.py +++ b/src/aixpert/data_construction/utils/data_utils.py @@ -75,7 +75,6 @@ def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: chosen = item.get("chosen", "") rejected = item.get("rejected", "") - # Random symmetric assignment if random.random() < 0.5: response_0 = chosen response_1 = rejected From d774776ee27d56f76b0bbf5521d57e2deda4e957 Mon Sep 17 00:00:00 2001 From: sindchad Date: Thu, 4 Dec 2025 19:19:17 -0500 Subject: [PATCH 09/14] readme updates --- src/aixpert/data_construction/Readme.md | 245 ++++++++++++++++++++++++ 1 file changed, 245 insertions(+) diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md index 1a94f19..ca166f8 100644 --- a/src/aixpert/data_construction/Readme.md +++ b/src/aixpert/data_construction/Readme.md @@ -18,3 +18,248 
@@ src/aixpert/config/config.yaml Loaded dynamically using: ```python utils/config_loader.load_config() +``` +## Project Structure +src/aixpert/ +│ +├── config/ +│ └── config.yaml # All paths + hyperparameters +│ +├── data_construction/ +│ ├── stage_1_extraction/ # Train/Eval/Test extraction +│ ├── stage_2_conversion/ # Preference-pair creation +│ ├── stage_3_factuality/ # Binary factual scoring (LLM) +│ ├── stage_4_transformation/ # Convert factual pairs → DPO format +│ ├── stage_5_syntheticdata/ # Synthetic hallucination generation +│ ├── stage_6_merging/ # Merge Skywork + synthetic +│ ├── stage_7_balancing/ # Balanced train/eval dataset construction +│ ├── stage_8_flipping/ # Flip (1,0) → (0,1) datasets +│ └── utils/ # Core shared utils +│ +└── ... + +## Configuration Summary (`config.yaml`) + +### Model Settings +- **model.name:** `gpt-4o-mini` +- **model.temperature:** `0.8` + +--- + +### Paths (All datasets + intermediate outputs) + +The configuration tracks every stage of the data pipeline, including: + +- Cleaned **train / eval / test** splits +- **Preference pairs** (DPO-style) +- **Factual-scored** outputs +- **Synthetic inversion** samples (train + eval) +- **Merged** intermediate datasets +- **Balanced** final datasets +- **Flipped** datasets for ablation + +**Examples:** +```yaml +skywork_train_cleaned: "src/.../skywork_extracted_77k.jsonl" +skywork_train_pairs: "src/.../skywork_preference_pairs_77k.jsonl" +skywork_train_factual: "src/.../skywork_binary_factual_train.jsonl" +final_train_out: "src/.../train_balanced.jsonl" +``` + +## Pipeline Stages — Summary + +Below is a concise overview of all eight stages in the Skywork → Factual-DPO data pipeline. 
+ +--- + +### ** Stage 1 — Skywork Extraction** +**Scripts:** +- `dataextraction_train.py` +- `dataextraction_eval.py` +- `dataextraction_test.py`(These samples are directly used in evaluation) + +**Tasks:** +- Load slices from Skywork Preference dataset +- Extract: + - **prompt** (first user message) + - **chosen** (assistant reply) + - **rejected** (assistant reply) +- Remove exact duplicates +- Save cleaned JSONL files + +--- + +### ** Stage 2 — Preference Pair Conversion** +**Scripts:** +- `dataconversion_train.py` +- `dataconversion_eval.py` + +**Tasks:** +- Convert `(prompt, chosen, rejected)` → **DPO-style preference pairs** +- Produce: + - `response_0`, `response_1` + - `better_response_id` +- Random symmetric assignment for unbiased supervision + +--- + +### ** Stage 3 — Binary Factuality Evaluation** +**Scripts:** +- `dataset_train.py` +- `dataset_val.py` + +**Components:** +Uses `utils.factual_utils` to evaluate factual correctness using **GPT-4o-mini**. + +**Outputs:** +- Binary hallucination flags: + - `h0`, `h1` (aliases for `factual_flag_0`, `factual_flag_1`) + +**Features:** +- Resume-safe incremental scoring +- Async concurrency +- Retry logic + +--- + +### ** Stage 4 — DPO Transformation** +**Scripts:** +- `data_transform_train.py` +- `data_transform_val.py` + +**Tasks:** +Transform factual-scored items into canonical DPO format: + +- `prompt`, `chosen`, `rejected` +- `h_w`, `h_l` +- `response_0`, `response_1` +- `flipped=False` + +--- + +### ** Stage 5 — Synthetic Hallucination Generation** +**Scripts:** +- `data_synthetic_train.py` +- `data_synthetic_val.py` + +**Tasks:** +- Select samples where winner is factual (`h_w=0`) and loser is incorrect (`h_l=1`) +- Use **GPT-4o-mini** to generate hallucinated corruptions +- Build synthetic inversion pairs + +**Outputs:** +- **10,000** synthetic train samples +- **400** synthetic eval samples + +--- + +### ** Stage 6 — Merging** +**Scripts:** +- `merge_train.py` +- `merge_eval.py` + +**Tasks:** +- Merge 
Skywork transformed data with synthetic inversion pairs +- Bucket by `(h_w, h_l)` +- Sample subsets +- Shuffle and save merged datasets + +--- + +### ** Stage 7 — Balanced Dataset Construction** +**Scripts:** +- `balance_train.py` +- `build_final_eval.py` + +**Train Balancing:** +Use `balance_targets` to create balanced buckets: + +- `(0,1)` — 10,000 +- `(1,0)` — 10,000 +- `(0,0)` — 15,000 +- `(1,1)` — 10,000 + +**Eval Construction:** +Combine: +- Skywork eval transformed +- 400 synthetic eval inversion samples +- 1500 clean `(1,1)` samples (unused in train) +- 1500 clean `(0,0)` samples (unused in train) + +--- + +### ** Stage 8 — Flipping (Optional)** +**Scripts:** +- `data_flipped_train.py` +- `data_flipped_val.py` + +**Tasks:** +- Flip all `(1,0)` samples → `(0,1)` +- Swap `chosen` ↔ `rejected` +- Produce alternate dataset for inversion or ablation studies + +--- + +This structured overview provides a clear high-level map of the complete Factual-DPO data construction workflow. + +## Utilities Summary + +### `utils/config_loader.py` +- Centralized configuration loader +- All stages call `load_config()` to read `config.yaml` + +--- + +### `utils/data_utils.py` +Core data-processing helpers: +- `extract_prompt()` — first user message +- `extract_answer()` — first assistant reply +- `filter_duplicates()` — removes exact matches +- `create_preference_pairs()` — builds DPO response pairs +- `bucket_by_flags()` — groups by (h_w, h_l) +- `flip_sample()` — converts (1,0) → (0,1) +- JSONL read/write utilities + +--- + +### `utils/factual_utils.py` +- Async binary factuality scoring using GPT-4o-mini +- Concurrency + retry logic +- Resume-safe checkpointing +- Produces `h0`, `h1` hallucination flags + +--- + +### `utils/dpo_transform_utils.py` +- Converts factual-scored items into final DPO format: + - `prompt`, `chosen`, `rejected`, `h_w`, `h_l`, `response_0`, `response_1`, `flipped=False` + +--- + +### `utils/synthetic_utils.py` +- GPT-based corruption generator +- 
Creates synthetic inversion pairs (hallucinated → correct) + +--- + +### `utils/prompt_templates.py` +Provides all system/user prompts: +- Strict factuality judge prompt +- Hallucination corruption prompts + +--- + +## Running the Pipeline + +Example sequence for **training pipeline**: + +```bash +python src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py +python src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py +python src/aixpert/data_construction/stage_3_factuality/dataset_train.py +python src/aixpert/data_construction/stage_4_transformation/data_transform_train.py +python src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py +python src/aixpert/data_construction/stage_6_merging/merge_train.py +python src/aixpert/data_construction/stage_7_balancing/balance_train.py +python src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py +``` From 0cd7a9ae50329ebd5b7cada8056a1efb188360d2 Mon Sep 17 00:00:00 2001 From: sindchad Date: Thu, 4 Dec 2025 19:22:59 -0500 Subject: [PATCH 10/14] readme updates --- src/aixpert/data_construction/Readme.md | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md index ca166f8..2b3fee7 100644 --- a/src/aixpert/data_construction/Readme.md +++ b/src/aixpert/data_construction/Readme.md @@ -19,25 +19,6 @@ Loaded dynamically using: ```python utils/config_loader.load_config() ``` -## Project Structure -src/aixpert/ -│ -├── config/ -│ └── config.yaml # All paths + hyperparameters -│ -├── data_construction/ -│ ├── stage_1_extraction/ # Train/Eval/Test extraction -│ ├── stage_2_conversion/ # Preference-pair creation -│ ├── stage_3_factuality/ # Binary factual scoring (LLM) -│ ├── stage_4_transformation/ # Convert factual pairs → DPO format -│ ├── stage_5_syntheticdata/ # Synthetic hallucination generation -│ ├── stage_6_merging/ # Merge Skywork + synthetic -│ ├── 
stage_7_balancing/ # Balanced train/eval dataset construction -│ ├── stage_8_flipping/ # Flip (1,0) → (0,1) datasets -│ └── utils/ # Core shared utils -│ -└── ... - ## Configuration Summary (`config.yaml`) ### Model Settings From ff117a7bef50f5ffbad07ba26414e26795ddc67f Mon Sep 17 00:00:00 2001 From: sindchad Date: Fri, 5 Dec 2025 15:42:27 -0500 Subject: [PATCH 11/14] Fix pip-audit workflow configuration --- .github/workflows/code_checks.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index a10d07f..5ad415d 100644 --- a/.github/workflows/code_checks.yml +++ b/.github/workflows/code_checks.yml @@ -55,5 +55,4 @@ jobs: uses: pypa/gh-action-pip-audit@v1.1.0 with: virtual-environment: .venv - additional-args: "--ignore PYSEC-2024-161" - strict: false + ignore-vulns: "PYSEC-2024-161" From 968a82703d456f9a2c5b8c08afe28980ae75fa17 Mon Sep 17 00:00:00 2001 From: sindchad Date: Fri, 5 Dec 2025 15:49:14 -0500 Subject: [PATCH 12/14] Fix pip-audit workflow configuration --- .github/workflows/code_checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index 5ad415d..29333cd 100644 --- a/.github/workflows/code_checks.yml +++ b/.github/workflows/code_checks.yml @@ -55,4 +55,4 @@ jobs: uses: pypa/gh-action-pip-audit@v1.1.0 with: virtual-environment: .venv - ignore-vulns: "PYSEC-2024-161" + ignore-vulns: "PYSEC-2024-161, GHSA-gm62-xv2j-4w53, GHSA-2xpw-w6gg-jr37" From 698f51cd898cad77db59a2732bec85667a85dc4d Mon Sep 17 00:00:00 2001 From: sindchad Date: Fri, 5 Dec 2025 15:59:13 -0500 Subject: [PATCH 13/14] Fix pip-audit workflow configuration --- .github/workflows/code_checks.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index 29333cd..648712e 100644 --- a/.github/workflows/code_checks.yml +++ 
b/.github/workflows/code_checks.yml @@ -55,4 +55,7 @@ jobs: uses: pypa/gh-action-pip-audit@v1.1.0 with: virtual-environment: .venv - ignore-vulns: "PYSEC-2024-161, GHSA-gm62-xv2j-4w53, GHSA-2xpw-w6gg-jr37" + ignore-vulns: | + PYSEC-2024-161 + GHSA-gm62-xv2j-4w53 + GHSA-2xpw-w6gg-jr37 From 06bbb1b965f23185119bd4a9350011595f0f77b9 Mon Sep 17 00:00:00 2001 From: sindchad Date: Mon, 8 Dec 2025 13:21:50 -0500 Subject: [PATCH 14/14] removed unnecessary print statements --- .../data_construction/stage_7_final/data_final_train.py | 4 +--- src/aixpert/data_construction/utils/dpo_transform_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/aixpert/data_construction/stage_7_final/data_final_train.py b/src/aixpert/data_construction/stage_7_final/data_final_train.py index 7754d4d..f7b6cd2 100644 --- a/src/aixpert/data_construction/stage_7_final/data_final_train.py +++ b/src/aixpert/data_construction/stage_7_final/data_final_train.py @@ -38,7 +38,6 @@ def main() -> None: print(f"Loading → {input_path}") data = load_jsonl(input_path) - # Initialize buckets buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = { (0, 1): [], (1, 0): [], @@ -58,7 +57,6 @@ def main() -> None: final_rows: List[Dict[str, Any]] = [] - # Sampling logic for key, req_count in target_counts.items(): pool = buckets[key] available = len(pool) @@ -66,7 +64,7 @@ def main() -> None: print(f"\nBucket {key}: available={available}, required={req_count}") if available < req_count: - print("⚠️ Sampling WITH replacement.") + print("Sampling WITH replacement.") sampled = random.choices(pool, k=req_count) else: sampled = random.sample(pool, req_count) diff --git a/src/aixpert/data_construction/utils/dpo_transform_utils.py b/src/aixpert/data_construction/utils/dpo_transform_utils.py index fea34d9..98684f7 100644 --- a/src/aixpert/data_construction/utils/dpo_transform_utils.py +++ b/src/aixpert/data_construction/utils/dpo_transform_utils.py @@ -44,13 +44,13 @@ def 
transform_dataset(input_path: Path, output_path: Path) -> None: print(f"Loading → {input_path}") items = load_jsonl(input_path) - print(f"⚙️ Transforming {len(items)} items…") + print(f"Transforming {len(items)} items…") transformed = [process_item(it) for it in tqdm(items)] print(f"Saving → {output_path}") write_jsonl(output_path, transformed) print("\n=======================================") - print("✔ TRANSFORMATION COMPLETE") + print("TRANSFORMATION COMPLETE") print(f"Total items: {len(items)}") print("=======================================\n")