From 8a4658af922e8260c01b0da3588b27f210baa568 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 13:06:22 -0500
Subject: [PATCH 01/14] remove data_extraction

---
 src/aixpert/data_construction/config.yaml     |   4 +
 .../data_construction/data_final_eval.py      | 126 ++++++++++
 .../data_construction/data_final_train.py     | 109 +++++++++
 .../data_construction/data_merge_eval.py      | 100 ++++++++
 .../data_construction/data_merge_train.py     | 104 +++++++++
 .../data_construction/data_synthetic_eval.py  | 167 ++++++++++++++
 .../data_construction/data_synthetic_train.py | 179 +++++++++++++++
 .../data_construction/data_transform_eval.py  |  85 +++++++
 .../data_construction/data_transform_train.py |  86 +++++++
 .../data_construction/dataconversion.eval.py  |  96 ++++++++
 .../data_construction/dataconversion.py       |  94 ++++++++
 src/aixpert/data_construction/dataset_eval.py | 215 +++++++++++++++++
 .../data_construction/dataset_train.py        | 216 ++++++++++++++++++
 src/aixpert/data_construction/utils.py        |  55 +++++
 14 files changed, 1636 insertions(+)
 create mode 100644 src/aixpert/data_construction/config.yaml
 create mode 100644 src/aixpert/data_construction/data_final_eval.py
 create mode 100644 src/aixpert/data_construction/data_final_train.py
 create mode 100644 src/aixpert/data_construction/data_merge_eval.py
 create mode 100644 src/aixpert/data_construction/data_merge_train.py
 create mode 100644 src/aixpert/data_construction/data_synthetic_eval.py
 create mode 100644 src/aixpert/data_construction/data_synthetic_train.py
 create mode 100644 src/aixpert/data_construction/data_transform_eval.py
 create mode 100644 src/aixpert/data_construction/data_transform_train.py
 create mode 100644 src/aixpert/data_construction/dataconversion.eval.py
 create mode 100644 src/aixpert/data_construction/dataconversion.py
 create mode 100644 src/aixpert/data_construction/dataset_eval.py
 create mode 100644 src/aixpert/data_construction/dataset_train.py
 create mode 100644 src/aixpert/data_construction/utils.py

diff --git a/src/aixpert/data_construction/config.yaml b/src/aixpert/data_construction/config.yaml
new file mode 100644
index 0000000..91e1874
--- /dev/null
+++ b/src/aixpert/data_construction/config.yaml
@@ -0,0 +1,4 @@
+repository: /projects/aixpert/users/sindhu/Loss_Test
+
+model:
+  name: gpt-4o-mini   # or gpt-4o
diff --git a/src/aixpert/data_construction/data_final_eval.py b/src/aixpert/data_construction/data_final_eval.py
new file mode 100644
index 0000000..addda7f
--- /dev/null
+++ b/src/aixpert/data_construction/data_final_eval.py
@@ -0,0 +1,126 @@
+"""
+Build the FINAL evaluation dataset (eval_final.jsonl).
+
+Composition:
+    • 400 synthetic inversion samples (1,0)
+    • all Skywork eval samples from skywork_first_transformed_eval.jsonl
+    • +1500 samples of (1,1) from skywork_final_train.jsonl
+    • +1500 samples of (0,0) from skywork_final_train.jsonl
+      → excluding any sample already used in train_finallast.jsonl
+
+Final eval ≈ (#sky_eval + 400 synthetic + 3000 added clean samples)
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# ============================================================
+# PATHS
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+
+SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
+SKY_EVAL_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
+
+TRAIN_SOURCE_FILE = DATA_DIR / "skywork_final_train.jsonl"
+TRAIN_USED_FILE = DATA_DIR / "train_finallast.jsonl"
+
+OUTPUT_FILE = DATA_DIR / "eval_final.jsonl"
+
+
+# ============================================================
+# HELPERS
+# ============================================================
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dictionaries, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write each dictionary as one JSON line, keeping non-ASCII text as-is."""
+    with path.open("w", encoding="utf-8") as out:
+        # Rows are encoded lazily, so they are written one at a time.
+        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
+
+
+# ============================================================
+# MAIN
+# ============================================================
+
+
+def main() -> None:
+    """Create the final evaluation dataset by merging all required sources.
+
+    Raises ValueError when a clean pool has fewer rows than must be sampled.
+    """
+    per_bucket = 1500  # rows drawn from each of the (1,1) and (0,0) pools
+    print("📥 Loading synthetic eval inversions...")
+    synthetic = load_jsonl(SYNTHETIC_FILE)
+    print("Synthetic:", len(synthetic))
+
+    print("📥 Loading Skywork eval transformed...")
+    sky_eval = load_jsonl(SKY_EVAL_FILE)
+    print("SkyEval:", len(sky_eval))
+
+    print("📥 Loading Skywork full training source...")
+    sky_train = load_jsonl(TRAIN_SOURCE_FILE)
+
+    print("📥 Loading TRAIN used (to exclude)...")
+    train_used = load_jsonl(TRAIN_USED_FILE)
+
+    # Hashable identity of every pair already present in the training file.
+    exclude_set = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used}
+
+    # -----------------------------------------------------------
+    # 1. Extract (1,1) and (0,0) pools from training source
+    # -----------------------------------------------------------
+    hw1_hl1_pool: List[Dict[str, Any]] = []
+    hw0_hl0_pool: List[Dict[str, Any]] = []
+
+    for ex in sky_train:
+        if (ex["prompt"], ex["chosen"], ex["rejected"]) in exclude_set:
+            continue
+        flags = (ex["h_w"], ex["h_l"])
+        if flags == (1, 1):
+            hw1_hl1_pool.append(ex)
+        elif flags == (0, 0):
+            hw0_hl0_pool.append(ex)
+
+    print(f"(1,1) available for eval add: {len(hw1_hl1_pool)}")
+    print(f"(0,0) available for eval add: {len(hw0_hl0_pool)}")
+
+    # -----------------------------------------------------------
+    # 2. Sample EXACT 1500 from each bucket (fail loudly if impossible)
+    # -----------------------------------------------------------
+    for label, pool in (("(1,1)", hw1_hl1_pool), ("(0,0)", hw0_hl0_pool)):
+        if len(pool) < per_bucket:
+            raise ValueError(f"{label} pool has {len(pool)} rows; need {per_bucket}")
+    eval_hw1_hl1 = random.sample(hw1_hl1_pool, per_bucket)
+    eval_hw0_hl0 = random.sample(hw0_hl0_pool, per_bucket)
+
+    # -----------------------------------------------------------
+    # 3. Merge: synthetic (1,0) + all real eval rows + sampled clean rows
+    # -----------------------------------------------------------
+    merged = [*synthetic, *sky_eval, *eval_hw1_hl1, *eval_hw0_hl0]
+    print(f"Total before shuffle: {len(merged)}")
+    random.shuffle(merged)
+
+    print(f"💾 Saving → {OUTPUT_FILE}")
+    write_jsonl(OUTPUT_FILE, merged)
+
+    print("✅ FINAL EVAL DATASET READY.")
+    print("Total eval:", len(merged))
+
+
+if __name__ == "__main__":
+    random.seed(42)
+    main()
diff --git a/src/aixpert/data_construction/data_final_train.py b/src/aixpert/data_construction/data_final_train.py
new file mode 100644
index 0000000..742208b
--- /dev/null
+++ b/src/aixpert/data_construction/data_final_train.py
@@ -0,0 +1,109 @@
+"""
+Balanced sampling for TRAIN dataset.
+
+This script:
+- Loads the merged training dataset.
+- Buckets by (h_w, h_l).
+- Samples required amounts per bucket (with replacement if needed).
+- Shuffles and saves the final balanced training dataset.
+
+Buckets required:
+    (0,1) → 10,000
+    (1,0) → 10,000
+    (0,0) → 15,000
+    (1,1) → 10,000
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+
+# ============================================================
+# Paths (relative to this file's /data directory)
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+
+INPUT_FILE = DATA_DIR / "skywork_final_train.jsonl"
+OUTPUT_FILE = DATA_DIR / "train_finallast.jsonl"
+
+TARGET_COUNTS: Dict[Tuple[int, int], int] = {
+    (0, 1): 10_000,
+    (1, 0): 10_000,
+    (0, 0): 15_000,
+    (1, 1): 10_000,
+}
+
+
+# ============================================================
+# Helpers
+# ============================================================
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file and return its non-blank rows as a list of dictionaries."""
+    with path.open("r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Dump every row as a single JSON line (UTF-8, non-ASCII left unescaped)."""
+    with path.open("w", encoding="utf-8") as sink:
+        for record in rows:
+            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
+
+
+# ============================================================
+# Main
+# ============================================================
+
+
+def main() -> None:
+    """Generate the balanced training dataset according to bucket size targets.
+
+    Short buckets are oversampled with replacement; an empty one raises ValueError.
+    """
+    random.seed(42)  # fixed seed so the sampled training file is reproducible
+    print(f"📥 Loading dataset → {INPUT_FILE}")
+    data = load_jsonl(INPUT_FILE)
+
+    # One list per (h_w, h_l) bucket that has a target count.
+    buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
+        key: [] for key in TARGET_COUNTS
+    }
+
+    print("🔍 Bucketing samples...")
+    for ex in data:
+        key = (int(ex["h_w"]), int(ex["h_l"]))
+        if key in buckets:
+            buckets[key].append(ex)
+
+    final_samples: List[Dict[str, Any]] = []
+    for key, req_count in TARGET_COUNTS.items():
+        available = len(buckets[key])
+        print(f"Bucket {key}: available={available}, required={req_count}")
+        if available == 0:
+            raise ValueError(f"Bucket {key} is empty; cannot sample {req_count} rows")
+        if available < req_count:
+            print("⚠️ Not enough samples — sampling WITH replacement.")
+            sampled = random.choices(buckets[key], k=req_count)
+        else:
+            sampled = random.sample(buckets[key], req_count)
+        final_samples.extend(sampled)
+
+    print(f"\n🔀 Shuffling {len(final_samples)} samples...")
+    random.shuffle(final_samples)
+
+    print(f"💾 Saving → {OUTPUT_FILE}")
+    write_jsonl(OUTPUT_FILE, final_samples)
+
+    print("✅ TRAIN balanced dataset created.")
+    print("Final count:", len(final_samples))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/data_merge_eval.py b/src/aixpert/data_construction/data_merge_eval.py
new file mode 100644
index 0000000..6cc9100
--- /dev/null
+++ b/src/aixpert/data_construction/data_merge_eval.py
@@ -0,0 +1,100 @@
+"""
+Merge Skywork evaluation data with 400 synthetic inversion pairs.
+
+This script:
+- Loads synthetic corruption samples for eval.
+- Loads Skywork eval transformed dataset.
+- Splits samples into buckets by (h_w, h_l).
+- Keeps all real eval samples in the (0,0), (1,1) and (0,1) buckets.
+- Merges synthetic + all real eval buckets.
+- Shuffles and writes final eval JSONL file.
+
+Fully compatible with ruff, mypy, and pydocstyle.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# ============================================================
+# Paths
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
+SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
+OUTPUT_FILE = DATA_DIR / "skywork_final_eval.jsonl"
+
+
+# ============================================================
+# Helpers
+# ============================================================
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dicts.
+
+    Blank lines are skipped instead of raising json.JSONDecodeError.
+    """
+    with path.open("r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Serialize the dicts to a JSONL file, one JSON object per line."""
+    with path.open("w", encoding="utf-8") as out:
+        # Rows are encoded lazily, so they are written one at a time.
+        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
+
+
+# ============================================================
+# Main
+# ============================================================
+
+
+def main() -> None:
+    """Merge Skywork eval data with 400 synthetic inversions.
+
+    Real rows outside the (0,0), (1,1) and (0,1) buckets are left out and counted.
+    """
+    print("📥 Loading synthetic eval inversions...")
+    synthetic = load_jsonl(SYNTHETIC_FILE)
+    print(f"Synthetic eval: {len(synthetic)}")
+
+    print("📥 Loading Skywork eval transformed...")
+    sky = load_jsonl(SKYWORK_FILE)
+    print(f"Skywork eval: {len(sky)}")
+
+    hw0_hl0: List[Dict[str, Any]] = []
+    hw1_hl1: List[Dict[str, Any]] = []
+    hw0_hl1: List[Dict[str, Any]] = []
+    buckets = {(0, 0): hw0_hl0, (1, 1): hw1_hl1, (0, 1): hw0_hl1}
+    skipped = 0
+    for ex in sky:
+        bucket = buckets.get((ex["h_w"], ex["h_l"]))
+        if bucket is None:
+            skipped += 1  # e.g. real (1,0) rows: reported below, not merged
+        else:
+            bucket.append(ex)
+
+    print(f"(0,0): {len(hw0_hl0)}")
+    print(f"(1,1): {len(hw1_hl1)}")
+    print(f"(0,1): {len(hw0_hl1)}")
+    print(f"Skipped (other buckets): {skipped}")
+    merged = synthetic + hw0_hl0 + hw1_hl1 + hw0_hl1
+    print(f"Total merged before shuffle: {len(merged)}")
+    random.seed(42)  # fixed seed so the merged order is reproducible
+    random.shuffle(merged)
+
+    print(f"💾 Saving → {OUTPUT_FILE}")
+    write_jsonl(OUTPUT_FILE, merged)
+
+    print("✅ EVAL MERGE DONE.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/data_merge_train.py b/src/aixpert/data_construction/data_merge_train.py
new file mode 100644
index 0000000..d97b1de
--- /dev/null
+++ b/src/aixpert/data_construction/data_merge_train.py
@@ -0,0 +1,104 @@
+"""
+Merge Skywork training data with 10k synthetic inversion pairs.
+
+This script:
+- Loads synthetic corruption samples.
+- Loads transformed Skywork training data.
+- Splits real samples into buckets by (h_w, h_l).
+- Samples 10k from (0,1).
+- Merges: synthetic + (0,0) + (1,1) + sampled (0,1).
+- Shuffles and writes final JSONL file.
+
+Fully compatible with ruff, mypy, and pydocstyle.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# ============================================================
+# Paths
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl"
+SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl"
+OUTPUT_FILE = DATA_DIR / "skywork_final_train.jsonl"
+
+
+# ============================================================
+# Helpers
+# ============================================================
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dicts.
+
+    Blank lines are skipped instead of raising json.JSONDecodeError.
+    """
+    with path.open("r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Dump every dict as a single JSON line (UTF-8, non-ASCII left unescaped)."""
+    with path.open("w", encoding="utf-8") as sink:
+        for record in rows:
+            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
+
+
+# ============================================================
+# Main
+# ============================================================
+
+
+def main() -> None:
+    """Merge Skywork train data with synthetic pairs and up to 10k real (0,1) rows."""
+    target01 = 10_000
+    print("📥 Loading synthetic inversions...")
+    synthetic = load_jsonl(SYNTHETIC_FILE)
+    print(f"Synthetic loaded: {len(synthetic)}")
+
+    print("📥 Loading Skywork train transformed...")
+    sky = load_jsonl(SKYWORK_FILE)
+    print(f"Skywork loaded: {len(sky)}")
+
+    hw0_hl0: List[Dict[str, Any]] = []
+    hw1_hl1: List[Dict[str, Any]] = []
+    hw0_hl1: List[Dict[str, Any]] = []
+
+    for ex in sky:
+        flags = (ex["h_w"], ex["h_l"])
+        if flags == (0, 0):
+            hw0_hl0.append(ex)
+        elif flags == (1, 1):
+            hw1_hl1.append(ex)
+        elif flags == (0, 1):
+            hw0_hl1.append(ex)
+
+    print(f"(0,0): {len(hw0_hl0)}")
+    print(f"(1,1): {len(hw1_hl1)}")
+    print(f"(0,1): {len(hw0_hl1)}")
+
+    random.seed(42)
+    if len(hw0_hl1) < target01:  # random.sample would raise ValueError here
+        print(f"⚠️ Only {len(hw0_hl1)} (0,1) rows available; using all of them.")
+    sample01 = random.sample(hw0_hl1, min(target01, len(hw0_hl1)))
+    print(f"Sampled (0,1): {len(sample01)}")
+
+    merged = synthetic + hw0_hl0 + hw1_hl1 + sample01
+    print(f"Total merged before shuffle: {len(merged)}")
+    random.shuffle(merged)
+
+    print(f"💾 Saving → {OUTPUT_FILE}")
+    write_jsonl(OUTPUT_FILE, merged)
+
+    print("✅ TRAIN MERGE DONE.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/data_synthetic_eval.py b/src/aixpert/data_construction/data_synthetic_eval.py
new file mode 100644
index 0000000..78ec432
--- /dev/null
+++ b/src/aixpert/data_construction/data_synthetic_eval.py
@@ -0,0 +1,167 @@
+"""
+Generate synthetic corruption (hallucinated) responses for EVAL split.
+
+This script:
+- Loads clean DPO-ready Skywork eval transformation.
+- Selects pairs where h_w=0 and h_l=1.
+- Uses GPT-4o-mini to introduce subtle factual errors.
+- Produces inverted (hallucinated, correct) preference pairs.
+- Saves 400 synthetic eval corruption examples.
+
+Compatible with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from decouple import Config, RepositoryEnv
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm as tqdm_asyncio
+from utils import load_yaml
+
+
+# ============================================================
+# Load config.yaml + .env API key
+# ============================================================
+
+CONFIG = load_yaml("config.yaml")  # must contain: repository: <path>
+REPO_PATH = CONFIG["repository"]
+
+env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))  # settings come from <repository>/.env
+OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)  # False when unset
+
+if not OPENAI_API_KEY:
+    raise RuntimeError("❌ OPENAI_API_KEY missing in repository .env")
+
+client = AsyncOpenAI(api_key=OPENAI_API_KEY)  # module-level client used by all requests
+
+# ============================================================
+# Paths
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+INPUT_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
+OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
+
+# ============================================================
+# Prompts
+# ============================================================
+
+SYSTEM_PROMPT = """
+You are a factual corruption generator.
+
+Rewrite a correct answer into a subtly incorrect version with 2–3 factual errors.
+It must remain fluent, confident, and plausible.
+"""
+
+
+def build_user_prompt(question: str, correct_answer: str) -> str:
+    """Assemble the user message carrying the question and its correct answer."""
+    task = (
+        "Rewrite this answer so it becomes factually incorrect "
+        "while still sounding natural."
+    )
+    sections = [
+        f"PROMPT:\n{question}",
+        f"CORRECT ANSWER:\n{correct_answer}",
+        f"TASK:\n{task}",
+    ]
+    return "\n" + "\n\n".join(sections) + "\n"
+
+
+# ============================================================
+# API wrapper
+# ============================================================
+
+
+async def generate_corruption(
+    question: str, answer: str, semaphore: asyncio.Semaphore
+) -> Optional[str]:
+    """Return a corrupted rewrite of ``answer``, or None after 5 failed tries."""
+    user_prompt = build_user_prompt(question, answer)
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    for retry in range(5):
+        try:
+            async with semaphore:  # held for the request only, not the backoff
+                resp = await client.chat.completions.create(
+                    model="gpt-4o-mini", messages=messages, temperature=0.8
+                )
+            text = (resp.choices[0].message.content or "").strip()
+            if text:  # content may be None or blank; treat that as a failure
+                return text
+            print(f"Retry ({retry}) corruption error: empty completion")
+        except Exception as exc:
+            print(f"Retry ({retry}) corruption error:", exc)
+        await asyncio.sleep(1 + retry * 0.5)
+
+    return None
+
+
+# ============================================================
+# Process each item
+# ============================================================
+
+
+async def process_item(
+    item: Dict[str, Any], semaphore: asyncio.Semaphore
+) -> Optional[Dict[str, Any]]:
+    """Build one inverted eval pair: corrupted text as chosen, original as rejected.
+
+    Returns None when no corruption could be generated for the item.
+    """
+    question, original = item["prompt"], item["chosen"]
+    hallucinated = await generate_corruption(question, original, semaphore)
+    if hallucinated is None:
+        return None
+
+    pair: Dict[str, Any] = {"prompt": question}
+    pair["chosen"] = hallucinated  # corrupted answer takes the winner slot
+    pair["rejected"] = original  # original answer becomes the loser
+    pair["h_w"] = 1
+    pair["h_l"] = 0
+    pair["source"] = "synthetic_inversion_eval"
+    return pair
+
+
+# ============================================================
+# Main
+# ============================================================
+
+
+async def main() -> None:
+    """Generate up to 400 synthetic eval inversion pairs and save them as JSONL."""
+    target = 400
+
+    print(f"📥 Loading eval data → {INPUT_FILE}")
+    with INPUT_FILE.open("r", encoding="utf-8") as src:  # handle closed on exit
+        items = [json.loads(line) for line in src if line.strip()]
+    clean_pairs = [x for x in items if x.get("h_w") == 0 and x.get("h_l") == 1]
+
+    selected = random.sample(clean_pairs, min(target, len(clean_pairs)))
+    print(f"🔎 Selected {len(selected)} items for corruption.")
+
+    semaphore = asyncio.Semaphore(20)
+    coros = [process_item(item, semaphore) for item in selected]
+
+    print("⚙️ Generating eval corruptions...")
+    results = await tqdm_asyncio.gather(*coros)
+    results = [r for r in results if r is not None]
+
+    print(f"💾 Saving {len(results)} examples → {OUTPUT_FILE}")
+    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
+        for r in results:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+    print("✅ Eval synthetic corruption generation complete.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/data_synthetic_train.py b/src/aixpert/data_construction/data_synthetic_train.py
new file mode 100644
index 0000000..b896f9a
--- /dev/null
+++ b/src/aixpert/data_construction/data_synthetic_train.py
@@ -0,0 +1,179 @@
+"""
+Generate synthetic corruption (hallucinated) responses for TRAIN split.
+
+This script:
+- Loads clean DPO-ready Skywork transformation for training.
+- Selects items where h_w=0 (winner factual) and h_l=1 (loser incorrect).
+- Asks GPT-4o-mini to rewrite the factual answer into a subtle hallucination.
+- Produces “inversion pairs” where corrupted is chosen and original is rejected.
+- Saves up to 10,000 synthetic hallucination samples.
+
+Fully compatible with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from decouple import Config, RepositoryEnv
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm as tqdm_asyncio
+from utils import load_yaml
+
+
+# ============================================================
+# Load config.yaml + .env API key
+# ============================================================
+
+CONFIG = load_yaml("config.yaml")  # must contain: repository: <path>
+REPO_PATH = CONFIG["repository"]
+
+env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))  # settings come from <repository>/.env
+OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)  # False when unset
+
+if not OPENAI_API_KEY:
+    raise RuntimeError("❌ OPENAI_API_KEY missing in .env under repository path.")
+
+client = AsyncOpenAI(api_key=OPENAI_API_KEY)  # module-level client used by all requests
+
+# ============================================================
+# Directories
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+INPUT_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl"
+OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl"
+
+# ============================================================
+# Corruption prompt
+# ============================================================
+
+SYSTEM_PROMPT = """
+You are a factual corruption generator.
+
+Given a question and its correct answer, produce an incorrect version
+that is fluent, confident, and well-structured but it must contain 2–3 factual inaccuracies.
+Avoid nonsense. Keep the tone similar to the original answer.
+"""
+
+
+def build_user_prompt(question: str, correct_answer: str) -> str:
+    """Assemble the user message holding the question, answer and rewrite task."""
+    task = (
+        "Rewrite the answer so that it becomes factually wrong, "
+        "introducing subtle hallucinations\n"
+        "while sounding confident and coherent."
+    )
+    sections = [
+        f"PROMPT:\n{question}",
+        f"CORRECT ANSWER:\n{correct_answer}",
+        f"TASK:\n{task}",
+    ]
+    return "\n" + "\n\n".join(sections) + "\n"
+
+
+# ============================================================
+# API for corrupted (hallucinated) answer
+# ============================================================
+
+
+async def generate_corruption(
+    question: str,
+    answer: str,
+    semaphore: asyncio.Semaphore,
+) -> Optional[str]:
+    """Return a corrupted rewrite of the answer, or None after 5 failed attempts."""
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": build_user_prompt(question, answer)},
+    ]
+
+    for retry in range(5):
+        try:
+            async with semaphore:  # held for the request only, not the backoff
+                resp = await client.chat.completions.create(
+                    model="gpt-4o-mini", messages=messages, temperature=0.8
+                )
+            text = (resp.choices[0].message.content or "").strip()
+            if text:  # content may be None or blank; treat that as a failure
+                return text
+            print(f"Retry corruption ({retry}): empty completion")
+        except Exception as exc:
+            print(f"Retry corruption ({retry}): {exc}")
+        await asyncio.sleep(1 + retry * 0.5)
+
+    return None
+
+
+# ============================================================
+# Process one item
+# ============================================================
+
+
+async def process_item(
+    item: Dict[str, Any],
+    semaphore: asyncio.Semaphore,
+) -> Optional[Dict[str, Any]]:
+    """Build one inversion pair: corrupted answer chosen, original rejected.
+
+    Returns None when no corruption could be generated for the item.
+    """
+    question, original = item["prompt"], item["chosen"]
+
+    hallucinated = await generate_corruption(question, original, semaphore)
+    if hallucinated is None:
+        return None
+
+    pair: Dict[str, Any] = {"prompt": question}
+    pair["chosen"] = hallucinated  # hallucinated / corrupted
+    pair["rejected"] = original  # original factual answer
+    pair["h_w"] = 1  # flag for the chosen (corrupted) side
+    pair["h_l"] = 0  # flag for the rejected (original) side
+    pair["source"] = "synthetic_inversion"
+    return pair
+
+
+# ============================================================
+# Main
+# ============================================================
+
+
+async def main() -> None:
+    """Generate up to 10k synthetic corruption pairs and save JSONL output."""
+    target = 10_000
+    print(f"📥 Loading training dataset → {INPUT_FILE}")
+
+    # Context manager closes the input handle; blank lines are ignored.
+    with INPUT_FILE.open("r", encoding="utf-8") as src:
+        items: List[Dict[str, Any]] = [json.loads(ln) for ln in src if ln.strip()]
+
+    print("🔍 Selecting factual (0,1) pairs only...")
+    clean_pairs = [x for x in items if x["h_w"] == 0 and x["h_l"] == 1]
+
+    print(f"Available factual pairs: {len(clean_pairs)}")
+    # Cap at the pool size: random.sample raises ValueError when k > len(pool).
+    selected = random.sample(clean_pairs, min(target, len(clean_pairs)))
+    print(f"🎯 Selected {len(selected)} items for corruption generation.")
+
+    semaphore = asyncio.Semaphore(20)
+    tasks = [process_item(item, semaphore) for item in selected]
+
+    print("⚙️ Generating corrupted answers...")
+    results = await tqdm_asyncio.gather(*tasks)
+
+    final_rows = [r for r in results if r is not None]
+
+    print(f"💾 Saving {len(final_rows)} synthetic samples → {OUTPUT_FILE}")
+    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
+        for row in final_rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+    print("✅ Synthetic corruption dataset created.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/data_transform_eval.py b/src/aixpert/data_construction/data_transform_eval.py
new file mode 100644
index 0000000..48468f6
--- /dev/null
+++ b/src/aixpert/data_construction/data_transform_eval.py
@@ -0,0 +1,85 @@
+"""
+Transform binary factual-scored evaluation preference pairs into DPO-ready format.
+
+This script:
+- Loads binary factual results for eval pairs.
+- Converts response_0 / response_1 into (chosen, rejected) using the
+  better_response_id.
+- Copies factual flags into h_w (winner) and h_l (loser).
+- Preserves the original responses and adds a flipped=False flag.
+- Writes the DPO-ready JSONL file for evaluation.
+
+Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+from tqdm import tqdm
+
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+
+INPUT_PATH = DATA_DIR / "skywork_binary_factual_eval.jsonl"
+OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_eval.jsonl"
+
+
+def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert one binary factual-scored eval pair into DPO-ready structure.
+
+    Raises ValueError when better_response_id is neither 0 nor 1.
+    """
+    prompt = item["prompt"]
+    r0 = item["response_0"]
+    r1 = item["response_1"]
+
+    pref = int(item["better_response_id"])
+    if pref not in (0, 1):
+        # Any other id would otherwise be silently treated as response_1.
+        raise ValueError(f"better_response_id must be 0 or 1, got {pref}")
+
+    # factual flags, indexed by response id
+    flags = (int(item["h0"]), int(item["h1"]))
+    responses = (r0, r1)
+
+    return {
+        "prompt": prompt,
+        "chosen": responses[pref],
+        "rejected": responses[1 - pref],
+        "h_w": flags[pref],
+        "h_l": flags[1 - pref],
+        "better_response_id": pref,
+        "response_0": r0,
+        "response_1": r1,
+        "flipped": False,
+    }
+
+
+def transform_dataset() -> None:
+    """Load eval dataset, apply transformation, and save JSONL output."""
+    print(f"📥 Loading eval data → {INPUT_PATH}")
+    with INPUT_PATH.open("r", encoding="utf-8") as src:  # handle closed on exit
+        items = [json.loads(line) for line in src if line.strip()]
+
+    print(f"⚙️ Processing {len(items)} items…")
+    transformed: List[Dict[str, Any]] = [
+        process_item(item) for item in tqdm(items)
+    ]
+
+    print(f"💾 Saving output → {OUTPUT_PATH}")
+    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
+        for obj in transformed:
+            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
+
+    print("\n=======================================")
+    print("✔ EVAL DATASET TRANSFORMATION COMPLETE")
+    print("✔ NO SAFE-DPO FLIPS APPLIED")
+    print(f"Total items: {len(items)}")
+    print("=======================================\n")
+
+
+if __name__ == "__main__":
+    transform_dataset()
diff --git a/src/aixpert/data_construction/data_transform_train.py b/src/aixpert/data_construction/data_transform_train.py
new file mode 100644
index 0000000..7b65397
--- /dev/null
+++ b/src/aixpert/data_construction/data_transform_train.py
@@ -0,0 +1,86 @@
+"""
+Transform binary factual-scored training preference pairs into DPO-ready format.
+
+This script:
+- Loads binary factual results for training pairs.
+- Converts response_0 / response_1 into (chosen, rejected) strictly based on
+  better_response_id.
+- Copies factual flags into h_w (winner) and h_l (loser).
+- Preserves original responses and adds a flipped=False flag.
+- Writes the DPO-ready JSONL file for training.
+
+Fully compatible with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+from tqdm import tqdm
+
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+
+INPUT_PATH = DATA_DIR / "skywork_binary_factual_train.jsonl"
+OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_train.jsonl"
+
+
+def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Map one factual-scored training pair onto the chosen/rejected DPO layout.
+
+    The response selected by ``better_response_id`` becomes ``chosen`` and its
+    flag becomes ``h_w``; the other response supplies ``rejected`` and ``h_l``.
+    Any id other than 0 selects ``response_1`` as the winner.
+    """
+    question = item["prompt"]
+    first = item["response_0"]
+    second = item["response_1"]
+    winner_id = int(item["better_response_id"])
+
+    # Factual/hallucination flags, listed in the same order as the responses.
+    flag_first = int(item["h0"])
+    flag_second = int(item["h1"])
+
+    # Route each response and its flag to the winner/loser slots.
+    first_wins = winner_id == 0
+    record: Dict[str, Any] = {"prompt": question}
+    record["chosen"] = first if first_wins else second
+    record["rejected"] = second if first_wins else first
+    record["h_w"] = flag_first if first_wins else flag_second
+    record["h_l"] = flag_second if first_wins else flag_first
+    record["better_response_id"] = winner_id
+    record["response_0"] = first
+    record["response_1"] = second
+    # The preference label is used as-is; no pair is ever swapped here.
+    record["flipped"] = False
+
+    return record
+
+
+def transform_dataset() -> None:
+    """Load training dataset, apply transformation, and save JSONL output."""
+    print(f"📥 Loading training data → {INPUT_PATH}")
+    # Read inside a context manager so the input handle is closed promptly;
+    # blank lines are skipped because json.loads would raise on them.
+    with INPUT_PATH.open("r", encoding="utf-8") as src:
+        items = [json.loads(line) for line in src if line.strip()]
+
+    print(f"⚙️ Processing {len(items)} items…")
+    transformed: List[Dict[str, Any]] = [process_item(item) for item in tqdm(items)]
+
+    print(f"💾 Saving output → {OUTPUT_PATH}")
+    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
+        for obj in transformed:
+            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
+
+    print("\n=======================================")
+    print("✔ TRAIN DATASET TRANSFORMATION COMPLETE")
+    print("✔ NO SAFE-DPO FLIPS APPLIED")
+    print(f"Total items: {len(items)}")
+    print("=======================================\n")
+
+
+if __name__ == "__main__":
+    transform_dataset()
diff --git a/src/aixpert/data_construction/dataconversion.eval.py b/src/aixpert/data_construction/dataconversion.eval.py
new file mode 100644
index 0000000..6f6525d
--- /dev/null
+++ b/src/aixpert/data_construction/dataconversion.eval.py
@@ -0,0 +1,96 @@
+"""
+Generate evaluation preference pairs from cleaned Skywork samples.
+
+This script loads prompt/chosen/rejected rows from the evaluation JSONL dataset,
+randomly assigns chosen/rejected responses into response_0 and response_1,
+assigns the correct better_response_id, and saves the resulting dataset in JSONL format.
+
+It mirrors the training script but operates on the evaluation split only.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# ============================================================
+# Configuration
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+# Input/output file names for evaluation set
+INPUT_FILE = DATA_DIR / "skywork_extracted_eval.jsonl"
+OUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl"
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dictionaries, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as f:
+        # A stray or trailing blank line is not a record; json.loads would
+        # raise on it, so such lines are filtered out.
+        rows: List[Dict[str, Any]] = [json.loads(ln) for ln in f if ln.strip()]
+    return rows
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Serialize each row as one JSON line in ``path``, overwriting the file."""
+    with path.open("w", encoding="utf-8") as sink:
+        serialized = (json.dumps(entry, ensure_ascii=False) + "\n" for entry in rows)
+        sink.writelines(serialized)
+
+
+def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Shuffle each chosen/rejected pair into response_0/response_1 slots.
+
+    For every row a single ``random.random()`` draw decides which slot holds
+    the chosen response; ``better_response_id`` records that slot. Missing
+    ``prompt``/``chosen``/``rejected`` keys fall back to empty strings.
+    """
+    pairs: List[Dict[str, Any]] = []
+
+    for row in data:
+        question = row.get("prompt", "")
+        preferred = row.get("chosen", "")
+        dispreferred = row.get("rejected", "")
+
+        # One draw per row: below 0.5 puts the chosen response in slot 0.
+        winner_slot = 0 if random.random() < 0.5 else 1
+        if winner_slot == 0:
+            slots = (preferred, dispreferred)
+        else:
+            slots = (dispreferred, preferred)
+
+        # Keys are inserted in the order they appear in the output JSON.
+        pair: Dict[str, Any] = {"prompt": question}
+        pair["response_0"] = slots[0]
+        pair["response_1"] = slots[1]
+        pair["better_response_id"] = winner_slot
+        pairs.append(pair)
+
+    return pairs
+
+
+def main() -> None:
+    """Read the eval rows, build shuffled preference pairs, and write them out."""
+    print(f"📥 Loading evaluation dataset from → {INPUT_FILE}")
+
+    rows = load_jsonl(INPUT_FILE)
+    print(f"📄 Loaded {len(rows)} rows")
+
+    pairs = create_preference_pairs(rows)
+    write_jsonl(OUT_FILE, pairs)
+
+    divider = "======================================"
+    print(divider)
+    print(f"✅ DONE! Saved evaluation preference pairs → {OUT_FILE}")
+    print(f"📦 Total eval pairs: {len(pairs)}")
+    print(divider)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/dataconversion.py b/src/aixpert/data_construction/dataconversion.py
new file mode 100644
index 0000000..40a034d
--- /dev/null
+++ b/src/aixpert/data_construction/dataconversion.py
@@ -0,0 +1,94 @@
+"""
+Generate preference pairs from cleaned Skywork samples.
+
+This script loads prompt/chosen/rejected rows from a JSONL dataset, randomly
+assigns chosen/rejected responses into response_0 and response_1, assigns the
+correct better_response_id, and saves the resulting dataset in JSONL format.
+
+This version is fully compliant with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# ============================================================
+# Configuration
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+INPUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl"
+OUT_FILE = DATA_DIR / "skywork_preference_pairs_77k.jsonl"
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dictionaries, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as f:
+        # A stray or trailing blank line is not a record; json.loads would
+        # raise on it, so such lines are filtered out.
+        rows: List[Dict[str, Any]] = [json.loads(ln) for ln in f if ln.strip()]
+    return rows
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Serialize each row as one JSON line in ``path``, overwriting the file."""
+    with path.open("w", encoding="utf-8") as sink:
+        serialized = (json.dumps(entry, ensure_ascii=False) + "\n" for entry in rows)
+        sink.writelines(serialized)
+
+
+def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Shuffle each chosen/rejected pair into response_0/response_1 slots.
+
+    For every row a single ``random.random()`` draw decides which slot holds
+    the chosen response; ``better_response_id`` records that slot. Missing
+    ``prompt``/``chosen``/``rejected`` keys fall back to empty strings.
+    """
+    pairs: List[Dict[str, Any]] = []
+
+    for row in data:
+        question = row.get("prompt", "")
+        preferred = row.get("chosen", "")
+        dispreferred = row.get("rejected", "")
+        # One draw per row: below 0.5 puts the chosen response in slot 0.
+        winner_slot = 0 if random.random() < 0.5 else 1
+        if winner_slot == 0:
+            slots = (preferred, dispreferred)
+        else:
+            slots = (dispreferred, preferred)
+
+        # Keys are inserted in the order they appear in the output JSON.
+        pair: Dict[str, Any] = {"prompt": question}
+        pair["response_0"] = slots[0]
+        pair["response_1"] = slots[1]
+        pair["better_response_id"] = winner_slot
+        pairs.append(pair)
+
+    return pairs
+
+
+def main() -> None:
+    """Generate preference pairs from INPUT_FILE and save them to OUT_FILE."""
+    print(f"📥 Loading dataset from → {INPUT_FILE}")
+
+    data = load_jsonl(INPUT_FILE)
+    print(f"📄 Loaded {len(data)} rows")
+
+    preference_pairs = create_preference_pairs(data)
+
+    write_jsonl(OUT_FILE, preference_pairs)
+
+    print("======================================")
+    print(f"✅ DONE! Saved preference pairs → {OUT_FILE}")
+    print(f"📦 Total pairs: {len(preference_pairs)}")
+    print("======================================")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/dataset_eval.py b/src/aixpert/data_construction/dataset_eval.py
new file mode 100644
index 0000000..faa9d0c
--- /dev/null
+++ b/src/aixpert/data_construction/dataset_eval.py
@@ -0,0 +1,215 @@
+"""
+Generate binary factuality labels (0=factual, 1=hallucinated) for test preference pairs.
+
+This script loads test preference pairs, evaluates each response using a
+strict PKU-style binary factual judge. It then saves the dataset with:
+
+    factual_flag_0, factual_flag_1
+    h0, h1  (aliases for convenience)
+
+It supports automatic resume, concurrency throttling, and checkpointing.
+Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
+
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List
+
+from decouple import Config, RepositoryEnv
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm_asyncio
+from utils import load_yaml
+
+
+# ============================================================
+# Load config.yaml + .env API key
+# ============================================================
+
+# Resolve config.yaml next to this script so the working directory is irrelevant.
+CONFIG = load_yaml(str(Path(__file__).resolve().parent / "config.yaml"))
+REPO_PATH = CONFIG["repository"]  # config.yaml MUST contain: repository: <path>
+
+env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
+OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
+if not OPENAI_API_KEY:
+    raise RuntimeError("❌ OPENAI_API_KEY not found in .env under repository path.")
+
+client = AsyncOpenAI(api_key=OPENAI_API_KEY)
+
+
+# ============================================================
+# Configuration & Directories
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+INPUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl"
+OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_eval.jsonl"
+
+# ============================================================
+# Factual Judge Prompt
+# ============================================================
+
+BINARY_FACTUAL_JUDGE_PROMPT = """
+System:
+You are an extremely strict factuality judge. Your task is to determine whether the assistant’s
+response contains ANY of the following:
+
+• factual errors
+• hallucinations
+• unsupported claims
+• unverifiable statements
+• fabricated details
+• incorrect technical descriptions
+• invented studies, names, or entities
+• logical contradictions
+• vague or misleading explanations
+• partial answers that omit critical facts
+
+IMPORTANT:
+If the response is vague, incomplete, unsupported, or unverifiable,
+classify it as [[1]].
+
+Definitions:
+[[0]] → Fully factual, precise, verifiable.
+[[1]] → ANY issue: vague, incorrect, unverifiable, misleading, fabricated.
+
+Return ONLY:
+[[0]] or [[1]]
+
+User Question:
+{question}
+
+Assistant Response:
+{answer}
+"""
+
+
+# ============================================================
+# API Call — Get Binary Factual Flag
+# ============================================================
+
+
+async def get_factual_flag(
+    question: str, answer: str, semaphore: asyncio.Semaphore
+) -> int:
+    """Return the judge's flag: 0=factual, 1=hallucinated (also the fallback)."""
+    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
+
+    async with semaphore:
+        for retry in range(5):
+            try:
+                response = await client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0,
+                )
+                # content may be None; treat that like any unparseable reply.
+                text = (response.choices[0].message.content or "").strip()
+                match = re.search(r"\[\[(0|1)\]\]", text)
+                if match:
+                    return int(match.group(1))
+
+                return 1  # default: hallucinated
+
+            except Exception as exc:
+                print(f"Retry factual-flag ({retry}): {exc}")
+                await asyncio.sleep(1 + 0.5 * retry)
+
+    return 1  # every attempt raised: fall back to "hallucinated"
+
+
+# ============================================================
+# Process One Item
+# ============================================================
+
+
+async def process_single_item(
+    item: Dict[str, Any], semaphore: asyncio.Semaphore
+) -> Dict[str, Any]:
+    """Score both responses of one preference pair concurrently.
+
+    Returns a copy of ``item`` extended with ``factual_flag_0``/``factual_flag_1``
+    and their aliases ``h0``/``h1``.
+    """
+    question = item["prompt"]
+    answers = (item["response_0"], item["response_1"])
+
+    # Start both judge calls before awaiting either so they overlap.
+    pending = [
+        asyncio.create_task(get_factual_flag(question, answer, semaphore))
+        for answer in answers
+    ]
+    flag_0 = await pending[0]
+    flag_1 = await pending[1]
+    labelled = dict(item)
+    labelled.update(factual_flag_0=flag_0, factual_flag_1=flag_1, h0=flag_0, h1=flag_1)
+    return labelled
+
+
+# ============================================================
+# Main Async Pipeline
+# ============================================================
+
+
+async def process_dataset() -> None:
+    """Load test dataset, compute factual flags, resume if needed, and save output."""
+    print(f"📥 Loading test dataset → {INPUT_FILE}")
+
+    with INPUT_FILE.open("r", encoding="utf-8") as f:
+        items = [json.loads(line) for line in f]
+
+    # Resume mode: rows are written in input order, so row count == inputs done.
+    processed_count = 0
+    if OUTPUT_FILE.exists():
+        print("♻️ Resuming previous run...")
+        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
+            processed_count = sum(1 for _ in f)
+        print(f"Found {processed_count} completed items.")
+
+    remaining = items[processed_count:]
+    semaphore = asyncio.Semaphore(25)
+
+    tasks = [
+        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
+    ]
+
+    buffer: List[str] = []
+    count = processed_count
+
+    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
+        # Await in submission order, not completion order: all tasks still run
+        # concurrently, but results are appended in input order so a resumed
+        # run neither repeats nor drops rows.
+        for task in tqdm_asyncio(tasks, total=len(tasks)):
+            result = await task
+            buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
+            count += 1
+            if len(buffer) >= 25:
+                f.writelines(buffer)
+                f.flush()
+                os.fsync(f.fileno())
+                buffer.clear()
+                print(f"Checkpoint saved ({count} items).")
+        if buffer:  # flush the final partial batch
+            f.writelines(buffer)
+            f.flush()
+            os.fsync(f.fileno())
+            print(f"Final checkpoint saved ({count} items).")
+
+    print("✅ Completed test factual evaluation.")
+
+
+# ============================================================
+# Entry Point
+# ============================================================
+
+if __name__ == "__main__":
+    asyncio.run(process_dataset())
diff --git a/src/aixpert/data_construction/dataset_train.py b/src/aixpert/data_construction/dataset_train.py
new file mode 100644
index 0000000..f24b773
--- /dev/null
+++ b/src/aixpert/data_construction/dataset_train.py
@@ -0,0 +1,216 @@
+"""
+Generate binary factuality labels for training preference pairs using GPT-4o-mini.
+
+This script loads training preference pairs, evaluates each response using
+a strict PKU-style binary factual judge, and saves the resulting dataset with:
+
+    factual_flag_0, factual_flag_1
+    h0, h1  (aliases for convenience)
+
+It supports automatic resume, concurrency throttling, and checkpointing.
+Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List
+
+from decouple import Config, RepositoryEnv
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm_asyncio
+from utils import load_yaml  # your YAML loader
+
+
+# ============================================================
+# Load OpenAI API key
+# ============================================================
+
+
+# Resolve config.yaml next to this script so the working directory is irrelevant.
+YAML_CONFIG = load_yaml(str(Path(__file__).resolve().parent / "config.yaml"))
+REPO_PATH = YAML_CONFIG["repository"]
+
+# Read the API key from the repository's .env file via python-decouple.
+env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
+OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
+if not OPENAI_API_KEY:
+    raise ValueError("❌ OPENAI_API_KEY not found in .env file!")
+
+client = AsyncOpenAI(api_key=OPENAI_API_KEY)
+
+
+# ============================================================
+# Configuration & Directories
+# ============================================================
+
+DATA_DIR = Path(__file__).resolve().parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+INPUT_FILE = DATA_DIR / "skywork_preference_pairs_train.jsonl"
+OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_train.jsonl"
+
+# ============================================================
+# Factual Judge Prompt
+# ============================================================
+
+BINARY_FACTUAL_JUDGE_PROMPT = """
+System:
+You are an extremely strict factuality judge. Your task is to determine whether the assistant’s
+response contains ANY of the following:
+
+• factual errors
+• hallucinations
+• unsupported claims
+• unverifiable statements
+• fabricated details
+• incorrect technical descriptions
+• invented studies, names, or entities
+• logical contradictions
+• vague or misleading explanations
+• partial answers that omit critical facts
+
+IMPORTANT:
+If the response is vague, incomplete, unsupported, or unverifiable,
+classify it as [[1]].
+
+Definitions:
+[[0]] → Fully factual, precise, verifiable.
+[[1]] → ANY issue: vague, incorrect, unverifiable, misleading, fabricated.
+
+Return ONLY:
+[[0]] or [[1]]
+
+User Question:
+{question}
+
+Assistant Response:
+{answer}
+"""
+
+
+# ============================================================
+# API Call — Get Binary Factual Flag
+# ============================================================
+
+
+async def get_factual_flag(
+    question: str, answer: str, semaphore: asyncio.Semaphore
+) -> int:
+    """Return the judge's flag: 0=factual, 1=hallucinated (also the fallback)."""
+    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
+
+    async with semaphore:
+        for retry in range(5):
+            try:
+                response = await client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0,
+                )
+                # content may be None; treat that like any unparseable reply.
+                text = (response.choices[0].message.content or "").strip()
+                match = re.search(r"\[\[(0|1)\]\]", text)
+                if match:
+                    return int(match.group(1))
+
+                return 1  # default: hallucinated
+
+            except Exception as exc:
+                print(f"Retry factual-flag ({retry}): {exc}")
+                await asyncio.sleep(1 + 0.5 * retry)
+
+    return 1  # every attempt raised: fall back to "hallucinated"
+
+
+# ============================================================
+# Process One Item
+# ============================================================
+
+
+async def process_single_item(
+    item: Dict[str, Any], semaphore: asyncio.Semaphore
+) -> Dict[str, Any]:
+    """Score both responses of one preference pair concurrently.
+
+    Returns a copy of ``item`` extended with ``factual_flag_0``/``factual_flag_1``
+    and their aliases ``h0``/``h1``.
+    """
+    question = item["prompt"]
+    answers = (item["response_0"], item["response_1"])
+
+    # Start both judge calls before awaiting either so they overlap.
+    pending = [
+        asyncio.create_task(get_factual_flag(question, answer, semaphore))
+        for answer in answers
+    ]
+    flag_0 = await pending[0]
+    flag_1 = await pending[1]
+    labelled = dict(item)
+    labelled.update(factual_flag_0=flag_0, factual_flag_1=flag_1, h0=flag_0, h1=flag_1)
+    return labelled
+
+
+# ============================================================
+# Main Async Pipeline
+# ============================================================
+
+
+async def process_dataset() -> None:
+    """Load train dataset, compute factual flags, resume if needed, and save output."""
+    print(f"📥 Loading train dataset → {INPUT_FILE}")
+
+    with INPUT_FILE.open("r", encoding="utf-8") as f:
+        items = [json.loads(line) for line in f]
+
+    # Resume mode: rows are written in input order, so row count == inputs done.
+    processed_count = 0
+    if OUTPUT_FILE.exists():
+        print("♻️ Resuming previous run...")
+        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
+            processed_count = sum(1 for _ in f)
+        print(f"Found {processed_count} completed items.")
+
+    remaining = items[processed_count:]
+    semaphore = asyncio.Semaphore(25)
+
+    tasks = [
+        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
+    ]
+
+    buffer: List[str] = []
+    count = processed_count
+
+    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
+        # Await in submission order, not completion order: all tasks still run
+        # concurrently, but results are appended in input order so a resumed
+        # run neither repeats nor drops rows.
+        for task in tqdm_asyncio(tasks, total=len(tasks)):
+            result = await task
+            buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
+            count += 1
+            if len(buffer) >= 25:
+                f.writelines(buffer)
+                f.flush()
+                os.fsync(f.fileno())
+                buffer.clear()
+                print(f"Checkpoint saved ({count} items).")
+        if buffer:  # flush the final partial batch
+            f.writelines(buffer)
+            f.flush()
+            os.fsync(f.fileno())
+            print(f"Final checkpoint saved ({count} items).")
+
+    print("✅ Completed factual evaluation.")
+
+
+# ============================================================
+# Entry Point
+# ============================================================
+
+if __name__ == "__main__":
+    asyncio.run(process_dataset())
diff --git a/src/aixpert/data_construction/utils.py b/src/aixpert/data_construction/utils.py
new file mode 100644
index 0000000..7aed9e6
--- /dev/null
+++ b/src/aixpert/data_construction/utils.py
@@ -0,0 +1,55 @@
+"""
+Utility functions for loading configuration files.
+
+This module provides:
+- `load_yaml`: Read a YAML file into a Python dictionary.
+- `load_env_api_key`: Load the OPENAI_API_KEY from a repository `.env` file.
+
+These helpers centralize configuration handling and ensure consistent behavior
+across all data-construction scripts.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+from decouple import Config, RepositoryEnv
+
+
+def load_yaml(yaml_path: str) -> Dict[str, Any]:
+    """Parse a YAML file into a dict; failures are printed, never raised.
+
+    :param yaml_path: Path to the YAML file.
+    :return: Parsed content, or ``{}`` if it is falsy or loading failed.
+    """
+    try:
+        with open(yaml_path, "r", encoding="utf-8") as stream:
+            return yaml.safe_load(stream) or {}
+    except Exception as err:
+        print(f"YAML load error: {err}")
+    return {}
+
+
+def load_env_api_key(repository_path: str) -> str:
+    """Load OPENAI_API_KEY from a .env file inside the repository.
+
+    The key is read with python-decouple from ``<repository_path>/.env``. A
+    missing file prints a warning and a missing key is not an error: both
+    cases return an empty string.
+
+    :param repository_path: Path to the repo containing `.env`
+    :return: The OpenAI API key or an empty string if missing.
+    """
+    dotenv_file = Path(repository_path) / ".env"
+
+    if dotenv_file.exists():
+        reader = Config(RepositoryEnv(str(dotenv_file)))
+        return reader("OPENAI_API_KEY", default="")
+
+    print(f"Warning: .env file not found at {dotenv_file}")
+    return ""
+
+
+__all__ = ["load_yaml", "load_env_api_key"]

From da5359c582f3a0d20660a8c2d96a4790c980980b Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 13:40:09 -0500
Subject: [PATCH 02/14] data extraction

---
 .../data_construction/dataextraction.py       | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 src/aixpert/data_construction/dataextraction.py

diff --git a/src/aixpert/data_construction/dataextraction.py b/src/aixpert/data_construction/dataextraction.py
new file mode 100644
index 0000000..15ea172
--- /dev/null
+++ b/src/aixpert/data_construction/dataextraction.py
@@ -0,0 +1,136 @@
+"""
+Skywork extraction utilities.
+
+This module extracts prompt/chosen/rejected fields from the Skywork Preference
+dataset, removes exact duplicates, and writes the cleaned dataset to JSONL
+files. Fully compatible with ruff, mypy, and the AI Engineering template.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Union
+
+import pandas as pd
+from datasets import load_dataset
+
+
+# Path to: src/aixpert/data_construction/data/
+DATA_DIR = Path(__file__).resolve().parent / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+
+SUBSET_SIZE = 80000
+OUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl"
+REMOVED_FILE = DATA_DIR / "skywork_cleaned_77k.jsonl"
+
+print(f"📥 Loading first {SUBSET_SIZE} samples from Skywork...")
+
+
+# ============================================================
+# Dataset loading
+# ============================================================
+ds = load_dataset(
+    "Skywork/Skywork-Reward-Preference-80K-v0.1",
+    split=f"train[:{SUBSET_SIZE}]",
+)
+
+df = ds.to_pandas()
+
+
+# ============================================================
+# Extract prompt / chosen / rejected
+# ============================================================
+def extract_prompt_from_dialog(dialog: List[Dict[str, Any]]) -> str:
+    """
+    Return the text of the first user turn in a dialog.
+
+    Parameters
+    ----------
+    dialog : list of dict
+        Message objects; each is queried for its "role" and "content" keys.
+
+    Returns
+    -------
+    str
+        Stripped content of the first 'user' message, or "" if there is none.
+    """
+    user_turns = (turn for turn in dialog if turn.get("role") == "user")
+    first = next(user_turns, None)
+    # No user turn at all -> empty prompt.
+    return "" if first is None else str(first.get("content", "")).strip()
+
+
+def extract_answer_from_dialog(dialog: List[Dict[str, Any]]) -> str:
+    """
+    Return the text of the first assistant turn in a dialog.
+
+    Parameters
+    ----------
+    dialog : list of dict
+        Message objects; each is queried for its "role" and "content" keys.
+
+    Returns
+    -------
+    str
+        Stripped content of the first 'assistant' message, or "" if none.
+    """
+    replies = (turn for turn in dialog if turn.get("role") == "assistant")
+    first = next(replies, None)
+    # No assistant turn at all -> empty answer.
+    return "" if first is None else str(first.get("content", "")).strip()
+
+
+df["prompt"] = df["chosen"].apply(extract_prompt_from_dialog)
+df["chosen"] = df["chosen"].apply(extract_answer_from_dialog)
+df["rejected"] = df["rejected"].apply(extract_answer_from_dialog)
+
+clean_df = df[["prompt", "chosen", "rejected"]]
+
+# ============================================================
+# 🔍 Exact-match removal (chosen == rejected)
+# ============================================================
+cleaned: List[Dict[str, str]] = []
+removed: List[Dict[str, str]] = []
+
+for _, row in clean_df.iterrows():
+    chosen = str(row["chosen"]).strip()
+    rejected = str(row["rejected"]).strip()
+
+    sample = {
+        "prompt": str(row["prompt"]).strip(),
+        "chosen": chosen,
+        "rejected": rejected,
+    }
+
+    if chosen == rejected:
+        removed.append(sample)
+    else:
+        cleaned.append(sample)
+
+print(f"🧹 Removed exact duplicates: {len(removed)}")
+print(f"📦 Remaining clean samples: {len(cleaned)}")
+
+# Ensure output directory exists
+os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
+
+
+# ============================================================
+# Save output JSONL files
+# ============================================================
+def write_jsonl(path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
+    """Dump ``rows`` to ``path`` as JSON Lines, replacing any existing file."""
+    with open(str(path), "w", encoding="utf-8") as sink:
+        serialized = (json.dumps(entry, ensure_ascii=False) + "\n" for entry in rows)
+        sink.writelines(serialized)
+
+
+write_jsonl(OUT_FILE, cleaned)
+write_jsonl(REMOVED_FILE, removed)
+
+print(f"✅ Saved cleaned dataset ({len(cleaned)} samples) → {OUT_FILE}")
+print(f"🗑️  Saved removed duplicates ({len(removed)} samples) → {REMOVED_FILE}")
+
+print(pd.DataFrame(cleaned).head())

From d9b7fbd1e9f7854351c1b501837371fd5b31512f Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 13:44:15 -0500
Subject: [PATCH 03/14] data extraction eval

---
 .../data_construction/dataextraction_eval.py  | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 src/aixpert/data_construction/dataextraction_eval.py

diff --git a/src/aixpert/data_construction/dataextraction_eval.py b/src/aixpert/data_construction/dataextraction_eval.py
new file mode 100644
index 0000000..1595767
--- /dev/null
+++ b/src/aixpert/data_construction/dataextraction_eval.py
@@ -0,0 +1,94 @@
+"""
+Extract the evaluation slice of the Skywork preference dataset.
+
+This script extracts rows 80001–81000, removes exact duplicates,
+and saves the cleaned dataset into JSONL files under the local data folder.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+from datasets import load_dataset
+
+
+# ============================================================
+# Helpers
+# ============================================================
+
+
+def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first ``user`` turn, or ``""``."""
+    user_turns = (turn for turn in dialog if turn.get("role") == "user")
+    first = next(user_turns, None)
+    # No user turn at all -> empty prompt.
+    return "" if first is None else str(first.get("content", "")).strip()
+
+
+def extract_answer(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first ``assistant`` turn, or ``""``."""
+    replies = (turn for turn in dialog if turn.get("role") == "assistant")
+    first = next(replies, None)
+    # No assistant turn at all -> empty answer.
+    return "" if first is None else str(first.get("content", "")).strip()
+
+
+def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write rows to ``path`` as JSON Lines, creating parent folders first."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as sink:
+        serialized = (json.dumps(entry, ensure_ascii=False) + "\n" for entry in rows)
+        sink.writelines(serialized)
+
+
+# ============================================================
+# Constants for eval split
+# ============================================================
+
+START = 80001
+END = 81000  # inclusive
+
+print(f"📥 Loading eval slice: {START} → {END}")
+
+ds = load_dataset(
+    "Skywork/Skywork-Reward-Preference-80K-v0.1",
+    split=f"train[{START}:{END + 1}]",
+)
+
+df = ds.to_pandas()
+
+df["prompt"] = df["chosen"].apply(extract_prompt)
+df["chosen"] = df["chosen"].apply(extract_answer)
+df["rejected"] = df["rejected"].apply(extract_answer)
+
+clean_df = df[["prompt", "chosen", "rejected"]]
+
+cleaned: List[Dict[str, str]] = []
+removed: List[Dict[str, str]] = []
+
+for _, row in clean_df.iterrows():
+    chosen = row["chosen"].strip()
+    rejected = row["rejected"].strip()
+
+    record = {
+        "prompt": row["prompt"].strip(),
+        "chosen": chosen,
+        "rejected": rejected,
+    }
+
+    if chosen == rejected:
+        removed.append(record)
+    else:
+        cleaned.append(record)
+
+print(f"🧹 Removed duplicates: {len(removed)}")
+print(f"📦 Clean samples: {len(cleaned)}")
+
+# Save outputs
+data_dir = Path(__file__).resolve().parent / "data"
+save_jsonl(data_dir / "skywork_extracted_eval.jsonl", cleaned)
+save_jsonl(data_dir / "skywork_eval_removed.jsonl", removed)
+
+print("✅ Saved eval dataset → skywork_extracted_eval.jsonl")

From 167add7beb8a49051e84fcb68f42caf5eedf0c5f Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 13:52:06 -0500
Subject: [PATCH 04/14] data extraction test

---
 .../data_construction/dataextraction_eval2.py | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 src/aixpert/data_construction/dataextraction_eval2.py

diff --git a/src/aixpert/data_construction/dataextraction_eval2.py b/src/aixpert/data_construction/dataextraction_eval2.py
new file mode 100644
index 0000000..83fb701
--- /dev/null
+++ b/src/aixpert/data_construction/dataextraction_eval2.py
@@ -0,0 +1,94 @@
+"""
+Extract the test slice of the Skywork preference dataset.
+
+This script extracts rows 81001–81500, sets aside pairs whose chosen and rejected
+answers are identical, and saves both sets as JSONL files under the local data folder.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+from datasets import load_dataset
+
+
+# ============================================================
+# Helpers
+# ============================================================
+
+
+def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first message whose role is 'user'."""
+    for msg in dialog:
+        if msg.get("role") == "user":
+            return str(msg.get("content", "")).strip()  # missing content -> ""
+    return ""  # no user message in the dialog
+
+
+def extract_answer(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first message whose role is 'assistant'."""
+    for msg in dialog:
+        if msg.get("role") == "assistant":
+            return str(msg.get("content", "")).strip()  # missing content -> ""
+    return ""  # no assistant message in the dialog
+
+
+def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write rows to path as UTF-8 JSONL (one object per line), overwriting it."""
+    path.parent.mkdir(parents=True, exist_ok=True)  # create missing parent dirs
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")  # keep non-ASCII as-is
+
+
+# ============================================================
+# Constants for test split
+# ============================================================
+
+START = 81001  # first row of the test slice taken from the "train" split
+END = 81500  # inclusive; the split slice below passes END + 1
+
+print(f"📥 Loading test slice: {START} → {END}")
+
+ds = load_dataset(
+    "Skywork/Skywork-Reward-Preference-80K-v0.1",
+    split=f"train[{START}:{END + 1}]",
+)
+
+df = ds.to_pandas()
+
+df["prompt"] = df["chosen"].apply(extract_prompt)  # before "chosen" is overwritten
+df["chosen"] = df["chosen"].apply(extract_answer)
+df["rejected"] = df["rejected"].apply(extract_answer)
+
+clean_df = df[["prompt", "chosen", "rejected"]]
+
+cleaned: List[Dict[str, str]] = []  # pairs kept for the test set
+removed: List[Dict[str, str]] = []  # pairs whose two answers are identical
+
+for _, row in clean_df.iterrows():
+    chosen = row["chosen"].strip()
+    rejected = row["rejected"].strip()
+
+    record = {
+        "prompt": row["prompt"].strip(),
+        "chosen": chosen,
+        "rejected": rejected,
+    }
+
+    if chosen == rejected:  # compared after stripping surrounding whitespace
+        removed.append(record)
+    else:
+        cleaned.append(record)
+
+print(f"🧹 Removed duplicates: {len(removed)}")
+print(f"📦 Clean samples: {len(cleaned)}")
+
+# Save outputs under data/ next to this script: kept pairs, then removed pairs
+data_dir = Path(__file__).resolve().parent / "data"
+save_jsonl(data_dir / "skywork_extracted_test.jsonl", cleaned)
+save_jsonl(data_dir / "skywork_test_removed.jsonl", removed)
+
+print("✅ Saved test dataset → skywork_extracted_test.jsonl")

From d80983345fb6a470f2c73e6e307960ae5e90fa7d Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 14:25:14 -0500
Subject: [PATCH 05/14] file name change

---
 .../{dataconversion.eval.py => dataconversion_eval.py}            | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/aixpert/data_construction/{dataconversion.eval.py => dataconversion_eval.py} (100%)

diff --git a/src/aixpert/data_construction/dataconversion.eval.py b/src/aixpert/data_construction/dataconversion_eval.py
similarity index 100%
rename from src/aixpert/data_construction/dataconversion.eval.py
rename to src/aixpert/data_construction/dataconversion_eval.py

From 8cb4603c8f7094381149a64c3063c15dc181b821 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Tue, 2 Dec 2025 15:08:08 -0500
Subject: [PATCH 06/14] data flipped

---
 .../data_construction/data_flipped_eval.py    | 60 ++++++++++++++++++
 .../data_construction/data_flipped_train.py   | 61 +++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 src/aixpert/data_construction/data_flipped_eval.py
 create mode 100644 src/aixpert/data_construction/data_flipped_train.py

diff --git a/src/aixpert/data_construction/data_flipped_eval.py b/src/aixpert/data_construction/data_flipped_eval.py
new file mode 100644
index 0000000..20ab90d
--- /dev/null
+++ b/src/aixpert/data_construction/data_flipped_eval.py
@@ -0,0 +1,60 @@
+"""
+Flip preference labels for evaluation data.
+
+This script:
+- Converts h_w=1,h_l=0 → h_w=0,h_l=1
+- Swaps chosen/rejected
+- Writes a flipped version of the dataset
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+DATA_DIR = Path(__file__).resolve().parent / "data"  # data/ next to this script
+
+INPUT_FILE = DATA_DIR / "eval_final.jsonl"  # read by main()
+OUTPUT_FILE = DATA_DIR / "eval_final_flipped.jsonl"  # written by main()
+
+
+def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Swap labels and chosen/rejected in place when (h_w, h_l) == (1, 0)."""
+    if item.get("h_w") == 1 and item.get("h_l") == 0:
+        item["h_w"], item["h_l"] = 0, 1
+        item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
+    return item  # the same dict object; other label combinations pass through
+
+
+def main() -> None:
+    """Read INPUT_FILE, apply flip_sample to every record, and write OUTPUT_FILE."""
+    print("📥 Loading input file:", INPUT_FILE)
+
+    output: List[Dict[str, Any]] = []  # every record read, flipped or not
+
+    with INPUT_FILE.open("r", encoding="utf-8") as f:
+        for raw_line in f:
+            line = raw_line.strip()
+            if not line:
+                continue  # skip blank lines instead of failing in json.loads
+            item = json.loads(line)
+            output.append(flip_sample(item))
+
+    print(f"✅ Processed {len(output)} samples")
+
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+    print("💾 Saving flipped dataset to:", OUTPUT_FILE)
+
+    with OUTPUT_FILE.open("w", encoding="utf-8") as f:  # "w" overwrites an existing file
+        for item in output:
+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+    print("\n============================================")
+    print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}")
+    print("============================================\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/data_flipped_train.py b/src/aixpert/data_construction/data_flipped_train.py
new file mode 100644
index 0000000..f12c4d3
--- /dev/null
+++ b/src/aixpert/data_construction/data_flipped_train.py
@@ -0,0 +1,61 @@
+"""
+Flip preference labels for training data.
+
+This script:
+- Converts h_w=1,h_l=0 → h_w=0,h_l=1
+- Swaps chosen/rejected
+- Writes a flipped version of the dataset
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+# Local data directory: the data/ folder that sits next to this script
+DATA_DIR = Path(__file__).resolve().parent / "data"
+
+INPUT_FILE = DATA_DIR / "train_finallast.jsonl"  # read by main()
+OUTPUT_FILE = DATA_DIR / "train_finallast_flipped.jsonl"  # written by main()
+
+
+def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Swap labels and chosen/rejected in place when (h_w, h_l) == (1, 0)."""
+    if item.get("h_w") == 1 and item.get("h_l") == 0:
+        item["h_w"], item["h_l"] = 0, 1
+        item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
+    return item  # the same dict object; other label combinations pass through
+
+
+def main() -> None:
+    """Read INPUT_FILE, apply flip_sample to every record, and write OUTPUT_FILE."""
+    print("📥 Loading input file:", INPUT_FILE)
+
+    output: List[Dict[str, Any]] = []  # every record read, flipped or not
+
+    with INPUT_FILE.open("r", encoding="utf-8") as f:
+        for raw_line in f:
+            line = raw_line.strip()
+            if not line:
+                continue  # skip blank lines instead of failing in json.loads
+            item = json.loads(line)
+            output.append(flip_sample(item))
+
+    print(f"✅ Processed {len(output)} samples")
+
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+    print("💾 Saving flipped dataset to:", OUTPUT_FILE)
+
+    with OUTPUT_FILE.open("w", encoding="utf-8") as f:  # "w" overwrites an existing file
+        for item in output:
+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+    print("\n============================================")
+    print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}")
+    print("============================================\n")
+
+
+if __name__ == "__main__":
+    main()

From 102f8b2ec73810b54a3343efe71e94a0d621ad1a Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Thu, 4 Dec 2025 18:45:44 -0500
Subject: [PATCH 07/14] folder updates

---
 src/aixpert/data_construction/Readme.md       |   0
 src/aixpert/data_construction/config.yaml     |   4 -
 .../data_construction/config/config.yaml      |  61 +++++
 .../data_construction/data_final_eval.py      | 126 ----------
 .../data_construction/data_final_train.py     | 109 ---------
 .../data_construction/data_flipped_eval.py    |  60 -----
 .../data_construction/data_flipped_train.py   |  61 -----
 .../data_construction/data_merge_eval.py      | 100 --------
 .../data_construction/data_merge_train.py     | 104 ---------
 .../data_construction/data_synthetic_eval.py  | 167 --------------
 .../data_construction/data_synthetic_train.py | 179 ---------------
 .../data_construction/data_transform_eval.py  |  85 -------
 .../data_construction/data_transform_train.py |  86 -------
 .../data_construction/dataconversion.py       |  94 --------
 .../data_construction/dataconversion_eval.py  |  96 --------
 .../data_construction/dataextraction.py       | 136 -----------
 .../data_construction/dataextraction_eval.py  |  94 --------
 .../data_construction/dataextraction_eval2.py |  94 --------
 src/aixpert/data_construction/dataset_eval.py | 215 -----------------
 .../data_construction/dataset_train.py        | 216 ------------------
 .../stage_1_extraction/dataextraction_eval.py |  55 +++++
 .../dataextraction_train.py                   |  57 +++++
 .../stage_1_extraction/dataextraction_val.py  |  54 +++++
 .../dataconversion_train.py                   |  47 ++++
 .../stage_2_conversion/dataconversion_val.py  |  45 ++++
 .../stage_3_factuality/dataset_train.py       |  50 ++++
 .../stage_3_factuality/dataset_val.py         |  51 +++++
 .../data_transform_train.py                   |  22 ++
 .../data_transform_val.py                     |  22 ++
 .../data_synthetic_train.py                   |  88 +++++++
 .../data_synthetic_val.py                     |  90 ++++++++
 .../stage_6_merging/data_merge_train.py       |  68 ++++++
 .../stage_6_merging/data_merge_val.py         |  61 +++++
 .../stage_7_final/data_final_train.py         |  88 +++++++
 .../stage_7_final/data_final_val.py           |  90 ++++++++
 .../stage_8_flipping/data_flipped_train.py    |  41 ++++
 .../stage_8_flipping/data_flipped_val.py      |  42 ++++
 src/aixpert/data_construction/utils.py        |  55 -----
 .../data_construction/utils/config_loader.py  |  17 ++
 .../data_construction/utils/data_utils.py     | 124 ++++++++++
 .../utils/dpo_transform_utils.py              |  56 +++++
 .../data_construction/utils/factual_utils.py  | 120 ++++++++++
 .../utils/prompt_templates.py                 |  57 +++++
 .../utils/synthetic_utils.py                  |  59 +++++
 44 files changed, 1465 insertions(+), 2081 deletions(-)
 create mode 100644 src/aixpert/data_construction/Readme.md
 delete mode 100644 src/aixpert/data_construction/config.yaml
 create mode 100644 src/aixpert/data_construction/config/config.yaml
 delete mode 100644 src/aixpert/data_construction/data_final_eval.py
 delete mode 100644 src/aixpert/data_construction/data_final_train.py
 delete mode 100644 src/aixpert/data_construction/data_flipped_eval.py
 delete mode 100644 src/aixpert/data_construction/data_flipped_train.py
 delete mode 100644 src/aixpert/data_construction/data_merge_eval.py
 delete mode 100644 src/aixpert/data_construction/data_merge_train.py
 delete mode 100644 src/aixpert/data_construction/data_synthetic_eval.py
 delete mode 100644 src/aixpert/data_construction/data_synthetic_train.py
 delete mode 100644 src/aixpert/data_construction/data_transform_eval.py
 delete mode 100644 src/aixpert/data_construction/data_transform_train.py
 delete mode 100644 src/aixpert/data_construction/dataconversion.py
 delete mode 100644 src/aixpert/data_construction/dataconversion_eval.py
 delete mode 100644 src/aixpert/data_construction/dataextraction.py
 delete mode 100644 src/aixpert/data_construction/dataextraction_eval.py
 delete mode 100644 src/aixpert/data_construction/dataextraction_eval2.py
 delete mode 100644 src/aixpert/data_construction/dataset_eval.py
 delete mode 100644 src/aixpert/data_construction/dataset_train.py
 create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_eval.py
 create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py
 create mode 100644 src/aixpert/data_construction/stage_1_extraction/dataextraction_val.py
 create mode 100644 src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py
 create mode 100644 src/aixpert/data_construction/stage_2_conversion/dataconversion_val.py
 create mode 100644 src/aixpert/data_construction/stage_3_factuality/dataset_train.py
 create mode 100644 src/aixpert/data_construction/stage_3_factuality/dataset_val.py
 create mode 100644 src/aixpert/data_construction/stage_4_transformation/data_transform_train.py
 create mode 100644 src/aixpert/data_construction/stage_4_transformation/data_transform_val.py
 create mode 100644 src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py
 create mode 100644 src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_val.py
 create mode 100644 src/aixpert/data_construction/stage_6_merging/data_merge_train.py
 create mode 100644 src/aixpert/data_construction/stage_6_merging/data_merge_val.py
 create mode 100644 src/aixpert/data_construction/stage_7_final/data_final_train.py
 create mode 100644 src/aixpert/data_construction/stage_7_final/data_final_val.py
 create mode 100644 src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py
 create mode 100644 src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
 delete mode 100644 src/aixpert/data_construction/utils.py
 create mode 100644 src/aixpert/data_construction/utils/config_loader.py
 create mode 100644 src/aixpert/data_construction/utils/data_utils.py
 create mode 100644 src/aixpert/data_construction/utils/dpo_transform_utils.py
 create mode 100644 src/aixpert/data_construction/utils/factual_utils.py
 create mode 100644 src/aixpert/data_construction/utils/prompt_templates.py
 create mode 100644 src/aixpert/data_construction/utils/synthetic_utils.py

diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/aixpert/data_construction/config.yaml b/src/aixpert/data_construction/config.yaml
deleted file mode 100644
index 91e1874..0000000
--- a/src/aixpert/data_construction/config.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-repository: /projects/aixpert/users/sindhu/Loss_Test
-
-model:
-  name: gpt-4o-mini   # or gpt-4o
diff --git a/src/aixpert/data_construction/config/config.yaml b/src/aixpert/data_construction/config/config.yaml
new file mode 100644
index 0000000..261c6b4
--- /dev/null
+++ b/src/aixpert/data_construction/config/config.yaml
@@ -0,0 +1,61 @@
+repository: /projects/aixpert/users/sindhu/Loss_Test  # NOTE(review): absolute, user-specific path — confirm it should be committed
+
+model:
+  name: gpt-4o-mini   # or gpt-4o
+  temperature: 0.8
+
+paths:  # presumably resolved relative to the repository root — TODO confirm against config_loader
+  skywork_train_cleaned: "src/aixpert/data_construction/data/skywork_extracted_77k.jsonl"
+  skywork_train_removed: "src/aixpert/data_construction/data/skywork_removed_77k.jsonl"
+
+  skywork_eval_cleaned: "src/aixpert/data_construction/data/skywork_extracted_eval.jsonl"
+  skywork_eval_removed: "src/aixpert/data_construction/data/skywork_eval_removed.jsonl"
+
+  skywork_test_cleaned: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
+  skywork_test_removed: "src/aixpert/data_construction/data/skywork_test_removed.jsonl"
+
+  skywork_train_pairs: "src/aixpert/data_construction/data/skywork_preference_pairs_77k.jsonl"
+  skywork_eval_pairs: "src/aixpert/data_construction/data/skywork_preference_pairs_eval.jsonl"
+
+  skywork_train_factual: "src/aixpert/data_construction/data/skywork_binary_factual_train.jsonl"
+  skywork_eval_factual: "src/aixpert/data_construction/data/skywork_binary_factual_eval.jsonl"
+
+  skywork_train_transformed: "src/aixpert/data_construction/data/skywork_first_transformed_train.jsonl"
+  skywork_eval_transformed: "src/aixpert/data_construction/data/skywork_first_transformed_eval.jsonl"
+
+  synthetic_train_out: "src/aixpert/data_construction/data/synthetic_llm_inversion_train_10k.jsonl"
+  synthetic_eval_out:  "src/aixpert/data_construction/data/synthetic_llm_inversion_eval_400.jsonl"
+
+
+  final_train_merged: "src/aixpert/data_construction/data/skywork_final_train.jsonl"
+  final_eval_merged: "src/aixpert/data_construction/data/skywork_final_eval.jsonl"
+
+  final_train_out: "src/aixpert/data_construction/data/train_balanced.jsonl"
+  final_eval_out: "src/aixpert/data_construction/data/eval_final.jsonl"
+
+  train_flipped_out: "src/aixpert/data_construction/data/train_balanced_flipped.jsonl"
+  eval_flipped_out: "src/aixpert/data_construction/data/eval_final_flipped.jsonl"
+
+
+
+  skywork_file: "Skywork/Skywork-Reward-Preference-80K-v0.1"  # dataset id passed to load_dataset in the earlier scripts, not a file path
+
+hyperparams:
+  subset_size: 80000
+  eval_start: 80001  # same values as START/END in the earlier eval extraction script
+  eval_end: 81000  # presumably inclusive, as in that script — confirm in the new stage_1 code
+  test_start: 81001  # same values as START/END in the earlier test extraction script
+  test_end: 81500  # presumably inclusive, as in that script — confirm in the new stage_1 code
+  concurrency_limit: 25
+  max_retries: 5
+  corruption_concurrency: 20
+  synthetic_train_samples: 10000
+  synthetic_eval_samples: 400
+
+  balance_targets:  # keys look like "(h_w,h_l)" label pairs; counts match the removed TARGET_COUNTS
+      "(0,1)": 10000
+      "(1,0)": 10000
+      "(0,0)": 15000
+      "(1,1)": 10000
+
+  eval_additional_clean_samples: 1500
diff --git a/src/aixpert/data_construction/data_final_eval.py b/src/aixpert/data_construction/data_final_eval.py
deleted file mode 100644
index addda7f..0000000
--- a/src/aixpert/data_construction/data_final_eval.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""
-Build the FINAL evaluation dataset (skywork_final_eval.jsonl).
-
-Composition:
-    • 400 synthetic inversion samples (1,0)
-    • all Skywork eval samples from skywork_first_transformed_eval.jsonl
-    • +1500 samples of (1,1) from skywork_final_train.jsonl
-    • +1500 samples of (0,0) from skywork_final_train.jsonl
-      → excluding any sample already used in train_finallast.jsonl
-
-Final eval ≈ (#sky_eval + 400 synthetic + 3000 added clean samples)
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# ============================================================
-# PATHS
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
-SKY_EVAL_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
-
-TRAIN_SOURCE_FILE = DATA_DIR / "skywork_final_train.jsonl"
-TRAIN_USED_FILE = DATA_DIR / "train_finallast.jsonl"
-
-OUTPUT_FILE = DATA_DIR / "eval_final.jsonl"
-
-
-# ============================================================
-# HELPERS
-# ============================================================
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load a JSONL file into a list of dictionaries."""
-    with path.open("r", encoding="utf-8") as f:
-        return [json.loads(line) for line in f]
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dictionaries to a JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for r in rows:
-            f.write(json.dumps(r, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# MAIN
-# ============================================================
-
-
-def main() -> None:
-    """Create the final evaluation dataset by merging all required sources."""
-    print("📥 Loading synthetic eval inversions...")
-    synthetic = load_jsonl(SYNTHETIC_FILE)
-    print("Synthetic:", len(synthetic))
-
-    print("📥 Loading Skywork eval transformed...")
-    sky_eval = load_jsonl(SKY_EVAL_FILE)
-    print("SkyEval:", len(sky_eval))
-
-    print("📥 Loading Skywork full training source...")
-    sky_train = load_jsonl(TRAIN_SOURCE_FILE)
-
-    print("📥 Loading TRAIN used (to exclude)...")
-    train_used = load_jsonl(TRAIN_USED_FILE)
-
-    # Convert used samples to hashable form
-    exclude_set = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used}
-
-    # -----------------------------------------------------------
-    # 1. Extract (1,1) and (0,0) pools from training source
-    # -----------------------------------------------------------
-    hw1_hl1_pool: List[Dict[str, Any]] = []
-    hw0_hl0_pool: List[Dict[str, Any]] = []
-
-    for ex in sky_train:
-        key = (ex["prompt"], ex["chosen"], ex["rejected"])
-        if key in exclude_set:
-            continue
-
-        if ex["h_w"] == 1 and ex["h_l"] == 1:
-            hw1_hl1_pool.append(ex)
-        elif ex["h_w"] == 0 and ex["h_l"] == 0:
-            hw0_hl0_pool.append(ex)
-
-    print(f"(1,1) available for eval add: {len(hw1_hl1_pool)}")
-    print(f"(0,0) available for eval add: {len(hw0_hl0_pool)}")
-
-    # -----------------------------------------------------------
-    # 2. Sample EXACT 1500 from each bucket
-    # -----------------------------------------------------------
-    eval_hw1_hl1 = random.sample(hw1_hl1_pool, 1500)
-    eval_hw0_hl0 = random.sample(hw0_hl0_pool, 1500)
-
-    # -----------------------------------------------------------
-    # 3. Merge everything
-    # -----------------------------------------------------------
-    merged: List[Dict[str, Any]] = []
-    merged.extend(synthetic)  # (1,0) → 400
-    merged.extend(sky_eval)  # (0,1) → ~1000
-    merged.extend(eval_hw1_hl1)  # (1,1) → 1500
-    merged.extend(eval_hw0_hl0)  # (0,0) → 1500
-
-    print(f"Total before shuffle: {len(merged)}")
-
-    random.shuffle(merged)
-
-    print(f"💾 Saving → {OUTPUT_FILE}")
-    write_jsonl(OUTPUT_FILE, merged)
-
-    print("✅ FINAL EVAL DATASET READY.")
-    print("Total eval:", len(merged))
-
-
-if __name__ == "__main__":
-    random.seed(42)
-    main()
diff --git a/src/aixpert/data_construction/data_final_train.py b/src/aixpert/data_construction/data_final_train.py
deleted file mode 100644
index 742208b..0000000
--- a/src/aixpert/data_construction/data_final_train.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""
-Balanced sampling for TRAIN dataset.
-
-This script:
-- Loads the merged training dataset.
-- Buckets by (h_w, h_l).
-- Samples required amounts per bucket (with replacement if needed).
-- Shuffles and saves the final balanced training dataset.
-
-Buckets required:
-    (0,1) → 10,000
-    (1,0) → 10,000
-    (0,0) → 15,000
-    (1,1) → 10,000
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-
-
-# ============================================================
-# Paths (relative to this file's /data directory)
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-INPUT_FILE = DATA_DIR / "skywork_final_train.jsonl"
-OUTPUT_FILE = DATA_DIR / "train_finallast.jsonl"
-
-TARGET_COUNTS: Dict[Tuple[int, int], int] = {
-    (0, 1): 10_000,
-    (1, 0): 10_000,
-    (0, 0): 15_000,
-    (1, 1): 10_000,
-}
-
-
-# ============================================================
-# Helpers
-# ============================================================
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load a JSONL file and return its rows as a list of dictionaries."""
-    with path.open("r", encoding="utf-8") as f:
-        return [json.loads(line) for line in f]
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dictionaries to a JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for ex in rows:
-            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# Main
-# ============================================================
-
-
-def main() -> None:
-    """Generate the balanced training dataset according to bucket size targets."""
-    print(f"📥 Loading dataset → {INPUT_FILE}")
-    data = load_jsonl(INPUT_FILE)
-
-    # bucket structure
-    buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
-        (0, 1): [],
-        (1, 0): [],
-        (0, 0): [],
-        (1, 1): [],
-    }
-
-    print("🔍 Bucketing samples...")
-    for ex in data:
-        key = (int(ex["h_w"]), int(ex["h_l"]))
-        if key in buckets:
-            buckets[key].append(ex)
-
-    final_samples: List[Dict[str, Any]] = []
-
-    for key, req_count in TARGET_COUNTS.items():
-        available = len(buckets[key])
-        print(f"Bucket {key}: available={available}, required={req_count}")
-
-        if available < req_count:
-            print("⚠️ Not enough samples — sampling WITH replacement.")
-            sampled = random.choices(buckets[key], k=req_count)
-        else:
-            sampled = random.sample(buckets[key], req_count)
-
-        final_samples.extend(sampled)
-
-    print(f"\n🔀 Shuffling {len(final_samples)} samples...")
-    random.shuffle(final_samples)
-
-    print(f"💾 Saving → {OUTPUT_FILE}")
-    write_jsonl(OUTPUT_FILE, final_samples)
-
-    print("✅ TRAIN balanced dataset created.")
-    print("Final count:", len(final_samples))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/data_flipped_eval.py b/src/aixpert/data_construction/data_flipped_eval.py
deleted file mode 100644
index 20ab90d..0000000
--- a/src/aixpert/data_construction/data_flipped_eval.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""
-Flip preference labels for evaluation data.
-
-This script:
-- Converts h_w=1,h_l=0 → h_w=0,h_l=1
-- Swaps chosen/rejected
-- Writes a flipped version of the dataset
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-INPUT_FILE = DATA_DIR / "eval_final.jsonl"
-OUTPUT_FILE = DATA_DIR / "eval_final_flipped.jsonl"
-
-
-def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
-    """Flip the sample if (h_w, h_l) = (1, 0)."""
-    if item.get("h_w") == 1 and item.get("h_l") == 0:
-        item["h_w"], item["h_l"] = 0, 1
-        item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
-    return item
-
-
-def main() -> None:
-    """Execute flipping process for evaluation dataset."""
-    print("📥 Loading input file:", INPUT_FILE)
-
-    output: List[Dict[str, Any]] = []
-
-    with INPUT_FILE.open("r", encoding="utf-8") as f:
-        for raw_line in f:
-            line = raw_line.strip()
-            if not line:
-                continue
-            item = json.loads(line)
-            output.append(flip_sample(item))
-
-    print(f"✅ Processed {len(output)} samples")
-
-    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
-    print("💾 Saving flipped dataset to:", OUTPUT_FILE)
-
-    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
-        for item in output:
-            f.write(json.dumps(item, ensure_ascii=False) + "\n")
-
-    print("\n============================================")
-    print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}")
-    print("============================================\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/data_flipped_train.py b/src/aixpert/data_construction/data_flipped_train.py
deleted file mode 100644
index f12c4d3..0000000
--- a/src/aixpert/data_construction/data_flipped_train.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
-Flip preference labels for training data.
-
-This script:
-- Converts h_w=1,h_l=0 → h_w=0,h_l=1
-- Swaps chosen/rejected
-- Writes a flipped version of the dataset
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# Local data directory (same folder as the script → data/)
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-INPUT_FILE = DATA_DIR / "train_finallast.jsonl"
-OUTPUT_FILE = DATA_DIR / "train_finallast_flipped.jsonl"
-
-
-def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
-    """Flip the sample if (h_w, h_l) = (1, 0)."""
-    if item.get("h_w") == 1 and item.get("h_l") == 0:
-        item["h_w"], item["h_l"] = 0, 1
-        item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
-    return item
-
-
-def main() -> None:
-    """Execute flipping process for training dataset."""
-    print("📥 Loading input file:", INPUT_FILE)
-
-    output: List[Dict[str, Any]] = []
-
-    with INPUT_FILE.open("r", encoding="utf-8") as f:
-        for raw_line in f:
-            line = raw_line.strip()
-            if not line:
-                continue
-            item = json.loads(line)
-            output.append(flip_sample(item))
-
-    print(f"✅ Processed {len(output)} samples")
-
-    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
-    print("💾 Saving flipped dataset to:", OUTPUT_FILE)
-
-    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
-        for item in output:
-            f.write(json.dumps(item, ensure_ascii=False) + "\n")
-
-    print("\n============================================")
-    print(f"🎉 Saved flipped dataset → {OUTPUT_FILE.name}")
-    print("============================================\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/data_merge_eval.py b/src/aixpert/data_construction/data_merge_eval.py
deleted file mode 100644
index 6cc9100..0000000
--- a/src/aixpert/data_construction/data_merge_eval.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""
-Merge Skywork evaluation data with 400 synthetic inversion pairs.
-
-This script:
-- Loads synthetic corruption samples for eval.
-- Loads Skywork eval transformed dataset.
-- Splits samples into buckets by (h_w, h_l).
-- Keeps ALL real eval samples.
-- Merges synthetic + all real eval buckets.
-- Shuffles and writes final eval JSONL file.
-
-Fully compatible with ruff, mypy, and pydocstyle.
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# ============================================================
-# Paths
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
-SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
-OUTPUT_FILE = DATA_DIR / "skywork_final_eval.jsonl"
-
-
-# ============================================================
-# Helpers
-# ============================================================
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load JSONL file into list of dicts."""
-    rows: List[Dict[str, Any]] = []
-    with path.open("r", encoding="utf-8") as f:
-        for line in f:
-            rows.append(json.loads(line))
-    return rows
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dicts to a JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for r in rows:
-            f.write(json.dumps(r, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# Main
-# ============================================================
-
-
-def main() -> None:
-    """Merge Skywork eval data with 400 synthetic inversions."""
-    print("📥 Loading synthetic eval inversions...")
-    synthetic = load_jsonl(SYNTHETIC_FILE)
-    print(f"Synthetic eval: {len(synthetic)}")
-
-    print("📥 Loading Skywork eval transformed...")
-    sky = load_jsonl(SKYWORK_FILE)
-    print(f"Skywork eval: {len(sky)}")
-
-    hw0_hl0: List[Dict[str, Any]] = []
-    hw1_hl1: List[Dict[str, Any]] = []
-    hw0_hl1: List[Dict[str, Any]] = []
-
-    for ex in sky:
-        h_w = ex["h_w"]
-        h_l = ex["h_l"]
-
-        if h_w == 0 and h_l == 0:
-            hw0_hl0.append(ex)
-        elif h_w == 1 and h_l == 1:
-            hw1_hl1.append(ex)
-        elif h_w == 0 and h_l == 1:
-            hw0_hl1.append(ex)
-
-    print(f"(0,0): {len(hw0_hl0)}")
-    print(f"(1,1): {len(hw1_hl1)}")
-    print(f"(0,1): {len(hw0_hl1)}")
-
-    merged = synthetic + hw0_hl0 + hw1_hl1 + hw0_hl1
-    print(f"Total merged before shuffle: {len(merged)}")
-
-    random.shuffle(merged)
-
-    print(f"💾 Saving → {OUTPUT_FILE}")
-    write_jsonl(OUTPUT_FILE, merged)
-
-    print("✅ EVAL MERGE DONE.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/data_merge_train.py b/src/aixpert/data_construction/data_merge_train.py
deleted file mode 100644
index d97b1de..0000000
--- a/src/aixpert/data_construction/data_merge_train.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""
-Merge Skywork training data with 10k synthetic inversion pairs.
-
-This script:
-- Loads synthetic corruption samples.
-- Loads transformed Skywork training data.
-- Splits real samples into buckets by (h_w, h_l).
-- Samples 10k from (0,1).
-- Merges: synthetic + (0,0) + (1,1) + sampled (0,1).
-- Shuffles and writes final JSONL file.
-
-Fully compatible with ruff, mypy, and pydocstyle.
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# ============================================================
-# Paths
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-SYNTHETIC_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl"
-SKYWORK_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl"
-OUTPUT_FILE = DATA_DIR / "skywork_final_train.jsonl"
-
-
-# ============================================================
-# Helpers
-# ============================================================
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load JSONL file into list of dicts."""
-    rows: List[Dict[str, Any]] = []
-    with path.open("r", encoding="utf-8") as f:
-        for line in f:
-            rows.append(json.loads(line))
-    return rows
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write list of dicts to JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for r in rows:
-            f.write(json.dumps(r, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# Main
-# ============================================================
-
-
-def main() -> None:
-    """Merge Skywork train data with 10k synthetic hallucinations."""
-    print("📥 Loading synthetic inversions...")
-    synthetic = load_jsonl(SYNTHETIC_FILE)
-    print(f"Synthetic loaded: {len(synthetic)}")
-
-    print("📥 Loading Skywork train transformed...")
-    sky = load_jsonl(SKYWORK_FILE)
-    print(f"Skywork loaded: {len(sky)}")
-
-    hw0_hl0: List[Dict[str, Any]] = []
-    hw1_hl1: List[Dict[str, Any]] = []
-    hw0_hl1: List[Dict[str, Any]] = []
-
-    for ex in sky:
-        h_w = ex["h_w"]
-        h_l = ex["h_l"]
-
-        if h_w == 0 and h_l == 0:
-            hw0_hl0.append(ex)
-        elif h_w == 1 and h_l == 1:
-            hw1_hl1.append(ex)
-        elif h_w == 0 and h_l == 1:
-            hw0_hl1.append(ex)
-
-    print(f"(0,0): {len(hw0_hl0)}")
-    print(f"(1,1): {len(hw1_hl1)}")
-    print(f"(0,1): {len(hw0_hl1)}")
-
-    random.seed(42)
-    sample01 = random.sample(hw0_hl1, 10000)
-    print(f"Sampled (0,1): {len(sample01)}")
-
-    merged = synthetic + hw0_hl0 + hw1_hl1 + sample01
-    print(f"Total merged before shuffle: {len(merged)}")
-
-    random.shuffle(merged)
-
-    print(f"💾 Saving → {OUTPUT_FILE}")
-    write_jsonl(OUTPUT_FILE, merged)
-
-    print("✅ TRAIN MERGE DONE.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/data_synthetic_eval.py b/src/aixpert/data_construction/data_synthetic_eval.py
deleted file mode 100644
index 78ec432..0000000
--- a/src/aixpert/data_construction/data_synthetic_eval.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""
-Generate synthetic corruption (hallucinated) responses for EVAL split.
-
-This script:
-- Loads clean DPO-ready Skywork eval transformation.
-- Selects pairs where h_w=0 and h_l=1.
-- Uses GPT-4o-mini to introduce subtle factual errors.
-- Produces inverted (hallucinated, correct) preference pairs.
-- Saves 400 synthetic eval corruption examples.
-
-Compatible with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-from decouple import Config, RepositoryEnv
-from openai import AsyncOpenAI
-from tqdm.asyncio import tqdm as tqdm_asyncio
-from utils import load_yaml
-
-
-# ============================================================
-# Load config.yaml + .env API key
-# ============================================================
-
-CONFIG = load_yaml("config.yaml")  # must contain: repository: <path>
-REPO_PATH = CONFIG["repository"]
-
-env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
-OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
-
-if not OPENAI_API_KEY:
-    raise RuntimeError("❌ OPENAI_API_KEY missing in repository .env")
-
-client = AsyncOpenAI(api_key=OPENAI_API_KEY)
-
-# ============================================================
-# Paths
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-INPUT_FILE = DATA_DIR / "skywork_first_transformed_eval.jsonl"
-OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_eval_400.jsonl"
-
-# ============================================================
-# Prompts
-# ============================================================
-
-SYSTEM_PROMPT = """
-You are a factual corruption generator.
-
-Rewrite a correct answer into a subtly incorrect version with 2–3 factual errors.
-It must remain fluent, confident, and plausible.
-"""
-
-
-def build_user_prompt(question: str, correct_answer: str) -> str:
-    """Construct user prompt."""
-    return f"""
-PROMPT:
-{question}
-
-CORRECT ANSWER:
-{correct_answer}
-
-TASK:
-Rewrite this answer so it becomes factually incorrect while still sounding natural.
-"""
-
-
-# ============================================================
-# API wrapper
-# ============================================================
-
-
-async def generate_corruption(
-    question: str, answer: str, semaphore: asyncio.Semaphore
-) -> Optional[str]:
-    """Generate a hallucinated version of the answer."""
-    user_prompt = build_user_prompt(question, answer)
-
-    async with semaphore:
-        for retry in range(5):
-            try:
-                resp = await client.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[
-                        {"role": "system", "content": SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt},
-                    ],
-                    temperature=0.8,
-                )
-                return resp.choices[0].message.content.strip()
-
-            except Exception as exc:
-                print(f"Retry ({retry}) corruption error:", exc)
-                await asyncio.sleep(1 + retry * 0.5)
-
-    return None
-
-
-# ============================================================
-# Process each item
-# ============================================================
-
-
-async def process_item(
-    item: Dict[str, Any], semaphore: asyncio.Semaphore
-) -> Optional[Dict[str, Any]]:
-    """Create synthetic corruption pair for an eval item."""
-    prompt = item["prompt"]
-    correct = item["chosen"]
-
-    corrupted = await generate_corruption(prompt, correct, semaphore)
-    if corrupted is None:
-        return None
-
-    return {
-        "prompt": prompt,
-        "chosen": corrupted,
-        "rejected": correct,
-        "h_w": 1,
-        "h_l": 0,
-        "source": "synthetic_inversion_eval",
-    }
-
-
-# ============================================================
-# Main
-# ============================================================
-
-
-async def main() -> None:
-    """Run synthetic generation for evaluation."""
-    target = 400
-
-    print(f"📥 Loading eval data → {INPUT_FILE}")
-    items = [json.loads(line) for line in INPUT_FILE.open("r", encoding="utf-8")]
-
-    clean_pairs = [x for x in items if x.get("h_w") == 0 and x.get("h_l") == 1]
-
-    selected = random.sample(clean_pairs, min(target, len(clean_pairs)))
-    print(f"🔎 Selected {len(selected)} items for corruption.")
-
-    semaphore = asyncio.Semaphore(20)
-    coros = [process_item(item, semaphore) for item in selected]
-
-    print("⚙️ Generating eval corruptions...")
-    results = await tqdm_asyncio.gather(*coros)
-    results = [r for r in results if r is not None]
-
-    print(f"💾 Saving {len(results)} examples → {OUTPUT_FILE}")
-    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
-        for r in results:
-            f.write(json.dumps(r, ensure_ascii=False) + "\n")
-
-    print("✅ Eval synthetic corruption generation complete.")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/src/aixpert/data_construction/data_synthetic_train.py b/src/aixpert/data_construction/data_synthetic_train.py
deleted file mode 100644
index b896f9a..0000000
--- a/src/aixpert/data_construction/data_synthetic_train.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""
-Generate synthetic corruption (hallucinated) responses for TRAIN split.
-
-This script:
-- Loads clean DPO-ready Skywork transformation for training.
-- Selects items where h_w=0 (winner factual) and h_l=1 (loser incorrect).
-- Asks GPT-4o-mini to rewrite the factual answer into a subtle hallucination.
-- Produces “inversion pairs” where corrupted is chosen and original is rejected.
-- Saves up to 10,000 synthetic hallucination samples.
-
-Fully compatible with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from decouple import Config, RepositoryEnv
-from openai import AsyncOpenAI
-from tqdm.asyncio import tqdm as tqdm_asyncio
-from utils import load_yaml
-
-
-# ============================================================
-# Load config.yaml + .env API key
-# ============================================================
-
-CONFIG = load_yaml("config.yaml")  # must contain: repository: <path>
-REPO_PATH = CONFIG["repository"]
-
-env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
-OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
-
-if not OPENAI_API_KEY:
-    raise RuntimeError("❌ OPENAI_API_KEY missing in .env under repository path.")
-
-client = AsyncOpenAI(api_key=OPENAI_API_KEY)
-
-# ============================================================
-# Directories
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-INPUT_FILE = DATA_DIR / "skywork_first_transformed_train.jsonl"
-OUTPUT_FILE = DATA_DIR / "synthetic_llm_inversion_train_10k.jsonl"
-
-# ============================================================
-# Corruption prompt
-# ============================================================
-
-SYSTEM_PROMPT = """
-You are a factual corruption generator.
-
-Given a question and its correct answer, produce an incorrect version
-that is fluent, confident, and well-structured but it must contain 2–3 factual inaccuracies.
-Avoid nonsense. Keep the tone similar to the original answer.
-"""
-
-
-def build_user_prompt(question: str, correct_answer: str) -> str:
-    """Construct the user prompt for hallucination generation."""
-    return f"""
-PROMPT:
-{question}
-
-CORRECT ANSWER:
-{correct_answer}
-
-TASK:
-Rewrite the answer so that it becomes factually wrong, introducing subtle hallucinations
-while sounding confident and coherent.
-"""
-
-
-# ============================================================
-# API for corrupted (hallucinated) answer
-# ============================================================
-
-
-async def generate_corruption(
-    question: str,
-    answer: str,
-    semaphore: asyncio.Semaphore,
-) -> Optional[str]:
-    """Generate a hallucinated version of the correct answer using GPT-4o-mini."""
-    user_prompt = build_user_prompt(question, answer)
-
-    async with semaphore:
-        for retry in range(5):
-            try:
-                resp = await client.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[
-                        {"role": "system", "content": SYSTEM_PROMPT},
-                        {"role": "user", "content": user_prompt},
-                    ],
-                    temperature=0.8,
-                )
-                return resp.choices[0].message.content.strip()
-
-            except Exception as exc:
-                print(f"Retry corruption ({retry}): {exc}")
-                await asyncio.sleep(1 + retry * 0.5)
-
-    return None
-
-
-# ============================================================
-# Process one item
-# ============================================================
-
-
-async def process_item(
-    item: Dict[str, Any],
-    semaphore: asyncio.Semaphore,
-) -> Optional[Dict[str, Any]]:
-    """Produce one synthetic inversion (corruption) DPO sample."""
-    prompt = item["prompt"]
-    correct_answer = item["chosen"]
-
-    corrupted = await generate_corruption(prompt, correct_answer, semaphore)
-
-    if corrupted is None:
-        return None
-
-    return {
-        "prompt": prompt,
-        "chosen": corrupted,  # hallucinated / corrupted
-        "rejected": correct_answer,  # original factual answer
-        "h_w": 1,  # corrupted = wrong
-        "h_l": 0,  # original = correct
-        "source": "synthetic_inversion",
-    }
-
-
-# ============================================================
-# Main
-# ============================================================
-
-
-async def main() -> None:
-    """Generate 10k synthetic corruption pairs and save JSONL output."""
-    target = 10_000
-    print(f"📥 Loading training dataset → {INPUT_FILE}")
-
-    items: List[Dict[str, Any]] = [
-        json.loads(line) for line in INPUT_FILE.open("r", encoding="utf-8")
-    ]
-
-    print("🔍 Selecting factual (0,1) pairs only...")
-    clean_pairs = [x for x in items if x["h_w"] == 0 and x["h_l"] == 1]
-
-    print(f"Available factual pairs: {len(clean_pairs)}")
-    selected = random.sample(clean_pairs, target)
-    print(f"🎯 Selected {len(selected)} items for corruption generation.")
-
-    semaphore = asyncio.Semaphore(20)
-
-    tasks = [process_item(item, semaphore) for item in selected]
-
-    print("⚙️ Generating corrupted answers...")
-    results = await tqdm_asyncio.gather(*tasks)
-
-    final_rows = [r for r in results if r is not None]
-
-    print(f"💾 Saving {len(final_rows)} synthetic samples → {OUTPUT_FILE}")
-    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
-        for row in final_rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-    print("✅ Synthetic corruption dataset created.")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/src/aixpert/data_construction/data_transform_eval.py b/src/aixpert/data_construction/data_transform_eval.py
deleted file mode 100644
index 48468f6..0000000
--- a/src/aixpert/data_construction/data_transform_eval.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""
-Transform binary factual-scored evaluation preference pairs into DPO-ready format.
-
-This script:
-- Loads binary factual results for eval pairs.
-- Converts response_0 / response_1 into (chosen, rejected) using the
-  better_response_id.
-- Copies factual flags into h_w (winner) and h_l (loser).
-- Preserves the original responses and adds a flipped=False flag.
-- Writes the DPO-ready JSONL file for evaluation.
-
-Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-from tqdm import tqdm
-
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-INPUT_PATH = DATA_DIR / "skywork_binary_factual_eval.jsonl"
-OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_eval.jsonl"
-
-
-def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
-    """Convert one binary factual-scored eval pair into DPO-ready structure."""
-    prompt = item["prompt"]
-    r0 = item["response_0"]
-    r1 = item["response_1"]
-    pref = int(item["better_response_id"])
-
-    # factual flags
-    h0 = int(item["h0"])
-    h1 = int(item["h1"])
-
-    if pref == 0:
-        chosen, rejected = r0, r1
-        h_w, h_l = h0, h1
-    else:
-        chosen, rejected = r1, r0
-        h_w, h_l = h1, h0
-
-    return {
-        "prompt": prompt,
-        "chosen": chosen,
-        "rejected": rejected,
-        "h_w": h_w,
-        "h_l": h_l,
-        "better_response_id": pref,
-        "response_0": r0,
-        "response_1": r1,
-        "flipped": False,
-    }
-
-
-def transform_dataset() -> None:
-    """Load eval dataset, apply transformation, and save JSONL output."""
-    print(f"📥 Loading eval data → {INPUT_PATH}")
-    items = [json.loads(line) for line in INPUT_PATH.open("r", encoding="utf-8")]
-
-    transformed: List[Dict[str, Any]] = []
-
-    print(f"⚙️ Processing {len(items)} items…")
-    for item in tqdm(items):
-        transformed.append(process_item(item))
-
-    print(f"💾 Saving output → {OUTPUT_PATH}")
-    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
-        for obj in transformed:
-            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
-
-    print("\n=======================================")
-    print("✔ EVAL DATASET TRANSFORMATION COMPLETE")
-    print("✔ NO SAFE-DPO FLIPS APPLIED")
-    print(f"Total items: {len(items)}")
-    print("=======================================\n")
-
-
-if __name__ == "__main__":
-    transform_dataset()
diff --git a/src/aixpert/data_construction/data_transform_train.py b/src/aixpert/data_construction/data_transform_train.py
deleted file mode 100644
index 7b65397..0000000
--- a/src/aixpert/data_construction/data_transform_train.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""
-Transform binary factual-scored training preference pairs into DPO-ready format.
-
-This script:
-- Loads binary factual results for training pairs.
-- Converts response_0 / response_1 into (chosen, rejected) strictly based on
-  better_response_id.
-- Copies factual flags into h_w (winner) and h_l (loser).
-- Preserves original responses and adds a flipped=False flag.
-- Writes the DPO-ready JSONL file for training.
-
-Fully compatible with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-from tqdm import tqdm
-
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-
-INPUT_PATH = DATA_DIR / "skywork_binary_factual_train.jsonl"
-OUTPUT_PATH = DATA_DIR / "skywork_first_transformed_train.jsonl"
-
-
-def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
-    """Convert one binary factual-scored pair into DPO-ready structure."""
-    prompt = item["prompt"]
-    r0 = item["response_0"]
-    r1 = item["response_1"]
-    pref = int(item["better_response_id"])
-
-    # factual/hallucination flags
-    h0 = int(item["h0"])
-    h1 = int(item["h1"])
-
-    # Determine chosen vs rejected based on preference label
-    if pref == 0:
-        chosen, rejected = r0, r1
-        h_w, h_l = h0, h1
-    else:
-        chosen, rejected = r1, r0
-        h_w, h_l = h1, h0
-
-    return {
-        "prompt": prompt,
-        "chosen": chosen,
-        "rejected": rejected,
-        "h_w": h_w,
-        "h_l": h_l,
-        "better_response_id": pref,
-        "response_0": r0,
-        "response_1": r1,
-        "flipped": False,
-    }
-
-
-def transform_dataset() -> None:
-    """Load training dataset, apply transformation, and save JSONL output."""
-    print(f"📥 Loading training data → {INPUT_PATH}")
-    items = [json.loads(line) for line in INPUT_PATH.open("r", encoding="utf-8")]
-
-    transformed: List[Dict[str, Any]] = []
-
-    print(f"⚙️ Processing {len(items)} items…")
-    for item in tqdm(items):
-        transformed.append(process_item(item))
-
-    print(f"💾 Saving output → {OUTPUT_PATH}")
-    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
-        for obj in transformed:
-            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
-
-    print("\n=======================================")
-    print("✔ TRAIN DATASET TRANSFORMATION COMPLETE")
-    print("✔ NO SAFE-DPO FLIPS APPLIED")
-    print(f"Total items: {len(items)}")
-    print("=======================================\n")
-
-
-if __name__ == "__main__":
-    transform_dataset()
diff --git a/src/aixpert/data_construction/dataconversion.py b/src/aixpert/data_construction/dataconversion.py
deleted file mode 100644
index 40a034d..0000000
--- a/src/aixpert/data_construction/dataconversion.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""
-Generate preference pairs from cleaned Skywork samples.
-
-This script loads prompt/chosen/rejected rows from a JSONL dataset, randomly
-assigns chosen/rejected responses into response_0 and response_1, assigns the
-correct better_response_id, and saves the resulting dataset in JSONL format.
-
-This version is fully compliant with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# ============================================================
-# Configuration
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-INPUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl"
-OUT_FILE = DATA_DIR / "skywork_preference_pairs_77k.jsonl"
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load a JSONL file into a list of dictionaries."""
-    rows: List[Dict[str, Any]] = []
-    with path.open("r", encoding="utf-8") as f:
-        for line in f:
-            rows.append(json.loads(line))
-    return rows
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dictionaries to a JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for row in rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Convert prompt/chosen/rejected rows into preference-pair format."""
-    output: List[Dict[str, Any]] = []
-
-    for item in data:
-        prompt = item.get("prompt", "")
-        chosen = item.get("chosen", "")
-        rejected = item.get("rejected", "")
-
-        if random.random() < 0.5:
-            response_0 = chosen
-            response_1 = rejected
-            better_response_id = 0
-        else:
-            response_0 = rejected
-            response_1 = chosen
-            better_response_id = 1
-
-        output.append(
-            {
-                "prompt": prompt,
-                "response_0": response_0,
-                "response_1": response_1,
-                "better_response_id": better_response_id,
-            }
-        )
-
-    return output
-
-
-def main() -> None:
-    """Generate evaluation preference pairs and save them to disk."""
-    print(f"📥 Loading dataset from → {INPUT_FILE}")
-
-    data = load_jsonl(INPUT_FILE)
-    print(f"📄 Loaded {len(data)} rows")
-
-    preference_pairs = create_preference_pairs(data)
-
-    write_jsonl(OUT_FILE, preference_pairs)
-
-    print("======================================")
-    print(f"✅ DONE! Saved preference pairs → {OUT_FILE}")
-    print(f"📦 Total pairs: {len(preference_pairs)}")
-    print("======================================")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/dataconversion_eval.py b/src/aixpert/data_construction/dataconversion_eval.py
deleted file mode 100644
index 6f6525d..0000000
--- a/src/aixpert/data_construction/dataconversion_eval.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""
-Generate evaluation preference pairs from cleaned Skywork samples.
-
-This script loads prompt/chosen/rejected rows from the evaluation JSONL dataset,
-randomly assigns chosen/rejected responses into response_0 and response_1,
-assigns the correct better_response_id, and saves the resulting dataset in JSONL format.
-
-It mirrors the training script but operates on the evaluation split only.
-"""
-
-from __future__ import annotations
-
-import json
-import random
-from pathlib import Path
-from typing import Any, Dict, List
-
-
-# ============================================================
-# Configuration
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-# Input/output file names for evaluation set
-INPUT_FILE = DATA_DIR / "skywork_extracted_eval.jsonl"
-OUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl"
-
-
-def load_jsonl(path: Path) -> List[Dict[str, Any]]:
-    """Load a JSONL file into a list of dictionaries."""
-    rows: List[Dict[str, Any]] = []
-    with path.open("r", encoding="utf-8") as f:
-        for line in f:
-            rows.append(json.loads(line))
-    return rows
-
-
-def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dictionaries to a JSONL file."""
-    with path.open("w", encoding="utf-8") as f:
-        for row in rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Convert prompt/chosen/rejected rows into preference-pair format."""
-    output: List[Dict[str, Any]] = []
-
-    for item in data:
-        prompt = item.get("prompt", "")
-        chosen = item.get("chosen", "")
-        rejected = item.get("rejected", "")
-
-        # Random assignment
-        if random.random() < 0.5:
-            response_0 = chosen
-            response_1 = rejected
-            better_response_id = 0
-        else:
-            response_0 = rejected
-            response_1 = chosen
-            better_response_id = 1
-
-        output.append(
-            {
-                "prompt": prompt,
-                "response_0": response_0,
-                "response_1": response_1,
-                "better_response_id": better_response_id,
-            }
-        )
-
-    return output
-
-
-def main() -> None:
-    """Generate evaluation preference pairs and save them to disk."""
-    print(f"📥 Loading evaluation dataset from → {INPUT_FILE}")
-
-    data = load_jsonl(INPUT_FILE)
-    print(f"📄 Loaded {len(data)} rows")
-
-    preference_pairs = create_preference_pairs(data)
-
-    write_jsonl(OUT_FILE, preference_pairs)
-
-    print("======================================")
-    print(f"✅ DONE! Saved evaluation preference pairs → {OUT_FILE}")
-    print(f"📦 Total eval pairs: {len(preference_pairs)}")
-    print("======================================")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/aixpert/data_construction/dataextraction.py b/src/aixpert/data_construction/dataextraction.py
deleted file mode 100644
index 15ea172..0000000
--- a/src/aixpert/data_construction/dataextraction.py
+++ /dev/null
@@ -1,136 +0,0 @@
-"""
-Skywork extraction utilities.
-
-This module extracts prompt/chosen/rejected fields from the Skywork Preference
-dataset, removes exact duplicates, and writes the cleaned dataset to JSONL
-files. Fully compatible with ruff, mypy, and the AI Engineering template.
-"""
-
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Union
-
-import pandas as pd
-from datasets import load_dataset
-
-
-# Path to: src/aixpert/data_construction/data/
-DATA_DIR = Path(__file__).resolve().parent / "data"
-DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-
-SUBSET_SIZE = 80000
-OUT_FILE = DATA_DIR / "skywork_extracted_77k.jsonl"
-REMOVED_FILE = DATA_DIR / "skywork_cleaned_77k.jsonl"
-
-print(f"📥 Loading first {SUBSET_SIZE} samples from Skywork...")
-
-
-# ============================================================
-# Dataset loading
-# ============================================================
-ds = load_dataset(
-    "Skywork/Skywork-Reward-Preference-80K-v0.1",
-    split=f"train[:{SUBSET_SIZE}]",
-)
-
-df = ds.to_pandas()
-
-
-# ============================================================
-# Extract prompt / chosen / rejected
-# ============================================================
-def extract_prompt_from_dialog(dialog: List[Dict[str, Any]]) -> str:
-    """
-    Extract the first user message from a dialog.
-
-    Parameters
-    ----------
-    dialog : list of dict
-        A list of message objects with "role" and "content" keys.
-
-    Returns
-    -------
-    str
-        The content of the first message with role 'user', or an empty string.
-    """
-    for msg in dialog:
-        if msg.get("role") == "user":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-def extract_answer_from_dialog(dialog: List[Dict[str, Any]]) -> str:
-    """
-    Extract the first assistant message from a dialog.
-
-    Parameters
-    ----------
-    dialog : list of dict
-        A list of message objects with "role" and "content" keys.
-
-    Returns
-    -------
-    str
-        The content of the first message with role 'assistant', or an empty string.
-    """
-    for msg in dialog:
-        if msg.get("role") == "assistant":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-df["prompt"] = df["chosen"].apply(extract_prompt_from_dialog)
-df["chosen"] = df["chosen"].apply(extract_answer_from_dialog)
-df["rejected"] = df["rejected"].apply(extract_answer_from_dialog)
-
-clean_df = df[["prompt", "chosen", "rejected"]]
-
-# ============================================================
-# 🔍 Exact-match removal (chosen == rejected)
-# ============================================================
-cleaned: List[Dict[str, str]] = []
-removed: List[Dict[str, str]] = []
-
-for _, row in clean_df.iterrows():
-    chosen = str(row["chosen"]).strip()
-    rejected = str(row["rejected"]).strip()
-
-    sample = {
-        "prompt": str(row["prompt"]).strip(),
-        "chosen": chosen,
-        "rejected": rejected,
-    }
-
-    if chosen == rejected:
-        removed.append(sample)
-    else:
-        cleaned.append(sample)
-
-print(f"🧹 Removed exact duplicates: {len(removed)}")
-print(f"📦 Remaining clean samples: {len(cleaned)}")
-
-# Ensure output directory exists
-os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
-
-
-# ============================================================
-# Save output JSONL files
-# ============================================================
-def write_jsonl(path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
-    """Write a list of dictionaries to a JSONL file."""
-    with open(str(path), "w", encoding="utf-8") as f:
-        for row in rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-write_jsonl(OUT_FILE, cleaned)
-write_jsonl(REMOVED_FILE, removed)
-
-print(f"✅ Saved cleaned dataset ({len(cleaned)} samples) → {OUT_FILE}")
-print(f"🗑️  Saved removed duplicates ({len(removed)} samples) → {REMOVED_FILE}")
-
-print(pd.DataFrame(cleaned).head())
diff --git a/src/aixpert/data_construction/dataextraction_eval.py b/src/aixpert/data_construction/dataextraction_eval.py
deleted file mode 100644
index 1595767..0000000
--- a/src/aixpert/data_construction/dataextraction_eval.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""
-Extract the evaluation slice of the Skywork preference dataset.
-
-This script extracts rows 80001–81000, removes exact duplicates,
-and saves the cleaned dataset into JSONL files under the local data folder.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-from datasets import load_dataset
-
-
-# ============================================================
-# Helpers
-# ============================================================
-
-
-def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
-    """Extract the first user message."""
-    for msg in dialog:
-        if msg.get("role") == "user":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-def extract_answer(dialog: List[Dict[str, Any]]) -> str:
-    """Extract the first assistant message."""
-    for msg in dialog:
-        if msg.get("role") == "assistant":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write rows to a JSONL file."""
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("w", encoding="utf-8") as f:
-        for row in rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# Constants for eval split
-# ============================================================
-
-START = 80001
-END = 81000  # inclusive
-
-print(f"📥 Loading eval slice: {START} → {END}")
-
-ds = load_dataset(
-    "Skywork/Skywork-Reward-Preference-80K-v0.1",
-    split=f"train[{START}:{END + 1}]",
-)
-
-df = ds.to_pandas()
-
-df["prompt"] = df["chosen"].apply(extract_prompt)
-df["chosen"] = df["chosen"].apply(extract_answer)
-df["rejected"] = df["rejected"].apply(extract_answer)
-
-clean_df = df[["prompt", "chosen", "rejected"]]
-
-cleaned: List[Dict[str, str]] = []
-removed: List[Dict[str, str]] = []
-
-for _, row in clean_df.iterrows():
-    chosen = row["chosen"].strip()
-    rejected = row["rejected"].strip()
-
-    record = {
-        "prompt": row["prompt"].strip(),
-        "chosen": chosen,
-        "rejected": rejected,
-    }
-
-    if chosen == rejected:
-        removed.append(record)
-    else:
-        cleaned.append(record)
-
-print(f"🧹 Removed duplicates: {len(removed)}")
-print(f"📦 Clean samples: {len(cleaned)}")
-
-# Save outputs
-data_dir = Path(__file__).resolve().parent / "data"
-save_jsonl(data_dir / "skywork_extracted_eval.jsonl", cleaned)
-save_jsonl(data_dir / "skywork_eval_removed.jsonl", removed)
-
-print("✅ Saved eval dataset → skywork_eval.jsonl")
diff --git a/src/aixpert/data_construction/dataextraction_eval2.py b/src/aixpert/data_construction/dataextraction_eval2.py
deleted file mode 100644
index 83fb701..0000000
--- a/src/aixpert/data_construction/dataextraction_eval2.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""
-Extract the test slice of the Skywork preference dataset.
-
-This script extracts rows 81001–81500, removes exact duplicates,
-and saves the cleaned dataset into JSONL files under the local data folder.
-"""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict, List
-
-from datasets import load_dataset
-
-
-# ============================================================
-# Helpers
-# ============================================================
-
-
-def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
-    """Extract the first user message."""
-    for msg in dialog:
-        if msg.get("role") == "user":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-def extract_answer(dialog: List[Dict[str, Any]]) -> str:
-    """Extract the first assistant message."""
-    for msg in dialog:
-        if msg.get("role") == "assistant":
-            return str(msg.get("content", "")).strip()
-    return ""
-
-
-def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
-    """Write rows to a JSONL file."""
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("w", encoding="utf-8") as f:
-        for row in rows:
-            f.write(json.dumps(row, ensure_ascii=False) + "\n")
-
-
-# ============================================================
-# Constants for test split
-# ============================================================
-
-START = 81001
-END = 81500  # inclusive
-
-print(f"📥 Loading test slice: {START} → {END}")
-
-ds = load_dataset(
-    "Skywork/Skywork-Reward-Preference-80K-v0.1",
-    split=f"train[{START}:{END + 1}]",
-)
-
-df = ds.to_pandas()
-
-df["prompt"] = df["chosen"].apply(extract_prompt)
-df["chosen"] = df["chosen"].apply(extract_answer)
-df["rejected"] = df["rejected"].apply(extract_answer)
-
-clean_df = df[["prompt", "chosen", "rejected"]]
-
-cleaned: List[Dict[str, str]] = []
-removed: List[Dict[str, str]] = []
-
-for _, row in clean_df.iterrows():
-    chosen = row["chosen"].strip()
-    rejected = row["rejected"].strip()
-
-    record = {
-        "prompt": row["prompt"].strip(),
-        "chosen": chosen,
-        "rejected": rejected,
-    }
-
-    if chosen == rejected:
-        removed.append(record)
-    else:
-        cleaned.append(record)
-
-print(f"🧹 Removed duplicates: {len(removed)}")
-print(f"📦 Clean samples: {len(cleaned)}")
-
-# Save outputs
-data_dir = Path(__file__).resolve().parent / "data"
-save_jsonl(data_dir / "skywork_extracted_test.jsonl", cleaned)
-save_jsonl(data_dir / "skywork_test_removed.jsonl", removed)
-
-print("✅ Saved test dataset → skywork_test.jsonl")
diff --git a/src/aixpert/data_construction/dataset_eval.py b/src/aixpert/data_construction/dataset_eval.py
deleted file mode 100644
index faa9d0c..0000000
--- a/src/aixpert/data_construction/dataset_eval.py
+++ /dev/null
@@ -1,215 +0,0 @@
-"""
-Generate binary factuality labels (0=factual, 1=hallucinated) for test preference pairs.
-
-This script loads test preference pairs, evaluates each response using a
-strict PKU-style binary factual judge. It then saves the dataset with:
-
-    factual_flag_0, factual_flag_1
-    h0, h1  (aliases for convenience)
-
-It supports automatic resume, concurrency throttling, and checkpointing.
-Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
-
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import os
-import re
-from pathlib import Path
-from typing import Any, Dict, List
-
-from decouple import Config, RepositoryEnv
-from openai import AsyncOpenAI
-from tqdm.asyncio import tqdm_asyncio
-from utils import load_yaml
-
-
-# ============================================================
-# Load config.yaml + .env API key
-# ============================================================
-
-CONFIG = load_yaml("config.yaml")  # MUST contain: repository: <path>
-REPO_PATH = CONFIG["repository"]
-
-env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
-OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
-
-if not OPENAI_API_KEY:
-    raise RuntimeError("❌ OPENAI_API_KEY not found in .env under repository path.")
-
-client = AsyncOpenAI(api_key=OPENAI_API_KEY)
-
-
-# ============================================================
-# Configuration & Directories
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-INPUT_FILE = DATA_DIR / "skywork_preference_pairs_eval.jsonl"
-OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_eval.jsonl"
-
-# ============================================================
-# Factual Judge Prompt
-# ============================================================
-
-BINARY_FACTUAL_JUDGE_PROMPT = """
-System:
-You are an extremely strict factuality judge. Your task is to determine whether the assistant’s
-response contains ANY of the following:
-
-• factual errors
-• hallucinations
-• unsupported claims
-• unverifiable statements
-• fabricated details
-• incorrect technical descriptions
-• invented studies, names, or entities
-• logical contradictions
-• vague or misleading explanations
-• partial answers that omit critical facts
-
-IMPORTANT:
-If the response is vague, incomplete, unsupported, or unverifiable,
-classify it as [[1]].
-
-Definitions:
-[[0]] → Fully factual, precise, verifiable.
-[[1]] → ANY issue: vague, incorrect, unverifiable, misleading, fabricated.
-
-Return ONLY:
-[[0]] or [[1]]
-
-User Question:
-{question}
-
-Assistant Response:
-{answer}
-"""
-
-
-# ============================================================
-# API Call — Get Binary Factual Flag
-# ============================================================
-
-
-async def get_factual_flag(
-    question: str, answer: str, semaphore: asyncio.Semaphore
-) -> int:
-    """Evaluate a response for factual correctness (0=factual, 1=hallucinated)."""
-    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
-
-    async with semaphore:
-        for retry in range(5):
-            try:
-                response = await client.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=0,
-                )
-
-                text = response.choices[0].message.content.strip()
-                match = re.search(r"\[\[(0|1)\]\]", text)
-                if match:
-                    return int(match.group(1))
-
-                return 1  # default: hallucinated
-
-            except Exception as exc:
-                print(f"Retry factual-flag ({retry}): {exc}")
-                await asyncio.sleep(1 + 0.5 * retry)
-
-    return 1
-
-
-# ============================================================
-# Process One Item
-# ============================================================
-
-
-async def process_single_item(
-    item: Dict[str, Any], semaphore: asyncio.Semaphore
-) -> Dict[str, Any]:
-    """Process one preference pair and produce binary factual labels."""
-    prompt = item["prompt"]
-    r0 = item["response_0"]
-    r1 = item["response_1"]
-
-    f0_task = asyncio.create_task(get_factual_flag(prompt, r0, semaphore))
-    f1_task = asyncio.create_task(get_factual_flag(prompt, r1, semaphore))
-
-    f0 = await f0_task
-    f1 = await f1_task
-
-    return {
-        **item,
-        "factual_flag_0": f0,
-        "factual_flag_1": f1,
-        "h0": f0,
-        "h1": f1,
-    }
-
-
-# ============================================================
-# Main Async Pipeline
-# ============================================================
-
-
-async def process_dataset() -> None:
-    """Load test dataset, compute factual flags, resume if needed, and save output."""
-    print(f"📥 Loading test dataset → {INPUT_FILE}")
-
-    with INPUT_FILE.open("r", encoding="utf-8") as f:
-        items = [json.loads(line) for line in f]
-
-    # Resume mode
-    processed_count = 0
-    if OUTPUT_FILE.exists():
-        print("♻️ Resuming previous run...")
-        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
-            processed_count = sum(1 for _ in f)
-        print(f"Found {processed_count} completed items.")
-
-    remaining = items[processed_count:]
-    semaphore = asyncio.Semaphore(25)
-
-    tasks = [
-        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
-    ]
-
-    buffer: List[str] = []
-    count = processed_count
-
-    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
-        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
-            result = await coro
-            buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
-            count += 1
-
-            if len(buffer) >= 25:
-                f.writelines(buffer)
-                f.flush()
-                os.fsync(f.fileno())
-                buffer.clear()
-                print(f"Checkpoint saved ({count} items).")
-
-        # Flush final buffer
-        if buffer:
-            f.writelines(buffer)
-            f.flush()
-            os.fsync(f.fileno())
-            print(f"Final checkpoint saved ({count} items).")
-
-    print("✅ Completed test factual evaluation.")
-
-
-# ============================================================
-# Entry Point
-# ============================================================
-
-if __name__ == "__main__":
-    asyncio.run(process_dataset())
diff --git a/src/aixpert/data_construction/dataset_train.py b/src/aixpert/data_construction/dataset_train.py
deleted file mode 100644
index f24b773..0000000
--- a/src/aixpert/data_construction/dataset_train.py
+++ /dev/null
@@ -1,216 +0,0 @@
-"""
-Generate binary factuality labels for evaluation preference pairs using GPT-4o-mini.
-
-This script loads evaluation preference pairs, evaluates each response using
-a strict PKU-style binary factual judge, and saves the resulting dataset with:
-
-    factual_flag_0, factual_flag_1
-    h0, h1  (aliases for convenience)
-
-It supports automatic resume, concurrency throttling, and checkpointing.
-Fully compliant with ruff, ruff-format, pydocstyle, and mypy.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import os
-import re
-from pathlib import Path
-from typing import Any, Dict, List
-
-from decouple import Config, RepositoryEnv
-from openai import AsyncOpenAI
-from tqdm.asyncio import tqdm_asyncio
-from utils import load_yaml  # your YAML loader
-
-
-# ============================================================
-# Load OpenAI API key
-# ============================================================
-
-
-YAML_CONFIG = load_yaml("config.yaml")
-REPO_PATH = YAML_CONFIG["repository"]
-
-# Load .env using decouple (the exact method you asked for)
-env = Config(RepositoryEnv(f"{REPO_PATH}/.env"))
-OPENAI_API_KEY = env("OPENAI_API_KEY", default=False)
-
-if not OPENAI_API_KEY:
-    raise ValueError("❌ OPENAI_API_KEY not found in .env file!")
-
-client = AsyncOpenAI(api_key=OPENAI_API_KEY)
-
-
-# ============================================================
-# Configuration & Directories
-# ============================================================
-
-DATA_DIR = Path(__file__).resolve().parent / "data"
-DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-INPUT_FILE = DATA_DIR / "skywork_preference_pairs_train.jsonl"
-OUTPUT_FILE = DATA_DIR / "skywork_binary_factual_train.jsonl"
-
-# ============================================================
-# Factual Judge Prompt
-# ============================================================
-
-BINARY_FACTUAL_JUDGE_PROMPT = """
-System:
-You are an extremely strict factuality judge. Your task is to determine whether the assistant’s
-response contains ANY of the following:
-
-• factual errors
-• hallucinations
-• unsupported claims
-• unverifiable statements
-• fabricated details
-• incorrect technical descriptions
-• invented studies, names, or entities
-• logical contradictions
-• vague or misleading explanations
-• partial answers that omit critical facts
-
-IMPORTANT:
-If the response is vague, incomplete, unsupported, or unverifiable,
-classify it as [[1]].
-
-Definitions:
-[[0]] → Fully factual, precise, verifiable.
-[[1]] → ANY issue: vague, incorrect, unverifiable, misleading, fabricated.
-
-Return ONLY:
-[[0]] or [[1]]
-
-User Question:
-{question}
-
-Assistant Response:
-{answer}
-"""
-
-
-# ============================================================
-# API Call — Get Binary Factual Flag
-# ============================================================
-
-
-async def get_factual_flag(
-    question: str, answer: str, semaphore: asyncio.Semaphore
-) -> int:
-    """Evaluate a response for factual correctness (0=factual, 1=hallucinated)."""
-    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
-
-    async with semaphore:
-        for retry in range(5):
-            try:
-                response = await client.chat.completions.create(
-                    model="gpt-4o-mini",
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=0,
-                )
-
-                text = response.choices[0].message.content.strip()
-                match = re.search(r"\[\[(0|1)\]\]", text)
-                if match:
-                    return int(match.group(1))
-
-                return 1  # default: hallucinated
-
-            except Exception as exc:
-                print(f"Retry factual-flag ({retry}): {exc}")
-                await asyncio.sleep(1 + 0.5 * retry)
-
-    return 1
-
-
-# ============================================================
-# Process One Item
-# ============================================================
-
-
-async def process_single_item(
-    item: Dict[str, Any], semaphore: asyncio.Semaphore
-) -> Dict[str, Any]:
-    """Process one preference pair and produce binary factual labels."""
-    prompt = item["prompt"]
-    r0 = item["response_0"]
-    r1 = item["response_1"]
-
-    f0_task = asyncio.create_task(get_factual_flag(prompt, r0, semaphore))
-    f1_task = asyncio.create_task(get_factual_flag(prompt, r1, semaphore))
-
-    f0 = await f0_task
-    f1 = await f1_task
-
-    return {
-        **item,
-        "factual_flag_0": f0,
-        "factual_flag_1": f1,
-        "h0": f0,
-        "h1": f1,
-    }
-
-
-# ============================================================
-# Main Async Pipeline
-# ============================================================
-
-
-async def process_dataset() -> None:
-    """Load eval dataset, compute factual flags, resume if needed, and save output."""
-    print(f"📥 Loading eval dataset → {INPUT_FILE}")
-
-    with INPUT_FILE.open("r", encoding="utf-8") as f:
-        items = [json.loads(line) for line in f]
-
-    # Resume mode
-    processed_count = 0
-    if OUTPUT_FILE.exists():
-        print("♻️ Resuming previous run...")
-        with OUTPUT_FILE.open("r", encoding="utf-8") as f:
-            processed_count = sum(1 for _ in f)
-        print(f"Found {processed_count} completed items.")
-
-    remaining = items[processed_count:]
-    semaphore = asyncio.Semaphore(25)
-
-    tasks = [
-        asyncio.create_task(process_single_item(item, semaphore)) for item in remaining
-    ]
-
-    buffer: List[str] = []
-    count = processed_count
-
-    with OUTPUT_FILE.open("a", encoding="utf-8") as f:
-        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
-            result = await coro
-            buffer.append(json.dumps(result, ensure_ascii=False) + "\n")
-            count += 1
-
-            if len(buffer) >= 25:
-                f.writelines(buffer)
-                f.flush()
-                os.fsync(f.fileno())
-                buffer.clear()
-                print(f"Checkpoint saved ({count} items).")
-
-        # Flush remaining
-        if buffer:
-            f.writelines(buffer)
-            f.flush()
-            os.fsync(f.fileno())
-            print(f"Final checkpoint saved ({count} items).")
-
-    print("✅ Completed factual evaluation.")
-
-
-# ============================================================
-# Entry Point
-# ============================================================
-
-if __name__ == "__main__":
-    asyncio.run(process_dataset())
diff --git a/src/aixpert/data_construction/stage_1_extraction/dataextraction_eval.py b/src/aixpert/data_construction/stage_1_extraction/dataextraction_eval.py
new file mode 100644
index 0000000..0240f34
--- /dev/null
+++ b/src/aixpert/data_construction/stage_1_extraction/dataextraction_eval.py
@@ -0,0 +1,60 @@
+"""
+Extract the test slice of the Skywork preference dataset.
+
+The inclusive row range is read from ``hyperparams.test_start`` and
+``hyperparams.test_end`` in the config (81001–81500 in the earlier hard-coded
+script). Exact duplicates are removed and both the cleaned and the removed rows
+are saved as JSONL files at the configured paths.
+Only the prompts from this test set will be used in evaluation.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from datasets import load_dataset
+from utils.config_loader import load_config
+from utils.data_utils import (
+    extract_answer,
+    extract_prompt,
+    filter_duplicates,
+    save_jsonl,
+)
+
+
+def main() -> None:
+    """Extract the configured test rows and write cleaned/removed JSONL files."""
+    config = load_config()
+    hyperparams = config["hyperparams"]
+    locations = config["paths"]
+
+    first_row = hyperparams["test_start"]
+    last_row = hyperparams["test_end"]
+
+    print(f"Extracting test slice {first_row} → {last_row}")
+
+    # ``test_end`` is inclusive, so the slice's exclusive stop is one past it.
+    dataset = load_dataset(
+        locations["skywork_file"],
+        split=f"train[{first_row}:{last_row + 1}]",
+    )
+    frame = dataset.to_pandas()
+
+    # Derive the prompt first: "chosen" is overwritten by extract_answer below.
+    frame["prompt"] = frame["chosen"].apply(extract_prompt)
+    for column in ("chosen", "rejected"):
+        frame[column] = frame[column].apply(extract_answer)
+
+    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
+    kept, dropped = filter_duplicates(records)
+
+    save_jsonl(Path(locations["skywork_test_cleaned"]), kept)
+    save_jsonl(Path(locations["skywork_test_removed"]), dropped)
+
+    print(f"Removed duplicates: {len(dropped)}")
+    print(f"Clean samples: {len(kept)}")
+    print("Test extraction completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py b/src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py
new file mode 100644
index 0000000..9ce178a
--- /dev/null
+++ b/src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py
@@ -0,0 +1,57 @@
+"""
+Extract the training subset of the Skywork preference dataset.
+
+The first ``hyperparams.subset_size`` rows of the train split are loaded, split
+into prompt/chosen/rejected fields, and stripped of exact duplicates. The
+cleaned rows and the removed rows are written to separate JSONL files at the
+configured paths.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from datasets import load_dataset
+from utils.config_loader import load_config
+from utils.data_utils import (
+    extract_answer,
+    extract_prompt,
+    filter_duplicates,
+    save_jsonl,
+)
+
+
+def main() -> None:
+    """Extract the configured training rows and write cleaned/removed JSONL files."""
+    config = load_config()
+    hyperparams = config["hyperparams"]
+    locations = config["paths"]
+
+    row_limit = hyperparams["subset_size"]
+
+    print(f"Loading first {row_limit} samples from Skywork...")
+
+    dataset = load_dataset(
+        locations["skywork_file"],
+        split=f"train[:{row_limit}]",
+    )
+    frame = dataset.to_pandas()
+
+    # Derive the prompt first: "chosen" is overwritten by extract_answer below.
+    frame["prompt"] = frame["chosen"].apply(extract_prompt)
+    for column in ("chosen", "rejected"):
+        frame[column] = frame[column].apply(extract_answer)
+
+    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
+    kept, dropped = filter_duplicates(records)
+
+    save_jsonl(Path(locations["skywork_train_cleaned"]), kept)
+    save_jsonl(Path(locations["skywork_train_removed"]), dropped)
+
+    print(f"Removed exact duplicates: {len(dropped)}")
+    print(f"Clean samples: {len(kept)}")
+    print("Training extraction completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_1_extraction/dataextraction_val.py b/src/aixpert/data_construction/stage_1_extraction/dataextraction_val.py
new file mode 100644
index 0000000..0918298
--- /dev/null
+++ b/src/aixpert/data_construction/stage_1_extraction/dataextraction_val.py
@@ -0,0 +1,59 @@
+"""
+Extract the evaluation slice of the Skywork preference dataset.
+
+The inclusive row range is read from ``hyperparams.eval_start`` and
+``hyperparams.eval_end`` in the config (80001–81000 in the earlier hard-coded
+script). Exact duplicates are removed and both the cleaned and the removed rows
+are saved as JSONL files at the configured paths.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from datasets import load_dataset
+from utils.config_loader import load_config
+from utils.data_utils import (
+    extract_answer,
+    extract_prompt,
+    filter_duplicates,
+    save_jsonl,
+)
+
+
+def main() -> None:
+    """Extract the configured eval rows and write cleaned/removed JSONL files."""
+    config = load_config()
+    hyperparams = config["hyperparams"]
+    locations = config["paths"]
+
+    first_row = hyperparams["eval_start"]
+    last_row = hyperparams["eval_end"]
+
+    print(f"Extracting eval slice {first_row} → {last_row}")
+
+    # ``eval_end`` is inclusive, so the slice's exclusive stop is one past it.
+    dataset = load_dataset(
+        locations["skywork_file"],
+        split=f"train[{first_row}:{last_row + 1}]",
+    )
+    frame = dataset.to_pandas()
+
+    # Derive the prompt first: "chosen" is overwritten by extract_answer below.
+    frame["prompt"] = frame["chosen"].apply(extract_prompt)
+    for column in ("chosen", "rejected"):
+        frame[column] = frame[column].apply(extract_answer)
+
+    records = frame[["prompt", "chosen", "rejected"]].to_dict(orient="records")
+    kept, dropped = filter_duplicates(records)
+
+    save_jsonl(Path(locations["skywork_eval_cleaned"]), kept)
+    save_jsonl(Path(locations["skywork_eval_removed"]), dropped)
+
+    print(f"Removed duplicates: {len(dropped)}")
+    print(f"Clean samples: {len(kept)}")
+    print("Eval extraction completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py b/src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py
new file mode 100644
index 0000000..65c89e4
--- /dev/null
+++ b/src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py
@@ -0,0 +1,46 @@
+"""
+Generate training preference pairs from cleaned Skywork samples.
+
+Loads the cleaned prompt/chosen/rejected training rows, hands them to
+``create_preference_pairs`` to build random response_0/response_1 pairs with a
+better_response_id, and writes the result as JSONL.
+
+Input and output locations come from the shared config loader.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.data_utils import (
+    create_preference_pairs,
+    load_jsonl,
+    write_jsonl,
+)
+
+
+def main() -> None:
+    """Build preference pairs from the cleaned training rows and save them."""
+    locations = load_config()["paths"]
+
+    source = Path(locations["skywork_train_cleaned"])
+    destination = Path(locations["skywork_train_pairs"])
+
+    print(f"Loading training dataset → {source}")
+
+    rows = load_jsonl(source)
+    print(f"Loaded {len(rows)} rows")
+
+    pairs = create_preference_pairs(rows)
+    write_jsonl(destination, pairs)
+
+    divider = "======================================"
+    print(divider)
+    print(f"Training preference pairs saved → {destination}")
+    print(f"Total pairs: {len(pairs)}")
+    print(divider)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_2_conversion/dataconversion_val.py b/src/aixpert/data_construction/stage_2_conversion/dataconversion_val.py
new file mode 100644
index 0000000..e3d7672
--- /dev/null
+++ b/src/aixpert/data_construction/stage_2_conversion/dataconversion_val.py
@@ -0,0 +1,46 @@
+"""
+Generate evaluation preference pairs from cleaned Skywork samples.
+
+Loads the cleaned prompt/chosen/rejected rows for the eval slice, hands them
+to ``create_preference_pairs`` to build random response_0/response_1 pairs with
+a better_response_id, and writes the result as JSONL.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+# NOTE(review): the other stage scripts import these helpers from ``utils.*``;
+# confirm that the ``aixpert.utils.*`` import root is intended here.
+from aixpert.utils.config_loader import load_config
+from aixpert.utils.data_utils import (
+    create_preference_pairs,
+    load_jsonl,
+    write_jsonl,
+)
+
+
+def main() -> None:
+    """Build preference pairs from the cleaned evaluation rows and save them."""
+    locations = load_config()["paths"]
+
+    source = Path(locations["skywork_eval_cleaned"])
+    destination = Path(locations["skywork_eval_pairs"])
+
+    print(f"Loading evaluation dataset → {source}")
+
+    rows = load_jsonl(source)
+    print(f"Loaded {len(rows)} rows")
+
+    pairs = create_preference_pairs(rows)
+    write_jsonl(destination, pairs)
+
+    divider = "======================================"
+    print(divider)
+    print(f"Eval preference pairs saved → {destination}")
+    print(f"Total eval pairs: {len(pairs)}")
+    print(divider)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_3_factuality/dataset_train.py b/src/aixpert/data_construction/stage_3_factuality/dataset_train.py
new file mode 100644
index 0000000..1557586
--- /dev/null
+++ b/src/aixpert/data_construction/stage_3_factuality/dataset_train.py
@@ -0,0 +1,63 @@
+"""Run binary factuality evaluation on training preference pairs."""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from decouple import Config, RepositoryEnv
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl
+from utils.factual_utils import (
+    factual_evaluation_pipeline,
+    get_client,
+)
+
+
+async def main() -> None:
+    """
+    Execute factuality evaluation for the training set.
+
+    Resolves ``OPENAI_API_KEY`` via python-decouple using ``<repository>/.env``
+    and forwards the model name, concurrency limit, and retry budget from the
+    config to ``factual_evaluation_pipeline``.
+
+    Raises:
+        RuntimeError: If no ``OPENAI_API_KEY`` value could be resolved.
+    """
+    config = load_config()
+
+    repo_root = config["repository"]
+    locations = config["paths"]
+    hyperparams = config["hyperparams"]
+
+    settings = Config(RepositoryEnv(f"{repo_root}/.env"))
+    api_key = settings("OPENAI_API_KEY", default=None)
+    if not api_key:
+        raise RuntimeError("Missing OPENAI_API_KEY in .env")
+
+    client = get_client(api_key)
+
+    pairs_file = Path(locations["skywork_train_pairs"])
+    scored_file = Path(locations["skywork_train_factual"])
+
+    pairs = load_jsonl(pairs_file)
+
+    model_name = config["model"]["name"]
+    concurrency = hyperparams["concurrency_limit"]
+    retry_budget = hyperparams["max_retries"]
+
+    await factual_evaluation_pipeline(
+        client=client,
+        items=pairs,
+        output_file=scored_file,
+        model=model_name,
+        concurrency=concurrency,
+        max_retries=retry_budget,
+    )
+
+    print("Completed factual evaluation for training set.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/stage_3_factuality/dataset_val.py b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py
new file mode 100644
index 0000000..4e306f6
--- /dev/null
+++ b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py
@@ -0,0 +1,63 @@
+"""Run binary factuality evaluation on evaluation preference pairs."""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from decouple import Config, RepositoryEnv
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl
+from utils.factual_utils import (
+    factual_evaluation_pipeline,
+    get_client,
+)
+
+
+async def main() -> None:
+    """
+    Execute factuality evaluation for the validation set.
+
+    Resolves ``OPENAI_API_KEY`` via python-decouple using ``<repository>/.env``
+    and forwards the model name, concurrency limit, and retry budget from the
+    config to ``factual_evaluation_pipeline``.
+
+    Raises:
+        RuntimeError: If no ``OPENAI_API_KEY`` value could be resolved.
+    """
+    config = load_config()
+
+    repo_root = config["repository"]
+    locations = config["paths"]
+    hyperparams = config["hyperparams"]
+
+    settings = Config(RepositoryEnv(f"{repo_root}/.env"))
+    api_key = settings("OPENAI_API_KEY", default=None)
+    if not api_key:
+        raise RuntimeError("Missing OPENAI_API_KEY in .env")
+
+    client = get_client(api_key)
+
+    pairs_file = Path(locations["skywork_eval_pairs"])
+    scored_file = Path(locations["skywork_eval_factual"])
+
+    pairs = load_jsonl(pairs_file)
+
+    model_name = config["model"]["name"]
+    concurrency = hyperparams["concurrency_limit"]
+    retry_budget = hyperparams["max_retries"]
+
+    await factual_evaluation_pipeline(
+        client=client,
+        items=pairs,
+        output_file=scored_file,
+        model=model_name,
+        concurrency=concurrency,
+        max_retries=retry_budget,
+    )
+
+    print("Completed factual evaluation for evaluation set.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/stage_4_transformation/data_transform_train.py b/src/aixpert/data_construction/stage_4_transformation/data_transform_train.py
new file mode 100644
index 0000000..d0f692e
--- /dev/null
+++ b/src/aixpert/data_construction/stage_4_transformation/data_transform_train.py
@@ -0,0 +1,22 @@
+"""Transform factual-scored training pairs into DPO-ready format."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.dpo_transform_utils import transform_dataset
+
+
+def main() -> None:
+    """Feed the factual-scored training pairs through ``transform_dataset``."""
+    locations = load_config()["paths"]
+
+    transform_dataset(
+        Path(locations["skywork_train_factual"]),
+        Path(locations["skywork_train_transformed"]),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_4_transformation/data_transform_val.py b/src/aixpert/data_construction/stage_4_transformation/data_transform_val.py
new file mode 100644
index 0000000..a0036c6
--- /dev/null
+++ b/src/aixpert/data_construction/stage_4_transformation/data_transform_val.py
@@ -0,0 +1,22 @@
+"""Transform factual-scored evaluation pairs into DPO-ready format."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.dpo_transform_utils import transform_dataset
+
+
+def main() -> None:
+    """Transform the factual-scored evaluation pairs into the DPO-ready file."""
+    path_cfg = load_config()["paths"]
+    # Input: factual-scored pairs. Output: transformed pairs.
+    source = Path(path_cfg["skywork_eval_factual"])
+    destination = Path(path_cfg["skywork_eval_transformed"])
+    # transform_dataset is called with (input path, output path).
+    transform_dataset(source, destination)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py b/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py
new file mode 100644
index 0000000..defd1cc
--- /dev/null
+++ b/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py
@@ -0,0 +1,88 @@
+"""
+Generate synthetic corruption (hallucinated) responses for TRAIN split.
+
+This script:
+- Loads clean DPO-ready Skywork transformation for training.
+- Selects items where h_w=0 (winner factual) and h_l=1 (loser incorrect).
+- Asks GPT-4o-mini to rewrite the factual answer into a subtle hallucination.
+- Produces “inversion pairs” where corrupted is chosen and original is rejected.
+- Saves up to 10,000 synthetic hallucination samples.
+
+Fully compatible with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import random
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from openai import AsyncOpenAI
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl, write_jsonl
+from utils.synthetic_utils import build_inversion_item, generate_corruption
+
+
+async def process_item(
+    item: Dict[str, Any],
+    client: AsyncOpenAI,
+    sem: asyncio.Semaphore,
+    model: str,
+    max_retries: int,
+) -> Optional[Dict[str, Any]]:
+    """Build one synthetic inversion sample for the training split.
+
+    Returns ``None`` when ``generate_corruption`` returns ``None``.
+    """
+    question, answer = item["prompt"], item["chosen"]
+    corrupted = await generate_corruption(
+        client=client,
+        model=model,
+        question=question,
+        answer=answer,
+        semaphore=sem,
+        max_retries=max_retries,
+    )
+    return None if corrupted is None else await build_inversion_item(item, corrupted)
+
+
+async def main() -> None:
+    """Create corrupted-answer inversion samples for the training split."""
+    cfg = load_config()
+    # The model name and the API key are both read from the YAML config.
+    model_name = cfg["model"]["name"]
+    openai_key = cfg["openai_api_key"]
+    hyper = cfg["hyperparams"]
+    sample_cap = hyper["synthetic_train_samples"]
+    max_parallel = hyper["corruption_concurrency"]
+    retry_limit = hyper["max_retries"]
+    path_cfg = cfg["paths"]
+    source = Path(path_cfg["skywork_train_transformed"])
+    destination = Path(path_cfg["synthetic_train_out"])
+
+    print(f"Loading transformed training data → {source}")
+    rows = load_jsonl(source)
+
+    print("🔍 Selecting (h_w=0, h_l=1) candidates…")
+    candidates = list(filter(lambda r: r["h_w"] == 0 and r["h_l"] == 1, rows))
+    # Never ask random.sample for more rows than there are candidates.
+    picked = random.sample(candidates, min(sample_cap, len(candidates)))
+    print(f"Selected {len(picked)} items for corruption.")
+
+    api = AsyncOpenAI(api_key=openai_key)
+    limiter = asyncio.Semaphore(max_parallel)
+    # One coroutine per picked row; gather returns results in the same order.
+    jobs = (process_item(row, api, limiter, model_name, retry_limit) for row in picked)
+    outcomes = await asyncio.gather(*jobs)
+    # Rows for which process_item returned None are dropped.
+    kept = [outcome for outcome in outcomes if outcome is not None]
+
+    print(f"Saving {len(kept)} synthetic training samples → {destination}")
+    write_jsonl(destination, kept)
+
+    print("Synthetic training corruption generation complete.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_val.py b/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_val.py
new file mode 100644
index 0000000..6dfa633
--- /dev/null
+++ b/src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_val.py
@@ -0,0 +1,90 @@
+"""
+Generate synthetic corruption (hallucinated) responses for EVAL split.
+
+This script:
+- Loads clean DPO-ready Skywork eval transformation.
+- Selects pairs where h_w=0 and h_l=1.
+- Uses GPT-4o-mini to introduce subtle factual errors.
+- Produces inverted (hallucinated, correct) preference pairs.
+- Saves 400 synthetic eval corruption examples.
+
+Compatible with ruff, ruff-format, pydocstyle, and mypy.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import random
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from openai import AsyncOpenAI
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl, write_jsonl
+from utils.synthetic_utils import build_inversion_item, generate_corruption
+
+
+async def process_item(
+    item: Dict[str, Any],
+    client: AsyncOpenAI,
+    sem: asyncio.Semaphore,
+    model: str,
+    max_retries: int,
+) -> Optional[Dict[str, Any]]:
+    """Build one eval inversion example, or None if no corruption was produced."""
+    question, answer = item["prompt"], item["chosen"]
+    corrupted = await generate_corruption(
+        client=client,
+        model=model,
+        question=question,
+        answer=answer,
+        semaphore=sem,
+        max_retries=max_retries,
+    )
+    if corrupted is not None:
+        inverted = await build_inversion_item(item, corrupted)
+        # Tag the row so eval-split inversions can be told apart by source.
+        inverted["source"] = "synthetic_inversion_eval"
+        return inverted
+    return None
+
+
+async def main() -> None:
+    """Create corrupted-answer inversion samples for the evaluation split."""
+    cfg = load_config()
+    # The model name and the API key are both read from the YAML config.
+    model_name = cfg["model"]["name"]
+    openai_key = cfg["openai_api_key"]
+    hyper = cfg["hyperparams"]
+    sample_cap = hyper["synthetic_eval_samples"]
+    max_parallel = hyper["corruption_concurrency"]
+    retry_limit = hyper["max_retries"]
+    path_cfg = cfg["paths"]
+    source = Path(path_cfg["skywork_eval_transformed"])
+    destination = Path(path_cfg["synthetic_eval_out"])
+
+    print(f"Loading transformed eval data → {source}")
+    rows = load_jsonl(source)
+
+    print("Selecting (h_w=0, h_l=1) eval candidates…")
+    candidates = list(filter(lambda r: r["h_w"] == 0 and r["h_l"] == 1, rows))
+    # Never ask random.sample for more rows than there are candidates.
+    picked = random.sample(candidates, min(sample_cap, len(candidates)))
+    print(f"Selected {len(picked)} items for corruption.")
+
+    api = AsyncOpenAI(api_key=openai_key)
+    limiter = asyncio.Semaphore(max_parallel)
+    # One coroutine per picked row; gather returns results in the same order.
+    jobs = (process_item(row, api, limiter, model_name, retry_limit) for row in picked)
+    outcomes = await asyncio.gather(*jobs)
+    # Rows for which process_item returned None are dropped.
+    kept = [outcome for outcome in outcomes if outcome is not None]
+
+    print(f"Saving {len(kept)} synthetic eval samples → {destination}")
+    write_jsonl(destination, kept)
+
+    print("Eval synthetic corruption generation complete.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_train.py b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py
new file mode 100644
index 0000000..3b76c49
--- /dev/null
+++ b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py
@@ -0,0 +1,68 @@
+"""
+Merge Skywork training data with 10k synthetic inversion pairs.
+
+This script:
+- Loads synthetic corruption samples.
+- Loads transformed Skywork training data.
+- Splits real samples into buckets by (h_w, h_l).
+- Samples 10k from (0,1).
+- Merges: synthetic + (0,0) + (1,1) + sampled (0,1).
+- Shuffles and writes final JSONL file.
+
+Fully compatible with ruff, mypy, and pydocstyle.
+"""
+
+from __future__ import annotations
+
+import random
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.data_utils import bucket_by_flags, load_jsonl, write_jsonl
+
+
+def main() -> None:
+    """Combine synthetic inversions with bucketed Skywork training pairs."""
+    config = load_config()
+    path_cfg = config["paths"]
+    hyper = config["hyperparams"]
+
+    synth_file = Path(path_cfg["synthetic_train_out"])
+    sky_file = Path(path_cfg["skywork_train_transformed"])
+    merged_file = Path(path_cfg["final_train_merged"])
+    # Cap on (0,1) rows kept; falls back to 10000 when the key is absent.
+    cap_01 = hyper.get("merge_sample_01_train", 10000)
+
+    print(f"📥 Loading synthetic → {synth_file}")
+    synth_rows = load_jsonl(synth_file)
+    print(f"Synthetic count: {len(synth_rows)}")
+
+    print(f"📥 Loading transformed Skywork train → {sky_file}")
+    sky_rows = load_jsonl(sky_file)
+    print(f"Skywork transformed count: {len(sky_rows)}")
+
+    # bucket_by_flags returns the (0,0), (1,1) and (0,1) groups, in that order.
+    rows_00, rows_11, rows_01 = bucket_by_flags(sky_rows)
+
+    for label, group in (("(0,0)", rows_00), ("(1,1)", rows_11), ("(0,1)", rows_01)):
+        print(f"{label}: {len(group)}")
+
+    # Seed once so both the (0,1) subsample and the shuffle are repeatable.
+    random.seed(42)
+    picked_01 = random.sample(rows_01, min(cap_01, len(rows_01)))
+    print(f"Sampled (0,1): {len(picked_01)}")
+
+    # Synthetic rows, all (0,0) and (1,1) rows, then the sampled (0,1) rows.
+    combined = synth_rows + rows_00 + rows_11 + picked_01
+
+    print(f"Total merged before shuffle: {len(combined)}")
+    random.shuffle(combined)
+
+    print(f"💾 Saving final merged train → {merged_file}")
+    write_jsonl(merged_file, combined)
+
+    print("✅ TRAIN MERGE COMPLETE.\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_val.py b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py
new file mode 100644
index 0000000..2696788
--- /dev/null
+++ b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py
@@ -0,0 +1,61 @@
+"""
+Merge Skywork evaluation data with 400 synthetic inversion pairs.
+
+This script:
+- Loads synthetic corruption samples for eval.
+- Loads Skywork eval transformed dataset.
+- Splits samples into buckets by (h_w, h_l).
+- Keeps every real eval sample that falls in the (0,0), (1,1) or (0,1) bucket.
+- Merges synthetic + those real eval buckets (no sub-sampling).
+- Shuffles and writes final eval JSONL file.
+
+Fully compatible with ruff, mypy, and pydocstyle.
+"""
+
+from __future__ import annotations
+
+import random
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.data_utils import bucket_by_flags, load_jsonl, write_jsonl
+
+
+def main() -> None:
+    """Merge Skywork eval rows with synthetic inversions using a seeded shuffle."""
+    cfg = load_config()
+    paths = cfg["paths"]
+
+    synthetic_path = Path(paths["synthetic_eval_out"])
+    skywork_transformed_path = Path(paths["skywork_eval_transformed"])
+    output_path = Path(paths["final_eval_merged"])
+
+    print(f"📥 Loading synthetic eval → {synthetic_path}")
+    synthetic = load_jsonl(synthetic_path)
+    print(f"Synthetic eval count: {len(synthetic)}")
+
+    print(f"📥 Loading transformed Skywork eval → {skywork_transformed_path}")
+    sky = load_jsonl(skywork_transformed_path)
+    print(f"Skywork eval count: {len(sky)}")
+
+    # NOTE(review): bucket_by_flags returns only (0,0), (1,1) and (0,1) groups,
+    # so real rows with other flags such as (1,0) are not merged; confirm intent.
+    b00, b11, b01 = bucket_by_flags(sky)
+    print(f"(0,0): {len(b00)}")
+    print(f"(1,1): {len(b11)}")
+    print(f"(0,1): {len(b01)}")
+
+    # No sub-sampling for eval: every bucketed row is kept.
+    merged = synthetic + b00 + b11 + b01
+    print(f"Total merged before shuffle: {len(merged)}")
+    random.seed(42)  # same fixed seed as the train merge, for repeatable order
+    random.shuffle(merged)
+
+    print(f"💾 Saving final merged eval → {output_path}")
+    write_jsonl(output_path, merged)
+
+    print("✅ EVAL MERGE COMPLETE.\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_7_final/data_final_train.py b/src/aixpert/data_construction/stage_7_final/data_final_train.py
new file mode 100644
index 0000000..15cd7c5
--- /dev/null
+++ b/src/aixpert/data_construction/stage_7_final/data_final_train.py
@@ -0,0 +1,88 @@
+"""
+Balanced sampling for TRAIN dataset.
+
+This script:
+- Loads the merged training dataset.
+- Buckets by (h_w, h_l).
+- Samples required amounts per bucket (with replacement if needed).
+- Shuffles and saves the final balanced training dataset.
+
+Buckets required:
+    (0,1) → 10,000
+    (1,0) → 10,000
+    (0,0) → 15,000
+    (1,1) → 10,000
+"""
+
+from __future__ import annotations
+
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl, write_jsonl
+
+
+def _bucket_key(raw: Any) -> Tuple[int, int]:
+    """Normalise a ``balance_targets`` key such as ``"(0,1)"`` to ``(0, 1)``."""
+    digits = [ch for ch in str(raw) if ch.isdigit()]
+    if len(digits) != 2:
+        raise ValueError(f"Cannot read (h_w, h_l) from balance_targets key {raw!r}")
+    return int(digits[0]), int(digits[1])
+
+
+def main() -> None:
+    """Draw the configured number of rows per (h_w, h_l) bucket and save them."""
+    cfg = load_config()
+    paths = cfg["paths"]
+    hp = cfg["hyperparams"]
+    input_path = Path(paths["skywork_final_train"])
+    output_path = Path(paths["final_train_out"])
+    # yaml.safe_load cannot build tuple keys, so accept the scalar spelling used
+    # in the config as well as real tuples and normalise both to (h_w, h_l).
+    target_counts: Dict[Tuple[int, int], int] = {
+        _bucket_key(raw): int(count) for raw, count in hp["balance_targets"].items()
+    }
+
+    print(f"Loading → {input_path}")
+    data = load_jsonl(input_path)
+
+    buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
+        flags: [] for flags in ((0, 1), (1, 0), (0, 0), (1, 1))
+    }
+    print("🔍 Bucketing samples…")
+    for ex in data:
+        key = (int(ex["h_w"]), int(ex["h_l"]))
+        if key in buckets:
+            buckets[key].append(ex)
+
+    print("\n=== AVAILABLE PER BUCKET ===")
+    for key, rows in buckets.items():
+        print(f"{key}: {len(rows)}")
+
+    final_rows: List[Dict[str, Any]] = []
+    for key, req_count in target_counts.items():
+        pool = buckets.get(key, [])
+        available = len(pool)
+        print(f"\nBucket {key}: available={available}, required={req_count}")
+        if req_count > 0 and not pool:
+            # random.choices cannot draw from an empty pool; fail with context.
+            raise ValueError(f"Bucket {key} is empty; cannot draw {req_count} rows")
+        if available < req_count:
+            print("⚠️ Sampling WITH replacement.")
+            final_rows.extend(random.choices(pool, k=req_count))
+        else:
+            final_rows.extend(random.sample(pool, req_count))
+
+    print(f"\nShuffling {len(final_rows)} rows…")
+    random.shuffle(final_rows)
+    print(f"Saving → {output_path}")
+    write_jsonl(output_path, final_rows)
+    print("\nTRAIN balanced dataset ready.")
+    print(f"Final count: {len(final_rows)}")
+
+
+if __name__ == "__main__":
+    random.seed(42)
+    main()
diff --git a/src/aixpert/data_construction/stage_7_final/data_final_val.py b/src/aixpert/data_construction/stage_7_final/data_final_val.py
new file mode 100644
index 0000000..af74631
--- /dev/null
+++ b/src/aixpert/data_construction/stage_7_final/data_final_val.py
@@ -0,0 +1,90 @@
+"""
+Build the FINAL evaluation dataset (skywork_final_eval.jsonl).
+
+Composition:
+    • 400 synthetic inversion samples (1,0)
+    • all Skywork eval samples from skywork_first_transformed_eval.jsonl
+    • +1500 samples of (1,1) from skywork_final_train.jsonl
+    • +1500 samples of (0,0) from skywork_final_train.jsonl
+      → excluding any sample already used in train_finallast.jsonl
+
+Final eval ≈ (#sky_eval + 400 synthetic + 3000 added clean samples)
+"""
+
+from __future__ import annotations
+
+import random
+from pathlib import Path
+from typing import Any, Dict, List
+
+from utils.config_loader import load_config
+from utils.data_utils import load_jsonl, write_jsonl
+
+
+def main() -> None:
+    """Build the FINAL evaluation dataset.
+
+    Merges the synthetic eval rows, every transformed Skywork eval row, and up
+    to ``eval_additional_clean_samples`` rows each of (1,1) and (0,0) taken from
+    the merged training file but absent from the balanced training output.
+    """
+    cfg = load_config()
+    paths = cfg["paths"]
+    hp = cfg["hyperparams"]
+
+    synthetic_path = Path(paths["synthetic_eval_out"])
+    sky_eval_path = Path(paths["skywork_eval_transformed"])
+    train_full_path = Path(paths["final_train_merged"])
+    train_used_path = Path(paths["final_train_out"])
+    output_path = Path(paths["final_eval_out"])
+    add_n = hp["eval_additional_clean_samples"]
+
+    print(f"Loading synthetic eval → {synthetic_path}")
+    synthetic = load_jsonl(synthetic_path)
+    print(f"Loading Skywork eval transformed → {sky_eval_path}")
+    sky_eval = load_jsonl(sky_eval_path)
+    print(f"Loading full training dataset → {train_full_path}")
+    sky_train = load_jsonl(train_full_path)
+    print(f"Loading train-balanced dataset (to exclude) → {train_used_path}")
+    train_used = load_jsonl(train_used_path)
+
+    # Rows are matched by content so anything already used for training is skipped.
+    exclude = {(ex["prompt"], ex["chosen"], ex["rejected"]) for ex in train_used}
+    pool_11: List[Dict[str, Any]] = []
+    pool_00: List[Dict[str, Any]] = []
+    for ex in sky_train:
+        key = (ex["prompt"], ex["chosen"], ex["rejected"])
+        if key in exclude:
+            continue
+        if ex["h_w"] == 1 and ex["h_l"] == 1:
+            pool_11.append(ex)
+        elif ex["h_w"] == 0 and ex["h_l"] == 0:
+            pool_00.append(ex)
+
+    print(f"(1,1) pool after exclusion: {len(pool_11)}")
+    print(f"(0,0) pool after exclusion: {len(pool_00)}")
+
+    # random.sample raises ValueError when asked for more rows than the pool
+    # holds, so cap each request at the pool size and report any shortfall.
+    take_11 = min(add_n, len(pool_11))
+    take_00 = min(add_n, len(pool_00))
+    if take_11 < add_n or take_00 < add_n:
+        print(f"⚠️ Pools short of {add_n}: using {take_11} (1,1), {take_00} (0,0).")
+    sample_11 = random.sample(pool_11, take_11)
+    sample_00 = random.sample(pool_00, take_00)
+
+    merged: List[Dict[str, Any]] = [*synthetic, *sky_eval, *sample_11, *sample_00]
+    print(f"\nTotal before shuffle: {len(merged)}")
+
+    random.shuffle(merged)
+
+    print(f"Saving final eval → {output_path}")
+    write_jsonl(output_path, merged)
+
+    print("\nFINAL EVAL DATASET READY.")
+    print(f"Final count: {len(merged)}")
+
+
+if __name__ == "__main__":
+    random.seed(42)
+    main()
diff --git a/src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py b/src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py
new file mode 100644
index 0000000..f1187d9
--- /dev/null
+++ b/src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py
@@ -0,0 +1,41 @@
+"""
+Flip preference labels for training data.
+
+This script:
+- Converts h_w=1,h_l=0 → h_w=0,h_l=1
+- Swaps chosen/rejected
+- Writes a flipped version of the dataset
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List
+
+from utils.config_loader import load_config
+from utils.data_utils import flip_sample, load_jsonl, write_jsonl
+
+
+def main() -> None:
+    """Write the training set with every (1,0) pair flipped via ``flip_sample``."""
+    path_cfg = load_config()["paths"]
+    source = Path(path_cfg["final_train_out"])
+    destination = Path(path_cfg["train_flipped_out"])
+    print(f"Loading → {source}")
+    rows: List[Dict[str, Any]] = load_jsonl(source)
+
+    print("Flipping (h_w=1, h_l=0) samples...")
+    # flip_sample is applied to every row, not only to the (1,0) ones.
+    flipped_rows = list(map(flip_sample, rows))
+
+    print(f"Saving flipped dataset → {destination}")
+    write_jsonl(destination, flipped_rows)
+
+    print("\n==========================================")
+    print("TRAIN FLIP COMPLETE")
+    print(f"Total samples processed: {len(flipped_rows)}")
+    print("==========================================\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
new file mode 100644
index 0000000..46cd3bc
--- /dev/null
+++ b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
@@ -0,0 +1,42 @@
+"""
+Flip preference labels for evaluation data.
+
+This script:
+- Converts h_w=1,h_l=0 → h_w=0,h_l=1
+- Swaps chosen/rejected
+- Writes a flipped version of the dataset
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List
+
+from utils.config_loader import load_config
+from utils.data_utils import flip_sample, load_jsonl, write_jsonl
+
+
+def main() -> None:
+    """Write the eval set with every (1,0) pair flipped via ``flip_sample``."""
+    path_cfg = load_config()["paths"]
+    source = Path(path_cfg["final_eval_out"])
+    destination = Path(path_cfg["eval_flipped_out"])
+
+    print(f"Loading → {source}")
+    rows: List[Dict[str, Any]] = load_jsonl(source)
+
+    print("Flipping (h_w=1, h_l=0) samples...")
+    # flip_sample is applied to every row, not only to the (1,0) ones.
+    flipped_rows = list(map(flip_sample, rows))
+
+    print(f"💾 Saving flipped dataset → {destination}")
+    write_jsonl(destination, flipped_rows)
+
+    print("\n==========================================")
+    print("EVAL FLIP COMPLETE")
+    print(f"Total samples processed: {len(flipped_rows)}")
+    print("==========================================\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/utils.py b/src/aixpert/data_construction/utils.py
deleted file mode 100644
index 7aed9e6..0000000
--- a/src/aixpert/data_construction/utils.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""
-Utility functions for loading configuration files.
-
-This module provides:
-- `load_yaml`: Read a YAML file into a Python dictionary.
-- `load_env_api_key`: Load the OPENAI_API_KEY from a repository `.env` file.
-
-These helpers centralize configuration handling and ensure consistent behavior
-across all data-construction scripts.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any, Dict
-
-import yaml
-from decouple import Config, RepositoryEnv
-
-
-def load_yaml(yaml_path: str) -> Dict[str, Any]:
-    """Load a YAML file and return its content as a dict.
-
-    :param yaml_path: Path to the YAML file.
-    :return: Parsed YAML content as a dict, or empty dict on failure.
-    """
-    try:
-        with open(yaml_path, "r", encoding="utf-8") as f:
-            return yaml.safe_load(f) or {}
-    except Exception as e:
-        print(f"YAML load error: {e}")
-        return {}
-
-
-def load_env_api_key(repository_path: str) -> str:
-    """Load OPENAI_API_KEY from a .env file inside the repository.
-
-    Uses:
-        env = Config(RepositoryEnv(config["repository"] + "/.env"))
-        api_key = env("OPENAI_API_KEY", default=False)
-
-    :param repository_path: Path to the repo containing `.env`
-    :return: The OpenAI API key or an empty string if missing.
-    """
-    env_path = Path(repository_path) / ".env"
-
-    if not env_path.exists():
-        print(f"Warning: .env file not found at {env_path}")
-        return ""
-
-    env = Config(RepositoryEnv(str(env_path)))
-    return env("OPENAI_API_KEY", default="")
-
-
-__all__ = ["load_yaml", "load_env_api_key"]
diff --git a/src/aixpert/data_construction/utils/config_loader.py b/src/aixpert/data_construction/utils/config_loader.py
new file mode 100644
index 0000000..88b3060
--- /dev/null
+++ b/src/aixpert/data_construction/utils/config_loader.py
@@ -0,0 +1,17 @@
+"""Utility module for loading the global YAML configuration file."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+
+
+CONFIG_PATH = Path(__file__).resolve().parents[1] / "config" / "config.yaml"
+
+
+def load_config() -> Dict[str, Any]:
+    """Parse CONFIG_PATH with ``yaml.safe_load``; an empty file yields ``{}``."""
+    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
diff --git a/src/aixpert/data_construction/utils/data_utils.py b/src/aixpert/data_construction/utils/data_utils.py
new file mode 100644
index 0000000..b70c9eb
--- /dev/null
+++ b/src/aixpert/data_construction/utils/data_utils.py
@@ -0,0 +1,124 @@
+"""Utility functions for dataset extraction, cleaning, formatting, and flipping.
+
+These helpers are used across the data-construction pipeline for DPO, SafeDPO,
+Factual-DPO, and evaluation preprocessing.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+
+def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first user turn, or an empty string."""
+    first_user = next((turn for turn in dialog if turn.get("role") == "user"), None)
+    if first_user is None:
+        return ""
+    return str(first_user.get("content", "")).strip()
+
+
+def extract_answer(dialog: List[Dict[str, Any]]) -> str:
+    """Return the stripped content of the first assistant turn, or an empty string."""
+    replies = (turn for turn in dialog if turn.get("role") == "assistant")
+    for reply in replies:
+        return str(reply.get("content", "")).strip()
+    return ""
+
+
+def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Serialise ``rows`` as one JSON object per line, creating parent dirs."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        encoded = (json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
+        handle.writelines(encoded)
+
+
+def filter_duplicates(
+    rows: List[Dict[str, str]],
+) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
+    """Partition rows by whether ``chosen`` and ``rejected`` are identical.
+
+    Returns ``(kept, dropped)``: ``kept`` holds rows whose two responses differ,
+    ``dropped`` holds rows where they are equal. Input order is preserved.
+    """
+    kept: List[Dict[str, str]] = []
+    dropped: List[Dict[str, str]] = []
+    for record in rows:
+        same_text = record["chosen"] == record["rejected"]
+        (dropped if same_text else kept).append(record)
+    return kept, dropped
+
+
+def load_jsonl(path: Path) -> List[Dict[str, Any]]:
+    """Load a JSONL file into a list of dicts, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, creating parent folders."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as sink:
+        for record in rows:
+            print(json.dumps(record, ensure_ascii=False), file=sink)
+
+
+def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Turn chosen/rejected rows into randomly ordered response pairs.
+
+    Each output row holds ``prompt``, ``response_0``, ``response_1`` and
+    ``better_response_id``, the index of the slot that received ``chosen``.
+    Missing input fields default to an empty string.
+    """
+    pairs: List[Dict[str, Any]] = []
+
+    for row in data:
+        winner = row.get("chosen", "")
+        loser = row.get("rejected", "")
+
+        # One random draw per row: below 0.5 the chosen text goes to slot 0,
+        # otherwise to slot 1 (random symmetric assignment).
+        winner_slot = 0 if random.random() < 0.5 else 1
+        ordered = (winner, loser) if winner_slot == 0 else (loser, winner)
+
+        # Emit the pair in the response_0 / response_1 layout.
+        pairs.append(
+            {
+                "prompt": row.get("prompt", ""),
+                "response_0": ordered[0],
+                "response_1": ordered[1],
+                "better_response_id": winner_slot,
+            }
+        )
+
+    return pairs
+
+
+def bucket_by_flags(
+    items: List[Dict[str, Any]],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Group items by ``(h_w, h_l)`` into the (0,0), (1,1) and (0,1) lists.
+
+    Items with any other flag pair, including (1,0), appear in no list.
+    """
+    wanted = ((0, 0), (1, 1), (0, 1))
+    groups: Tuple[List[Any], List[Any], List[Any]] = ([], [], [])
+    for entry in items:
+        flags = (entry["h_w"], entry["h_l"])
+        # The targets are mutually exclusive, so stop at the first match.
+        for target, group in zip(wanted, groups):
+            if flags == target:
+                group.append(entry)
+                break
+    return groups
+
+
+def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Swap chosen/rejected in place when flags are (1, 0); return ``item``."""
+    if (item.get("h_w"), item.get("h_l")) == (1, 0):
+        item.update(h_w=0, h_l=1)
+        item.update(chosen=item["rejected"], rejected=item["chosen"])
+    return item
diff --git a/src/aixpert/data_construction/utils/dpo_transform_utils.py b/src/aixpert/data_construction/utils/dpo_transform_utils.py
new file mode 100644
index 0000000..fea34d9
--- /dev/null
+++ b/src/aixpert/data_construction/utils/dpo_transform_utils.py
@@ -0,0 +1,56 @@
+"""Utilities for transforming factual-scored pairs into DPO-ready format."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict
+
+from tqdm import tqdm
+from utils.data_utils import load_jsonl, write_jsonl
+
+
+def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Map one factual-scored pair onto the chosen/rejected DPO layout.
+
+    ``better_response_id`` 0 selects ``response_0`` as ``chosen``; any other
+    value selects ``response_1``. ``h_w``/``h_l`` are the ``h0``/``h1`` flags of
+    the chosen and rejected responses, and ``flipped`` is always ``False``.
+    """
+    prompt = item["prompt"]
+    responses = (item["response_0"], item["response_1"])
+    pref = int(item["better_response_id"])
+    flags = (int(item["h0"]), int(item["h1"]))
+
+    # Index of the preferred response inside ``responses`` and ``flags``;
+    # every preference value other than 0 is treated as 1.
+    win = 0 if pref == 0 else 1
+    lose = 1 - win
+
+    return {
+        "prompt": prompt,
+        "chosen": responses[win],
+        "rejected": responses[lose],
+        "h_w": flags[win],
+        "h_l": flags[lose],
+        "better_response_id": pref,
+        "response_0": responses[0],
+        "response_1": responses[1],
+        "flipped": False,
+    }
+
+
+def transform_dataset(input_path: Path, output_path: Path) -> None:
+    """Read ``input_path``, convert every row with ``process_item``, write JSONL."""
+    print(f"Loading → {input_path}")
+    source_rows = load_jsonl(input_path)
+    row_count = len(source_rows)
+    print(f"⚙️ Transforming {row_count} items…")
+    # tqdm only wraps the iteration to show progress.
+    converted = list(map(process_item, tqdm(source_rows)))
+    print(f"Saving → {output_path}")
+    write_jsonl(output_path, converted)
+
+    print("\n=======================================")
+    print("✔ TRANSFORMATION COMPLETE")
+    print(f"Total items: {row_count}")
+    print("=======================================\n")
diff --git a/src/aixpert/data_construction/utils/factual_utils.py b/src/aixpert/data_construction/utils/factual_utils.py
new file mode 100644
index 0000000..f9996b1
--- /dev/null
+++ b/src/aixpert/data_construction/utils/factual_utils.py
@@ -0,0 +1,120 @@
+"""
+Async factuality evaluation utilities.
+
+This module runs factual-flag scoring for preference pairs using an
+LLM judge, supports concurrency, retries, and resume-safe checkpointing.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List
+
+from openai import AsyncOpenAI
+from tqdm.asyncio import tqdm_asyncio
+from utils.prompt_templates import BINARY_FACTUAL_JUDGE_PROMPT
+
+
+def get_client(api_key: str) -> AsyncOpenAI:
+    """Construct an ``AsyncOpenAI`` client that is given ``api_key``."""
+    return AsyncOpenAI(api_key=api_key)
+
+
+async def get_factual_flag(
+    client: AsyncOpenAI,
+    model: str,
+    question: str,
+    answer: str,
+    semaphore: asyncio.Semaphore,
+    max_retries: int,
+) -> int:
+    """Return the judge's flag (0 factual, 1 hallucinated); 1 on any failure."""
+    prompt = BINARY_FACTUAL_JUDGE_PROMPT.format(question=question, answer=answer)
+    async with semaphore:
+        for retry in range(max_retries):
+            try:
+                resp = await client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0,
+                )
+                # content may be None; treat it as an unparseable reply, not an error.
+                text = (resp.choices[0].message.content or "").strip()
+                match = re.search(r"\[\[(0|1)\]\]", text)
+                return int(match.group(1)) if match else 1
+            except Exception:
+                # Best-effort retry with a slightly longer pause each attempt.
+                await asyncio.sleep(1 + retry * 0.5)
+    return 1
+
+
+async def evaluate_pair(
+    client: AsyncOpenAI,
+    item: Dict[str, Any],
+    model: str,
+    sem: asyncio.Semaphore,
+    retries: int,
+) -> Dict[str, Any]:
+    """Score both responses of ``item`` concurrently and attach the flags.
+
+    The result copies ``item`` and adds ``factual_flag_0``/``h0`` for
+    ``response_0`` and ``factual_flag_1``/``h1`` for ``response_1``.
+    """
+    question = item["prompt"]
+    flag_jobs = [
+        get_factual_flag(client, model, question, item[field], sem, retries)
+        for field in ("response_0", "response_1")
+    ]
+    flag_0, flag_1 = await asyncio.gather(*flag_jobs)
+    scored = dict(item)
+    scored.update(
+        factual_flag_0=flag_0,
+        factual_flag_1=flag_1,
+        h0=flag_0,
+        h1=flag_1,
+    )
+    return scored
+
+
+async def factual_evaluation_pipeline(
+    client: AsyncOpenAI,
+    items: List[Dict[str, Any]],
+    output_file: Path,
+    model: str,
+    concurrency: int,
+    max_retries: int,
+) -> None:
+    """Run factuality evaluation with resume and checkpoint support.
+
+    Resume skips as many leading items as ``output_file`` already has lines,
+    so results are written in input order, not in completion order.
+    """
+    processed = 0
+    if output_file.exists():
+        with output_file.open("r", encoding="utf-8") as f:
+            processed = sum(1 for _ in f)
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    sem = asyncio.Semaphore(concurrency)
+    # Schedule everything now; the semaphore bounds the in-flight requests.
+    tasks = [
+        asyncio.ensure_future(evaluate_pair(client, item, model, sem, max_retries))
+        for item in items[processed:]
+    ]
+    buffer: List[str] = []
+    with output_file.open("a", encoding="utf-8") as f:
+        # Iterating the bar synchronously and awaiting each task keeps order.
+        for task in tqdm_asyncio(tasks, total=len(tasks)):
+            buffer.append(json.dumps(await task, ensure_ascii=False) + "\n")
+            if len(buffer) >= 25:  # checkpoint every 25 results
+                f.writelines(buffer)
+                f.flush()
+                os.fsync(f.fileno())
+                buffer.clear()
+        if buffer:
+            f.writelines(buffer)
+            f.flush()
+            os.fsync(f.fileno())
diff --git a/src/aixpert/data_construction/utils/prompt_templates.py b/src/aixpert/data_construction/utils/prompt_templates.py
new file mode 100644
index 0000000..86cbe86
--- /dev/null
+++ b/src/aixpert/data_construction/utils/prompt_templates.py
@@ -0,0 +1,57 @@
+"""Central location for prompt templates (evaluation, DPO scoring, etc.)."""
+
+from __future__ import annotations
+
+
+BINARY_FACTUAL_JUDGE_PROMPT = """
+System:
+You are an extremely strict factuality judge. Your task is to determine whether the assistant’s
+response contains ANY of the following:
+
+• factual errors
+• hallucinations
+• unsupported claims
+• unverifiable statements
+• fabricated details
+• incorrect technical descriptions
+• invented studies, names, or entities
+• logical contradictions
+• vague or misleading explanations
+• partial answers that omit critical facts
+
+IMPORTANT:
+If the response is vague, incomplete, unsupported, or unverifiable,
+classify it as [[1]].
+
+Definitions:
+[[0]] → Fully factual, precise, verifiable.
+[[1]] → ANY issue.
+
+Return ONLY:
+[[0]] or [[1]]
+
+User Question:
+{question}
+
+Assistant Response:
+{answer}
+"""  # Template fields: {question}, {answer}; the expected verdict is [[0]] or [[1]].
+
+CORRUPTION_SYSTEM_PROMPT = """
+You are a factual corruption generator.
+
+Rewrite a correct answer into a subtly incorrect version with 2–3 factual errors.
+It must remain fluent, confident, and plausible.
+"""  # Sent verbatim as the system message by utils/synthetic_utils.py.
+
+CORRUPTION_USER_PROMPT = """
+PROMPT:
+{question}
+
+CORRECT ANSWER:
+{answer}
+
+TASK:
+Rewrite the answer so that it becomes factually wrong, introducing subtle hallucinations
+while sounding confident and coherent.
+"""  # Template fields: {question}, {answer}; filled with str.format by the caller.
diff --git a/src/aixpert/data_construction/utils/synthetic_utils.py b/src/aixpert/data_construction/utils/synthetic_utils.py
new file mode 100644
index 0000000..000cab6
--- /dev/null
+++ b/src/aixpert/data_construction/utils/synthetic_utils.py
@@ -0,0 +1,59 @@
+"""Async utilities for generating corrupted answers and synthetic inversions."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Dict, Optional
+
+from openai import AsyncOpenAI
+from utils.prompt_templates import (
+    CORRUPTION_SYSTEM_PROMPT,
+    CORRUPTION_USER_PROMPT,
+)
+
+
+async def generate_corruption(
+    client: AsyncOpenAI,
+    model: str,
+    question: str,
+    answer: str,
+    semaphore: asyncio.Semaphore,
+    max_retries: int = 5,
+    temperature: float = 0.8,
+) -> Optional[str]:
+    """Return a corrupted rewrite of ``answer``, or None once every attempt fails.
+
+    All attempts (at most ``max_retries``) run while ``semaphore`` is held.
+    """
+    user_prompt = CORRUPTION_USER_PROMPT.format(question=question, answer=answer)
+    messages = [
+        {"role": "system", "content": CORRUPTION_SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    async with semaphore:
+        for retry in range(max_retries):
+            try:
+                resp = await client.chat.completions.create(
+                    model=model, messages=messages, temperature=temperature
+                )
+                return resp.choices[0].message.content.strip()
+            except Exception as exc:
+                print(f"[Retry {retry}] corruption generation failed: {exc}")
+                if retry + 1 < max_retries:  # no pointless sleep after the last try
+                    await asyncio.sleep(1 + retry * 0.5)
+    return None
+
+
+async def build_inversion_item(
+    item: Dict[str, Any],
+    corrupted: str,
+) -> Dict[str, Any]:
+    """Build the DPO record that ranks ``corrupted`` above the original answer."""
+    question = item["prompt"]
+    original_answer = item["chosen"]
+    # Key order is kept stable: prompt, chosen, rejected, h_w, h_l, source.
+    inverted: Dict[str, Any] = {"prompt": question, "chosen": corrupted}
+    inverted["rejected"] = original_answer
+    # Every synthetic inversion sample carries h_w=1 and h_l=0.
+    inverted.update({"h_w": 1, "h_l": 0, "source": "synthetic_inversion"})
+    return inverted

From 7d3ee5a645af7484992c5cd26fd2f18c2d7b014b Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Thu, 4 Dec 2025 19:12:16 -0500
Subject: [PATCH 08/14] readme file

---
 src/aixpert/data_construction/Readme.md       | 20 +++++++++++++++++++
 .../stage_3_factuality/dataset_val.py         |  1 -
 .../stage_6_merging/data_merge_train.py       | 10 ++++------
 .../stage_6_merging/data_merge_val.py         | 10 ++++------
 .../stage_7_final/data_final_train.py         |  2 +-
 .../stage_8_flipping/data_flipped_val.py      |  2 +-
 .../data_construction/utils/data_utils.py     |  1 -
 7 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md
index e69de29..1a94f19 100644
--- a/src/aixpert/data_construction/Readme.md
+++ b/src/aixpert/data_construction/Readme.md
@@ -0,0 +1,20 @@
+# Skywork → Factual-DPO Data Construction Pipeline
+
+This repository contains a complete, modular, and type-safe data-construction pipeline for generating **factual-aware DPO datasets** from the **Skywork Reward-Preference-80K** dataset.
+
+The pipeline supports:
+- Direct Preference Optimization (DPO)
+- Factual-DPO
+- Synthetic hallucination inversion pairs
+- Balanced and flipped datasets
+
+## Configuration
+
+All configuration is centralized in:
+
+```bash
+src/aixpert/config/config.yaml
+```
+Loaded dynamically using:
+```python
+utils/config_loader.load_config()
diff --git a/src/aixpert/data_construction/stage_3_factuality/dataset_val.py b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py
index 4e306f6..0aea440 100644
--- a/src/aixpert/data_construction/stage_3_factuality/dataset_val.py
+++ b/src/aixpert/data_construction/stage_3_factuality/dataset_val.py
@@ -22,7 +22,6 @@ async def main() -> None:
     paths = cfg["paths"]
     hp = cfg["hyperparams"]
 
-    # Load API key
     env = Config(RepositoryEnv(f"{repo_path}/.env"))
     api_key = env("OPENAI_API_KEY", default=None)
     if not api_key:
diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_train.py b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py
index 3b76c49..b70f5f2 100644
--- a/src/aixpert/data_construction/stage_6_merging/data_merge_train.py
+++ b/src/aixpert/data_construction/stage_6_merging/data_merge_train.py
@@ -33,22 +33,20 @@ def main() -> None:
 
     sample_size = hp.get("merge_sample_01_train", 10000)
 
-    print(f"📥 Loading synthetic → {synthetic_path}")
+    print(f"Loading synthetic → {synthetic_path}")
     synthetic = load_jsonl(synthetic_path)
     print(f"Synthetic count: {len(synthetic)}")
 
-    print(f"📥 Loading transformed Skywork train → {skywork_transformed_path}")
+    print(f"Loading transformed Skywork train → {skywork_transformed_path}")
     sky = load_jsonl(skywork_transformed_path)
     print(f"Skywork transformed count: {len(sky)}")
 
-    # Bucket by (h_w, h_l)
     b00, b11, b01 = bucket_by_flags(sky)
 
     print(f"(0,0): {len(b00)}")
     print(f"(1,1): {len(b11)}")
     print(f"(0,1): {len(b01)}")
 
-    # Sample subset of (0,1)
     random.seed(42)
     sample_01 = random.sample(b01, min(sample_size, len(b01)))
     print(f"Sampled (0,1): {len(sample_01)}")
@@ -58,10 +56,10 @@ def main() -> None:
     print(f"Total merged before shuffle: {len(merged)}")
     random.shuffle(merged)
 
-    print(f"💾 Saving final merged train → {output_path}")
+    print(f"Saving final merged train → {output_path}")
     write_jsonl(output_path, merged)
 
-    print("✅ TRAIN MERGE COMPLETE.\n")
+    print("TRAIN MERGE COMPLETE.\n")
 
 
 if __name__ == "__main__":
diff --git a/src/aixpert/data_construction/stage_6_merging/data_merge_val.py b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py
index 2696788..2e114eb 100644
--- a/src/aixpert/data_construction/stage_6_merging/data_merge_val.py
+++ b/src/aixpert/data_construction/stage_6_merging/data_merge_val.py
@@ -30,31 +30,29 @@ def main() -> None:
     skywork_transformed_path = Path(paths["skywork_eval_transformed"])
     output_path = Path(paths["final_eval_merged"])
 
-    print(f"📥 Loading synthetic eval → {synthetic_path}")
+    print(f"Loading synthetic eval → {synthetic_path}")
     synthetic = load_jsonl(synthetic_path)
     print(f"Synthetic eval count: {len(synthetic)}")
 
-    print(f"📥 Loading transformed Skywork eval → {skywork_transformed_path}")
+    print(f"Loading transformed Skywork eval → {skywork_transformed_path}")
     sky = load_jsonl(skywork_transformed_path)
     print(f"Skywork eval count: {len(sky)}")
 
-    # Split into buckets
     b00, b11, b01 = bucket_by_flags(sky)
 
     print(f"(0,0): {len(b00)}")
     print(f"(1,1): {len(b11)}")
     print(f"(0,1): {len(b01)}")
 
-    # Eval uses ALL samples (no sampling)
     merged = synthetic + b00 + b11 + b01
     print(f"Total merged before shuffle: {len(merged)}")
 
     random.shuffle(merged)
 
-    print(f"💾 Saving final merged eval → {output_path}")
+    print(f"Saving final merged eval → {output_path}")
     write_jsonl(output_path, merged)
 
-    print("✅ EVAL MERGE COMPLETE.\n")
+    print("EVAL MERGE COMPLETE.\n")
 
 
 if __name__ == "__main__":
diff --git a/src/aixpert/data_construction/stage_7_final/data_final_train.py b/src/aixpert/data_construction/stage_7_final/data_final_train.py
index 15cd7c5..7754d4d 100644
--- a/src/aixpert/data_construction/stage_7_final/data_final_train.py
+++ b/src/aixpert/data_construction/stage_7_final/data_final_train.py
@@ -46,7 +46,7 @@ def main() -> None:
         (1, 1): [],
     }
 
-    print("🔍 Bucketing samples…")
+    print("Bucketing samples…")
     for ex in data:
         key = (int(ex["h_w"]), int(ex["h_l"]))
         if key in buckets:
diff --git a/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
index 46cd3bc..f34b1ad 100644
--- a/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
+++ b/src/aixpert/data_construction/stage_8_flipping/data_flipped_val.py
@@ -29,7 +29,7 @@ def main() -> None:
     print("Flipping (h_w=1, h_l=0) samples...")
     flipped = [flip_sample(item) for item in items]
 
-    print(f"💾 Saving flipped dataset → {output_path}")
+    print(f"Saving flipped dataset → {output_path}")
     write_jsonl(output_path, flipped)
 
     print("\n==========================================")
diff --git a/src/aixpert/data_construction/utils/data_utils.py b/src/aixpert/data_construction/utils/data_utils.py
index b70c9eb..d4268b2 100644
--- a/src/aixpert/data_construction/utils/data_utils.py
+++ b/src/aixpert/data_construction/utils/data_utils.py
@@ -75,7 +75,6 @@ def create_preference_pairs(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         chosen = item.get("chosen", "")
         rejected = item.get("rejected", "")
 
-        # Random symmetric assignment
         if random.random() < 0.5:
             response_0 = chosen
             response_1 = rejected

From d774776ee27d56f76b0bbf5521d57e2deda4e957 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Thu, 4 Dec 2025 19:19:17 -0500
Subject: [PATCH 09/14] readme updates

---
 src/aixpert/data_construction/Readme.md | 245 ++++++++++++++++++++++++
 1 file changed, 245 insertions(+)

diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md
index 1a94f19..ca166f8 100644
--- a/src/aixpert/data_construction/Readme.md
+++ b/src/aixpert/data_construction/Readme.md
@@ -18,3 +18,248 @@ src/aixpert/config/config.yaml
 Loaded dynamically using:
 ```python
 utils/config_loader.load_config()
+```
+## Project Structure
+src/aixpert/
+│
+├── config/
+│   └── config.yaml                  # All paths + hyperparameters
+│
+├── data_construction/
+│   ├── stage_1_extraction/          # Train/Eval/Test extraction
+│   ├── stage_2_conversion/          # Preference-pair creation
+│   ├── stage_3_factuality/          # Binary factual scoring (LLM)
+│   ├── stage_4_transformation/      # Convert factual pairs → DPO format
+│   ├── stage_5_syntheticdata/       # Synthetic hallucination generation
+│   ├── stage_6_merging/             # Merge Skywork + synthetic
+│   ├── stage_7_balancing/           # Balanced train/eval dataset construction
+│   ├── stage_8_flipping/            # Flip (1,0) → (0,1) datasets
+│   └── utils/                       # Core shared utils
+│
+└── ...
+
+## Configuration Summary (`config.yaml`)
+
+### Model Settings
+- **model.name:** `gpt-4o-mini`
+- **model.temperature:** `0.8`
+
+---
+
+### Paths (All datasets + intermediate outputs)
+
+The configuration tracks every stage of the data pipeline, including:
+
+- Cleaned **train / eval / test** splits
+- **Preference pairs** (DPO-style)
+- **Factual-scored** outputs
+- **Synthetic inversion** samples (train + eval)
+- **Merged** intermediate datasets
+- **Balanced** final datasets
+- **Flipped** datasets for ablation
+
+**Examples:**
+```yaml
+skywork_train_cleaned: "src/.../skywork_extracted_77k.jsonl"
+skywork_train_pairs:   "src/.../skywork_preference_pairs_77k.jsonl"
+skywork_train_factual: "src/.../skywork_binary_factual_train.jsonl"
+final_train_out:       "src/.../train_balanced.jsonl"
+```
+
+## Pipeline Stages — Summary
+
+Below is a concise overview of all eight stages in the Skywork → Factual-DPO data pipeline.
+
+---
+
+### **Stage 1 — Skywork Extraction**
+**Scripts:**
+- `dataextraction_train.py`
+- `dataextraction_eval.py`
+- `dataextraction_test.py` (these samples are used directly in evaluation)
+
+**Tasks:**
+- Load slices from Skywork Preference dataset
+- Extract:
+  - **prompt** (first user message)
+  - **chosen** (assistant reply)
+  - **rejected** (assistant reply)
+- Remove exact duplicates
+- Save cleaned JSONL files
+
+---
+
+### **Stage 2 — Preference Pair Conversion**
+**Scripts:**
+- `dataconversion_train.py`
+- `dataconversion_eval.py`
+
+**Tasks:**
+- Convert `(prompt, chosen, rejected)` → **DPO-style preference pairs**
+- Produce:
+  - `response_0`, `response_1`
+  - `better_response_id`
+- Random symmetric assignment for unbiased supervision
+
+---
+
+### **Stage 3 — Binary Factuality Evaluation**
+**Scripts:**
+- `dataset_train.py`
+- `dataset_val.py`
+
+**Components:**
+Uses `utils.factual_utils` to evaluate factual correctness using **GPT-4o-mini**.
+
+**Outputs:**
+- Binary hallucination flags:
+  - `h0`, `h1` (aliases for `factual_flag_0`, `factual_flag_1`)
+
+**Features:**
+- Resume-safe incremental scoring
+- Async concurrency
+- Retry logic
+
+---
+
+### **Stage 4 — DPO Transformation**
+**Scripts:**
+- `data_transform_train.py`
+- `data_transform_val.py`
+
+**Tasks:**
+Transform factual-scored items into canonical DPO format:
+
+- `prompt`, `chosen`, `rejected`
+- `h_w`, `h_l`
+- `response_0`, `response_1`
+- `flipped=False`
+
+---
+
+### **Stage 5 — Synthetic Hallucination Generation**
+**Scripts:**
+- `data_synthetic_train.py`
+- `data_synthetic_val.py`
+
+**Tasks:**
+- Select samples where winner is factual (`h_w=0`) and loser is incorrect (`h_l=1`)
+- Use **GPT-4o-mini** to generate hallucinated corruptions
+- Build synthetic inversion pairs
+
+**Outputs:**
+- **10,000** synthetic train samples
+- **400** synthetic eval samples
+
+---
+
+### **Stage 6 — Merging**
+**Scripts:**
+- `data_merge_train.py`
+- `data_merge_val.py`
+
+**Tasks:**
+- Merge Skywork transformed data with synthetic inversion pairs
+- Bucket by `(h_w, h_l)`
+- Sample subsets
+- Shuffle and save merged datasets
+
+---
+
+### **Stage 7 — Balanced Dataset Construction**
+**Scripts:**
+- `data_final_train.py`
+- `build_final_eval.py`
+
+**Train Balancing:**
+Use `balance_targets` to create balanced buckets:
+
+- `(0,1)` — 10,000
+- `(1,0)` — 10,000
+- `(0,0)` — 15,000
+- `(1,1)` — 10,000
+
+**Eval Construction:**
+Combine:
+- Skywork eval transformed
+- 400 synthetic eval inversion samples
+- 1500 clean `(1,1)` samples (unused in train)
+- 1500 clean `(0,0)` samples (unused in train)
+
+---
+
+### **Stage 8 — Flipping (Optional)**
+**Scripts:**
+- `data_flipped_train.py`
+- `data_flipped_val.py`
+
+**Tasks:**
+- Flip all `(1,0)` samples → `(0,1)`
+- Swap `chosen` ↔ `rejected`
+- Produce alternate dataset for inversion or ablation studies
+
+---
+
+This structured overview provides a clear high-level map of the complete Factual-DPO data construction workflow.
+
+## Utilities Summary
+
+### `utils/config_loader.py`
+- Centralized configuration loader
+- All stages call `load_config()` to read `config.yaml`
+
+---
+
+### `utils/data_utils.py`
+Core data-processing helpers:
+- `extract_prompt()` — first user message
+- `extract_answer()` — first assistant reply
+- `filter_duplicates()` — removes exact matches
+- `create_preference_pairs()` — builds DPO response pairs
+- `bucket_by_flags()` — groups by (h_w, h_l)
+- `flip_sample()` — converts (1,0) → (0,1)
+- JSONL read/write utilities
+
+---
+
+### `utils/factual_utils.py`
+- Async binary factuality scoring using GPT-4o-mini
+- Concurrency + retry logic
+- Resume-safe checkpointing
+- Produces `h0`, `h1` hallucination flags
+
+---
+
+### `utils/dpo_transform_utils.py`
+- Converts factual-scored items into final DPO format:
+  - `prompt`, `chosen`, `rejected`, `h_w`, `h_l`, `response_0`, `response_1`, `flipped=False`
+
+---
+
+### `utils/synthetic_utils.py`
+- GPT-based corruption generator
+- Creates synthetic inversion pairs (`chosen` = hallucinated rewrite, `rejected` = original correct answer)
+
+---
+
+### `utils/prompt_templates.py`
+Provides all system/user prompts:
+- Strict factuality judge prompt
+- Hallucination corruption prompts
+
+---
+
+## Running the Pipeline
+
+Example sequence for **training pipeline**:
+
+```bash
+python src/aixpert/data_construction/stage_1_extraction/dataextraction_train.py
+python src/aixpert/data_construction/stage_2_conversion/dataconversion_train.py
+python src/aixpert/data_construction/stage_3_factuality/dataset_train.py
+python src/aixpert/data_construction/stage_4_transformation/data_transform_train.py
+python src/aixpert/data_construction/stage_5_syntheticdata/data_synthetic_train.py
+python src/aixpert/data_construction/stage_6_merging/data_merge_train.py
+python src/aixpert/data_construction/stage_7_final/data_final_train.py
+python src/aixpert/data_construction/stage_8_flipping/data_flipped_train.py
+```

From 0cd7a9ae50329ebd5b7cada8056a1efb188360d2 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v2.cluster.local>
Date: Thu, 4 Dec 2025 19:22:59 -0500
Subject: [PATCH 10/14] readme updates

---
 src/aixpert/data_construction/Readme.md | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/aixpert/data_construction/Readme.md b/src/aixpert/data_construction/Readme.md
index ca166f8..2b3fee7 100644
--- a/src/aixpert/data_construction/Readme.md
+++ b/src/aixpert/data_construction/Readme.md
@@ -19,25 +19,6 @@ Loaded dynamically using:
 ```python
 utils/config_loader.load_config()
 ```
-## Project Structure
-src/aixpert/
-│
-├── config/
-│   └── config.yaml                  # All paths + hyperparameters
-│
-├── data_construction/
-│   ├── stage_1_extraction/          # Train/Eval/Test extraction
-│   ├── stage_2_conversion/          # Preference-pair creation
-│   ├── stage_3_factuality/          # Binary factual scoring (LLM)
-│   ├── stage_4_transformation/      # Convert factual pairs → DPO format
-│   ├── stage_5_syntheticdata/       # Synthetic hallucination generation
-│   ├── stage_6_merging/             # Merge Skywork + synthetic
-│   ├── stage_7_balancing/           # Balanced train/eval dataset construction
-│   ├── stage_8_flipping/            # Flip (1,0) → (0,1) datasets
-│   └── utils/                       # Core shared utils
-│
-└── ...
-
 ## Configuration Summary (`config.yaml`)
 
 ### Model Settings

From ff117a7bef50f5ffbad07ba26414e26795ddc67f Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v3.cluster.local>
Date: Fri, 5 Dec 2025 15:42:27 -0500
Subject: [PATCH 11/14] Fix pip-audit workflow configuration

---
 .github/workflows/code_checks.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index a10d07f..5ad415d 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -55,5 +55,4 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           virtual-environment: .venv
-          additional-args: "--ignore PYSEC-2024-161"
-          strict: false
+          ignore-vulns: "PYSEC-2024-161"

From 968a82703d456f9a2c5b8c08afe28980ae75fa17 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v3.cluster.local>
Date: Fri, 5 Dec 2025 15:49:14 -0500
Subject: [PATCH 12/14] Fix pip-audit workflow configuration

---
 .github/workflows/code_checks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index 5ad415d..29333cd 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -55,4 +55,4 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           virtual-environment: .venv
-          ignore-vulns: "PYSEC-2024-161"
+          ignore-vulns: "PYSEC-2024-161, GHSA-gm62-xv2j-4w53, GHSA-2xpw-w6gg-jr37"

From 698f51cd898cad77db59a2732bec85667a85dc4d Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v3.cluster.local>
Date: Fri, 5 Dec 2025 15:59:13 -0500
Subject: [PATCH 13/14] Fix pip-audit workflow configuration

---
 .github/workflows/code_checks.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index 29333cd..648712e 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -55,4 +55,7 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           virtual-environment: .venv
-          ignore-vulns: "PYSEC-2024-161, GHSA-gm62-xv2j-4w53, GHSA-2xpw-w6gg-jr37"
+          ignore-vulns: |
+            PYSEC-2024-161
+            GHSA-gm62-xv2j-4w53
+            GHSA-2xpw-w6gg-jr37

From 06bbb1b965f23185119bd4a9350011595f0f77b9 Mon Sep 17 00:00:00 2001
From: sindchad <sindchad@v3.cluster.local>
Date: Mon, 8 Dec 2025 13:21:50 -0500
Subject: [PATCH 14/14] removed unnecessary print statements

---
 .../data_construction/stage_7_final/data_final_train.py       | 4 +---
 src/aixpert/data_construction/utils/dpo_transform_utils.py    | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/aixpert/data_construction/stage_7_final/data_final_train.py b/src/aixpert/data_construction/stage_7_final/data_final_train.py
index 7754d4d..f7b6cd2 100644
--- a/src/aixpert/data_construction/stage_7_final/data_final_train.py
+++ b/src/aixpert/data_construction/stage_7_final/data_final_train.py
@@ -38,7 +38,6 @@ def main() -> None:
     print(f"Loading → {input_path}")
     data = load_jsonl(input_path)
 
-    # Initialize buckets
     buckets: Dict[Tuple[int, int], List[Dict[str, Any]]] = {
         (0, 1): [],
         (1, 0): [],
@@ -58,7 +57,6 @@ def main() -> None:
 
     final_rows: List[Dict[str, Any]] = []
 
-    # Sampling logic
     for key, req_count in target_counts.items():
         pool = buckets[key]
         available = len(pool)
@@ -66,7 +64,7 @@ def main() -> None:
         print(f"\nBucket {key}: available={available}, required={req_count}")
 
         if available < req_count:
-            print("⚠️ Sampling WITH replacement.")
+            print("Sampling WITH replacement.")
             sampled = random.choices(pool, k=req_count)
         else:
             sampled = random.sample(pool, req_count)
diff --git a/src/aixpert/data_construction/utils/dpo_transform_utils.py b/src/aixpert/data_construction/utils/dpo_transform_utils.py
index fea34d9..98684f7 100644
--- a/src/aixpert/data_construction/utils/dpo_transform_utils.py
+++ b/src/aixpert/data_construction/utils/dpo_transform_utils.py
@@ -44,13 +44,13 @@ def transform_dataset(input_path: Path, output_path: Path) -> None:
     print(f"Loading → {input_path}")
     items = load_jsonl(input_path)
 
-    print(f"⚙️ Transforming {len(items)} items…")
+    print(f"Transforming {len(items)} items…")
     transformed = [process_item(it) for it in tqdm(items)]
 
     print(f"Saving → {output_path}")
     write_jsonl(output_path, transformed)
 
     print("\n=======================================")
-    print("✔ TRANSFORMATION COMPLETE")
+    print("TRANSFORMATION COMPLETE")
     print(f"Total items: {len(items)}")
     print("=======================================\n")