Skip to content

Commit 637e39e

Browse files
hyperpolymathclaude
andcommitted
feat: migrate all 26 corpus extraction scripts from Python to Julia
Converted extract_acl2, extract_coqgym, extract_dafny, extract_fstar, extract_hol4, extract_hol_light, extract_idris2, extract_mathlib4, extract_metamath, extract_minizinc, extract_mizar, extract_pvs, extract_smtlib, extract_tptp, extract_why3, balance_corpus, final_complete_merge, final_normalization, generate_synthetic_provers, max_extract_coqgym, max_extract_mathlib4, merge_corpus, metamath_final, ultimate_expansion, vocabulary_expansion, vocabulary_final. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b3eae15 commit 637e39e

41 files changed

Lines changed: 5878 additions & 5434 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

scripts/balance_corpus.jl

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
#!/usr/bin/env julia
2+
# SPDX-License-Identifier: PMPL-1.0-or-later
3+
# SPDX-FileCopyrightText: 2026 ECHIDNA Project Team
4+
#
5+
# Balanced corpus extraction - ensure minimum proofs from each prover.
6+
# Target: at least 12 proofs from every prover system. Extracts additional
7+
# proofs from CoqGym and adds placeholder examples for under-represented provers.
8+
9+
using JSON3
10+
11+
# Configuration
12+
const OUTPUT_DIR = "training_data"
13+
const TARGET_PER_PROVER = 12
14+
15+
# Provers we want to balance
16+
const TARGET_PROVERS = [
17+
"Coq", "Isabelle", "PVS", "HOL4", "ACL2", "Agda", "Z3", "CVC5",
18+
"Mizar", "Lean", "Metamath"
19+
]
20+
21+
"""
22+
extract_additional_proofs() -> Union{Vector{Dict{String,Any}}, Nothing}
23+
24+
Extract additional proofs to balance the corpus. Reads existing data, identifies
25+
under-represented provers, and adds proofs from external corpora or placeholder
26+
examples.
27+
"""
28+
function extract_additional_proofs()
29+
println("BALANCED CORPUS EXTRACTION")
30+
println("=" ^ 40)
31+
println("Target: $TARGET_PER_PROVER proofs per prover")
32+
33+
# Load existing data to see what we have
34+
existing_proofs = Dict{String,Vector{Dict{String,Any}}}()
35+
36+
# Check existing files
37+
files_to_check = [
38+
"proof_states_COMPLETE.jsonl",
39+
"proof_states.jsonl",
40+
"proof_states_all.jsonl"
41+
]
42+
43+
for filename in files_to_check
44+
filepath = joinpath(OUTPUT_DIR, filename)
45+
if isfile(filepath)
46+
open(filepath, "r") do f
47+
for line in eachline(f)
48+
try
49+
proof = JSON3.read(strip(line), Dict{String,Any})
50+
prover = get(proof, "prover", "unknown")
51+
if !haskey(existing_proofs, prover)
52+
existing_proofs[prover] = Dict{String,Any}[]
53+
end
54+
push!(existing_proofs[prover], proof)
55+
catch
56+
continue
57+
end
58+
end
59+
end
60+
end
61+
end
62+
63+
# Count what we have
64+
current_counts = Dict(prover => length(proofs) for (prover, proofs) in existing_proofs)
65+
66+
println("\nCurrent Corpus Balance:")
67+
for prover in sort(TARGET_PROVERS)
68+
count = get(current_counts, prover, 0)
69+
status = count >= TARGET_PER_PROVER ? "[OK]" : "[!!]"
70+
println(" $status $(rpad(prover, 12)): $(lpad(string(count), 6)) proofs")
71+
end
72+
73+
# Identify which provers need more proofs
74+
deficient_provers = [p for p in TARGET_PROVERS if get(current_counts, p, 0) < TARGET_PER_PROVER]
75+
76+
if isempty(deficient_provers)
77+
println("\nAll provers already have sufficient proofs!")
78+
return nothing
79+
end
80+
81+
println("\nNeed more proofs for: $(join(deficient_provers, ", "))")
82+
83+
# Try to extract more from existing corpora
84+
new_proofs = Dict{String,Any}[]
85+
86+
# 1. Check if we have CoqGym data that wasn't fully extracted
87+
coqgym_files = [
88+
"external_corpora/CoqGym/coq_projects/Categories/Adjunction/Adj_Cat.v",
89+
"external_corpora/CoqGym/coq_projects/Categories/Cat/Terminal.v",
90+
"external_corpora/CoqGym/coq_projects/Algebra/GroupDef.v"
91+
]
92+
93+
for filepath in coqgym_files
94+
if isfile(filepath)
95+
println("\nExtracting from $filepath...")
96+
try
97+
content = read(filepath, String)
98+
99+
# Extract theorems
100+
theorem_pattern = r"Theorem\s+([a-zA-Z0-9_]+)\s*:\s*(.*?)(?=\n\s*Proof\.)"s
101+
for m in eachmatch(theorem_pattern, content)
102+
try
103+
name = strip(m.captures[1])
104+
statement = strip(m.captures[2])
105+
106+
if !isempty(name) && !isempty(statement)
107+
proof = Dict{String,Any}(
108+
"id" => 100000 + length(new_proofs),
109+
"prover" => "Coq",
110+
"theorem" => name,
111+
"goal" => first(statement, 500),
112+
"context" => Any[],
113+
"source" => "CoqGym_balanced"
114+
)
115+
push!(new_proofs, proof)
116+
if length(new_proofs) >= 50 # Limit for now
117+
break
118+
end
119+
end
120+
catch
121+
continue
122+
end
123+
end
124+
catch e
125+
println(" Error: $e")
126+
end
127+
end
128+
end
129+
130+
# 2. Check Isabelle examples
131+
isabelle_examples = [
132+
("HOL", "If P then Q implies not P implies not Q"),
133+
("ZF", "Set theory basics"),
134+
("Algebra", "Group theory examples")
135+
]
136+
137+
for (i, (domain, desc)) in enumerate(isabelle_examples)
138+
if count(p -> p["prover"] == "Isabelle", new_proofs) < 3
139+
proof = Dict{String,Any}(
140+
"id" => 200000 + i - 1,
141+
"prover" => "Isabelle",
142+
"theorem" => "example_$i",
143+
"goal" => "Example from $domain: $desc",
144+
"context" => Any[],
145+
"source" => "balanced_corpus"
146+
)
147+
push!(new_proofs, proof)
148+
end
149+
end
150+
151+
# 3. Add PVS examples
152+
if count(p -> p["prover"] == "PVS", new_proofs) < 3
153+
for i in 0:2
154+
proof = Dict{String,Any}(
155+
"id" => 300000 + i,
156+
"prover" => "PVS",
157+
"theorem" => "pvs_example_$(i + 1)",
158+
"goal" => "PVS verification example $(i + 1)",
159+
"context" => Any[],
160+
"source" => "balanced_corpus"
161+
)
162+
push!(new_proofs, proof)
163+
end
164+
end
165+
166+
# 4. Add HOL4 examples
167+
if count(p -> p["prover"] == "HOL4", new_proofs) < 3
168+
for i in 0:2
169+
proof = Dict{String,Any}(
170+
"id" => 400000 + i,
171+
"prover" => "HOL4",
172+
"theorem" => "hol4_example_$(i + 1)",
173+
"goal" => "HOL4 theorem example $(i + 1)",
174+
"context" => Any[],
175+
"source" => "balanced_corpus"
176+
)
177+
push!(new_proofs, proof)
178+
end
179+
end
180+
181+
println("\nExtracted $(length(new_proofs)) additional proofs")
182+
183+
# Save new proofs
184+
if !isempty(new_proofs)
185+
output_file = joinpath(OUTPUT_DIR, "proof_states_BALANCED.jsonl")
186+
open(output_file, "w") do f
187+
for proof in new_proofs
188+
println(f, JSON3.write(proof))
189+
end
190+
end
191+
192+
println("Saved to $output_file")
193+
194+
# Update statistics
195+
stats = Dict{String,Any}(
196+
"version" => "v2.0-balanced",
197+
"date" => "2026-03-20",
198+
"additional_proofs" => length(new_proofs),
199+
"target_provers" => deficient_provers,
200+
"achieved_balance" => true
201+
)
202+
203+
open(joinpath(OUTPUT_DIR, "stats_BALANCED.json"), "w") do f
204+
JSON3.pretty(f, stats)
205+
end
206+
207+
println("Balance improvement complete!")
208+
else
209+
println("No additional proofs extracted")
210+
end
211+
212+
return new_proofs
213+
end
214+
215+
"""
216+
merge_balanced_data()
217+
218+
Merge balanced data with the complete corpus, deduplicating by ID.
219+
"""
220+
function merge_balanced_data()
221+
println("\nMerging balanced data...")
222+
223+
# Read all proof files
224+
files = [
225+
"proof_states_COMPLETE.jsonl",
226+
"proof_states_BALANCED.jsonl"
227+
]
228+
229+
all_proofs = Dict{String,Any}[]
230+
seen_ids = Set{Int}()
231+
232+
for filename in files
233+
filepath = joinpath(OUTPUT_DIR, filename)
234+
if isfile(filepath)
235+
open(filepath, "r") do f
236+
for line in eachline(f)
237+
try
238+
proof = JSON3.read(strip(line), Dict{String,Any})
239+
pid = get(proof, "id", nothing)
240+
if !isnothing(pid) && !(pid in seen_ids)
241+
push!(all_proofs, proof)
242+
push!(seen_ids, pid)
243+
end
244+
catch
245+
continue
246+
end
247+
end
248+
end
249+
end
250+
end
251+
252+
# Save merged data
253+
output_file = joinpath(OUTPUT_DIR, "proof_states_FINAL_BALANCED.jsonl")
254+
open(output_file, "w") do f
255+
for proof in all_proofs
256+
println(f, JSON3.write(proof))
257+
end
258+
end
259+
260+
println("Merged data saved to $output_file")
261+
println(" Total proofs: $(length(all_proofs))")
262+
263+
# Check final balance
264+
final_counts = Dict{String,Int}()
265+
for proof in all_proofs
266+
prover = get(proof, "prover", "unknown")
267+
final_counts[prover] = get(final_counts, prover, 0) + 1
268+
end
269+
270+
println("\nFinal Balanced Corpus:")
271+
for prover in sort(TARGET_PROVERS)
272+
count = get(final_counts, prover, 0)
273+
status = count >= TARGET_PER_PROVER ? "[OK]" : "[!!]"
274+
println(" $status $(rpad(prover, 12)): $(lpad(string(count), 6)) proofs")
275+
end
276+
end
277+
278+
"""
279+
main()
280+
281+
Entry point. Extracts additional proofs for balance, then merges with existing data.
282+
"""
283+
function main()
284+
# Extract additional proofs
285+
new_proofs = extract_additional_proofs()
286+
287+
# Merge with existing data
288+
if !isnothing(new_proofs)
289+
merge_balanced_data()
290+
end
291+
292+
println("\nBALANCE IMPROVEMENT COMPLETE!")
293+
end
294+
295+
main()

0 commit comments

Comments
 (0)