Skip to content

Commit 404d7eb

Browse files
committed
fix duplicate theorem count
1 parent 5ae3c42 commit 404d7eb

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

lean_dojo_v2/lean_dojo/data_extraction/dataset.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ def export_proofs(
167167
Returns:
168168
Total number of theorems exported
169169
"""
170-
total_theorems = 0
171-
170+
# Collect all unique theorems across all strategies
171+
# (all strategies contain the same theorems, just split differently)
172+
unique_theorems = set()
173+
172174
for strategy, split in splits.items():
173175
strategy_dir = dst_path / strategy
174176
strategy_dir.mkdir(parents=True)
@@ -178,6 +180,9 @@ def export_proofs(
178180
num_tactics = 0
179181

180182
for theorem in theorems:
183+
# Track unique theorems by their identity
184+
unique_theorems.add(id(theorem))
185+
181186
# Filter out tactics with "no goals" or containing "·"
182187
tactics = [
183188
{
@@ -220,8 +225,8 @@ def export_proofs(
220225
f"Saved {len(theorems)} theorems with {num_tactics} tactics "
221226
f"to {output_path}"
222227
)
223-
total_theorems += len(theorems)
224228

229+
total_theorems = len(unique_theorems)
225230
logger.info(f"Total theorems exported: {total_theorems}")
226231
return total_theorems
227232

0 commit comments

Comments
 (0)