-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_lean.py
More file actions
131 lines (109 loc) · 4.74 KB
/
process_lean.py
File metadata and controls
131 lines (109 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import json
import re
import logging
from pathlib import Path
# Import configuration
from configs.config_loader import (
LEAN_SOURCE_FILE,
LEAN_OUTPUT_FILE,
LOG_FORMAT,
validate_config
)
# --- Logging Setup ---
# Configures basic logging to provide visibility into the script's operations.
logging.basicConfig(
level=logging.INFO,
format=LOG_FORMAT
)
logger = logging.getLogger(__name__)
def read_lean_file(filepath: Path) -> str:
"""Reads the contents of a Lean file."""
logger.info(f"Reading Lean file: {filepath}")
try:
with filepath.open('r', encoding='utf-8') as f:
return f.read()
except FileNotFoundError:
logger.error(f"Error: File not found at {filepath}")
raise
except Exception as e:
logger.error(f"Error reading file {filepath}: {e}")
raise
def process_lean_theorems(lean_content: str) -> list[dict]:
"""
Parses a string of Lean code to extract theorem and lemma statements.
This function uses regular expressions to find all declarations (like theorem,
lemma, def, etc.), extracts their names and full statements, and determines
if they have a complete proof or are left with `sorry`.
Solved proofs are replaced with `sorry` to create a uniform set of problems
for the language model.
Returns a list of dictionaries, where each dictionary represents a theorem.
"""
logger.info("Processing theorems from Lean content.")
theorems = []
# This regular expression is designed to find and capture blocks of code
# that define a theorem, lemma, definition, etc. It captures the full
# statement and the name of the declaration.
# It looks for a declaration keyword and captures until it finds the next
# one or the end of the file.
theorem_blocks_pattern = re.compile(
r'(?m)^' # Start of line (multiline mode)
r'(?P<full_statement>' # 'full_statement' group captures the entire declaration
r'(?:theorem|lemma|def|example|instance|abbrev)\s+' # Declaration keyword
r'(?P<name>[a-zA-Z0-9_.]+)\s*' # Declaration name (group 'name')
r'.*?)' # Non-greedy capture of any content until next declaration or end of file
r'(?=\n*(?:theorem|lemma|def|example|instance|abbrev)|\Z)', # Lookahead: until next declaration or end of file
re.DOTALL
)
for match in theorem_blocks_pattern.finditer(lean_content):
full_statement = match.group('full_statement').strip()
name = match.group('name')
# Detect proof styles:
# 1. Tactic blocks: `begin ... end`
# 2. Term-style proofs: `:= by <proof>`
is_solved = False
# Pattern for `begin ... end`
begin_end_pattern = re.compile(r'begin\s*(.*?)\s*end', re.DOTALL)
# Pattern for `by ...` (captures everything after `:= by` until EOL)
by_pattern = re.compile(r':=\s*by\s+(.*)', re.DOTALL)
if begin_end_pattern.search(full_statement):
proof_content = begin_end_pattern.search(full_statement).group(1).strip()
if proof_content and 'sorry' not in proof_content:
is_solved = True
# Replace any existing proof with placeholder
full_statement = begin_end_pattern.sub('begin\n sorry\nend', full_statement)
elif by_pattern.search(full_statement):
proof_content = by_pattern.search(full_statement).group(1).strip()
if proof_content and 'sorry' not in proof_content:
is_solved = True
# Replace proof with Lean 4 placeholder `by sorry`
full_statement = by_pattern.sub(':= by sorry', full_statement)
theorems.append({
'name': name,
'statement': full_statement,
'is_solved': is_solved
})
logger.info(f"Found {len(theorems)} theorems in total.")
return theorems
def main():
"""
Main function to execute the script.
It defines the input Lean file and the output JSON file, then orchestrates
the process of reading the Lean file, processing its content to extract
theorems, and writing the results to the JSON file.
"""
# Validate configuration
validate_config()
try:
# Read file contents
lean_content = read_lean_file(LEAN_SOURCE_FILE)
# Process theorems
theorems = process_lean_theorems(lean_content)
# Save result to JSON
with LEAN_OUTPUT_FILE.open('w', encoding='utf-8') as f:
json.dump(theorems, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully processed {len(theorems)} theorems and saved to {LEAN_OUTPUT_FILE}")
except Exception as e:
logger.error(f"An error occurred: {e}")
raise
if __name__ == "__main__":
main()