-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreproc.py
More file actions
68 lines (53 loc) · 1.78 KB
/
preproc.py
File metadata and controls
68 lines (53 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from argparse import ArgumentParser
import json
import os
import re
# Name of the subfolder that preprocess_data_folder joins onto the directory
# of each .tex file to build the directory its figure paths are resolved in.
FIGURES_SUBDIR = 'images'
def extract_figures(tex_content):
    r"""Return the file argument of every ``\includegraphics`` command.

    An optional ``[...]`` options group directly after the command name is
    skipped. Arguments are returned verbatim, in order of appearance; an
    empty list is returned when nothing matches.
    """
    includegraphics = re.compile(r'\\includegraphics(?:\[[^\]]*\])?\{([^}]+)\}')
    return [hit.group(1) for hit in includegraphics.finditer(tex_content)]
def preprocess_tex_file(tex_path, figures_dir):
    r"""Read one LaTeX file and return its problem text and figure paths.

    The text between ``\begin{document}`` and ``\end{document}`` is used as
    the problem statement; when that pair is not found the whole file is
    used. Leading and trailing whitespace is stripped in both cases.

    Each ``\includegraphics`` argument found in the problem text is reduced
    to its last ``/``-separated component and joined onto ``figures_dir``.
    A name that already ends in ``.png``, ``.jpg`` or ``.jpeg`` (any case)
    is kept as is; any other name gets ``.jpg`` appended.

    Args:
        tex_path: Path of the UTF-8 encoded ``.tex`` file to read.
        figures_dir: Directory the figure file names are joined onto.

    Returns:
        A dict with keys ``'filename'`` (``tex_path`` unchanged),
        ``'problem'`` (the extracted text) and ``'figures'`` (list of figure
        paths). The figure paths are not checked for existence.
    """
    known_image_exts = ('.png', '.jpg', '.jpeg')

    with open(tex_path, 'r', encoding='utf-8') as f:
        content = f.read()

    match = re.search(r'\\begin\{document\}(.*?)\\end\{document\}', content, flags=re.DOTALL)
    if match:
        content = match.group(1)
    content = content.strip()

    figures_paths = []
    for fig in extract_figures(content):
        # LaTeX paths use forward slashes regardless of the host OS, so split
        # on "/" explicitly instead of relying on os.path.basename.
        fig_fname = fig.split("/")[-1]
        # Only append ".jpg" when the reference has no recognised raster
        # extension; otherwise "fig.jpg" would turn into "fig.jpg.jpg".
        if not fig_fname.lower().endswith(known_image_exts):
            fig_fname += ".jpg"
        figures_paths.append(os.path.join(figures_dir, fig_fname))

    return {
        'filename': tex_path,
        'problem': content,
        'figures': figures_paths,
    }
def preprocess_data_folder(data_dir):
    """Preprocess every ``.tex`` file found under ``data_dir``.

    The tree is walked recursively. For each file whose name ends in
    ``.tex``, ``preprocess_tex_file`` is called with the file's path and the
    ``FIGURES_SUBDIR`` folder located next to it.

    Directories and file names are visited in sorted order so that the
    returned list is the same on every run and platform, rather than
    following the filesystem's arbitrary listing order.

    Args:
        data_dir: Root directory to search.

    Returns:
        A list with one ``preprocess_tex_file`` result per ``.tex`` file;
        empty when no such file exists under ``data_dir``.
    """
    results = []
    for root, dirs, files in os.walk(data_dir):
        # os.walk (top-down) descends into `dirs` in the order left here, so
        # sorting in place fixes the traversal order.
        dirs.sort()
        figures_dir = os.path.join(root, FIGURES_SUBDIR)
        for fname in sorted(files):
            if not fname.endswith('.tex'):
                continue
            tex_path = os.path.join(root, fname)
            results.append(preprocess_tex_file(tex_path, figures_dir))
    return results
if __name__ == '__main__':
    # Script entry point: preprocess the folder given by --data_dir and
    # write the resulting list as indented JSON to --output_file.
    cli = ArgumentParser()
    cli.add_argument("--data_dir", type=str, default="data/2025")
    cli.add_argument("--output_file", type=str, default="ioaa_data.json")
    options = cli.parse_args()

    records = preprocess_data_folder(options.data_dir)

    with open(options.output_file, 'w', encoding='utf-8') as out:
        out.write(json.dumps(records, indent=2))