Skip to content

Commit 1e531b9

Browse files
LittleorTheNetAdmin
authored and committed
feat: support counting '.tex' text without local compilation
1 parent dc352e0 commit 1e531b9

File tree

1 file changed

+188
-0
lines changed

1 file changed

+188
-0
lines changed

script/utils/count_tex_words.py

Lines changed: 188 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,188 @@
1+
import os
2+
import re
3+
import argparse
4+
from pathlib import Path
5+
6+
def clean_tex_content(content):
    r"""
    Remove LaTeX commands, math and comments to extract countable text.

    The cleanup is purely regex-based (no TeX parsing) and runs in order:

    1. Comments: an unescaped ``%`` up to the end of its line. ``\%`` is a
       literal percent sign and is kept.
    2. Line breaks: ``\\``, ``\\*`` and ``\\[len]`` become a space.
    3. Math: common math environments, ``\[...\]``, ``\(...\)`` and
       ``$$...$$`` (all may span lines) and ``$...$`` (single line only).
       ``\$`` is a literal dollar sign and never opens or closes math.
    4. Commands whose braced argument is not prose (``\cite``, ``\label``,
       ``\begin`` ...), including a starred form and optional ``[..]``
       arguments, are removed together with that argument.
    5. Every remaining command name and every brace becomes a space, so
       the argument text of e.g. ``\textbf{...}`` is kept.

    Args:
        content: Raw LaTeX source text.

    Returns:
        The text with the markup above removed. Escaped specials such as
        ``\%`` are left in place for the caller to treat as punctuation.
    """
    # 1. Remove comments. The lookbehind plus the captured run of backslash
    #    pairs makes sure the '%' itself is unescaped: '\%' is kept, while
    #    '\\%' (a line break followed by a comment) is still stripped.
    content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', content)

    # 2. Turn explicit line breaks into a space. Doing this before anything
    #    else keeps 'a\\b' from being read as the command '\b' and keeps
    #    '\\[2pt]' from being read as the start of display math.
    content = re.sub(r'\\\\\*?(?:\[[^\]]*\])?', ' ', content)

    # 3. Remove math that shouldn't be counted
    math_envs = ('equation|align|alignat|flalign|gather|multline|'
                 'eqnarray|displaymath|math')
    content = re.sub(
        r'\\begin\{(' + math_envs + r')(\*?)\}.*?\\end\{\1\2\}',
        '', content, flags=re.DOTALL)
    content = re.sub(r'\\\[.*?\\\]', '', content, flags=re.DOTALL)
    content = re.sub(r'\\\(.*?\\\)', '', content, flags=re.DOTALL)
    content = re.sub(r'(?<!\\)\$\$.*?\$\$', '', content, flags=re.DOTALL)
    # Inline math stays single-line so a stray '$' cannot swallow paragraphs.
    content = re.sub(r'(?<!\\)\$.*?(?<!\\)\$', '', content)

    # 4. Remove commands together with their (non-prose) argument
    ignored_commands = [
        'cite', 'ref', 'label', 'usepackage', 'input', 'include',
        'bibliography', 'bibliographystyle', 'documentclass', 'pagestyle',
        'thispagestyle', 'vskip', 'vspace', 'hspace', 'setlength',
        'setcounter', 'begin', 'end'
    ]
    # The '{' (or '*' / '[') must follow the name directly, so '\ref' does
    # not match longer command names that merely start with 'ref'.
    content = re.sub(
        r'\\(?:' + '|'.join(ignored_commands) + r')\*?(?:\[[^\]]*\])*\{[^}]*\}',
        '', content)

    # 5. Drop the remaining command names but keep their argument text
    content = re.sub(r'\\[a-zA-Z]+', ' ', content)
    content = content.replace('{', ' ').replace('}', ' ')

    return content
31+
32+
def count_words_in_content(content):
    """
    Count Chinese characters and English words in raw LaTeX text.

    The text is first run through ``clean_tex_content``. Every character in
    the range U+4E00-U+9FFF counts as one Chinese character. Those characters
    are then blanked out, punctuation is deleted, and each whitespace-separated
    token that remains counts as one English word.

    Returns:
        A ``(num_chinese, num_english)`` tuple.
    """
    text = clean_tex_content(content)

    # subn blanks out each Chinese character and reports how many it replaced.
    without_chinese, num_chinese = re.subn(r'[\u4e00-\u9fff]', ' ', text)

    # Whatever is neither a word character nor whitespace is punctuation.
    tokens = re.sub(r'[^\w\s]', '', without_chinese).split()

    return num_chinese, len(tokens)
46+
47+
def resolve_path(base_path, input_path):
    """
    Locate the file named by an input/include argument.

    ``.tex`` is appended when ``input_path`` does not already end with it.
    The name is tried relative to the current working directory first and
    then, when ``base_path`` is given, relative to the directory containing
    ``base_path``.

    Returns:
        The first candidate that exists. If none exists, the CWD-relative
        path is returned anyway so the caller can report the missing file.
    """
    tex_name = input_path if input_path.endswith('.tex') else input_path + '.tex'
    from_cwd = Path(tex_name)

    def candidates():
        # Lazy on purpose: base_path is only touched if the CWD lookup fails.
        yield from_cwd
        if base_path:
            yield base_path.parent / tex_name

    return next((path for path in candidates() if path.exists()), from_cwd)
67+
68+
def process_file(file_path, processed_files):
    r"""
    Recursively process a file and its inputs, returning a tree structure.

    Args:
        file_path: Path (or string) of the TeX file to count.
        processed_files: Set of resolved paths already visited. It is
            updated in place and is what stops include cycles; a file that
            is reached a second time is skipped.

    Returns:
        ``None`` when the file was already processed, otherwise a dict with
        the keys ``path``, ``cn``, ``en``, ``total`` and ``children`` (a list
        of nodes of the same shape, one per ``\input``/``\include``). The
        counts cover this file only, not its children. A file that is
        missing or unreadable yields a node with zero counts and an extra
        ``error`` key.
    """
    file_path = Path(file_path)
    resolved_path = file_path.resolve()

    # Avoid infinite loops
    if resolved_path in processed_files:
        return None

    if not file_path.exists():
        return {'path': str(file_path), 'cn': 0, 'en': 0, 'total': 0, 'error': 'File not found', 'children': []}

    processed_files.add(resolved_path)

    try:
        content = file_path.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding problems are reported on the node; anything
        # else is a programming error and should surface normally.
        return {'path': str(file_path), 'cn': 0, 'en': 0, 'total': 0, 'error': str(e), 'children': []}

    # Find inputs BEFORE cleaning, ignoring commented-out ones. An escaped
    # '\%' is a literal percent sign, not a comment, so it must not hide an
    # \input that follows it on the same line.
    content_no_comments = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', content)
    inputs = re.findall(r'\\(?:input|include)\{([^}]+)\}', content_no_comments)

    # Count in current file
    cn, en = count_words_in_content(content)

    node = {
        'path': str(file_path),
        'cn': cn,
        'en': en,
        'total': cn + en,
        'children': []
    }

    # Process inputs
    for inp in inputs:
        inp = inp.strip()
        if not inp:
            continue
        child_node = process_file(resolve_path(file_path, inp), processed_files)
        if child_node:
            node['children'].append(child_node)

    return node
113+
114+
def print_tree(node, prefix="", is_last=True, current_depth=0, max_depth=None):
    """
    Print ``node`` and its descendants as an indented tree.

    Args:
        node: Dict with the keys ``path``, ``cn``, ``en``, ``total`` and
            ``children``, plus an optional ``error``. A falsy node prints
            nothing.
        prefix: Text printed before this node's connector; it carries the
            vertical guides of the ancestors.
        is_last: Whether this node is the last child of its parent, which
            selects the connector and the guide passed to its children.
        current_depth: Depth of ``node`` in the tree.
        max_depth: Deepest level to print, or ``None`` for no limit.
    """
    if not node:
        return

    # Nothing below the limit is shown, including the node this call was
    # started on.
    if max_depth is not None and current_depth > max_depth:
        return

    connector = "└── " if is_last else "├── "

    stats = f"(CN: {node['cn']}, EN: {node['en']}, Total: {node['total']})"
    if 'error' in node:
        stats += f" [Error: {node['error']}]"

    print(f"{prefix}{connector}{node['path']} {stats}")

    # The guide must be as wide as a connector (4 columns), otherwise the
    # nested levels drift out of alignment.
    child_prefix = prefix + ("    " if is_last else "│   ")

    children = node['children']
    for i, child in enumerate(children):
        print_tree(child, child_prefix, i == len(children) - 1, current_depth + 1, max_depth)
139+
140+
def calculate_total_stats(node):
    """
    Sum the ``cn`` and ``en`` counts of ``node`` and all of its descendants.

    Returns:
        A ``(total_cn, total_en)`` tuple covering the whole subtree.
    """
    total_cn = 0
    total_en = 0

    # Walk the subtree with an explicit stack; visiting order does not
    # matter because the counts are simply added up.
    pending = [node]
    while pending:
        current = pending.pop()
        total_cn += current['cn']
        total_en += current['en']
        pending.extend(current['children'])

    return total_cn, total_en
153+
154+
def main(argv=None):
    """
    Command-line entry point: print the word-count tree of a LaTeX project.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process's command-line arguments.

    Raises:
        SystemExit: When the root file is found neither as given nor under
            ``body/graduate``; the message goes to stderr and the exit
            status is non-zero. argparse also raises it on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Count words in LaTeX project.")
    parser.add_argument('root_file', nargs='?', default='body/graduate/content.tex', help="Root TeX file to start counting from.")
    parser.add_argument('--max-depth', type=int, default=None, help="Maximum depth of the tree to display.")
    args = parser.parse_args(argv)

    root_file = Path(args.root_file)

    if not root_file.exists():
        # Try prepending body/graduate if user just gave filename
        alt_path = Path('body/graduate') / root_file
        if not alt_path.exists():
            raise SystemExit(f"Error: Root file {root_file} not found.")
        root_file = alt_path

    root_node = process_file(root_file, set())

    print("Word Count Tree Structure:")
    # Special handling for root to avoid the connector
    stats = f"(CN: {root_node['cn']}, EN: {root_node['en']}, Total: {root_node['total']})"
    if 'error' in root_node:
        stats += f" [Error: {root_node['error']}]"
    print(f"{root_node['path']} {stats}")

    # The root is depth 0 and its children depth 1, so a limit below 1
    # means only the root line is shown.
    if args.max_depth is None or args.max_depth >= 1:
        children = root_node['children']
        for i, child in enumerate(children):
            print_tree(child, "", i == len(children) - 1, current_depth=1, max_depth=args.max_depth)

    # The grand total always covers the whole tree; --max-depth only limits
    # what is displayed.
    total_cn, total_en = calculate_total_stats(root_node)
    print("-" * 60)
    print(f"GRAND TOTAL: CN: {total_cn}, EN: {total_en}, Total: {total_cn + total_en}")
186+
187+
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)