|
| 1 | +import os |
| 2 | +import re |
| 3 | +import argparse |
| 4 | +from pathlib import Path |
| 5 | + |
def clean_tex_content(content):
    """
    Strip LaTeX markup from *content*, leaving only countable text.

    Removes comments, inline/display math, a fixed set of non-text
    commands together with their braced arguments, any remaining command
    names, and brace delimiters.

    Args:
        content: Raw LaTeX source as a string.

    Returns:
        The source with markup removed/replaced by whitespace; the caller
        tokenizes the result to count words.
    """
    # 1. Remove comments — but not escaped percent signs (\%), which are
    #    literal text (the original pattern stripped "\% off" too).
    content = re.sub(r'(?<!\\)%.*', '', content)

    # 2. Remove math environments so formulas are not counted as words.
    content = re.sub(r'\\\[.*?\\\]', '', content, flags=re.DOTALL)
    content = re.sub(r'\$\$.*?\$\$', '', content, flags=re.DOTALL)
    content = re.sub(r'\$.*?\$', '', content)

    # 3. Remove commands whose braced argument is metadata, not prose
    #    (citations, labels, file inclusion, spacing, ...).
    ignored_commands = [
        'cite', 'ref', 'label', 'usepackage', 'input', 'include',
        'bibliography', 'bibliographystyle', 'documentclass', 'pagestyle',
        'thispagestyle', 'vskip', 'vspace', 'hspace', 'setlength', 'setcounter'
    ]
    # Single alternation pass instead of one re.sub per command.
    cmd_pattern = r'\\(?:' + '|'.join(ignored_commands) + r')\{[^}]*\}'
    content = re.sub(cmd_pattern, '', content)

    # Drop remaining command names but keep their (textual) arguments,
    # e.g. "\textbf{bold}" contributes "bold".
    content = re.sub(r'\\[a-zA-Z]+', ' ', content)
    content = content.replace('{', ' ').replace('}', ' ')

    return content
| 31 | + |
def count_words_in_content(content):
    """
    Count Chinese characters and English words in a LaTeX string.

    Args:
        content: Raw LaTeX source (cleaned internally).

    Returns:
        A ``(num_chinese, num_english)`` tuple: CJK ideographs are counted
        per character, English text per whitespace-separated token.
    """
    cleaned = clean_tex_content(content)

    # CJK ideographs (U+4E00..U+9FFF) are counted one character at a time.
    num_chinese = sum(1 for ch in cleaned if '\u4e00' <= ch <= '\u9fff')

    # For English: blank out the Chinese characters, drop punctuation,
    # then count whitespace-separated tokens.
    ascii_text = re.sub(r'[\u4e00-\u9fff]', ' ', cleaned)
    without_punct = re.sub(r'[^\w\s]', '', ascii_text)
    num_english = len(without_punct.split())

    return num_chinese, num_english
| 46 | + |
def resolve_path(base_path, input_path):
    """
    Resolve a ``\\input``/``\\include`` target to a concrete file path.

    Mirrors LaTeX's lookup rule: ``.tex`` is assumed only when the path
    has no extension at all, so ``\\input{macros.def}`` is left untouched
    (the original code appended ``.tex`` to anything not already ending
    in ``.tex``, producing ``macros.def.tex``).

    Args:
        base_path: Path of the file containing the inclusion, or None.
        input_path: Raw path string from the inclusion argument.

    Returns:
        An existing Path when resolution succeeds; otherwise the
        CWD-relative candidate so the caller can report "File not found".
    """
    # Only assume .tex when no extension is present at all.
    if not Path(input_path).suffix:
        input_path += '.tex'

    # 1. Relative to the current working directory (project root) —
    #    this matches how latex resolves paths during a build.
    p_cwd = Path(input_path)
    if p_cwd.exists():
        return p_cwd

    # 2. Relative to the directory of the including file.
    if base_path:
        p_rel = Path(base_path).parent / input_path
        if p_rel.exists():
            return p_rel

    # Return the (non-existent) CWD candidate so the caller can report
    # the error with a sensible path.
    return p_cwd
| 67 | + |
def process_file(file_path, processed_files):
    """
    Walk a TeX file and everything it inputs, building a stats tree.

    Args:
        file_path: Path (or string) of the file to analyse.
        processed_files: Set of already-visited resolved paths, mutated
            in place to break ``\\input`` cycles.

    Returns:
        A dict node with keys ``path``, ``cn``, ``en``, ``total`` and
        ``children`` (plus ``error`` on failure), or None when the file
        was already visited on this walk.
    """
    file_path = Path(file_path)
    resolved = file_path.resolve()

    # A file already seen on this walk is skipped entirely, which both
    # avoids double counting and breaks inclusion cycles.
    if resolved in processed_files:
        return None

    def error_node(message):
        # Leaf node describing a file that could not be read.
        return {
            'path': str(file_path),
            'cn': 0,
            'en': 0,
            'total': 0,
            'error': message,
            'children': [],
        }

    if not file_path.exists():
        return error_node('File not found')

    processed_files.add(resolved)

    try:
        content = file_path.read_text(encoding='utf-8')
    except Exception as exc:
        return error_node(str(exc))

    # Collect \input/\include targets from the uncommented source so
    # commented-out inclusions are ignored.
    uncommented = re.sub(r'%.*', '', content)
    child_names = re.findall(r'\\(?:input|include)\{([^}]+)\}', uncommented)

    cn, en = count_words_in_content(content)
    node = {
        'path': str(file_path),
        'cn': cn,
        'en': en,
        'total': cn + en,
        'children': [],
    }

    # Recurse into each referenced file; None means "already counted".
    for name in child_names:
        child = process_file(resolve_path(file_path, name), processed_files)
        if child is not None:
            node['children'].append(child)

    return node
| 113 | + |
def print_tree(node, prefix="", is_last=True, current_depth=0, max_depth=None):
    """
    Pretty-print one stats node and, depth permitting, its subtree.

    Args:
        node: Stats dict produced by process_file (falsy values are
            silently ignored).
        prefix: Indentation string inherited from ancestor levels.
        is_last: Whether this node is its parent's final child; controls
            the branch glyph and the child rule.
        current_depth: Depth of *node* in the tree (root children are 1).
        max_depth: If given, children below this depth are not printed.
    """
    if not node:
        return

    branch = "└── " if is_last else "├── "

    summary = f"(CN: {node['cn']}, EN: {node['en']}, Total: {node['total']})"
    if 'error' in node:
        summary += f" [Error: {node['error']}]"

    print(f"{prefix}{branch}{node['path']} {summary}")

    # Honour the display depth limit before descending.
    if max_depth is not None and current_depth >= max_depth:
        return

    # A last child terminates the vertical rule; otherwise it continues.
    extension = "    " if is_last else "│   "
    kids = node['children']
    last_index = len(kids) - 1
    for index, kid in enumerate(kids):
        print_tree(kid, prefix + extension, index == last_index,
                   current_depth + 1, max_depth)
| 139 | + |
def calculate_total_stats(node):
    """
    Sum Chinese/English counts over *node* and all of its descendants.

    Args:
        node: Stats dict with ``cn``, ``en`` and ``children`` keys.

    Returns:
        Tuple ``(total_cn, total_en)`` for the whole subtree.
    """
    child_totals = [calculate_total_stats(child) for child in node['children']]
    cn_sum = node['cn'] + sum(cn for cn, _ in child_totals)
    en_sum = node['en'] + sum(en for _, en in child_totals)
    return cn_sum, en_sum
| 153 | + |
def main():
    """
    CLI entry point: parse arguments, walk the root TeX file, and print
    the per-file word-count tree followed by a grand total.
    """
    parser = argparse.ArgumentParser(description="Count words in LaTeX project.")
    parser.add_argument('root_file', nargs='?',
                        default='body/graduate/content.tex',
                        help="Root TeX file to start counting from.")
    parser.add_argument('--max-depth', type=int, default=None,
                        help="Maximum depth of the tree to display.")
    args = parser.parse_args()

    root_file = Path(args.root_file)
    if not root_file.exists():
        # Convenience fallback: accept a bare filename relative to the
        # default content directory.
        fallback = Path('body/graduate') / root_file
        if not fallback.exists():
            print(f"Error: Root file {root_file} not found.")
            return
        root_file = fallback

    visited = set()
    tree = process_file(root_file, visited)

    print("Word Count Tree Structure:")
    # The root line carries no branch connector, so it is printed
    # directly instead of going through print_tree.
    print(f"{tree['path']} (CN: {tree['cn']}, EN: {tree['en']}, Total: {tree['total']})")

    kids = tree['children']
    for index, kid in enumerate(kids):
        print_tree(kid, "", index == len(kids) - 1,
                   current_depth=1, max_depth=args.max_depth)

    grand_cn, grand_en = calculate_total_stats(tree)
    print("-" * 60)
    print(f"GRAND TOTAL: CN: {grand_cn}, EN: {grand_en}, Total: {grand_cn + grand_en}")

if __name__ == "__main__":
    main()