|
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import os |
8 | | -import re |
9 | | -import json |
| 8 | +import sys |
10 | 9 | import shutil |
11 | 10 | from pathlib import Path |
12 | | -from dataclasses import dataclass, asdict |
13 | | -from typing import List, Optional |
14 | 11 | import markdown |
15 | | -from bs4 import BeautifulSoup |
16 | 12 | from jinja2 import Environment, FileSystemLoader |
17 | 13 |
|
18 | | -@dataclass |
19 | | -class XLSDocument: |
20 | | - """Represents an XLS document with metadata.""" |
21 | | - number: str |
22 | | - title: str |
23 | | - description: str |
24 | | - author: str |
25 | | - folder: str |
26 | | - filename: str |
27 | | - status: str # draft, candidate, released, etc. |
28 | | - |
29 | | - def to_dict(self): |
30 | | - return asdict(self) |
31 | | - |
32 | | -def extract_xls_metadata(content: str, folder_name: str) -> Optional[XLSDocument]: |
33 | | - """Extract metadata from XLS markdown content.""" |
34 | | - |
35 | | - # Initialize metadata with defaults |
36 | | - metadata = { |
37 | | - 'title': 'Unknown Title', |
38 | | - 'description': '', |
39 | | - 'author': 'Unknown Author' |
40 | | - } |
41 | | - |
42 | | - # Parse HTML pre block for metadata |
43 | | - soup = BeautifulSoup(content, 'html.parser') |
44 | | - pre_block = soup.find('pre') |
45 | | - |
46 | | - if pre_block: |
47 | | - pre_text = pre_block.get_text() |
48 | | - |
49 | | - # Extract metadata using various patterns |
50 | | - patterns = { |
51 | | - 'title': [ |
52 | | - r'title:\s*<b>(.*?)</b>', |
53 | | - r'Title:\s*<b>(.*?)</b>', |
54 | | - r'title:\s*(.*?)(?:\n|$)', |
55 | | - r'Title:\s*(.*?)(?:\n|$)' |
56 | | - ], |
57 | | - 'description': [ |
58 | | - r'description:\s*(.*?)(?:\n|$)', |
59 | | - r'Description:\s*(.*?)(?:\n|$)' |
60 | | - ], |
61 | | - 'author': [ |
62 | | - r'author:\s*(.*?)(?:\n|$)', |
63 | | - r'Author:\s*(.*?)(?:\n|$)' |
64 | | - ] |
65 | | - } |
66 | | - |
67 | | - for key, pattern_list in patterns.items(): |
68 | | - for pattern in pattern_list: |
69 | | - match = re.search(pattern, pre_text, re.IGNORECASE | re.DOTALL) |
70 | | - if match: |
71 | | - value = match.group(1).strip() |
72 | | - # Clean HTML tags from value |
73 | | - value = BeautifulSoup(value, 'html.parser').get_text() |
74 | | - metadata[key] = value |
75 | | - break |
76 | | - else: |
77 | | - # Try to extract from first heading and content |
78 | | - lines = content.split('\n') |
79 | | - first_line = lines[0].strip() if lines else '' |
80 | | - |
81 | | - # Try to extract title from first heading |
82 | | - heading_match = re.match(r'^#\s*(.*)', first_line) |
83 | | - if heading_match: |
84 | | - metadata['title'] = heading_match.group(1).strip() |
85 | | - |
86 | | - # For files without pre blocks, try to infer some info |
87 | | - print(f"Warning: No metadata pre block found in {folder_name}, using fallback extraction") |
88 | | - |
89 | | - # Extract XLS number from folder name |
90 | | - xls_match = re.match(r'XLS-(\d+)([d]?)', folder_name) |
91 | | - if xls_match: |
92 | | - number = xls_match.group(1) |
93 | | - is_draft = xls_match.group(2) == 'd' |
94 | | - status = 'draft' if is_draft else 'released' |
95 | | - else: |
96 | | - number = '000' |
97 | | - status = 'unknown' |
98 | | - |
99 | | - return XLSDocument( |
100 | | - number=number, |
101 | | - title=metadata['title'], |
102 | | - description=metadata['description'], |
103 | | - author=metadata['author'], |
104 | | - folder=folder_name, |
105 | | - filename='README.md', |
106 | | - status=status |
107 | | - ) |
| 14 | +# Add site directory to Python path for imports |
| 15 | +sys.path.append('site') |
| 16 | +from xls_parser import find_xls_documents |
108 | 17 |
|
109 | 18 | def convert_markdown_to_html(content: str) -> str: |
110 | 19 | """Convert markdown content to HTML.""" |
@@ -153,43 +62,40 @@ def build_site(): |
153 | 62 |
|
154 | 63 | env = Environment(loader=FileSystemLoader(template_dir)) |
155 | 64 |
|
156 | | - # Find all XLS documents |
157 | | - xls_docs = [] |
158 | | - xls_folders = [d for d in root_dir.iterdir() |
159 | | - if d.is_dir() and d.name.startswith('XLS-')] |
| 65 | + # Find and parse all XLS documents using the parser module |
| 66 | + xls_docs = find_xls_documents(root_dir) |
160 | 67 |
|
161 | | - for folder in xls_folders: |
| 68 | + # Generate HTML for each document |
| 69 | + for doc in xls_docs: |
| 70 | + folder = root_dir / doc.folder |
162 | 71 | readme_path = folder / 'README.md' |
163 | | - if readme_path.exists(): |
164 | | - try: |
165 | | - with open(readme_path, 'r', encoding='utf-8') as f: |
166 | | - content = f.read() |
167 | | - |
168 | | - doc = extract_xls_metadata(content, folder.name) |
169 | | - if doc: |
170 | | - xls_docs.append(doc) |
171 | | - |
172 | | - # Convert to HTML |
173 | | - html_content = convert_markdown_to_html(content) |
174 | | - |
175 | | - # Render XLS page |
176 | | - xls_template = env.get_template('xls.html') |
177 | | - rendered_html = xls_template.render( |
178 | | - doc=doc, |
179 | | - content=html_content, |
180 | | - title=f"XLS-{doc.number}: {doc.title}", |
181 | | - base_url=base_url |
182 | | - ) |
183 | | - |
184 | | - # Write XLS HTML file |
185 | | - output_path = site_dir / 'xls' / f"{folder.name}.html" |
186 | | - with open(output_path, 'w', encoding='utf-8') as f: |
187 | | - f.write(rendered_html) |
188 | | - |
189 | | - print(f"Generated: {output_path}") |
190 | | - |
191 | | - except Exception as e: |
192 | | - print(f"Error processing {folder.name}: {e}") |
| 72 | + |
| 73 | + try: |
| 74 | + with open(readme_path, 'r', encoding='utf-8') as f: |
| 75 | + content = f.read() |
| 76 | + |
| 77 | + # Convert to HTML |
| 78 | + html_content = convert_markdown_to_html(content) |
| 79 | + |
| 80 | + # Render XLS page |
| 81 | + xls_template = env.get_template('xls.html') |
| 82 | + rendered_html = xls_template.render( |
| 83 | + doc=doc, |
| 84 | + content=html_content, |
| 85 | + title=f"XLS-{doc.number}: {doc.title}", |
| 86 | + base_url=base_url |
| 87 | + ) |
| 88 | + |
| 89 | + # Write XLS HTML file |
| 90 | + output_path = site_dir / 'xls' / f"{doc.folder}.html" |
| 91 | + with open(output_path, 'w', encoding='utf-8') as f: |
| 92 | + f.write(rendered_html) |
| 93 | + |
| 94 | + print(f"Generated: {output_path}") |
| 95 | + |
| 96 | + except Exception as e: |
| 97 | + print(f"Error processing {doc.folder}: {e}") |
| 98 | + raise |
193 | 99 |
|
194 | 100 | # Sort documents by number in reverse order (later ones more relevant) |
195 | 101 | xls_docs.sort(key=lambda x: int(x.number), reverse=True) |
|
0 commit comments