Skip to content

Commit 8bcf122

Browse files
Copilot and mvadari committed
Split XLS parsing into separate module and add CI validation workflow
Co-authored-by: mvadari <8029314+mvadari@users.noreply.github.com>
1 parent c227fc7 commit 8bcf122

File tree

4 files changed: +280 / -130 lines changed

.github/workflows/validate-xls.yml

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
name: Validate XLS Documents
2+
3+
on:
4+
push:
5+
branches: [ master, main ]
6+
pull_request:
7+
branches: [ master, main ]
8+
workflow_dispatch:
9+
10+
jobs:
11+
validate-xls:
12+
runs-on: ubuntu-latest
13+
name: Validate XLS Document Parsing
14+
15+
steps:
16+
- name: Checkout repository
17+
uses: actions/checkout@v4
18+
19+
- name: Setup Python
20+
uses: actions/setup-python@v4
21+
with:
22+
python-version: '3.11'
23+
24+
- name: Install dependencies
25+
run: |
26+
python -m pip install --upgrade pip
27+
pip install beautifulsoup4
28+
29+
- name: Validate XLS document parsing
30+
run: |
31+
echo "Running XLS document validation..."
32+
PYTHONPATH=site python site/xls_parser.py
33+
34+
- name: Report validation results
35+
if: always()
36+
run: |
37+
echo "XLS validation completed"
38+
echo "This pipeline validates that all XLS documents can be parsed correctly"
39+
echo "If this fails, it indicates issues with XLS document formatting or metadata"

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
11
.DS_Store
22
/.idea
33
_site/
4+
5+
# Python cache files
6+
__pycache__/
7+
*.pyc
8+
*.pyo

build_site.py

Lines changed: 36 additions & 130 deletions
Original file line number | Diff line number | Diff line change
@@ -5,106 +5,15 @@
55
"""
66

77
import os
8-
import re
9-
import json
8+
import sys
109
import shutil
1110
from pathlib import Path
12-
from dataclasses import dataclass, asdict
13-
from typing import List, Optional
1411
import markdown
15-
from bs4 import BeautifulSoup
1612
from jinja2 import Environment, FileSystemLoader
1713

18-
@dataclass
19-
class XLSDocument:
20-
"""Represents an XLS document with metadata."""
21-
number: str
22-
title: str
23-
description: str
24-
author: str
25-
folder: str
26-
filename: str
27-
status: str # draft, candidate, released, etc.
28-
29-
def to_dict(self):
30-
return asdict(self)
31-
32-
def extract_xls_metadata(content: str, folder_name: str) -> Optional[XLSDocument]:
33-
"""Extract metadata from XLS markdown content."""
34-
35-
# Initialize metadata with defaults
36-
metadata = {
37-
'title': 'Unknown Title',
38-
'description': '',
39-
'author': 'Unknown Author'
40-
}
41-
42-
# Parse HTML pre block for metadata
43-
soup = BeautifulSoup(content, 'html.parser')
44-
pre_block = soup.find('pre')
45-
46-
if pre_block:
47-
pre_text = pre_block.get_text()
48-
49-
# Extract metadata using various patterns
50-
patterns = {
51-
'title': [
52-
r'title:\s*<b>(.*?)</b>',
53-
r'Title:\s*<b>(.*?)</b>',
54-
r'title:\s*(.*?)(?:\n|$)',
55-
r'Title:\s*(.*?)(?:\n|$)'
56-
],
57-
'description': [
58-
r'description:\s*(.*?)(?:\n|$)',
59-
r'Description:\s*(.*?)(?:\n|$)'
60-
],
61-
'author': [
62-
r'author:\s*(.*?)(?:\n|$)',
63-
r'Author:\s*(.*?)(?:\n|$)'
64-
]
65-
}
66-
67-
for key, pattern_list in patterns.items():
68-
for pattern in pattern_list:
69-
match = re.search(pattern, pre_text, re.IGNORECASE | re.DOTALL)
70-
if match:
71-
value = match.group(1).strip()
72-
# Clean HTML tags from value
73-
value = BeautifulSoup(value, 'html.parser').get_text()
74-
metadata[key] = value
75-
break
76-
else:
77-
# Try to extract from first heading and content
78-
lines = content.split('\n')
79-
first_line = lines[0].strip() if lines else ''
80-
81-
# Try to extract title from first heading
82-
heading_match = re.match(r'^#\s*(.*)', first_line)
83-
if heading_match:
84-
metadata['title'] = heading_match.group(1).strip()
85-
86-
# For files without pre blocks, try to infer some info
87-
print(f"Warning: No metadata pre block found in {folder_name}, using fallback extraction")
88-
89-
# Extract XLS number from folder name
90-
xls_match = re.match(r'XLS-(\d+)([d]?)', folder_name)
91-
if xls_match:
92-
number = xls_match.group(1)
93-
is_draft = xls_match.group(2) == 'd'
94-
status = 'draft' if is_draft else 'released'
95-
else:
96-
number = '000'
97-
status = 'unknown'
98-
99-
return XLSDocument(
100-
number=number,
101-
title=metadata['title'],
102-
description=metadata['description'],
103-
author=metadata['author'],
104-
folder=folder_name,
105-
filename='README.md',
106-
status=status
107-
)
14+
# Add site directory to Python path for imports
15+
sys.path.append('site')
16+
from xls_parser import find_xls_documents
10817

10918
def convert_markdown_to_html(content: str) -> str:
11019
"""Convert markdown content to HTML."""
@@ -153,43 +62,40 @@ def build_site():
15362

15463
env = Environment(loader=FileSystemLoader(template_dir))
15564

156-
# Find all XLS documents
157-
xls_docs = []
158-
xls_folders = [d for d in root_dir.iterdir()
159-
if d.is_dir() and d.name.startswith('XLS-')]
65+
# Find and parse all XLS documents using the parser module
66+
xls_docs = find_xls_documents(root_dir)
16067

161-
for folder in xls_folders:
68+
# Generate HTML for each document
69+
for doc in xls_docs:
70+
folder = root_dir / doc.folder
16271
readme_path = folder / 'README.md'
163-
if readme_path.exists():
164-
try:
165-
with open(readme_path, 'r', encoding='utf-8') as f:
166-
content = f.read()
167-
168-
doc = extract_xls_metadata(content, folder.name)
169-
if doc:
170-
xls_docs.append(doc)
171-
172-
# Convert to HTML
173-
html_content = convert_markdown_to_html(content)
174-
175-
# Render XLS page
176-
xls_template = env.get_template('xls.html')
177-
rendered_html = xls_template.render(
178-
doc=doc,
179-
content=html_content,
180-
title=f"XLS-{doc.number}: {doc.title}",
181-
base_url=base_url
182-
)
183-
184-
# Write XLS HTML file
185-
output_path = site_dir / 'xls' / f"{folder.name}.html"
186-
with open(output_path, 'w', encoding='utf-8') as f:
187-
f.write(rendered_html)
188-
189-
print(f"Generated: {output_path}")
190-
191-
except Exception as e:
192-
print(f"Error processing {folder.name}: {e}")
72+
73+
try:
74+
with open(readme_path, 'r', encoding='utf-8') as f:
75+
content = f.read()
76+
77+
# Convert to HTML
78+
html_content = convert_markdown_to_html(content)
79+
80+
# Render XLS page
81+
xls_template = env.get_template('xls.html')
82+
rendered_html = xls_template.render(
83+
doc=doc,
84+
content=html_content,
85+
title=f"XLS-{doc.number}: {doc.title}",
86+
base_url=base_url
87+
)
88+
89+
# Write XLS HTML file
90+
output_path = site_dir / 'xls' / f"{doc.folder}.html"
91+
with open(output_path, 'w', encoding='utf-8') as f:
92+
f.write(rendered_html)
93+
94+
print(f"Generated: {output_path}")
95+
96+
except Exception as e:
97+
print(f"Error processing {doc.folder}: {e}")
98+
raise
19399

194100
# Sort documents by number in reverse order (later ones more relevant)
195101
xls_docs.sort(key=lambda x: int(x.number), reverse=True)

0 commit comments

Comments
 (0)