Split XLS parsing into separate module and add CI validation workflow

Copilot · mvadari · Copilot · commit 8bcf12275487 · 2025-08-20T04:04:26.000Z
Co-authored-by: mvadari <8029314+mvadari@users.noreply.github.com>
diff --git a/.github/workflows/validate-xls.yml b/.github/workflows/validate-xls.yml
@@ -0,0 +1,39 @@
+name: Validate XLS Documents
+
+on:
+  push:
+    branches: [ master, main ]
+  pull_request:
+    branches: [ master, main ]
+  workflow_dispatch:
+
+jobs:
+  validate-xls:
+    runs-on: ubuntu-latest
+    name: Validate XLS Document Parsing
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install beautifulsoup4
+          
+      - name: Validate XLS document parsing
+        run: |
+          echo "Running XLS document validation..."
+          PYTHONPATH=site python site/xls_parser.py
+          
+      - name: Report validation results
+        if: always()
+        run: |
+          echo "XLS validation completed"
+          echo "This pipeline validates that all XLS documents can be parsed correctly"
+          echo "If this fails, it indicates issues with XLS document formatting or metadata"
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
 .DS_Store
 /.idea
 _site/
+
+# Python cache files
+__pycache__/
+*.pyc
+*.pyo
diff --git a/build_site.py b/build_site.py
@@ -5,106 +5,15 @@
 """
 
 import os
-import re
-import json
+import sys
 import shutil
 from pathlib import Path
-from dataclasses import dataclass, asdict
-from typing import List, Optional
 import markdown
-from bs4 import BeautifulSoup
 from jinja2 import Environment, FileSystemLoader
 
-@dataclass
-class XLSDocument:
-    """Represents an XLS document with metadata."""
-    number: str
-    title: str
-    description: str
-    author: str
-    folder: str
-    filename: str
-    status: str  # draft, candidate, released, etc.
-    
-    def to_dict(self):
-        return asdict(self)
-
-def extract_xls_metadata(content: str, folder_name: str) -> Optional[XLSDocument]:
-    """Extract metadata from XLS markdown content."""
-    
-    # Initialize metadata with defaults
-    metadata = {
-        'title': 'Unknown Title',
-        'description': '',
-        'author': 'Unknown Author'
-    }
-    
-    # Parse HTML pre block for metadata
-    soup = BeautifulSoup(content, 'html.parser')
-    pre_block = soup.find('pre')
-    
-    if pre_block:
-        pre_text = pre_block.get_text()
-        
-        # Extract metadata using various patterns
-        patterns = {
-            'title': [
-                r'title:\s*<b>(.*?)</b>',
-                r'Title:\s*<b>(.*?)</b>',
-                r'title:\s*(.*?)(?:\n|$)',
-                r'Title:\s*(.*?)(?:\n|$)'
-            ],
-            'description': [
-                r'description:\s*(.*?)(?:\n|$)',
-                r'Description:\s*(.*?)(?:\n|$)'
-            ],
-            'author': [
-                r'author:\s*(.*?)(?:\n|$)',
-                r'Author:\s*(.*?)(?:\n|$)'
-            ]
-        }
-        
-        for key, pattern_list in patterns.items():
-            for pattern in pattern_list:
-                match = re.search(pattern, pre_text, re.IGNORECASE | re.DOTALL)
-                if match:
-                    value = match.group(1).strip()
-                    # Clean HTML tags from value
-                    value = BeautifulSoup(value, 'html.parser').get_text()
-                    metadata[key] = value
-                    break
-    else:
-        # Try to extract from first heading and content
-        lines = content.split('\n')
-        first_line = lines[0].strip() if lines else ''
-        
-        # Try to extract title from first heading
-        heading_match = re.match(r'^#\s*(.*)', first_line)
-        if heading_match:
-            metadata['title'] = heading_match.group(1).strip()
-            
-        # For files without pre blocks, try to infer some info
-        print(f"Warning: No metadata pre block found in {folder_name}, using fallback extraction")
-    
-    # Extract XLS number from folder name
-    xls_match = re.match(r'XLS-(\d+)([d]?)', folder_name)
-    if xls_match:
-        number = xls_match.group(1)
-        is_draft = xls_match.group(2) == 'd'
-        status = 'draft' if is_draft else 'released'
-    else:
-        number = '000'
-        status = 'unknown'
-    
-    return XLSDocument(
-        number=number,
-        title=metadata['title'],
-        description=metadata['description'],
-        author=metadata['author'],
-        folder=folder_name,
-        filename='README.md',
-        status=status
-    )
+# Add site directory to Python path for imports
+sys.path.append('site')
+from xls_parser import find_xls_documents
 
 def convert_markdown_to_html(content: str) -> str:
     """Convert markdown content to HTML."""
@@ -153,43 +62,40 @@ def build_site():
     
     env = Environment(loader=FileSystemLoader(template_dir))
     
-    # Find all XLS documents
-    xls_docs = []
-    xls_folders = [d for d in root_dir.iterdir() 
-                   if d.is_dir() and d.name.startswith('XLS-')]
+    # Find and parse all XLS documents using the parser module
+    xls_docs = find_xls_documents(root_dir)
     
-    for folder in xls_folders:
+    # Generate HTML for each document
+    for doc in xls_docs:
+        folder = root_dir / doc.folder
         readme_path = folder / 'README.md'
-        if readme_path.exists():
-            try:
-                with open(readme_path, 'r', encoding='utf-8') as f:
-                    content = f.read()
-                
-                doc = extract_xls_metadata(content, folder.name)
-                if doc:
-                    xls_docs.append(doc)
-                    
-                    # Convert to HTML
-                    html_content = convert_markdown_to_html(content)
-                    
-                    # Render XLS page
-                    xls_template = env.get_template('xls.html')
-                    rendered_html = xls_template.render(
-                        doc=doc,
-                        content=html_content,
-                        title=f"XLS-{doc.number}: {doc.title}",
-                        base_url=base_url
-                    )
-                    
-                    # Write XLS HTML file
-                    output_path = site_dir / 'xls' / f"{folder.name}.html"
-                    with open(output_path, 'w', encoding='utf-8') as f:
-                        f.write(rendered_html)
-                    
-                    print(f"Generated: {output_path}")
-                    
-            except Exception as e:
-                print(f"Error processing {folder.name}: {e}")
+        
+        try:
+            with open(readme_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            
+            # Convert to HTML
+            html_content = convert_markdown_to_html(content)
+            
+            # Render XLS page
+            xls_template = env.get_template('xls.html')
+            rendered_html = xls_template.render(
+                doc=doc,
+                content=html_content,
+                title=f"XLS-{doc.number}: {doc.title}",
+                base_url=base_url
+            )
+            
+            # Write XLS HTML file
+            output_path = site_dir / 'xls' / f"{doc.folder}.html"
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(rendered_html)
+            
+            print(f"Generated: {output_path}")
+            
+        except Exception as e:
+            print(f"Error processing {doc.folder}: {e}")
+            raise
     
     # Sort documents by number in reverse order (later ones more relevant)
     xls_docs.sort(key=lambda x: int(x.number), reverse=True)
diff --git a/site/xls_parser.py b/site/xls_parser.py