@@ -26,6 +26,8 @@ class XLSDocument:
2626 folder : str
2727 filename : str
2828 status : str # draft, final, stagnant, withdrawn, etc.
29+ category : str # amendment, community, protocol, etc.
30+ created : str # YYYY-MM-DD format
2931
3032 def to_dict (self ):
3133 return asdict (self )
@@ -54,53 +56,49 @@ def extract_xls_metadata(content: str, folder_name: str) -> Optional[XLSDocument
5456 print ("ERROR: No <pre> block found in content" )
5557 sys .exit (1 )
5658
57- # Extract metadata using various patterns
59+ # Extract metadata using standardized patterns (headers are now enforced by CI)
5860 patterns = {
59- "title" : [
60- r"[tT]itle:\s*<b>(.*?)</b>" ,
61- r"[tT]itle:\s*(.*?)(?:\n|$)" ,
62- ],
63- "description" : [
64- r"[dD]escription:\s*(.*?)(?:\n|$)" ,
65- ],
66- "authors" : [r"[aA]uthor:\s*(.*?)(?:\n|$)" ],
67- "status" : [r"[sS]tatus:\s*(.*?)(?:\n|$)" ],
61+ "title" : r"[tT]itle:\s*(.*?)(?:\n|$)" ,
62+ "description" : r"[dD]escription:\s*(.*?)(?:\n|$)" ,
63+ "authors" : r"[aA]uthor:\s*(.*?)(?:\n|$)" ,
64+ "status" : r"[sS]tatus:\s*(.*?)(?:\n|$)" ,
65+ "category" : r"[cC]ategory:\s*(.*?)(?:\n|$)" ,
66+ "created" : r"[cC]reated:\s*(.*?)(?:\n|$)" ,
6867 }
6968
70- for key , pattern_list in patterns .items ():
71- for pattern in pattern_list :
72- match = re .search (pattern , pre_text , re .IGNORECASE | re .DOTALL )
73- if match :
74- value = match .group (1 ).strip ()
75- # Clean HTML tags from value
76- if key == "authors" :
77- # Ensure authors are comma-separated
78- # Convert author to mailto or GitHub link if possible
79- def format_author (author ):
80- author = author .strip ()
81- # Email address
82- email_match = re .match (r"^(.*?)\s*<\s*([^>]+)\s*>$" , author )
83- if email_match :
84- name = email_match .group (1 ).strip ()
85- email = email_match .group (2 ).strip ()
86- return name , f'mailto:{ email } '
87- # GitHub username in parentheses
88- gh_match = re .match (r"^(.*?)\s*\(@([^)]+)\)$" , author )
89- if gh_match :
90- name = gh_match .group (1 ).strip ()
91- gh_user = gh_match .group (2 ).strip ()
92- return name , f'https://github.com/{ gh_user } '
93- # Just a name
94- return author , ""
95-
96- value = [
97- format_author (author )
98- for author in value .split ("," )
99- ]
100- else :
101- value = BeautifulSoup (value , "html.parser" ).get_text ().strip ()
102- metadata [key ] = value
103- break
def format_author(author: str) -> tuple:
    """Split one author entry into a ``(name, link)`` pair.

    Recognized forms, checked in this order:

    * ``Name <email>``   -> ``(name, "mailto:<email>")``
    * ``Name (@handle)`` -> ``(name, "https://github.com/<handle>")``
    * anything else      -> ``(stripped entry, "")``

    Args:
        author: A single author entry; surrounding whitespace is ignored.

    Returns:
        A 2-tuple of the display name and a link. The link is an empty
        string when the entry carries neither an email address nor a
        GitHub handle. The link never contains trailing whitespace.
    """
    author = author.strip()

    # Email address in angle brackets, e.g. "Jane Doe <jane@example.com>".
    email_match = re.match(r"^(.*?)\s*<\s*([^>]+)\s*>$", author)
    if email_match:
        name = email_match.group(1).strip()
        # [^>]+ is greedy and may swallow padding before ">", so strip it.
        email = email_match.group(2).strip()
        return name, f'mailto:{email}'

    # GitHub username in parentheses, e.g. "Jane Doe (@janedoe)".
    gh_match = re.match(r"^(.*?)\s*\(@([^)]+)\)$", author)
    if gh_match:
        name = gh_match.group(1).strip()
        gh_user = gh_match.group(2).strip()
        return name, f'https://github.com/{gh_user}'

    # Just a name: no link can be derived.
    return author, ""
86+
87+ for key , pattern in patterns .items ():
88+ match = re .search (pattern , pre_text , re .IGNORECASE | re .DOTALL )
89+ if match :
90+ value = match .group (1 ).strip ()
91+ # Clean HTML tags from value and process based on field type
92+ if key == "authors" :
93+ # Process comma-separated authors
94+ value = [
95+ format_author (author )
96+ for author in value .split ("," )
97+ ]
98+ else :
99+ # Clean HTML tags for other fields
100+ value = BeautifulSoup (value , "html.parser" ).get_text ().strip ()
101+ metadata [key ] = value
104102
105103 # Extract XLS number from folder name
106104 xls_match = re .match (r"XLS-(\d+)([d]?)" , folder_name )
@@ -111,12 +109,14 @@ def format_author(author):
111109
112110 return XLSDocument (
113111 number = number ,
114- title = metadata [ "title" ] ,
115- description = metadata [ "description" ] ,
116- authors = metadata [ "authors" ] ,
112+ title = metadata . get ( "title" , "Unknown Title" ) ,
113+ description = metadata . get ( "description" , "No description available" ) ,
114+ authors = metadata . get ( "authors" , [( "Unknown Author" , "" )]) ,
117115 folder = folder_name ,
118116 filename = "README.md" ,
119- status = metadata ["status" ],
117+ status = metadata .get ("status" , "Unknown" ),
118+ category = metadata .get ("category" , "Unknown" ),
119+ created = metadata .get ("created" , "Unknown" ),
120120 )
121121
122122
@@ -189,10 +189,22 @@ def validate_xls_documents(root_dir: Path) -> bool:
189189 validation_errors .append (
190190 f"Error: { doc .folder } is missing required title metadata"
191191 )
192- if not doc .authors or doc .authors == "Unknown Author" :
192+ if not doc .authors or doc .authors == [( "Unknown Author" , "" )] :
193193 validation_errors .append (
194194 f"Error: { doc .folder } is missing required authors metadata"
195195 )
196+ if not doc .status or doc .status == "Unknown" :
197+ validation_errors .append (
198+ f"Error: { doc .folder } is missing required status metadata"
199+ )
200+ if not doc .category or doc .category == "Unknown" :
201+ validation_errors .append (
202+ f"Error: { doc .folder } is missing required category metadata"
203+ )
204+ if not doc .created or doc .created == "Unknown" :
205+ validation_errors .append (
206+ f"Error: { doc .folder } is missing required created metadata"
207+ )
196208
197209 if validation_errors :
198210 for error in validation_errors :
0 commit comments