55from pathlib import Path
66from typing import Optional , Union , List , Set , Dict
77from ..models .types import Study
8+ from bs4 import BeautifulSoup , Comment
89
910
1011def _load_full_text (study : Study , text_path : str = None , output_dir : str = None ) -> Optional [str ]:
@@ -32,6 +33,9 @@ def _load_full_text(study: Study, text_path: str = None, output_dir: str = None)
3233 if full_text_file .suffix .lower () == '.txt' :
3334 with open (full_text_file , 'r' , encoding = 'utf-8' ) as f :
3435 return f .read ()
36+ elif full_text_file .suffix .lower () == '.html' :
37+ # Load HTML body text
38+ return _safe_clean_html (full_text_file .read_text (encoding = 'utf-8' ))
3539 else :
3640 raise ValueError (f"Unsupported file format: { full_text_file .suffix } " )
3741
@@ -111,7 +115,11 @@ def _map_pmids_to_text(
111115 if pmid_source in ['json' , 'folder_name' ] and not text_path_templates :
112116 raise ValueError ("`text_path_templates` must be provided for 'json' and 'folder_name' pmid_source." )
113117
114- iterator = root .iterdir ()
118+ # For file_name option, recursively search all files
119+ if pmid_source == 'file_name' :
120+ iterator = root .rglob ('*' )
121+ else :
122+ iterator = root .iterdir ()
115123
116124 for item in iterator :
117125 pmid = None
@@ -163,4 +171,24 @@ def _map_pmids_to_text(
163171 if pmids_to_include is None or pmid in pmids_to_include :
164172 index [pmid ] = text_file_path
165173
166- return index
174+ return index
175+
176+
def _safe_clean_html(html: str) -> str:
    """Return *html* re-serialized with non-text elements and noisy attributes removed.

    The markup is parsed with BeautifulSoup's "lxml" parser, pruned in three
    passes, and converted back to a string with ``str()``. Tags themselves are
    kept, so the result is still markup rather than plain text.

    Args:
        html: Raw HTML source.

    Returns:
        The string form of the pruned document.
    """
    non_text_tags = ["script", "style", "noscript", "iframe", "svg", "canvas"]
    heavy_attrs = ("style", "onclick", "class", "id", "aria-hidden", "aria-label")

    document = BeautifulSoup(html, "lxml")

    # Pass 1: drop elements that carry no readable text, together with their
    # contents.
    for element in document.find_all(non_text_tags):
        element.decompose()

    # Pass 2: detach every comment node from the tree.
    for node in document.find_all(string=lambda text: isinstance(text, Comment)):
        node.extract()

    # Pass 3: remove the listed attributes from every remaining tag while
    # leaving the tag and its text in place. The matching names are collected
    # first so the attribute mapping is not mutated while being iterated.
    for element in document.find_all(True):
        present = [name for name in element.attrs if name in heavy_attrs]
        for name in present:
            del element[name]

    return str(document)
0 commit comments