Iterate over HTML files and clean them

adelavega · adelavega · commit 99b3ccc39d8a · 2025-09-17T15:20:20.000-05:00
diff --git a/autonima/retrieval/utils.py b/autonima/retrieval/utils.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Optional, Union, List, Set, Dict
 from ..models.types import Study
+from bs4 import BeautifulSoup, Comment
 
 
 def _load_full_text(study: Study, text_path: str = None, output_dir: str = None) -> Optional[str]:
@@ -32,6 +33,9 @@ def _load_full_text(study: Study, text_path: str = None, output_dir: str = None)
                 if full_text_file.suffix.lower() == '.txt':
                     with open(full_text_file, 'r', encoding='utf-8') as f:
                         return f.read()
+                elif full_text_file.suffix.lower() == '.html':
+                    # Load the HTML and return its cleaned markup (tags are kept, not reduced to plain text)
+                    return _safe_clean_html(full_text_file.read_text(encoding='utf-8'))
                 else:
                     raise ValueError(f"Unsupported file format: {full_text_file.suffix}")
         
@@ -111,7 +115,11 @@ def _map_pmids_to_text(
     if pmid_source in ['json', 'folder_name'] and not text_path_templates:
         raise ValueError("`text_path_templates` must be provided for 'json' and 'folder_name' pmid_source.")
 
-    iterator = root.iterdir()
+    # For file_name option, recursively search all files
+    if pmid_source == 'file_name':
+        iterator = root.rglob('*')
+    else:
+        iterator = root.iterdir()
 
     for item in iterator:
         pmid = None
@@ -163,4 +171,24 @@ def _map_pmids_to_text(
             if pmids_to_include is None or pmid in pmids_to_include:
                 index[pmid] = text_file_path
 
-    return index
+    return index
+
+
+def _safe_clean_html(html: str) -> str:  # returns cleaned HTML markup, not plain text
+    soup = BeautifulSoup(html, "lxml")  # parse with the lxml backend
+
+    # 1. Drop elements that carry no readable text, together with their contents
+    for tag in soup(["script", "style", "noscript", "iframe", "svg", "canvas"]):
+        tag.decompose()
+
+    # 2. Remove HTML comment nodes
+    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
+        comment.extract()
+
+    # 3. Strip the listed attributes but keep the tags and their text
+    for tag in soup.find_all(True):  # True matches every tag
+        for attr in list(tag.attrs):  # iterate over a copy: attrs is mutated in the loop
+            if attr in ["style", "onclick", "class", "id", "aria-hidden", "aria-label"]:
+                del tag[attr]
+
+    return str(soup)  # serialize the modified tree back to an HTML string
diff --git a/examples/sample_config.yml b/examples/sample_config.yml
@@ -15,6 +15,11 @@ retrieval:
   sources:
     - pubget
   # full_text_sources:
+  #   # Example for HTML files where filename is the PMID (searches recursively)
+  #   - root_path: "/path/to/your/html/files"
+  #     pmid_source: "file_name"
+  #     allowed_extensions: [".html"]
+  #   # Example for folder-based structure with text files
   #   - root_path: "/path/to/your/first/full/texts"
   #     pmid_source: "folder_name"  # or "json" or "file_name"
   #     text_path_templates:
@@ -25,6 +30,7 @@ retrieval:
   #     # json_pmid_key: "pmid"
   #     # For pmid_source: "file_name", you can customize:
   #     # allowed_extensions: [".txt", ".xml"]
+  #   # Example for another folder-based structure with JSON metadata
   #   - root_path: "/path/to/your/second/full/texts"
   #     pmid_source: "json"
   #     text_path_templates: