Skip to content

Commit 99b3ccc

Browse files
committed
Iterate over HTML files and clean them
1 parent d33a003 commit 99b3ccc

2 files changed

Lines changed: 36 additions & 2 deletions

File tree

autonima/retrieval/utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
from typing import Optional, Union, List, Set, Dict
77
from ..models.types import Study
8+
from bs4 import BeautifulSoup, Comment
89

910

1011
def _load_full_text(study: Study, text_path: str = None, output_dir: str = None) -> Optional[str]:
@@ -32,6 +33,9 @@ def _load_full_text(study: Study, text_path: str = None, output_dir: str = None)
3233
if full_text_file.suffix.lower() == '.txt':
3334
with open(full_text_file, 'r', encoding='utf-8') as f:
3435
return f.read()
36+
elif full_text_file.suffix.lower() == '.html':
37+
# Load HTML body text
38+
return _safe_clean_html(full_text_file.read_text(encoding='utf-8'))
3539
else:
3640
raise ValueError(f"Unsupported file format: {full_text_file.suffix}")
3741

@@ -111,7 +115,11 @@ def _map_pmids_to_text(
111115
if pmid_source in ['json', 'folder_name'] and not text_path_templates:
112116
raise ValueError("`text_path_templates` must be provided for 'json' and 'folder_name' pmid_source.")
113117

114-
iterator = root.iterdir()
118+
# For file_name option, recursively search all files
119+
if pmid_source == 'file_name':
120+
iterator = root.rglob('*')
121+
else:
122+
iterator = root.iterdir()
115123

116124
for item in iterator:
117125
pmid = None
@@ -163,4 +171,24 @@ def _map_pmids_to_text(
163171
if pmids_to_include is None or pmid in pmids_to_include:
164172
index[pmid] = text_file_path
165173

166-
return index
174+
return index
175+
176+
177+
def _safe_clean_html(html: str) -> str:
    """Return ``html`` re-serialized with non-text markup stripped out.

    The input is parsed with BeautifulSoup using the ``lxml`` parser and
    then pruned in three passes:

    1. ``script``, ``style``, ``noscript``, ``iframe``, ``svg`` and
       ``canvas`` elements are removed together with their contents.
    2. Comment nodes are removed.
    3. The ``style``, ``onclick``, ``class``, ``id``, ``aria-hidden`` and
       ``aria-label`` attributes are deleted from every remaining tag;
       the tags themselves and their text are kept.

    Args:
        html: Raw HTML markup.

    Returns:
        The pruned document serialized back to a string. This is still
        markup, not extracted plain text.
    """
    non_text_tags = ["script", "style", "noscript", "iframe", "svg", "canvas"]
    heavy_attributes = ("style", "onclick", "class", "id", "aria-hidden", "aria-label")

    document = BeautifulSoup(html, "lxml")

    # Pass 1: drop elements that carry no readable text.
    for element in document.find_all(non_text_tags):
        element.decompose()

    # Pass 2: drop comment nodes.
    def _is_comment(node) -> bool:
        return isinstance(node, Comment)

    for comment_node in document.find_all(string=_is_comment):
        comment_node.extract()

    # Pass 3: strip bulky attributes. The names are collected first so the
    # attribute dict is not mutated while it is being iterated.
    for element in document.find_all(True):
        doomed = [name for name in element.attrs if name in heavy_attributes]
        for name in doomed:
            del element[name]

    return str(document)

examples/sample_config.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ retrieval:
1515
sources:
1616
- pubget
1717
# full_text_sources:
18+
# # Example for HTML files where filename is the PMID (searches recursively)
19+
# - root_path: "/path/to/your/html/files"
20+
# pmid_source: "file_name"
21+
# allowed_extensions: [".html"]
22+
# # Example for folder-based structure with text files
1823
# - root_path: "/path/to/your/first/full/texts"
1924
# pmid_source: "folder_name" # or "json" or "file_name"
2025
# text_path_templates:
@@ -25,6 +30,7 @@ retrieval:
2530
# # json_pmid_key: "pmid"
2631
# # For pmid_source: "file_name", you can customize:
2732
# # allowed_extensions: [".txt", ".xml"]
33+
# # Example for another folder-based structure with JSON metadata
2834
# - root_path: "/path/to/your/second/full/texts"
2935
# pmid_source: "json"
3036
# text_path_templates:

0 commit comments

Comments
 (0)