-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
285 lines (236 loc) · 10.4 KB
/
preprocess.py
File metadata and controls
285 lines (236 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
from docx import Document
import mammoth
import textwrap
from html import unescape
from typing import Dict, Tuple, List, Any
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import mistune
# load file
def docx_to_html(path: str) -> str:
    """Read a .docx file and return its contents rendered as an HTML string.

    Tables and hyperlinks survive the conversion (handled by mammoth).
    Conversion warnings in ``result.messages`` are ignored.
    """
    with open(path, "rb") as handle:
        converted = mammoth.convert_to_html(handle)
    return converted.value
def split_plaintext_into_sections(plain_text: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> list:
    """
    Split plain text into sections:
      - Any text within <figure>...</figure> or <table>...</table> is kept as
        a separate section (including the tags).
      - The remaining plain text is split using LangChain's
        RecursiveCharacterTextSplitter.

    Args:
        plain_text: Text that may contain embedded <figure>/<table> HTML blocks.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        A list of section strings (text chunks and figure/table blocks), in
        document order.
    """
    # Matches whole <figure>...</figure> / <table>...</table> blocks
    # (non-greedy: stops at the first closing tag — nesting is not supported).
    pattern = re.compile(r'(<figure[\s\S]*?</figure>|<table[\s\S]*?</table>)', re.IGNORECASE)
    # Build the splitter once; it is invariant across all text segments
    # (previously it was re-constructed inside the loop for every segment).
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    sections = []

    def _add_text(segment: str) -> None:
        # Chunk a plain-text segment, keeping only non-blank chunks.
        segment = segment.strip()
        if segment:
            sections.extend(chunk for chunk in splitter.split_text(segment) if chunk.strip())

    last_end = 0
    for m in pattern.finditer(plain_text):
        # Plain text before this block is chunked normally.
        if m.start() > last_end:
            _add_text(plain_text[last_end:m.start()])
        # The matched figure/table becomes its own section.
        block = m.group(0).strip()
        if block:
            sections.append(block)
        last_end = m.end()
    # Any trailing plain text after the final block.
    if last_end < len(plain_text):
        _add_text(plain_text[last_end:])
    return sections
# split data
def split_html_into_paragraphs(html):
    """Extract meaningful <p> paragraphs and <table> blocks from an HTML string.

    Tables are temporarily replaced with placeholders so the paragraph regex
    does not pick up <p> tags inside table cells.  Paragraphs with no
    alphanumeric content are dropped.  Tables that do not sit inside a kept
    paragraph are appended at the end of the result (original behaviour).

    NOTE: the non-greedy table regex stops at the *first* </table>, so truly
    nested tables are not handled correctly.
    """
    table_placeholders = []

    def _stash_table(match):
        # Counter derived from the list length; no nonlocal needed.
        placeholder = f"__TABLE_PLACEHOLDER_{len(table_placeholders)}__"
        table_placeholders.append((placeholder, match.group(0)))
        return placeholder

    html_without_tables = re.sub(
        r'<table.*?>.*?</table>', _stash_table, html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # All <p>...</p> blocks in the table-free HTML.
    paragraphs = re.findall(r'<p.*?>.*?</p>', html_without_tables, re.DOTALL | re.IGNORECASE)

    meaningful_paragraphs = []
    restored = set()
    for paragraph in paragraphs:
        # Strip tags, decode entities, collapse whitespace — then check for
        # real content (at least one letter or digit).
        plain_text = re.sub(r'<[^>]+>', '', paragraph)
        plain_text = unescape(plain_text)
        plain_text = re.sub(r'\s+', ' ', plain_text).strip()
        if not re.search(r'[a-zA-Z0-9]', plain_text):
            continue
        # Bug fix: a table nested inside a <p> previously leaked its literal
        # "__TABLE_PLACEHOLDER_n__" text into the output while the table was
        # also appended at the end.  Restore the real table HTML in place.
        for placeholder, table_html in table_placeholders:
            if placeholder in paragraph:
                paragraph = paragraph.replace(placeholder, table_html)
                restored.add(placeholder)
        meaningful_paragraphs.append(paragraph)

    # Tables that were not re-inserted into a paragraph become trailing
    # sections of their own, in document order.
    for placeholder, table_html in table_placeholders:
        if placeholder not in restored:
            meaningful_paragraphs.append(table_html)
    return meaningful_paragraphs
# parse file
def extract_text_and_citations_per_citation_href(html: str) -> Dict[str, str]:
    """
    Find every <a href="...">...</a> link in *html* and map the tag-stripped
    text immediately preceding each link to that link's URL.

    Note: if two links share identical preceding text, the later URL wins
    (dict keys are unique).
    """
    link_re = re.compile(
        r'<a\s+[^>]*href="(?P<href>[^"]+)"[^>]*>.*?</a>',
        flags=re.IGNORECASE | re.DOTALL,
    )
    tag_re = re.compile(r'<[^>]+>')
    mapping: Dict[str, str] = {}
    cursor = 0
    for match in link_re.finditer(html):
        # Text between the previous link (or the start) and this link,
        # with any HTML tags removed.
        preceding = tag_re.sub('', html[cursor:match.start()]).strip()
        mapping[preceding] = match.group('href')
        cursor = match.end()
    return mapping
def extract_text_and_citations_per_section_href(text: str) -> dict:
    """
    Pull all citation URLs out of an HTML fragment and return the fragment's
    clean text alongside them.

    Returns:
        dict: {"text": clean_text, "citations": citations}
    """
    # Collect every href URL first, before the anchors are removed.
    citations = re.findall(r'<a\s+[^>]*href="([^"]+)"', text, flags=re.IGNORECASE)
    # Drop whole <a>...</a> elements so the link labels disappear too.
    without_links = re.sub(r'<a\s+[^>]*>.*?</a>', '', text, flags=re.IGNORECASE | re.DOTALL)
    # Replace remaining tags with spaces, then decode HTML entities.
    stripped = unescape(re.sub(r'<[^>]+>', ' ', without_links))
    # Collapse whitespace runs into single spaces.
    clean_text = re.sub(r'\s+', ' ', stripped).strip()
    return {"text": clean_text, "citations": citations}
def extract_text_and_citations_per_citation_source(html: str) -> Dict[str, str]:
    """
    Map each markdown-style ``[Source](URL)`` citation in *html* to its
    context: the clean text chunk immediately preceding that citation.

    HTML entities are decoded and tags stripped before matching, so contexts
    are plain text.  Identical contexts collapse to the last URL seen.
    """
    # Normalise: decode entities, drop tags, collapse whitespace.
    flattened = re.sub(r'<[^>]+>', ' ', unescape(html))
    flattened = re.sub(r'\s+', ' ', flattened).strip()

    link_pattern = re.compile(r'\[Source\]\((https?://[^\)]+)\)', flags=re.IGNORECASE)
    contexts: Dict[str, str] = {}
    previous_end = 0
    for match in link_pattern.finditer(flattened):
        # Context = text between the previous citation (or start) and this one.
        contexts[flattened[previous_end:match.start()].strip()] = match.group(1)
        previous_end = match.end()
    return contexts
# case I
def run_href(text: str):
    """Split *text* into paragraph sections and collect href-style citations.

    Returns a tuple ``(per_citation, per_section)``: per_citation holds the
    non-empty {preceding-text: url} dict for each section with links, and
    per_section holds the matching {"text", "citations"} dict for those same
    sections (sections with no links are skipped entirely).
    """
    # II. split file
    per_citation = []
    per_section = []
    for section in split_html_into_paragraphs(text):
        # III. parse file
        citation_map = extract_text_and_citations_per_citation_href(section)
        if citation_map:
            per_citation.append(citation_map)
            per_section.append(extract_text_and_citations_per_section_href(section))
    return per_citation, per_section
# case II
def run_source(text: str):
    """Split *text* into paragraph sections and collect [Source](URL) citations.

    Returns a list of non-empty {context: url} dicts, one per section that
    contained at least one markdown-style citation.
    """
    # II. split file, then III. parse each section, keeping non-empty results.
    parsed = (
        extract_text_and_citations_per_citation_source(section)
        for section in split_html_into_paragraphs(text)
    )
    return [citation_map for citation_map in parsed if citation_map]
def extract_text_images_tables_from_md(md_content: str) -> tuple:
    """
    Extract plain text, HTML, images, tables, and figures from a Markdown string.

    Returns a 5-tuple ``(plain_text, html_content, images, tables, figures)``:
      - plain_text: all text content, excluding figures, tables, and images
      - html_content: HTML rendering of the full markdown input
      - images: list of image URLs/paths (from markdown and HTML <img> tags)
      - tables: list of HTML tables (from both Markdown and raw HTML)
      - figures: list of <figure>...</figure> blocks (as strings)
    """
    # --- Build a copy of the input with figures/tables/images removed ---
    # Remove <figure>...</figure> blocks (non-greedy; nesting unsupported).
    no_figures = re.sub(r'<figure[\s\S]*?>[\s\S]*?</figure>', '', md_content, flags=re.IGNORECASE)
    # Remove Markdown tables (consecutive lines of |-delimited cells).
    no_tables = re.sub(r'(^\s*\|.*\|\s*$\n?)+', '', no_figures, flags=re.MULTILINE)
    # Remove raw HTML tables.
    no_tables = re.sub(r'<table[\s\S]*?>[\s\S]*?</table>', '', no_tables, flags=re.IGNORECASE)
    # Remove Markdown images ![alt](src).
    no_images = re.sub(r'!\[[^\]]*\]\(([^)]+)\)', '', no_tables)
    # Remove HTML images <img ...>.  Bug fix: the quote character class was
    # ["\"] which matches only double quotes, so single-quoted <img src='...'>
    # tags were extracted below but never removed from the text.  Now both
    # quote styles are handled, consistent with the extraction pattern.
    no_images = re.sub(r'<img [^>]*src=["\'][^"\']+["\'][^>]*>', '', no_images, flags=re.IGNORECASE)

    # --- Render the ORIGINAL markdown (figures/tables included) to HTML ---
    markdown = mistune.create_markdown(renderer=mistune.HTMLRenderer())
    html_content = markdown(md_content)

    # --- Collect tables and text chunks via a custom mistune renderer ---
    # NOTE(review): relies on mistune's HTMLRenderer exposing table() and
    # text() hooks; confirm against the installed mistune version.
    class Collector(mistune.HTMLRenderer):
        def __init__(self):
            super().__init__()
            self.tables = []       # HTML of each Markdown table rendered
            self.text_chunks = []  # every plain-text token encountered
        def table(self, header, body):
            table_html = f"<table>{header}{body}</table>"
            self.tables.append(table_html)
            return table_html
        def text(self, text):
            self.text_chunks.append(text)
            return super().text(text)
    collector = Collector()
    # Render the stripped markdown only to drive the collector; the rendered
    # string itself is discarded.
    mistune.create_markdown(renderer=collector)(no_images)
    plain_text = " ".join(collector.text_chunks).strip()

    # --- Extract figures, images, and raw HTML tables from the original ---
    figures = re.findall(r'<figure[\s\S]*?>[\s\S]*?</figure>', md_content, flags=re.IGNORECASE)
    md_images = re.findall(r'!\[[^\]]*\]\(([^)]+)\)', md_content)
    # Consistency fix: the old class ["\\\'] also treated a backslash as a
    # quote delimiter; only " and ' are accepted now.
    html_images = re.findall(r'<img [^>]*src=["\']([^"\']+)["\']', md_content, flags=re.IGNORECASE)
    images = md_images + html_images
    html_tables = re.findall(r'<table[\s\S]*?>[\s\S]*?</table>', md_content, flags=re.IGNORECASE)

    # Merge Markdown-rendered and raw HTML tables, avoiding duplicates.
    all_tables = collector.tables.copy()
    for tbl in html_tables:
        if tbl not in all_tables:
            all_tables.append(tbl)
    return plain_text, html_content, images, all_tables, figures