
Commit c51b909

committed
Create text_recon.py
add function to split article to process
1 parent c129b59 commit c51b909

File tree

1 file changed: +208 -0 lines changed


fastapi/app/ai/text_recon.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
import requests
from html_to_markdown import convert_to_markdown
import re
from typing import List, Tuple


def html_to_md(page_name):
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "text",
        "format": "json",
        "formatversion": 2,
        "redirects": 1
    }
    r = requests.get("https://en.wikipedia.org/w/api.php", params=params, headers={"User-Agent": "YourAppName/1.0 ([email protected])"}, timeout=30)
    html = r.json()["parse"]["text"]
    out_path = "page.html"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)
    markdown = convert_to_markdown(html)
    md_pth = "page.md"
    with open(md_pth, "w", encoding="utf-8") as f:
        f.write(markdown)

    return markdown
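
# Illustrative only: with the params above, the call issues a GET request roughly like
#   https://en.wikipedia.org/w/api.php?action=parse&page=Pet+door&prop=text&format=json&formatversion=2&redirects=1
# (shown here for page_name="Pet door").
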
def split_markdown_advanced(content: str) -> Tuple[List[str], List[str], List[str], List[str]]:
    """
    Returns:
    - image_lines: full lines like [<img ...>](...) captured verbatim
    - tables: raw Markdown table blocks
    - paragraphs: remaining paragraphs after filtering
    - references_blocks: a list with one element containing the entire References section as a single string
      (or empty if not found), with '/wiki/' links stripped to plain text and edit line removed

    Behaviors:
    - Remove any Markdown links whose URL contains '/wiki/' (preserve label).
    - Remove any line that starts with '[[edit]'.
    - Remove inline citation anchors like '[[5]](#cite_note-5)' or '[[12]](#cite_note-12)'.
    - Image lines that start with '[<img ...>](...)' are returned in image_lines and removed from paragraphs.
    - References section (from its heading to before the next heading or EOF) is extracted and returned as one string.
    """
    if not content:
        return [], [], [], []

    original = content

    # Utility: strip wiki links "[label](/wiki/...)" -> "label"
    wiki_link_pattern = re.compile(
        r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)'
    )

    def strip_wiki_links(text: str) -> str:
        return wiki_link_pattern.sub(r'\1', text)
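    # Illustrative: strip_wiki_links('see [Dog](/wiki/Dog "Dog") flap') -> 'see Dog flap'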

    # Utility: remove inline citation links of the form [[5]](#cite_note-5) or [[7]](#cite_note-7)
    # Be lenient about the fragment suffix: #cite_note-7, #cite_note-7-0, etc.
    citation_pattern = re.compile(
        r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)'
    )

    def remove_inline_citations(text: str) -> str:
        return citation_pattern.sub('', text)
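    # Illustrative: the pattern targets the backslash-escaped anchors the converter appears to
    # emit, e.g. remove_inline_citations('flap.[\[5]](#cite_note-5)') -> 'flap.'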

    # 1) Extract the References section first, so it can be returned verbatim (with requested cleanups).
    refs_heading_pattern = re.compile(
        r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)'
    )
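    # Accepts either an ATX heading ('## References' at any level) or the Setext form
    # ('References' underlined with three or more '-' or '=' characters).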
    references_blocks: List[str] = []
    text = original

    m = refs_heading_pattern.search(text)
    if m:
        start = m.start()
        # Find end: next heading (ATX or Setext) after start, else EOF
        next_heading_pattern = re.compile(
            r'(?m)^(?:#{1,6}\s*\S.*$|[^\n]+\n[-=]{3,}\s*$)'
        )
        next_m = next_heading_pattern.search(text, m.end())
        end = next_m.start() if next_m else len(text)
        refs_raw = text[start:end]

        # Inside references: remove the edit line and inline citation anchors, strip /wiki/ links
        refs_lines = []
        for line in refs_raw.splitlines():
            # Handle both the plain and the backslash-escaped edit marker
            if line.lstrip().startswith(('[[edit]', '\\[[edit]')):
                continue
            refs_lines.append(line)
        refs_clean = "\n".join(refs_lines)
        refs_clean = remove_inline_citations(refs_clean)
        refs_clean = strip_wiki_links(refs_clean)

        # Keep as a single string
        refs_clean = refs_clean.strip()
        if refs_clean:
            references_blocks.append(refs_clean)

        # Remove the references block from main text
        text = text[:start] + text[end:]

    # 2) Extract full image lines of the form [<img ...>](...) before any link rewriting,
    #    so they stay verbatim
    image_lines: List[str] = []
    remaining_lines = []
    img_line_pattern = re.compile(r'^\s*\[\s*<img\b[^>]*>\s*\]\([^)]+\)\s*$', re.IGNORECASE)
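    # Illustrative line this pattern captures (hypothetical values):
    #   [<img src="//upload.wikimedia.org/example.jpg" width="220">](https://en.wikipedia.org/wiki/File:Example.jpg)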
    for line in text.splitlines():
        if img_line_pattern.match(line):
            image_lines.append(line.rstrip())
            continue
        remaining_lines.append(line)
    text = "\n".join(remaining_lines)

    # 3) Global removals/rewrites on the remaining text
    # 3a) Remove any line that starts with '[[edit]' (plain or backslash-escaped)
    kept_lines = []
    for line in text.splitlines():
        if line.lstrip().startswith(('[[edit]', '\\[[edit]')):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # 3b) Remove inline citation anchors like [[5]](#cite_note-5)
    text = remove_inline_citations(text)

    # 3c) Strip /wiki/ links globally, preserving label
    text = strip_wiki_links(text)

    # 4) Extract Markdown tables as blocks
    lines = text.splitlines()
    tables: List[str] = []
    used_line_idx = set()

    def is_table_sep(line: str) -> bool:
        return bool(re.match(
            r'^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$',
            line
        ))
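    # Illustrative: is_table_sep('| --- | :---: |') -> True; is_table_sep('| a | b |') -> False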

    i = 0
    while i < len(lines):
        if "|" in lines[i]:
            j = i + 1
            found_sep = False
            # Look a few lines ahead for the header/body separator row
            while j < len(lines) and (j - i) <= 5 and "|" in lines[j]:
                if is_table_sep(lines[j]):
                    found_sep = True
                    break
                j += 1
            if found_sep:
                start = i
                end = j + 1
                # Extend the block while consecutive lines still look like table rows
                while end < len(lines) and "|" in lines[end]:
                    end += 1
                block = "\n".join(lines[start:end]).strip()
                if block:
                    tables.append(block)
                for idx in range(start, end):
                    used_line_idx.add(idx)
                i = end
                continue
        i += 1

    # 5) Build paragraphs from remaining lines (excluding table lines and blank separators)
    paragraphs: List[str] = []
    buf: List[str] = []

    def flush_buf():
        if buf:
            block = "\n".join(buf).strip()
            if block:
                paragraphs.append(block)
            buf.clear()

    for idx, line in enumerate(lines):
        if idx in used_line_idx:
            flush_buf()
            continue
        if line.strip() == "":
            flush_buf()
        else:
            buf.append(line)
    flush_buf()

    # Remove accidental table separators in paragraphs if any
    cleaned_paragraphs = []
    for block in paragraphs:
        if any(is_table_sep(l) for l in block.splitlines()):
            continue
        cleaned_paragraphs.append(block)

    return image_lines, tables, cleaned_paragraphs, references_blocks


if __name__ == "__main__":
    md = html_to_md("Pet door")
    dedup_images, tables, cleaned_paragraphs, references = split_markdown_advanced(md)

    print(dedup_images)
    print(tables)
    print(cleaned_paragraphs)
    print(references)
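
# Running this file directly (assumes the requests and html_to_markdown packages are installed
# and that network access to en.wikipedia.org is available, run from the repository root):
#   python fastapi/app/ai/text_recon.py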

0 commit comments
