1+ import requests
2+ from html_to_markdown import convert_to_markdown
3+ import re
4+ from typing import List , Tuple
5+
def html_to_md(page_name):
    """Fetch a Wikipedia page as rendered HTML and convert it to Markdown.

    The page is requested from the English Wikipedia ``action=parse`` API.
    As a side effect the raw HTML is written to ``page.html`` and the
    converted Markdown to ``page.md`` in the current working directory
    (both UTF-8, overwritten on every call).

    Args:
        page_name: Title of the page to fetch. If empty (or ``None``), the
            previously hard-coded page ``"Pet door"`` is fetched so that
            existing callers passing ``""`` keep working.

    Returns:
        The page content converted to Markdown, as a string.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        KeyError: If the API response carries no ``parse`` result
            (for example because the page does not exist).
    """
    # Honour the argument; fall back to the old hard-coded title when empty.
    title = page_name or "Pet door"
    params = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "formatversion": 2,
        "redirects": 1,
    }
    r = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params=params,
        # NOTE(review): placeholder User-Agent -- replace with the real
        # application name and contact address before production use.
        headers={"User-Agent": "YourAppName/1.0 ([email protected] )"},
        timeout=30,
    )
    # Fail loudly on HTTP-level errors instead of on a missing key later.
    r.raise_for_status()

    payload = r.json()
    if "parse" not in payload:
        # The API is expected to describe failures in an "error" object;
        # include its message when present so the KeyError is actionable.
        error = payload.get("error")
        info = error.get("info") if isinstance(error, dict) else None
        raise KeyError(
            f"Wikipedia API returned no parse result for page {title!r}: "
            f"{info or 'no error details in response'}"
        )
    html = payload["parse"]["text"]

    # Keep a copy of the raw HTML next to the Markdown for inspection.
    out_path = "page.html"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(html)

    markdown = convert_to_markdown(html)
    md_pth = "page.md"
    with open(md_pth, "w", encoding="utf-8") as f:
        f.write(markdown)

    return markdown
26+
27+
28+ import re
29+ from typing import List , Tuple
30+
31+
32+ import re
33+ from typing import List , Tuple
34+
# Markdown link whose URL contains '/wiki/' (optional "title"); group 1 is the label.
_WIKI_LINK_RE = re.compile(
    r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)'
)
# Inline citation anchor: [[5]](#cite_note-5), [\[5\]](#cite_note-5),
# [[7]](#cite_note-Name-7), ... Bracket escapes are optional and any
# fragment after '#cite_note-' is accepted.
_CITATION_RE = re.compile(
    r'\[*\\?\[\s*\d+\s*\\?\]\s*\]\(\s*#cite_note-[^)]+\)'
)
# Line starting with '[[edit]', optionally preceded by a backslash escape.
_EDIT_LINE_RE = re.compile(r'\s*\\?\s*\[\[edit\]')
# "References" heading, ATX ('## References') or Setext (underlined) style.
_REFS_HEADING_RE = re.compile(
    r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)'
)
# Any heading, ATX or Setext; used to find where the References section ends.
_NEXT_HEADING_RE = re.compile(
    r'(?m)^(?:#{1,6}\s*\S.*$|[^\n]+\n[-=]{3,}\s*$)'
)
# A full line consisting only of a linked image: [<img ...>](...)
_IMAGE_LINE_RE = re.compile(
    r'^\s*\[\s*<img\b[^>]*>\s*\]\([^)]+\)\s*$', re.IGNORECASE
)
# Markdown table separator row such as '| --- | :---: |'.
_TABLE_SEP_RE = re.compile(
    r'^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$'
)


def _strip_wiki_links(text: str) -> str:
    """Replace '[label](.../wiki/...)' links with their plain label."""
    return _WIKI_LINK_RE.sub(r'\1', text)


def _remove_inline_citations(text: str) -> str:
    """Delete inline citation anchors that point at '#cite_note-...'."""
    return _CITATION_RE.sub('', text)


def _drop_edit_lines(text: str) -> str:
    """Remove every line that starts with an (optionally escaped) '[[edit]'."""
    return "\n".join(
        line for line in text.splitlines() if not _EDIT_LINE_RE.match(line)
    )


def _is_table_sep(line: str) -> bool:
    """Return True if *line* is a Markdown table separator row."""
    return bool(_TABLE_SEP_RE.match(line))


def _extract_references(text: str) -> Tuple[str, List[str]]:
    """Cut the References section out of *text*.

    Returns the text without that section and a list holding the cleaned
    section as one string (empty list if there is no such section or it
    is blank after cleaning).
    """
    heading = _REFS_HEADING_RE.search(text)
    if not heading:
        return text, []

    start = heading.start()
    # The section runs up to the next heading of any level, else to EOF.
    following = _NEXT_HEADING_RE.search(text, heading.end())
    end = following.start() if following else len(text)

    refs = _drop_edit_lines(text[start:end])
    refs = _strip_wiki_links(_remove_inline_citations(refs)).strip()
    return text[:start] + text[end:], ([refs] if refs else [])


def _extract_image_lines(text: str) -> Tuple[str, List[str]]:
    """Separate full '[<img ...>](...)' lines from the rest of *text*."""
    image_lines: List[str] = []
    remaining: List[str] = []
    for line in text.splitlines():
        if _IMAGE_LINE_RE.match(line):
            image_lines.append(line.rstrip())
        else:
            remaining.append(line)
    return "\n".join(remaining), image_lines


def _extract_tables(lines: List[str]) -> Tuple[List[str], set]:
    """Find Markdown table blocks in *lines*.

    Returns the table blocks and the set of line indexes they occupy.
    """
    tables: List[str] = []
    used: set = set()
    total = len(lines)
    i = 0
    while i < total:
        if "|" in lines[i]:
            # Look for a separator row within the next few pipe-containing
            # lines; the candidate header itself is never the separator.
            j = i + 1
            found_sep = False
            while j < total and (j - i) <= 5 and "|" in lines[j]:
                if _is_table_sep(lines[j]):
                    found_sep = True
                    break
                j += 1
            if found_sep:
                # The table body continues while lines still contain a pipe.
                end = j + 1
                while end < total and "|" in lines[end]:
                    end += 1
                block = "\n".join(lines[i:end]).strip()
                if block:
                    tables.append(block)
                used.update(range(i, end))
                i = end
                continue
        i += 1
    return tables, used


def _build_paragraphs(lines: List[str], skip: set) -> List[str]:
    """Group *lines* into paragraphs, breaking on blank and skipped lines."""
    paragraphs: List[str] = []
    buf: List[str] = []

    def flush() -> None:
        if buf:
            block = "\n".join(buf).strip()
            if block:
                paragraphs.append(block)
            buf.clear()

    for idx, line in enumerate(lines):
        if idx in skip or not line.strip():
            flush()
        else:
            buf.append(line)
    flush()

    # Drop blocks that still contain a stray table separator row
    # (e.g. a separator that was not preceded by a header line).
    return [
        block for block in paragraphs
        if not any(_is_table_sep(row) for row in block.splitlines())
    ]


def split_markdown_advanced(content: str) -> Tuple[List[str], List[str], List[str], List[str]]:
    """Split converted Wikipedia Markdown into images, tables, paragraphs and references.

    Processing order:
      1. The References section (from its heading up to the next heading
         or EOF) is cut out and cleaned.
      2. Lines consisting solely of '[<img ...>](...)' are captured verbatim
         (before any link rewriting) and removed from the text.
      3. In the remaining text, lines starting with '[[edit]' (optionally
         backslash-escaped) are dropped, inline citation anchors such as
         '[[5]](#cite_note-5)' are removed, and Markdown links whose URL
         contains '/wiki/' are replaced by their label.
      4. Markdown tables (a pipe line followed within a few lines by a
         separator row) are extracted as blocks.
      5. What is left is grouped into paragraphs separated by blank lines.

    Args:
        content: Markdown text to split.

    Returns:
        A tuple ``(image_lines, tables, paragraphs, references_blocks)``:
        - image_lines: the captured image lines (trailing whitespace removed).
        - tables: raw Markdown table blocks, lines joined with '\\n'.
        - paragraphs: remaining text blocks, lines joined with '\\n'.
        - references_blocks: a list with at most one element, the whole
          cleaned References section as a single string.
        All four lists are empty when *content* is empty.
    """
    if not content:
        return [], [], [], []

    # 1) References first, so the section does not leak into paragraphs.
    text, references_blocks = _extract_references(content)

    # 2) Image lines are taken before link stripping to keep them verbatim.
    text, image_lines = _extract_image_lines(text)

    # 3) Global clean-ups on the remaining text.
    text = _drop_edit_lines(text)
    text = _remove_inline_citations(text)
    text = _strip_wiki_links(text)

    # 4) Tables, then 5) paragraphs from whatever lines the tables left over.
    lines = text.splitlines()
    tables, table_line_idx = _extract_tables(lines)
    cleaned_paragraphs = _build_paragraphs(lines, table_line_idx)

    return image_lines, tables, cleaned_paragraphs, references_blocks
201+
# Script section: runs on import/execution of this file. Fetches the page
# (empty name passed to html_to_md), splits the resulting Markdown, and
# prints each extracted group on its own line.
md = html_to_md("")
(
    dedup_images,
    tables,
    cleaned_paragraphs,
    references,
) = split_markdown_advanced(md)

# One print call with a newline separator emits the same four lines as
# four consecutive print() calls.
print(dedup_images, tables, cleaned_paragraphs, references, sep="\n")
0 commit comments