|
| 1 | +import requests |
| 2 | +from html_to_markdown import convert_to_markdown |
| 3 | +import re |
| 4 | +from typing import List, Tuple |
| 5 | +from dataclasses import dataclass, field |
| 6 | + |
# Section-type codes used by ArticleModel.add_to_section and recorded in
# ArticleModel.structure to preserve the article's reading order.
TITLE = 0       # heading line
TEXT = 1        # prose paragraph (stored as a TextFragment)
MEDIA = 2       # image, plus optional caption
TABULAR = 3     # markdown table
REFERENCES = 4  # entry from the References section
| 12 | + |
@dataclass
class TextFragment:
    """One paragraph of article text."""
    text: str  # the paragraph content (markdown, already cleaned)
    # NOTE(review): never populated anywhere in this file — presumably filled
    # by a later processing stage; confirm before relying on it.
    missing_info: List[str] = field(default_factory=list)
| 17 | + |
@dataclass
class ArticleModel:
    """Structured representation of a parsed article.

    Content is bucketed per type into separate lists; ``structure`` records
    the original reading order as ``(section_type, index)`` pairs, where
    ``index`` points into the list matching that section type.
    """

    titles: list[str] = field(default_factory=list)
    text: list[TextFragment] = field(default_factory=list)
    media: list[str] = field(default_factory=list)
    tabular: list[str] = field(default_factory=list)
    references: list[str] = field(default_factory=list)

    # Reading order: (section_type code, index into the matching list above).
    structure: List[Tuple[int, int]] = field(default_factory=list)

    def add_to_section(self, section_type: int, section_content: str) -> None:
        """Append *section_content* under *section_type* and record reading order.

        Args:
            section_type: One of TITLE, TEXT, MEDIA, TABULAR, REFERENCES.
            section_content: The raw section text to store.

        Raises:
            ValueError: If *section_type* is not a known section code.
        """
        # Use the named constants (not magic literals) so this stays in sync
        # with the module-level section codes.
        if section_type == TITLE:
            target = self.titles
            target.append(section_content)
        elif section_type == TEXT:
            # Text is wrapped so missing-info annotations can attach later.
            target = self.text
            target.append(TextFragment(text=section_content))
        elif section_type == MEDIA:
            target = self.media
            target.append(section_content)
        elif section_type == TABULAR:
            target = self.tabular
            target.append(section_content)
        elif section_type == REFERENCES:
            target = self.references
            target.append(section_content)
        else:
            # Previously an unknown code silently recorded (type, 0) in
            # ``structure`` — a dangling entry. Fail loudly instead.
            raise ValueError(f"unknown section type: {section_type}")
        self.structure.append((section_type, len(target) - 1))
| 46 | + |
def html_to_md(page_name: str) -> str:
    """Fetch a Wikipedia page via the MediaWiki API and convert it to markdown.

    Args:
        page_name: Title of the page (e.g. "Pet door"); redirects are followed.

    Returns:
        The page body HTML converted to markdown.

    Raises:
        requests.HTTPError: On a non-2xx API response.
        KeyError: If the API payload lacks the expected "parse"/"text" keys.
    """
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "text",
        "format": "json",
        "formatversion": 2,
        "redirects": 1,
    }

    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params=params,
        headers={"User-Agent": "YourAppName/1.0 ([email protected])"},
        timeout=30,
    )
    # Fail fast on HTTP errors; previously a 4xx/5xx surfaced as a confusing
    # KeyError when indexing the JSON payload below.
    response.raise_for_status()

    html = response.json()["parse"]["text"]
    return convert_to_markdown(html)
| 62 | + |
def create_article_model(md_content: str) -> ArticleModel:
    """Parse markdown produced by ``html_to_md`` into an ``ArticleModel``.

    The References section (if present) is split off and parsed first; the
    remaining lines are classified as titles, tables, images, or plain text,
    preserving reading order via ``ArticleModel.structure``.

    Args:
        md_content: Markdown text of a Wikipedia article; may be empty.

    Returns:
        A populated ArticleModel (empty when *md_content* is falsy).
    """
    # NOTE(review): currently unused — remove, or wire into parse_image().
    def html_image_to_markdown(html_img):
        """Convert a raw <img ...> tag to a markdown image, if it has a src."""
        src_match = re.search(r'src=["\']([^"\']+)["\']', html_img)
        alt_match = re.search(r'alt=["\']([^"\']*)["\']', html_img)

        if not src_match:
            return html_img

        src = src_match.group(1)
        alt = alt_match.group(1) if alt_match else ''

        # NOTE(review): reconstructed — the original return literal was lost
        # to markdown rendering in the diff; src/alt strongly imply this form.
        return f'![{alt}]({src})'

    def is_wiki_reference(text: str) -> bool:
        """True for numbered reference entries (``1. **[^...](#cite_ref-...)**``)."""
        pattern = r'^\d+\.\s+(\*\*\[\^.*?\]\(#cite_ref-.*?\)\*\*|\^\s+\[)'
        return bool(re.match(pattern, text.strip(), re.DOTALL))

    def strip_wiki_links(text: str) -> str:
        """Replace ``[label](/wiki/...)`` links with just their label."""
        wiki_link_pattern = re.compile(
            r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)'
        )
        return wiki_link_pattern.sub(r'\1', text)

    def remove_inline_citations(text: str) -> str:
        """Drop inline citation markers that link to ``#cite_note-...``."""
        citation_pattern = re.compile(
            r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)'
        )
        return citation_pattern.sub('', text)

    def is_table_row(text: str) -> bool:
        """True for a markdown table row or a separator/underline row."""
        pattern = r'^\|.*\|$|^[\|\s]*[-:]+[\|\s\-:]*$'
        return bool(re.match(pattern, text))

    def is_image(text: str) -> bool:
        """True when the line starts with a raw HTML <img> tag."""
        pattern = r'<img\s+[^>]*?src=["\'].*?["\'][^>]*?/?>'
        return bool(re.match(pattern, text))

    def remove_wiki_edit_links(text):
        """Drop Wikipedia section ``[[edit](...)]`` links."""
        pattern = r'\[\[edit\]\([^)]+\)\]'
        return re.sub(pattern, '', text)

    model = ArticleModel()
    if not md_content:
        return model

    # Clean the article first.
    content = strip_wiki_links(md_content)
    content = remove_inline_citations(content)
    content = content.replace("\\", "")  # strip leftover escape backslashes
    content = remove_wiki_edit_links(content)

    # Split off the References section: matches either an ATX heading
    # ("## References") or a setext heading ("References" underlined).
    refs_heading_pattern = re.compile(
        r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)'
    )
    refstart = refs_heading_pattern.search(content)
    if refstart is None:
        # Fix: previously crashed with AttributeError on articles without a
        # References heading.
        article_content = content
    else:
        for ref in content[refstart.start():].split("\n"):
            if is_wiki_reference(ref):
                model.add_to_section(REFERENCES, ref.strip())
        article_content = content[:refstart.start()]

    article_lines = article_content.split("\n")
    # NOTE(review): starts at 1, skipping article_lines[0] — presumably the
    # converter emits a leading blank line; confirm against real output.
    line_idx = 1
    article_end = len(article_lines) - 1

    def peek(idx):
        """Return the line after *idx*, or "" at the end of the article."""
        if idx >= article_end:
            return ""
        return article_lines[idx + 1]

    def parse_table(start_idx):
        """Consume consecutive table rows from *start_idx*; return the next index."""
        table_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        while idx <= article_end and is_table_row(article_lines[idx]):
            table_str += article_lines[idx] + "\n"
            idx += 1

        model.add_to_section(TABULAR, table_str)
        return idx

    def parse_image(start_idx):
        """Consume an <img> line plus an optional *italic* caption; return the next index."""
        image_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        # Fix: bound the blank-line scan — previously this could index past
        # the end of the article and raise IndexError.
        while idx <= article_end and article_lines[idx].strip() == "":
            idx += 1

        if idx <= article_end:
            caption = article_lines[idx]
            if caption.startswith("*") and caption.endswith("*"):
                image_str += caption + "\n"
                # Fix: only consume the line when it really is a caption;
                # previously a non-caption line here was silently skipped.
                idx += 1

        model.add_to_section(MEDIA, image_str)
        return idx

    while line_idx <= article_end:
        line = article_lines[line_idx]
        if not line or line.isspace():
            line_idx += 1
            continue

        if is_table_row(line):
            line_idx = parse_table(line_idx)
            continue
        if is_image(line):
            line_idx = parse_image(line_idx)
            continue

        # A line whose successor starts with "-" is a setext-style heading.
        if peek(line_idx).startswith("-"):
            model.add_to_section(TITLE, line)
            line_idx += 2  # skip the underline as well
            continue

        # Anything else is plain text.
        model.add_to_section(TEXT, line)
        line_idx += 1

    # (Removed leftover debug printing of model.tabular.)
    return model
| 186 | + |
| 187 | + |
# Example usage:
#
#   article_titles = ["Pet door", "Owner-occupancy"]
#   md = html_to_md(article_titles[1])
#   model = create_article_model(md)
| 192 | + |
0 commit comments