|
| 1 | +import requests |
| 2 | +from html_to_markdown import convert_to_markdown |
| 3 | +import re |
| 4 | +from typing import List, Tuple |
| 5 | +from dataclasses import dataclass, field |
| 6 | + |
# Section-type codes used by ArticleModel.add_to_section and recorded in
# ArticleModel.structure to preserve the article's reading order.
TITLE = 0       # heading line
TEXT = 1        # prose paragraph (stored as a TextFragment)
MEDIA = 2       # image, plus optional caption
TABULAR = 3     # markdown table
REFERENCES = 4  # entry from the References section
| 12 | + |
@dataclass
class TextFragment:
    """One paragraph of article text."""
    text: str  # the paragraph content (markdown, already cleaned)
    # NOTE(review): never populated anywhere in this file — presumably filled
    # by a later processing stage; confirm before relying on it.
    missing_info: List[str] = field(default_factory=list)
| 17 | + |
@dataclass
class ArticleModel:
    """Structured representation of a parsed article.

    Content is bucketed per type into separate lists; ``structure`` records
    the original reading order as ``(section_type, index)`` pairs, where
    ``index`` points into the list matching that section type.
    """

    titles: list[str] = field(default_factory=list)
    text: list[TextFragment] = field(default_factory=list)
    media: list[str] = field(default_factory=list)
    tabular: list[str] = field(default_factory=list)
    references: list[str] = field(default_factory=list)

    # Reading order: (section_type code, index into the matching list above).
    structure: List[Tuple[int, int]] = field(default_factory=list)

    def add_to_section(self, section_type: int, section_content: str) -> None:
        """Append *section_content* under *section_type* and record reading order.

        Args:
            section_type: One of TITLE, TEXT, MEDIA, TABULAR, REFERENCES.
            section_content: The raw section text to store.

        Raises:
            ValueError: If *section_type* is not a known section code.
        """
        # Use the named constants (not magic literals) so this stays in sync
        # with the module-level section codes.
        if section_type == TITLE:
            target = self.titles
            target.append(section_content)
        elif section_type == TEXT:
            # Text is wrapped so missing-info annotations can attach later.
            target = self.text
            target.append(TextFragment(text=section_content))
        elif section_type == MEDIA:
            target = self.media
            target.append(section_content)
        elif section_type == TABULAR:
            target = self.tabular
            target.append(section_content)
        elif section_type == REFERENCES:
            target = self.references
            target.append(section_content)
        else:
            # Previously an unknown code silently recorded (type, 0) in
            # ``structure`` — a dangling entry. Fail loudly instead.
            raise ValueError(f"unknown section type: {section_type}")
        self.structure.append((section_type, len(target) - 1))
| 46 | + |
def html_to_md(page_name: str) -> str:
    """Fetch a Wikipedia page via the MediaWiki API and convert it to markdown.

    Args:
        page_name: Title of the page (e.g. "Pet door"); redirects are followed.

    Returns:
        The page body HTML converted to markdown.

    Raises:
        requests.HTTPError: On a non-2xx API response.
        KeyError: If the API payload lacks the expected "parse"/"text" keys.
    """
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "text",
        "format": "json",
        "formatversion": 2,
        "redirects": 1,
    }

    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params=params,
        headers={"User-Agent": "YourAppName/1.0 ([email protected])"},
        timeout=30,
    )
    # Fail fast on HTTP errors; previously a 4xx/5xx surfaced as a confusing
    # KeyError when indexing the JSON payload below.
    response.raise_for_status()

    html = response.json()["parse"]["text"]
    return convert_to_markdown(html)
| 62 | + |
def create_article_model(md_content: str) -> ArticleModel:
    """Parse markdown produced by ``html_to_md`` into an ``ArticleModel``.

    The References section (if present) is split off and parsed first; the
    remaining lines are classified as titles, tables, images, or plain text,
    preserving reading order via ``ArticleModel.structure``.

    Args:
        md_content: Markdown text of a Wikipedia article; may be empty.

    Returns:
        A populated ArticleModel (empty when *md_content* is falsy).
    """
    # NOTE(review): currently unused — remove, or wire into parse_image().
    def html_image_to_markdown(html_img):
        """Convert a raw <img ...> tag to a markdown image, if it has a src."""
        src_match = re.search(r'src=["\']([^"\']+)["\']', html_img)
        alt_match = re.search(r'alt=["\']([^"\']*)["\']', html_img)

        if not src_match:
            return html_img

        src = src_match.group(1)
        alt = alt_match.group(1) if alt_match else ''

        # NOTE(review): reconstructed — the original return literal was lost
        # to markdown rendering in the diff; src/alt strongly imply this form.
        return f'![{alt}]({src})'

    def is_wiki_reference(text: str) -> bool:
        """True for numbered reference entries (``1. **[^...](#cite_ref-...)**``)."""
        pattern = r'^\d+\.\s+(\*\*\[\^.*?\]\(#cite_ref-.*?\)\*\*|\^\s+\[)'
        return bool(re.match(pattern, text.strip(), re.DOTALL))

    def strip_wiki_links(text: str) -> str:
        """Replace ``[label](/wiki/...)`` links with just their label."""
        wiki_link_pattern = re.compile(
            r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)'
        )
        return wiki_link_pattern.sub(r'\1', text)

    def remove_inline_citations(text: str) -> str:
        """Drop inline citation markers that link to ``#cite_note-...``."""
        citation_pattern = re.compile(
            r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)'
        )
        return citation_pattern.sub('', text)

    def is_table_row(text: str) -> bool:
        """True for a markdown table row or a separator/underline row."""
        pattern = r'^\|.*\|$|^[\|\s]*[-:]+[\|\s\-:]*$'
        return bool(re.match(pattern, text))

    def is_image(text: str) -> bool:
        """True when the line starts with a raw HTML <img> tag."""
        pattern = r'<img\s+[^>]*?src=["\'].*?["\'][^>]*?/?>'
        return bool(re.match(pattern, text))

    def remove_wiki_edit_links(text):
        """Drop Wikipedia section ``[[edit](...)]`` links."""
        pattern = r'\[\[edit\]\([^)]+\)\]'
        return re.sub(pattern, '', text)

    model = ArticleModel()
    if not md_content:
        return model

    # Clean the article first.
    content = strip_wiki_links(md_content)
    content = remove_inline_citations(content)
    content = content.replace("\\", "")  # strip leftover escape backslashes
    content = remove_wiki_edit_links(content)

    # Split off the References section: matches either an ATX heading
    # ("## References") or a setext heading ("References" underlined).
    refs_heading_pattern = re.compile(
        r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)'
    )
    refstart = refs_heading_pattern.search(content)
    if refstart is None:
        # Fix: previously crashed with AttributeError on articles without a
        # References heading.
        article_content = content
    else:
        for ref in content[refstart.start():].split("\n"):
            if is_wiki_reference(ref):
                model.add_to_section(REFERENCES, ref.strip())
        article_content = content[:refstart.start()]

    article_lines = article_content.split("\n")
    # NOTE(review): starts at 1, skipping article_lines[0] — presumably the
    # converter emits a leading blank line; confirm against real output.
    line_idx = 1
    article_end = len(article_lines) - 1

    def peek(idx):
        """Return the line after *idx*, or "" at the end of the article."""
        if idx >= article_end:
            return ""
        return article_lines[idx + 1]

    def parse_table(start_idx):
        """Consume consecutive table rows from *start_idx*; return the next index."""
        table_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        while idx <= article_end and is_table_row(article_lines[idx]):
            table_str += article_lines[idx] + "\n"
            idx += 1

        model.add_to_section(TABULAR, table_str)
        return idx

    def parse_image(start_idx):
        """Consume an <img> line plus an optional *italic* caption; return the next index."""
        image_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        # Fix: bound the blank-line scan — previously this could index past
        # the end of the article and raise IndexError.
        while idx <= article_end and article_lines[idx].strip() == "":
            idx += 1

        if idx <= article_end:
            caption = article_lines[idx]
            if caption.startswith("*") and caption.endswith("*"):
                image_str += caption + "\n"
                # Fix: only consume the line when it really is a caption;
                # previously a non-caption line here was silently skipped.
                idx += 1

        model.add_to_section(MEDIA, image_str)
        return idx

    while line_idx <= article_end:
        line = article_lines[line_idx]
        if not line or line.isspace():
            line_idx += 1
            continue

        if is_table_row(line):
            line_idx = parse_table(line_idx)
            continue
        if is_image(line):
            line_idx = parse_image(line_idx)
            continue

        # A line whose successor starts with "-" is a setext-style heading.
        if peek(line_idx).startswith("-"):
            model.add_to_section(TITLE, line)
            line_idx += 2  # skip the underline as well
            continue

        # Anything else is plain text.
        model.add_to_section(TEXT, line)
        line_idx += 1

    # (Removed leftover debug printing of model.tabular.)
    return model
| 186 | + |
| 187 | + |
# Example usage:
#
#   article_titles = ["Pet door", "Owner-occupancy"]
#   md = html_to_md(article_titles[1])
#   model = create_article_model(md)
| 192 | + |
0 commit comments