Skip to content

Commit d475137

Browse files
committed
article deconstruction function works, added central backend data store, added ArticleModel model, among other things
1 parent 36eb23a commit d475137

File tree

6 files changed

+416
-217
lines changed

6 files changed

+416
-217
lines changed
160 Bytes
Binary file not shown.

backend_requirements.txt

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
# This file may be used to create an environment using:
2+
# $ conda create --name <env> --file <this file>
3+
# platform: osx-64
4+
aiohttp=3.8.6=pypi_0
5+
aiosignal=1.3.1=pypi_0
6+
allennlp=2.10.1=pypi_0
7+
allennlp-models=2.10.1=pypi_0
8+
annotated-types=0.5.0=pypi_0
9+
anyio=3.7.1=pyhd8ed1ab_0
10+
async-timeout=4.0.3=pypi_0
11+
asynctest=0.13.0=pypi_0
12+
attrs=24.2.0=pypi_0
13+
base58=2.1.1=pypi_0
14+
blis=0.2.4=pypi_0
15+
boto3=1.33.13=pypi_0
16+
botocore=1.33.13=pypi_0
17+
brotli-python=1.0.9=py37h0582d14_7
18+
ca-certificates=2025.8.3=hbd8a1cb_0
19+
cached-path=1.1.6=pypi_0
20+
cached-property=1.5.2=pypi_0
21+
cachetools=5.5.2=pypi_0
22+
certifi=2024.8.30=pyhd8ed1ab_0
23+
cffi=1.15.1=py37h7346b73_1
24+
charset-normalizer=3.4.1=pypi_0
25+
click=8.1.8=pypi_0
26+
colorama=0.4.6=pyhd8ed1ab_0
27+
commonmark=0.9.1=pypi_0
28+
conllu=4.4.2=pypi_0
29+
coreftools=1.1.1=pypi_0
30+
cymem=2.0.11=pypi_0
31+
dataclasses=0.8=pyhc8e2a94_3
32+
datasets=2.10.1=pypi_0
33+
dill=0.3.6=pypi_0
34+
distro=1.9.0=pypi_0
35+
docker-pycreds=0.4.0=pypi_0
36+
en-core-web-sm=2.1.0=pypi_0
37+
exceptiongroup=1.2.2=pypi_0
38+
fairscale=0.4.6=pypi_0
39+
fastapi=0.103.2=pyhd8ed1ab_0
40+
filelock=3.7.1=pypi_0
41+
freetype=2.13.3=h40dfd5c_0
42+
frozenlist=1.3.3=pypi_0
43+
fsspec=2023.1.0=pyhd8ed1ab_0
44+
ftfy=6.1.1=pypi_0
45+
gitdb=4.0.12=pypi_0
46+
gitpython=3.1.44=pypi_0
47+
google-api-core=2.24.2=pypi_0
48+
google-auth=2.38.0=pypi_0
49+
google-cloud-core=2.4.3=pypi_0
50+
google-cloud-storage=2.19.0=pypi_0
51+
google-crc32c=1.5.0=pypi_0
52+
google-resumable-media=2.7.2=pypi_0
53+
googleapis-common-protos=1.69.2=pypi_0
54+
h11=0.14.0=pypi_0
55+
h5py=3.8.0=pypi_0
56+
httpcore=0.17.3=pypi_0
57+
httpx=0.24.1=pypi_0
58+
huggingface-hub=0.10.1=pypi_0
59+
huggingface_hub=0.16.4=pyhd8ed1ab_0
60+
idna=3.10=pyhd8ed1ab_0
61+
importlib-metadata=6.7.0=pypi_0
62+
importlib_metadata=4.11.4=hd8ed1ab_0
63+
iniconfig=2.0.0=pypi_0
64+
jmespath=1.0.1=pypi_0
65+
joblib=1.3.2=pyhd8ed1ab_0
66+
jpeg=9e=hb7f2c08_3
67+
jsonnet=0.20.0=pypi_0
68+
jsonschema=2.6.0=pypi_0
69+
lcms2=2.14=h90f4b2a_0
70+
lerc=4.0.0=hb486fe8_0
71+
libblas=3.9.0=20_osx64_openblas
72+
libcblas=3.9.0=20_osx64_openblas
73+
libcxx=20.1.3=hf95d169_0
74+
libdeflate=1.14=hb7f2c08_0
75+
libffi=3.4.4=hecd8cb5_1
76+
libgfortran=14.2.0=hef36b68_105
77+
libgfortran5=14.2.0=h58528f3_105
78+
libhwloc=2.11.2=default_h4cdd727_1001
79+
libiconv=1.18=h4b5e92a_1
80+
liblapack=3.9.0=20_osx64_openblas
81+
liblzma=5.8.1=hd471939_0
82+
libopenblas=0.3.25=openmp_hfef2a42_0
83+
libpng=1.6.47=h3c4a55f_0
84+
libprotobuf=3.21.12=h7d26f99_2
85+
libsqlite=3.45.2=h92b6c6a_0
86+
libtiff=4.4.0=h6268bbc_5
87+
libwebp-base=1.5.0=h6cf52b4_0
88+
libxcb=1.13=h0d85af4_1004
89+
libxml2=2.13.7=h3fbc333_1
90+
libzlib=1.3.1=hd23fc13_2
91+
llvm-openmp=20.1.3=ha54dae1_0
92+
lmdb=1.6.2=pypi_0
93+
mkl=2022.2.1=h44ed08c_16952
94+
more-itertools=9.1.0=pypi_0
95+
multidict=6.0.5=pypi_0
96+
multiprocess=0.70.14=pypi_0
97+
murmurhash=1.0.12=pypi_0
98+
ncurses=6.4=hcec6c5f_0
99+
neuralcoref=4.0=pypi_0
100+
ninja=1.12.1=h3c5361c_0
101+
nltk=3.8.1=pyhd8ed1ab_0
102+
numpy=1.21.6=py37h345d48f_0
103+
ollama=0.6.7=cpu_h77ccaa4_
104+
openai=1.39.0=pypi_0
105+
openjpeg=2.5.0=h5d0d7b0_1
106+
openssl=3.5.3=h230baf5_1
107+
packaging=24.0=pypi_0
108+
pandas=1.3.5=pypi_0
109+
pathtools=0.1.2=pypi_0
110+
pillow=9.5.0=pypi_0
111+
pip=22.3.1=py37hecd8cb5_0
112+
plac=0.9.6=pypi_0
113+
pluggy=1.2.0=pypi_0
114+
preshed=2.0.1=pypi_0
115+
promise=2.3=pypi_0
116+
proto-plus=1.26.1=pypi_0
117+
protobuf=3.20.0=pypi_0
118+
psutil=7.0.0=pypi_0
119+
pthread-stubs=0.4=h00291cd_1002
120+
py-rouge=1.1=pypi_0
121+
pyarrow=12.0.1=pypi_0
122+
pyasn1=0.5.1=pypi_0
123+
pyasn1-modules=0.3.0=pypi_0
124+
pycparser=2.21=pyhd8ed1ab_0
125+
pydantic=2.5.3=pypi_0
126+
pydantic-core=2.14.6=pypi_0
127+
pydot=2.0.0=pypi_0
128+
pygments=2.17.2=pypi_0
129+
pyparsing=3.1.4=pypi_0
130+
pysocks=1.7.1=py37hf985489_5
131+
pytest=7.4.4=pypi_0
132+
python=3.7.12=hf3644f1_100_cpython
133+
python-dateutil=2.9.0.post0=pypi_0
134+
python_abi=3.7=4_cp37m
135+
pytorch=1.12.1=cpu_py37h3bab975_1
136+
pytz=2025.2=pypi_0
137+
pyyaml=6.0.1=pypi_0
138+
readline=8.2=hca72f7f_0
139+
regex=2024.4.16=pypi_0
140+
requests=2.31.0=pypi_0
141+
responses=0.18.0=pypi_0
142+
rich=12.6.0=pypi_0
143+
rsa=4.9=pypi_0
144+
s3transfer=0.8.2=pypi_0
145+
sacremoses=0.0.53=pyhd8ed1ab_0
146+
scikit-learn=1.0.2=py37h572704e_0
147+
scipy=1.7.3=py37h4e3cf02_0
148+
sentence-transformers=2.2.2=pyhd8ed1ab_0
149+
sentencepiece=0.2.0=pypi_0
150+
sentry-sdk=2.25.1=pypi_0
151+
setproctitle=1.3.3=pypi_0
152+
setuptools=65.6.3=py37hecd8cb5_0
153+
shellingham=1.5.4=pypi_0
154+
shortuuid=1.0.13=pypi_0
155+
six=1.17.0=pypi_0
156+
sleef=3.8=hfe0d17b_0
157+
smmap=5.0.2=pypi_0
158+
sniffio=1.3.1=pypi_0
159+
spacy=2.1.0=pypi_0
160+
sqlite=3.45.2=h7461747_0
161+
srsly=1.0.7=pypi_0
162+
stanford-openie=1.3.2=pypi_0
163+
stanfordcorenlp=3.9.1.1=pypi_0
164+
stanfordnlp=0.2.0=pypi_0
165+
starlette=0.27.0=pyhd8ed1ab_0
166+
tbb=2021.13.0=hb890de9_1
167+
tensorboardx=2.6.2.2=pypi_0
168+
termcolor=1.1.0=pypi_0
169+
thinc=7.0.8=pypi_0
170+
threadpoolctl=3.1.0=pyh8a188c0_0
171+
tk=8.6.13=h1abcd95_1
172+
tokenizers=0.12.1=pypi_0
173+
tomli=2.0.1=pypi_0
174+
torchvision=0.13.1=pypi_0
175+
tqdm=4.67.1=pyhd8ed1ab_0
176+
traitlets=5.9.0=pypi_0
177+
transformers=4.20.1=pypi_0
178+
typer=0.15.2=pypi_0
179+
typing-extensions=4.7.1=hd8ed1ab_0
180+
typing_extensions=4.7.1=pyha770c72_0
181+
urllib3=1.26.20=pypi_0
182+
uvicorn=0.19.0=py37hf985489_0
183+
wandb=0.12.21=pypi_0
184+
wasabi=0.10.1=pypi_0
185+
wcwidth=0.2.13=pypi_0
186+
wget=3.2=pypi_0
187+
wheel=0.38.4=py37hecd8cb5_0
188+
wikipedia-api=0.6.0=pyhd8ed1ab_0
189+
word2number=1.1=pypi_0
190+
xorg-libxau=1.0.12=h6e16a3a_0
191+
xorg-libxdmcp=1.1.5=h00291cd_0
192+
xxhash=3.5.0=pypi_0
193+
xz=5.6.4=h46256e1_1
194+
yaml=0.2.5=h0d85af4_2
195+
yarl=1.9.4=pypi_0
196+
zipp=3.15.0=pyhd8ed1ab_0
197+
zlib=1.3.1=hd23fc13_2
198+
zstd=1.5.7=h8210216_2

fastapi/app/ai/synthesis.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import requests
2+
from html_to_markdown import convert_to_markdown
3+
import re
4+
from typing import List, Tuple
5+
from dataclasses import dataclass, field
6+
7+
# Section-type codes: used as the first element of ArticleModel.structure
# entries and as the `section_type` argument to ArticleModel.add_to_section.
TITLE = 0        # section heading
TEXT = 1         # paragraph / body text
MEDIA = 2        # image (with optional caption)
TABULAR = 3      # markdown table
REFERENCES = 4   # citation / reference entry


@dataclass
class TextFragment:
    """A chunk of article body text plus notes on what it lacks."""
    text: str
    # Descriptions of information missing from this fragment; starts empty.
    missing_info: List[str] = field(default_factory=list)


@dataclass
class ArticleModel:
    """Central backend store for a deconstructed article.

    Content is bucketed by section type, while `structure` preserves the
    original document order as (section_type, index-into-bucket) pairs so
    the article can later be reassembled in order.
    """
    # NOTE: typing.List/Tuple rather than list[...]/tuple[...] — the backend
    # pins Python 3.7 (see backend_requirements.txt), where builtin-generic
    # annotations raise TypeError at class-creation time.
    titles: List[str] = field(default_factory=list)
    text: List[TextFragment] = field(default_factory=list)
    media: List[str] = field(default_factory=list)
    tabular: List[str] = field(default_factory=list)
    references: List[str] = field(default_factory=list)

    # Document-order index of every stored section.
    structure: List[Tuple[int, int]] = field(default_factory=list)

    def add_to_section(self, section_type: int, section_content: str) -> None:
        """Store `section_content` in the bucket for `section_type` and
        record its document-order position in `structure`.

        Args:
            section_type: One of TITLE, TEXT, MEDIA, TABULAR, REFERENCES.
            section_content: Raw section text. TEXT content is wrapped in a
                TextFragment; all other types are stored as plain strings.

        Raises:
            ValueError: for an unknown `section_type` (the original silently
                recorded a bogus (section_type, 0) structure entry instead).
        """
        if section_type == TEXT:
            # Body text gets the richer TextFragment wrapper.
            self.text.append(TextFragment(text=section_content))
            bucket = self.text
        elif section_type == TITLE:
            self.titles.append(section_content)
            bucket = self.titles
        elif section_type == MEDIA:
            self.media.append(section_content)
            bucket = self.media
        elif section_type == TABULAR:
            self.tabular.append(section_content)
            bucket = self.tabular
        elif section_type == REFERENCES:
            self.references.append(section_content)
            bucket = self.references
        else:
            raise ValueError(f"unknown section_type: {section_type!r}")
        self.structure.append((section_type, len(bucket) - 1))
46+
47+
def html_to_md(page_name, api_url="https://en.wikipedia.org/w/api.php"):
    """Fetch a wiki page's rendered HTML via the MediaWiki API and convert
    it to markdown.

    Args:
        page_name: Title of the page to fetch (redirects are followed).
        api_url: MediaWiki API endpoint; defaults to English Wikipedia.

    Returns:
        The page body converted to markdown.

    Raises:
        requests.HTTPError: if the API request fails at the HTTP level.
        KeyError: if the JSON response carries no parsed text (e.g. the
            page does not exist).
    """
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "text",
        "format": "json",
        "formatversion": 2,
        "redirects": 1,
    }

    r = requests.get(
        api_url,
        params=params,
        headers={"User-Agent": "YourAppName/1.0 ([email protected])"},
        timeout=30,
    )
    # Fix: fail fast on 4xx/5xx instead of surfacing an opaque KeyError
    # from the JSON lookup below.
    r.raise_for_status()
    html = r.json()["parse"]["text"]
    markdown = convert_to_markdown(html)

    return markdown
62+
63+
def create_article_model(md_content: str) -> ArticleModel:
    """Deconstruct a markdown article into an ArticleModel.

    The markdown is cleaned first (wiki links flattened, inline citations,
    ``[edit]`` links and stray backslashes removed), then the References
    section is split off and stored, and finally the remaining lines are
    classified as titles, text, media (images) or tables in document order.

    Args:
        md_content: Markdown produced from a wiki article (see html_to_md).

    Returns:
        A populated ArticleModel; empty if `md_content` is falsy.
    """

    def is_wiki_reference(text: str) -> bool:
        # Numbered citation entries, e.g. '1. **[^...](#cite_ref-...)**'
        # or '1. ^ [' style lines.
        pattern = r'^\d+\.\s+(\*\*\[\^.*?\]\(#cite_ref-.*?\)\*\*|\^\s+\[)'
        return bool(re.match(pattern, text.strip(), re.DOTALL))

    def strip_wiki_links(text: str) -> str:
        # Replace [label](…/wiki/… "tooltip") links with just their label.
        wiki_link_pattern = re.compile(
            r'\[([^\]]+)\]\(\s*(?:[^)\s]*?/wiki/[^)\s]*)(?:\s+"[^"]*")?\s*\)'
        )
        return wiki_link_pattern.sub(r'\1', text)

    def remove_inline_citations(text: str) -> str:
        # Drop [\[1\]](#cite_note-…) style inline citation markers.
        citation_pattern = re.compile(
            r'\[*\\\[\s*\d+\s*\]\s*\]\(\s*#cite_note-\d+(?:-[^)]+)?\s*\)'
        )
        return citation_pattern.sub('', text)

    def is_table_row(text: str) -> bool:
        # A pipe-delimited row, or a separator row of dashes/colons.
        return bool(re.match(r'^\|.*\|$|^[\|\s]*[-:]+[\|\s\-:]*$', text))

    def is_image(text: str) -> bool:
        return bool(re.match(r'<img\s+[^>]*?src=["\'].*?["\'][^>]*?/?>', text))

    def remove_wiki_edit_links(text):
        return re.sub(r'\[\[edit\]\([^)]+\)\]', '', text)

    model = ArticleModel()
    if not md_content:
        return model

    # Clean the article before any structural parsing.
    content = strip_wiki_links(md_content)
    content = remove_inline_citations(content)
    content = content.replace("\\", "")   # strip leftover escape characters
    content = remove_wiki_edit_links(content)

    # Locate the References heading (ATX '# References' or setext style).
    refs_heading_pattern = re.compile(
        r'(?m)^(?:#{1,6}\s*References\s*$|References\s*\n[-=]{3,}\s*$)'
    )
    refstart = refs_heading_pattern.search(content)
    # Fix: the original dereferenced `refstart` unconditionally and crashed
    # with AttributeError when an article has no References section; now the
    # whole document is treated as body text in that case.
    body_end = refstart.start() if refstart else len(content)

    if refstart:
        for ref in content[body_end:].split("\n"):
            if is_wiki_reference(ref):
                model.add_to_section(REFERENCES, ref.strip())

    # Process the rest of the article line by line.
    article_lines = content[:body_end].split("\n")
    # NOTE(review): parsing starts at index 1, skipping article_lines[0] —
    # presumably the page title line; confirm against html_to_md output.
    line_idx = 1
    article_end = len(article_lines) - 1

    def peek(idx):
        # Next line, or "" when idx is already the last line.
        if idx >= article_end:
            return ""
        return article_lines[idx + 1]

    def parse_table(start_idx):
        # Consume consecutive table rows into one TABULAR section and
        # return the index of the first line after the table.
        table_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        while idx <= article_end and is_table_row(article_lines[idx]):
            table_str += article_lines[idx] + "\n"
            idx += 1

        model.add_to_section(TABULAR, table_str)
        return idx

    def parse_image(start_idx):
        # Consume an <img> line plus an optional *italic caption* that may
        # follow after blank lines; return the index of the next unread line.
        image_str = article_lines[start_idx] + "\n"
        idx = start_idx + 1
        # Fix: bounds check — the original indexed past the end of
        # article_lines when the image was the article's last line.
        while idx <= article_end and article_lines[idx].strip() == "":
            idx += 1

        if idx <= article_end:
            caption = article_lines[idx]
            if caption.startswith("*") and caption.endswith("*"):
                image_str += caption + "\n"
                model.add_to_section(MEDIA, image_str)
                return idx + 1  # caption consumed

        model.add_to_section(MEDIA, image_str)
        # Fix: the original returned idx + 1 here as well, silently
        # dropping a non-caption line that followed the image.
        return idx

    while line_idx <= article_end:
        line = article_lines[line_idx]
        if not line.strip():
            line_idx += 1
            continue

        if is_table_row(line):
            line_idx = parse_table(line_idx)
            continue
        if is_image(line):
            line_idx = parse_image(line_idx)
            continue

        # A line whose successor starts with '-' is a setext-style title.
        if peek(line_idx).startswith("-"):
            model.add_to_section(TITLE, line)
            line_idx += 2  # skip the underline as well
            continue

        # Anything else is body text.
        model.add_to_section(TEXT, line)
        line_idx += 1

    return model
186+
187+
188+
# article_titles = ["Pet door", "Owner-occupancy"]
189+
#
190+
# md = html_to_md(article_titles[1])
191+
# model = create_article_model(md)
192+

0 commit comments

Comments
 (0)