diff --git a/.gitignore b/.gitignore
index 5025672..c368399 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,180 @@
-arxiv-metadata-oai-snapshot.json
+/data/arxiv-metadata-oai-snapshot.json
+/data/**/checkpoint*/
*.pkl
*.DS_STORE
-*.log
\ No newline at end of file
+*.log
+
+## Project-specific ignores:
+arxiv-metadata-oai-snapshot.json
+*.DS_STORE
+*.log
+.tool-versions
+data/.ipynb_checkpoints/
+.env
+
+data/wandb/**
+data/paper-multilabel-finetuning
+*.zip
+data/checkpoint
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
index 243baca..2e403e9 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,21 @@ Both **Redis Stack** and the paper search app run with **Docker Compose** using
$ docker compose -f docker-local-redis.yml up
```
+### Running the code locally, Redis in Docker
+For local development.
+
+Build frontend and install Python dependencies locally:
+```bash
+$ sh install-local.sh # compiles frontend, installs backend dependencies
+```
+
+Run local Redis in docker, but execute code locally so that code changes are reloaded automatically:
+```bash
+$ sh run-local.sh # runs Redis in local docker, runs code in local environment without docker
+```
+
+If your system provides the `docker-compose` command instead of `docker compose`, run `export DOCKER_COMPOSE="docker-compose"` before `run-local.sh`.
+
### Customizing (optional)
You can use the Jupyter Notebooks in the [`data/`](data/README.md) directory to create paper embeddings and metadata. The pickled dataframe will end up stored in the `data/` directory and used when creating your own container.
diff --git a/backend/vecsim_app/api/routes.py b/backend/vecsim_app/api/routes.py
index f894e14..20c7c9c 100644
--- a/backend/vecsim_app/api/routes.py
+++ b/backend/vecsim_app/api/routes.py
@@ -1,23 +1,35 @@
import asyncio
+import logging
import typing as t
-import redis.asyncio as redis
+import redis.asyncio as redis
from fastapi import APIRouter
from vecsim_app import config
+from vecsim_app.categories import CATEGORIES
from vecsim_app.embeddings import Embeddings
from vecsim_app.models import Paper
-
+from vecsim_app.multilabel_classifier.inference import load_models, predict_categories
from vecsim_app.schema import (
+ CategoriesPredictionRequest,
SimilarityRequest,
UserTextSimilarityRequest
)
from vecsim_app.search_index import SearchIndex
-
paper_router = r = APIRouter()
redis_client = redis.from_url(config.REDIS_URL)
embeddings = Embeddings()
search_index = SearchIndex()
+STATE = {}
+
+mlc_path = f"{config.DATA_LOCATION}/multilabel_classifier/checkpoint"
+mlc_model, mlc_tokenizer, mlc_b = load_models(mlc_path, f"{mlc_path}/mlb.pkl")
+
+
+def _cut_off_category_description(c: str):
+    # Keep only the leading code: 'q-fin.TR (Trading and Market Microstructure)' -> 'q-fin.TR'
+    return c.split(maxsplit=1)[0]
+
async def process_paper(p, i: int) -> t.Dict[str, t.Any]:
paper = await Paper.get(p.paper_pk)
@@ -26,42 +38,44 @@ async def process_paper(p, i: int) -> t.Dict[str, t.Any]:
paper['similarity_score'] = score
return paper
+
async def papers_from_results(total, results) -> t.Dict[str, t.Any]:
# extract papers from VSS results
- return {
- 'total': total,
- 'papers': [
- await process_paper(p, i)
- for i, p in enumerate(results.docs)
+ results = [await process_paper(p, i) for i, p in enumerate(results.docs)]
+ dump = "\n".join(
+ [
+ f" [{r['similarity_score']:.3f}] " + r['title'].replace('\n', ' ')
+ for r in results
]
+ )
+ logging.debug(f"Retrieved {len(results)} papers:\n" + dump)
+ return {
+ "total": total,
+ "papers": results,
}
@r.get("/", response_model=t.Dict)
async def get_papers(
- limit: int = 20,
- skip: int = 0,
- years: str = "",
- categories: str = ""
+ limit: int = 20, skip: int = 0, years: str = "", categories: str = ""
):
papers = []
expressions = []
- years = [year for year in years.split(",") if year]
- categories = [cat for cat in categories.split(",") if cat]
+ years = [y for y in years.split(",") if y]
+ categories = [_cut_off_category_description(c) for c in categories.split(",") if c]
if years and categories:
- expressions.append(
- (Paper.year << years) & \
- (Paper.categories << categories)
- )
+ expressions.append((Paper.year << years) & (Paper.categories << categories))
elif years and not categories:
expressions.append(Paper.year << years)
elif categories and not years:
expressions.append(Paper.categories << categories)
# Run query
- papers = await Paper.find(*expressions)\
- .copy(offset=skip, limit=limit)\
+ papers = (
+ await Paper.find(*expressions)
+ .copy(offset=skip, limit=limit)
.execute(exhaust_results=False)
+ )
# Get total count
total = (
@@ -69,24 +83,39 @@ async def get_papers(
search_index.count_query(years=years, categories=categories)
)
).total
+ return {"total": total, "papers": papers}
+
+
+@r.post("/predict-categories", response_model=t.Dict)
+async def route_predict_categories(categories_request: CategoriesPredictionRequest):
+ categories = predict_categories(
+ categories_request.articles,
+ mlc_model,
+ mlc_tokenizer,
+ mlc_b,
+ proba_threshold=categories_request.proba_threshold,
+ )
return {
- 'total': total,
- 'papers': papers
+ "categories": categories,
+ "categories_names": [CATEGORIES.get(c) for c in categories],
}
@r.post("/vectorsearch/text", response_model=t.Dict)
async def find_papers_by_text(similarity_request: SimilarityRequest):
# Create query
+ categories = [
+ _cut_off_category_description(c) for c in similarity_request.categories
+ ]
query = search_index.vector_query(
- similarity_request.categories,
+ categories,
similarity_request.years,
similarity_request.search_type,
- similarity_request.number_of_results
+ similarity_request.number_of_results,
+ categories_operator=similarity_request.categories_operator,
)
count_query = search_index.count_query(
- years=similarity_request.years,
- categories=similarity_request.categories
+        years=similarity_request.years, categories=categories  # stripped codes, same as vector_query
)
# find the vector of the Paper listed in the request
@@ -96,7 +125,9 @@ async def find_papers_by_text(similarity_request: SimilarityRequest):
# obtain results of the queries
total, results = await asyncio.gather(
redis_client.ft(config.INDEX_NAME).search(count_query),
- redis_client.ft(config.INDEX_NAME).search(query, query_params={"vec_param": vector})
+ redis_client.ft(config.INDEX_NAME).search(
+ query, query_params={"vec_param": vector}
+ ),
)
# Get Paper records of those results
@@ -106,27 +137,38 @@ async def find_papers_by_text(similarity_request: SimilarityRequest):
@r.post("/vectorsearch/text/user", response_model=t.Dict)
async def find_papers_by_user_text(similarity_request: UserTextSimilarityRequest):
# Create query
+ categories = [
+ _cut_off_category_description(c) for c in similarity_request.categories
+ ]
+
query = search_index.vector_query(
- similarity_request.categories,
+ categories,
similarity_request.years,
similarity_request.search_type,
- similarity_request.number_of_results
+ similarity_request.number_of_results,
)
count_query = search_index.count_query(
- years=similarity_request.years,
- categories=similarity_request.categories
+        years=similarity_request.years, categories=categories  # stripped codes, same as vector_query
)
- # obtain results of the queries
- total, results = await asyncio.gather(
+ articles = [a["text"] for a in similarity_request.articles if a["text"].strip()]
+ if not articles:
+ return {}
+ article_embeddings = [embeddings.make(a) for a in articles]
+ mid_embedding = sum(article_embeddings) / len(article_embeddings)
+
+ # debug:
+ for ae in article_embeddings:
+ logging.debug(ae[:5])
+ logging.debug(mid_embedding[:5])
+
+ total, result = await asyncio.gather(
redis_client.ft(config.INDEX_NAME).search(count_query),
redis_client.ft(config.INDEX_NAME).search(
query,
- query_params={
- "vec_param": embeddings.make(similarity_request.user_text).tobytes()
- }
- )
+ query_params={"vec_param": mid_embedding.tobytes()},
+ ),
)
# Get Paper records of those results
- return await papers_from_results(total.total, results)
+ return await papers_from_results(total.total, result)
diff --git a/backend/vecsim_app/categories.py b/backend/vecsim_app/categories.py
new file mode 100644
index 0000000..25dbe73
--- /dev/null
+++ b/backend/vecsim_app/categories.py
@@ -0,0 +1,155 @@
+CATEGORIES = {
+ "astro-ph": "Astrophysics",
+ "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+ "astro-ph.EP": "Earth and Planetary Astrophysics",
+ "astro-ph.GA": "Astrophysics of Galaxies",
+ "astro-ph.HE": "High Energy Astrophysical Phenomena",
+ "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+ "astro-ph.SR": "Solar and Stellar Astrophysics",
+ "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+ "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+ "cond-mat.mtrl-sci": "Materials Science",
+ "cond-mat.other": "Other Condensed Matter",
+ "cond-mat.quant-gas": "Quantum Gases",
+ "cond-mat.soft": "Soft Condensed Matter",
+ "cond-mat.stat-mech": "Statistical Mechanics",
+ "cond-mat.str-el": "Strongly Correlated Electrons",
+ "cond-mat.supr-con": "Superconductivity",
+ "cs.AI": "Artificial Intelligence",
+ "cs.AR": "Hardware Architecture",
+ "cs.CC": "Computational Complexity",
+ "cs.CE": "Computational Engineering, Finance, and Science",
+ "cs.CG": "Computational Geometry",
+ "cs.CL": "Computation and Language",
+ "cs.CR": "Cryptography and Security",
+ "cs.CV": "Computer Vision and Pattern Recognition",
+ "cs.CY": "Computers and Society",
+ "cs.DB": "Databases",
+ "cs.DC": "Distributed, Parallel, and Cluster Computing",
+ "cs.DL": "Digital Libraries",
+ "cs.DM": "Discrete Mathematics",
+ "cs.DS": "Data Structures and Algorithms",
+ "cs.ET": "Emerging Technologies",
+ "cs.FL": "Formal Languages and Automata Theory",
+ "cs.GL": "General Literature",
+ "cs.GR": "Graphics",
+ "cs.GT": "Computer Science and Game Theory",
+ "cs.HC": "Human-Computer Interaction",
+ "cs.IR": "Information Retrieval",
+ "cs.IT": "Information Theory",
+ "cs.LG": "Machine Learning",
+ "cs.LO": "Logic in Computer Science",
+ "cs.MA": "Multiagent Systems",
+ "cs.MM": "Multimedia",
+ "cs.MS": "Mathematical Software",
+ "cs.NA": "Numerical Analysis",
+ "cs.NE": "Neural and Evolutionary Computing",
+ "cs.NI": "Networking and Internet Architecture",
+ "cs.OH": "Other Computer Science",
+ "cs.OS": "Operating Systems",
+ "cs.PF": "Performance",
+ "cs.PL": "Programming Languages",
+ "cs.RO": "Robotics",
+ "cs.SC": "Symbolic Computation",
+ "cs.SD": "Sound",
+ "cs.SE": "Software Engineering",
+ "cs.SI": "Social and Information Networks",
+ "cs.SY": "Systems and Control",
+ "econ.EM": "Econometrics",
+ "eess.AS": "Audio and Speech Processing",
+ "eess.IV": "Image and Video Processing",
+ "eess.SP": "Signal Processing",
+ "gr-qc": "General Relativity and Quantum Cosmology",
+ "hep-ex": "High Energy Physics - Experiment",
+ "hep-lat": "High Energy Physics - Lattice",
+ "hep-ph": "High Energy Physics - Phenomenology",
+ "hep-th": "High Energy Physics - Theory",
+ "math.AC": "Commutative Algebra",
+ "math.AG": "Algebraic Geometry",
+ "math.AP": "Analysis of PDEs",
+ "math.AT": "Algebraic Topology",
+ "math.CA": "Classical Analysis and ODEs",
+ "math.CO": "Combinatorics",
+ "math.CT": "Category Theory",
+ "math.CV": "Complex Variables",
+ "math.DG": "Differential Geometry",
+ "math.DS": "Dynamical Systems",
+ "math.FA": "Functional Analysis",
+ "math.GM": "General Mathematics",
+ "math.GN": "General Topology",
+ "math.GR": "Group Theory",
+ "math.GT": "Geometric Topology",
+ "math.HO": "History and Overview",
+ "math.IT": "Information Theory",
+ "math.KT": "K-Theory and Homology",
+ "math.LO": "Logic",
+ "math.MG": "Metric Geometry",
+ "math.MP": "Mathematical Physics",
+ "math.NA": "Numerical Analysis",
+ "math.NT": "Number Theory",
+ "math.OA": "Operator Algebras",
+ "math.OC": "Optimization and Control",
+ "math.PR": "Probability",
+ "math.QA": "Quantum Algebra",
+ "math.RA": "Rings and Algebras",
+ "math.RT": "Representation Theory",
+ "math.SG": "Symplectic Geometry",
+ "math.SP": "Spectral Theory",
+ "math.ST": "Statistics Theory",
+ "math-ph": "Mathematical Physics",
+ "nlin.AO": "Adaptation and Self-Organizing Systems",
+ "nlin.CD": "Chaotic Dynamics",
+ "nlin.CG": "Cellular Automata and Lattice Gases",
+ "nlin.PS": "Pattern Formation and Solitons",
+ "nlin.SI": "Exactly Solvable and Integrable Systems",
+ "nucl-ex": "Nuclear Experiment",
+ "nucl-th": "Nuclear Theory",
+ "physics.acc-ph": "Accelerator Physics",
+ "physics.ao-ph": "Atmospheric and Oceanic Physics",
+ "physics.app-ph": "Applied Physics",
+ "physics.atm-clus": "Atomic and Molecular Clusters",
+ "physics.atom-ph": "Atomic Physics",
+ "physics.bio-ph": "Biological Physics",
+ "physics.chem-ph": "Chemical Physics",
+ "physics.class-ph": "Classical Physics",
+ "physics.comp-ph": "Computational Physics",
+ "physics.data-an": "Data Analysis, Statistics and Probability",
+ "physics.ed-ph": "Physics Education",
+ "physics.flu-dyn": "Fluid Dynamics",
+ "physics.gen-ph": "General Physics",
+ "physics.geo-ph": "Geophysics",
+ "physics.hist-ph": "History and Philosophy of Physics",
+ "physics.ins-det": "Instrumentation and Detectors",
+ "physics.med-ph": "Medical Physics",
+ "physics.optics": "Optics",
+ "physics.plasm-ph": "Plasma Physics",
+ "physics.pop-ph": "Popular Physics",
+ "physics.soc-ph": "Physics and Society",
+ "physics.space-ph": "Space Physics",
+ "q-bio.BM": "Biomolecules",
+ "q-bio.CB": "Cell Behavior",
+ "q-bio.GN": "Genomics",
+ "q-bio.MN": "Molecular Networks",
+ "q-bio.NC": "Neurons and Cognition",
+ "q-bio.OT": "Other Quantitative Biology",
+ "q-bio.PE": "Populations and Evolution",
+ "q-bio.QM": "Quantitative Methods",
+ "q-bio.SC": "Subcellular Processes",
+ "q-bio.TO": "Tissues and Organs",
+ "q-fin.CP": "Computational Finance",
+ "q-fin.EC": "Economics",
+ "q-fin.GN": "General Finance",
+ "q-fin.MF": "Mathematical Finance",
+ "q-fin.PM": "Portfolio Management",
+ "q-fin.PR": "Pricing of Securities",
+ "q-fin.RM": "Risk Management",
+ "q-fin.ST": "Statistical Finance",
+ "q-fin.TR": "Trading and Market Microstructure",
+ "quant-ph": "Quantum Physics",
+ "stat.AP": "Applications",
+ "stat.CO": "Computation",
+ "stat.ME": "Methodology",
+ "stat.ML": "Machine Learning",
+ "stat.OT": "Other Statistics",
+ "stat.TH": "Statistics Theory",
+}
diff --git a/backend/vecsim_app/config.py b/backend/vecsim_app/config.py
index 111d95d..3ae965d 100644
--- a/backend/vecsim_app/config.py
+++ b/backend/vecsim_app/config.py
@@ -10,6 +10,8 @@
REDIS_DB = os.environ.get("REDIS_DB", 0)
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "testing123")
REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}"
+SERVER_HOST = os.environ.get("SERVER_HOST", "0.0.0.0")  # host passed to the uvicorn server
+SERVER_PORT = os.environ.get("SERVER_PORT", 8888)  # a str when set via env; cast with int() before use
os.environ["REDIS_DATA_URL"] = REDIS_URL
os.environ["REDIS_OM_URL"] = REDIS_URL
API_V1_STR = "/api/v1"
diff --git a/backend/vecsim_app/data_utils.py b/backend/vecsim_app/data_utils.py
new file mode 100644
index 0000000..96d7f5f
--- /dev/null
+++ b/backend/vecsim_app/data_utils.py
@@ -0,0 +1,35 @@
+import json
+import re
+
+
+def _process(paper: str, year_pattern: str) -> dict:
+    """Parse one JSON-encoded paper record and keep the fields the app uses.
+
+    `year` is the earliest `year_pattern` match in `journal-ref` that lies in
+    1991..2022, or None when the reference is missing, empty or has no match.
+    """
+    record = json.loads(paper)
+    found = (int(y) for y in re.findall(year_pattern, record.get("journal-ref") or ""))
+    return {
+        "id": record["id"],
+        "title": record["title"],
+        "year": min((y for y in found if 1991 <= y <= 2022), default=None),
+        "authors": record["authors"],
+        "categories": ",".join(record["categories"].split(" ")),
+        "abstract": record["abstract"],
+    }
+
+
+def papers(
+    data_path: str, year_cutoff: int, year_pattern: str, ml_category: str = None
+):
+    """Yield processed papers from `data_path`, which holds one JSON record per line.
+
+    A paper is yielded when its detected year is >= `year_cutoff` and, if
+    `ml_category` is given, that string occurs in its `categories` field.
+    """
+    with open(data_path, "r", encoding="utf-8") as f:  # do not rely on the locale default
+        for paper in (_process(line, year_pattern) for line in f):
+            wanted = ml_category is None or ml_category in paper["categories"]
+            if paper["year"] and paper["year"] >= year_cutoff and wanted:
+                yield paper
diff --git a/backend/vecsim_app/entrypoint.sh b/backend/vecsim_app/entrypoint.sh
old mode 100644
new mode 100755
index 7a81268..98f63b5
--- a/backend/vecsim_app/entrypoint.sh
+++ b/backend/vecsim_app/entrypoint.sh
@@ -1,4 +1,5 @@
#!/bin/sh
+set -eux
python load_data.py
diff --git a/backend/vecsim_app/load_data.py b/backend/vecsim_app/load_data.py
index cff5c15..532e0df 100644
--- a/backend/vecsim_app/load_data.py
+++ b/backend/vecsim_app/load_data.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
-import typing as t
import asyncio
-import numpy as np
import pickle
-import redis.asyncio as redis
+import typing as t
+import numpy as np
+import redis.asyncio as redis
from redis.commands.search.field import TagField
from vecsim_app import config
from vecsim_app.models import Paper
@@ -12,12 +12,17 @@
def read_paper_df() -> t.List:
- with open(config.DATA_LOCATION + "/arxiv_embeddings_10000.pkl", "rb") as f:
+ path = config.DATA_LOCATION + "/embeddings/arxiv_embeddings_400000.pkl"
+ print(f"Loading data from : {path}")
+ with open(path, "rb") as f:
df = pickle.load(f)
+ print(f"Loaded {len(df)} items")
return df
+
async def gather_with_concurrency(n, redis_conn, *papers):
semaphore = asyncio.Semaphore(n)
+
async def load_paper(paper):
async with semaphore:
vector = paper.pop('vector')
@@ -26,7 +31,7 @@ async def load_paper(paper):
paper['categories'] = paper['categories'].replace(",", "|")
p = Paper(**paper)
# save model TODO -- combine these two objects eventually
- await p.save()
+ await p.save(redis_conn)
# save vector data
key = "paper_vector:" + str(p.paper_id)
await redis_conn.hset(
@@ -37,10 +42,12 @@ async def load_paper(paper):
"categories": p.categories,
"year": p.year,
"vector": np.array(vector, dtype=np.float32).tobytes(),
- })
+ })
+
# gather with concurrency
await asyncio.gather(*[load_paper(p) for p in papers])
+
async def load_all_data():
# TODO use redis-om connection
redis_conn = redis.from_url(config.REDIS_URL)
diff --git a/backend/vecsim_app/main.py b/backend/vecsim_app/main.py
index 93b054c..deacc3f 100644
--- a/backend/vecsim_app/main.py
+++ b/backend/vecsim_app/main.py
@@ -1,19 +1,17 @@
import uvicorn
+import logging
from pathlib import Path
-from aredis_om import (
- get_redis_connection,
- Migrator
-)
+from aredis_om import Migrator, get_redis_connection
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from starlette.middleware.cors import CORSMiddleware
-
from vecsim_app import config
-from vecsim_app.models import Paper
from vecsim_app.api import routes
+from vecsim_app.models import Paper
from vecsim_app.spa import SinglePageApplication
+logging.basicConfig(level=logging.DEBUG)
app = FastAPI(
title=config.PROJECT_NAME,
@@ -21,14 +19,6 @@
openapi_url=config.OPENAPI_DOCS
)
-app.add_middleware(
- CORSMiddleware,
- allow_origins="*",
- allow_credentials=True,
- allow_methods=["*"],
- allow_headers=["*"]
-)
-
# Routers
app.include_router(
routes.paper_router,
@@ -55,21 +45,37 @@ async def startup():
app.mount(
path="/", app=SinglePageApplication(directory=gui_build_dir), name="SPA"
)
-
if __name__ == "__main__":
+ import logging
import os
+
+ logging.basicConfig(level=logging.INFO)
+
env = os.environ.get("DEPLOYMENT", "prod")
+ logging.info(f"Running in {env} mode")
server_attr = {
- "host": "0.0.0.0",
+ "host": config.SERVER_HOST,
"reload": True,
- "port": 8888,
- "workers": 1
+ "port": int(config.SERVER_PORT),
+ "workers": 1,
+ "log_level": "debug",
}
if env == "prod":
- server_attr.update({"reload": False,
- "workers": 2,
- "ssl_keyfile": "key.pem",
- "ssl_certfile": "full.pem"})
+ server_attr.update(
+ {
+ "reload": False,
+ "workers": 2,
+ "ssl_keyfile": "key.pem",
+ "ssl_certfile": "full.pem",
+ }
+ )
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins="*",
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"]
+ )
- uvicorn.run("main:app", **server_attr)
+ uvicorn.run("vecsim_app.main:app", **server_attr)
diff --git a/backend/vecsim_app/multilabel_classifier/__init__.py b/backend/vecsim_app/multilabel_classifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/vecsim_app/multilabel_classifier/inference.py b/backend/vecsim_app/multilabel_classifier/inference.py
new file mode 100644
index 0000000..82d7d0d
--- /dev/null
+++ b/backend/vecsim_app/multilabel_classifier/inference.py
@@ -0,0 +1,61 @@
+import pickle
+from typing import List
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BertForSequenceClassification
+
+
+def predict_categories_on_single_text(text, model, tokenizer, mlb, proba_threshold=0.5):
+    """Predict the labels of one text with a multi-label sequence classifier.
+
+    Args:
+        text: Input text; it is truncated to the tokenizer's maximum length.
+        model: Classifier whose call result exposes `.logits` and which has `.device`.
+        tokenizer: Callable returning a dict of tensors for `return_tensors="pt"`.
+        mlb: Label binarizer; `inverse_transform` decodes a 0/1 row into labels.
+        proba_threshold: Minimum sigmoid probability for a label to be kept.
+
+    Returns:
+        `(classes, probs)`: the decoded labels and the per-label probabilities.
+    """
+    # Truncate so that over-long texts cannot exceed the model's input size.
+    encoding = tokenizer(text, return_tensors="pt", truncation=True)
+    encoding = {k: v.to(model.device) for k, v in encoding.items()}
+    with torch.no_grad():  # inference only: skip building the autograd graph
+        logits = model(**encoding).logits
+    # squeeze(0) drops only the batch axis, even when there is a single label.
+    probs = torch.sigmoid(logits.squeeze(0).cpu())
+    predictions = (probs >= proba_threshold).numpy().astype(float)
+    classes = mlb.inverse_transform(predictions.reshape(1, -1))
+    return (classes[0] if len(classes) > 0 else []), probs
+
+
+def load_models(
+    multilabel_model_path="categories", multilabel_binarizer_path="mlb.pkl"
+):
+    """Load and return `(model, tokenizer, mlb)` for multi-label inference.
+
+    NOTE: `mlb` is unpickled; `pickle.load` must only be fed trusted files.
+    """
+    model = BertForSequenceClassification.from_pretrained(
+        multilabel_model_path, problem_type="multi_label_classification"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(multilabel_model_path)
+    with open(multilabel_binarizer_path, "rb") as handle:
+        return model, tokenizer, pickle.load(handle)
+
+
+def predict_categories(queries: List[str], model, tokenizer, mlb, proba_threshold=0.45):
+    """Return the sorted, de-duplicated labels predicted over all `queries`.
+
+    Each query is classified on its own with `proba_threshold`; the labels of
+    all queries are pooled, so a label is kept if any query produced it.
+    """
+    found = set()
+    for text in queries:
+        labels, _ = predict_categories_on_single_text(
+            text, model, tokenizer, mlb, proba_threshold=proba_threshold
+        )
+        found.update(labels)
+    return sorted(found)
diff --git a/backend/vecsim_app/schema/__init__.py b/backend/vecsim_app/schema/__init__.py
index 78bdb9a..866d5fe 100644
--- a/backend/vecsim_app/schema/__init__.py
+++ b/backend/vecsim_app/schema/__init__.py
@@ -1,4 +1,2 @@
-from .search import (
- SimilarityRequest,
- UserTextSimilarityRequest
-)
\ No newline at end of file
+from .predict_categories import CategoriesPredictionRequest
+from .search import SimilarityRequest, UserTextSimilarityRequest
diff --git a/backend/vecsim_app/schema/predict_categories.py b/backend/vecsim_app/schema/predict_categories.py
new file mode 100644
index 0000000..c72197c
--- /dev/null
+++ b/backend/vecsim_app/schema/predict_categories.py
@@ -0,0 +1,6 @@
+from pydantic import BaseModel
+
+
+class CategoriesPredictionRequest(BaseModel):  # request body of POST /predict-categories
+    articles: list  # passed as the queries to predict_categories (presumably strings; verify)
+    proba_threshold: float = 0.35  # forwarded as predict_categories' proba_threshold
diff --git a/backend/vecsim_app/schema/search.py b/backend/vecsim_app/schema/search.py
index 8b7b9dc..e959e43 100644
--- a/backend/vecsim_app/schema/search.py
+++ b/backend/vecsim_app/schema/search.py
@@ -7,9 +7,11 @@ class SimilarityRequest(BaseModel):
years: list
number_of_results: int = 15
search_type: str = "KNN"
+ categories_operator: str = "AND"
+
class UserTextSimilarityRequest(BaseModel):
- user_text: str
+ articles: list
categories: list
years: list
number_of_results: int = 15
diff --git a/backend/vecsim_app/search_index.py b/backend/vecsim_app/search_index.py
index 423e53a..0c3f255 100644
--- a/backend/vecsim_app/search_index.py
+++ b/backend/vecsim_app/search_index.py
@@ -1,11 +1,12 @@
+import logging
import re
+from typing import Optional, Pattern
-from config import INDEX_NAME
from redis.asyncio import Redis
-from redis.commands.search.query import Query
-from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.field import VectorField
-from typing import Optional, Pattern
+from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+from redis.commands.search.query import Query
+from vecsim_app.config import INDEX_NAME
class TokenEscaper:
@@ -23,12 +24,15 @@ def __init__(self, escape_chars_re: Optional[Pattern] = None):
self.escaped_chars_re = re.compile(self.DEFAULT_ESCAPED_CHARS)
def escape(self, value: str) -> str:
+ value = str(value)
+
def escape_symbol(match):
value = match.group(0)
return f"\\{value}"
return self.escaped_chars_re.sub(escape_symbol, value)
+
class SearchIndex:
"""
SearchIndex is used to wrap and capture all information
@@ -62,8 +66,7 @@ async def create_flat(
"DISTANCE_METRIC": distance_metric,
"INITIAL_CAP": number_of_vectors,
"BLOCK_SIZE": number_of_vectors
- }
- )
+ })
await self._create(
*fields,
vector_field,
@@ -95,14 +98,8 @@ async def create_hnsw(
"DIM": 768,
"DISTANCE_METRIC": distance_metric,
"INITIAL_CAP": number_of_vectors,
- }
- )
- await self._create(
- *fields,
- vector_field,
- redis_conn=redis_conn,
- prefix=prefix
- )
+ })
+ await self._create(*fields, vector_field, redis_conn=redis_conn, prefix=prefix)
async def _create(
self,
@@ -116,7 +113,9 @@ async def _create(
definition= IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
)
- def process_tags(self, categories: list, years: list) -> str:
+ def process_tags(
+ self, categories: list, years: list, categories_operator="AND"
+ ) -> str:
"""
Helper function to process tags data. TODO - factor this
out so it's agnostic to the name of the field.
@@ -128,33 +127,39 @@ def process_tags(self, categories: list, years: list) -> str:
Returns:
str: RediSearch tag query string.
"""
- tag = "("
+ tag = []
if years:
- years = "|".join([self.escaper.escape(year) for year in years])
- tag += f"(@year:{{{years}}})"
+ years = "{" + "|".join([self.escaper.escape(y) for y in years]) + "}"
+ tag.append(f"(@year:{years})")
+
if categories:
- categories = "|".join([self.escaper.escape(cat) for cat in categories])
- if tag:
- tag += f" (@categories:{{{categories}}})"
+ if categories_operator == "AND":
+ for c in categories:
+ cat = "{" + self.escaper.escape(c) + "}"
+ tag.append(f"(@categories:{cat})")
+ elif categories_operator == "OR":
+ cat = "{" + "|".join([self.escaper.escape(c) for c in categories]) + "}"
+ tag.append(f"(@categories:{cat})")
else:
- tag += f"(@categories:{{{categories}}})"
- tag += ")"
- # if no tags are selected
- if len(tag) < 3:
- tag = "*"
- return tag
+ raise ValueError(f"Unsupported categories_operator: {categories_operator}")
+
+ if tag:
+ tag = ["("] + tag + [")"]
+ else:
+ tag = ["*"]
+
+ return "".join(tag)
def vector_query(
self,
categories: list,
years: list,
- search_type: str="KNN",
- number_of_results: int=20
+ search_type: str='KNN',
+ number_of_results: int=20,
+ categories_operator: str='AND',
) -> Query:
"""
Create a RediSearch query to perform hybrid vector and tag based searches.
-
-
Args:
categories (list): List of categories.
years (list): List of years.
@@ -166,13 +171,16 @@ def vector_query(
"""
# Parse tags to create query
- tag_query = self.process_tags(categories, years)
- base_query = f'{tag_query}=>[{search_type} {number_of_results} @vector $vec_param AS vector_score]'
- return Query(base_query)\
- .sort_by("vector_score")\
- .paging(0, number_of_results)\
- .return_fields("paper_id", "paper_pk", "vector_score")\
+ tag_query = self.process_tags(categories, years, categories_operator)
+ base_query = f"{tag_query}=>[{search_type} {number_of_results} @vector $vec_param AS vector_score]"
+ logging.debug(f"base_query: {base_query}")
+ return (
+ Query(base_query)
+ .sort_by("vector_score")
+ .paging(0, number_of_results)
+ .return_fields("paper_id", "paper_pk", "vector_score")
.dialect(2)
+ )
def count_query(
self,
@@ -191,7 +199,5 @@ def count_query(
"""
# Parse tags to create query
tag_query = self.process_tags(categories, years)
- return Query(f'{tag_query}')\
- .no_content()\
- .dialect(2)
-
+ logging.debug(f"tag_query: {tag_query}")
+ return Query(f"{tag_query}").no_content().dialect(2)
diff --git a/data/README.md b/data/README.md
index 4248801..5e473aa 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,8 +1,9 @@
+
# Data!
Generate data before trying to run this application.
-### Three Notebooks
+### Notebooks
1. `arxiv-embeddings.ipynb` (app default)
- Uses local CPU and creates embeddings for ~10k machine learning papers.
@@ -17,3 +18,10 @@ Generate data before trying to run this application.
- Output: `arxiv_embeddings_300000pkl`.
+4. `multilabel-model.ipynb`
+   - Trains a multilabel classification model (each paper can have more than one category) by fine-tuning a transformers model (`bert-base-uncased`).
+ - Output: `mlb.pickle`, `checkpoint` folder with the NLP model weights.
+
+5. `multilabel-inference`
+ - Script showcasing the inference for the multilabel model.
+ - Output: N/A
\ No newline at end of file
diff --git a/data/categories.py b/data/categories.py
deleted file mode 100644
index d2ec10a..0000000
--- a/data/categories.py
+++ /dev/null
@@ -1,155 +0,0 @@
-_map = {
- 'astro-ph': 'Astrophysics',
- 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
- 'astro-ph.EP': 'Earth and Planetary Astrophysics',
- 'astro-ph.GA': 'Astrophysics of Galaxies',
- 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
- 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
- 'astro-ph.SR': 'Solar and Stellar Astrophysics',
- 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
- 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
- 'cond-mat.mtrl-sci': 'Materials Science',
- 'cond-mat.other': 'Other Condensed Matter',
- 'cond-mat.quant-gas': 'Quantum Gases',
- 'cond-mat.soft': 'Soft Condensed Matter',
- 'cond-mat.stat-mech': 'Statistical Mechanics',
- 'cond-mat.str-el': 'Strongly Correlated Electrons',
- 'cond-mat.supr-con': 'Superconductivity',
- 'cs.AI': 'Artificial Intelligence',
- 'cs.AR': 'Hardware Architecture',
- 'cs.CC': 'Computational Complexity',
- 'cs.CE': 'Computational Engineering, Finance, and Science',
- 'cs.CG': 'Computational Geometry',
- 'cs.CL': 'Computation and Language',
- 'cs.CR': 'Cryptography and Security',
- 'cs.CV': 'Computer Vision and Pattern Recognition',
- 'cs.CY': 'Computers and Society',
- 'cs.DB': 'Databases',
- 'cs.DC': 'Distributed, Parallel, and Cluster Computing',
- 'cs.DL': 'Digital Libraries',
- 'cs.DM': 'Discrete Mathematics',
- 'cs.DS': 'Data Structures and Algorithms',
- 'cs.ET': 'Emerging Technologies',
- 'cs.FL': 'Formal Languages and Automata Theory',
- 'cs.GL': 'General Literature',
- 'cs.GR': 'Graphics',
- 'cs.GT': 'Computer Science and Game Theory',
- 'cs.HC': 'Human-Computer Interaction',
- 'cs.IR': 'Information Retrieval',
- 'cs.IT': 'Information Theory',
- 'cs.LG': 'Machine Learning',
- 'cs.LO': 'Logic in Computer Science',
- 'cs.MA': 'Multiagent Systems',
- 'cs.MM': 'Multimedia',
- 'cs.MS': 'Mathematical Software',
- 'cs.NA': 'Numerical Analysis',
- 'cs.NE': 'Neural and Evolutionary Computing',
- 'cs.NI': 'Networking and Internet Architecture',
- 'cs.OH': 'Other Computer Science',
- 'cs.OS': 'Operating Systems',
- 'cs.PF': 'Performance',
- 'cs.PL': 'Programming Languages',
- 'cs.RO': 'Robotics',
- 'cs.SC': 'Symbolic Computation',
- 'cs.SD': 'Sound',
- 'cs.SE': 'Software Engineering',
- 'cs.SI': 'Social and Information Networks',
- 'cs.SY': 'Systems and Control',
- 'econ.EM': 'Econometrics',
- 'eess.AS': 'Audio and Speech Processing',
- 'eess.IV': 'Image and Video Processing',
- 'eess.SP': 'Signal Processing',
- 'gr-qc': 'General Relativity and Quantum Cosmology',
- 'hep-ex': 'High Energy Physics - Experiment',
- 'hep-lat': 'High Energy Physics - Lattice',
- 'hep-ph': 'High Energy Physics - Phenomenology',
- 'hep-th': 'High Energy Physics - Theory',
- 'math.AC': 'Commutative Algebra',
- 'math.AG': 'Algebraic Geometry',
- 'math.AP': 'Analysis of PDEs',
- 'math.AT': 'Algebraic Topology',
- 'math.CA': 'Classical Analysis and ODEs',
- 'math.CO': 'Combinatorics',
- 'math.CT': 'Category Theory',
- 'math.CV': 'Complex Variables',
- 'math.DG': 'Differential Geometry',
- 'math.DS': 'Dynamical Systems',
- 'math.FA': 'Functional Analysis',
- 'math.GM': 'General Mathematics',
- 'math.GN': 'General Topology',
- 'math.GR': 'Group Theory',
- 'math.GT': 'Geometric Topology',
- 'math.HO': 'History and Overview',
- 'math.IT': 'Information Theory',
- 'math.KT': 'K-Theory and Homology',
- 'math.LO': 'Logic',
- 'math.MG': 'Metric Geometry',
- 'math.MP': 'Mathematical Physics',
- 'math.NA': 'Numerical Analysis',
- 'math.NT': 'Number Theory',
- 'math.OA': 'Operator Algebras',
- 'math.OC': 'Optimization and Control',
- 'math.PR': 'Probability',
- 'math.QA': 'Quantum Algebra',
- 'math.RA': 'Rings and Algebras',
- 'math.RT': 'Representation Theory',
- 'math.SG': 'Symplectic Geometry',
- 'math.SP': 'Spectral Theory',
- 'math.ST': 'Statistics Theory',
- 'math-ph': 'Mathematical Physics',
- 'nlin.AO': 'Adaptation and Self-Organizing Systems',
- 'nlin.CD': 'Chaotic Dynamics',
- 'nlin.CG': 'Cellular Automata and Lattice Gases',
- 'nlin.PS': 'Pattern Formation and Solitons',
- 'nlin.SI': 'Exactly Solvable and Integrable Systems',
- 'nucl-ex': 'Nuclear Experiment',
- 'nucl-th': 'Nuclear Theory',
- 'physics.acc-ph': 'Accelerator Physics',
- 'physics.ao-ph': 'Atmospheric and Oceanic Physics',
- 'physics.app-ph': 'Applied Physics',
- 'physics.atm-clus': 'Atomic and Molecular Clusters',
- 'physics.atom-ph': 'Atomic Physics',
- 'physics.bio-ph': 'Biological Physics',
- 'physics.chem-ph': 'Chemical Physics',
- 'physics.class-ph': 'Classical Physics',
- 'physics.comp-ph': 'Computational Physics',
- 'physics.data-an': 'Data Analysis, Statistics and Probability',
- 'physics.ed-ph': 'Physics Education',
- 'physics.flu-dyn': 'Fluid Dynamics',
- 'physics.gen-ph': 'General Physics',
- 'physics.geo-ph': 'Geophysics',
- 'physics.hist-ph': 'History and Philosophy of Physics',
- 'physics.ins-det': 'Instrumentation and Detectors',
- 'physics.med-ph': 'Medical Physics',
- 'physics.optics': 'Optics',
- 'physics.plasm-ph': 'Plasma Physics',
- 'physics.pop-ph': 'Popular Physics',
- 'physics.soc-ph': 'Physics and Society',
- 'physics.space-ph': 'Space Physics',
- 'q-bio.BM': 'Biomolecules',
- 'q-bio.CB': 'Cell Behavior',
- 'q-bio.GN': 'Genomics',
- 'q-bio.MN': 'Molecular Networks',
- 'q-bio.NC': 'Neurons and Cognition',
- 'q-bio.OT': 'Other Quantitative Biology',
- 'q-bio.PE': 'Populations and Evolution',
- 'q-bio.QM': 'Quantitative Methods',
- 'q-bio.SC': 'Subcellular Processes',
- 'q-bio.TO': 'Tissues and Organs',
- 'q-fin.CP': 'Computational Finance',
- 'q-fin.EC': 'Economics',
- 'q-fin.GN': 'General Finance',
- 'q-fin.MF': 'Mathematical Finance',
- 'q-fin.PM': 'Portfolio Management',
- 'q-fin.PR': 'Pricing of Securities',
- 'q-fin.RM': 'Risk Management',
- 'q-fin.ST': 'Statistical Finance',
- 'q-fin.TR': 'Trading and Market Microstructure',
- 'quant-ph': 'Quantum Physics',
- 'stat.AP': 'Applications',
- 'stat.CO': 'Computation',
- 'stat.ME': 'Methodology',
- 'stat.ML': 'Machine Learning',
- 'stat.OT': 'Other Statistics',
- 'stat.TH': 'Statistics Theory'
-}
\ No newline at end of file
diff --git a/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb b/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb
new file mode 100644
index 0000000..ef1bfea
--- /dev/null
+++ b/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb
@@ -0,0 +1,716 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "fd6ed5af",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "51dbaff8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n",
+ "You should consider upgrading via the '/home/jovyan/workspace/untitled1-vector-search/.venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
+ "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "cc20c14a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "import re\n",
+ "import string\n",
+ "\n",
+ "from vecsim_app.embeddings import Embeddings\n",
+ "from vecsim_app.data_utils import papers\n",
+ "\n",
+ "\n",
+ "DATA_PATH = \"../arxiv-metadata-oai-snapshot.json\"\n",
+ "YEAR_CUTOFF = 2012\n",
+ "YEAR_PATTERN = r\"(19|20[0-9]{2})\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "04abead5-2567-47ed-ac51-abb10ca4b4c3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "408773"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))\n",
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "aee130cd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "169.84534547683685"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Avg length of the abstracts\n",
+ "# df.abstract.apply(lambda a: len(a.split())).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a1313d8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
| id | title | year | authors | categories | abstract |
---|
0 | 0704.0304 | The World as Evolving Information | 2012 | Carlos Gershenson | cs.IT,cs.AI,math.IT,q-bio.PE | This paper discusses the benefits of describ... |
---|
1 | 0704.2744 | Nahm transform and parabolic minimal Laplace t... | 2012 | Szilard Szabo | math.AG | We prove that Nahm transform for integrable ... |
---|
2 | 0704.2768 | Heat Equations and the Weighted $\\bar\\partial$... | 2012 | Andrew Raich | math.AP,math.CV | The purpose of this article is to establish ... |
---|
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "0 0704.0304 The World as Evolving Information 2012 \n",
+ "1 0704.2744 Nahm transform and parabolic minimal Laplace t... 2012 \n",
+ "2 0704.2768 Heat Equations and the Weighted $\\bar\\partial$... 2012 \n",
+ "\n",
+ " authors categories \\\n",
+ "0 Carlos Gershenson cs.IT,cs.AI,math.IT,q-bio.PE \n",
+ "1 Szilard Szabo math.AG \n",
+ "2 Andrew Raich math.AP,math.CV \n",
+ "\n",
+ " abstract \n",
+ "0 This paper discusses the benefits of describ... \n",
+ "1 We prove that Nahm transform for integrable ... \n",
+ "2 The purpose of this article is to establish ... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f295cc33",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Carlos Gershenson\n",
+ "1 Szilard Szabo\n",
+ "2 Andrew Raich\n",
+ "Name: authors_clean, dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['authors_clean'] = df['authors'].apply(lambda a: ' '.join(re.findall(r'\\w\\w+', a)).strip())\n",
+ "df['authors_clean'][:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6336293d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 the world as evolving information this paper ...\n",
+ "Name: text, dtype: object"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['text'] = df.apply(lambda r: Embeddings.clean_description(r['title'] + ' ' + r['abstract']), axis=1)\n",
+ "df['text'][:1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ef747be9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0ffb374860f84975aaf2d23ceb24f5e1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/737 [00:000. We also show that this condition is\\ninvariant under proper holomorphic maps that extend smoothly to the boundary.\\n',\n",
+ " 'math.CV']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 
'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Artificial Intelligence', 'Information Theory', 'Information Theory', 'Populations and Evolution']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Algebraic Geometry']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Materials Science', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 
'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Analysis of PDEs', 'Complex Variables']\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential 
Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Analysis of PDEs', 'Functional Analysis']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 
'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Functional Analysis', 'Group Theory']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Other Condensed Matter', 'Statistical Mechanics']\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General 
Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['General Relativity and Quantum Cosmology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Differential Geometry', 'Mathematical Physics']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Materials Science', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 
'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Mathematical Physics', 'Mathematical Physics', 'Exactly Solvable and Integrable Systems']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 
'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['High Energy Physics - Theory']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Complex Variables']\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "for text, expected_categories in pairs:\n",
+ " actual = predict_categories([text], model, tokenizer, mlb, proba_threshold=0.35)\n",
+ " print('actual', [CATEGORIES[c] for c in actual])\n",
+ " print('expected', [CATEGORIES[c] for c in sorted(expected_categories.split(','))])\n",
+ " print()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['astro-ph',\n",
+ " 'astro-ph.CO',\n",
+ " 'astro-ph.EP',\n",
+ " 'astro-ph.GA',\n",
+ " 'astro-ph.HE',\n",
+ " 'astro-ph.IM',\n",
+ " 'astro-ph.SR',\n",
+ " 'cond-mat.dis-nn',\n",
+ " 'cond-mat.mes-hall',\n",
+ " 'cond-mat.other',\n",
+ " 'cond-mat.quant-gas',\n",
+ " 'cond-mat.soft',\n",
+ " 'cond-mat.stat-mech',\n",
+ " 'cond-mat.str-el',\n",
+ " 'cond-mat.supr-con',\n",
+ " 'cs.AI',\n",
+ " 'cs.AR',\n",
+ " 'cs.CC',\n",
+ " 'cs.CE',\n",
+ " 'cs.CG',\n",
+ " 'cs.CL',\n",
+ " 'cs.CR',\n",
+ " 'cs.CV',\n",
+ " 'cs.CY',\n",
+ " 'cs.DB',\n",
+ " 'cs.DC',\n",
+ " 'cs.DL',\n",
+ " 'cs.DM',\n",
+ " 'cs.DS',\n",
+ " 'cs.ET',\n",
+ " 'cs.FL',\n",
+ " 'cs.GL',\n",
+ " 'cs.GR',\n",
+ " 'cs.GT',\n",
+ " 'cs.HC',\n",
+ " 'cs.IR',\n",
+ " 'cs.IT',\n",
+ " 'cs.LG',\n",
+ " 'cs.LO',\n",
+ " 'cs.MA',\n",
+ " 'cs.MM',\n",
+ " 'cs.MS',\n",
+ " 'cs.NA',\n",
+ " 'cs.NE',\n",
+ " 'cs.NI',\n",
+ " 'cs.OH',\n",
+ " 'cs.OS',\n",
+ " 'cs.PF',\n",
+ " 'cs.PL',\n",
+ " 'cs.RO',\n",
+ " 'cs.SC',\n",
+ " 'cs.SD',\n",
+ " 'cs.SE',\n",
+ " 'cs.SI',\n",
+ " 'cs.SY',\n",
+ " 'econ.EM',\n",
+ " 'eess.AS',\n",
+ " 'eess.IV',\n",
+ " 'eess.SP',\n",
+ " 'gr-qc',\n",
+ " 'hep-ex',\n",
+ " 'hep-lat',\n",
+ " 'hep-ph',\n",
+ " 'hep-th',\n",
+ " 'math-ph',\n",
+ " 'math.AC',\n",
+ " 'math.AG',\n",
+ " 'math.AP',\n",
+ " 'math.AT',\n",
+ " 'math.CA',\n",
+ " 'math.CO',\n",
+ " 'math.CT',\n",
+ " 'math.CV',\n",
+ " 'math.DG',\n",
+ " 'math.DS',\n",
+ " 'math.FA',\n",
+ " 'math.GM',\n",
+ " 'math.GN',\n",
+ " 'math.GR',\n",
+ " 'math.GT',\n",
+ " 'math.HO',\n",
+ " 'math.IT',\n",
+ " 'math.KT',\n",
+ " 'math.LO',\n",
+ " 'math.MG',\n",
+ " 'math.MP',\n",
+ " 'math.NA',\n",
+ " 'math.NT',\n",
+ " 'math.OA',\n",
+ " 'math.OC',\n",
+ " 'math.PR',\n",
+ " 'math.QA',\n",
+ " 'math.RA',\n",
+ " 'math.SG',\n",
+ " 'math.SP',\n",
+ " 'math.ST',\n",
+ " 'nlin.AO',\n",
+ " 'nlin.CD',\n",
+ " 'nlin.CG',\n",
+ " 'nlin.PS',\n",
+ " 'nlin.SI',\n",
+ " 'nucl-ex',\n",
+ " 'nucl-th',\n",
+ " 'physics.acc-ph',\n",
+ " 'physics.ao-ph',\n",
+ " 'physics.app-ph',\n",
+ " 'physics.atm-clus',\n",
+ " 'physics.atom-ph',\n",
+ " 'physics.bio-ph',\n",
+ " 'physics.chem-ph',\n",
+ " 'physics.class-ph',\n",
+ " 'physics.comp-ph',\n",
+ " 'physics.data-an',\n",
+ " 'physics.ed-ph',\n",
+ " 'physics.flu-dyn',\n",
+ " 'physics.gen-ph',\n",
+ " 'physics.geo-ph',\n",
+ " 'physics.hist-ph',\n",
+ " 'physics.ins-det',\n",
+ " 'physics.med-ph',\n",
+ " 'physics.optics',\n",
+ " 'physics.plasm-ph',\n",
+ " 'physics.pop-ph',\n",
+ " 'physics.soc-ph',\n",
+ " 'physics.space-ph',\n",
+ " 'q-bio.BM',\n",
+ " 'q-bio.CB',\n",
+ " 'q-bio.GN',\n",
+ " 'q-bio.MN',\n",
+ " 'q-bio.NC',\n",
+ " 'q-bio.OT',\n",
+ " 'q-bio.PE',\n",
+ " 'q-bio.QM',\n",
+ " 'q-bio.SC',\n",
+ " 'q-fin.CP',\n",
+ " 'q-fin.EC',\n",
+ " 'q-fin.GN',\n",
+ " 'q-fin.MF',\n",
+ " 'q-fin.PM',\n",
+ " 'q-fin.PR',\n",
+ " 'q-fin.RM',\n",
+ " 'q-fin.ST',\n",
+ " 'q-fin.TR',\n",
+ " 'quant-ph',\n",
+ " 'stat.AP',\n",
+ " 'stat.CO',\n",
+ " 'stat.ME',\n",
+ " 'stat.ML',\n",
+ " 'stat.OT',\n",
+ " 'stat.TH']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "actual"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.16"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data/multilabel_classifier/multilabel-model.ipynb b/data/multilabel_classifier/multilabel-model.ipynb
new file mode 100644
index 0000000..0a257ef
--- /dev/null
+++ b/data/multilabel_classifier/multilabel-model.ipynb
@@ -0,0 +1,1264 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "import re\n",
+ "import string\n",
+ "import pickle\n",
+ "\n",
+ "from transformers import BertForSequenceClassification\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer\n",
+ "from transformers import AutoTokenizer\n",
+ "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n",
+ "from transformers import EvalPrediction\n",
+ "import torch\n",
+ "from transformers import TrainingArguments, Trainer\n",
+ "from datasets import Dataset\n",
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "#os.chdir('../..')\n",
+ "\n",
+ "from vecsim_app.categories import CATEGORIES\n",
+ "from vecsim_app.data_utils import papers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Define the parameters for fetching the papers dataset:\n",
+ "\n",
+ "- Dataset Path\n",
+ "- Year cutoff: Year cut off for the papers.\n",
+ "- Year pattern: regular expression used to match years\n",
+ "- Max Sample size: maximum sample size (useful if you just want to try out the notebook; if it's too low, the model won't perform well)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_PATH = \"./arxiv-metadata-oai-snapshot.json\"\n",
+ "YEAR_CUTOFF = 2010\n",
+ "YEAR_PATTERN = r\"(19|20[0-9]{2})\"\n",
+ "MAX_SAMPLE_SIZE = 20000\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))\n",
+ "len(df)\n",
+ "\n",
+ "# Take a sample for computing reasons\n",
+ "df = df.sample(MAX_SAMPLE_SIZE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
| id | title | year | authors | categories | abstract | text |
---|
210240 | 1512.07410 | Fragmentation of long-lived hydrocarbons after... | 2016 | Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... | physics.chem-ph,physics.atm-clus,physics.optics | We experimentally and theoretically investig... | Fragmentation of long-lived hydrocarbons after... |
---|
148715 | 1404.1520 | Single spin stochastic optical reconstruction ... | 2014 | Matthias Pfender, Nabeel Aslam, Gerald Waldher... | quant-ph,physics.optics | We experimentally demonstrate precision addr... | Single spin stochastic optical reconstruction ... |
---|
219516 | 1603.07790 | Weighted Pushdown Systems with Indexed Weight ... | 2016 | Yasuhiko Minamide | cs.FL,cs.PL | The reachability analysis of weighted pushdo... | Weighted Pushdown Systems with Indexed Weight ... |
---|
370814 | 1911.02005 | Simultaneous spectral estimation of dephasing ... | 2020 | Virginia Frey, Leigh M. Norris, Lorenza Viola ... | quant-ph | The fragility of quantum systems makes them ... | Simultaneous spectral estimation of dephasing ... |
---|
228464 | 1606.06192 | A Novel Quasi-One-Dimensional Topological Insu... | 2016 | Gabriel Aut\\`es, Anna Isaeva, Luca Moreschini,... | cond-mat.mtrl-sci,cond-mat.mes-hall | Recent progress in the field of topological ... | A Novel Quasi-One-Dimensional Topological Insu... |
---|
... | ... | ... | ... | ... | ... | ... | ... |
---|
316199 | 1808.06472 | Dark Matter Sommerfeld-enhanced annihilation a... | 2018 | Tobias Binder, Laura Covi and Kyohei Mukaida | hep-ph,astro-ph.CO,hep-th | Traditional computations of the dark matter ... | Dark Matter Sommerfeld-enhanced annihilation a... |
---|
414998 | 2010.12385 | Resonances in hyperbolic dynamics | 2018 | St\\'ephane Nonnenmacher | math-ph,math.DS,math.MP,math.SP | The study of wave propagation outside bounde... | Resonances in hyperbolic dynamics The study ... |
---|
189313 | 1506.01307 | Control of fixed points and existence and uniq... | 2016 | George Glauberman and Justin Lynd | math.GR,math.AT | A. Chermak has recently proved that to each ... | Control of fixed points and existence and uniq... |
---|
65915 | 1108.5137 | A Laser System for the Spectroscopy of Highly-... | 2012 | S. Albrecht, S. Altenburg, C. Siegel, N. Hersc... | physics.atom-ph,nucl-ex | We present and characterize a laser system f... | A Laser System for the Spectroscopy of Highly-... |
---|
319340 | 1809.05654 | Changes of graph structure of transition proba... | 2018 | Teruaki Okushima, Tomoaki Niiyama, Kensuke S. ... | cond-mat.dis-nn | Graphs of the most probable transitions for ... | Changes of graph structure of transition proba... |
---|
\n",
+ "
5000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "210240 1512.07410 Fragmentation of long-lived hydrocarbons after... 2016 \n",
+ "148715 1404.1520 Single spin stochastic optical reconstruction ... 2014 \n",
+ "219516 1603.07790 Weighted Pushdown Systems with Indexed Weight ... 2016 \n",
+ "370814 1911.02005 Simultaneous spectral estimation of dephasing ... 2020 \n",
+ "228464 1606.06192 A Novel Quasi-One-Dimensional Topological Insu... 2016 \n",
+ "... ... ... ... \n",
+ "316199 1808.06472 Dark Matter Sommerfeld-enhanced annihilation a... 2018 \n",
+ "414998 2010.12385 Resonances in hyperbolic dynamics 2018 \n",
+ "189313 1506.01307 Control of fixed points and existence and uniq... 2016 \n",
+ "65915 1108.5137 A Laser System for the Spectroscopy of Highly-... 2012 \n",
+ "319340 1809.05654 Changes of graph structure of transition proba... 2018 \n",
+ "\n",
+ " authors \\\n",
+ "210240 Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ "148715 Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ "219516 Yasuhiko Minamide \n",
+ "370814 Virginia Frey, Leigh M. Norris, Lorenza Viola ... \n",
+ "228464 Gabriel Aut\\`es, Anna Isaeva, Luca Moreschini,... \n",
+ "... ... \n",
+ "316199 Tobias Binder, Laura Covi and Kyohei Mukaida \n",
+ "414998 St\\'ephane Nonnenmacher \n",
+ "189313 George Glauberman and Justin Lynd \n",
+ "65915 S. Albrecht, S. Altenburg, C. Siegel, N. Hersc... \n",
+ "319340 Teruaki Okushima, Tomoaki Niiyama, Kensuke S. ... \n",
+ "\n",
+ " categories \\\n",
+ "210240 physics.chem-ph,physics.atm-clus,physics.optics \n",
+ "148715 quant-ph,physics.optics \n",
+ "219516 cs.FL,cs.PL \n",
+ "370814 quant-ph \n",
+ "228464 cond-mat.mtrl-sci,cond-mat.mes-hall \n",
+ "... ... \n",
+ "316199 hep-ph,astro-ph.CO,hep-th \n",
+ "414998 math-ph,math.DS,math.MP,math.SP \n",
+ "189313 math.GR,math.AT \n",
+ "65915 physics.atom-ph,nucl-ex \n",
+ "319340 cond-mat.dis-nn \n",
+ "\n",
+ " abstract \\\n",
+ "210240 We experimentally and theoretically investig... \n",
+ "148715 We experimentally demonstrate precision addr... \n",
+ "219516 The reachability analysis of weighted pushdo... \n",
+ "370814 The fragility of quantum systems makes them ... \n",
+ "228464 Recent progress in the field of topological ... \n",
+ "... ... \n",
+ "316199 Traditional computations of the dark matter ... \n",
+ "414998 The study of wave propagation outside bounde... \n",
+ "189313 A. Chermak has recently proved that to each ... \n",
+ "65915 We present and characterize a laser system f... \n",
+ "319340 Graphs of the most probable transitions for ... \n",
+ "\n",
+ " text \n",
+ "210240 Fragmentation of long-lived hydrocarbons after... \n",
+ "148715 Single spin stochastic optical reconstruction ... \n",
+ "219516 Weighted Pushdown Systems with Indexed Weight ... \n",
+ "370814 Simultaneous spectral estimation of dephasing ... \n",
+ "228464 A Novel Quasi-One-Dimensional Topological Insu... \n",
+ "... ... \n",
+ "316199 Dark Matter Sommerfeld-enhanced annihilation a... \n",
+ "414998 Resonances in hyperbolic dynamics The study ... \n",
+ "189313 Control of fixed points and existence and uniq... \n",
+ "65915 A Laser System for the Spectroscopy of Highly-... \n",
+ "319340 Changes of graph structure of transition proba... \n",
+ "\n",
+ "[5000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'astro-ph': 'Astrophysics',\n",
+ " 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',\n",
+ " 'astro-ph.EP': 'Earth and Planetary Astrophysics',\n",
+ " 'astro-ph.GA': 'Astrophysics of Galaxies',\n",
+ " 'astro-ph.HE': 'High Energy Astrophysical Phenomena',\n",
+ " 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',\n",
+ " 'astro-ph.SR': 'Solar and Stellar Astrophysics',\n",
+ " 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',\n",
+ " 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',\n",
+ " 'cond-mat.mtrl-sci': 'Materials Science',\n",
+ " 'cond-mat.other': 'Other Condensed Matter',\n",
+ " 'cond-mat.quant-gas': 'Quantum Gases',\n",
+ " 'cond-mat.soft': 'Soft Condensed Matter',\n",
+ " 'cond-mat.stat-mech': 'Statistical Mechanics',\n",
+ " 'cond-mat.str-el': 'Strongly Correlated Electrons',\n",
+ " 'cond-mat.supr-con': 'Superconductivity',\n",
+ " 'cs.AI': 'Artificial Intelligence',\n",
+ " 'cs.AR': 'Hardware Architecture',\n",
+ " 'cs.CC': 'Computational Complexity',\n",
+ " 'cs.CE': 'Computational Engineering, Finance, and Science',\n",
+ " 'cs.CG': 'Computational Geometry',\n",
+ " 'cs.CL': 'Computation and Language',\n",
+ " 'cs.CR': 'Cryptography and Security',\n",
+ " 'cs.CV': 'Computer Vision and Pattern Recognition',\n",
+ " 'cs.CY': 'Computers and Society',\n",
+ " 'cs.DB': 'Databases',\n",
+ " 'cs.DC': 'Distributed, Parallel, and Cluster Computing',\n",
+ " 'cs.DL': 'Digital Libraries',\n",
+ " 'cs.DM': 'Discrete Mathematics',\n",
+ " 'cs.DS': 'Data Structures and Algorithms',\n",
+ " 'cs.ET': 'Emerging Technologies',\n",
+ " 'cs.FL': 'Formal Languages and Automata Theory',\n",
+ " 'cs.GL': 'General Literature',\n",
+ " 'cs.GR': 'Graphics',\n",
+ " 'cs.GT': 'Computer Science and Game Theory',\n",
+ " 'cs.HC': 'Human-Computer Interaction',\n",
+ " 'cs.IR': 'Information Retrieval',\n",
+ " 'cs.IT': 'Information Theory',\n",
+ " 'cs.LG': 'Machine Learning',\n",
+ " 'cs.LO': 'Logic in Computer Science',\n",
+ " 'cs.MA': 'Multiagent Systems',\n",
+ " 'cs.MM': 'Multimedia',\n",
+ " 'cs.MS': 'Mathematical Software',\n",
+ " 'cs.NA': 'Numerical Analysis',\n",
+ " 'cs.NE': 'Neural and Evolutionary Computing',\n",
+ " 'cs.NI': 'Networking and Internet Architecture',\n",
+ " 'cs.OH': 'Other Computer Science',\n",
+ " 'cs.OS': 'Operating Systems',\n",
+ " 'cs.PF': 'Performance',\n",
+ " 'cs.PL': 'Programming Languages',\n",
+ " 'cs.RO': 'Robotics',\n",
+ " 'cs.SC': 'Symbolic Computation',\n",
+ " 'cs.SD': 'Sound',\n",
+ " 'cs.SE': 'Software Engineering',\n",
+ " 'cs.SI': 'Social and Information Networks',\n",
+ " 'cs.SY': 'Systems and Control',\n",
+ " 'econ.EM': 'Econometrics',\n",
+ " 'eess.AS': 'Audio and Speech Processing',\n",
+ " 'eess.IV': 'Image and Video Processing',\n",
+ " 'eess.SP': 'Signal Processing',\n",
+ " 'gr-qc': 'General Relativity and Quantum Cosmology',\n",
+ " 'hep-ex': 'High Energy Physics - Experiment',\n",
+ " 'hep-lat': 'High Energy Physics - Lattice',\n",
+ " 'hep-ph': 'High Energy Physics - Phenomenology',\n",
+ " 'hep-th': 'High Energy Physics - Theory',\n",
+ " 'math.AC': 'Commutative Algebra',\n",
+ " 'math.AG': 'Algebraic Geometry',\n",
+ " 'math.AP': 'Analysis of PDEs',\n",
+ " 'math.AT': 'Algebraic Topology',\n",
+ " 'math.CA': 'Classical Analysis and ODEs',\n",
+ " 'math.CO': 'Combinatorics',\n",
+ " 'math.CT': 'Category Theory',\n",
+ " 'math.CV': 'Complex Variables',\n",
+ " 'math.DG': 'Differential Geometry',\n",
+ " 'math.DS': 'Dynamical Systems',\n",
+ " 'math.FA': 'Functional Analysis',\n",
+ " 'math.GM': 'General Mathematics',\n",
+ " 'math.GN': 'General Topology',\n",
+ " 'math.GR': 'Group Theory',\n",
+ " 'math.GT': 'Geometric Topology',\n",
+ " 'math.HO': 'History and Overview',\n",
+ " 'math.IT': 'Information Theory',\n",
+ " 'math.KT': 'K-Theory and Homology',\n",
+ " 'math.LO': 'Logic',\n",
+ " 'math.MG': 'Metric Geometry',\n",
+ " 'math.MP': 'Mathematical Physics',\n",
+ " 'math.NA': 'Numerical Analysis',\n",
+ " 'math.NT': 'Number Theory',\n",
+ " 'math.OA': 'Operator Algebras',\n",
+ " 'math.OC': 'Optimization and Control',\n",
+ " 'math.PR': 'Probability',\n",
+ " 'math.QA': 'Quantum Algebra',\n",
+ " 'math.RA': 'Rings and Algebras',\n",
+ " 'math.RT': 'Representation Theory',\n",
+ " 'math.SG': 'Symplectic Geometry',\n",
+ " 'math.SP': 'Spectral Theory',\n",
+ " 'math.ST': 'Statistics Theory',\n",
+ " 'math-ph': 'Mathematical Physics',\n",
+ " 'nlin.AO': 'Adaptation and Self-Organizing Systems',\n",
+ " 'nlin.CD': 'Chaotic Dynamics',\n",
+ " 'nlin.CG': 'Cellular Automata and Lattice Gases',\n",
+ " 'nlin.PS': 'Pattern Formation and Solitons',\n",
+ " 'nlin.SI': 'Exactly Solvable and Integrable Systems',\n",
+ " 'nucl-ex': 'Nuclear Experiment',\n",
+ " 'nucl-th': 'Nuclear Theory',\n",
+ " 'physics.acc-ph': 'Accelerator Physics',\n",
+ " 'physics.ao-ph': 'Atmospheric and Oceanic Physics',\n",
+ " 'physics.app-ph': 'Applied Physics',\n",
+ " 'physics.atm-clus': 'Atomic and Molecular Clusters',\n",
+ " 'physics.atom-ph': 'Atomic Physics',\n",
+ " 'physics.bio-ph': 'Biological Physics',\n",
+ " 'physics.chem-ph': 'Chemical Physics',\n",
+ " 'physics.class-ph': 'Classical Physics',\n",
+ " 'physics.comp-ph': 'Computational Physics',\n",
+ " 'physics.data-an': 'Data Analysis, Statistics and Probability',\n",
+ " 'physics.ed-ph': 'Physics Education',\n",
+ " 'physics.flu-dyn': 'Fluid Dynamics',\n",
+ " 'physics.gen-ph': 'General Physics',\n",
+ " 'physics.geo-ph': 'Geophysics',\n",
+ " 'physics.hist-ph': 'History and Philosophy of Physics',\n",
+ " 'physics.ins-det': 'Instrumentation and Detectors',\n",
+ " 'physics.med-ph': 'Medical Physics',\n",
+ " 'physics.optics': 'Optics',\n",
+ " 'physics.plasm-ph': 'Plasma Physics',\n",
+ " 'physics.pop-ph': 'Popular Physics',\n",
+ " 'physics.soc-ph': 'Physics and Society',\n",
+ " 'physics.space-ph': 'Space Physics',\n",
+ " 'q-bio.BM': 'Biomolecules',\n",
+ " 'q-bio.CB': 'Cell Behavior',\n",
+ " 'q-bio.GN': 'Genomics',\n",
+ " 'q-bio.MN': 'Molecular Networks',\n",
+ " 'q-bio.NC': 'Neurons and Cognition',\n",
+ " 'q-bio.OT': 'Other Quantitative Biology',\n",
+ " 'q-bio.PE': 'Populations and Evolution',\n",
+ " 'q-bio.QM': 'Quantitative Methods',\n",
+ " 'q-bio.SC': 'Subcellular Processes',\n",
+ " 'q-bio.TO': 'Tissues and Organs',\n",
+ " 'q-fin.CP': 'Computational Finance',\n",
+ " 'q-fin.EC': 'Economics',\n",
+ " 'q-fin.GN': 'General Finance',\n",
+ " 'q-fin.MF': 'Mathematical Finance',\n",
+ " 'q-fin.PM': 'Portfolio Management',\n",
+ " 'q-fin.PR': 'Pricing of Securities',\n",
+ " 'q-fin.RM': 'Risk Management',\n",
+ " 'q-fin.ST': 'Statistical Finance',\n",
+ " 'q-fin.TR': 'Trading and Market Microstructure',\n",
+ " 'quant-ph': 'Quantum Physics',\n",
+ " 'stat.AP': 'Applications',\n",
+ " 'stat.CO': 'Computation',\n",
+ " 'stat.ME': 'Methodology',\n",
+ " 'stat.ML': 'Machine Learning',\n",
+ " 'stat.OT': 'Other Statistics',\n",
+ " 'stat.TH': 'Statistics Theory'}"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "CATEGORIES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5000, 6)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
| id | title | year | authors | categories | abstract |
---|
210240 | 1512.07410 | Fragmentation of long-lived hydrocarbons after... | 2016 | Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... | physics.chem-ph,physics.atm-clus,physics.optics | We experimentally and theoretically investig... |
---|
148715 | 1404.1520 | Single spin stochastic optical reconstruction ... | 2014 | Matthias Pfender, Nabeel Aslam, Gerald Waldher... | quant-ph,physics.optics | We experimentally demonstrate precision addr... |
---|
219516 | 1603.07790 | Weighted Pushdown Systems with Indexed Weight ... | 2016 | Yasuhiko Minamide | cs.FL,cs.PL | The reachability analysis of weighted pushdo... |
---|
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "210240 1512.07410 Fragmentation of long-lived hydrocarbons after... 2016 \n",
+ "148715 1404.1520 Single spin stochastic optical reconstruction ... 2014 \n",
+ "219516 1603.07790 Weighted Pushdown Systems with Indexed Weight ... 2016 \n",
+ "\n",
+ " authors \\\n",
+ "210240 Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ "148715 Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ "219516 Yasuhiko Minamide \n",
+ "\n",
+ " categories \\\n",
+ "210240 physics.chem-ph,physics.atm-clus,physics.optics \n",
+ "148715 quant-ph,physics.optics \n",
+ "219516 cs.FL,cs.PL \n",
+ "\n",
+ " abstract \n",
+ "210240 We experimentally and theoretically investig... \n",
+ "148715 We experimentally demonstrate precision addr... \n",
+ "219516 The reachability analysis of weighted pushdo... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['text'] = df['title'] + ' ' + df['abstract']\n",
+ "# df['categories'] = df['categories'].apply(lambda x: x.split(','))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'physics.chem-ph,physics.atm-clus,physics.optics'"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.iloc[0].categories"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train dataset creation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((5000, 7), (4000, 7), (1000, 7))"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_train, df_test = train_test_split(df, train_size=0.8)\n",
+ "\n",
+ "df.shape, df_train.shape, df_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_tokenizer(tokenizer_model):\n",
+ "    \"\"\"Load the pretrained tokenizer; return (batch tokenize function, tokenizer).\"\"\"\n",
+ "    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)\n",
+ "    def tokenize_function(examples):  # tokenizes the \"text\" column with max-length padding and truncation\n",
+ "        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
+ "    return tokenize_function, tokenizer\n",
+ "\n",
+ "tokenize_function, tokenizer = get_tokenizer('bert-base-uncased')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop samples that contain any category missing from our predefined CATEGORIES mapping (categories.py)\n",
+ "\n",
+ "df['split_categories'] = df['categories'].apply(lambda x: x.split(','))\n",
+ "\n",
+ "df = df[\n",
+ " df['split_categories'].apply(lambda x: len(set(x) - set(CATEGORIES)) == 0)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA',\n",
+ " 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn',\n",
+ " 'cond-mat.mes-hall', 'cond-mat.mtrl-sci'], dtype=object)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mlb = MultiLabelBinarizer()\n",
+ "# mlb.fit([[(k,v) for k, v in CATEGORIES.items()]]) #df_train['categories'])\n",
+ "mlb.fit([list(CATEGORIES.keys())]) #df_train['categories'])\n",
+ "mlb.classes_[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess_data(examples):\n",
+ "    \"\"\"Tokenize a batch of examples and attach multi-hot category labels.\n",
+ "\n",
+ "    Reads examples[\"text\"] and examples[\"categories\"] (comma-separated codes) and\n",
+ "    relies on the notebook-level `tokenizer` and the fitted `mlb` binarizer.\n",
+ "    \"\"\"\n",
+ "    # every text is padded/truncated to max_length=128\n",
+ "    batch = tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True, max_length=128)\n",
+ "    # one label row per example, cast to float\n",
+ "    label_lists = [cats.split(',') for cats in examples['categories']]\n",
+ "    batch[\"labels\"] = mlb.transform(label_lists).astype(float)\n",
+ "    return batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9692bf48791b4a1d87b8c5d796cef366",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/4 [00:00= threshold)] = 1\n",
+ " # finally, compute metrics\n",
+ " y_true = labels\n",
+ " f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')\n",
+ " roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')\n",
+ " accuracy = accuracy_score(y_true, y_pred)\n",
+ " # return as dictionary\n",
+ " metrics = {'f1': f1_micro_average,\n",
+ " 'roc_auc': roc_auc,\n",
+ " 'accuracy': accuracy}\n",
+ " return metrics\n",
+ "\n",
+ "def compute_metrics(p: EvalPrediction):\n",
+ "    \"\"\"Unwrap the predictions of an EvalPrediction and score them with multi_label_metrics.\"\"\"\n",
+ "    # `p.predictions` may be a tuple; in that case only its first element is scored.\n",
+ "    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions\n",
+ "    # Pass the unwrapped value: previously `preds` was computed but the raw `p.predictions` was forwarded.\n",
+ "    return multi_label_metrics(predictions=preds, labels=p.label_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = Trainer(\n",
+ " model,\n",
+ " args,\n",
+ " train_dataset=tokenized_train,\n",
+ " eval_dataset=tokenized_test,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/a.yushkovskiy/gh/atemate/redis-arXiv-search/.venv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+ " warnings.warn(\n",
+ "***** Running training *****\n",
+ " Num examples = 48\n",
+ " Num Epochs = 1\n",
+ " Instantaneous batch size per device = 8\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 6\n",
+ " Number of trainable parameters = 109599897\n",
+ "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n",
+ "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [6/6 00:08, Epoch 1/1]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "Epoch | Training Loss | Validation Loss | F1 | Roc Auc | Accuracy |
---|
1 | No log | 0.657666 | 0.026588 | 0.502474 | 0.000000 |
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "***** Running Evaluation *****\n",
+ " Num examples = 12\n",
+ " Batch size = 4\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n",
+ "Saving model checkpoint to paper-multilabel-finetuning/checkpoint-6\n",
+ "Configuration saved in paper-multilabel-finetuning/checkpoint-6/config.json\n",
+ "Model weights saved in paper-multilabel-finetuning/checkpoint-6/pytorch_model.bin\n",
+ "tokenizer config file saved in paper-multilabel-finetuning/checkpoint-6/tokenizer_config.json\n",
+ "Special tokens file saved in paper-multilabel-finetuning/checkpoint-6/special_tokens_map.json\n",
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n",
+ "Loading best model from paper-multilabel-finetuning/checkpoint-6 (score: 0.026587887740029542).\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=6, training_loss=0.6783577601114908, metrics={'train_runtime': 9.3058, 'train_samples_per_second': 5.158, 'train_steps_per_second': 0.645, 'total_flos': 3161613275136.0, 'train_loss': 0.6783577601114908, 'epoch': 1.0})"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "***** Running Evaluation *****\n",
+ " Num examples = 12\n",
+ " Batch size = 4\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " [3/3 00:00]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'eval_loss': 0.6576664447784424,\n",
+ " 'eval_f1': 0.026587887740029542,\n",
+ " 'eval_roc_auc': 0.5024737713970182,\n",
+ " 'eval_accuracy': 0.0,\n",
+ " 'eval_runtime': 0.6499,\n",
+ " 'eval_samples_per_second': 18.465,\n",
+ " 'eval_steps_per_second': 4.616,\n",
+ " 'epoch': 1.0}"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eval_res = trainer.evaluate()\n",
+ "eval_res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Perform inference on a given text sample"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "astro-ph.HE\n"
+ ]
+ }
+ ],
+ "source": [
+ "text = df['text'].iloc[5]\n",
+ "categories = df['categories'].iloc[5]\n",
+ "print(categories)\n",
+ "# Tokenize with the same truncation/max_length as preprocess_data so inference input matches training input.\n",
+ "encoding = tokenizer(text, truncation=True, max_length=128, return_tensors=\"pt\")\n",
+ "encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}\n",
+ "with torch.no_grad():  # inference only: no gradients are needed for the forward pass\n",
+ "    outputs = trainer.model(**encoding)\n",
+ "logits = outputs.logits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# apply sigmoid + threshold\n",
+ "sigmoid = torch.nn.Sigmoid()\n",
+ "probs = sigmoid(logits.squeeze().cpu())\n",
+ "predictions = np.zeros(probs.shape)\n",
+ "predictions[np.where(probs >= 0.3)] = 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "A Search for MeV to TeV Neutrinos from Fast Radio Bursts with IceCube We present two searches for IceCube neutrino events coincident with 28 fast\n",
+ "radio bursts (FRBs) and one repeating FRB. The first improves upon a previous\n",
+ "IceCube analysis -- searching for spatial and temporal correlation of events\n",
+ "with FRBs at energies greater than roughly 50 GeV -- by increasing the\n",
+ "effective area by an order of magnitude. The second is a search for temporal\n",
+ "correlation of MeV neutrino events with FRBs. No significant correlation is\n",
+ "found in either search, therefore, we set upper limits on the time-integrated\n",
+ "neutrino flux emitted by FRBs for a range of emission timescales less than one\n",
+ "day. These are the first limits on FRB neutrino emission at the MeV scale, and\n",
+ "the limits set at higher energies are an order-of-magnitude improvement over\n",
+ "those set by any neutrino telescope.\n",
+ "\n",
+ "[('astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft', 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con', 'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY', 'econ.EM', 'eess.AS', 'eess.IV', 'eess.SP', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MG', 'math.NA', 'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA', 'math.RA', 'math.RT', 'math.SG', 'math.SP', 'math.ST', 'nlin.AO', 'nlin.CD', 'nlin.CG', 'nlin.PS', 'nlin.SI', 'nucl-ex', 'nucl-th', 'physics.acc-ph', 'physics.ao-ph', 'physics.app-ph', 'physics.atm-clus', 'physics.atom-ph', 'physics.bio-ph', 'physics.chem-ph', 'physics.class-ph', 'physics.comp-ph', 'physics.data-an', 'physics.ed-ph', 'physics.flu-dyn', 'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph', 'physics.ins-det', 'physics.med-ph', 'physics.optics', 'physics.plasm-ph', 'physics.pop-ph', 'physics.soc-ph', 'physics.space-ph', 'q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC', 'q-bio.OT', 'q-bio.PE', 'q-bio.QM', 'q-bio.SC', 'q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM', 'q-fin.PR', 'q-fin.RM', 'q-fin.ST', 'q-fin.TR', 'quant-ph', 'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.OT', 'stat.TH')]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(text)\n",
+ "print(mlb.inverse_transform(predictions.reshape(1, -1)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Saving model checkpoint to ./checkpoint\n",
+ "Configuration saved in ./checkpoint/config.json\n",
+ "Model weights saved in ./checkpoint/pytorch_model.bin\n",
+ "tokenizer config file saved in ./checkpoint/tokenizer_config.json\n",
+ "Special tokens file saved in ./checkpoint/special_tokens_map.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer.save_model(output_dir='./checkpoint')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('./checkpoint/model_info.json', 'w') as f:\n",
+ "    json.dump(eval_res, f, indent=4)  # store the evaluation metrics in the ./checkpoint directory"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data/multilabel_classifier/requirements.txt b/data/multilabel_classifier/requirements.txt
new file mode 100644
index 0000000..1033e90
--- /dev/null
+++ b/data/multilabel_classifier/requirements.txt
@@ -0,0 +1,2 @@
+-e ../../backend # to access vecsim_app utils
+datasets
diff --git a/docker-local-redis.yml b/docker-local-redis.yml
index 3d9ee39..79b5860 100644
--- a/docker-local-redis.yml
+++ b/docker-local-redis.yml
@@ -10,6 +10,11 @@ services:
ports:
- 6379:6379
- 8001:8001
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 3s
+ timeout: 5s
+ retries: 5
backend:
image: ghcr.io/redisventures/redis-arxiv-search:latest
environment:
diff --git a/frontend/.tool-versions b/frontend/.tool-versions
new file mode 100644
index 0000000..0094556
--- /dev/null
+++ b/frontend/.tool-versions
@@ -0,0 +1 @@
+nodejs 16.14.2
diff --git a/frontend/package.json b/frontend/package.json
index 1595c87..fd9495f 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -9,7 +9,6 @@
"@material-ui/icons": "^4.11.3",
"@mui/icons-material": "^5.8.4",
"@mui/material": "^5.8.4",
- "material-ui-search-bar": "^1.0.0",
"@testing-library/jest-dom": "^5.14.1",
"@testing-library/react": "^13.0.0",
"@testing-library/user-event": "^13.2.1",
@@ -19,8 +18,10 @@
"@types/react-dom": "^18.0.0",
"autoprefixer": "10.4.5",
"bootstrap": "^5.1.3",
+ "immer": "^9.0.16",
"jwt-decode": "^3.1.2",
"material-ui": "^0.20.2",
+ "material-ui-search-bar": "^1.0.0",
"ra-data-simple-rest": "^4.2.0",
"react": "^18.2.0",
"react-admin": "^4.2.0",
@@ -29,7 +30,9 @@
"react-router": "^6.3.0",
"react-router-dom": "^6.3.0",
"react-scripts": "5.0.1",
+ "styled-components": "^5.3.6",
"typescript": "^4.4.2",
+ "use-immer": "^0.7.0",
"web-vitals": "^2.1.0"
},
"scripts": {
@@ -56,5 +59,8 @@
"last 1 safari version"
]
},
- "proxy": "http://localhost:8888"
+ "proxy": "http://localhost:8888",
+ "devDependencies": {
+ "@types/styled-components": "^5.1.26"
+ }
}
diff --git a/frontend/public/index.html b/frontend/public/index.html
index 614bb7f..769be84 100644
--- a/frontend/public/index.html
+++ b/frontend/public/index.html
@@ -9,6 +9,11 @@
name="description"
content="Redis vector similarity search demonstration"
/>
+
diff --git a/frontend/src/Layout.tsx b/frontend/src/Layout.tsx
index e42a91d..f99edd7 100644
--- a/frontend/src/Layout.tsx
+++ b/frontend/src/Layout.tsx
@@ -6,30 +6,13 @@ import { Footer } from './views/Footer';
export const Layout: FC = () => {
- const [papers, setPapers] = useState([]);
- const [categories, setCategories] = useState([]);
- const [years, setYears] = useState([]);
- const [state, setState] = useState('');
- const [total, setTotal] = useState(0);
-
return (
<>