Skip to content

Commit cb388cd

Browse files
authored
Test paper discovery (#66)
* Add tests for discover jmlr and miniconf
* Add tests for discover openalex
* Add openreview tests
* Add test on pmlr
* Add semantic scholar test
* Add v3 to CI
* Disable coverage for now
1 parent 2343769 commit cb388cd

40 files changed

+9340
-23
lines changed

.github/workflows/python-package.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,9 @@ name: Python package
22

33
on:
44
push:
5-
branches: [ master ]
5+
branches: [ master, v3 ]
66
pull_request:
7-
branches: [ master ]
7+
branches: [ master, v3 ]
88

99
jobs:
1010
lint:
@@ -36,7 +36,7 @@ jobs:
3636
- python: '3.12'
3737
coverage: false
3838
- python: '3.13'
39-
coverage: true
39+
coverage: false
4040
steps:
4141
- name: Check out the code
4242
uses: actions/checkout@v3

src/paperoni/discovery/miniconf.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import math
12
import re
23
from datetime import datetime
34

@@ -43,9 +44,11 @@ def convert_paper(self, data, conference=None, venue_date=None):
4344
name=author_data.get("fullname", ""),
4445
aliases=[],
4546
links=[
46-
Link(type="profile", link=author_data["url"])
47-
if author_data.get("url")
48-
else None
47+
(
48+
Link(type="profile", link=author_data["url"])
49+
if author_data.get("url")
50+
else None
51+
)
4952
],
5053
)
5154
author.links = [link for link in author.links if link is not None]
@@ -122,6 +125,9 @@ def expand_base(uri):
122125
links.add(Link(type="pdf", link=url))
123126
if url := expand_base(data.get("virtualsite_url")):
124127
links.add(Link(type="abstract", link=url))
128+
links = sorted(
129+
links, key=lambda x: ({"pdf": 0}.get(x.type, math.inf), x.type, x.link)
130+
)
125131

126132
# Add eventmedia links
127133
for media in data.get("eventmedia", []):

src/paperoni/discovery/openalex.py

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -313,9 +313,6 @@ def query(
313313
# Title of the paper (mutually exclusive with "exact-title")
314314
# [alias: -t]
315315
title: str = None,
316-
# Exact title to query (mutually exclusive with "title")
317-
# [alias: -T]
318-
exact_title: str = None,
319316
# Page of results to display (start at 1). Need argument "per_page". By default, all results are displayed.
320317
page: int = None,
321318
# Number of results to display per page. Need argument "page". By default, all results are displayed.
@@ -354,14 +351,8 @@ def query(
354351
return
355352
filters.append(f"institutions.id:{institution_id}")
356353

357-
if title and exact_title:
358-
raise QueryError("Cannot query both title and exact title")
359-
elif title:
354+
if title:
360355
filters.append(f"display_name.search:{title}")
361-
elif exact_title:
362-
# No stemming, and quotation mark around title, to try to get exact title
363-
# https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/search-entities#boolean-searches
364-
filters.append(f'display_name.search.no_stem:"{exact_title}"')
365356

366357
params = {}
367358
if filters:

src/paperoni/discovery/openreview.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -525,9 +525,23 @@ def query(
525525
block_size=block_size,
526526
limit=limit,
527527
)
528+
528529
has_papers = False
529-
for paper in q:
530-
has_papers = True
531-
yield paper
530+
531+
exception = None
532+
try:
533+
for paper in q:
534+
has_papers = True
535+
yield paper
536+
537+
except openreview.OpenReviewException as e:
538+
# Try the next API version while holding the exception
539+
exception = e
540+
continue
541+
532542
if has_papers:
533543
break
544+
545+
else:
546+
if exception is not None:
547+
raise exception

src/paperoni/model/classes.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,12 @@ def assimilate_date(date, infer_precision=True):
5151
if year < 100:
5252
year += 2000
5353
return {
54-
"date": f"{year}-01-01 00:00",
54+
"date": datetime(year, 1, 1).date(),
5555
"date_precision": DatePrecision.year,
5656
}
5757
case str() as year if re.match("^[0-9]{4}$", date):
5858
return {
59-
"date": f"{year}-01-01 00:00",
59+
"date": datetime(year, 1, 1).date(),
6060
"date_precision": DatePrecision.year,
6161
}
6262
case str() if m := re.match("^(....)-(..)-(..).*", date):
@@ -70,15 +70,15 @@ def assimilate_date(date, infer_precision=True):
7070
else:
7171
precision = DatePrecision.day
7272
return {
73-
"date": f"{year}-{month}-{day} 00:00",
73+
"date": datetime(year, month, day).date(),
7474
"date_precision": precision,
7575
}
7676
case _: # pragma: no cover
7777
assert False
7878
case None | "":
7979
return (
8080
{
81-
"date": "2000-01-01 00:00",
81+
"date": datetime(2000, 1, 1).date(),
8282
"date_precision": DatePrecision.unknown,
8383
}
8484
if infer_precision

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
from pathlib import Path
2+
3+
import gifnoc
4+
from pytest import fixture
5+
6+
7+
@fixture(scope="session", autouse=True)
8+
def set_config():
9+
with gifnoc.use(Path(__file__).resolve().parent.parent / "config/basic.yaml"):
10+
yield

tests/discovery/__init__.py

Whitespace-only changes.

tests/discovery/test_jmlr.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
from pytest_regressions.data_regression import DataRegressionFixture
2+
3+
from paperoni.discovery.base import PaperInfo
4+
from paperoni.discovery.jmlr import JMLR
5+
6+
from ..utils import check_papers
7+
8+
9+
def test_query(data_regression: DataRegressionFixture):
10+
discoverer = JMLR()
11+
12+
assert "v24" in discoverer.list_volumes()
13+
14+
papers: list[PaperInfo] = sorted(
15+
discoverer.query(volume="v24", name="Yoshua Bengio"),
16+
key=lambda x: x.paper.title,
17+
)
18+
19+
assert papers, "No papers found for Yoshua Bengio in v24"
20+
21+
check_papers(data_regression, papers)
Lines changed: 132 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,132 @@
1+
- key: jmlr:vv24:benchmarking_graph_neural_networks
2+
paper:
3+
abstract: null
4+
authors:
5+
- affiliations: []
6+
author:
7+
aliases: []
8+
links: []
9+
name: Vijay Prakash Dwivedi
10+
display_name: Vijay Prakash Dwivedi
11+
- affiliations: []
12+
author:
13+
aliases: []
14+
links: []
15+
name: Chaitanya K. Joshi
16+
display_name: Chaitanya K. Joshi
17+
- affiliations: []
18+
author:
19+
aliases: []
20+
links: []
21+
name: Anh Tuan Luu
22+
display_name: Anh Tuan Luu
23+
- affiliations: []
24+
author:
25+
aliases: []
26+
links: []
27+
name: Thomas Laurent
28+
display_name: Thomas Laurent
29+
- affiliations: []
30+
author:
31+
aliases: []
32+
links: []
33+
name: Yoshua Bengio
34+
display_name: Yoshua Bengio
35+
- affiliations: []
36+
author:
37+
aliases: []
38+
links: []
39+
name: Xavier Bresson
40+
display_name: Xavier Bresson
41+
flags: []
42+
links:
43+
- link: https://jmlr.org/papers/v24/22-0567.html
44+
type: abstract.official
45+
- link: https://jmlr.org/papers/volume24/22-0567/22-0567.pdf
46+
type: pdf.official
47+
- link: https://jmlr.org/papers/v24/22-0567.bib
48+
type: bibtex
49+
releases:
50+
- pages: 1-48
51+
status: published
52+
venue:
53+
aliases: []
54+
date: '2023-01-01'
55+
date_precision: 1
56+
links: []
57+
name: Journal of Machine Learning Research
58+
open: false
59+
peer_reviewed: true
60+
publisher: JMLR
61+
series: JMLR
62+
type: journal
63+
volume: null
64+
title: Benchmarking Graph Neural Networks
65+
topics: []
66+
update_key: null
67+
- key: jmlr:vv24:gflownet_foundations
68+
paper:
69+
abstract: null
70+
authors:
71+
- affiliations: []
72+
author:
73+
aliases: []
74+
links: []
75+
name: Yoshua Bengio
76+
display_name: Yoshua Bengio
77+
- affiliations: []
78+
author:
79+
aliases: []
80+
links: []
81+
name: Salem Lahlou
82+
display_name: Salem Lahlou
83+
- affiliations: []
84+
author:
85+
aliases: []
86+
links: []
87+
name: Tristan Deleu
88+
display_name: Tristan Deleu
89+
- affiliations: []
90+
author:
91+
aliases: []
92+
links: []
93+
name: Edward J. Hu
94+
display_name: Edward J. Hu
95+
- affiliations: []
96+
author:
97+
aliases: []
98+
links: []
99+
name: Mo Tiwari
100+
display_name: Mo Tiwari
101+
- affiliations: []
102+
author:
103+
aliases: []
104+
links: []
105+
name: Emmanuel Bengio
106+
display_name: Emmanuel Bengio
107+
flags: []
108+
links:
109+
- link: https://jmlr.org/papers/v24/22-0364.html
110+
type: abstract.official
111+
- link: https://jmlr.org/papers/volume24/22-0364/22-0364.pdf
112+
type: pdf.official
113+
- link: https://jmlr.org/papers/v24/22-0364.bib
114+
type: bibtex
115+
releases:
116+
- pages: 1-55
117+
status: published
118+
venue:
119+
aliases: []
120+
date: '2023-01-01'
121+
date_precision: 1
122+
links: []
123+
name: Journal of Machine Learning Research
124+
open: false
125+
peer_reviewed: true
126+
publisher: JMLR
127+
series: JMLR
128+
type: journal
129+
volume: null
130+
title: GFlowNet Foundations
131+
topics: []
132+
update_key: null

tests/discovery/test_miniconf.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
import itertools
2+
3+
import pytest
4+
from pytest_regressions.data_regression import DataRegressionFixture
5+
6+
from paperoni.discovery.base import PaperInfo
7+
from paperoni.discovery.miniconf import MiniConf, conference_urls
8+
9+
from ..utils import check_papers, iter_affiliations
10+
11+
12+
@pytest.mark.parametrize(
13+
["conference", "query_params"],
14+
itertools.product(
15+
conference_urls, [{"affiliation": "mila"}, {"author": "Yoshua Bengio"}]
16+
),
17+
)
18+
def test_query(data_regression: DataRegressionFixture, conference, query_params):
19+
discoverer = MiniConf()
20+
21+
papers: list[PaperInfo] = sorted(
22+
discoverer.query(conference, year=2024, **query_params),
23+
key=lambda x: x.paper.title,
24+
)
25+
26+
match_found = False
27+
28+
for param in query_params:
29+
match param:
30+
case "affiliation":
31+
assert all(
32+
any(
33+
query_params["affiliation"].lower() in aff.name.lower()
34+
for aff in iter_affiliations(paper.paper)
35+
)
36+
for paper in papers
37+
), (
38+
f"Some papers do not contain the affiliation {query_params['affiliation']=}"
39+
)
40+
match_found = True
41+
42+
case "author":
43+
assert all(
44+
any(
45+
query_params["author"].lower() in author.author.name.lower()
46+
for author in paper.paper.authors
47+
)
48+
for paper in papers
49+
), f"Some papers do not contain the author {query_params['author']=}"
50+
match_found = True
51+
52+
if not match_found:
53+
assert False, f"Unknown query parameters: {query_params=}"
54+
55+
check_papers(data_regression, papers)

0 commit comments

Comments
 (0)