Skip to content

Commit e94e987

Browse files
committed
Fix #2651: Extended OpenReview id matching
1 parent 6ca1268 commit e94e987

File tree

1 file changed

+61
-6
lines changed

1 file changed

+61
-6
lines changed

scholia/scrape/openreview.py

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,28 +47,83 @@
4747
"""
4848

4949

50-
def paper_url_to_html(identifier):
51-
"""Download the HTML content from an OpenReview.net submission page.
50+
def paper_url_to_identifier(url):
    """Extract OpenReview identifier from URL.

    The identifier is read from the ``id`` parameter of the URL's query
    string. The parameter name is matched case-insensitively while the case
    of the value is preserved. Only a parameter named exactly ``id`` is
    considered, so parameters whose names merely end in "id" (such as
    ``noteId``) are skipped, and a trailing URL fragment is never included
    in the identifier.

    Parameters
    ----------
    url : str
        String with OpenReview URL.

    Returns
    -------
    identifier : str
        String with OpenReview identifier. The string is empty if the URL
        has an ``id`` parameter with an empty value.

    Raises
    ------
    ValueError
        If the URL has no ``id`` query parameter.

    Examples
    --------
    >>> paper_url_to_identifier("https://openreview.net/forum?id=aVh9KRZdRk")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier("https://openreview.net/pdf?id=aVh9KRZdRk")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier(
    ...     "https://openreview.net/forum?noteId=abc&id=aVh9KRZdRk")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier(
    ...     "https://openreview.net/pdf?id=aVh9KRZdRk#page=3")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier("https://www.wikidata.org")
    Traceback (most recent call last):
      ...
    ValueError: URL does not contain an 'id=' parameter

    """
    # Imported locally so this function does not rely on the module-level
    # import block.
    from urllib.parse import urlsplit

    # Restrict the search to the query component: this drops any '#fragment'
    # and avoids matching 'id=' that happens to occur in the host or path.
    query = urlsplit(url.strip()).query

    # The query is split by hand rather than with parse_qs so the value is
    # returned exactly as written in the URL (no percent-decoding).
    for field in query.split('&'):
        name, separator, value = field.partition('=')
        # Require the full parameter name to be 'id' so that e.g. 'noteId'
        # is not mistaken for the submission identifier.
        if separator and name.lower() == 'id':
            return value

    raise ValueError("URL does not contain an 'id=' parameter")
90+
91+
92+
def paper_url_to_html(url_or_identifier):
93+
"""Download the HTML content from an OpenReview.net submission page.
94+
95+
Parameters
96+
----------
97+
url_or_identifier : str
5698
The URL or the submission ID of the OpenReview.net submission.
99+
The URL could be both the URL to the PDF or the HTML of the submission.
57100
58101
Returns
59102
-------
60-
str
103+
html : str
61104
The HTML content of the page.
62105
106+
Raises
107+
------
108+
ValueError
109+
If the OpenReview identifier cannot be found in the URL.
110+
requests.exceptions.HTTPError
111+
If URL not found on the OpenReview website.
112+
63113
Examples
64114
--------
115+
>>> # HTML of "Learning to grok" paper
65116
>>> html = paper_url_to_html('https://openreview.net/forum?id=aVh9KRZdRk')
117+
>>> "Learning to grok" in html
118+
True
66119
67120
"""
68-
if identifier.startswith('http'):
69-
url = identifier
121+
if url_or_identifier.startswith('http'):
122+
identifier = paper_url_to_identifier(url_or_identifier)
70123
else:
71-
url = f'https://openreview.net/forum?id={identifier}'
124+
identifier = url_or_identifier
125+
126+
url = f'https://openreview.net/forum?id={identifier}'
72127
headers = {'User-Agent': USER_AGENT}
73128
response = requests.get(url, headers=headers)
74129
response.raise_for_status()

0 commit comments

Comments
 (0)