Skip to content

Commit 0b13192

Browse files
committed
Fix #2657 Fix OpenReview scraping
Some pages on OpenReview do not have __NEXT_DATA__, so the meta tags are now scraped for any fields that are not set.
1 parent 9620e8c commit 0b13192

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

scholia/scrape/openreview.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,14 +209,14 @@ def html_to_paper(html):
209209
210210
Returns
211211
-------
212-
dict
212+
paper : dict
213213
A dictionary containing metadata about the submission.
214214
215215
Notes
216216
-----
217217
The function will look at the JSON in the HTML. If title and author are not
218-
found in JSON the the metatags are examined, citation_title and
219-
citation_author.
218+
found in JSON then the metatags are examined, citation_title,
219+
citation_author, citation_online_date, description and citation_pdf_url.
220220
221221
The paper is not matched to proceedings.
222222
@@ -227,6 +227,9 @@ def html_to_paper(html):
227227
>>> paper['title'].startswith('Learning to grok')
228228
True
229229
230+
>>> paper['url']
231+
'https://openreview.net/forum?id=aVh9KRZdRk'
232+
230233
"""
231234
tree = etree.HTML(html)
232235
data = {}
@@ -267,6 +270,11 @@ def html_to_paper(html):
267270
# For instance https://openreview.net/forum?id=0g0X4H8yN4I
268271
# Instead we look at the meta tags
269272

273+
if 'abstract' not in data:
274+
abstract = _field_to_content(tree, 'description')
275+
if abstract:
276+
data['abstract'] = abstract
277+
270278
if 'authors' not in data:
271279
authors = [
272280
author_element.attrib['content']
@@ -283,12 +291,33 @@ def html_to_paper(html):
283291
if len(authors) > 0:
284292
data['authors'] = authors
285293

294+
if 'date' not in data:
295+
date = _field_to_content(tree, 'citation_online_date')
296+
if date:
297+
data['date'] = date.replace('/', '-')
298+
299+
if 'full_text_url' not in data:
300+
full_text_url = _field_to_content(tree, 'citation_pdf_url')
301+
if full_text_url:
302+
data['full_text_url'] = full_text_url
303+
304+
if 'openreview_id' not in data:
305+
if 'full_text_url' in data:
306+
openreview_id = paper_url_to_identifier(data['full_text_url'])
307+
if openreview_id:
308+
data['openreview_id'] = openreview_id
309+
286310
if 'title' not in data:
287311
title = _fields_to_content(tree, ['citation_title', 'DC.Title',
288312
'DC.Title.Alternative'])
289313
if title:
290314
data['title'] = title
291315

316+
if 'url' not in data:
317+
if 'openreview_id' in data:
318+
data['url'] = 'https://openreview.net/forum?id=' + \
319+
data['openreview_id']
320+
292321
return data
293322

294323

0 commit comments

Comments
 (0)