|
47 | 47 | """ |
48 | 48 |
|
49 | 49 |
|
50 | | -def paper_url_to_html(identifier): |
51 | | - """Download the HTML content from an OpenReview.net submission page. |
def paper_url_to_identifier(url):
    """Extract OpenReview identifier from URL.

    The identifier is the value of the ``id`` query parameter. The parameter
    name is matched case-insensitively, while the case of the value is
    preserved. Other parameters whose names merely end in ``id`` (such as
    ``noteId``) and a trailing ``#fragment`` are ignored.

    Parameters
    ----------
    url : str
        String with OpenReview URL.

    Returns
    -------
    identifier : str
        String with OpenReview identifier. It is empty if the ``id``
        parameter is present but has no value.

    Raises
    ------
    ValueError
        If the URL does not contain an ``id`` parameter.

    Examples
    --------
    >>> paper_url_to_identifier("https://openreview.net/forum?id=aVh9KRZdRk")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier("https://openreview.net/pdf?id=aVh9KRZdRk")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier(
    ...     "https://openreview.net/forum?noteId=abc&id=aVh9KRZdRk#discussion")
    'aVh9KRZdRk'
    >>> paper_url_to_identifier("https://www.wikidata.org")
    Traceback (most recent call last):
      ...
    ValueError: URL does not contain an 'id=' parameter

    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import re

    # Require 'id=' to start a parameter (string start, '?' or '&') so that
    # keys such as 'noteId=' or 'valid=' are not mistaken for it, and stop
    # the value at the next parameter ('&') or at the fragment ('#').
    match = re.search(
        r"(?:^|[?&])id=([^&#]*)", url.strip(), flags=re.IGNORECASE
    )
    if match is None:
        raise ValueError("URL does not contain an 'id=' parameter")
    return match.group(1)
| 90 | + |
| 91 | + |
| 92 | +def paper_url_to_html(url_or_identifier): |
| 93 | + """Download the HTML content from an OpenReview.net submission page. |
| 94 | +
|
| 95 | + Parameters |
| 96 | + ---------- |
| 97 | + url_or_identifier : str |
56 | 98 | The URL or the submission ID of the OpenReview.net submission. |
| 99 | + The URL can be either the PDF URL or the HTML (forum) URL of the submission.
57 | 100 |
|
58 | 101 | Returns |
59 | 102 | ------- |
60 | | - str |
| 103 | + html : str |
61 | 104 | The HTML content of the page. |
62 | 105 |
|
| 106 | + Raises |
| 107 | + ------ |
| 108 | + ValueError |
| 109 | + If the OpenReview identifier cannot be found in the URL. |
| 110 | + requests.exceptions.HTTPError |
| 111 | + If the OpenReview website responds with an HTTP error status, e.g., when the page is not found.
| 112 | +
|
63 | 113 | Examples |
64 | 114 | -------- |
| 115 | + >>> # HTML of "Learning to grok" paper |
65 | 116 | >>> html = paper_url_to_html('https://openreview.net/forum?id=aVh9KRZdRk') |
| 117 | + >>> "Learning to grok" in html |
| 118 | + True |
66 | 119 |
|
67 | 120 | """ |
68 | | - if identifier.startswith('http'): |
69 | | - url = identifier |
| 121 | + if url_or_identifier.startswith('http'): |
| 122 | + identifier = paper_url_to_identifier(url_or_identifier) |
70 | 123 | else: |
71 | | - url = f'https://openreview.net/forum?id={identifier}' |
| 124 | + identifier = url_or_identifier |
| 125 | + |
| 126 | + url = f'https://openreview.net/forum?id={identifier}' |
72 | 127 | headers = {'User-Agent': USER_AGENT} |
73 | 128 | response = requests.get(url, headers=headers) |
74 | 129 | response.raise_for_status() |
|
0 commit comments