Skip to content

Commit 5523fe5

Browse files
committed
feat: Add Open Library as a pyopds2 data provider and improve document key tracking in copydocs.py.
1 parent 12ad252 commit 5523fe5

File tree

1 file changed

+61
-3
lines changed

1 file changed

+61
-3
lines changed

openlibrary/book_providers.py

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -376,16 +376,74 @@ def get_acquisitions(
376376
if not access:
377377
return []
378378

379-
return [
379+
identifier = self.get_best_identifier(db_edition or ed_or_solr)
380+
acquisitions = [
380381
Acquisition(
381382
access=access,
382383
format='web',
383384
price=None,
384-
url=f'https://archive.org/details/{self.get_best_identifier(db_edition or ed_or_solr)}?view=theater&wrapper=false',
385+
url=f'https://archive.org/details/{identifier}?view=theater&wrapper=false',
385386
provider_name=self.short_name,
386-
)
387+
),
387388
]
388389

390+
# Add direct download links for open-access books
391+
if access == 'open-access':
392+
download_files = self._get_ia_download_files(identifier)
393+
for fmt, filename in download_files.items():
394+
if filename:
395+
acquisitions.append(
396+
Acquisition(
397+
access='open-access',
398+
format=fmt,
399+
price=None,
400+
url=f'https://archive.org/download/{identifier}/{filename}',
401+
provider_name=self.short_name,
402+
)
403+
)
404+
405+
return acquisitions
406+
407+
def _get_ia_download_files(
    self, identifier: str
) -> dict[Literal['pdf', 'epub'], str | None]:
    """
    Query the IA metadata API to get available download file names.

    Fetches ``https://archive.org/metadata/{identifier}`` and scans the
    ``files`` list of the JSON response, keeping the first named entry
    whose ``format`` is a PDF or EPUB format.

    The lookup is best-effort: a missing identifier, a failed or timed-out
    request, a non-JSON body, or an unexpectedly shaped payload all yield
    the "nothing found" result instead of raising.

    :param identifier: item identifier used to build the metadata URL.
    :return: dict mapping format to filename (``None`` when that format
        was not found), e.g.:
        {'pdf': 'mybook.pdf', 'epub': 'mybook.epub'}
    """
    result: dict[Literal['pdf', 'epub'], str | None] = {
        'pdf': None,
        'epub': None,
    }

    # Without an identifier the metadata URL would not address any item,
    # so skip the network round-trip entirely.
    if not identifier:
        return result

    import requests

    # Keep the try body limited to the calls that can actually fail on
    # network / decoding problems, and catch only those failures.
    try:
        resp = requests.get(
            f'https://archive.org/metadata/{identifier}',
            timeout=5,
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        logger.warning('Failed to fetch IA metadata for %s: %s', identifier, e)
        return result

    files = payload.get('files') if isinstance(payload, dict) else None
    if not isinstance(files, list):
        return result

    # IA uses 'Text PDF' or 'PDF' for PDFs and 'EPUB' for epub files.
    format_keys: dict[str, Literal['pdf', 'epub']] = {
        'Text PDF': 'pdf',
        'PDF': 'pdf',
        'EPUB': 'epub',
    }

    for entry in files:
        if not isinstance(entry, dict):
            continue

        fmt = entry.get('format', '')
        name = entry.get('name', '')
        if not isinstance(fmt, str) or not isinstance(name, str) or not name:
            continue

        key = format_keys.get(fmt)
        # Only the first match per format is kept.
        if key is None or result[key]:
            continue
        result[key] = name

        # Both formats resolved: no need to scan the remaining files.
        if all(result.values()):
            break

    return result
445+
446+
389447

390448
class LibriVoxProvider(AbstractBookProvider):
391449
short_name = 'librivox'

0 commit comments

Comments
 (0)