feat: Add Open Library as a pyopds2 data provider and improve document key tracking in copydocs.py.

ronibhakta1 · ronibhakta1 · commit 5523fe5d053d · 2026-01-24T18:31:14.000+05:30
diff --git a/openlibrary/book_providers.py b/openlibrary/book_providers.py
@@ -376,16 +376,74 @@ def get_acquisitions(
         if not access:
             return []
 
-        return [
+        identifier = self.get_best_identifier(db_edition or ed_or_solr)
+        acquisitions = [
             Acquisition(
                 access=access,
                 format='web',
                 price=None,
-                url=f'https://archive.org/details/{self.get_best_identifier(db_edition or ed_or_solr)}?view=theater&wrapper=false',
+                url=f'https://archive.org/details/{identifier}?view=theater&wrapper=false',
                 provider_name=self.short_name,
-            )
+            ),
         ]
 
+        # Add direct download links for open-access books
+        if access == 'open-access':
+            download_files = self._get_ia_download_files(identifier)
+            for fmt, filename in download_files.items():
+                if filename:
+                    acquisitions.append(
+                        Acquisition(
+                            access='open-access',
+                            format=fmt,
+                            price=None,
+                            url=f'https://archive.org/download/{identifier}/{filename}',
+                            provider_name=self.short_name,
+                        )
+                    )
+
+        return acquisitions
+
+    def _get_ia_download_files(
+        self, identifier: str
+    ) -> dict[Literal['pdf', 'epub'], str | None]:
+        """
+        Query IA metadata API to get available download file names.
+
+        Returns a dict mapping format to filename, e.g.:
+        {'pdf': 'mybook.pdf', 'epub': 'mybook.epub'}
+        """
+        import requests
+
+        result: dict[Literal['pdf', 'epub'], str | None] = {
+            'pdf': None,
+            'epub': None,
+        }
+
+        try:
+            resp = requests.get(
+                f'https://archive.org/metadata/{identifier}',
+                timeout=5,
+            )
+            resp.raise_for_status()
+            files = resp.json().get('files', [])
+
+            for f in files:
+                fmt = f.get('format', '')
+                name = f.get('name', '')
+                # IA uses 'Text PDF' or 'PDF' for PDFs
+                if fmt in ('Text PDF', 'PDF') and not result['pdf']:
+                    result['pdf'] = name
+                # IA uses 'EPUB' for epub files
+                elif fmt == 'EPUB' and not result['epub']:
+                    result['epub'] = name
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch IA metadata for {identifier}: {e}')
+
+        return result
+
+
 
 class LibriVoxProvider(AbstractBookProvider):
     short_name = 'librivox'