
Commit 4b7ea29

pidgezero-one, pre-commit-ci[bot], and cdrini authored
feat: use author identifiers in import API (#10110)
* author identifiers in import
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* this wasnt supposed to be here
* this was supposed to be here
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* notes
* no more try/catch
* precommits
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* ?
* re: slack convo, go ahead and import this and use get_author_config over hardcoded IDs
* scripts
* books are being imported, but author page does not list them
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fix failing test
* add 1900 exemption for wikisource, move script requirements into their own file
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/catalog/add_book/load_book.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/core/models.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/core/models.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update openlibrary/core/models.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update scripts/providers/import_wikisource.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update scripts/providers/import_wikisource.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update scripts/providers/import_wikisource.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update scripts/providers/import_wikisource.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* Update scripts/providers/import_wikisource.py (Co-authored-by: Drini Cami <cdrini@gmail.com>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* imports
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* commtents
* bracket fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* update script instructions
* :(
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* ?
* Update import API to use key/remote_ids instead of ol_id/identifiers to match type schema
* Have Author.merge_remote_ids error on conflict for now

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Drini Cami <cdrini@gmail.com>
1 parent 86d9b8a commit 4b7ea29

File tree: 9 files changed (316 additions, 61 deletions)
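
For orientation, a minimal sketch of the author portion of an import request after this change. Only the `authors[].key` and `authors[].remote_ids` fields reflect this commit; the title, source record, and identifier values below are invented placeholders, not real records.

```python
# Hypothetical import payload illustrating the new author fields.
# All concrete values here are made up for illustration.
import json

record = {
    "title": "Example Title",
    "source_records": ["wikisource:en:Some_Work"],
    "authors": [
        {
            "name": "William H. Brewer",
            # Either of these can be used to match an existing OL author:
            "key": "/authors/OL1234567A",        # OL key: highest-priority match
            "remote_ids": {"viaf": "12345678"},  # remote identifiers: next priority
        }
    ],
}
print(json.dumps(record, indent=2))
```
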

openlibrary/catalog/add_book/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -74,6 +74,7 @@
     "????",
     "01-01-1900",
 ]
+SUSPECT_DATE_EXEMPT_SOURCES: Final = ["wikisource"]
 SUSPECT_AUTHOR_NAMES: Final = ["unknown", "n/a"]
 SOURCE_RECORDS_REQUIRING_DATE_SCRUTINY: Final = ["amazon", "bwb", "promise"]
 ALLOWED_COVER_HOSTS: Final = ("m.media-amazon.com", "books.google.com")
```

openlibrary/catalog/add_book/load_book.py

Lines changed: 74 additions & 27 deletions
```diff
@@ -9,6 +9,7 @@
     key_int,
 )
 from openlibrary.core.helpers import extract_year
+from openlibrary.utils import extract_numeric_id_from_olid, uniq
 
 if TYPE_CHECKING:
     from openlibrary.plugins.upstream.models import Author
@@ -149,8 +150,48 @@ def walk_redirects(obj, seen):
         seen.add(obj['key'])
         return obj
 
-    # Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
-    # then last name with identical birth and death dates (that are not themselves `None` or '').
+    def get_redirected_authors(authors: list["Author"]):
+        keys = [a.type.key for a in authors]
+        if any(k != '/type/author' for k in keys):
+            seen: set[dict] = set()
+            all_authors = [
+                walk_redirects(a, seen) for a in authors if a['key'] not in seen
+            ]
+            return all_authors
+        return authors
+
+    # Look for OL ID first.
+    if (key := author.get("key")) and (record := web.ctx.site.get(key)):
+        # Always match on OL ID, even if remote identifiers don't match.
+        return get_redirected_authors([record])
+
+    # Try other identifiers next.
+    if remote_ids := author.get("remote_ids"):
+        queries = []
+        matched_authors: list[Author] = []
+        # Get all the authors that match any incoming identifier.
+        for identifier, val in remote_ids.items():
+            queries.append({"type": "/type/author", f"remote_ids.{identifier}": val})
+        for query in queries:
+            if reply := list(web.ctx.site.things(query)):
+                matched_authors.extend(
+                    get_redirected_authors(list(web.ctx.site.get_many(reply)))
+                )
+        matched_authors = uniq(matched_authors, key=lambda thing: thing.key)
+        # The match is whichever one has the most identifiers in common
+        if matched_authors:
+            selected_match = sorted(
+                matched_authors,
+                key=lambda a: (
+                    # First sort by number of matches desc
+                    -1 * a.merge_remote_ids(remote_ids)[1],
+                    # If there's a tie, prioritize lower OL ID
+                    extract_numeric_id_from_olid(a.key),
+                ),
+            )[0]
+            return [selected_match]
+
+    # Fall back to name/date matching, which we did before introducing identifiers.
     name = author["name"].replace("*", r"\*")
     queries = [
         {"type": "/type/author", "name~": name},
@@ -162,37 +203,18 @@ def walk_redirects(obj, seen):
             "death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
         },  # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
     ]
+    things = []
     for query in queries:
         if reply := list(web.ctx.site.things(query)):
+            things = get_redirected_authors(list(web.ctx.site.get_many(reply)))
             break
-
-    authors = [web.ctx.site.get(k) for k in reply]
-    if any(a.type.key != '/type/author' for a in authors):
-        seen: set[dict] = set()
-        authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
-    return authors
-
-
-def find_entity(author: dict[str, Any]) -> "Author | None":
-    """
-    Looks for an existing Author record in OL
-    and returns it if found.
-
-    :param dict author: Author import dict {"name": "Some One"}
-    :return: Existing Author record if found, or None.
-    """
-    assert isinstance(author, dict)
-    things = find_author(author)
-    if author.get('entity_type', 'person') != 'person':
-        return things[0] if things else None
     match = []
     seen = set()
     for a in things:
         key = a['key']
         if key in seen:
             continue
         seen.add(key)
-        orig_key = key
         assert a.type.key == '/type/author'
         if 'birth_date' in author and 'birth_date' not in a:
             continue
@@ -202,10 +224,27 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
             continue
         match.append(a)
     if not match:
-        return None
+        return []
     if len(match) == 1:
-        return match[0]
-    return pick_from_matches(author, match)
+        return match
+    return [pick_from_matches(author, match)]
+
+
+def find_entity(author: dict[str, Any]) -> "Author | None":
+    """
+    Looks for an existing Author record in OL
+    and returns it if found.
+
+    :param dict author: Author import dict {"name": "Some One"}
+    :return: Existing Author record if found, or None.
+    """
+    assert isinstance(author, dict)
+    things = find_author(author)
+    if "remote_ids" in author:
+        for index, t in enumerate(things):
+            t.remote_ids, _ = t.merge_remote_ids(author["remote_ids"])
+            things[index] = t
+    return things[0] if things else None
 
 
 def remove_author_honorifics(name: str) -> str:
@@ -253,7 +292,15 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
         new['death_date'] = author['death_date']
         return new
     a = {'type': {'key': '/type/author'}}
-    for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
+    for f in (
+        'name',
+        'title',
+        'personal_name',
+        'birth_date',
+        'death_date',
+        'date',
+        'remote_ids',
+    ):
         if f in author:
             a[f] = author[f]
     return a
```
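
Taken together, the new find_author flow resolves an incoming author in three passes: exact OL key first, then any matching remote identifier (ties broken by most identifiers in common, then lowest OL ID), then the pre-existing name/date matching. A standalone sketch of that tie-break follows; it uses plain dicts and a hypothetical count_matching_ids helper in place of Author.merge_remote_ids.

```python
# Rough sketch of the identifier tie-break used above. Candidate authors are plain
# dicts here, and count_matching_ids stands in for Author.merge_remote_ids.
def count_matching_ids(candidate: dict, incoming: dict[str, str]) -> int:
    # Number of incoming identifiers that agree with the candidate's remote_ids.
    return sum(
        1
        for name, value in incoming.items()
        if candidate.get("remote_ids", {}).get(name) == value
    )

def pick_best_candidate(candidates: list[dict], incoming: dict[str, str]) -> dict:
    # Most shared identifiers first; lowest numeric OL ID breaks ties.
    return sorted(
        candidates,
        key=lambda a: (
            -count_matching_ids(a, incoming),
            int(a["key"].removeprefix("/authors/OL").removesuffix("A")),
        ),
    )[0]

candidates = [
    {"key": "/authors/OL4A", "remote_ids": {"viaf": "12345678"}},
    {"key": "/authors/OL3A", "remote_ids": {"viaf": "12345678"}},
]
# Both candidates match the VIAF ID, so the lower OL ID wins.
assert pick_best_candidate(candidates, {"viaf": "12345678"})["key"] == "/authors/OL3A"
```
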

openlibrary/catalog/add_book/tests/test_load_book.py

Lines changed: 96 additions & 5 deletions
```diff
@@ -8,7 +8,7 @@
     remove_author_honorifics,
 )
 from openlibrary.catalog.utils import InvalidLanguage
-from openlibrary.core.models import Author
+from openlibrary.core.models import Author, AuthorRemoteIdConflictError
 
 
 @pytest.fixture
@@ -137,12 +137,103 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
         new_author_name = import_author(author)
         assert author["name"] == new_author_name["name"]
 
-    def test_first_match_priority_name_and_dates(self, mock_site):
+    def test_first_match_ol_key(self, mock_site):
         """
-        Highest priority match is name, birth date, and death date.
+        Highest priority match is OL key.
         """
-        self.add_three_existing_authors(mock_site)
 
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_key = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_key)
+
+        # Look for exact match on OL ID, regardless of other fields.
+        searched_author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL4A",
+        }
+        found = import_author(searched_author)
+        assert found.key == author_different_key["key"]
+
+    def test_conflicting_ids_cause_error(self, mock_site):
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_key = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_key)
+
+        # Author with differing ID
+        searched_author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL4A",
+            "remote_ids": {"viaf": "12345678"},
+        }
+        with pytest.raises(AuthorRemoteIdConflictError):
+            import_author(searched_author)
+
+    def test_second_match_remote_identifier(self, mock_site):
+        """
+        Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
+        """
+
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_viaf = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_viaf)
+
+        # Look for exact match on VIAF, regardless of name field.
+        searched_author = {
+            "name": "William Brewer",
+            "remote_ids": {"viaf": "12345678"},
+        }
+        found = import_author(searched_author)
+        assert found.key == author["key"]
+
+    def test_third_match_priority_name_and_dates(self, mock_site):
+        """
+        Next highest priority match is name, birth date, and death date.
+        """
         # Exact name match with no birth or death date
         author = {
             "name": "William H. Brewer",
@@ -202,7 +293,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
         assert isinstance(found, dict)
         assert found["death_date"] == searched_and_not_found_author["death_date"]
 
-    def test_second_match_priority_alternate_names_and_dates(self, mock_site):
+    def test_match_priority_alternate_names_and_dates(self, mock_site):
         """
         Matching, as a unit, alternate name, birth date, and death date, get
         second match priority.
```

openlibrary/core/models.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -30,6 +30,7 @@
 from openlibrary.core.ratings import Ratings
 from openlibrary.core.vendors import get_amazon_metadata
 from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
+from openlibrary.plugins.upstream.utils import get_identifier_config
 from openlibrary.utils import extract_numeric_id_from_olid
 from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
 
@@ -760,6 +761,10 @@ def resolve_redirects_bulk(
         logger.info(f"[update-redirects] Done, processed {total}, fixed {fixed}")
 
 
+class AuthorRemoteIdConflictError(ValueError):
+    pass
+
+
 class Author(Thing):
     """Class to represent /type/author objects in OL."""
 
@@ -802,6 +807,30 @@ def get_edition_count(self):
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
 
+    def merge_remote_ids(
+        self, incoming_ids: dict[str, str]
+    ) -> tuple[dict[str, str], int]:
+        """Returns the author's remote IDs merged with a given remote IDs object, as well as a count for how many IDs had conflicts.
+        If incoming_ids is empty, or if there are more conflicts than matches, no merge will be attempted, and the output will be (author.remote_ids, -1).
+        """
+        output = {**self.remote_ids}
+        if not incoming_ids:
+            return output, -1
+        # Count
+        matches = 0
+        config = get_identifier_config("author")
+        for id in config["identifiers"]:
+            identifier: str = id.name
+            if identifier in output and identifier in incoming_ids:
+                if output[identifier] != incoming_ids[identifier]:
+                    # For now, cause an error so we can see when/how often this happens
+                    raise AuthorRemoteIdConflictError(
+                        f"Conflicting remote IDs for author {self.key}: {output[identifier]} vs {incoming_ids[identifier]}"
+                    )
+                else:
+                    matches = matches + 1
+        return output, matches
+
 
 class User(Thing):
     def get_default_preferences(self):
```
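
A toy restatement of the counting and conflict behavior in merge_remote_ids, using plain dicts rather than the Author thing; the KNOWN_IDS list below is only a stand-in for whatever get_identifier_config("author") returns.

```python
# Toy version of merge_remote_ids semantics: returns (ids, match_count),
# raises on a conflicting value, and returns (ids, -1) when nothing is passed in.
KNOWN_IDS = ["viaf", "wikidata", "isni"]  # stand-in for the author identifier config

class RemoteIdConflict(ValueError):
    pass

def merge_remote_ids(existing: dict[str, str], incoming: dict[str, str]):
    merged = {**existing}
    if not incoming:
        return merged, -1
    matches = 0
    for name in KNOWN_IDS:
        if name in merged and name in incoming:
            if merged[name] != incoming[name]:
                # Mirrors the commit's "error on conflict for now" behavior.
                raise RemoteIdConflict(f"{name}: {merged[name]} vs {incoming[name]}")
            matches += 1
    return merged, matches

print(merge_remote_ids({"viaf": "12345678"}, {"viaf": "12345678"}))  # ({'viaf': '12345678'}, 1)
```
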

openlibrary/plugins/importapi/import_edition_builder.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -100,10 +100,15 @@ def add_list(self, key, val):
         self.edition_dict[key] = [val]
 
     def add_author(self, key, val):
-        # We don't know birth_date or death_date.
-        # Should name and personal_name be the same value?
-        author_dict = {'personal_name': val, 'name': val, 'entity_type': 'person'}
-        self.add_list('authors', author_dict)
+        if isinstance(val, dict):
+            author_dict = val
+            if "name" in author_dict:
+                author_dict['personal_name'] = author_dict['name']
+            self.add_list('authors', author_dict)
+        else:
+            self.add_list(
+                'authors', {'personal_name': val, 'name': val, 'entity_type': 'person'}
+            )
 
     def add_illustrator(self, key, val):
         self.add_list('contributions', val + ' (Illustrator)')
```
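
With this change, add_author accepts either a plain name string (the old behavior) or a full author dict from the import payload. A standalone restatement of that dispatch, showing the two output shapes (build_author_entry is an illustrative helper, not part of the builder):

```python
# Dict inputs pass through with name copied into personal_name; strings keep
# the old single-name behavior.
def build_author_entry(val) -> dict:
    if isinstance(val, dict):
        author = dict(val)
        if "name" in author:
            author["personal_name"] = author["name"]
        return author
    return {"personal_name": val, "name": val, "entity_type": "person"}

print(build_author_entry("Some One"))
print(build_author_entry({"name": "Some One", "remote_ids": {"viaf": "12345678"}}))
```
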

openlibrary/plugins/importapi/import_validator.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -3,7 +3,11 @@
 from annotated_types import MinLen
 from pydantic import BaseModel, ValidationError, model_validator, root_validator
 
-from openlibrary.catalog.add_book import SUSPECT_AUTHOR_NAMES, SUSPECT_PUBLICATION_DATES
+from openlibrary.catalog.add_book import (
+    SUSPECT_AUTHOR_NAMES,
+    SUSPECT_DATE_EXEMPT_SOURCES,
+    SUSPECT_PUBLICATION_DATES,
+)
 
 T = TypeVar("T")
 
@@ -34,6 +38,13 @@ class CompleteBook(BaseModel):
     @root_validator(pre=True)
     def remove_invalid_dates(cls, values):
         """Remove known bad dates prior to validation."""
+        is_exempt = any(
+            source_record.split(":")[0] in SUSPECT_DATE_EXEMPT_SOURCES
+            for source_record in values.get("source_records", [])
+        )
+        if is_exempt:
+            return values
+
         if values.get("publish_date") in SUSPECT_PUBLICATION_DATES:
             values.pop("publish_date")
```
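
The exemption keys off the provider prefix of each source_records entry (the part before the first colon). A small illustration of the check, with made-up record strings:

```python
# Illustrative check mirroring the validator exemption: a record skips
# suspect-date filtering when any source_records entry comes from an exempt provider.
SUSPECT_DATE_EXEMPT_SOURCES = ["wikisource"]

def is_date_scrutiny_exempt(source_records: list[str]) -> bool:
    return any(rec.split(":")[0] in SUSPECT_DATE_EXEMPT_SOURCES for rec in source_records)

print(is_date_scrutiny_exempt(["wikisource:en:Some_Work"]))  # True
print(is_date_scrutiny_exempt(["amazon:012345678X"]))        # False
```
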

requirements.txt

Lines changed: 0 additions & 3 deletions
```diff
@@ -18,8 +18,6 @@ isbnlib==3.10.14
 luqum==0.11.0
 lxml==4.9.4
 multipart==0.2.4
-mwparserfromhell==0.6.6
-nameparser==1.1.3
 Pillow==10.4.0
 psycopg2==2.9.6
 pydantic==2.4.0
@@ -33,4 +31,3 @@ sentry-sdk==2.19.2
 simplejson==3.19.1
 statsd==4.0.1
 validate_email==1.3
-wikitextparser==0.56.1
```

requirements_scripts.txt

Lines changed: 7 additions & 0 deletions
```diff
@@ -0,0 +1,7 @@
+# Temporary requirements for running standalone scripts that are not necessary for OL to function.
+# Run like this:
+# python -m pip install -r requirements_scripts.txt && PYTHONPATH=. python ./path/to/script.py optional_args... && python -m pip uninstall -y -r requirements_scripts.txt
+
+mwparserfromhell==0.6.6
+nameparser==1.1.3
+wikitextparser==0.56.1
```
