Skip to content

Commit e54bc88

Browse files
committed
Have Author.merge_remote_ids error on conflict for now
1 parent 6fa2c1a commit e54bc88

File tree

3 files changed

+46
-31
lines changed

3 files changed

+46
-31
lines changed

openlibrary/catalog/add_book/load_book.py

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def get_redirected_authors(authors: list["Author"]):
168168
# Try other identifiers next.
169169
if remote_ids := author.get("remote_ids"):
170170
queries = []
171-
matched_authors = []
171+
matched_authors: list[Author] = []
172172
# Get all the authors that match any incoming identifier.
173173
for identifier, val in remote_ids.items():
174174
queries.append({"type": "/type/author", f"remote_ids.{identifier}": val})
@@ -178,28 +178,19 @@ def get_redirected_authors(authors: list["Author"]):
178178
get_redirected_authors(list(web.ctx.site.get_many(reply)))
179179
)
180180
matched_authors = uniq(matched_authors, key=lambda thing: thing.key)
181-
# The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers.
182-
highest_matches = 0
183-
selected_match = None
184-
for a in matched_authors:
185-
_, matches = a.merge_remote_ids(remote_ids)
186-
if matches > highest_matches:
187-
selected_match = a
188-
highest_matches = matches
189-
elif (
190-
matches == highest_matches
191-
and matches > 0
192-
and selected_match is not None
193-
):
194-
# Prioritize the lower OL ID when matched identifiers are equal
195-
selected_match = (
196-
a
197-
if extract_numeric_id_from_olid(a.key)
198-
< extract_numeric_id_from_olid(selected_match.key)
199-
else selected_match
200-
)
201-
if highest_matches > 0 and selected_match is not None:
181+
# The match is whichever one has the most identifiers in common
182+
if matched_authors:
183+
selected_match = sorted(
184+
matched_authors,
185+
key=lambda a: (
186+
# First sort by number of matches desc
187+
-1 * a.merge_remote_ids(remote_ids)[1],
188+
# If there's a tie, prioritize lower OL ID
189+
extract_numeric_id_from_olid(a.key),
190+
),
191+
)[0]
202192
return [selected_match]
193+
203194
# Fall back to name/date matching, which we did before introducing identifiers.
204195
name = author["name"].replace("*", r"\*")
205196
queries = [

openlibrary/catalog/add_book/tests/test_load_book.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,42 @@ def test_first_match_ol_key(self, mock_site):
162162
mock_site.save(author_different_key)
163163

164164
# Look for exact match on OL ID, regardless of other fields.
165-
# We ideally shouldn't ever have a case where different authors have the same VIAF, but this demonstrates priority.
166165
searched_author = {
167166
"name": "William H. Brewer",
168167
"key": "/authors/OL4A",
169-
"remote_ids": {"viaf": "12345678"},
170168
}
171169
found = import_author(searched_author)
172170
assert found.key == author_different_key["key"]
173171

172+
def test_conflicting_ids_cause_error(self, mock_site):
173+
# Author with VIAF
174+
author = {
175+
"name": "William H. Brewer",
176+
"key": "/authors/OL3A",
177+
"type": {"key": "/type/author"},
178+
"remote_ids": {"viaf": "12345678"},
179+
}
180+
181+
# Another author with VIAF
182+
author_different_key = {
183+
"name": "William Brewer",
184+
"key": "/authors/OL4A",
185+
"type": {"key": "/type/author"},
186+
"remote_ids": {"viaf": "87654321"},
187+
}
188+
189+
mock_site.save(author)
190+
mock_site.save(author_different_key)
191+
192+
# Author with differing ID
193+
searched_author = {
194+
"name": "William H. Brewer",
195+
"key": "/authors/OL4A",
196+
"remote_ids": {"viaf": "12345678"},
197+
}
198+
with pytest.raises(ValueError):
199+
import_author(searched_author)
200+
174201
def test_second_match_remote_identifier(self, mock_site):
175202
"""
176203
Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.

openlibrary/core/models.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -814,20 +814,17 @@ def merge_remote_ids(
814814
return output, -1
815815
# Count
816816
matches = 0
817-
conflicts = 0
818817
config = get_identifier_config("author")
819818
for id in config["identifiers"]:
820819
identifier: str = id.name
821820
if identifier in output and identifier in incoming_ids:
822821
if output[identifier] != incoming_ids[identifier]:
823-
conflicts = conflicts + 1
822+
# For now, cause an error so we can see when/how often this happens
823+
raise ValueError(
824+
f"Conflicting remote IDs for author {self.key}: {output[identifier]} vs {incoming_ids[identifier]}"
825+
)
824826
else:
825827
matches = matches + 1
826-
# Decide at a later date if we want to change the logic to bail only when conflicts > matches, instead of conflicts > 0.
827-
# if conflicts > matches:
828-
if conflicts > 0:
829-
# TODO: Raise this to librarians, somehow.
830-
return self.remote_ids, -1
831828
return output, matches
832829

833830

0 commit comments

Comments
 (0)