Skip to content
Merged
Show file tree
Hide file tree
Changes from 45 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
19a1566
author identifiers in import
pidgezero-one Dec 3, 2024
5912f75
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 3, 2024
8305af5
this wasnt supposed to be here
pidgezero-one Dec 3, 2024
280d715
this was supposed to be here
pidgezero-one Dec 3, 2024
a3b8412
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 3, 2024
810dc33
notes
pidgezero-one Dec 3, 2024
9d54ff7
no more try/catch
pidgezero-one Dec 3, 2024
0be86b7
precommits
pidgezero-one Dec 3, 2024
f57f997
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 3, 2024
82b198c
?
pidgezero-one Dec 5, 2024
87d9f25
re: slack convo, go ahead and import this and use get_author_config o…
pidgezero-one Dec 12, 2024
f9a43c5
scripts
pidgezero-one Feb 4, 2025
047211c
merge
pidgezero-one Feb 4, 2025
18541e8
books are being imported, but author page does not list them
pidgezero-one Feb 9, 2025
e525c23
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2025
6473fbb
fix failing test
pidgezero-one Feb 9, 2025
c8e43b8
add 1900 exemption for wikisource, move script requirements into thei…
pidgezero-one Feb 10, 2025
419016a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
00c3dff
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
a2af74a
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
309daa9
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
e47f5cb
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
a8cd195
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
01c2576
Update openlibrary/catalog/add_book/load_book.py
pidgezero-one Mar 4, 2025
c730dd0
Update openlibrary/core/models.py
pidgezero-one Mar 4, 2025
bab3f89
Update openlibrary/core/models.py
pidgezero-one Mar 4, 2025
8596a3b
Update openlibrary/core/models.py
pidgezero-one Mar 4, 2025
a3d9a24
Update scripts/providers/import_wikisource.py
pidgezero-one Mar 4, 2025
9bbf023
Update scripts/providers/import_wikisource.py
pidgezero-one Mar 4, 2025
ec13685
Update scripts/providers/import_wikisource.py
pidgezero-one Mar 4, 2025
a1251ad
Update scripts/providers/import_wikisource.py
pidgezero-one Mar 4, 2025
9c9cc48
Update scripts/providers/import_wikisource.py
pidgezero-one Mar 4, 2025
a615f26
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2025
253ed32
imports
pidgezero-one Mar 4, 2025
3f42068
Merge branch '9448/feat/use-known-author-identifiers-in-import' of ht…
pidgezero-one Mar 4, 2025
a9456f8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2025
9563baf
comments
pidgezero-one Mar 4, 2025
6faf88d
Merge branch '9448/feat/use-known-author-identifiers-in-import' of ht…
pidgezero-one Mar 4, 2025
6facf5b
bracket fixes
pidgezero-one Mar 4, 2025
cd051af
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2025
43b3aaa
update script instructions
pidgezero-one Mar 4, 2025
b9ab864
:(
pidgezero-one Mar 20, 2025
d904ec1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2025
7c9f483
Merge branch 'master' into 9448/feat/use-known-author-identifiers-in-…
pidgezero-one Mar 20, 2025
37f173e
?
pidgezero-one Mar 20, 2025
6fa2c1a
Update import API to use key/remote_ids instead of ol_id/identifiers …
cdrini Mar 27, 2025
4895023
Have Author.merge_remote_ids error on conflict for now
cdrini Mar 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openlibrary/catalog/add_book/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"????",
"01-01-1900",
]
SUSPECT_DATE_EXEMPT_SOURCES: Final = ["wikisource"]
SUSPECT_AUTHOR_NAMES: Final = ["unknown", "n/a"]
SOURCE_RECORDS_REQUIRING_DATE_SCRUTINY: Final = ["amazon", "bwb", "promise"]
ALLOWED_COVER_HOSTS: Final = ("m.media-amazon.com", "books.google.com")
Expand Down
117 changes: 90 additions & 27 deletions openlibrary/catalog/add_book/load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
key_int,
)
from openlibrary.core.helpers import extract_year
from openlibrary.utils import extract_numeric_id_from_olid, uniq

if TYPE_CHECKING:
from openlibrary.plugins.upstream.models import Author
Expand Down Expand Up @@ -149,8 +150,61 @@ def walk_redirects(obj, seen):
seen.add(obj['key'])
return obj

# Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
# then last name with identical birth and death dates (that are not themselves `None` or '').
def get_redirected_authors(authors: list["Author"]):
    """Resolve any redirect records in ``authors`` to their targets.

    If every record is already ``/type/author`` the list is returned
    unchanged; otherwise each record is walked through ``walk_redirects``,
    skipping keys that have already been visited.
    """
    if any(a.type.key != '/type/author' for a in authors):
        # `seen` holds author *keys* (strings), not dicts —
        # walk_redirects adds obj['key'] to it.
        seen: set[str] = set()
        return [
            walk_redirects(a, seen) for a in authors if a['key'] not in seen
        ]
    return authors

# Look for OL ID first.
if (key := author.get("ol_id")) and (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to name this one as key to be consistent with our book/thing records. Having the import endpoint mirror the shape of our core book records is convenient.

Suggested change
if (key := author.get("ol_id")) and (
if (key := author.get("key")) and (

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Freso do you have any strong opinions on this? ^

(see also Drini's comment above about remote_ids vs identifiers!)

reply := list(
web.ctx.site.things({"type": "/type/author", "key~": f'/authors/{key}'})
)
):
# Always match on OL ID, even if remote identifiers don't match.
return get_redirected_authors(list(web.ctx.site.get_many(reply)))

# Try other identifiers next.
if identifiers := author.get("identifiers"):
queries = []
matched_authors = []
# Get all the authors that match any incoming identifier.
for identifier, val in identifiers.items():
queries.append({"type": "/type/author", f"remote_ids.{identifier}": val})
for query in queries:
if reply := list(web.ctx.site.things(query)):
matched_authors.extend(
get_redirected_authors(list(web.ctx.site.get_many(reply)))
)
matched_authors = uniq(matched_authors, key=lambda thing: thing.key)
# The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers.
highest_matches = 0
selected_match = None
for a in matched_authors:
_, matches = a.merge_remote_ids(identifiers)
if matches > highest_matches:
selected_match = a
highest_matches = matches
elif (
matches == highest_matches
and matches > 0
and selected_match is not None
):
# Prioritize the lower OL ID when matched identifiers are equal
selected_match = (
a
if extract_numeric_id_from_olid(a.key)
< extract_numeric_id_from_olid(selected_match.key)
else selected_match
)
if highest_matches > 0 and selected_match is not None:
return [selected_match]
# Fall back to name/date matching, which we did before introducing identifiers.
name = author["name"].replace("*", r"\*")
queries = [
{"type": "/type/author", "name~": name},
Expand All @@ -162,37 +216,18 @@ def walk_redirects(obj, seen):
"death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
}, # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
]
things = []
for query in queries:
if reply := list(web.ctx.site.things(query)):
things = get_redirected_authors(list(web.ctx.site.get_many(reply)))
break

authors = [web.ctx.site.get(k) for k in reply]
if any(a.type.key != '/type/author' for a in authors):
seen: set[dict] = set()
authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
return authors


def find_entity(author: dict[str, Any]) -> "Author | None":
"""
Looks for an existing Author record in OL
and returns it if found.

:param dict author: Author import dict {"name": "Some One"}
:return: Existing Author record if found, or None.
"""
assert isinstance(author, dict)
things = find_author(author)
if author.get('entity_type', 'person') != 'person':
return things[0] if things else None
match = []
seen = set()
for a in things:
key = a['key']
if key in seen:
continue
seen.add(key)
orig_key = key
assert a.type.key == '/type/author'
if 'birth_date' in author and 'birth_date' not in a:
continue
Expand All @@ -202,10 +237,27 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
continue
match.append(a)
if not match:
return None
return []
if len(match) == 1:
return match[0]
return pick_from_matches(author, match)
return match
return [pick_from_matches(author, match)]


def find_entity(author: dict[str, Any]) -> "Author | None":
    """
    Looks for an existing Author record in OL and returns it if found.

    :param dict author: Author import dict {"name": "Some One"}
    :return: Existing Author record if found, or None.
    """
    assert isinstance(author, dict)
    candidates = find_author(author)
    if "identifiers" in author:
        # Fold the incoming remote identifiers into each candidate record.
        for idx, candidate in enumerate(candidates):
            candidate.remote_ids, _ = candidate.merge_remote_ids(
                author["identifiers"]
            )
            candidates[idx] = candidate
    if not candidates:
        return None
    return candidates[0]


def remove_author_honorifics(name: str) -> str:
Expand Down Expand Up @@ -253,9 +305,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
new['death_date'] = author['death_date']
return new
a = {'type': {'key': '/type/author'}}
for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
for f in (
'name',
'title',
'personal_name',
'birth_date',
'death_date',
'date',
'remote_ids',
):
if f in author:
a[f] = author[f]
# Import record hitting endpoint should list external IDs under "identifiers", but needs to be "remote_ids" when going into the DB.
if "identifiers" in author:
a["remote_ids"] = author["identifiers"]
return a


Expand Down
72 changes: 68 additions & 4 deletions openlibrary/catalog/add_book/tests/test_load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,76 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
new_author_name = import_author(author)
assert author["name"] == new_author_name["name"]

def test_first_match_ol_key(self, mock_site):
    """
    Highest priority match is OL key.
    """
    # Seed two authors whose VIAF identifiers are swapped relative to the
    # query below, so only the OL key can decide the winner.
    brewer_with_h = {
        "name": "William H. Brewer",
        "key": "/authors/OL3A",
        "type": {"key": "/type/author"},
        "remote_ids": {"viaf": "12345678"},
    }
    brewer_plain = {
        "name": "William Brewer",
        "key": "/authors/OL4A",
        "type": {"key": "/type/author"},
        "remote_ids": {"viaf": "87654321"},
    }
    for record in (brewer_with_h, brewer_plain):
        mock_site.save(record)

    # We ideally shouldn't ever have a case where different authors share a
    # VIAF, but this demonstrates that the OL ID takes priority over it.
    query = {
        "name": "William H. Brewer",
        "ol_id": "OL4A",
        "identifiers": {"viaf": "12345678"},
    }
    found = import_author(query)
    assert found.key == brewer_plain["key"]

def test_second_match_remote_identifier(self, mock_site):
    """
    Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
    """
    self.add_three_existing_authors(mock_site)

    # Target author carries the VIAF we will search for; the decoy has a
    # different VIAF but a name that matches the query exactly.
    target = {
        "name": "William H. Brewer",
        "key": "/authors/OL3A",
        "type": {"key": "/type/author"},
        "remote_ids": {"viaf": "12345678"},
    }
    decoy = {
        "name": "William Brewer",
        "key": "/authors/OL4A",
        "type": {"key": "/type/author"},
        "remote_ids": {"viaf": "87654321"},
    }
    for record in (target, decoy):
        mock_site.save(record)

    # The VIAF match wins regardless of the name field.
    query = {
        "name": "William Brewer",
        "identifiers": {"viaf": "12345678"},
    }
    found = import_author(query)
    assert found.key == target["key"]

def test_third_match_priority_name_and_dates(self, mock_site):
"""
Next highest priority match is name, birth date, and death date.
"""
# Exact name match with no birth or death date
author = {
"name": "William H. Brewer",
Expand Down Expand Up @@ -202,7 +266,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
assert isinstance(found, dict)
assert found["death_date"] == searched_and_not_found_author["death_date"]

def test_second_match_priority_alternate_names_and_dates(self, mock_site):
def test_match_priority_alternate_names_and_dates(self, mock_site):
"""
Matching, as a unit, alternate name, birth date, and death date, get
second match priority.
Expand Down
28 changes: 28 additions & 0 deletions openlibrary/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from openlibrary.core.ratings import Ratings
from openlibrary.core.vendors import get_amazon_metadata
from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
from openlibrary.plugins.upstream.utils import get_identifier_config
from openlibrary.utils import extract_numeric_id_from_olid
from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13

Expand Down Expand Up @@ -802,6 +803,33 @@ def get_edition_count(self):
def get_lists(self, limit=50, offset=0, sort=True):
return self._get_lists(limit=limit, offset=offset, sort=sort)

def merge_remote_ids(
    self, incoming_ids: dict[str, str]
) -> tuple[dict[str, str], int]:
    """Return this author's remote IDs merged with ``incoming_ids``,
    plus the number of identifiers that matched.

    If ``incoming_ids`` is empty, or any identifier conflicts with an
    existing value, no merge is attempted and the result is
    ``(author.remote_ids, -1)``.

    NOTE(review): identifiers present only in ``incoming_ids`` are never
    copied into the output, so nothing new is actually added by the
    "merge" — confirm whether that is intended.
    """
    # self.remote_ids behaves like a Thing rather than a plain dict in
    # practice, so a shallow ** unpack is used here instead of deepcopy.
    output = {**self.remote_ids}
    if not incoming_ids:
        return output, -1
    # Count agreements and disagreements per configured identifier type.
    matches = 0
    conflicts = 0
    config = get_identifier_config("author")
    for id_config in config["identifiers"]:  # avoid shadowing builtin `id`
        identifier: str = id_config.name
        if identifier in output and identifier in incoming_ids:
            if output[identifier] != incoming_ids[identifier]:
                conflicts += 1
            else:
                matches += 1
    # Decide at a later date if we want to change the logic to bail only
    # when conflicts > matches, instead of conflicts > 0.
    # if conflicts > matches:
    if conflicts > 0:
        # TODO: Raise this to librarians, somehow.
        return self.remote_ids, -1
    return output, matches


class User(Thing):
def get_default_preferences(self):
Expand Down
13 changes: 9 additions & 4 deletions openlibrary/plugins/importapi/import_edition_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,15 @@ def add_list(self, key, val):
self.edition_dict[key] = [val]

def add_author(self, key, val):
    """Append an author entry to the edition's 'authors' list.

    ``val`` may be a ready-made author dict (e.g. one carrying remote
    identifiers) or a plain name string. Dicts are passed through with
    'name' mirrored into 'personal_name' when present; strings are
    wrapped in a minimal person record.
    """
    if not isinstance(val, dict):
        val = {'personal_name': val, 'name': val, 'entity_type': 'person'}
    elif 'name' in val:
        # NOTE: mutates the caller-supplied dict, matching prior behavior.
        val['personal_name'] = val['name']
    self.add_list('authors', val)

def add_illustrator(self, key, val):
self.add_list('contributions', val + ' (Illustrator)')
Expand Down
13 changes: 12 additions & 1 deletion openlibrary/plugins/importapi/import_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from annotated_types import MinLen
from pydantic import BaseModel, ValidationError, model_validator, root_validator

from openlibrary.catalog.add_book import SUSPECT_AUTHOR_NAMES, SUSPECT_PUBLICATION_DATES
from openlibrary.catalog.add_book import (
SUSPECT_AUTHOR_NAMES,
SUSPECT_DATE_EXEMPT_SOURCES,
SUSPECT_PUBLICATION_DATES,
)

T = TypeVar("T")

Expand Down Expand Up @@ -34,6 +38,13 @@ class CompleteBook(BaseModel):
@root_validator(pre=True)
def remove_invalid_dates(cls, values):
"""Remove known bad dates prior to validation."""
is_exempt = any(
source_record.split(":")[0] in SUSPECT_DATE_EXEMPT_SOURCES
for source_record in values.get("source_records", [])
)
if is_exempt:
return values

if values.get("publish_date") in SUSPECT_PUBLICATION_DATES:
values.pop("publish_date")

Expand Down
3 changes: 0 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ isbnlib==3.10.14
luqum==0.11.0
lxml==4.9.4
multipart==0.2.4
mwparserfromhell==0.6.6
nameparser==1.1.3
Pillow==10.4.0
psycopg2==2.9.6
pydantic==2.4.0
Expand All @@ -33,4 +31,3 @@ sentry-sdk==2.19.2
simplejson==3.19.1
statsd==4.0.1
validate_email==1.3
wikitextparser==0.56.1
7 changes: 7 additions & 0 deletions requirements_scripts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Temporary requirements for running standalone scripts that are not necessary for OL to function.
# Run like this:
# python -m pip install -r requirements_scripts.txt && PYTHONPATH=. python ./path/to/script.py optional_args... && python -m pip uninstall -y -r requirements_scripts.txt

mwparserfromhell==0.6.6
nameparser==1.1.3
wikitextparser==0.56.1
Loading
Loading