Add ISBN 10 to 13 converter bot

davidscotson · davidscotson · commit 3016f2c812b2 · 2023-06-29T12:50:39.000+01:00
diff --git a/isbn10to13bot/README.md b/isbn10to13bot/README.md
@@ -0,0 +1,13 @@
+A set of scripts to add isbn_13 values to editions with valid isbn_10.
+### How To Use
+```bash
+# Find Editions with ISBN 10, but no ISBN 13
+./find_editions_with_isbn_10_not_13.sh /path/to/ol_dump.txt.gz /path/to/filtered_dump.txt.gz
+# Add ISBN 13s converted from the ISBN 10
+python isbn_10_to_13.py --file=/path/to/filtered_dump.txt.gz --dry_run=<bool> --limit=<int>
+```
+If `dry_run` is True, the script will run as normal, but no changes will be saved to OpenLibrary.
+This is for debugging purposes. By default, `dry_run` is `True`.
+`limit` is the maximum number of changes to OpenLibrary that will occur before the script quits.
+By default, `limit` is set to `1`. Setting `limit` to `0` allows unlimited edits.
+A log is automatically generated whenever `isbn_10_to_13.py` executes.
diff --git a/isbn10to13bot/find_editions_with_isbn_10_not_13.sh b/isbn10to13bot/find_editions_with_isbn_10_not_13.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Keep only the edition rows of an Open Library dump that have an isbn_10 but no isbn_13.
+# Usage: find_editions_with_isbn_10_not_13.sh <ol_dump.txt.gz> <output.txt.gz>
+if [[ -z "$1" ]]; then
+    echo "No dump file provided"
+    exit 1
+fi
+if [[ -z "$2" ]]; then
+    echo "No output file provided"
+    exit 1
+fi
+
+OL_DUMP="$1"
+OUTPUT="$2"
+
+# Paths are quoted so file names containing spaces or glob characters survive word splitting.
+zgrep '^/type/edition' "$OL_DUMP" | grep -E '"isbn_10":' | grep -v -E '"isbn_13":' | pv | gzip > "$OUTPUT"
diff --git a/isbn10to13bot/isbn_10_to_13.py b/isbn10to13bot/isbn_10_to_13.py
@@ -0,0 +1,75 @@
+"""
+isbn 10 to isbn 13
+NOTE: This script ideally works on an Open Library Dump that only contains editions with an isbn_10 and no isbn_13
+"""
+import gzip
+import json
+
+import isbnlib
+import olclient
+
+
+class ConvertISBN10to13Job(olclient.AbstractBotJob):
+    """Bot job that adds isbn_13 values derived from an edition's isbn_10 values."""
+
+    def run(self) -> None:
+        """Looks for any ISBN 10s to convert to 13 in the gzipped dump at self.args.file"""
+        self.write_changes_declaration()
+        header = {"type": 0, "key": 1, "revision": 2, "last_modified": 3, "JSON": 4}
+        comment = "convert ISBN 10 to 13 using isbnlib"
+        # each dump row is tab-separated; the column named "JSON" holds the record itself
+        with gzip.open(self.args.file, "rb") as fin:
+            for row in fin:
+                _json = json.loads(row.decode().split("\t")[header["JSON"]])
+                if _json["type"]["key"] != "/type/edition":
+                    continue
+
+                # we only update editions with existing isbn 10s and no existing isbn 13s (for now at least)
+                isbns_10 = _json.get("isbn_10")
+                if not isbns_10 or "isbn_13" in _json:
+                    continue
+
+                olid = _json["key"].split("/")[-1]
+                edition = self.ol.Edition.get(olid)
+                # re-check the fetched record too, so an isbn_13 it already carries is never overwritten
+                if edition.type["key"] != "/type/edition" or getattr(edition, "isbn_13", None):
+                    continue
+
+                isbns_13 = []
+                for isbn in isbns_10:
+                    canonical = isbnlib.canonical(isbn)
+                    # start from None for every value: an invalid ISBN must not raise
+                    # UnboundLocalError or re-append the previous iteration's result
+                    isbn_13 = None
+                    if isbnlib.is_isbn10(canonical):
+                        isbn_13 = isbnlib.to_isbn13(canonical)
+                    elif isbnlib.is_isbn13(canonical):
+                        isbn_13 = canonical
+                    if isbn_13:
+                        isbns_13.append(isbn_13)
+
+                if not isbns_13:
+                    continue  # no valid ISBN found, so there is nothing to save
+
+                isbns_13 = dedupe(isbns_13)  # remove duplicates, shouldn't normally be necessary
+                edition.isbn_13 = isbns_13
+                self.logger.info("\t".join([olid, str(isbns_10), str(isbns_13)]))
+                self.save(lambda: edition.save(comment=comment))
+
+def dedupe(input_list: list) -> list:
+    """Return a new list holding the first occurrence of each element, in original order"""
+    unique: list = []
+    for candidate in input_list:
+        # membership is a plain list scan, so unhashable elements are accepted as well
+        unique += [] if candidate in unique else [candidate]
+    return unique
+
+
+if __name__ == "__main__":
+    # Script entry point: run the job and record any failure through the job's logger.
+    job = ConvertISBN10to13Job()
+    try:
+        job.run()
+    except Exception as e:
+        job.logger.exception(e)
+        raise  # bare re-raise propagates the same exception with its original traceback
diff --git a/isbn10to13bot/requirements.txt b/isbn10to13bot/requirements.txt
@@ -0,0 +1,2 @@
+openlibrary-client==0.0.30
+isbnlib==3.10.14

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+openlibrary-client==0.0.30`
	`2`	`+isbnlib==3.10.14`