From a8c09091bd8d5757ae738ca1dd342e100434760c Mon Sep 17 00:00:00 2001
From: Ingrid <git@ingrids.email>
Date: Sat, 1 Jun 2024 16:07:35 +0200
Subject: [PATCH 1/5] set up rye package

---
 .gitignore                |   7 +-
 .python-version           |   1 +
 latin2shaw.py             | 389 -------------------------------
 pyproject.toml            |  40 ++++
 requirements-dev.lock     | 121 ++++++++++
 requirements.lock         | 121 ++++++++++
 src/readlex/__init__.py   |   3 +
 src/readlex/latin2shaw.py | 470 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 762 insertions(+), 390 deletions(-)
 create mode 100644 .python-version
 delete mode 100644 latin2shaw.py
 create mode 100644 pyproject.toml
 create mode 100644 requirements-dev.lock
 create mode 100644 requirements.lock
 create mode 100644 src/readlex/__init__.py
 create mode 100644 src/readlex/latin2shaw.py

diff --git a/.gitignore b/.gitignore
index 9bea433..0c21c64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,7 @@
-
+target
+.venv
+.env
+__pycache__
+.idea
+dist
 .DS_Store
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..871f80a
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.3
diff --git a/latin2shaw.py b/latin2shaw.py
deleted file mode 100644
index 2fec9d0..0000000
--- a/latin2shaw.py
+++ /dev/null
@@ -1,389 +0,0 @@
-import json
-import csv
-import re
-import unidecode
-import smartypants
-import spacy
-from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans
-from spacy.tokens import Doc, Span
-from spacy.matcher import PhraseMatcher
-
-from bs4 import BeautifulSoup
-
-
-def latin2shaw(text):
-    with open("static/readlex_converter.json", 'r', encoding="utf-8") as file:
-        json_data = file.read()
-
-    readlex_dict: dict[str, list[dict[str, str]]] = json.loads(json_data)
-
-    # Categories of letters that determine how a following 's is pronounced
-    s_follows: set[str] = {"𐑐", "𐑑", "𐑒", "𐑓", "𐑔"}
-    uhz_follows: set[str] = {"𐑕", "𐑖", "𐑗", "𐑟", "𐑠", "𐑡"}
-    z_follows: set[str] = {"𐑚", "𐑛", "𐑜", "𐑝", "𐑞", "𐑙", "𐑤", "𐑥", "𐑯", "𐑸", "𐑹", "𐑺", "𐑻", "𐑼", "𐑽"}
-    consonants = set.union(s_follows, uhz_follows, z_follows)
-    # vowels = {"𐑦", "𐑰", "𐑧", "𐑱", "𐑨", "𐑲", "𐑩", "𐑳", "𐑪", "𐑴", "𐑫", "𐑵", "𐑬", "𐑶", "𐑭", "𐑷", "𐑾", "𐑿"}
-    # The following are never final other than in initialisms: "𐑣", "𐑢", "𐑘", "𐑮".
-
-    # Contractions that need special treatment since the separate words are not as they appear in the dictionary
-    contraction_start: dict[str, str] = {"ai": "𐑱", "ca": "𐑒𐑭", "do": "𐑛𐑴", "does": "𐑛𐑳𐑟", "did": "𐑛𐑦𐑛", "sha": "𐑖𐑭",
-                                         "wo": "𐑢𐑴",
-                                         "y'": "𐑘"}
-    contraction_end: dict[str, str] = {"n't": "𐑯𐑑", "all": "𐑷𐑤", "'ve": "𐑝", "'ll": "𐑤", "'m": "𐑥", "'d": "𐑛",
-                                       "'re": "𐑼"}
-
-    # Common prefixes and suffixes used in new coinings
-    prefixes: dict[str, str] = {"anti": "𐑨𐑯𐑑𐑦",
-                                "counter": "𐑒𐑬𐑯𐑑𐑼",
-                                "de": "𐑛𐑰",
-                                "dis": "𐑛𐑦𐑕",
-                                "esque": "𐑧𐑕𐑒",
-                                "hyper": "𐑣𐑲𐑐𐑼",
-                                "hypo": "𐑣𐑲𐑐𐑴",
-                                "mega": "𐑥𐑧𐑜𐑩",
-                                "meta": "𐑥𐑧𐑑𐑩",
-                                "micro": "𐑥𐑲𐑒𐑮𐑴",
-                                "multi": "𐑥𐑳𐑤𐑑𐑦",
-                                "mis": "𐑥𐑦𐑕",
-                                "neuro": "𐑯𐑘𐑫𐑼𐑴",
-                                "non": "𐑯𐑪𐑯",
-                                "o'er": "𐑴𐑼",
-                                "out": "𐑬𐑑",
-                                "over": "𐑴𐑝𐑼",
-                                "poly": "𐑐𐑪𐑤𐑦",
-                                "post": "𐑐𐑴𐑕𐑑",
-                                "pre": "𐑐𐑮𐑰",
-                                "pro": "𐑐𐑮𐑴",
-                                "pseudo": "𐑕𐑿𐑛𐑴",
-                                "re": "𐑮𐑰",
-                                "sub": "𐑕𐑳𐑚",
-                                "super": "𐑕𐑵𐑐𐑼",
-                                "ultra": "𐑳𐑤𐑑𐑮𐑩",
-                                "un": "𐑳𐑯",
-                                "under": "𐑳𐑯𐑛𐑼"
-                                }
-    suffixes: dict[str, str] = {"able": "𐑩𐑚𐑩𐑤",
-                "bound": "𐑚𐑬𐑯𐑛",
-                "ful": "𐑓𐑩𐑤",
-                "hood": "𐑣𐑫𐑛",
-                "ish": "𐑦𐑖",
-                "ism": "𐑦𐑟𐑩𐑥",
-                "less": "𐑤𐑩𐑕",
-                "like": "𐑤𐑲𐑒",
-                "ness": "𐑯𐑩𐑕"
-                }
-    affixes: dict[str, str] = prefixes | suffixes
-
-    # Words that sometimes change spelling before 'to'
-    have_to: dict[str, str] = {"have": "𐑣𐑨𐑓", "has": "𐑣𐑨𐑕"}
-    vbd_to: dict[str, str] = {"used": "𐑿𐑕𐑑", "unused": "𐑳𐑯𐑿𐑕𐑑", "supposed": "𐑕𐑩𐑐𐑴𐑕𐑑"}
-    before_to: dict[str, str] = have_to | vbd_to
-
-    # Suffixes that follow numerals in ordinal numbers
-    ordinal_suffixes: dict[str, str] = {"st": "𐑕𐑑", "nd": "𐑯𐑛", "rd": "𐑮𐑛", "th": "𐑔", "s": "𐑟"}
-
-    # Load spaCy, excluding pipeline components that are not required
-    nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "textcat"])
-
-    # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one
-    # of the surrounding words
-    # Prefixes
-    spacy_prefixes: list[str] = nlp.Defaults.prefixes + [r'''^[-–—]+''',]
-    prefix_regex = compile_prefix_regex(spacy_prefixes)
-    nlp.tokenizer.prefix_search = prefix_regex.search
-    # Infixes
-    spacy_infixes: list[str] = nlp.Defaults.infixes + [r'''[.,?!:;\-–—"~\(\)\[\]]+''',]
-    infix_regex = compile_infix_regex(spacy_infixes)
-    nlp.tokenizer.infix_finditer = infix_regex.finditer
-    # Suffixes
-    spacy_suffixes: list[str] = nlp.Defaults.suffixes + [r'''[-–—]+$''',]
-    suffix_regex = compile_suffix_regex(spacy_suffixes)
-    nlp.tokenizer.suffix_search = suffix_regex.search
-
-    def add_span(matcher, doc, i, matches):
-        match_id, start, end = matches[i]
-
-    # Define the phrase to match
-    with open("static/readlex_converter_phrases.json", "r", newline="") as f:
-        reader = csv.reader(f)
-        phrases = [row[0] for row in reader if row]
-    phrase_patterns: list[Doc] = [nlp.make_doc(phrase) for phrase in phrases]
-    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
-    phrase_matcher.add("phrases", phrase_patterns, on_match=add_span)
-
-    namer_dot_ents: set[str] = {"PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"}
-
-    def tokenise(text: str) -> spacy.tokens.Doc:
-        # Tokenise and tag the text using spaCy as doc
-
-        doc = nlp(text)
-        phrase_matches = phrase_matcher(doc)
-        phrase_spans: list[Span] = []
-        for match_id, start, end in phrase_matches:
-            span = Span(doc, start, end, label=match_id)
-            phrase_spans.append(span)
-
-        filtered_spans = filter_spans(phrase_spans)
-
-        with doc.retokenize() as retokenizer:
-            for span in filtered_spans:
-                retokenizer.merge(span)
-
-        # Expand person entities to include titles and take initial 'the' out of entity names
-        titles: set[str] = {
-            "archbishop",
-            "archdeacon",
-            "baron",
-            "baroness",
-            "bishop",
-            "captain",
-            "count",
-            "countess",
-            "cpt",
-            "dame",
-            "deacon",
-            "doctor",
-            "dr.",
-            "dr",
-            "duchess",
-            "duke",
-            "earl",
-            "emperor",
-            "empress",
-            "gov.",
-            "gov",
-            "governor",
-            "justice",
-            "king",
-            "lady",
-            "lord",
-            "marchioness",
-            "marquess",
-            "marquis",
-            "miss",
-            "missus",
-            "mister",
-            "mistress",
-            "mr.",
-            "mr",
-            "mrs.",
-            "mrs",
-            "ms.",
-            "ms",
-            "mx.",
-            "mx",
-            "pope",
-            "pres.",
-            "pres",
-            "president",
-            "prince",
-            "princess",
-            "prof.",
-            "prof",
-            "professor",
-            "queen",
-            "rev.",
-            "rev",
-            "reverend",
-            "saint",
-            "sen.",
-            "sen",
-            "senator",
-            "sir",
-            "st.",
-            "st",
-            "viscount",
-            "viscountess"
-        }
-        new_ents: list[Span] = []
-        for ent in doc.ents:
-            # Only check for title if it's a person and not the first token
-            if ent.label_ == "PERSON" and ent.start != 0:
-                prev_token = doc[ent.start - 1]
-                if prev_token.lower_ in titles:
-                    new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
-                    new_ents.append(new_ent)
-                else:
-                    new_ents.append(ent)
-            elif ent.label_ in namer_dot_ents:
-                if doc[ent.start].lower_ == "the":
-                    new_ent = Span(doc, ent.start + 1, ent.end, label=ent.label)
-                    new_ents.append(new_ent)
-                else:
-                    new_ents.append(ent)
-            else:
-                new_ents.append(ent)
-
-        filtered_ents = filter_spans(new_ents)
-        doc.ents = tuple(filtered_ents)
-
-        return doc
-
-    def convert(doc: spacy.tokens.Doc) -> str:
-        # Apply a series of tests to each token to determine how to Shavianise it.
-        text_split_shaw: str = ""
-
-        for token in doc:
-
-            # Leave HTML tags unchanged
-            if token.tag_ == "HTML":
-                text_split_shaw += token.text
-
-            # Convert contractions
-            if token.lower_ in contraction_start and token.i < len(doc) - 1 and doc[
-                token.i + 1].lower_ in contraction_end:
-                text_split_shaw += contraction_start[token.lower_]
-            elif token.lower_ in contraction_end:
-                prefix: str = "𐑩" if token.lower_ != "𐑼" and text_split_shaw and text_split_shaw[
-                    -1] in consonants else ""
-                text_split_shaw += prefix + contraction_end[token.lower_] + token.whitespace_
-
-            # Convert possessive 's
-            elif token.lower_ == "'s":
-                suffix: str = "𐑕" if text_split_shaw[-1] in s_follows else "𐑩𐑟" if text_split_shaw[
-                                                                                       -1] in uhz_follows else "𐑟"
-                text_split_shaw += suffix + token.whitespace_
-
-            # Convert possessive '
-            elif token.lower_ == "'" and token.tag_ == "POS":
-                text_split_shaw += token.whitespace_
-
-            # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to'
-            elif token.lower_ in before_to and token.i < len(doc) - 1 and doc[token.i + 1].lower_ == "to":
-                # 'have' only changes pronunciation where 'have to' means 'must'
-                if token.lower_ in have_to and doc[token.i + 2].tag_ in ["VB", "VBP"]:
-                    text_split_shaw += have_to[token.lower_] + token.whitespace_
-                # 'used', 'supposed' etc. only change pronunciation in the past tense, not past participle
-                elif token.lower_ in vbd_to and token.tag_ in ["VBD", "VBN", "."]:
-                    text_split_shaw += vbd_to[token.lower_] + token.whitespace_
-
-            # Match ordinal numbers represented by a numeral and a suffix
-            elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_):
-                number, number_suffix = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_).groups()
-                text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_
-
-            # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word
-            # in a name (or not at all for initialisms marked with ⸰)
-            elif token.lower_ in readlex_dict:
-                for i in readlex_dict.get(token.lower_, []):
-                    # Match the part of speech for heteronyms
-                    if i["tag"] == token.tag_:
-                        prefix: str = "·" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[
-                            "Shaw"].startswith("⸰") else ""
-                        text_split_shaw += prefix + i["Shaw"] + token.whitespace_
-                        break
-
-                    # For any proper nouns not in the ReadLex, match if an identical common noun exists
-                    elif (i["tag"] in ["NN", "0"] and token.tag_ == "NNP") or (
-                            i["tag"] in ["NNS", "0"] and token.tag_ == "NNPS"):
-                        prefix = "·" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[
-                            "Shaw"].startswith("⸰") else ""
-                        text_split_shaw += prefix + i["Shaw"] + token.whitespace_
-                        break
-
-                    # Match words with only one pronunciation
-                    elif i["tag"] == "0":
-                        prefix = "·" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[
-                            "Shaw"].startswith("⸰") else ""
-                        text_split_shaw += prefix + i["Shaw"] + token.whitespace_
-                        break
-
-            # Apply additional tests where there is still no match
-            else:
-                found: bool = False
-                constructed_warning: str = "⚠️"
-                '''
-                Try to construct a match using common prefixes and suffixes and include a warning symbol to aid proof
-                reading
-                '''
-                for j in affixes:
-                    if token.lower_.startswith(j) and j in prefixes:
-                        prefix: str = prefixes[j]
-                        suffix: str = ""
-                        target_word: str = token.lower_[len(j):]
-                    elif token.lower_.endswith(j) and j in suffixes:
-                        prefix = ""
-                        suffix = suffixes[j]
-                        target_word = token.lower_[:-len(j)]
-                    else:
-                        continue
-                    if target_word in readlex_dict:
-                        found = True
-                        for i in readlex_dict.get(target_word):
-                            prefix = "·" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                i[
-                                    "Shaw"].startswith("⸰") else prefix
-                            text_split_shaw += prefix + i[
-                                "Shaw"] + suffix + constructed_warning + token.whitespace_
-                            break
-
-                # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names.
-                if token.lower_.endswith("s"):
-                    target_word = token.lower_[:-1]
-                    if target_word in readlex_dict:
-                        found = True
-                        for i in readlex_dict.get(target_word):
-                            suffix = "𐑕" if i["Shaw"][-1] in s_follows else "𐑩𐑟" if i["Shaw"][
-                                                                                        -1] in uhz_follows else "𐑟"
-                            prefix = "·" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                i[
-                                    "Shaw"].startswith("⸰") else ""
-                            text_split_shaw += prefix + i[
-                                "Shaw"] + suffix + constructed_warning + token.whitespace_
-                            break
-
-                if found is not False:
-                    continue
-                # If there is still no match, do not convert the word
-                if token.text.isalpha():
-                    text_split_shaw += token.text + "✢" + token.whitespace_
-                else:
-                    text_split_shaw += token.text + token.whitespace_
-
-        return text_split_shaw
-
-    # Create the string that will contain the Shavianised text.
-    text_shaw: str = ""
-
-    # Split up the string to reduce the risk of spaCy exceeding memory limits
-    if text.strip().casefold().startswith("<!doctype html"):
-        style_pattern: str = r"(<style\b[^>]*>.*?</style>)"
-        script_pattern: str = r"(<script\b[^>]*>.*?</script>)"
-        html_pattern: str = r"(?!(?:<style[^>]*?>.*?</style>|<script[^>]*?>.*?</script>))(<.*?>)"
-        html_patterns: str = f"{style_pattern}|{script_pattern}|{html_pattern}"
-        text_split: list[str] = re.split(html_patterns, text, flags=re.DOTALL)
-        for text_part in text_split:
-            if text_part is None:
-                pass
-            elif re.fullmatch(style_pattern, text_part, flags=re.DOTALL) or re.fullmatch(
-                    script_pattern, text_part, flags=re.DOTALL) or re.fullmatch(html_pattern, text_part,
-                                                                                flags=re.DOTALL):
-                text_shaw += text_part
-            else:
-                doc: spacy.tokens.Doc = tokenise(text_part)
-                text_shaw += convert(doc)
-
-        # Convert dumb quotes, double hyphens, etc. to their typographic equivalents
-        text_shaw = smartypants.smartypants(text_shaw)
-        # Convert curly quotes to angle quotes
-        quotation_marks: dict[str, str] = {"&#8216;": "&lsaquo;", "&#8217;": "&rsaquo;", "&#8220;": "&laquo;", "&#8221;": "&raquo;"}
-        for key, value in quotation_marks.items():
-            text_shaw = text_shaw.replace(key, value)
-
-    else:
-        text = unidecode.unidecode(text)
-        text = re.sub(r"(\S)(\[)", r"\1 \2", text)
-        text = re.sub(r"](\S)", r"] \1", text)
-        text_split: list[str] = text.splitlines()
-        for i in text_split:
-            if len(i) < 10000:
-                doc: spacy.tokens.Doc = tokenise(i)
-                text_shaw += convert(doc) + "\n"
-        # Convert dumb quotes, double hyphens, etc. to their typographic equivalents
-        text_shaw = smartypants.smartypants(text_shaw)
-        quotation_marks: dict[str, str] = {"&#8216;": "&lsaquo;", "&#8217;": "&rsaquo;", "&#8220;": "&laquo;", "&#8221;": "&raquo;"}
-        for key, value in quotation_marks.items():
-            text_shaw = text_shaw.replace(key, value)
-        text_shaw = str(BeautifulSoup(text_shaw, features="html.parser"))
-
-    return text_shaw
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..6c64199
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,40 @@
+[project]
+name = "readlex"
+version = "0.1.0"
+description = "Auto-transliteration of English language text from Latin to Shaw script using the Read Lexicon"
+authors = [
+    { name = "Shavian-info", email = "contact@shavian.info" },
+    { name = "Ingrid", email = "git@ingrids.email" }
+]
+dependencies = [
+    "spacy>=3.7.4",
+    "unidecode>=1.3.8",
+    "smartypants>=2.0.1",
+    "bs4>=0.0.2",
+    "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+]
+readme = "README.md"
+requires-python = ">= 3.8"
+exclude = [
+    "readlex.json",
+    "kingsleyreadlexicon.tsv",
+    "readlex.dict",
+    "addendum.dict"
+]
+
+[project.scripts]
+latin2shaw = "readlex.latin2shaw:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.rye]
+managed = true
+dev-dependencies = []
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/readlex"]
diff --git a/requirements-dev.lock b/requirements-dev.lock
new file mode 100644
index 0000000..4c3293f
--- /dev/null
+++ b/requirements-dev.lock
@@ -0,0 +1,121 @@
+# generated by rye
+# use `rye lock` or `rye sync` to update this lockfile
+#
+# last locked with the following flags:
+#   pre: false
+#   features: []
+#   all-features: false
+#   with-sources: false
+#   generate-hashes: false
+
+-e file:.
+annotated-types==0.7.0
+    # via pydantic
+beautifulsoup4==4.12.3
+    # via bs4
+blis==0.7.11
+    # via thinc
+bs4==0.0.2
+    # via readlex
+catalogue==2.0.10
+    # via spacy
+    # via srsly
+    # via thinc
+certifi==2024.2.2
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via typer
+cloudpathlib==0.16.0
+    # via weasel
+confection==0.1.5
+    # via thinc
+    # via weasel
+cymem==2.0.8
+    # via preshed
+    # via spacy
+    # via thinc
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
+    # via readlex
+idna==3.7
+    # via requests
+jinja2==3.1.4
+    # via spacy
+langcodes==3.4.0
+    # via spacy
+language-data==1.2.0
+    # via langcodes
+marisa-trie==1.1.1
+    # via language-data
+markupsafe==2.1.5
+    # via jinja2
+murmurhash==1.0.10
+    # via preshed
+    # via spacy
+    # via thinc
+numpy==1.26.4
+    # via blis
+    # via spacy
+    # via thinc
+packaging==24.0
+    # via spacy
+    # via thinc
+    # via weasel
+preshed==3.0.9
+    # via spacy
+    # via thinc
+pydantic==2.7.2
+    # via confection
+    # via spacy
+    # via thinc
+    # via weasel
+pydantic-core==2.18.3
+    # via pydantic
+requests==2.32.3
+    # via spacy
+    # via weasel
+setuptools==70.0.0
+    # via marisa-trie
+    # via spacy
+    # via thinc
+smart-open==6.4.0
+    # via spacy
+    # via weasel
+smartypants==2.0.1
+    # via readlex
+soupsieve==2.5
+    # via beautifulsoup4
+spacy==3.7.4
+    # via en-core-web-sm
+    # via readlex
+spacy-legacy==3.0.12
+    # via spacy
+spacy-loggers==1.0.5
+    # via spacy
+srsly==2.4.8
+    # via confection
+    # via spacy
+    # via thinc
+    # via weasel
+thinc==8.2.3
+    # via spacy
+tqdm==4.66.4
+    # via spacy
+typer==0.9.4
+    # via spacy
+    # via weasel
+typing-extensions==4.12.0
+    # via pydantic
+    # via pydantic-core
+    # via typer
+unidecode==1.3.8
+    # via readlex
+urllib3==2.2.1
+    # via requests
+wasabi==1.1.3
+    # via spacy
+    # via thinc
+    # via weasel
+weasel==0.3.4
+    # via spacy
diff --git a/requirements.lock b/requirements.lock
new file mode 100644
index 0000000..4c3293f
--- /dev/null
+++ b/requirements.lock
@@ -0,0 +1,121 @@
+# generated by rye
+# use `rye lock` or `rye sync` to update this lockfile
+#
+# last locked with the following flags:
+#   pre: false
+#   features: []
+#   all-features: false
+#   with-sources: false
+#   generate-hashes: false
+
+-e file:.
+annotated-types==0.7.0
+    # via pydantic
+beautifulsoup4==4.12.3
+    # via bs4
+blis==0.7.11
+    # via thinc
+bs4==0.0.2
+    # via readlex
+catalogue==2.0.10
+    # via spacy
+    # via srsly
+    # via thinc
+certifi==2024.2.2
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via typer
+cloudpathlib==0.16.0
+    # via weasel
+confection==0.1.5
+    # via thinc
+    # via weasel
+cymem==2.0.8
+    # via preshed
+    # via spacy
+    # via thinc
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
+    # via readlex
+idna==3.7
+    # via requests
+jinja2==3.1.4
+    # via spacy
+langcodes==3.4.0
+    # via spacy
+language-data==1.2.0
+    # via langcodes
+marisa-trie==1.1.1
+    # via language-data
+markupsafe==2.1.5
+    # via jinja2
+murmurhash==1.0.10
+    # via preshed
+    # via spacy
+    # via thinc
+numpy==1.26.4
+    # via blis
+    # via spacy
+    # via thinc
+packaging==24.0
+    # via spacy
+    # via thinc
+    # via weasel
+preshed==3.0.9
+    # via spacy
+    # via thinc
+pydantic==2.7.2
+    # via confection
+    # via spacy
+    # via thinc
+    # via weasel
+pydantic-core==2.18.3
+    # via pydantic
+requests==2.32.3
+    # via spacy
+    # via weasel
+setuptools==70.0.0
+    # via marisa-trie
+    # via spacy
+    # via thinc
+smart-open==6.4.0
+    # via spacy
+    # via weasel
+smartypants==2.0.1
+    # via readlex
+soupsieve==2.5
+    # via beautifulsoup4
+spacy==3.7.4
+    # via en-core-web-sm
+    # via readlex
+spacy-legacy==3.0.12
+    # via spacy
+spacy-loggers==1.0.5
+    # via spacy
+srsly==2.4.8
+    # via confection
+    # via spacy
+    # via thinc
+    # via weasel
+thinc==8.2.3
+    # via spacy
+tqdm==4.66.4
+    # via spacy
+typer==0.9.4
+    # via spacy
+    # via weasel
+typing-extensions==4.12.0
+    # via pydantic
+    # via pydantic-core
+    # via typer
+unidecode==1.3.8
+    # via readlex
+urllib3==2.2.1
+    # via requests
+wasabi==1.1.3
+    # via spacy
+    # via thinc
+    # via weasel
+weasel==0.3.4
+    # via spacy
diff --git a/src/readlex/__init__.py b/src/readlex/__init__.py
new file mode 100644
index 0000000..d2af850
--- /dev/null
+++ b/src/readlex/__init__.py
@@ -0,0 +1,3 @@
+from readlex.latin2shaw import latin2shaw
+
+__all__ = ['latin2shaw',]
diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py
new file mode 100644
index 0000000..5b4c518
--- /dev/null
+++ b/src/readlex/latin2shaw.py
@@ -0,0 +1,470 @@
+import json
+import csv
+import re
+import unidecode
+import smartypants
+
+import spacy
+from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans
+from spacy.tokens import Span
+from spacy.matcher import PhraseMatcher  # , Matcher
+
+from bs4 import BeautifulSoup
+from pathlib import Path
+
+
+def latin2shaw(text):
+    # path where resource files (readlex.json etc.) are kept
+    resource_path = Path(__file__).parent.parent
+
+    with resource_path.with_name('readlex_converter.json').open('r', encoding="utf-8") as f:
+        json_data = f.read()
+
+    readlex_dict = json.loads(json_data)
+
+    # Categories of letters that determine how a following 's is pronounced
+    s_follows = {"𐑐", "𐑑", "𐑒", "𐑓", "𐑔"}
+    uhz_follows = {"𐑕", "𐑖", "𐑗", "𐑟", "𐑠", "𐑡"}
+    z_follows = {"𐑚", "𐑛", "𐑜", "𐑝", "𐑞", "𐑙", "𐑤", "𐑥", "𐑯", "𐑸", "𐑹", "𐑺", "𐑻", "𐑼", "𐑽"}
+    consonants = set.union(s_follows, uhz_follows, z_follows)
+    # vowels = {"𐑦", "𐑰", "𐑧", "𐑱", "𐑨", "𐑲", "𐑩", "𐑳", "𐑪", "𐑴", "𐑫", "𐑵", "𐑬", "𐑶", "𐑭", "𐑷", "𐑾", "𐑿"}
+    # The following are never final other than in initialisms: "𐑣", "𐑢", "𐑘", "𐑮".
+
+    # Contractions that need special treatment since the separate words are not as they appear in the dictionary
+    contraction_start = {"ai": "𐑱", "ca": "𐑒𐑭", "do": "𐑛𐑴", "does": "𐑛𐑳𐑟", "did": "𐑛𐑦𐑛", "sha": "𐑖𐑭", "wo": "𐑢𐑴",
+                         "y'": "𐑘"}
+    contraction_end = {"n't": "𐑯𐑑", "all": "𐑷𐑤", "'ve": "𐑝", "'ll": "𐑤", "'m": "𐑥", "'d": "𐑛", "'re": "𐑼"}
+
+    # Common prefixes and suffixes used in new coinings
+    prefixes = {"anti": "𐑨𐑯𐑑𐑦",
+                "counter": "𐑒𐑬𐑯𐑑𐑼",
+                "de": "𐑛𐑰",
+                "dis": "𐑛𐑦𐑕",
+                "esque": "𐑧𐑕𐑒",
+                "hyper": "𐑣𐑲𐑐𐑼",
+                "hypo": "𐑣𐑲𐑐𐑴",
+                "mega": "𐑥𐑧𐑜𐑩",
+                "meta": "𐑥𐑧𐑑𐑩",
+                "micro": "𐑥𐑲𐑒𐑮𐑴",
+                "multi": "𐑥𐑳𐑤𐑑𐑦",
+                "mis": "𐑥𐑦𐑕",
+                "neuro": "𐑯𐑘𐑫𐑼𐑴",
+                "non": "𐑯𐑪𐑯",
+                "o'er": "𐑴𐑼",
+                "out": "𐑬𐑑",
+                "over": "𐑴𐑝𐑼",
+                "poly": "𐑐𐑪𐑤𐑦",
+                "post": "𐑐𐑴𐑕𐑑",
+                "pre": "𐑐𐑮𐑰",
+                "pro": "𐑐𐑮𐑴",
+                "pseudo": "𐑕𐑿𐑛𐑴",
+                "re": "𐑮𐑰",
+                "sub": "𐑕𐑳𐑚",
+                "super": "𐑕𐑵𐑐𐑼",
+                "ultra": "𐑳𐑤𐑑𐑮𐑩",
+                "un": "𐑳𐑯",
+                "under": "𐑳𐑯𐑛𐑼"
+                }
+    suffixes = {"able": "𐑩𐑚𐑩𐑤",
+                "bound": "𐑚𐑬𐑯𐑛",
+                "ful": "𐑓𐑩𐑤",
+                "hood": "𐑣𐑫𐑛",
+                "ish": "𐑦𐑖",
+                "ism": "𐑦𐑟𐑩𐑥",
+                "less": "𐑤𐑩𐑕",
+                "like": "𐑤𐑲𐑒",
+                "ness": "𐑯𐑩𐑕"
+                }
+    affixes = prefixes | suffixes
+
+    # Words that sometimes change spelling before 'to'
+    have_to = {"have": "𐑣𐑨𐑓", "has": "𐑣𐑨𐑕"}
+    vbd_to = {"used": "𐑿𐑕𐑑", "unused": "𐑳𐑯𐑿𐑕𐑑", "supposed": "𐑕𐑩𐑐𐑴𐑕𐑑"}
+    before_to = have_to | vbd_to
+
+    # Suffixes that follow numerals in ordinal numbers
+    ordinal_suffixes = {"st": "𐑕𐑑", "nd": "𐑯𐑛", "rd": "𐑮𐑛", "th": "𐑔", "s": "𐑟"}
+
+    # Load spaCy, excluding pipeline components that are not required
+    nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "textcat"])
+
+    # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one
+    # of the surrounding words
+    # Prefixes
+    spacy_prefixes = nlp.Defaults.prefixes + [r"""^[-–—]+""", ]
+    prefix_regex = compile_prefix_regex(spacy_prefixes)
+    nlp.tokenizer.prefix_search = prefix_regex.search
+    # Infixes
+    spacy_infixes = nlp.Defaults.infixes + [r"""[-–—\"\~\(\[]+""", ]
+    infix_regex = compile_infix_regex(spacy_infixes)
+    nlp.tokenizer.infix_finditer = infix_regex.finditer
+    # Suffixes
+    spacy_suffixes = nlp.Defaults.suffixes + [r"""[-–—]+$""", ]
+    suffix_regex = compile_suffix_regex(spacy_suffixes)
+    nlp.tokenizer.suffix_search = suffix_regex.search
+
+    def add_span(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+
+    # Define the phrase to match
+    with resource_path.with_name('readlex_converter_phrases.json').open('r', newline="") as f:
+        reader = csv.reader(f)
+        phrases = []
+        for i in reader:
+            phrases.append(i[0])
+    phrase_patterns = [nlp.make_doc(phrase) for phrase in phrases]
+    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+    phrase_matcher.add("phrases", phrase_patterns, on_match=add_span)
+
+    # # Define the HTML element patterns to match
+    # html_patterns = [[{"TEXT": {"REGEX": "(?<=<)"}},
+    #                   {"OP": "*", "TEXT": {"REGEX": "[^<>]"}},
+    #                   {"TEXT": {"REGEX": "(?=>)"}}],
+    #                  [{'LOWER': '<'},
+    #                   {'LOWER': 'style'},
+    #                   {'OP': '*', 'IS_ASCII': True},
+    #                   {'LOWER': '/style'},
+    #                   {'LOWER': '>'}],
+    #                  [{'LOWER': '<'},
+    #                   {'LOWER': 'script'},
+    #                   {'OP': '*', 'IS_ASCII': True},
+    #                   {'LOWER': '/script'},
+    #                   {'LOWER': '>'}]
+    #                  ]
+    # matcher = Matcher(nlp.vocab)
+    # matcher.add("html_elements", html_patterns, on_match=add_span)
+
+    namer_dot_ents = ["PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"]
+
+    def tokenise(str):
+        # Tokenise and tag the text using spaCy as doc
+
+        doc = nlp(str)
+        # matches = matcher(doc)
+        phrase_matches = phrase_matcher(doc)
+
+        # html_spans = []
+        # for match_id, start, end in matches:
+        #     span = Span(doc, start, end, label=match_id)
+        #     html_spans.append(span)
+
+        phrase_spans = []
+        for match_id, start, end in phrase_matches:
+            span = Span(doc, start, end, label=match_id)
+            phrase_spans.append(span)
+
+        # all_spans = html_spans
+        # for i in phrase_spans:
+        #     all_spans.append(i)
+        # filtered_spans = filter_spans(all_spans)
+
+        filtered_spans = filter_spans(phrase_spans)
+
+        with doc.retokenize() as retokenizer:
+            for span in filtered_spans:
+                # if span.label_ == "html_elements":
+                #     retokenizer.merge(span, attrs={"TAG": "HTML"})
+                # else:
+                retokenizer.merge(span)
+
+        # Expand person entities to include titles and take initial 'the' out of entity names
+        titles = [
+            "archbishop",
+            "archdeacon",
+            "baron",
+            "baroness",
+            "bishop",
+            "captain",
+            "count",
+            "countess",
+            "cpt",
+            "dame",
+            "deacon",
+            "doctor",
+            "dr.",
+            "dr",
+            "duchess",
+            "duke",
+            "earl",
+            "emperor",
+            "empress",
+            "gov.",
+            "gov",
+            "governor",
+            "justice",
+            "king",
+            "lady",
+            "lord",
+            "marchioness",
+            "marquess",
+            "marquis",
+            "miss",
+            "missus",
+            "mister",
+            "mistress",
+            "mr.",
+            "mr",
+            "mrs.",
+            "mrs",
+            "ms.",
+            "ms",
+            "mx.",
+            "mx",
+            "pope",
+            "pres.",
+            "pres",
+            "president",
+            "prince",
+            "princess",
+            "prof.",
+            "prof",
+            "professor",
+            "queen",
+            "rev.",
+            "rev",
+            "reverend",
+            "saint",
+            "sen.",
+            "sen",
+            "senator",
+            "sir",
+            "st.",
+            "st",
+            "viscount",
+            "viscountess"
+        ]
+        new_ents = []
+        for ent in doc.ents:
+            # Only check for title if it's a person and not the first token
+            if ent.label_ == "PERSON" and ent.start != 0:
+                prev_token = doc[ent.start - 1]
+                if prev_token.lower_ in titles:
+                    new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
+                    new_ents.append(new_ent)
+                else:
+                    new_ents.append(ent)
+            elif ent.label_ in namer_dot_ents:
+                if doc[ent.start].lower_ == "the":
+                    new_ent = Span(doc, ent.start + 1, ent.end, label=ent.label)
+                    new_ents.append(new_ent)
+                else:
+                    new_ents.append(ent)
+            else:
+                new_ents.append(ent)
+        doc.ents = filter_spans(new_ents)
+
+        return doc
+
+    def convert(doc):
+        # Apply a series of tests to each token to determine how to Shavianise it.
+        text_split_shaw = ""
+        for token in doc:
+
+            # Leave HTML tags unchanged
+            if token.tag_ == "HTML":
+                text_split_shaw += token.text
+
+            # Convert contractions
+            elif token.lower_ in contraction_start and doc[token.i + 1].lower_ in contraction_end:
+                text_split_shaw += contraction_start[token.lower_]
+            elif token.lower_ in contraction_end:
+                if token.lower_ != "𐑼" and len(text_split_shaw) > 0 and text_split_shaw[-1] in consonants:
+                    text_split_shaw += "𐑩" + contraction_end[token.lower_] + token.whitespace_
+                else:
+                    text_split_shaw += contraction_end[token.lower_] + token.whitespace_
+
+            # Convert possessive 's
+            elif token.lower_ == "'s":
+                if text_split_shaw[-1] in s_follows:
+                    text_split_shaw += "𐑕" + token.whitespace_
+                elif text_split_shaw[-1] in uhz_follows:
+                    text_split_shaw += "𐑩𐑟" + token.whitespace_
+                else:
+                    text_split_shaw += "𐑟" + token.whitespace_
+
+            # Convert possessive '
+            elif token.lower_ == "'" and token.tag_ == "POS":
+                text_split_shaw += token.whitespace_
+
+            # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to'
+            elif token.lower_ in before_to and token.i < (len(doc)-1) and doc[token.i + 1].lower_ == "to":
+                # 'have' only changes pronunciation where 'have to' means 'must'
+                if token.lower_ in have_to:
+                    if doc[token.i + 2].tag_ in ["VB", "VBP"]:
+                        text_split_shaw += have_to[token.lower_] + token.whitespace_
+                    # else:
+                    # text_split_shaw += "𐑣𐑨𐑟" + token.whitespace_
+                # 'used', 'supposed' etc. only change pronunciation in the past tense, not past participle
+                elif token.lower_ in vbd_to and token.tag_ in ["VBD", "VBN", "."]:
+                    text_split_shaw += vbd_to[token.lower_] + token.whitespace_
+
+            # Match ordinal numbers represented by a numeral and a suffix
+            elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_):
+                match = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_)
+                number = match.group(1)
+                number_suffix = match.group(2)
+                text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_
+
+            # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word
+            # in a name (or not at all for initialisms marked with ⸰)
+            elif token.lower_ in readlex_dict:
+                for i in readlex_dict.get(token.lower_, []):
+                    # Match the part of speech for heteronyms
+                    if i["tag"] == token.tag_:
+                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
+                                "⸰"):
+                            text_split_shaw += "·" + i["Shaw"] + token.whitespace_
+                        else:
+                            text_split_shaw += i["Shaw"] + token.whitespace_
+                        break
+                    # For any proper nouns not in the ReadLex, match if an identical common noun exists
+                    elif i["tag"] in ["NN", "0"] and token.tag_ == "NNP" or i["tag"] in ["NNS",
+                                                                                         "0"] and token.tag_ == "NNPS":
+                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
+                                "⸰"):
+                            text_split_shaw += "·" + i["Shaw"] + token.whitespace_
+                        else:
+                            text_split_shaw += i["Shaw"] + token.whitespace_
+                        break
+                    # Match words with only one pronunciation
+                    elif i["tag"] == "0":
+                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
+                                "⸰"):
+                            text_split_shaw += "·" + i["Shaw"] + token.whitespace_
+                        else:
+                            text_split_shaw += i["Shaw"] + token.whitespace_
+                        break
+
+            # Apply additional tests where there is still no match
+            else:
+                found = False
+                constructed_warning = "⚠️"
+                # Try to construct a match using common prefixes and suffixes and include a warning symbol to aid proof
+                # reading
+                for j in affixes:
+                    if token.lower_.startswith(j) and j in prefixes:
+                        prefix = prefixes[j]
+                        suffix = ""
+                        target_word = token.lower_[len(j):]
+                    elif token.lower_.endswith(j) and j in suffixes:
+                        prefix = ""
+                        suffix = suffixes[j]
+                        suffix_length = len(j)
+                        target_word = token.lower_[:-suffix_length]
+                    else:
+                        continue
+                    if target_word in readlex_dict:
+                        found = True
+                        for i in readlex_dict.get(target_word):
+                            if i["tag"] != "0" and i["tag"] == token.tag_:
+                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
+                                        i["Shaw"].startswith("⸰"):
+                                    text_split_shaw += "·" + prefix + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                else:
+                                    text_split_shaw += prefix + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                break
+                            elif i["tag"] == "0":
+                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
+                                        i["Shaw"].startswith("⸰"):
+                                    text_split_shaw += "·" + prefix + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                else:
+                                    text_split_shaw += prefix + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                break
+
+                # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names.
+                if token.lower_.endswith("s"):
+                    target_word = token.lower_[:-1]
+                    if target_word in readlex_dict:
+                        found = True
+                        for i in readlex_dict.get(target_word):
+                            if i["Shaw"][-1] in s_follows:
+                                suffix = "𐑕"
+                            elif i["Shaw"][-1] in uhz_follows:
+                                suffix = "𐑩𐑟"
+                            else:
+                                suffix = "𐑟"
+                            if i["tag"] != "0" and i["tag"] == token.tag_:
+                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
+                                        i["Shaw"].startswith("⸰"):
+                                    text_split_shaw += "·" + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                else:
+                                    text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_
+                                break
+                            elif i["tag"] == "0":
+                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
+                                        i["Shaw"].startswith("⸰"):
+                                    text_split_shaw += "·" + i[
+                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                else:
+                                    text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_
+                                break
+
+                # If there is still no match, do not convert the word
+                if found is False:
+                    if token.text.isalpha():
+                        text_split_shaw += token.text + "✢" + token.whitespace_
+                    else:
+                        text_split_shaw += token.text + token.whitespace_
+
+        return text_split_shaw
+
+    # Create the string that will contain the Shavianised text.
+    text_shaw = ""
+
+    # Split up the string to reduce the risk of spaCy exceeding memory limits
+    if text.strip().casefold().startswith("<!doctype html"):
+        style_pattern = r"(<style\b[^>]*>.*?</style>)"
+        script_pattern = r"(<script\b[^>]*>.*?</script>)"
+        html_pattern = r"(?!(?:<style[^>]*?>.*?</style>|<script[^>]*?>.*?</script>))(<.*?>)"
+        html_patterns = f"{style_pattern}|{script_pattern}|{html_pattern}"
+        text_split = re.split(html_patterns, text, flags=re.DOTALL)
+        for text_part in text_split:
+            if text_part is None:
+                pass
+            elif re.fullmatch(style_pattern, text_part, flags=re.DOTALL):
+                text_shaw += text_part
+            elif re.fullmatch(script_pattern, text_part, flags=re.DOTALL):
+                text_shaw += text_part
+            elif re.fullmatch(html_pattern, text_part, flags=re.DOTALL):
+                text_shaw += text_part
+            else:
+                doc = tokenise(text_part)
+                text_shaw += convert(doc)
+        # Convert dumb quotes, double hyphens, etc. to their typographic equivalents
+        text_shaw = smartypants.smartypants(text_shaw)
+        # # Convert curly quotes to angle quotes
+        # quotation_marks = {"&#8216;": "&lsaquo;", "&#8217;": "&rsaquo;", "&#8220;": "&laquo;", "&#8221;": "&raquo;"}
+        # for key, value in quotation_marks.items():
+        #     text_shaw = text_shaw.replace(key, value)
+
+    else:
+        text = unidecode.unidecode(text)
+        text = re.sub(r"(\S)(\[)", r"\1 \2", text)
+        text = re.sub(r"](\S)", r"] \1", text)
+        text_split = text.splitlines()
+        for i in text_split:
+            if len(i) < 10000:
+                doc = tokenise(i)
+                text_shaw += convert(doc) + "\n"
+        # Convert dumb quotes, double hyphens, etc. to their typographic equivalents
+        text_shaw = smartypants.smartypants(text_shaw)
+        quotation_marks = {"&#8216;": "&lsaquo;", "&#8217;": "&rsaquo;", "&#8220;": "&laquo;", "&#8221;": "&raquo;"}
+        for key, value in quotation_marks.items():
+            text_shaw = text_shaw.replace(key, value)
+        text_shaw = str(BeautifulSoup(text_shaw, features="html.parser"))
+
+    return text_shaw
+
+def main():
+    with open("in", 'r') as in_file:
+        text_latin = in_file.read()
+
+    text_shaw = latin2shaw(text_latin)
+
+    with open("out", 'w') as out_file:
+        out_file.write(text_shaw)

From 3ba122c99852a973d5dfd79c27021831a7014a33 Mon Sep 17 00:00:00 2001
From: Ingrid <git@ingrids.email>
Date: Sat, 1 Jun 2024 17:18:42 +0200
Subject: [PATCH 2/5] add unit test and enforce formatting

---
 pyproject.toml                 |   4 +-
 requirements-dev.lock          |   6 +
 src/readlex/__init__.py        |   4 +-
 src/readlex/latin2shaw.py      | 340 ++++++++++++++++++++++++---------
 src/readlex/latin2shaw_test.py |  20 ++
 5 files changed, 278 insertions(+), 96 deletions(-)
 create mode 100644 src/readlex/latin2shaw_test.py

diff --git a/pyproject.toml b/pyproject.toml
index 6c64199..c44f8fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,9 @@ build-backend = "hatchling.build"
 
 [tool.rye]
 managed = true
-dev-dependencies = []
+dev-dependencies = [
+    "pytest>=8.2.1",
+]
 
 [tool.hatch.metadata]
 allow-direct-references = true
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 4c3293f..8fa2854 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -40,6 +40,8 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
     # via readlex
 idna==3.7
     # via requests
+iniconfig==2.0.0
+    # via pytest
 jinja2==3.1.4
     # via spacy
 langcodes==3.4.0
@@ -59,9 +61,12 @@ numpy==1.26.4
     # via spacy
     # via thinc
 packaging==24.0
+    # via pytest
     # via spacy
     # via thinc
     # via weasel
+pluggy==1.5.0
+    # via pytest
 preshed==3.0.9
     # via spacy
     # via thinc
@@ -72,6 +77,7 @@ pydantic==2.7.2
     # via weasel
 pydantic-core==2.18.3
     # via pydantic
+pytest==8.2.1
 requests==2.32.3
     # via spacy
     # via weasel
diff --git a/src/readlex/__init__.py b/src/readlex/__init__.py
index d2af850..b615e12 100644
--- a/src/readlex/__init__.py
+++ b/src/readlex/__init__.py
@@ -1,3 +1,5 @@
 from readlex.latin2shaw import latin2shaw
 
-__all__ = ['latin2shaw',]
+__all__ = [
+    "latin2shaw",
+]
diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py
index 5b4c518..14da1f5 100644
--- a/src/readlex/latin2shaw.py
+++ b/src/readlex/latin2shaw.py
@@ -5,7 +5,12 @@
 import smartypants
 
 import spacy
-from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans
+from spacy.util import (
+    compile_infix_regex,
+    compile_prefix_regex,
+    compile_suffix_regex,
+    filter_spans,
+)
 from spacy.tokens import Span
 from spacy.matcher import PhraseMatcher  # , Matcher
 
@@ -17,7 +22,9 @@ def latin2shaw(text):
     # path where resource files (readlex.json etc.) are kept
     resource_path = Path(__file__).parent.parent
 
-    with resource_path.with_name('readlex_converter.json').open('r', encoding="utf-8") as f:
+    with resource_path.with_name("readlex_converter.json").open(
+        "r", encoding="utf-8"
+    ) as f:
         json_data = f.read()
 
     readlex_dict = json.loads(json_data)
@@ -25,56 +32,90 @@ def latin2shaw(text):
     # Categories of letters that determine how a following 's is pronounced
     s_follows = {"𐑐", "𐑑", "𐑒", "𐑓", "𐑔"}
     uhz_follows = {"𐑕", "𐑖", "𐑗", "𐑟", "𐑠", "𐑡"}
-    z_follows = {"𐑚", "𐑛", "𐑜", "𐑝", "𐑞", "𐑙", "𐑤", "𐑥", "𐑯", "𐑸", "𐑹", "𐑺", "𐑻", "𐑼", "𐑽"}
+    z_follows = {
+        "𐑚",
+        "𐑛",
+        "𐑜",
+        "𐑝",
+        "𐑞",
+        "𐑙",
+        "𐑤",
+        "𐑥",
+        "𐑯",
+        "𐑸",
+        "𐑹",
+        "𐑺",
+        "𐑻",
+        "𐑼",
+        "𐑽",
+    }
     consonants = set.union(s_follows, uhz_follows, z_follows)
     # vowels = {"𐑦", "𐑰", "𐑧", "𐑱", "𐑨", "𐑲", "𐑩", "𐑳", "𐑪", "𐑴", "𐑫", "𐑵", "𐑬", "𐑶", "𐑭", "𐑷", "𐑾", "𐑿"}
     # The following are never final other than in initialisms: "𐑣", "𐑢", "𐑘", "𐑮".
 
     # Contractions that need special treatment since the separate words are not as they appear in the dictionary
-    contraction_start = {"ai": "𐑱", "ca": "𐑒𐑭", "do": "𐑛𐑴", "does": "𐑛𐑳𐑟", "did": "𐑛𐑦𐑛", "sha": "𐑖𐑭", "wo": "𐑢𐑴",
-                         "y'": "𐑘"}
-    contraction_end = {"n't": "𐑯𐑑", "all": "𐑷𐑤", "'ve": "𐑝", "'ll": "𐑤", "'m": "𐑥", "'d": "𐑛", "'re": "𐑼"}
+    contraction_start = {
+        "ai": "𐑱",
+        "ca": "𐑒𐑭",
+        "do": "𐑛𐑴",
+        "does": "𐑛𐑳𐑟",
+        "did": "𐑛𐑦𐑛",
+        "sha": "𐑖𐑭",
+        "wo": "𐑢𐑴",
+        "y'": "𐑘",
+    }
+    contraction_end = {
+        "n't": "𐑯𐑑",
+        "all": "𐑷𐑤",
+        "'ve": "𐑝",
+        "'ll": "𐑤",
+        "'m": "𐑥",
+        "'d": "𐑛",
+        "'re": "𐑼",
+    }
 
     # Common prefixes and suffixes used in new coinings
-    prefixes = {"anti": "𐑨𐑯𐑑𐑦",
-                "counter": "𐑒𐑬𐑯𐑑𐑼",
-                "de": "𐑛𐑰",
-                "dis": "𐑛𐑦𐑕",
-                "esque": "𐑧𐑕𐑒",
-                "hyper": "𐑣𐑲𐑐𐑼",
-                "hypo": "𐑣𐑲𐑐𐑴",
-                "mega": "𐑥𐑧𐑜𐑩",
-                "meta": "𐑥𐑧𐑑𐑩",
-                "micro": "𐑥𐑲𐑒𐑮𐑴",
-                "multi": "𐑥𐑳𐑤𐑑𐑦",
-                "mis": "𐑥𐑦𐑕",
-                "neuro": "𐑯𐑘𐑫𐑼𐑴",
-                "non": "𐑯𐑪𐑯",
-                "o'er": "𐑴𐑼",
-                "out": "𐑬𐑑",
-                "over": "𐑴𐑝𐑼",
-                "poly": "𐑐𐑪𐑤𐑦",
-                "post": "𐑐𐑴𐑕𐑑",
-                "pre": "𐑐𐑮𐑰",
-                "pro": "𐑐𐑮𐑴",
-                "pseudo": "𐑕𐑿𐑛𐑴",
-                "re": "𐑮𐑰",
-                "sub": "𐑕𐑳𐑚",
-                "super": "𐑕𐑵𐑐𐑼",
-                "ultra": "𐑳𐑤𐑑𐑮𐑩",
-                "un": "𐑳𐑯",
-                "under": "𐑳𐑯𐑛𐑼"
-                }
-    suffixes = {"able": "𐑩𐑚𐑩𐑤",
-                "bound": "𐑚𐑬𐑯𐑛",
-                "ful": "𐑓𐑩𐑤",
-                "hood": "𐑣𐑫𐑛",
-                "ish": "𐑦𐑖",
-                "ism": "𐑦𐑟𐑩𐑥",
-                "less": "𐑤𐑩𐑕",
-                "like": "𐑤𐑲𐑒",
-                "ness": "𐑯𐑩𐑕"
-                }
+    prefixes = {
+        "anti": "𐑨𐑯𐑑𐑦",
+        "counter": "𐑒𐑬𐑯𐑑𐑼",
+        "de": "𐑛𐑰",
+        "dis": "𐑛𐑦𐑕",
+        "esque": "𐑧𐑕𐑒",
+        "hyper": "𐑣𐑲𐑐𐑼",
+        "hypo": "𐑣𐑲𐑐𐑴",
+        "mega": "𐑥𐑧𐑜𐑩",
+        "meta": "𐑥𐑧𐑑𐑩",
+        "micro": "𐑥𐑲𐑒𐑮𐑴",
+        "multi": "𐑥𐑳𐑤𐑑𐑦",
+        "mis": "𐑥𐑦𐑕",
+        "neuro": "𐑯𐑘𐑫𐑼𐑴",
+        "non": "𐑯𐑪𐑯",
+        "o'er": "𐑴𐑼",
+        "out": "𐑬𐑑",
+        "over": "𐑴𐑝𐑼",
+        "poly": "𐑐𐑪𐑤𐑦",
+        "post": "𐑐𐑴𐑕𐑑",
+        "pre": "𐑐𐑮𐑰",
+        "pro": "𐑐𐑮𐑴",
+        "pseudo": "𐑕𐑿𐑛𐑴",
+        "re": "𐑮𐑰",
+        "sub": "𐑕𐑳𐑚",
+        "super": "𐑕𐑵𐑐𐑼",
+        "ultra": "𐑳𐑤𐑑𐑮𐑩",
+        "un": "𐑳𐑯",
+        "under": "𐑳𐑯𐑛𐑼",
+    }
+    suffixes = {
+        "able": "𐑩𐑚𐑩𐑤",
+        "bound": "𐑚𐑬𐑯𐑛",
+        "ful": "𐑓𐑩𐑤",
+        "hood": "𐑣𐑫𐑛",
+        "ish": "𐑦𐑖",
+        "ism": "𐑦𐑟𐑩𐑥",
+        "less": "𐑤𐑩𐑕",
+        "like": "𐑤𐑲𐑒",
+        "ness": "𐑯𐑩𐑕",
+    }
     affixes = prefixes | suffixes
 
     # Words that sometimes change spelling before 'to'
@@ -91,15 +132,21 @@ def latin2shaw(text):
     # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one
     # of the surrounding words
     # Prefixes
-    spacy_prefixes = nlp.Defaults.prefixes + [r"""^[-–—]+""", ]
+    spacy_prefixes = nlp.Defaults.prefixes + [
+        r"""^[-–—]+""",
+    ]
     prefix_regex = compile_prefix_regex(spacy_prefixes)
     nlp.tokenizer.prefix_search = prefix_regex.search
     # Infixes
-    spacy_infixes = nlp.Defaults.infixes + [r"""[-–—\"\~\(\[]+""", ]
+    spacy_infixes = nlp.Defaults.infixes + [
+        r"""[-–—\"\~\(\[]+""",
+    ]
     infix_regex = compile_infix_regex(spacy_infixes)
     nlp.tokenizer.infix_finditer = infix_regex.finditer
     # Suffixes
-    spacy_suffixes = nlp.Defaults.suffixes + [r"""[-–—]+$""", ]
+    spacy_suffixes = nlp.Defaults.suffixes + [
+        r"""[-–—]+$""",
+    ]
     suffix_regex = compile_suffix_regex(spacy_suffixes)
     nlp.tokenizer.suffix_search = suffix_regex.search
 
@@ -107,7 +154,9 @@ def add_span(matcher, doc, i, matches):
         match_id, start, end = matches[i]
 
     # Define the phrase to match
-    with resource_path.with_name('readlex_converter_phrases.json').open('r', newline="") as f:
+    with resource_path.with_name("readlex_converter_phrases.json").open(
+        "r", newline=""
+    ) as f:
         reader = csv.reader(f)
         phrases = []
         for i in reader:
@@ -134,7 +183,17 @@ def add_span(matcher, doc, i, matches):
     # matcher = Matcher(nlp.vocab)
     # matcher.add("html_elements", html_patterns, on_match=add_span)
 
-    namer_dot_ents = ["PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"]
+    namer_dot_ents = [
+        "PERSON",
+        "FAC",
+        "ORG",
+        "GPE",
+        "LOC",
+        "PRODUCT",
+        "EVENT",
+        "WORK_OF_ART",
+        "LAW",
+    ]
 
     def tokenise(str):
         # Tokenise and tag the text using spaCy as doc
@@ -231,7 +290,7 @@ def tokenise(str):
             "st.",
             "st",
             "viscount",
-            "viscountess"
+            "viscountess",
         ]
         new_ents = []
         for ent in doc.ents:
@@ -259,17 +318,25 @@ def convert(doc):
         # Apply a series of tests to each token to determine how to Shavianise it.
         text_split_shaw = ""
         for token in doc:
-
             # Leave HTML tags unchanged
             if token.tag_ == "HTML":
                 text_split_shaw += token.text
 
             # Convert contractions
-            elif token.lower_ in contraction_start and doc[token.i + 1].lower_ in contraction_end:
+            elif (
+                token.lower_ in contraction_start
+                and doc[token.i + 1].lower_ in contraction_end
+            ):
                 text_split_shaw += contraction_start[token.lower_]
             elif token.lower_ in contraction_end:
-                if token.lower_ != "𐑼" and len(text_split_shaw) > 0 and text_split_shaw[-1] in consonants:
-                    text_split_shaw += "𐑩" + contraction_end[token.lower_] + token.whitespace_
+                if (
+                    token.lower_ != "𐑼"
+                    and len(text_split_shaw) > 0
+                    and text_split_shaw[-1] in consonants
+                ):
+                    text_split_shaw += (
+                        "𐑩" + contraction_end[token.lower_] + token.whitespace_
+                    )
                 else:
                     text_split_shaw += contraction_end[token.lower_] + token.whitespace_
 
@@ -287,7 +354,11 @@ def convert(doc):
                 text_split_shaw += token.whitespace_
 
             # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to'
-            elif token.lower_ in before_to and token.i < (len(doc)-1) and doc[token.i + 1].lower_ == "to":
+            elif (
+                token.lower_ in before_to
+                and token.i < (len(doc) - 1)
+                and doc[token.i + 1].lower_ == "to"
+            ):
                 # 'have' only changes pronunciation where 'have to' means 'must'
                 if token.lower_ in have_to:
                     if doc[token.i + 2].tag_ in ["VB", "VBP"]:
@@ -299,11 +370,17 @@ def convert(doc):
                     text_split_shaw += vbd_to[token.lower_] + token.whitespace_
 
             # Match ordinal numbers represented by a numeral and a suffix
-            elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_):
-                match = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_)
+            elif re.fullmatch(
+                r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_
+            ):
+                match = re.match(
+                    r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_
+                )
                 number = match.group(1)
                 number_suffix = match.group(2)
-                text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_
+                text_split_shaw += (
+                    number + ordinal_suffixes[number_suffix] + token.whitespace_
+                )
 
             # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word
             # in a name (or not at all for initialisms marked with ⸰)
@@ -311,25 +388,38 @@ def convert(doc):
                 for i in readlex_dict.get(token.lower_, []):
                     # Match the part of speech for heteronyms
                     if i["tag"] == token.tag_:
-                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
-                                "⸰"):
+                        if (
+                            token.ent_iob_ == "B"
+                            and token.ent_type_ in namer_dot_ents
+                            and not i["Shaw"].startswith("⸰")
+                        ):
                             text_split_shaw += "·" + i["Shaw"] + token.whitespace_
                         else:
                             text_split_shaw += i["Shaw"] + token.whitespace_
                         break
                     # For any proper nouns not in the ReadLex, match if an identical common noun exists
-                    elif i["tag"] in ["NN", "0"] and token.tag_ == "NNP" or i["tag"] in ["NNS",
-                                                                                         "0"] and token.tag_ == "NNPS":
-                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
-                                "⸰"):
+                    elif (
+                        i["tag"] in ["NN", "0"]
+                        and token.tag_ == "NNP"
+                        or i["tag"] in ["NNS", "0"]
+                        and token.tag_ == "NNPS"
+                    ):
+                        if (
+                            token.ent_iob_ == "B"
+                            and token.ent_type_ in namer_dot_ents
+                            and not i["Shaw"].startswith("⸰")
+                        ):
                             text_split_shaw += "·" + i["Shaw"] + token.whitespace_
                         else:
                             text_split_shaw += i["Shaw"] + token.whitespace_
                         break
                     # Match words with only one pronunciation
                     elif i["tag"] == "0":
-                        if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith(
-                                "⸰"):
+                        if (
+                            token.ent_iob_ == "B"
+                            and token.ent_type_ in namer_dot_ents
+                            and not i["Shaw"].startswith("⸰")
+                        ):
                             text_split_shaw += "·" + i["Shaw"] + token.whitespace_
                         else:
                             text_split_shaw += i["Shaw"] + token.whitespace_
@@ -345,7 +435,7 @@ def convert(doc):
                     if token.lower_.startswith(j) and j in prefixes:
                         prefix = prefixes[j]
                         suffix = ""
-                        target_word = token.lower_[len(j):]
+                        target_word = token.lower_[len(j) :]
                     elif token.lower_.endswith(j) and j in suffixes:
                         prefix = ""
                         suffix = suffixes[j]
@@ -357,22 +447,50 @@ def convert(doc):
                         found = True
                         for i in readlex_dict.get(target_word):
                             if i["tag"] != "0" and i["tag"] == token.tag_:
-                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                        i["Shaw"].startswith("⸰"):
-                                    text_split_shaw += "·" + prefix + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                if (
+                                    token.ent_iob_ == "B"
+                                    and token.ent_type_ in namer_dot_ents
+                                    and not i["Shaw"].startswith("⸰")
+                                ):
+                                    text_split_shaw += (
+                                        "·"
+                                        + prefix
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 else:
-                                    text_split_shaw += prefix + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                    text_split_shaw += (
+                                        prefix
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 break
                             elif i["tag"] == "0":
-                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                        i["Shaw"].startswith("⸰"):
-                                    text_split_shaw += "·" + prefix + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                if (
+                                    token.ent_iob_ == "B"
+                                    and token.ent_type_ in namer_dot_ents
+                                    and not i["Shaw"].startswith("⸰")
+                                ):
+                                    text_split_shaw += (
+                                        "·"
+                                        + prefix
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 else:
-                                    text_split_shaw += prefix + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                    text_split_shaw += (
+                                        prefix
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 break
 
                 # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names.
@@ -388,20 +506,46 @@ def convert(doc):
                             else:
                                 suffix = "𐑟"
                             if i["tag"] != "0" and i["tag"] == token.tag_:
-                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                        i["Shaw"].startswith("⸰"):
-                                    text_split_shaw += "·" + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                if (
+                                    token.ent_iob_ == "B"
+                                    and token.ent_type_ in namer_dot_ents
+                                    and not i["Shaw"].startswith("⸰")
+                                ):
+                                    text_split_shaw += (
+                                        "·"
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 else:
-                                    text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_
+                                    text_split_shaw += (
+                                        i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 break
                             elif i["tag"] == "0":
-                                if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \
-                                        i["Shaw"].startswith("⸰"):
-                                    text_split_shaw += "·" + i[
-                                        "Shaw"] + suffix + constructed_warning + token.whitespace_
+                                if (
+                                    token.ent_iob_ == "B"
+                                    and token.ent_type_ in namer_dot_ents
+                                    and not i["Shaw"].startswith("⸰")
+                                ):
+                                    text_split_shaw += (
+                                        "·"
+                                        + i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 else:
-                                    text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_
+                                    text_split_shaw += (
+                                        i["Shaw"]
+                                        + suffix
+                                        + constructed_warning
+                                        + token.whitespace_
+                                    )
                                 break
 
                 # If there is still no match, do not convert the word
@@ -420,7 +564,9 @@ def convert(doc):
     if text.strip().casefold().startswith("<!doctype html"):
         style_pattern = r"(<style\b[^>]*>.*?</style>)"
         script_pattern = r"(<script\b[^>]*>.*?</script>)"
-        html_pattern = r"(?!(?:<style[^>]*?>.*?</style>|<script[^>]*?>.*?</script>))(<.*?>)"
+        html_pattern = (
+            r"(?!(?:<style[^>]*?>.*?</style>|<script[^>]*?>.*?</script>))(<.*?>)"
+        )
         html_patterns = f"{style_pattern}|{script_pattern}|{html_pattern}"
         text_split = re.split(html_patterns, text, flags=re.DOTALL)
         for text_part in text_split:
@@ -453,18 +599,24 @@ def convert(doc):
                 text_shaw += convert(doc) + "\n"
         # Convert dumb quotes, double hyphens, etc. to their typographic equivalents
         text_shaw = smartypants.smartypants(text_shaw)
-        quotation_marks = {"&#8216;": "&lsaquo;", "&#8217;": "&rsaquo;", "&#8220;": "&laquo;", "&#8221;": "&raquo;"}
+        quotation_marks = {
+            "&#8216;": "&lsaquo;",
+            "&#8217;": "&rsaquo;",
+            "&#8220;": "&laquo;",
+            "&#8221;": "&raquo;",
+        }
         for key, value in quotation_marks.items():
             text_shaw = text_shaw.replace(key, value)
         text_shaw = str(BeautifulSoup(text_shaw, features="html.parser"))
 
     return text_shaw
 
+
 def main():
-    with open("in", 'r') as in_file:
+    with open("in", "r") as in_file:
         text_latin = in_file.read()
 
     text_shaw = latin2shaw(text_latin)
 
-    with open("out", 'w') as out_file:
+    with open("out", "w") as out_file:
         out_file.write(text_shaw)
diff --git a/src/readlex/latin2shaw_test.py b/src/readlex/latin2shaw_test.py
new file mode 100644
index 0000000..a3729a9
--- /dev/null
+++ b/src/readlex/latin2shaw_test.py
@@ -0,0 +1,20 @@
+from readlex import latin2shaw
+
+
+def test_latin2shaw():
+    text_latin = """
+ANDROCLES AND THE LION
+
+PROLOGUE
+
+Overture: forest sounds, roaring of lions, Christian hymn faintly.
+    """
+    text_shaw = """
+·𐑨𐑯𐑛𐑮𐑩𐑒𐑤𐑰𐑟 𐑯 𐑞 𐑤𐑲𐑩𐑯
+
+𐑐𐑮𐑴𐑤𐑪𐑜
+
+𐑴𐑝𐑼𐑗𐑫𐑼: 𐑓𐑪𐑮𐑦𐑕𐑑 𐑕𐑬𐑯𐑛𐑟, 𐑮𐑹𐑦𐑙 𐑝 𐑤𐑲𐑩𐑯𐑟, 𐑒𐑮𐑦𐑕𐑗𐑩𐑯 𐑣𐑦𐑥 𐑓𐑱𐑯𐑑𐑤𐑦.
+    \n"""  # TODO: latin2shaw appears to append this trailing newline itself; check whether that is a bug.
+
+    assert latin2shaw(text_latin) == text_shaw

From 259765c103104ca0b186e4a35504be24cb62a45f Mon Sep 17 00:00:00 2001
From: Ingrid <git@ingrids.email>
Date: Sat, 1 Jun 2024 17:57:02 +0200
Subject: [PATCH 3/5] add command line args for latin2shaw script

---
 pyproject.toml            |  1 +
 requirements-dev.lock     |  9 +++++++++
 requirements.lock         |  9 +++++++++
 src/readlex/latin2shaw.py | 27 +++++++++++++++++++++++----
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c44f8fd..26d788b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
     "smartypants>=2.0.1",
     "bs4>=0.0.2",
     "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
+    "typed-argument-parser>=1.10.0",
 ]
 readme = "README.md"
 requires-python = ">= 3.8"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 8fa2854..7a5b103 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -36,6 +36,8 @@ cymem==2.0.8
     # via preshed
     # via spacy
     # via thinc
+docstring-parser==0.16
+    # via typed-argument-parser
 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
     # via readlex
 idna==3.7
@@ -56,6 +58,8 @@ murmurhash==1.0.10
     # via preshed
     # via spacy
     # via thinc
+mypy-extensions==1.0.0
+    # via typing-inspect
 numpy==1.26.4
     # via blis
     # via spacy
@@ -108,6 +112,8 @@ thinc==8.2.3
     # via spacy
 tqdm==4.66.4
     # via spacy
+typed-argument-parser==1.10.0
+    # via readlex
 typer==0.9.4
     # via spacy
     # via weasel
@@ -115,6 +121,9 @@ typing-extensions==4.12.0
     # via pydantic
     # via pydantic-core
     # via typer
+    # via typing-inspect
+typing-inspect==0.9.0
+    # via typed-argument-parser
 unidecode==1.3.8
     # via readlex
 urllib3==2.2.1
diff --git a/requirements.lock b/requirements.lock
index 4c3293f..3b26bdd 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -36,6 +36,8 @@ cymem==2.0.8
     # via preshed
     # via spacy
     # via thinc
+docstring-parser==0.16
+    # via typed-argument-parser
 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
     # via readlex
 idna==3.7
@@ -54,6 +56,8 @@ murmurhash==1.0.10
     # via preshed
     # via spacy
     # via thinc
+mypy-extensions==1.0.0
+    # via typing-inspect
 numpy==1.26.4
     # via blis
     # via spacy
@@ -102,6 +106,8 @@ thinc==8.2.3
     # via spacy
 tqdm==4.66.4
     # via spacy
+typed-argument-parser==1.10.0
+    # via readlex
 typer==0.9.4
     # via spacy
     # via weasel
@@ -109,6 +115,9 @@ typing-extensions==4.12.0
     # via pydantic
     # via pydantic-core
     # via typer
+    # via typing-inspect
+typing-inspect==0.9.0
+    # via typed-argument-parser
 unidecode==1.3.8
     # via readlex
 urllib3==2.2.1
diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py
index 14da1f5..3cc4afe 100644
--- a/src/readlex/latin2shaw.py
+++ b/src/readlex/latin2shaw.py
@@ -612,11 +612,30 @@ def convert(doc):
     return text_shaw
 
 
+from tap import Tap
+import sys
+
+
+class Args(Tap):
+    in_file: str = ""
+    """File to read latin text from, if not given, text will be read from stdin"""
+    out_file: str = ""
+    """File to output Shaw text to, if not given, text will be written to stdout"""
+
+
 def main():
-    with open("in", "r") as in_file:
-        text_latin = in_file.read()
+    args = Args().parse_args()
+
+    if args.in_file != "":
+        with open("in", "r") as in_file:
+            text_latin = in_file.read()
+    else:
+        text_latin = sys.stdin.read()
 
     text_shaw = latin2shaw(text_latin)
 
-    with open("out", "w") as out_file:
-        out_file.write(text_shaw)
+    if args.out_file != "":
+        with open("out", "w") as out_file:
+            out_file.write(text_shaw)
+    else:
+        sys.stdout.write(text_shaw)

From 0e269a0516ca9c9ec69ae6b8a2d9e72e8b9d7e2d Mon Sep 17 00:00:00 2001
From: Ingrid <git@ingrids.email>
Date: Sat, 1 Jun 2024 18:40:37 +0200
Subject: [PATCH 4/5] update README to use the packaged form of latin2shaw

---
 README.md | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 8795d25..a88d813 100644
--- a/README.md
+++ b/README.md
@@ -28,17 +28,33 @@ The files readlex_converter.json and readlex_converter_phrases.json have been de
 
 The file readlex.dict (and addendum.dict) is adapted for use with Dave Coffin's scrips available at [Dechifro.org](https://www.dechifro.org/shavian/).
 
-## latin2shaw.py
+## src/readlex/latin2shaw.py
 
-The file latin2shaw.py is the script I use for the ReadLex Converter. It uses spaCy for part of speech tagging. 
+The file latin2shaw.py contains the code I use for the ReadLex Converter. It uses spaCy for part-of-speech tagging.
 
-To use latin2shaw.py you'll need to install the following packages with it:
-- spaCy and an English language model (I use en_core_web_sm), following the instructions [here](https://spacy.io/usage)
-- unidecode
-- smartypants
-- bs4 (BeautifulSoup)
+To use it as a command line tool:
 
-You will also need to make sure that latin2shaw.py points to the locations where you have saved readlex_converter.json and readlex_converter_phrases.json.
+```bash
+pip install readlex
+
+# the script can read from stdin and print to stdout
+echo "hello world" | latin2shaw 
+
+# or use files
+latin2shaw --in_file in.txt --out_file out.txt
+```
+
+Once installed, it can also be used from Python:
+
+```python
+from readlex import latin2shaw
+
+print(latin2shaw("hello world"))
+```
+
+### For contributors
+
+Once you have cloned the repo and [installed Rye](https://rye.astral.sh/), you can install/sync dependencies with `rye sync` and run the script with `rye run latin2shaw`.
 
 ## Futher information
 

From bdc6543a0d57807285d68d7271c58796f06fd564 Mon Sep 17 00:00:00 2001
From: Ingrid <git@ingrids.email>
Date: Sat, 1 Jun 2024 21:48:40 +0200
Subject: [PATCH 5/5] fix bug with use of filename args in script

---
 src/readlex/latin2shaw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py
index 3cc4afe..13c08f8 100644
--- a/src/readlex/latin2shaw.py
+++ b/src/readlex/latin2shaw.py
@@ -627,7 +627,7 @@ def main():
     args = Args().parse_args()
 
     if args.in_file != "":
-        with open("in", "r") as in_file:
+        with open(args.in_file, "r") as in_file:
             text_latin = in_file.read()
     else:
         text_latin = sys.stdin.read()
@@ -635,7 +635,7 @@ def main():
     text_shaw = latin2shaw(text_latin)
 
     if args.out_file != "":
-        with open("out", "w") as out_file:
+        with open(args.out_file, "w") as out_file:
             out_file.write(text_shaw)
     else:
         sys.stdout.write(text_shaw)