From a8c09091bd8d5757ae738ca1dd342e100434760c Mon Sep 17 00:00:00 2001 From: Ingrid Date: Sat, 1 Jun 2024 16:07:35 +0200 Subject: [PATCH 1/5] set up rye package --- .gitignore | 7 +- .python-version | 1 + latin2shaw.py | 389 ------------------------------- pyproject.toml | 40 ++++ requirements-dev.lock | 121 ++++++++++ requirements.lock | 121 ++++++++++ src/readlex/__init__.py | 3 + src/readlex/latin2shaw.py | 470 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 762 insertions(+), 390 deletions(-) create mode 100644 .python-version delete mode 100644 latin2shaw.py create mode 100644 pyproject.toml create mode 100644 requirements-dev.lock create mode 100644 requirements.lock create mode 100644 src/readlex/__init__.py create mode 100644 src/readlex/latin2shaw.py diff --git a/.gitignore b/.gitignore index 9bea433..0c21c64 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ - +target +.venv +.env +__pycache__ +.idea +dist .DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..871f80a --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12.3 diff --git a/latin2shaw.py b/latin2shaw.py deleted file mode 100644 index 2fec9d0..0000000 --- a/latin2shaw.py +++ /dev/null @@ -1,389 +0,0 @@ -import json -import csv -import re -import unidecode -import smartypants -import spacy -from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans -from spacy.tokens import Doc, Span -from spacy.matcher import PhraseMatcher - -from bs4 import BeautifulSoup - - -def latin2shaw(text): - with open("static/readlex_converter.json", 'r', encoding="utf-8") as file: - json_data = file.read() - - readlex_dict: dict[str, list[dict[str, str]]] = json.loads(json_data) - - # Categories of letters that determine how a following 's is pronounced - s_follows: set[str] = {"๐‘", "๐‘‘", "๐‘’", "๐‘“", "๐‘”"} - uhz_follows: set[str] = {"๐‘•", "๐‘–", "๐‘—", "๐‘Ÿ", "๐‘ ", "๐‘ก"} - z_follows: set[str] = {"๐‘š", "๐‘›", "๐‘œ", "๐‘", "๐‘ž", "๐‘™", "๐‘ค", "๐‘ฅ", "๐‘ฏ", "๐‘ธ", "๐‘น", "๐‘บ", "๐‘ป", "๐‘ผ", "๐‘ฝ"} - consonants = set.union(s_follows, uhz_follows, z_follows) - # vowels = {"๐‘ฆ", "๐‘ฐ", "๐‘ง", "๐‘ฑ", "๐‘จ", "๐‘ฒ", "๐‘ฉ", "๐‘ณ", "๐‘ช", "๐‘ด", "๐‘ซ", "๐‘ต", "๐‘ฌ", "๐‘ถ", "๐‘ญ", "๐‘ท", "๐‘พ", "๐‘ฟ"} - # The following are never final other than in initialisms: "๐‘ฃ", "๐‘ข", "๐‘˜", "๐‘ฎ". - - # Contractions that need special treatment since the separate words are not as they appear in the dictionary - contraction_start: dict[str, str] = {"ai": "๐‘ฑ", "ca": "๐‘’๐‘ญ", "do": "๐‘›๐‘ด", "does": "๐‘›๐‘ณ๐‘Ÿ", "did": "๐‘›๐‘ฆ๐‘›", "sha": "๐‘–๐‘ญ", - "wo": "๐‘ข๐‘ด", - "y'": "๐‘˜"} - contraction_end: dict[str, str] = {"n't": "๐‘ฏ๐‘‘", "all": "๐‘ท๐‘ค", "'ve": "๐‘", "'ll": "๐‘ค", "'m": "๐‘ฅ", "'d": "๐‘›", - "'re": "๐‘ผ"} - - # Common prefixes and suffixes used in new coinings - prefixes: dict[str, str] = {"anti": "๐‘จ๐‘ฏ๐‘‘๐‘ฆ", - "counter": "๐‘’๐‘ฌ๐‘ฏ๐‘‘๐‘ผ", - "de": "๐‘›๐‘ฐ", - "dis": "๐‘›๐‘ฆ๐‘•", - "esque": "๐‘ง๐‘•๐‘’", - "hyper": "๐‘ฃ๐‘ฒ๐‘๐‘ผ", - "hypo": "๐‘ฃ๐‘ฒ๐‘๐‘ด", - "mega": "๐‘ฅ๐‘ง๐‘œ๐‘ฉ", - "meta": "๐‘ฅ๐‘ง๐‘‘๐‘ฉ", - "micro": "๐‘ฅ๐‘ฒ๐‘’๐‘ฎ๐‘ด", - "multi": "๐‘ฅ๐‘ณ๐‘ค๐‘‘๐‘ฆ", - "mis": "๐‘ฅ๐‘ฆ๐‘•", - "neuro": "๐‘ฏ๐‘˜๐‘ซ๐‘ผ๐‘ด", - "non": "๐‘ฏ๐‘ช๐‘ฏ", - "o'er": "๐‘ด๐‘ผ", - "out": "๐‘ฌ๐‘‘", - "over": "๐‘ด๐‘๐‘ผ", - "poly": "๐‘๐‘ช๐‘ค๐‘ฆ", - "post": "๐‘๐‘ด๐‘•๐‘‘", - "pre": "๐‘๐‘ฎ๐‘ฐ", - "pro": "๐‘๐‘ฎ๐‘ด", - "pseudo": "๐‘•๐‘ฟ๐‘›๐‘ด", - "re": "๐‘ฎ๐‘ฐ", - "sub": "๐‘•๐‘ณ๐‘š", - "super": "๐‘•๐‘ต๐‘๐‘ผ", - "ultra": "๐‘ณ๐‘ค๐‘‘๐‘ฎ๐‘ฉ", - "un": "๐‘ณ๐‘ฏ", - "under": "๐‘ณ๐‘ฏ๐‘›๐‘ผ" - } - suffixes: dict[str, str] = {"able": "๐‘ฉ๐‘š๐‘ฉ๐‘ค", - "bound": "๐‘š๐‘ฌ๐‘ฏ๐‘›", - "ful": "๐‘“๐‘ฉ๐‘ค", - "hood": "๐‘ฃ๐‘ซ๐‘›", - "ish": "๐‘ฆ๐‘–", - "ism": "๐‘ฆ๐‘Ÿ๐‘ฉ๐‘ฅ", - "less": "๐‘ค๐‘ฉ๐‘•", - "like": "๐‘ค๐‘ฒ๐‘’", - "ness": "๐‘ฏ๐‘ฉ๐‘•" - } - affixes: dict[str, str] = prefixes | suffixes - - # Words that sometimes change spelling before 'to' - have_to: dict[str, str] = {"have": "๐‘ฃ๐‘จ๐‘“", "has": "๐‘ฃ๐‘จ๐‘•"} - vbd_to: dict[str, str] = {"used": "๐‘ฟ๐‘•๐‘‘", "unused": "๐‘ณ๐‘ฏ๐‘ฟ๐‘•๐‘‘", "supposed": "๐‘•๐‘ฉ๐‘๐‘ด๐‘•๐‘‘"} - before_to: dict[str, str] = have_to | vbd_to - - # Suffixes that follow numerals in ordinal numbers - ordinal_suffixes: dict[str, str] = {"st": "๐‘•๐‘‘", "nd": "๐‘ฏ๐‘›", "rd": "๐‘ฎ๐‘›", "th": "๐‘”", "s": "๐‘Ÿ"} - - # Load spaCy, excluding pipeline components that are not required - nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "textcat"]) - - # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one - # of the surrounding words - # Prefixes - spacy_prefixes: list[str] = nlp.Defaults.prefixes + [r'''^[-โ€“โ€”]+''',] - prefix_regex = compile_prefix_regex(spacy_prefixes) - nlp.tokenizer.prefix_search = prefix_regex.search - # Infixes - spacy_infixes: list[str] = nlp.Defaults.infixes + [r'''[.,?!:;\-โ€“โ€”"~\(\)\[\]]+''',] - infix_regex = compile_infix_regex(spacy_infixes) - nlp.tokenizer.infix_finditer = infix_regex.finditer - # Suffixes - spacy_suffixes: list[str] = nlp.Defaults.suffixes + [r'''[-โ€“โ€”]+$''',] - suffix_regex = compile_suffix_regex(spacy_suffixes) - nlp.tokenizer.suffix_search = suffix_regex.search - - def add_span(matcher, doc, i, matches): - match_id, start, end = matches[i] - - # Define the phrase to match - with open("static/readlex_converter_phrases.json", "r", newline="") as f: - reader = csv.reader(f) - phrases = [row[0] for row in reader if row] - phrase_patterns: list[Doc] = [nlp.make_doc(phrase) for phrase in phrases] - phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") - phrase_matcher.add("phrases", phrase_patterns, on_match=add_span) - - namer_dot_ents: set[str] = {"PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"} - - def tokenise(text: str) -> spacy.tokens.Doc: - # Tokenise and tag the text using spaCy as doc - - doc = nlp(text) - phrase_matches = phrase_matcher(doc) - phrase_spans: list[Span] = [] - for match_id, start, end in phrase_matches: - span = Span(doc, start, end, label=match_id) - phrase_spans.append(span) - - filtered_spans = filter_spans(phrase_spans) - - with doc.retokenize() as retokenizer: - for span in filtered_spans: - retokenizer.merge(span) - - # Expand person entities to include titles and take initial 'the' out of entity names - titles: set[str] = { - "archbishop", - "archdeacon", - "baron", - "baroness", - "bishop", - "captain", - "count", - "countess", - "cpt", - "dame", - "deacon", - "doctor", - "dr.", - "dr", - "duchess", - "duke", - "earl", - "emperor", - "empress", - "gov.", - "gov", - "governor", - "justice", - "king", - "lady", - "lord", - "marchioness", - "marquess", - "marquis", - "miss", - "missus", - "mister", - "mistress", - "mr.", - "mr", - "mrs.", - "mrs", - "ms.", - "ms", - "mx.", - "mx", - "pope", - "pres.", - "pres", - "president", - "prince", - "princess", - "prof.", - "prof", - "professor", - "queen", - "rev.", - "rev", - "reverend", - "saint", - "sen.", - "sen", - "senator", - "sir", - "st.", - "st", - "viscount", - "viscountess" - } - new_ents: list[Span] = [] - for ent in doc.ents: - # Only check for title if it's a person and not the first token - if ent.label_ == "PERSON" and ent.start != 0: - prev_token = doc[ent.start - 1] - if prev_token.lower_ in titles: - new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label) - new_ents.append(new_ent) - else: - new_ents.append(ent) - elif ent.label_ in namer_dot_ents: - if doc[ent.start].lower_ == "the": - new_ent = Span(doc, ent.start + 1, ent.end, label=ent.label) - new_ents.append(new_ent) - else: - new_ents.append(ent) - else: - new_ents.append(ent) - - filtered_ents = filter_spans(new_ents) - doc.ents = tuple(filtered_ents) - - return doc - - def convert(doc: spacy.tokens.Doc) -> str: - # Apply a series of tests to each token to determine how to Shavianise it. - text_split_shaw: str = "" - - for token in doc: - - # Leave HTML tags unchanged - if token.tag_ == "HTML": - text_split_shaw += token.text - - # Convert contractions - if token.lower_ in contraction_start and token.i < len(doc) - 1 and doc[ - token.i + 1].lower_ in contraction_end: - text_split_shaw += contraction_start[token.lower_] - elif token.lower_ in contraction_end: - prefix: str = "๐‘ฉ" if token.lower_ != "๐‘ผ" and text_split_shaw and text_split_shaw[ - -1] in consonants else "" - text_split_shaw += prefix + contraction_end[token.lower_] + token.whitespace_ - - # Convert possessive 's - elif token.lower_ == "'s": - suffix: str = "๐‘•" if text_split_shaw[-1] in s_follows else "๐‘ฉ๐‘Ÿ" if text_split_shaw[ - -1] in uhz_follows else "๐‘Ÿ" - text_split_shaw += suffix + token.whitespace_ - - # Convert possessive ' - elif token.lower_ == "'" and token.tag_ == "POS": - text_split_shaw += token.whitespace_ - - # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to' - elif token.lower_ in before_to and token.i < len(doc) - 1 and doc[token.i + 1].lower_ == "to": - # 'have' only changes pronunciation where 'have to' means 'must' - if token.lower_ in have_to and doc[token.i + 2].tag_ in ["VB", "VBP"]: - text_split_shaw += have_to[token.lower_] + token.whitespace_ - # 'used', 'supposed' etc. only change pronunciation in the past tense, not past participle - elif token.lower_ in vbd_to and token.tag_ in ["VBD", "VBN", "."]: - text_split_shaw += vbd_to[token.lower_] + token.whitespace_ - - # Match ordinal numbers represented by a numeral and a suffix - elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_): - number, number_suffix = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_).groups() - text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_ - - # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word - # in a name (or not at all for initialisms marked with โธฐ) - elif token.lower_ in readlex_dict: - for i in readlex_dict.get(token.lower_, []): - # Match the part of speech for heteronyms - if i["tag"] == token.tag_: - prefix: str = "ยท" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[ - "Shaw"].startswith("โธฐ") else "" - text_split_shaw += prefix + i["Shaw"] + token.whitespace_ - break - - # For any proper nouns not in the ReadLex, match if an identical common noun exists - elif (i["tag"] in ["NN", "0"] and token.tag_ == "NNP") or ( - i["tag"] in ["NNS", "0"] and token.tag_ == "NNPS"): - prefix = "ยท" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[ - "Shaw"].startswith("โธฐ") else "" - text_split_shaw += prefix + i["Shaw"] + token.whitespace_ - break - - # Match words with only one pronunciation - elif i["tag"] == "0": - prefix = "ยท" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i[ - "Shaw"].startswith("โธฐ") else "" - text_split_shaw += prefix + i["Shaw"] + token.whitespace_ - break - - # Apply additional tests where there is still no match - else: - found: bool = False - constructed_warning: str = "โš ๏ธ" - ''' - Try to construct a match using common prefixes and suffixes and include a warning symbol to aid proof - reading - ''' - for j in affixes: - if token.lower_.startswith(j) and j in prefixes: - prefix: str = prefixes[j] - suffix: str = "" - target_word: str = token.lower_[len(j):] - elif token.lower_.endswith(j) and j in suffixes: - prefix = "" - suffix = suffixes[j] - target_word = token.lower_[:-len(j)] - else: - continue - if target_word in readlex_dict: - found = True - for i in readlex_dict.get(target_word): - prefix = "ยท" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i[ - "Shaw"].startswith("โธฐ") else prefix - text_split_shaw += prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ - break - - # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names. - if token.lower_.endswith("s"): - target_word = token.lower_[:-1] - if target_word in readlex_dict: - found = True - for i in readlex_dict.get(target_word): - suffix = "๐‘•" if i["Shaw"][-1] in s_follows else "๐‘ฉ๐‘Ÿ" if i["Shaw"][ - -1] in uhz_follows else "๐‘Ÿ" - prefix = "ยท" if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i[ - "Shaw"].startswith("โธฐ") else "" - text_split_shaw += prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ - break - - if found is not False: - continue - # If there is still no match, do not convert the word - if token.text.isalpha(): - text_split_shaw += token.text + "โœข" + token.whitespace_ - else: - text_split_shaw += token.text + token.whitespace_ - - return text_split_shaw - - # Create the string that will contain the Shavianised text. - text_shaw: str = "" - - # Split up the string to reduce the risk of spaCy exceeding memory limits - if text.strip().casefold().startswith("]*>.*?)" - script_pattern: str = r"(]*>.*?)" - html_pattern: str = r"(?!(?:]*?>.*?|]*?>.*?))(<.*?>)" - html_patterns: str = f"{style_pattern}|{script_pattern}|{html_pattern}" - text_split: list[str] = re.split(html_patterns, text, flags=re.DOTALL) - for text_part in text_split: - if text_part is None: - pass - elif re.fullmatch(style_pattern, text_part, flags=re.DOTALL) or re.fullmatch( - script_pattern, text_part, flags=re.DOTALL) or re.fullmatch(html_pattern, text_part, - flags=re.DOTALL): - text_shaw += text_part - else: - doc: spacy.tokens.Doc = tokenise(text_part) - text_shaw += convert(doc) - - # Convert dumb quotes, double hyphens, etc. to their typographic equivalents - text_shaw = smartypants.smartypants(text_shaw) - # Convert curly quotes to angle quotes - quotation_marks: dict[str, str] = {"‘": "‹", "’": "›", "“": "«", "”": "»"} - for key, value in quotation_marks.items(): - text_shaw = text_shaw.replace(key, value) - - else: - text = unidecode.unidecode(text) - text = re.sub(r"(\S)(\[)", r"\1 \2", text) - text = re.sub(r"](\S)", r"] \1", text) - text_split: list[str] = text.splitlines() - for i in text_split: - if len(i) < 10000: - doc: spacy.tokens.Doc = tokenise(i) - text_shaw += convert(doc) + "\n" - # Convert dumb quotes, double hyphens, etc. to their typographic equivalents - text_shaw = smartypants.smartypants(text_shaw) - quotation_marks: dict[str, str] = {"‘": "‹", "’": "›", "“": "«", "”": "»"} - for key, value in quotation_marks.items(): - text_shaw = text_shaw.replace(key, value) - text_shaw = str(BeautifulSoup(text_shaw, features="html.parser")) - - return text_shaw \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6c64199 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[project] +name = "readlex" +version = "0.1.0" +description = "Auto-transliteration of English language text from latin to Shaw script using the Read Lexicon" +authors = [ + { name = "Shavian-info", email = "contact@shavian.info" }, + { name = "Ingrid", email = "git@ingrids.email" } +] +dependencies = [ + "spacy>=3.7.4", + "unidecode>=1.3.8", + "smartypants>=2.0.1", + "bs4>=0.0.2", + "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl", +] +readme = "README.md" +requires-python = ">= 3.8" +exclude = [ + "readlex.json", + "kingsleyreadlexicon.tsv", + "readlex.dict", + "addendum.dict" +] + +[project.scripts] +latin2shaw = "readlex.latin2shaw:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/readlex"] diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 0000000..4c3293f --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,121 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. +annotated-types==0.7.0 + # via pydantic +beautifulsoup4==4.12.3 + # via bs4 +blis==0.7.11 + # via thinc +bs4==0.0.2 + # via readlex +catalogue==2.0.10 + # via spacy + # via srsly + # via thinc +certifi==2024.2.2 + # via requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via typer +cloudpathlib==0.16.0 + # via weasel +confection==0.1.5 + # via thinc + # via weasel +cymem==2.0.8 + # via preshed + # via spacy + # via thinc +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl + # via readlex +idna==3.7 + # via requests +jinja2==3.1.4 + # via spacy +langcodes==3.4.0 + # via spacy +language-data==1.2.0 + # via langcodes +marisa-trie==1.1.1 + # via language-data +markupsafe==2.1.5 + # via jinja2 +murmurhash==1.0.10 + # via preshed + # via spacy + # via thinc +numpy==1.26.4 + # via blis + # via spacy + # via thinc +packaging==24.0 + # via spacy + # via thinc + # via weasel +preshed==3.0.9 + # via spacy + # via thinc +pydantic==2.7.2 + # via confection + # via spacy + # via thinc + # via weasel +pydantic-core==2.18.3 + # via pydantic +requests==2.32.3 + # via spacy + # via weasel +setuptools==70.0.0 + # via marisa-trie + # via spacy + # via thinc +smart-open==6.4.0 + # via spacy + # via weasel +smartypants==2.0.1 + # via readlex +soupsieve==2.5 + # via beautifulsoup4 +spacy==3.7.4 + # via en-core-web-sm + # via readlex +spacy-legacy==3.0.12 + # via spacy +spacy-loggers==1.0.5 + # via spacy +srsly==2.4.8 + # via confection + # via spacy + # via thinc + # via weasel +thinc==8.2.3 + # via spacy +tqdm==4.66.4 + # via spacy +typer==0.9.4 + # via spacy + # via weasel +typing-extensions==4.12.0 + # via pydantic + # via pydantic-core + # via typer +unidecode==1.3.8 + # via readlex +urllib3==2.2.1 + # via requests +wasabi==1.1.3 + # via spacy + # via thinc + # via weasel +weasel==0.3.4 + # via spacy diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..4c3293f --- /dev/null +++ b/requirements.lock @@ -0,0 +1,121 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. +annotated-types==0.7.0 + # via pydantic +beautifulsoup4==4.12.3 + # via bs4 +blis==0.7.11 + # via thinc +bs4==0.0.2 + # via readlex +catalogue==2.0.10 + # via spacy + # via srsly + # via thinc +certifi==2024.2.2 + # via requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via typer +cloudpathlib==0.16.0 + # via weasel +confection==0.1.5 + # via thinc + # via weasel +cymem==2.0.8 + # via preshed + # via spacy + # via thinc +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl + # via readlex +idna==3.7 + # via requests +jinja2==3.1.4 + # via spacy +langcodes==3.4.0 + # via spacy +language-data==1.2.0 + # via langcodes +marisa-trie==1.1.1 + # via language-data +markupsafe==2.1.5 + # via jinja2 +murmurhash==1.0.10 + # via preshed + # via spacy + # via thinc +numpy==1.26.4 + # via blis + # via spacy + # via thinc +packaging==24.0 + # via spacy + # via thinc + # via weasel +preshed==3.0.9 + # via spacy + # via thinc +pydantic==2.7.2 + # via confection + # via spacy + # via thinc + # via weasel +pydantic-core==2.18.3 + # via pydantic +requests==2.32.3 + # via spacy + # via weasel +setuptools==70.0.0 + # via marisa-trie + # via spacy + # via thinc +smart-open==6.4.0 + # via spacy + # via weasel +smartypants==2.0.1 + # via readlex +soupsieve==2.5 + # via beautifulsoup4 +spacy==3.7.4 + # via en-core-web-sm + # via readlex +spacy-legacy==3.0.12 + # via spacy +spacy-loggers==1.0.5 + # via spacy +srsly==2.4.8 + # via confection + # via spacy + # via thinc + # via weasel +thinc==8.2.3 + # via spacy +tqdm==4.66.4 + # via spacy +typer==0.9.4 + # via spacy + # via weasel +typing-extensions==4.12.0 + # via pydantic + # via pydantic-core + # via typer +unidecode==1.3.8 + # via readlex +urllib3==2.2.1 + # via requests +wasabi==1.1.3 + # via spacy + # via thinc + # via weasel +weasel==0.3.4 + # via spacy diff --git a/src/readlex/__init__.py b/src/readlex/__init__.py new file mode 100644 index 0000000..d2af850 --- /dev/null +++ b/src/readlex/__init__.py @@ -0,0 +1,3 @@ +from readlex.latin2shaw import latin2shaw + +__all__ = ['latin2shaw',] diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py new file mode 100644 index 0000000..5b4c518 --- /dev/null +++ b/src/readlex/latin2shaw.py @@ -0,0 +1,470 @@ +import json +import csv +import re +import unidecode +import smartypants + +import spacy +from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans +from spacy.tokens import Span +from spacy.matcher import PhraseMatcher # , Matcher + +from bs4 import BeautifulSoup +from pathlib import Path + + +def latin2shaw(text): + # path where resource files (readlex.json etc.) are kept + resource_path = Path(__file__).parent.parent + + with resource_path.with_name('readlex_converter.json').open('r', encoding="utf-8") as f: + json_data = f.read() + + readlex_dict = json.loads(json_data) + + # Categories of letters that determine how a following 's is pronounced + s_follows = {"๐‘", "๐‘‘", "๐‘’", "๐‘“", "๐‘”"} + uhz_follows = {"๐‘•", "๐‘–", "๐‘—", "๐‘Ÿ", "๐‘ ", "๐‘ก"} + z_follows = {"๐‘š", "๐‘›", "๐‘œ", "๐‘", "๐‘ž", "๐‘™", "๐‘ค", "๐‘ฅ", "๐‘ฏ", "๐‘ธ", "๐‘น", "๐‘บ", "๐‘ป", "๐‘ผ", "๐‘ฝ"} + consonants = set.union(s_follows, uhz_follows, z_follows) + # vowels = {"๐‘ฆ", "๐‘ฐ", "๐‘ง", "๐‘ฑ", "๐‘จ", "๐‘ฒ", "๐‘ฉ", "๐‘ณ", "๐‘ช", "๐‘ด", "๐‘ซ", "๐‘ต", "๐‘ฌ", "๐‘ถ", "๐‘ญ", "๐‘ท", "๐‘พ", "๐‘ฟ"} + # The following are never final other than in initialisms: "๐‘ฃ", "๐‘ข", "๐‘˜", "๐‘ฎ". + + # Contractions that need special treatment since the separate words are not as they appear in the dictionary + contraction_start = {"ai": "๐‘ฑ", "ca": "๐‘’๐‘ญ", "do": "๐‘›๐‘ด", "does": "๐‘›๐‘ณ๐‘Ÿ", "did": "๐‘›๐‘ฆ๐‘›", "sha": "๐‘–๐‘ญ", "wo": "๐‘ข๐‘ด", + "y'": "๐‘˜"} + contraction_end = {"n't": "๐‘ฏ๐‘‘", "all": "๐‘ท๐‘ค", "'ve": "๐‘", "'ll": "๐‘ค", "'m": "๐‘ฅ", "'d": "๐‘›", "'re": "๐‘ผ"} + + # Common prefixes and suffixes used in new coinings + prefixes = {"anti": "๐‘จ๐‘ฏ๐‘‘๐‘ฆ", + "counter": "๐‘’๐‘ฌ๐‘ฏ๐‘‘๐‘ผ", + "de": "๐‘›๐‘ฐ", + "dis": "๐‘›๐‘ฆ๐‘•", + "esque": "๐‘ง๐‘•๐‘’", + "hyper": "๐‘ฃ๐‘ฒ๐‘๐‘ผ", + "hypo": "๐‘ฃ๐‘ฒ๐‘๐‘ด", + "mega": "๐‘ฅ๐‘ง๐‘œ๐‘ฉ", + "meta": "๐‘ฅ๐‘ง๐‘‘๐‘ฉ", + "micro": "๐‘ฅ๐‘ฒ๐‘’๐‘ฎ๐‘ด", + "multi": "๐‘ฅ๐‘ณ๐‘ค๐‘‘๐‘ฆ", + "mis": "๐‘ฅ๐‘ฆ๐‘•", + "neuro": "๐‘ฏ๐‘˜๐‘ซ๐‘ผ๐‘ด", + "non": "๐‘ฏ๐‘ช๐‘ฏ", + "o'er": "๐‘ด๐‘ผ", + "out": "๐‘ฌ๐‘‘", + "over": "๐‘ด๐‘๐‘ผ", + "poly": "๐‘๐‘ช๐‘ค๐‘ฆ", + "post": "๐‘๐‘ด๐‘•๐‘‘", + "pre": "๐‘๐‘ฎ๐‘ฐ", + "pro": "๐‘๐‘ฎ๐‘ด", + "pseudo": "๐‘•๐‘ฟ๐‘›๐‘ด", + "re": "๐‘ฎ๐‘ฐ", + "sub": "๐‘•๐‘ณ๐‘š", + "super": "๐‘•๐‘ต๐‘๐‘ผ", + "ultra": "๐‘ณ๐‘ค๐‘‘๐‘ฎ๐‘ฉ", + "un": "๐‘ณ๐‘ฏ", + "under": "๐‘ณ๐‘ฏ๐‘›๐‘ผ" + } + suffixes = {"able": "๐‘ฉ๐‘š๐‘ฉ๐‘ค", + "bound": "๐‘š๐‘ฌ๐‘ฏ๐‘›", + "ful": "๐‘“๐‘ฉ๐‘ค", + "hood": "๐‘ฃ๐‘ซ๐‘›", + "ish": "๐‘ฆ๐‘–", + "ism": "๐‘ฆ๐‘Ÿ๐‘ฉ๐‘ฅ", + "less": "๐‘ค๐‘ฉ๐‘•", + "like": "๐‘ค๐‘ฒ๐‘’", + "ness": "๐‘ฏ๐‘ฉ๐‘•" + } + affixes = prefixes | suffixes + + # Words that sometimes change spelling before 'to' + have_to = {"have": "๐‘ฃ๐‘จ๐‘“", "has": "๐‘ฃ๐‘จ๐‘•"} + vbd_to = {"used": "๐‘ฟ๐‘•๐‘‘", "unused": "๐‘ณ๐‘ฏ๐‘ฟ๐‘•๐‘‘", "supposed": "๐‘•๐‘ฉ๐‘๐‘ด๐‘•๐‘‘"} + before_to = have_to | vbd_to + + # Suffixes that follow numerals in ordinal numbers + ordinal_suffixes = {"st": "๐‘•๐‘‘", "nd": "๐‘ฏ๐‘›", "rd": "๐‘ฎ๐‘›", "th": "๐‘”", "s": "๐‘Ÿ"} + + # Load spaCy, excluding pipeline components that are not required + nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "textcat"]) + + # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one + # of the surrounding words + # Prefixes + spacy_prefixes = nlp.Defaults.prefixes + [r"""^[-โ€“โ€”]+""", ] + prefix_regex = compile_prefix_regex(spacy_prefixes) + nlp.tokenizer.prefix_search = prefix_regex.search + # Infixes + spacy_infixes = nlp.Defaults.infixes + [r"""[-โ€“โ€”\"\~\(\[]+""", ] + infix_regex = compile_infix_regex(spacy_infixes) + nlp.tokenizer.infix_finditer = infix_regex.finditer + # Suffixes + spacy_suffixes = nlp.Defaults.suffixes + [r"""[-โ€“โ€”]+$""", ] + suffix_regex = compile_suffix_regex(spacy_suffixes) + nlp.tokenizer.suffix_search = suffix_regex.search + + def add_span(matcher, doc, i, matches): + match_id, start, end = matches[i] + + # Define the phrase to match + with resource_path.with_name('readlex_converter_phrases.json').open('r', newline="") as f: + reader = csv.reader(f) + phrases = [] + for i in reader: + phrases.append(i[0]) + phrase_patterns = [nlp.make_doc(phrase) for phrase in phrases] + phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") + phrase_matcher.add("phrases", phrase_patterns, on_match=add_span) + + # # Define the HTML element patterns to match + # html_patterns = [[{"TEXT": {"REGEX": "(?<=<)"}}, + # {"OP": "*", "TEXT": {"REGEX": "[^<>]"}}, + # {"TEXT": {"REGEX": "(?=>)"}}], + # [{'LOWER': '<'}, + # {'LOWER': 'style'}, + # {'OP': '*', 'IS_ASCII': True}, + # {'LOWER': '/style'}, + # {'LOWER': '>'}], + # [{'LOWER': '<'}, + # {'LOWER': 'script'}, + # {'OP': '*', 'IS_ASCII': True}, + # {'LOWER': '/script'}, + # {'LOWER': '>'}] + # ] + # matcher = Matcher(nlp.vocab) + # matcher.add("html_elements", html_patterns, on_match=add_span) + + namer_dot_ents = ["PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"] + + def tokenise(str): + # Tokenise and tag the text using spaCy as doc + + doc = nlp(str) + # matches = matcher(doc) + phrase_matches = phrase_matcher(doc) + + # html_spans = [] + # for match_id, start, end in matches: + # span = Span(doc, start, end, label=match_id) + # html_spans.append(span) + + phrase_spans = [] + for match_id, start, end in phrase_matches: + span = Span(doc, start, end, label=match_id) + phrase_spans.append(span) + + # all_spans = html_spans + # for i in phrase_spans: + # all_spans.append(i) + # filtered_spans = filter_spans(all_spans) + + filtered_spans = filter_spans(phrase_spans) + + with doc.retokenize() as retokenizer: + for span in filtered_spans: + # if span.label_ == "html_elements": + # retokenizer.merge(span, attrs={"TAG": "HTML"}) + # else: + retokenizer.merge(span) + + # Expand person entities to include titles and take initial 'the' out of entity names + titles = [ + "archbishop", + "archdeacon", + "baron", + "baroness", + "bishop", + "captain", + "count", + "countess", + "cpt", + "dame", + "deacon", + "doctor", + "dr.", + "dr", + "duchess", + "duke", + "earl", + "emperor", + "empress", + "gov.", + "gov", + "governor", + "justice", + "king", + "lady", + "lord", + "marchioness", + "marquess", + "marquis", + "miss", + "missus", + "mister", + "mistress", + "mr.", + "mr", + "mrs.", + "mrs", + "ms.", + "ms", + "mx.", + "mx", + "pope", + "pres.", + "pres", + "president", + "prince", + "princess", + "prof.", + "prof", + "professor", + "queen", + "rev.", + "rev", + "reverend", + "saint", + "sen.", + "sen", + "senator", + "sir", + "st.", + "st", + "viscount", + "viscountess" + ] + new_ents = [] + for ent in doc.ents: + # Only check for title if it's a person and not the first token + if ent.label_ == "PERSON" and ent.start != 0: + prev_token = doc[ent.start - 1] + if prev_token.lower_ in titles: + new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label) + new_ents.append(new_ent) + else: + new_ents.append(ent) + elif ent.label_ in namer_dot_ents: + if doc[ent.start].lower_ == "the": + new_ent = Span(doc, ent.start + 1, ent.end, label=ent.label) + new_ents.append(new_ent) + else: + new_ents.append(ent) + else: + new_ents.append(ent) + doc.ents = filter_spans(new_ents) + + return doc + + def convert(doc): + # Apply a series of tests to each token to determine how to Shavianise it. + text_split_shaw = "" + for token in doc: + + # Leave HTML tags unchanged + if token.tag_ == "HTML": + text_split_shaw += token.text + + # Convert contractions + elif token.lower_ in contraction_start and doc[token.i + 1].lower_ in contraction_end: + text_split_shaw += contraction_start[token.lower_] + elif token.lower_ in contraction_end: + if token.lower_ != "๐‘ผ" and len(text_split_shaw) > 0 and text_split_shaw[-1] in consonants: + text_split_shaw += "๐‘ฉ" + contraction_end[token.lower_] + token.whitespace_ + else: + text_split_shaw += contraction_end[token.lower_] + token.whitespace_ + + # Convert possessive 's + elif token.lower_ == "'s": + if text_split_shaw[-1] in s_follows: + text_split_shaw += "๐‘•" + token.whitespace_ + elif text_split_shaw[-1] in uhz_follows: + text_split_shaw += "๐‘ฉ๐‘Ÿ" + token.whitespace_ + else: + text_split_shaw += "๐‘Ÿ" + token.whitespace_ + + # Convert possessive ' + elif token.lower_ == "'" and token.tag_ == "POS": + text_split_shaw += token.whitespace_ + + # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to' + elif token.lower_ in before_to and token.i < (len(doc)-1) and doc[token.i + 1].lower_ == "to": + # 'have' only changes pronunciation where 'have to' means 'must' + if token.lower_ in have_to: + if doc[token.i + 2].tag_ in ["VB", "VBP"]: + text_split_shaw += have_to[token.lower_] + token.whitespace_ + # else: + # text_split_shaw += "๐‘ฃ๐‘จ๐‘Ÿ" + token.whitespace_ + # 'used', 'supposed' etc. only change pronunciation in the past tense, not past participle + elif token.lower_ in vbd_to and token.tag_ in ["VBD", "VBN", "."]: + text_split_shaw += vbd_to[token.lower_] + token.whitespace_ + + # Match ordinal numbers represented by a numeral and a suffix + elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_): + match = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_) + number = match.group(1) + number_suffix = match.group(2) + text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_ + + # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word + # in a name (or not at all for initialisms marked with โธฐ) + elif token.lower_ in readlex_dict: + for i in readlex_dict.get(token.lower_, []): + # Match the part of speech for heteronyms + if i["tag"] == token.tag_: + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( + "โธฐ"): + text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ + else: + text_split_shaw += i["Shaw"] + token.whitespace_ + break + # For any proper nouns not in the ReadLex, match if an identical common noun exists + elif i["tag"] in ["NN", "0"] and token.tag_ == "NNP" or i["tag"] in ["NNS", + "0"] and token.tag_ == "NNPS": + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( + "โธฐ"): + text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ + else: + text_split_shaw += i["Shaw"] + token.whitespace_ + break + # Match words with only one pronunciation + elif i["tag"] == "0": + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( + "โธฐ"): + text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ + else: + text_split_shaw += i["Shaw"] + token.whitespace_ + break + + # Apply additional tests where there is still no match + else: + found = False + constructed_warning = "โš ๏ธ" + # Try to construct a match using common prefixes and suffixes and include a warning symbol to aid proof + # reading + for j in affixes: + if token.lower_.startswith(j) and j in prefixes: + prefix = prefixes[j] + suffix = "" + target_word = token.lower_[len(j):] + elif token.lower_.endswith(j) and j in suffixes: + prefix = "" + suffix = suffixes[j] + suffix_length = len(j) + target_word = token.lower_[:-suffix_length] + else: + continue + if target_word in readlex_dict: + found = True + for i in readlex_dict.get(target_word): + if i["tag"] != "0" and i["tag"] == token.tag_: + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ + i["Shaw"].startswith("โธฐ"): + text_split_shaw += "ยท" + prefix + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + else: + text_split_shaw += prefix + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + break + elif i["tag"] == "0": + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ + i["Shaw"].startswith("โธฐ"): + text_split_shaw += "ยท" + prefix + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + else: + text_split_shaw += prefix + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + break + + # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names. + if token.lower_.endswith("s"): + target_word = token.lower_[:-1] + if target_word in readlex_dict: + found = True + for i in readlex_dict.get(target_word): + if i["Shaw"][-1] in s_follows: + suffix = "๐‘•" + elif i["Shaw"][-1] in uhz_follows: + suffix = "๐‘ฉ๐‘Ÿ" + else: + suffix = "๐‘Ÿ" + if i["tag"] != "0" and i["tag"] == token.tag_: + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ + i["Shaw"].startswith("โธฐ"): + text_split_shaw += "ยท" + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + else: + text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_ + break + elif i["tag"] == "0": + if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ + i["Shaw"].startswith("โธฐ"): + text_split_shaw += "ยท" + i[ + "Shaw"] + suffix + constructed_warning + token.whitespace_ + else: + text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_ + break + + # If there is still no match, do not convert the word + if found is False: + if token.text.isalpha(): + text_split_shaw += token.text + "โœข" + token.whitespace_ + else: + text_split_shaw += token.text + token.whitespace_ + + return text_split_shaw + + # Create the string that will contain the Shavianised text. + text_shaw = "" + + # Split up the string to reduce the risk of spaCy exceeding memory limits + if text.strip().casefold().startswith("]*>.*?)" + script_pattern = r"(]*>.*?)" + html_pattern = r"(?!(?:]*?>.*?|]*?>.*?))(<.*?>)" + html_patterns = f"{style_pattern}|{script_pattern}|{html_pattern}" + text_split = re.split(html_patterns, text, flags=re.DOTALL) + for text_part in text_split: + if text_part is None: + pass + elif re.fullmatch(style_pattern, text_part, flags=re.DOTALL): + text_shaw += text_part + elif re.fullmatch(script_pattern, text_part, flags=re.DOTALL): + text_shaw += text_part + elif re.fullmatch(html_pattern, text_part, flags=re.DOTALL): + text_shaw += text_part + else: + doc = tokenise(text_part) + text_shaw += convert(doc) + # Convert dumb quotes, double hyphens, etc. to their typographic equivalents + text_shaw = smartypants.smartypants(text_shaw) + # # Convert curly quotes to angle quotes + # quotation_marks = {"‘": "‹", "’": "›", "“": "«", "”": "»"} + # for key, value in quotation_marks.items(): + # text_shaw = text_shaw.replace(key, value) + + else: + text = unidecode.unidecode(text) + text = re.sub(r"(\S)(\[)", r"\1 \2", text) + text = re.sub(r"](\S)", r"] \1", text) + text_split = text.splitlines() + for i in text_split: + if len(i) < 10000: + doc = tokenise(i) + text_shaw += convert(doc) + "\n" + # Convert dumb quotes, double hyphens, etc. to their typographic equivalents + text_shaw = smartypants.smartypants(text_shaw) + quotation_marks = {"‘": "‹", "’": "›", "“": "«", "”": "»"} + for key, value in quotation_marks.items(): + text_shaw = text_shaw.replace(key, value) + text_shaw = str(BeautifulSoup(text_shaw, features="html.parser")) + + return text_shaw + +def main(): + with open("in", 'r') as in_file: + text_latin = in_file.read() + + text_shaw = latin2shaw(text_latin) + + with open("out", 'w') as out_file: + out_file.write(text_shaw) From 3ba122c99852a973d5dfd79c27021831a7014a33 Mon Sep 17 00:00:00 2001 From: Ingrid Date: Sat, 1 Jun 2024 17:18:42 +0200 Subject: [PATCH 2/5] add unit test and enforce formatting --- pyproject.toml | 4 +- requirements-dev.lock | 6 + src/readlex/__init__.py | 4 +- src/readlex/latin2shaw.py | 340 ++++++++++++++++++++++++--------- src/readlex/latin2shaw_test.py | 20 ++ 5 files changed, 278 insertions(+), 96 deletions(-) create mode 100644 src/readlex/latin2shaw_test.py diff --git a/pyproject.toml b/pyproject.toml index 6c64199..c44f8fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ build-backend = "hatchling.build" [tool.rye] managed = true -dev-dependencies = [] +dev-dependencies = [ + "pytest>=8.2.1", +] [tool.hatch.metadata] allow-direct-references = true diff --git a/requirements-dev.lock b/requirements-dev.lock index 4c3293f..8fa2854 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -40,6 +40,8 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_ # via readlex idna==3.7 # via requests +iniconfig==2.0.0 + # via pytest jinja2==3.1.4 # via spacy langcodes==3.4.0 @@ -59,9 +61,12 @@ numpy==1.26.4 # via spacy # via thinc packaging==24.0 + # via pytest # via spacy # via thinc # via weasel +pluggy==1.5.0 + # via pytest preshed==3.0.9 # via spacy # via thinc @@ -72,6 +77,7 @@ pydantic==2.7.2 # via weasel pydantic-core==2.18.3 # via pydantic +pytest==8.2.1 requests==2.32.3 # via spacy # via weasel diff --git a/src/readlex/__init__.py b/src/readlex/__init__.py index d2af850..b615e12 100644 --- a/src/readlex/__init__.py +++ b/src/readlex/__init__.py @@ -1,3 +1,5 @@ from readlex.latin2shaw import latin2shaw -__all__ = ['latin2shaw',] +__all__ = [ + "latin2shaw", +] diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py index 5b4c518..14da1f5 100644 --- a/src/readlex/latin2shaw.py +++ b/src/readlex/latin2shaw.py @@ -5,7 +5,12 @@ import smartypants import spacy -from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, filter_spans +from spacy.util import ( + compile_infix_regex, + compile_prefix_regex, + compile_suffix_regex, + filter_spans, +) from spacy.tokens import Span from spacy.matcher import PhraseMatcher # , Matcher @@ -17,7 +22,9 @@ def latin2shaw(text): # path where resource files (readlex.json etc.) are kept resource_path = Path(__file__).parent.parent - with resource_path.with_name('readlex_converter.json').open('r', encoding="utf-8") as f: + with resource_path.with_name("readlex_converter.json").open( + "r", encoding="utf-8" + ) as f: json_data = f.read() readlex_dict = json.loads(json_data) @@ -25,56 +32,90 @@ def latin2shaw(text): # Categories of letters that determine how a following 's is pronounced s_follows = {"๐‘", "๐‘‘", "๐‘’", "๐‘“", "๐‘”"} uhz_follows = {"๐‘•", "๐‘–", "๐‘—", "๐‘Ÿ", "๐‘ ", "๐‘ก"} - z_follows = {"๐‘š", "๐‘›", "๐‘œ", "๐‘", "๐‘ž", "๐‘™", "๐‘ค", "๐‘ฅ", "๐‘ฏ", "๐‘ธ", "๐‘น", "๐‘บ", "๐‘ป", "๐‘ผ", "๐‘ฝ"} + z_follows = { + "๐‘š", + "๐‘›", + "๐‘œ", + "๐‘", + "๐‘ž", + "๐‘™", + "๐‘ค", + "๐‘ฅ", + "๐‘ฏ", + "๐‘ธ", + "๐‘น", + "๐‘บ", + "๐‘ป", + "๐‘ผ", + "๐‘ฝ", + } consonants = set.union(s_follows, uhz_follows, z_follows) # vowels = {"๐‘ฆ", "๐‘ฐ", "๐‘ง", "๐‘ฑ", "๐‘จ", "๐‘ฒ", "๐‘ฉ", "๐‘ณ", "๐‘ช", "๐‘ด", "๐‘ซ", "๐‘ต", "๐‘ฌ", "๐‘ถ", "๐‘ญ", "๐‘ท", "๐‘พ", "๐‘ฟ"} # The following are never final other than in initialisms: "๐‘ฃ", "๐‘ข", "๐‘˜", "๐‘ฎ". # Contractions that need special treatment since the separate words are not as they appear in the dictionary - contraction_start = {"ai": "๐‘ฑ", "ca": "๐‘’๐‘ญ", "do": "๐‘›๐‘ด", "does": "๐‘›๐‘ณ๐‘Ÿ", "did": "๐‘›๐‘ฆ๐‘›", "sha": "๐‘–๐‘ญ", "wo": "๐‘ข๐‘ด", - "y'": "๐‘˜"} - contraction_end = {"n't": "๐‘ฏ๐‘‘", "all": "๐‘ท๐‘ค", "'ve": "๐‘", "'ll": "๐‘ค", "'m": "๐‘ฅ", "'d": "๐‘›", "'re": "๐‘ผ"} + contraction_start = { + "ai": "๐‘ฑ", + "ca": "๐‘’๐‘ญ", + "do": "๐‘›๐‘ด", + "does": "๐‘›๐‘ณ๐‘Ÿ", + "did": "๐‘›๐‘ฆ๐‘›", + "sha": "๐‘–๐‘ญ", + "wo": "๐‘ข๐‘ด", + "y'": "๐‘˜", + } + contraction_end = { + "n't": "๐‘ฏ๐‘‘", + "all": "๐‘ท๐‘ค", + "'ve": "๐‘", + "'ll": "๐‘ค", + "'m": "๐‘ฅ", + "'d": "๐‘›", + "'re": "๐‘ผ", + } # Common prefixes and suffixes used in new coinings - prefixes = {"anti": "๐‘จ๐‘ฏ๐‘‘๐‘ฆ", - "counter": "๐‘’๐‘ฌ๐‘ฏ๐‘‘๐‘ผ", - "de": "๐‘›๐‘ฐ", - "dis": "๐‘›๐‘ฆ๐‘•", - "esque": "๐‘ง๐‘•๐‘’", - "hyper": "๐‘ฃ๐‘ฒ๐‘๐‘ผ", - "hypo": "๐‘ฃ๐‘ฒ๐‘๐‘ด", - "mega": "๐‘ฅ๐‘ง๐‘œ๐‘ฉ", - "meta": "๐‘ฅ๐‘ง๐‘‘๐‘ฉ", - "micro": "๐‘ฅ๐‘ฒ๐‘’๐‘ฎ๐‘ด", - "multi": "๐‘ฅ๐‘ณ๐‘ค๐‘‘๐‘ฆ", - "mis": "๐‘ฅ๐‘ฆ๐‘•", - "neuro": "๐‘ฏ๐‘˜๐‘ซ๐‘ผ๐‘ด", - "non": "๐‘ฏ๐‘ช๐‘ฏ", - "o'er": "๐‘ด๐‘ผ", - "out": "๐‘ฌ๐‘‘", - "over": "๐‘ด๐‘๐‘ผ", - "poly": "๐‘๐‘ช๐‘ค๐‘ฆ", - "post": "๐‘๐‘ด๐‘•๐‘‘", - "pre": "๐‘๐‘ฎ๐‘ฐ", - "pro": "๐‘๐‘ฎ๐‘ด", - "pseudo": "๐‘•๐‘ฟ๐‘›๐‘ด", - "re": "๐‘ฎ๐‘ฐ", - "sub": "๐‘•๐‘ณ๐‘š", - "super": "๐‘•๐‘ต๐‘๐‘ผ", - "ultra": "๐‘ณ๐‘ค๐‘‘๐‘ฎ๐‘ฉ", - "un": "๐‘ณ๐‘ฏ", - "under": "๐‘ณ๐‘ฏ๐‘›๐‘ผ" - } - suffixes = {"able": "๐‘ฉ๐‘š๐‘ฉ๐‘ค", - "bound": "๐‘š๐‘ฌ๐‘ฏ๐‘›", - "ful": "๐‘“๐‘ฉ๐‘ค", - "hood": "๐‘ฃ๐‘ซ๐‘›", - "ish": "๐‘ฆ๐‘–", - "ism": "๐‘ฆ๐‘Ÿ๐‘ฉ๐‘ฅ", - "less": "๐‘ค๐‘ฉ๐‘•", - "like": "๐‘ค๐‘ฒ๐‘’", - "ness": "๐‘ฏ๐‘ฉ๐‘•" - } + prefixes = { + "anti": "๐‘จ๐‘ฏ๐‘‘๐‘ฆ", + "counter": "๐‘’๐‘ฌ๐‘ฏ๐‘‘๐‘ผ", + "de": "๐‘›๐‘ฐ", + "dis": "๐‘›๐‘ฆ๐‘•", + "esque": "๐‘ง๐‘•๐‘’", + "hyper": "๐‘ฃ๐‘ฒ๐‘๐‘ผ", + "hypo": "๐‘ฃ๐‘ฒ๐‘๐‘ด", + "mega": "๐‘ฅ๐‘ง๐‘œ๐‘ฉ", + "meta": "๐‘ฅ๐‘ง๐‘‘๐‘ฉ", + "micro": "๐‘ฅ๐‘ฒ๐‘’๐‘ฎ๐‘ด", + "multi": "๐‘ฅ๐‘ณ๐‘ค๐‘‘๐‘ฆ", + "mis": "๐‘ฅ๐‘ฆ๐‘•", + "neuro": "๐‘ฏ๐‘˜๐‘ซ๐‘ผ๐‘ด", + "non": "๐‘ฏ๐‘ช๐‘ฏ", + "o'er": "๐‘ด๐‘ผ", + "out": "๐‘ฌ๐‘‘", + "over": "๐‘ด๐‘๐‘ผ", + "poly": "๐‘๐‘ช๐‘ค๐‘ฆ", + "post": "๐‘๐‘ด๐‘•๐‘‘", + "pre": "๐‘๐‘ฎ๐‘ฐ", + "pro": "๐‘๐‘ฎ๐‘ด", + "pseudo": "๐‘•๐‘ฟ๐‘›๐‘ด", + "re": "๐‘ฎ๐‘ฐ", + "sub": "๐‘•๐‘ณ๐‘š", + "super": "๐‘•๐‘ต๐‘๐‘ผ", + "ultra": "๐‘ณ๐‘ค๐‘‘๐‘ฎ๐‘ฉ", + "un": "๐‘ณ๐‘ฏ", + "under": "๐‘ณ๐‘ฏ๐‘›๐‘ผ", + } + suffixes = { + "able": "๐‘ฉ๐‘š๐‘ฉ๐‘ค", + "bound": "๐‘š๐‘ฌ๐‘ฏ๐‘›", + "ful": "๐‘“๐‘ฉ๐‘ค", + "hood": "๐‘ฃ๐‘ซ๐‘›", + "ish": "๐‘ฆ๐‘–", + "ism": "๐‘ฆ๐‘Ÿ๐‘ฉ๐‘ฅ", + "less": "๐‘ค๐‘ฉ๐‘•", + "like": "๐‘ค๐‘ฒ๐‘’", + "ness": "๐‘ฏ๐‘ฉ๐‘•", + } affixes = prefixes | suffixes # Words that sometimes change spelling before 'to' @@ -91,15 +132,21 @@ def latin2shaw(text): # Customise the spaCy tokeniser to ensure that initial and final dashes and dashes between words aren't stuck to one # of the surrounding words # Prefixes - spacy_prefixes = nlp.Defaults.prefixes + [r"""^[-โ€“โ€”]+""", ] + spacy_prefixes = nlp.Defaults.prefixes + [ + r"""^[-โ€“โ€”]+""", + ] prefix_regex = compile_prefix_regex(spacy_prefixes) nlp.tokenizer.prefix_search = prefix_regex.search # Infixes - spacy_infixes = nlp.Defaults.infixes + [r"""[-โ€“โ€”\"\~\(\[]+""", ] + spacy_infixes = nlp.Defaults.infixes + [ + r"""[-โ€“โ€”\"\~\(\[]+""", + ] infix_regex = compile_infix_regex(spacy_infixes) nlp.tokenizer.infix_finditer = infix_regex.finditer # Suffixes - spacy_suffixes = nlp.Defaults.suffixes + [r"""[-โ€“โ€”]+$""", ] + spacy_suffixes = nlp.Defaults.suffixes + [ + r"""[-โ€“โ€”]+$""", + ] suffix_regex = compile_suffix_regex(spacy_suffixes) nlp.tokenizer.suffix_search = suffix_regex.search @@ -107,7 +154,9 @@ def add_span(matcher, doc, i, matches): match_id, start, end = matches[i] # Define the phrase to match - with resource_path.with_name('readlex_converter_phrases.json').open('r', newline="") as f: + with resource_path.with_name("readlex_converter_phrases.json").open( + "r", newline="" + ) as f: reader = csv.reader(f) phrases = [] for i in reader: @@ -134,7 +183,17 @@ def add_span(matcher, doc, i, matches): # matcher = Matcher(nlp.vocab) # matcher.add("html_elements", html_patterns, on_match=add_span) - namer_dot_ents = ["PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"] + namer_dot_ents = [ + "PERSON", + "FAC", + "ORG", + "GPE", + "LOC", + "PRODUCT", + "EVENT", + "WORK_OF_ART", + "LAW", + ] def tokenise(str): # Tokenise and tag the text using spaCy as doc @@ -231,7 +290,7 @@ def tokenise(str): "st.", "st", "viscount", - "viscountess" + "viscountess", ] new_ents = [] for ent in doc.ents: @@ -259,17 +318,25 @@ def convert(doc): # Apply a series of tests to each token to determine how to Shavianise it. text_split_shaw = "" for token in doc: - # Leave HTML tags unchanged if token.tag_ == "HTML": text_split_shaw += token.text # Convert contractions - elif token.lower_ in contraction_start and doc[token.i + 1].lower_ in contraction_end: + elif ( + token.lower_ in contraction_start + and doc[token.i + 1].lower_ in contraction_end + ): text_split_shaw += contraction_start[token.lower_] elif token.lower_ in contraction_end: - if token.lower_ != "๐‘ผ" and len(text_split_shaw) > 0 and text_split_shaw[-1] in consonants: - text_split_shaw += "๐‘ฉ" + contraction_end[token.lower_] + token.whitespace_ + if ( + token.lower_ != "๐‘ผ" + and len(text_split_shaw) > 0 + and text_split_shaw[-1] in consonants + ): + text_split_shaw += ( + "๐‘ฉ" + contraction_end[token.lower_] + token.whitespace_ + ) else: text_split_shaw += contraction_end[token.lower_] + token.whitespace_ @@ -287,7 +354,11 @@ def convert(doc): text_split_shaw += token.whitespace_ # Convert verbs that change pronunciation before 'to', e.g. 'have to', 'used to', 'supposed to' - elif token.lower_ in before_to and token.i < (len(doc)-1) and doc[token.i + 1].lower_ == "to": + elif ( + token.lower_ in before_to + and token.i < (len(doc) - 1) + and doc[token.i + 1].lower_ == "to" + ): # 'have' only changes pronunciation where 'have to' means 'must' if token.lower_ in have_to: if doc[token.i + 2].tag_ in ["VB", "VBP"]: @@ -299,11 +370,17 @@ def convert(doc): text_split_shaw += vbd_to[token.lower_] + token.whitespace_ # Match ordinal numbers represented by a numeral and a suffix - elif re.fullmatch(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_): - match = re.match(r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_) + elif re.fullmatch( + r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_ + ): + match = re.match( + r"([0-9]+(?:[, .]?[0-9]+)*)(st|nd|rd|th|s)", token.lower_ + ) number = match.group(1) number_suffix = match.group(2) - text_split_shaw += number + ordinal_suffixes[number_suffix] + token.whitespace_ + text_split_shaw += ( + number + ordinal_suffixes[number_suffix] + token.whitespace_ + ) # Loop through the words in the ReadLex and look for matches, and only apply the namer dot to the first word # in a name (or not at all for initialisms marked with โธฐ) @@ -311,25 +388,38 @@ def convert(doc): for i in readlex_dict.get(token.lower_, []): # Match the part of speech for heteronyms if i["tag"] == token.tag_: - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( - "โธฐ"): + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ else: text_split_shaw += i["Shaw"] + token.whitespace_ break # For any proper nouns not in the ReadLex, match if an identical common noun exists - elif i["tag"] in ["NN", "0"] and token.tag_ == "NNP" or i["tag"] in ["NNS", - "0"] and token.tag_ == "NNPS": - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( - "โธฐ"): + elif ( + i["tag"] in ["NN", "0"] + and token.tag_ == "NNP" + or i["tag"] in ["NNS", "0"] + and token.tag_ == "NNPS" + ): + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ else: text_split_shaw += i["Shaw"] + token.whitespace_ break # Match words with only one pronunciation elif i["tag"] == "0": - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not i["Shaw"].startswith( - "โธฐ"): + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): text_split_shaw += "ยท" + i["Shaw"] + token.whitespace_ else: text_split_shaw += i["Shaw"] + token.whitespace_ @@ -345,7 +435,7 @@ def convert(doc): if token.lower_.startswith(j) and j in prefixes: prefix = prefixes[j] suffix = "" - target_word = token.lower_[len(j):] + target_word = token.lower_[len(j) :] elif token.lower_.endswith(j) and j in suffixes: prefix = "" suffix = suffixes[j] @@ -357,22 +447,50 @@ def convert(doc): found = True for i in readlex_dict.get(target_word): if i["tag"] != "0" and i["tag"] == token.tag_: - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i["Shaw"].startswith("โธฐ"): - text_split_shaw += "ยท" + prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): + text_split_shaw += ( + "ยท" + + prefix + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) else: - text_split_shaw += prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + text_split_shaw += ( + prefix + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) break elif i["tag"] == "0": - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i["Shaw"].startswith("โธฐ"): - text_split_shaw += "ยท" + prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): + text_split_shaw += ( + "ยท" + + prefix + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) else: - text_split_shaw += prefix + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + text_split_shaw += ( + prefix + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) break # Try to construct plurals if not expressly included in the ReadLex, e.g. plurals of proper names. @@ -388,20 +506,46 @@ def convert(doc): else: suffix = "๐‘Ÿ" if i["tag"] != "0" and i["tag"] == token.tag_: - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i["Shaw"].startswith("โธฐ"): - text_split_shaw += "ยท" + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): + text_split_shaw += ( + "ยท" + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) else: - text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_ + text_split_shaw += ( + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) break elif i["tag"] == "0": - if token.ent_iob_ == "B" and token.ent_type_ in namer_dot_ents and not \ - i["Shaw"].startswith("โธฐ"): - text_split_shaw += "ยท" + i[ - "Shaw"] + suffix + constructed_warning + token.whitespace_ + if ( + token.ent_iob_ == "B" + and token.ent_type_ in namer_dot_ents + and not i["Shaw"].startswith("โธฐ") + ): + text_split_shaw += ( + "ยท" + + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) else: - text_split_shaw += i["Shaw"] + suffix + constructed_warning + token.whitespace_ + text_split_shaw += ( + i["Shaw"] + + suffix + + constructed_warning + + token.whitespace_ + ) break # If there is still no match, do not convert the word @@ -420,7 +564,9 @@ def convert(doc): if text.strip().casefold().startswith("]*>.*?)" script_pattern = r"(]*>.*?)" - html_pattern = r"(?!(?:]*?>.*?|]*?>.*?))(<.*?>)" + html_pattern = ( + r"(?!(?:]*?>.*?|]*?>.*?))(<.*?>)" + ) html_patterns = f"{style_pattern}|{script_pattern}|{html_pattern}" text_split = re.split(html_patterns, text, flags=re.DOTALL) for text_part in text_split: @@ -453,18 +599,24 @@ def convert(doc): text_shaw += convert(doc) + "\n" # Convert dumb quotes, double hyphens, etc. to their typographic equivalents text_shaw = smartypants.smartypants(text_shaw) - quotation_marks = {"‘": "‹", "’": "›", "“": "«", "”": "»"} + quotation_marks = { + "‘": "‹", + "’": "›", + "“": "«", + "”": "»", + } for key, value in quotation_marks.items(): text_shaw = text_shaw.replace(key, value) text_shaw = str(BeautifulSoup(text_shaw, features="html.parser")) return text_shaw + def main(): - with open("in", 'r') as in_file: + with open("in", "r") as in_file: text_latin = in_file.read() text_shaw = latin2shaw(text_latin) - with open("out", 'w') as out_file: + with open("out", "w") as out_file: out_file.write(text_shaw) diff --git a/src/readlex/latin2shaw_test.py b/src/readlex/latin2shaw_test.py new file mode 100644 index 0000000..a3729a9 --- /dev/null +++ b/src/readlex/latin2shaw_test.py @@ -0,0 +1,20 @@ +from readlex import latin2shaw + + +def test_latin2shaw(): + text_latin = """ +ANDROCLES AND THE LION + +PROLOGUE + +Overture: forest sounds, roaring of lions, Christian hymn faintly. + """ + text_shaw = """ +ยท๐‘จ๐‘ฏ๐‘›๐‘ฎ๐‘ฉ๐‘’๐‘ค๐‘ฐ๐‘Ÿ ๐‘ฏ ๐‘ž ๐‘ค๐‘ฒ๐‘ฉ๐‘ฏ + +๐‘๐‘ฎ๐‘ด๐‘ค๐‘ช๐‘œ + +๐‘ด๐‘๐‘ผ๐‘—๐‘ซ๐‘ผ: ๐‘“๐‘ช๐‘ฎ๐‘ฆ๐‘•๐‘‘ ๐‘•๐‘ฌ๐‘ฏ๐‘›๐‘Ÿ, ๐‘ฎ๐‘น๐‘ฆ๐‘™ ๐‘ ๐‘ค๐‘ฒ๐‘ฉ๐‘ฏ๐‘Ÿ, ๐‘’๐‘ฎ๐‘ฆ๐‘•๐‘—๐‘ฉ๐‘ฏ ๐‘ฃ๐‘ฆ๐‘ฅ ๐‘“๐‘ฑ๐‘ฏ๐‘‘๐‘ค๐‘ฆ. + \n""" # TODO: the trailing newline here seems to be added by latin2shaw, not sure if that's a bug? + + assert latin2shaw(text_latin) == text_shaw From 259765c103104ca0b186e4a35504be24cb62a45f Mon Sep 17 00:00:00 2001 From: Ingrid Date: Sat, 1 Jun 2024 17:57:02 +0200 Subject: [PATCH 3/5] add command line args for latin2shaw script --- pyproject.toml | 1 + requirements-dev.lock | 9 +++++++++ requirements.lock | 9 +++++++++ src/readlex/latin2shaw.py | 27 +++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c44f8fd..26d788b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "smartypants>=2.0.1", "bs4>=0.0.2", "en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl", + "typed-argument-parser>=1.10.0", ] readme = "README.md" requires-python = ">= 3.8" diff --git a/requirements-dev.lock b/requirements-dev.lock index 8fa2854..7a5b103 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -36,6 +36,8 @@ cymem==2.0.8 # via preshed # via spacy # via thinc +docstring-parser==0.16 + # via typed-argument-parser en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl # via readlex idna==3.7 @@ -56,6 +58,8 @@ murmurhash==1.0.10 # via preshed # via spacy # via thinc +mypy-extensions==1.0.0 + # via typing-inspect numpy==1.26.4 # via blis # via spacy @@ -108,6 +112,8 @@ thinc==8.2.3 # via spacy tqdm==4.66.4 # via spacy +typed-argument-parser==1.10.0 + # via readlex typer==0.9.4 # via spacy # via weasel @@ -115,6 +121,9 @@ typing-extensions==4.12.0 # via pydantic # via pydantic-core # via typer + # via typing-inspect +typing-inspect==0.9.0 + # via typed-argument-parser unidecode==1.3.8 # via readlex urllib3==2.2.1 diff --git a/requirements.lock b/requirements.lock index 4c3293f..3b26bdd 100644 --- a/requirements.lock +++ b/requirements.lock @@ -36,6 +36,8 @@ cymem==2.0.8 # via preshed # via spacy # via thinc +docstring-parser==0.16 + # via typed-argument-parser en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl # via readlex idna==3.7 @@ -54,6 +56,8 @@ murmurhash==1.0.10 # via preshed # via spacy # via thinc +mypy-extensions==1.0.0 + # via typing-inspect numpy==1.26.4 # via blis # via spacy @@ -102,6 +106,8 @@ thinc==8.2.3 # via spacy tqdm==4.66.4 # via spacy +typed-argument-parser==1.10.0 + # via readlex typer==0.9.4 # via spacy # via weasel @@ -109,6 +115,9 @@ typing-extensions==4.12.0 # via pydantic # via pydantic-core # via typer + # via typing-inspect +typing-inspect==0.9.0 + # via typed-argument-parser unidecode==1.3.8 # via readlex urllib3==2.2.1 diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py index 14da1f5..3cc4afe 100644 --- a/src/readlex/latin2shaw.py +++ b/src/readlex/latin2shaw.py @@ -612,11 +612,30 @@ def convert(doc): return text_shaw +from tap import Tap +import sys + + +class Args(Tap): + in_file: str = "" + """File to read latin text from, if not given, text will be read from stdin""" + out_file: str = "" + """File to output Shaw text to, if not given, text will be written to stdout""" + + def main(): - with open("in", "r") as in_file: - text_latin = in_file.read() + args = Args().parse_args() + + if args.in_file != "": + with open("in", "r") as in_file: + text_latin = in_file.read() + else: + text_latin = sys.stdin.read() text_shaw = latin2shaw(text_latin) - with open("out", "w") as out_file: - out_file.write(text_shaw) + if args.out_file != "": + with open("out", "w") as out_file: + out_file.write(text_shaw) + else: + sys.stdout.write(text_shaw) From 0e269a0516ca9c9ec69ae6b8a2d9e72e8b9d7e2d Mon Sep 17 00:00:00 2001 From: Ingrid Date: Sat, 1 Jun 2024 18:40:37 +0200 Subject: [PATCH 4/5] update README to use the packaged form of latin2shaw --- README.md | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8795d25..a88d813 100644 --- a/README.md +++ b/README.md @@ -28,17 +28,33 @@ The files readlex_converter.json and readlex_converter_phrases.json have been de The file readlex.dict (and addendum.dict) is adapted for use with Dave Coffin's scrips available at [Dechifro.org](https://www.dechifro.org/shavian/). -## latin2shaw.py +## src/readlex/latin2shaw.py -The file latin2shaw.py is the script I use for the ReadLex Converter. It uses spaCy for part of speech tagging. +The file latin2shaw.py contains the code I use for the ReadLex Converter. It uses spaCy for part of speech tagging. -To use latin2shaw.py you'll need to install the following packages with it: -- spaCy and an English language model (I use en_core_web_sm), following the instructions [here](https://spacy.io/usage) -- unidecode -- smartypants -- bs4 (BeautifulSoup) +To use it as a command line tool: -You will also need to make sure that latin2shaw.py points to the locations where you have saved readlex_converter.json and readlex_converter_phrases.json. +```bash +pip install readlex + +# the script can read from stdin and print to stdout +echo "hello world" | latin2shaw + +# or use files +latin2shaw --in_file in.txt --out_file out.txt +``` + +Once installed, it can also be used from python: + +```python +from readlex import latin2shaw + +print(latin2shaw("hello world")) +``` + +### For contributors + +Once you have cloned the repo and [installed rye](https://rye.astral.sh/), you can install/sync dependencies with `rye sync` and run the script with `rye run latin2shaw` ## Futher information From bdc6543a0d57807285d68d7271c58796f06fd564 Mon Sep 17 00:00:00 2001 From: Ingrid Date: Sat, 1 Jun 2024 21:48:40 +0200 Subject: [PATCH 5/5] fix bug with use of filename args in script --- src/readlex/latin2shaw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/readlex/latin2shaw.py b/src/readlex/latin2shaw.py index 3cc4afe..13c08f8 100644 --- a/src/readlex/latin2shaw.py +++ b/src/readlex/latin2shaw.py @@ -627,7 +627,7 @@ def main(): args = Args().parse_args() if args.in_file != "": - with open("in", "r") as in_file: + with open(args.in_file, "r") as in_file: text_latin = in_file.read() else: text_latin = sys.stdin.read() @@ -635,7 +635,7 @@ def main(): text_shaw = latin2shaw(text_latin) if args.out_file != "": - with open("out", "w") as out_file: + with open(args.out_file, "w") as out_file: out_file.write(text_shaw) else: sys.stdout.write(text_shaw)