1 change: 1 addition & 0 deletions .gitignore
@@ -169,3 +169,4 @@ cython_debug/

# PyPI configuration file
.pypirc
/04_speech_to_text/02_annotation/tmp/
2 changes: 2 additions & 0 deletions 02_transliteration/.gitignore
@@ -0,0 +1,2 @@
initial_eda/
../local_data/
1 change: 1 addition & 0 deletions 02_transliteration/.python-version
@@ -0,0 +1 @@
3.13
45 changes: 45 additions & 0 deletions 02_transliteration/01_reformatting/01_run_reformatter.py
@@ -0,0 +1,45 @@
import typer
import logging

from pathlib import Path
from typing_extensions import Annotated

from utils.reformatter import convert_txt_to_xml


def apply_reformatter(
    txt_file_path: Annotated[
        Path, typer.Option(help="Location to load the TXT dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/Japanese_records_personal_names_w_800_20250325.txt"
    ),
    xml_file_dir: Annotated[
        Path, typer.Option(help="Directory to save the MARCXML dataset to.")
    ] = Path("../../local_data/02_transliteration/source_xml/"),
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Converts TXT file to MARCXML and saves to output dir under same file name
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

    # Ensure the output directory exists before writing
    xml_file_dir.mkdir(parents=True, exist_ok=True)

    xml_file_name = txt_file_path.with_suffix(".xml").name
    output_path = xml_file_dir / xml_file_name

convert_txt_to_xml(txt_input_path=txt_file_path, xml_output_path=output_path)


if __name__ == "__main__":
typer.run(apply_reformatter)
52 changes: 52 additions & 0 deletions 02_transliteration/01_reformatting/02_combine_xml.py
@@ -0,0 +1,52 @@
import typer
import logging

from pathlib import Path
from typing_extensions import Annotated

from utils.reformatter import parse_marc_xml, convert_list_to_marcxml


def combine_xml(
    xml_file_path_1: Annotated[
        Path, typer.Option(help="Location to load the first MARCXML dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/Japanese_records_personal_names_w_800_20250325.xml"
    ),
    xml_file_path_2: Annotated[
        Path, typer.Option(help="Location to load the second MARCXML dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/Japanese_records_w_880_20250314.xml"
    ),
    combined_xml_output_path: Annotated[
        Path, typer.Option(help="Location to save the combined MARCXML dataset to.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/all_unique_Japanese_records.xml"
    ),
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Loads in two MARCXML files, combines and saves a singular XML file.
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

xml_list_1 = parse_marc_xml(xml_file_path_1)
xml_list_2 = parse_marc_xml(xml_file_path_2)

    # convert_list_to_marcxml deduplicates identical records by default
    # (make_unique=True), yielding a union of the two files
    convert_list_to_marcxml(xml_list_1 + xml_list_2, combined_xml_output_path)


if __name__ == "__main__":
typer.run(combine_xml)
23 changes: 23 additions & 0 deletions 02_transliteration/01_reformatting/README.md
@@ -0,0 +1,23 @@
# Reformatting

This directory holds the code for creating XML files; it is to be run after downloading the cataloging record files.

## Scripts
### Reformat TXT to MARCXML
- [01_run_reformatter.py](./01_run_reformatter.py)
- Loads a TXT file containing cataloging records
- Saves it as a MARCXML file to the output directory under the same file name (see the sketch below)
- Runs locally
- Running:
`uv run 01_run_reformatter.py --txt-file-path=path/to/txt_file --xml-file-dir=path/to/XML_output_directory`
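
The TXT export is assumed to already contain MARCXML markup, so the reformatter is essentially a verbatim copy that swaps the file extension. A minimal sketch of that step (the paths are hypothetical):

```python
from pathlib import Path

txt_file_path = Path("local_data/example_records.txt")  # hypothetical input
xml_file_dir = Path("local_data/source_xml/")           # hypothetical output dir

# Read the MARCXML content out of the TXT export and write it back
# under the same file name, with .txt swapped for .xml
marcxml_content = txt_file_path.read_text(encoding="utf-8")
output_path = xml_file_dir / txt_file_path.with_suffix(".xml").name
output_path.write_text(marcxml_content, encoding="utf-8")
```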

### Combine XML files
- [02_combine_xml.py](./02_combine_xml.py)
- Loads two XML files
- Converts each file to a list of record dictionaries
- Deduplicates the combined records to form a [union](https://en.wikipedia.org/wiki/Union_(set_theory)) of the two files (see the sketch below)
- Converts the unique records back to the MARCXML structure
- Saves the result as an XML file to the output path
- Running:
`uv run 02_combine_xml.py --xml-file-path-1=path/to/first/xml_file --xml-file-path-2=path/to/second/xml_file --combined-xml-output-path=path/to/save/output/xml_file`
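
For reference, the union in `utils/reformatter.py` is computed by JSON-serializing each record dictionary and deduplicating through a set; a minimal sketch with hypothetical records:

```python
import json

records_1 = [{"001": "A"}, {"001": "B"}]  # hypothetical parsed records
records_2 = [{"001": "B"}, {"001": "C"}]

# sort_keys=True makes the serialization canonical, so identical records
# always produce identical strings and collapse in the set; note that the
# original record order is not preserved
unique = {json.dumps(r, sort_keys=True) for r in records_1 + records_2}
combined = [json.loads(r) for r in unique]
print(len(combined))  # 3 -- the duplicate "B" record appears only once
```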
Empty file.
100 changes: 100 additions & 0 deletions 02_transliteration/01_reformatting/utils/reformatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import json
import logging
import xml.etree.ElementTree as ET
from pathlib import Path

from lxml import etree

logger = logging.getLogger(__name__)


def convert_txt_to_xml(txt_input_path: Path, xml_output_path: Path):
    # The TXT export already contains MARCXML markup, so this "conversion"
    # is a verbatim copy that only changes the file extension
    marcxml_content = txt_input_path.read_text(encoding="utf-8")
    xml_output_path.write_text(marcxml_content, encoding="utf-8")

logger.info(
f"Conversion successful! The MARCXML file is saved as {xml_output_path}."
)


def parse_marc_xml(file_path: Path) -> list[dict]:
logger.info(f"Parsing {file_path}...")
tree = ET.parse(file_path)
root = tree.getroot()

namespace = {"marc": "http://www.loc.gov/MARC21/slim"}
records = []

for record in root.findall("marc:record", namespace):
record_dict = {}

for controlfield in record.findall("marc:controlfield", namespace):
tag = controlfield.attrib["tag"]
record_dict[tag] = controlfield.text

for datafield in record.findall("marc:datafield", namespace):
tag = datafield.attrib["tag"]
subfields = datafield.findall("marc:subfield", namespace)

subfield_data = {sf.attrib["code"]: sf.text for sf in subfields}

            # Repeated tags are collected into a list of subfield dicts;
            # field indicators (ind1/ind2) are not preserved by this parser
            if tag in record_dict:
if isinstance(record_dict[tag], list):
record_dict[tag].append(subfield_data)
else:
record_dict[tag] = [record_dict[tag], subfield_data]
else:
record_dict[tag] = subfield_data

records.append(record_dict)

return records


def convert_list_to_marcxml(marc_list: list, output_file: Path, make_unique=True):
    if make_unique:
        # Canonical serialization (sort_keys=True) lets identical records
        # collapse in the set; note that record order is not preserved
        unique_data = list({json.dumps(d, sort_keys=True) for d in marc_list})
        marc_list = [json.loads(d) for d in unique_data]

    logger.info(f"Record count after deduplication: {len(marc_list)}")

collection = etree.Element("collection", xmlns="http://www.loc.gov/MARC21/slim")

for record_dict in marc_list:
record = etree.Element("record")

for tag, value in record_dict.items():
if isinstance(value, str):
controlfield = etree.Element("controlfield", tag=tag)
controlfield.text = value
record.append(controlfield)

elif isinstance(value, dict):
datafield = etree.Element("datafield", tag=tag, ind1=" ", ind2=" ")
for code, sub_value in value.items():
subfield = etree.Element("subfield", code=code)
subfield.text = sub_value
datafield.append(subfield)
record.append(datafield)

elif isinstance(value, list):
for subfield_dict in value:
datafield = etree.Element("datafield", tag=tag, ind1=" ", ind2=" ")
for code, sub_value in subfield_dict.items():
subfield = etree.Element("subfield", code=code)
subfield.text = sub_value
datafield.append(subfield)
record.append(datafield)

collection.append(record)
    logger.debug(f"Writing {len(collection)} records to {output_file}")
collection_tree = etree.ElementTree(collection)
with output_file.open("wb") as f:
collection_tree.write(
f, pretty_print=True, xml_declaration=True, encoding="UTF-8"
)

logger.info(f"Converted MARCXML saved to {output_file}")
103 changes: 103 additions & 0 deletions 02_transliteration/02_assembly/01_run_assembly.py
@@ -0,0 +1,103 @@
import typer
import logging

from utils.assembler import assemble_transliteration_df
from pathlib import Path
from typing_extensions import Annotated


# MARC name fields: x00 = personal, x10 = corporate (1xx = main entries,
# 6xx = subject entries, 7xx = added entries, 8xx = series entries);
# subfield "a" carries the name itself
FILTER_ON = {
    "no_filters": {},
    "personal_names": {"100": ["a"], "600": ["a"], "700": ["a"], "800": ["a"]},
    "corporate_names": {"110": ["a"], "610": ["a"], "710": ["a"], "810": ["a"]},
}

TO_EXCLUDE = {
"no_names": [
"100",
"110",
"600",
"610",
"700",
"710",
"800",
"810",
],
}


def apply_assembly(
xml_file_path: Annotated[
Path, typer.Option(help="Location to load in MARC XML dataset from.")
] = Path(
"../../local_data/02_transliteration/source_xml/Japanese_records_personal_names_w_800_20250325.xml"
),
output_dir: Annotated[
Path, typer.Option(help="Directory to save resulting dataframe.")
] = Path("../../local_data/02_transliteration/generated_data/"),
filtering_type: Annotated[
str,
typer.Option(
help="Type of filtering to be applied to original dataset (no_filters, personal_names, corporate_names)"
),
] = "no_filters",
    excluding_type: Annotated[
        str | None,
        typer.Option(
            help="Fields to be excluded from the original dataset (None, no_names)"
        ),
    ] = None,
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Takes a MARCXML file.
Pulls out transliterated field (880) and pairs them
Applies filtering and exclusions
Save dataframe to output directory
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

    if filtering_type not in FILTER_ON:
        raise typer.BadParameter(f"Invalid filtering type: {filtering_type}")

    if excluding_type:
        if excluding_type not in TO_EXCLUDE:
            raise typer.BadParameter(f"Invalid excluding type: {excluding_type}")
        to_exclude = TO_EXCLUDE[excluding_type]
    else:
        to_exclude = []

transliteration_df = assemble_transliteration_df(
file_path=xml_file_path,
filter_on=FILTER_ON.get(filtering_type),
to_exclude=to_exclude,
)

file_name = xml_file_path.stem

if to_exclude:
output_path = (
output_dir
/ f"transliteration_{file_name}_{filtering_type}_{excluding_type}.csv.gz"
)
else:
output_path = (
output_dir / f"transliteration_{file_name}_{filtering_type}.csv.gz"
)

    # Ensure the output directory exists before writing
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Saving to {output_path} ...")
    transliteration_df.to_csv(output_path, compression="gzip", index=False)


if __name__ == "__main__":
typer.run(apply_assembly)
14 changes: 14 additions & 0 deletions 02_transliteration/02_assembly/README.md
@@ -0,0 +1,14 @@
# Assembly

This directory holds the code for creating the dataframes (CSV files); it is to be run on the XML files produced by [reformatting](../01_reformatting/).

## Script
### Assemble Dataframe
- [01_run_assembly.py](./01_run_assembly.py)
- Loads an XML file
- Pulls out the transliterated (`880`) fields and pairs them with their linked source fields (see the sketch below)
- Creates a dataframe according to the chosen filtering and exclusion scheme
- Saves the dataframe to the output directory
- Runs locally
- Running:
`uv run 01_run_assembly.py --xml-file-path=path/to/xml_file --output-dir=path/to/save/output_dataframe --filtering-type=type_of_filtering_to_be_applied --excluding-type=type_of_exclusion_to_be_applied`
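
The pairing itself happens in `utils/assembler.py`, which is not part of this diff. In MARC21, an `880` field carries the vernacular script and links to its transliterated counterpart through subfield `$6` (e.g. `$6 100-01` in the `880` field points at the `100` field whose own `$6` reads `880-01`). A minimal sketch of that linkage, assuming the record dictionaries produced by `parse_marc_xml` (the actual assembler logic may differ):

```python
# Hypothetical record in the dict form produced by parse_marc_xml
record = {
    "100": {"a": "Natsume, Sōseki,", "6": "880-01"},
    "880": {"a": "夏目漱石,", "6": "100-01"},
}

fields_880 = record.get("880", [])
if isinstance(fields_880, dict):  # a single 880 parses to a dict, not a list
    fields_880 = [fields_880]

pairs = []
for field in fields_880:
    # $6 of an 880 field looks like "100-01": the linked tag plus an
    # occurrence number shared with the counterpart field's own $6
    linked_tag, occurrence = field["6"].split("-")[:2]
    counterpart = record.get(linked_tag)
    if counterpart and counterpart.get("6", "").startswith(f"880-{occurrence}"):
        pairs.append((counterpart["a"], field["a"]))

print(pairs)  # [('Natsume, Sōseki,', '夏目漱石,')]
```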
Empty file.