1 change: 1 addition & 0 deletions .gitignore
@@ -169,3 +169,4 @@ cython_debug/

# PyPI configuration file
.pypirc
/04_speech_to_text/02_annotation/tmp/
2 changes: 2 additions & 0 deletions 02_transliteration/.gitignore
@@ -0,0 +1,2 @@
initial_eda/
../local_data/
1 change: 1 addition & 0 deletions 02_transliteration/.python-version
@@ -0,0 +1 @@
3.13
45 changes: 45 additions & 0 deletions 02_transliteration/01_reformatting/01_run_reformatter.py
@@ -0,0 +1,45 @@
import typer
import logging

from pathlib import Path
from typing_extensions import Annotated

from utils.reformatter import convert_txt_to_xml


def apply_reformatter(
    txt_file_path: Annotated[
        Path, typer.Option(help="Location to load the TXT dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/Japanese_records_personal_names_w_800_20250325.txt"
    ),
    xml_file_dir: Annotated[
        Path, typer.Option(help="Directory to save the MARCXML dataset to.")
    ] = Path("../../local_data/02_transliteration/source_xml/"),
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Converts TXT file to MARCXML and saves to output dir under same file name
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

    # Ensure the output directory exists before writing
    xml_file_dir.mkdir(parents=True, exist_ok=True)

    xml_file_name = txt_file_path.with_suffix(".xml").name
    output_path = xml_file_dir / xml_file_name

convert_txt_to_xml(txt_input_path=txt_file_path, xml_output_path=output_path)


if __name__ == "__main__":
typer.run(apply_reformatter)
52 changes: 52 additions & 0 deletions 02_transliteration/01_reformatting/02_combine_xml.py
@@ -0,0 +1,52 @@
import typer
import logging

from pathlib import Path
from typing_extensions import Annotated

from utils.reformatter import parse_marc_xml, convert_list_to_marcxml


def combine_xml(
    xml_file_path_1: Annotated[
        Path, typer.Option(help="Location to load the first MARCXML dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/Japanese_records_personal_names_w_800_20250325.xml"
    ),
    xml_file_path_2: Annotated[
        Path, typer.Option(help="Location to load the second MARCXML dataset from.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/Japanese_records_w_880_20250314.xml"
    ),
    combined_xml_output_path: Annotated[
        Path, typer.Option(help="Location to save the combined MARCXML dataset to.")
    ] = Path(
        "../../local_data/02_transliteration/source_xml/all_unique_Japanese_records.xml"
    ),
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Loads in two MARCXML files, combines and saves a singular XML file.
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

xml_list_1 = parse_marc_xml(xml_file_path_1)
xml_list_2 = parse_marc_xml(xml_file_path_2)

    # convert_list_to_marcxml deduplicates identical records by default
    # (make_unique=True), yielding a union of the two files
    convert_list_to_marcxml(xml_list_1 + xml_list_2, combined_xml_output_path)


if __name__ == "__main__":
typer.run(combine_xml)
23 changes: 23 additions & 0 deletions 02_transliteration/01_reformatting/README.md
@@ -0,0 +1,23 @@
# Reformatting

This directory holds the code for creating XML files; it is to be run after downloading the cataloging record files.

## Scripts
### Reformat TXT to MARCXML
- [01_run_reformatter.py](./01_run_reformatter.py)
- Loads a TXT file containing cataloging records
- Saves it as a MARCXML file to the output directory under the same file name (see the sketch below)
- Runs locally
- Running:
`uv run 01_run_reformatter.py --txt-file-path=path/to/txt_file --xml-file-dir=path/to/XML_output_directory`
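
The TXT export is assumed to already contain MARCXML markup, so the reformatter is essentially a verbatim copy that swaps the file extension. A minimal sketch of that step (the paths are hypothetical):

```python
from pathlib import Path

txt_file_path = Path("local_data/example_records.txt")  # hypothetical input
xml_file_dir = Path("local_data/source_xml/")           # hypothetical output dir

# Read the MARCXML content out of the TXT export and write it back
# under the same file name, with .txt swapped for .xml
marcxml_content = txt_file_path.read_text(encoding="utf-8")
output_path = xml_file_dir / txt_file_path.with_suffix(".xml").name
output_path.write_text(marcxml_content, encoding="utf-8")
```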

### Combine XML files
- [02_combine_xml.py](./02_combine_xml.py)
- Loads two XML files
- Converts each file to a list of record dictionaries
- Deduplicates the combined records to form a [union](https://en.wikipedia.org/wiki/Union_(set_theory)) of the two files (see the sketch below)
- Converts the unique records back to the MARCXML structure
- Saves the result as an XML file to the output path
- Running:
`uv run 02_combine_xml.py --xml-file-path-1=path/to/first/xml_file --xml-file-path-2=path/to/second/xml_file --combined-xml-output-path=path/to/save/output/xml_file`
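
For reference, the union in `utils/reformatter.py` is computed by JSON-serializing each record dictionary and deduplicating through a set; a minimal sketch with hypothetical records:

```python
import json

records_1 = [{"001": "A"}, {"001": "B"}]  # hypothetical parsed records
records_2 = [{"001": "B"}, {"001": "C"}]

# sort_keys=True makes the serialization canonical, so identical records
# always produce identical strings and collapse in the set; note that the
# original record order is not preserved
unique = {json.dumps(r, sort_keys=True) for r in records_1 + records_2}
combined = [json.loads(r) for r in unique]
print(len(combined))  # 3 -- the duplicate "B" record appears only once
```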
Empty file.
100 changes: 100 additions & 0 deletions 02_transliteration/01_reformatting/utils/reformatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import json
import logging
import xml.etree.ElementTree as ET
from pathlib import Path

from lxml import etree

logger = logging.getLogger(__name__)


def convert_txt_to_xml(txt_input_path: Path, xml_output_path: Path):
    # The TXT export already contains MARCXML markup, so this "conversion"
    # is a verbatim copy that only changes the file extension
    marcxml_content = txt_input_path.read_text(encoding="utf-8")
    xml_output_path.write_text(marcxml_content, encoding="utf-8")

logger.info(
f"Conversion successful! The MARCXML file is saved as {xml_output_path}."
)


def parse_marc_xml(file_path: Path) -> list[dict]:
logger.info(f"Parsing {file_path}...")
tree = ET.parse(file_path)
root = tree.getroot()

namespace = {"marc": "http://www.loc.gov/MARC21/slim"}
records = []

for record in root.findall("marc:record", namespace):
record_dict = {}

for controlfield in record.findall("marc:controlfield", namespace):
tag = controlfield.attrib["tag"]
record_dict[tag] = controlfield.text

for datafield in record.findall("marc:datafield", namespace):
tag = datafield.attrib["tag"]
subfields = datafield.findall("marc:subfield", namespace)

subfield_data = {sf.attrib["code"]: sf.text for sf in subfields}

            # Repeated tags are collected into a list of subfield dicts;
            # field indicators (ind1/ind2) are not preserved by this parser
            if tag in record_dict:
if isinstance(record_dict[tag], list):
record_dict[tag].append(subfield_data)
else:
record_dict[tag] = [record_dict[tag], subfield_data]
else:
record_dict[tag] = subfield_data

records.append(record_dict)

return records


def convert_list_to_marcxml(marc_list: list, output_file: Path, make_unique=True):
    if make_unique:
        # Canonical serialization (sort_keys=True) lets identical records
        # collapse in the set; note that record order is not preserved
        unique_data = list({json.dumps(d, sort_keys=True) for d in marc_list})
        marc_list = [json.loads(d) for d in unique_data]

    logger.info(f"Record count after deduplication: {len(marc_list)}")

collection = etree.Element("collection", xmlns="http://www.loc.gov/MARC21/slim")

for record_dict in marc_list:
record = etree.Element("record")

for tag, value in record_dict.items():
if isinstance(value, str):
controlfield = etree.Element("controlfield", tag=tag)
controlfield.text = value
record.append(controlfield)

elif isinstance(value, dict):
datafield = etree.Element("datafield", tag=tag, ind1=" ", ind2=" ")
for code, sub_value in value.items():
subfield = etree.Element("subfield", code=code)
subfield.text = sub_value
datafield.append(subfield)
record.append(datafield)

elif isinstance(value, list):
for subfield_dict in value:
datafield = etree.Element("datafield", tag=tag, ind1=" ", ind2=" ")
for code, sub_value in subfield_dict.items():
subfield = etree.Element("subfield", code=code)
subfield.text = sub_value
datafield.append(subfield)
record.append(datafield)

collection.append(record)
    logger.debug(f"Writing {len(collection)} records to {output_file}")
collection_tree = etree.ElementTree(collection)
with output_file.open("wb") as f:
collection_tree.write(
f, pretty_print=True, xml_declaration=True, encoding="UTF-8"
)

logger.info(f"Converted MARCXML saved to {output_file}")
103 changes: 103 additions & 0 deletions 02_transliteration/02_assembly/01_run_assembly.py
@@ -0,0 +1,103 @@
import typer
import logging

from utils.assembler import assemble_transliteration_df
from pathlib import Path
from typing_extensions import Annotated


# MARC name fields: x00 = personal, x10 = corporate (1xx = main entries,
# 6xx = subject entries, 7xx = added entries, 8xx = series entries);
# subfield "a" carries the name itself
FILTER_ON = {
    "no_filters": {},
    "personal_names": {"100": ["a"], "600": ["a"], "700": ["a"], "800": ["a"]},
    "corporate_names": {"110": ["a"], "610": ["a"], "710": ["a"], "810": ["a"]},
}

TO_EXCLUDE = {
"no_names": [
"100",
"110",
"600",
"610",
"700",
"710",
"800",
"810",
],
}


def apply_assembly(
xml_file_path: Annotated[
Path, typer.Option(help="Location to load in MARC XML dataset from.")
] = Path(
"../../local_data/02_transliteration/source_xml/Japanese_records_personal_names_w_800_20250325.xml"
),
output_dir: Annotated[
Path, typer.Option(help="Directory to save resulting dataframe.")
] = Path("../../local_data/02_transliteration/generated_data/"),
filtering_type: Annotated[
str,
typer.Option(
help="Type of filtering to be applied to original dataset (no_filters, personal_names, corporate_names)"
),
] = "no_filters",
    excluding_type: Annotated[
        str | None,
        typer.Option(
            help="Fields to be excluded from the original dataset (None, no_names)"
        ),
    ] = None,
log_level: Annotated[
str,
typer.Option(
help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)"
),
] = "INFO",
):
"""
Takes a MARCXML file.
Pulls out transliterated field (880) and pairs them
Applies filtering and exclusions
Save dataframe to output directory
"""
log_level = log_level.upper()
level_enum = getattr(logging, log_level, None)
if not isinstance(level_enum, int):
raise typer.BadParameter(f"Invalid log level: {log_level}")
logging.basicConfig(
level=level_enum,
format="%(asctime)s - %(name)s.%(funcName)s:%(lineno)d - %(levelname)s - %(message)s",
)

    if filtering_type not in FILTER_ON:
        raise typer.BadParameter(f"Invalid filtering type: {filtering_type}")

    if excluding_type:
        if excluding_type not in TO_EXCLUDE:
            raise typer.BadParameter(f"Invalid excluding type: {excluding_type}")
        to_exclude = TO_EXCLUDE[excluding_type]
    else:
        to_exclude = []

transliteration_df = assemble_transliteration_df(
file_path=xml_file_path,
filter_on=FILTER_ON.get(filtering_type),
to_exclude=to_exclude,
)

file_name = xml_file_path.stem

if to_exclude:
output_path = (
output_dir
/ f"transliteration_{file_name}_{filtering_type}_{excluding_type}.csv.gz"
)
else:
output_path = (
output_dir / f"transliteration_{file_name}_{filtering_type}.csv.gz"
)

    # Ensure the output directory exists before writing
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Saving to {output_path} ...")
    transliteration_df.to_csv(output_path, compression="gzip", index=False)


if __name__ == "__main__":
typer.run(apply_assembly)
14 changes: 14 additions & 0 deletions 02_transliteration/02_assembly/README.md
@@ -0,0 +1,14 @@
# Assembly

This directory holds the code for creating the dataframes (CSV files); it is to be run on the XML files produced by [reformatting](../01_reformatting/).

## Script
### Assemble Dataframe
- [01_run_assembly.py](./01_run_assembly.py)
- Loads an XML file
- Pulls out the transliterated (`880`) fields and pairs them with their linked source fields (see the sketch below)
- Creates a dataframe according to the chosen filtering and exclusion scheme
- Saves the dataframe to the output directory
- Runs locally
- Running:
`uv run 01_run_assembly.py --xml-file-path=path/to/xml_file --output-dir=path/to/save/output_dataframe --filtering-type=type_of_filtering_to_be_applied --excluding-type=type_of_exclusion_to_be_applied`
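
The pairing itself happens in `utils/assembler.py`, which is not part of this diff. In MARC21, an `880` field carries the vernacular script and links to its transliterated counterpart through subfield `$6` (e.g. `$6 100-01` in the `880` field points at the `100` field whose own `$6` reads `880-01`). A minimal sketch of that linkage, assuming the record dictionaries produced by `parse_marc_xml` (the actual assembler logic may differ):

```python
# Hypothetical record in the dict form produced by parse_marc_xml
record = {
    "100": {"a": "Natsume, Sōseki,", "6": "880-01"},
    "880": {"a": "夏目漱石,", "6": "100-01"},
}

fields_880 = record.get("880", [])
if isinstance(fields_880, dict):  # a single 880 parses to a dict, not a list
    fields_880 = [fields_880]

pairs = []
for field in fields_880:
    # $6 of an 880 field looks like "100-01": the linked tag plus an
    # occurrence number shared with the counterpart field's own $6
    linked_tag, occurrence = field["6"].split("-")[:2]
    counterpart = record.get(linked_tag)
    if counterpart and counterpart.get("6", "").startswith(f"880-{occurrence}"):
        pairs.append((counterpart["a"], field["a"]))

print(pairs)  # [('Natsume, Sōseki,', '夏目漱石,')]
```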
Empty file.