Commit ad9e732
Handle presence of TXT files from Transmogrifier
Why these changes are being introduced:

It was overlooked that Transmogrifier writes a text (TXT) file with records to delete as part of its output, and that this needs to be captured when collating and deduping records. Records whose last action was a delete should be removed from the dataset.

How this addresses that need:

* Updates places where a .json extension was assumed
* Updates run_ab_transforms validation to look for output files that indicate Transmogrifier produced something as output

Side effects of this change:

* None

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-371
1 parent 2b03153 commit ad9e732
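For context on the collating and deduping mentioned above: once deletions are collated as rows whose record value is empty, a last-action-wins dedup can drop any record whose final action was a delete. Below is a minimal sketch of that idea in DuckDB (which this repo already uses); the table and column names are illustrative, not the project's actual schema:

```python
# Minimal sketch of last-action-wins dedup, where a trailing "delete" action
# removes the record entirely; table/column names are illustrative only.
import duckdb

con = duckdb.connect()
con.execute(
    """
    CREATE TABLE transformed_records AS
    SELECT * FROM (VALUES
        ('rec-1', 1, 'index', '{"title": "v1"}'),
        ('rec-1', 2, 'delete', NULL),
        ('rec-2', 1, 'index', '{"title": "v2"}')
    ) AS t(timdex_record_id, seq, action, record)
    """
)
final = con.execute(
    """
    WITH ranked AS (
        SELECT *, ROW_NUMBER() OVER (
            PARTITION BY timdex_record_id ORDER BY seq DESC
        ) AS rn
        FROM transformed_records
    )
    SELECT timdex_record_id, record FROM ranked
    WHERE rn = 1 AND action != 'delete'
    """
).fetchall()
print(final)  # only rec-2 survives; rec-1's last action was a delete
```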

File tree: 4 files changed (+136, -39 lines changed)

abdiff/core/collate_ab_transforms.py

Lines changed: 29 additions & 14 deletions
@@ -10,6 +10,7 @@
 
 import duckdb
 import ijson
+import pandas as pd
 import pyarrow as pa
 
 from abdiff.core.exceptions import OutputValidationError
@@ -119,17 +120,34 @@ def get_transformed_records_iter(
     """
     version = get_transform_version(transformed_file)
     filename_details = parse_timdex_filename(transformed_file)
-    with open(transformed_file, "rb") as file:
-        for record in ijson.items(file, "item"):
+
+    base_record = {
+        "source": filename_details["source"],
+        "run_date": filename_details["run-date"],
+        "run_type": filename_details["run-type"],
+        "action": filename_details["action"],
+        "version": version,
+        "transformed_file_name": transformed_file.split("/")[-1],
+    }
+
+    # handle JSON files with records to index
+    if transformed_file.endswith(".json"):
+        with open(transformed_file, "rb") as file:
+            for record in ijson.items(file, "item"):
+                yield {
+                    **base_record,
+                    "timdex_record_id": record["timdex_record_id"],
+                    "record": json.dumps(record).encode(),
+                }
+
+    # handle TXT files with records to delete
+    else:
+        deleted_records_df = pd.read_csv(transformed_file, header=None)
+        for row in deleted_records_df.itertuples():
             yield {
-                "timdex_record_id": record["timdex_record_id"],
-                "source": filename_details["source"],
-                "run_date": filename_details["run-date"],  # use underscore for DuckDB
-                "run_type": filename_details["run-type"],  # use underscore for DuckDB
-                "action": filename_details["action"],
-                "record": json.dumps(record).encode(),
-                "version": version,
-                "transformed_file_name": transformed_file.split("/")[-1],
+                **base_record,
+                "timdex_record_id": row[1],
+                "record": None,
             }
 
 
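A note on the TXT branch above: pandas parses the deletions file as a headerless, one-column CSV, and with itertuples() position 0 is the DataFrame index while position 1 is that single column, i.e. the record id, which is why the code yields row[1]. A standalone illustration (the file contents are made up):

```python
import io

import pandas as pd

# stand-in for a Transmogrifier "...-records-to-delete.txt" file: one id per line
deletions_file = io.StringIO("abc123\ndef456\n")

deleted_records_df = pd.read_csv(deletions_file, header=None)
for row in deleted_records_df.itertuples():
    # row[0] is the DataFrame index; row[1] is the first (and only) column
    print(row[1])  # -> abc123, then def456
```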
@@ -364,10 +382,7 @@ def fetch_single_value(query: str) -> int:
 
 def get_transform_version(transformed_filepath: str) -> str:
     """Get A/B transform version, either 'a' or 'b'."""
-    match_result = re.match(
-        r".*transformed\/(.*)\/.*.json",
-        transformed_filepath,
-    )
+    match_result = re.match(r".*transformed\/(.*)\/.*", transformed_filepath)
     if not match_result:
         raise ValueError(f"Transformed filepath is invalid: {transformed_filepath}.")
 
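With the trailing .json dropped from the pattern, the version directory is now captured for both output types. A quick check against illustrative paths:

```python
import re

# illustrative paths; the pattern captures the "a"/"b" directory either way
for path in [
    "output/transformed/a/dspace-2024-04-10-daily-extracted-records-to-index.json",
    "output/transformed/b/dspace-2024-04-10-daily-extracted-records-to-delete.txt",
]:
    match_result = re.match(r".*transformed\/(.*)\/.*", path)
    assert match_result is not None
    print(match_result.group(1))  # -> "a", then "b"
```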

abdiff/core/run_ab_transforms.py

Lines changed: 32 additions & 17 deletions
@@ -110,7 +110,7 @@ def run_ab_transforms(
         "to complete successfully."
     )
     ab_transformed_file_lists = get_transformed_files(run_directory)
-    validate_output(ab_transformed_file_lists, len(input_files))
+    validate_output(ab_transformed_file_lists, input_files)
 
     # write and return results
     run_data = {
@@ -278,11 +278,11 @@ def get_transformed_files(run_directory: str) -> tuple[list[str], ...]:
 
     Returns:
         tuple[list[str]]: Tuple containing lists of paths to transformed
-            JSON files for each image, relative to 'run_directory'.
+            JSON and TXT (deletions) files for each image, relative to 'run_directory'.
     """
     ordered_files = []
     for version in ["a", "b"]:
-        absolute_filepaths = glob.glob(f"{run_directory}/transformed/{version}/*.json")
+        absolute_filepaths = glob.glob(f"{run_directory}/transformed/{version}/*")
         relative_filepaths = [
             os.path.relpath(file, run_directory) for file in absolute_filepaths
         ]
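Widening the glob from *.json to * is what lets deletion TXT files reach the collation step. A small demonstration (directory layout and filenames are made up):

```python
import glob
import os
import tempfile

# throwaway run directory holding both Transmogrifier output types (names made up)
run_directory = tempfile.mkdtemp()
os.makedirs(f"{run_directory}/transformed/a")
for name in ["records-to-index.json", "records-to-delete.txt"]:
    open(f"{run_directory}/transformed/a/{name}", "w").close()

# the old "*.json" glob would miss the TXT file; "*" collects both
print(sorted(glob.glob(f"{run_directory}/transformed/a/*")))
```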
@@ -291,24 +291,39 @@ def get_transformed_files(run_directory: str) -> tuple[list[str], ...]:
 
 
 def validate_output(
-    ab_transformed_file_lists: tuple[list[str], ...], input_files_count: int
+    ab_transformed_file_lists: tuple[list[str], ...], input_files: list[str]
 ) -> None:
     """Validate the output of run_ab_transforms.
 
-    This function checks that the number of files in each of the A/B
-    transformed file directories matches the number of input files
-    provided to run_ab_transforms (i.e., the expected number of
-    files that are transformed).
+    Transmogrifier produces JSON files for records that need indexing, and TXT files for
+    records that need deletion. Every run of Transmogrifier should produce one OR both of
+    these. Some TIMDEX sources provide one file to Transmogrifier that contains both
+    records to index and delete, and others provide separate files for each.
+
+    The net effect for validation is that, given an input file, we should expect to see
+    1+ files in the A and B output for that input file, ignoring if it's records to index
+    or delete.
     """
-    if any(
-        len(transformed_files) != input_files_count
-        for transformed_files in ab_transformed_file_lists
-    ):
-        raise OutputValidationError(  # noqa: TRY003
-            "At least one or more transformed JSON file(s) are missing. "
-            f"Expecting {input_files_count} transformed JSON file(s) per A/B version. "
-            "Check the transformed file directories."
-        )
+    for input_file in input_files:
+        file_parts = parse_timdex_filename(input_file)
+        logger.debug(f"Validating output for input file root: {file_parts}")
+
+        file_found = False
+        for version_files in ab_transformed_file_lists:
+            for version_file in version_files:
+                if (
+                    file_parts["source"] in version_file  # type: ignore[operator]
+                    and file_parts["run-date"] in version_file  # type: ignore[operator]
+                    and file_parts["run-type"] in version_file  # type: ignore[operator]
+                    and (not file_parts["index"] or file_parts["index"] in version_file)
+                ):
+                    file_found = True
+                    break
+
+        if not file_found:
+            raise OutputValidationError(  # noqa: TRY003
+                f"Transmogrifier output was not found for input file '{input_file}'"
+            )
 
 
 def get_transformed_filename(filename_details: dict) -> str:
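The new validation reduces to substring checks of the parsed input-filename parts against each output filename. Below is a self-contained sketch of that predicate; the fake_parse_timdex_filename helper is a hand-rolled stand-in, and the real parse_timdex_filename may behave differently:

```python
# Hand-rolled stand-in for parse_timdex_filename; the real helper may differ.
def fake_parse_timdex_filename(path: str) -> dict:
    # e.g. "alma-2024-04-10-daily-extracted-records-to-index_09.xml"
    name = path.rsplit("/", 1)[-1]
    parts = name.split("-")
    index = name.rsplit("_", 1)[-1].split(".")[0] if "_" in name else None
    return {
        "source": parts[0],
        "run-date": "-".join(parts[1:4]),
        "run-type": parts[4],
        "index": index,
    }

file_parts = fake_parse_timdex_filename(
    "s3://X/alma-2024-04-10-daily-extracted-records-to-index_09.xml"
)
output_file = "alma-2024-04-10-daily-extracted-records-to-index_09.json"

# same shape of check as validate_output: every parsed part must appear in the
# output filename, and the index suffix only when the input filename has one
assert file_parts["source"] in output_file
assert file_parts["run-date"] in output_file
assert file_parts["run-type"] in output_file
assert not file_parts["index"] or file_parts["index"] in output_file
print("output matches input file parts")
```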

tests/test_collate_ab_transforms.py

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@ def test_get_transformed_records_iter_success(example_transformed_directory):
     )
     timdex_record_dict = next(records_iter)
 
-    assert list(timdex_record_dict.keys()) == [
+    assert set(timdex_record_dict.keys()) == {
         "timdex_record_id",
         "source",
         "run_date",
@@ -84,7 +84,7 @@ def test_get_transformed_records_iter_success(example_transformed_directory):
         "record",
         "version",
         "transformed_file_name",
-    ]
+    }
     assert isinstance(timdex_record_dict["record"], bytes)
     assert timdex_record_dict["version"] == "a"
     assert (
@@ -104,7 +104,7 @@ def test_get_transformed_batches_iter_success(
 
     assert isinstance(transformed_batch, pa.RecordBatch)
     assert transformed_batch.num_rows <= READ_BATCH_SIZE
-    assert transformed_batch.schema.names == TRANSFORMED_DATASET_SCHEMA.names
+    assert set(transformed_batch.schema.names) == set(TRANSFORMED_DATASET_SCHEMA.names)
 
 
 def test_get_joined_batches_iter_success(transformed_parquet_dataset):
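These assertions switch from ordered comparisons to sets because the refactor builds yielded dicts by splatting base_record first, which changes key order (Python dicts preserve insertion order). For example:

```python
# Key order follows insertion order, so splatting base_record first reorders keys.
base_record = {"source": "dspace", "version": "a"}
old_style = {"timdex_record_id": "abc123", **base_record}
new_style = {**base_record, "timdex_record_id": "abc123"}

assert list(old_style) != list(new_style)  # ordered comparison would now fail
assert set(old_style) == set(new_style)  # set comparison is order-insensitive
```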

tests/test_run_ab_transforms.py

Lines changed: 72 additions & 5 deletions
@@ -166,19 +166,86 @@ def test_get_transformed_files_success(
     )
 
 
-def test_validate_output_success():
+@pytest.mark.parametrize(
+    ("ab_files", "input_files"),
+    [
+        # single JSON from single file
+        (
+            (
+                ["dspace-2024-04-10-daily-extracted-records-to-index.json"],
+                ["dspace-2024-04-10-daily-extracted-records-to-index.json"],
+            ),
+            ["s3://X/dspace-2024-04-10-daily-extracted-records-to-index.xml"],
+        ),
+        # JSON and TXT from single file
+        (
+            (
+                [
+                    "dspace-2024-04-10-daily-extracted-records-to-index.json",
+                    "dspace-2024-04-10-daily-extracted-records-to-delete.txt",
+                ],
+                [
+                    "dspace-2024-04-10-daily-extracted-records-to-index.json",
+                    "dspace-2024-04-10-daily-extracted-records-to-delete.txt",
+                ],
+            ),
+            ["s3://X/dspace-2024-04-10-daily-extracted-records-to-index.xml"],
+        ),
+        # handles indexed files when multiple
+        (
+            (
+                ["alma-2024-04-10-daily-extracted-records-to-index_09.json"],
+                ["alma-2024-04-10-daily-extracted-records-to-index_09.json"],
+            ),
+            ["s3://X/alma-2024-04-10-daily-extracted-records-to-index_09.xml"],
+        ),
+        # handles deletes only for alma deletes
+        (
+            (
+                ["alma-2024-04-10-daily-extracted-records-to-delete.txt"],
+                ["alma-2024-04-10-daily-extracted-records-to-delete.txt"],
+            ),
+            ["s3://X/alma-2024-04-10-daily-extracted-records-to-delete.xml"],
+        ),
+    ],
+)
+def test_validate_output_success(ab_files, input_files):
     assert (
         validate_output(
-            ab_transformed_file_lists=(["transformed/a/file1"], ["transformed/b/file2"]),
-            input_files_count=1,
+            ab_transformed_file_lists=ab_files,
+            input_files=input_files,
         )
         is None
     )
 
 
-def test_validate_output_error():
+@pytest.mark.parametrize(
+    ("ab_files", "input_files"),
+    [
+        # nothing returned
+        (
+            ([], []),
+            ["s3://X/dspace-2024-04-10-daily-extracted-records-to-index.xml"],
+        ),
+        # output files don't have index, or wrong index, so not direct match
+        (
+            (
+                [
+                    "alma-2024-04-10-daily-extracted-records-to-index.json",
+                    "alma-2024-04-10-daily-extracted-records-to-index_04.json",
+                ],
+                [
+                    "alma-2024-04-10-daily-extracted-records-to-index.json",
+                    "alma-2024-04-10-daily-extracted-records-to-index_04.json",
+                ],
+            ),
+            ["s3://X/alma-2024-04-10-daily-extracted-records-to-index_09.xml"],
+        ),
+    ],
+)
+def test_validate_output_error(ab_files, input_files):
     with pytest.raises(OutputValidationError):
-        validate_output(ab_transformed_file_lists=([], []), input_files_count=1)
+        validate_output(ab_transformed_file_lists=ab_files, input_files=input_files)
 
 
 def test_get_output_filename_success():
