Skip to content

Commit e436d53

Browse files
authored
Merge pull request #48 from MITLibraries/TIMX-383-pipeline-tweaks-large-runs
TIMX 383 - pipeline tweaks for large runs
2 parents 16ecf73 + a26c0a8 commit e436d53

34 files changed

+702
-23485
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ WEBAPP_PORT=# port for flask webapp
137137
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
138138
TRANSMOGRIFIER_TIMEOUT=# timeout for a single Transmogrifier container; default is 5 hours
139139
TIMDEX_BUCKET=# when using CLI command 'timdex-sources-csv', this is required to know what TIMDEX bucket to use
140+
PRESERVE_ARTIFACTS=# if 'true', intermediate artifacts like transformed files, collated records, etc., will not be automatically removed
141+
ALLOW_FAILED_TRANSMOGRIFIER_CONTAINERS=# if 'true', the run will continue even if some Transmogrifier containers failed to complete successfully
140142
```
141143

142144
## CLI commands

abdiff/cli.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import logging
3+
import shutil
34
from datetime import timedelta
45
from itertools import chain
56
from time import perf_counter
@@ -14,6 +15,7 @@
1415
calc_ab_diffs,
1516
calc_ab_metrics,
1617
collate_ab_transforms,
18+
create_final_records,
1719
download_input_files,
1820
init_run,
1921
run_ab_transforms,
@@ -25,6 +27,8 @@
2527

2628
logger = logging.getLogger(__name__)
2729

30+
CONFIG = Config()
31+
2832

2933
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
3034
@click.option(
@@ -181,19 +185,31 @@ def run_diff(
181185
input_files=input_files_list,
182186
use_local_s3=download_files,
183187
)
188+
184189
collated_dataset_path = collate_ab_transforms(
185190
run_directory=run_directory,
186191
ab_transformed_file_lists=ab_transformed_file_lists,
187192
)
193+
188194
diffs_dataset_path = calc_ab_diffs(
189195
run_directory=run_directory,
190196
collated_dataset_path=collated_dataset_path,
191197
)
192-
calc_ab_metrics(
198+
199+
if not CONFIG.preserve_artifacts:
200+
shutil.rmtree(collated_dataset_path)
201+
202+
metrics_dataset_path = calc_ab_metrics(
193203
run_directory=run_directory,
194204
diffs_dataset_path=diffs_dataset_path,
195205
)
196206

207+
create_final_records(run_directory, diffs_dataset_path, metrics_dataset_path)
208+
209+
if not CONFIG.preserve_artifacts:
210+
shutil.rmtree(diffs_dataset_path)
211+
shutil.rmtree(metrics_dataset_path)
212+
197213

198214
@main.command()
199215
@click.option(

abdiff/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ class Config:
2121
"TRANSMOGRIFIER_MAX_WORKERS",
2222
"TRANSMOGRIFIER_TIMEOUT",
2323
"TIMDEX_BUCKET",
24+
"PRESERVE_ARTIFACTS",
25+
"ALLOW_FAILED_TRANSMOGRIFIER_CONTAINERS",
2426
)
2527

2628
def __getattr__(self, name: str) -> Any: # noqa: ANN401
@@ -81,6 +83,19 @@ def active_timdex_sources(self) -> list[str]:
8183
"researchdatabases",
8284
]
8385

86+
@property
87+
def preserve_artifacts(self) -> bool:
88+
return bool(
89+
self.PRESERVE_ARTIFACTS and self.PRESERVE_ARTIFACTS.strip().lower() == "true"
90+
)
91+
92+
@property
93+
def allow_failed_transmogrifier_containers(self) -> bool:
94+
return bool(
95+
self.ALLOW_FAILED_TRANSMOGRIFIER_CONTAINERS
96+
and self.ALLOW_FAILED_TRANSMOGRIFIER_CONTAINERS.strip().lower() == "true"
97+
)
98+
8499

85100
def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
86101
if verbose:

abdiff/core/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from abdiff.core.calc_ab_diffs import calc_ab_diffs
88
from abdiff.core.calc_ab_metrics import calc_ab_metrics
99
from abdiff.core.collate_ab_transforms import collate_ab_transforms
10+
from abdiff.core.create_final_records import create_final_records
1011
from abdiff.core.init_job import init_job
1112
from abdiff.core.init_run import init_run
1213
from abdiff.core.run_ab_transforms import run_ab_transforms
@@ -21,4 +22,5 @@
2122
"collate_ab_transforms",
2223
"calc_ab_diffs",
2324
"calc_ab_metrics",
25+
"create_final_records",
2426
]

abdiff/core/calc_ab_diffs.py

Lines changed: 74 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import concurrent.futures
12
import json
23
import logging
34
import time
@@ -12,9 +13,9 @@
1213

1314
logger = logging.getLogger(__name__)
1415

15-
READ_BATCH_SIZE = 1_000
16-
WRITE_MAX_ROW_GROUP_SIZE = 1_000
16+
READ_BATCH_SIZE = 10_000
1717
WRITE_MAX_ROWS_PER_FILE = 100_000
18+
MAX_PARALLEL_WORKERS = 6
1819

1920
DIFFS_DATASET_OUTPUT_SCHEMA = pa.schema(
2021
(
@@ -52,29 +53,55 @@ def calc_ab_diffs(run_directory: str, collated_dataset_path: str) -> str:
5253
return str(diffs_dataset)
5354

5455

56+
def process_batch(batch: pa.RecordBatch) -> pa.RecordBatch:
57+
"""Parallel worker for calculating record diffs for a batch.
58+
59+
The pyarrow RecordBatch is converted into a pandas dataframe, a diff is calculated via
60+
DeepDiff for each record in the batch, and this is converted back to a pyarrow
61+
RecordBatch for returning.
62+
"""
63+
df = batch.to_pandas() # noqa: PD901
64+
diff_results = df.apply(
65+
lambda row: calc_record_diff(row["record_a"], row["record_b"]), axis=1
66+
)
67+
df["ab_diff"] = diff_results.apply(lambda x: x[0])
68+
df["modified_timdex_fields"] = diff_results.apply(
69+
lambda x: list(x[1]) if x[1] else []
70+
)
71+
df["has_diff"] = diff_results.apply(lambda x: x[2])
72+
return pa.RecordBatch.from_pandas(df) # type: ignore[attr-defined]
73+
74+
5575
def get_diffed_batches_iter(
5676
collated_dataset: ds.Dataset,
5777
batch_size: int = READ_BATCH_SIZE,
78+
max_parallel_processes: int = MAX_PARALLEL_WORKERS,
5879
) -> Generator[pa.RecordBatch, None, None]:
59-
"""Yield pyarrow record batches with diff calculated for records in batch."""
80+
"""Yield pyarrow record batches with diff calculated for each record.
81+
82+
This work is performed in parallel, leveraging CPU cores to calculate the diffs and
83+
yield batches for writing to the "diffs" dataset.
84+
"""
6085
batches_iter = collated_dataset.to_batches(batch_size=batch_size)
61-
for i, batch in enumerate(batches_iter):
62-
logger.info(f"Calculating AB diff for batch: {i}")
6386

64-
# convert batch to pandas dataframe and calc values for new columns
65-
df = batch.to_pandas() # noqa: PD901
87+
with concurrent.futures.ProcessPoolExecutor(
88+
max_workers=max_parallel_processes + 1
89+
) as executor:
90+
pending_futures = []
91+
for batch_count, batch in enumerate(batches_iter):
92+
future = executor.submit(process_batch, batch)
93+
pending_futures.append((batch_count, future))
6694

67-
# calculate all diffs and unpack into separate columns
68-
diff_results = df.apply(
69-
lambda row: calc_record_diff(row["record_a"], row["record_b"]), axis=1
70-
)
71-
df["ab_diff"] = diff_results.apply(lambda x: x[0])
72-
df["modified_timdex_fields"] = diff_results.apply(
73-
lambda x: list(x[1]) if x[1] else []
74-
)
75-
df["has_diff"] = diff_results.apply(lambda x: x[2])
95+
if len(pending_futures) >= max_parallel_processes:
96+
idx, completed_future = pending_futures.pop(0)
97+
result = completed_future.result()
98+
logger.info(f"Yielding diffed batch: {idx}")
99+
yield result
76100

77-
yield pa.RecordBatch.from_pandas(df) # type: ignore[attr-defined]
101+
for idx, future in pending_futures:
102+
result = future.result()
103+
logger.info(f"Yielding diffed batch: {idx}")
104+
yield result
78105

79106

80107
def calc_record_diff(
@@ -83,32 +110,53 @@ def calc_record_diff(
83110
*,
84111
ignore_order: bool = True,
85112
report_repetition: bool = True,
86-
) -> tuple[str | None, list[str] | None, bool]:
113+
) -> tuple[str, set[str], bool]:
87114
"""Calculate diff from two JSON byte strings.
88115
89116
The DeepDiff library has the property 'affected_root_keys' on the produced diff object
90117
that is very useful for our purposes. At this time, we simply want to know if
91118
anything about a particular root level TIMDEX field (e.g. 'dates' or 'title') has
92-
changed which this method provides explicitly. We also serialize the full diff to
93-
JSON via the to_json() method for storage and possible further analysis.
119+
changed which this method provides explicitly. In the unlikely case that the records
120+
share ZERO keys, a special case is handled where the modified root paths are returned
121+
as only ['root'], in which case we get a combined set of keys from both records, which is
122+
effectively the modified root fields.
123+
124+
We also serialize the full diff to JSON via the to_json() method for storage and
125+
possible further analysis.
94126
95-
This method returns a tuple:
127+
Returns tuple(ab_diff, modified_timdex_fields, has_diff):
96128
- ab_diff: [str] - full diff as JSON
97129
- modified_timdex_fields: list[str] - list of modified root keys (TIMDEX fields)
98130
- has_diff: bool - True/False if any diff present
99131
"""
100-
if record_a is None or record_b is None:
101-
return None, None, False
132+
# Replace None with empty dict
133+
record_a = record_a or {}
134+
record_b = record_b or {}
135+
136+
# Parse JSON strings or bytes into dictionaries
137+
if isinstance(record_a, (str | bytes)):
138+
record_a = json.loads(record_a)
139+
if isinstance(record_b, (str | bytes)):
140+
record_b = json.loads(record_b)
102141

103142
diff = DeepDiff(
104-
json.loads(record_a) if isinstance(record_a, str | bytes) else record_a,
105-
json.loads(record_b) if isinstance(record_b, str | bytes) else record_b,
143+
record_a,
144+
record_b,
106145
ignore_order=ignore_order,
107146
report_repetition=report_repetition,
108147
)
109148

110149
ab_diff = diff.to_json()
111-
modified_timdex_fields = diff.affected_root_keys
150+
151+
# get modified root fields, handling edge cases
152+
if diff.affected_paths != ["root"]:
153+
modified_timdex_fields = diff.affected_root_keys
154+
else:
155+
modified_timdex_fields = set()
156+
for record in [record_a, record_b]:
157+
if isinstance(record, dict):
158+
modified_timdex_fields.update(record.keys())
159+
112160
has_diff = bool(modified_timdex_fields)
113161

114162
return ab_diff, modified_timdex_fields, has_diff

abdiff/core/calc_ab_metrics.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
def calc_ab_metrics(
2121
run_directory: str,
2222
diffs_dataset_path: str,
23-
) -> dict:
23+
) -> str:
2424

25-
os.makedirs(Path(run_directory) / "metrics", exist_ok=True)
25+
metrics_dataset = Path(run_directory) / "metrics"
26+
os.makedirs(metrics_dataset, exist_ok=True)
2627

2728
# build field diffs dataframe
2829
field_matrix_dataset_filepath = create_record_diff_matrix_dataset(
@@ -37,7 +38,7 @@ def calc_ab_metrics(
3738
run_directory=run_directory, new_data={"metrics": metrics_data}
3839
)
3940

40-
return metrics_data
41+
return str(metrics_dataset)
4142

4243

4344
def create_record_diff_matrix_dataset(

0 commit comments

Comments
 (0)