Skip to content

Commit 3da47ee

Browse files
committed
Generate CSV of input files and use for run-diff
Why these changes are being introduced: For very large runs, the use of command-line, comma-separated input files for Transmogrifier does not scale. Two things are needed: 1. allow CLI command run-diff to accept a list of input files via another method 2. a tool to help craft lists of input files from S3 based on some criteria How this addresses that need: * A new CLI command is added 'timdex-sources-csv' that generates a list of input files based on the S3 bucket set by env var 'TIMDEX_BUCKET'. This utilizes the helper methods added in a prior commit. The default, for all sources, is to list extract files going back to the last full run, then every subsequent daily run. This can also be limited to a subset of sources. * Update CLI command run-diff to allow the -i / --input-files argument to accept a local CSV file which is parsed and used as the input files list. Side effects of this change: * None really, except being able to more easily support runs with large amounts of input files. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-379
1 parent 517955d commit 3da47ee

File tree

3 files changed

+87
-6
lines changed

3 files changed

+87
-6
lines changed

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ WEBAPP_HOST=# host for flask webapp
9494
WEBAPP_PORT=# port for flask webapp
9595
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
9696
TRANSMOGRIFIER_TIMEOUT=# timeout for a single Transmogrifier container; default is 5 hours
97+
TIMDEX_BUCKET=# when using CLI command 'timdex-sources-csv', this is required to know what TIMDEX bucket to use
9798
```
9899

99100
## CLI commands
@@ -143,7 +144,10 @@ Usage: -c run-diff [OPTIONS]
143144
144145
Options:
145146
-d, --job-directory TEXT Job directory to create. [required]
146-
-i, --input-files TEXT Input files to transform. [required]
147+
-i, --input-files TEXT Input files to transform. This may be a comma
148+
separated list of input files, or a local CSV file
149+
that provides a list of files. [required]
150+
-m, --message TEXT Message to describe Run.
147151
-h, --help Show this message and exit.
148152
```
149153

@@ -158,3 +162,23 @@ Options:
158162
-h, --help Show this message and exit.
159163
```
160164

165+
### `timdex-sources-csv`
166+
```text
167+
Usage: -c timdex-sources-csv [OPTIONS]
168+
169+
Generate a CSV of ordered extract files for all, or a subset, of TIMDEX
170+
sources.
171+
172+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-
173+
files' argument, serving as the list of input files for the run.
174+
175+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what
176+
S3 bucket to use for scanning. The appropriate AWS credentials are also
177+
needed to be set.
178+
179+
Options:
180+
-o, --output-file TEXT Output filepath for CSV. [required]
181+
-s, --sources TEXT Optional comma separated list of sources to include.
182+
Default is all.
183+
-h, --help Show this message and exit.
184+
```

abdiff/cli.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import json
22
import logging
33
from datetime import timedelta
4+
from itertools import chain
45
from time import perf_counter
56

67
import click
8+
import pandas as pd
79
from click.exceptions import ClickException
810

911
from abdiff.config import Config, configure_logger
@@ -17,6 +19,7 @@
1719
)
1820
from abdiff.core import init_job as core_init_job
1921
from abdiff.core.utils import read_job_json
22+
from abdiff.helpers.timdex_sources import get_ordered_extracted_files_all_sources
2023
from abdiff.webapp.app import app
2124

2225
logger = logging.getLogger(__name__)
@@ -132,14 +135,30 @@ def init_job(
132135
"--input-files",
133136
type=str,
134137
required=True,
135-
help="Input files to transform.",
138+
help=(
139+
"Input files to transform. This may be a comma separated list of input files, "
140+
"or a local CSV file that provides a list of files."
141+
),
136142
)
137-
def run_diff(job_directory: str, input_files: str) -> None:
143+
@click.option(
144+
"-m",
145+
"--message",
146+
type=str,
147+
required=False,
148+
help="Message to describe Run.",
149+
default="Not provided.",
150+
)
151+
def run_diff(job_directory: str, input_files: str, message: str) -> None:
138152

139153
job_data = read_job_json(job_directory)
140-
run_directory = init_run(job_directory)
154+
run_directory = init_run(job_directory, message=message)
141155

142-
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
156+
# handle CSV file containing input files
157+
if input_files.endswith(".csv"):
158+
input_files_list = pd.read_csv(input_files, header=None)[0].tolist()
159+
# else, assume comma separated list of extract files
160+
else:
161+
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
143162

144163
ab_transformed_file_lists = run_ab_transforms(
145164
run_directory=run_directory,
@@ -181,3 +200,41 @@ def view_job(
181200
app.config.update(JOB_DIRECTORY=job_directory)
182201
logger.info("")
183202
app.run(host=config.webapp_host, port=config.webapp_port)
203+
204+
205+
@main.command()
206+
@click.option(
207+
"-o",
208+
"--output-file",
209+
type=str,
210+
required=True,
211+
help="Output filepath for CSV.",
212+
)
213+
@click.option(
214+
"-s",
215+
"--sources",
216+
type=str,
217+
required=False,
218+
help="Optional comma separated list of sources to include. Default is all.",
219+
)
220+
def timdex_sources_csv(output_file: str, sources: str) -> None:
221+
"""Generate a CSV of ordered extract files for all, or a subset, of TIMDEX sources.
222+
223+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-files'
224+
argument, serving as the list of input files for the run.
225+
226+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what S3 bucket
227+
to use for scanning. The appropriate AWS credentials are also needed to be set.
228+
"""
229+
sources_list = None
230+
if sources:
231+
sources_list = [source.strip() for source in sources.split(",")]
232+
233+
input_files = get_ordered_extracted_files_all_sources(sources=sources_list)
234+
235+
input_files_df = pd.DataFrame(
236+
columns=["input_file"],
237+
data=[*chain.from_iterable(v for v in input_files.values())],
238+
)
239+
input_files_df.to_csv(output_file, index=False, header=False)
240+
logger.info(f"Created file: {output_file}")

abdiff/helpers/timdex_sources.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def get_extracted_files_for_source(
2929
for obj in page["Contents"]:
3030
if not obj["Key"].endswith("/"): # skip folders
3131
s3_uri = f"s3://{bucket}/{obj['Key']}"
32-
files.append(s3_uri) # noqa: PERF401
32+
files.append(s3_uri)
3333

3434
# filter where "extracted" in filename
3535
return [file for file in files if "extracted" in file]

0 commit comments

Comments
 (0)