Skip to content

Commit fd3a0c8

Browse files
authored
Merge pull request #45 from MITLibraries/TIMX-379-input-files-options-and-helpers
TIMX 379 - CSV for input files support and helpers
2 parents a4b1a13 + 5701e9b commit fd3a0c8

File tree

7 files changed

+229
-8
lines changed

7 files changed

+229
-8
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ types-docker = "*"
1515
types-pygit2 = "*"
1616
flask = "*"
1717
jsondiff = "*"
18+
boto3 = "*"
1819

1920
[dev-packages]
2021
black = "*"

Pipfile.lock

Lines changed: 36 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ WEBAPP_HOST=# host for flask webapp
9494
WEBAPP_PORT=# port for flask webapp
9595
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
9696
TRANSMOGRIFIER_TIMEOUT=# timeout for a single Transmogrifier container; default is 5 hours
97+
TIMDEX_BUCKET=# when using CLI command 'timdex-sources-csv', this is required to know what TIMDEX bucket to use
9798
```
9899

99100
## CLI commands
@@ -143,7 +144,10 @@ Usage: -c run-diff [OPTIONS]
143144
144145
Options:
145146
-d, --job-directory TEXT Job directory to create. [required]
146-
-i, --input-files TEXT Input files to transform. [required]
147+
-i, --input-files TEXT Input files to transform. This may be a comma
148+
separated list of input files, or a local CSV file
149+
that provides a list of files. [required]
150+
-m, --message TEXT Message to describe Run.
147151
-h, --help Show this message and exit.
148152
```
149153

@@ -158,3 +162,23 @@ Options:
158162
-h, --help Show this message and exit.
159163
```
160164

165+
### `timdex-sources-csv`
166+
```text
167+
Usage: -c timdex-sources-csv [OPTIONS]
168+
169+
Generate a CSV of ordered extract files for all, or a subset, of TIMDEX
170+
sources.
171+
172+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-
173+
files' argument, serving as the list of input files for the run.
174+
175+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what
176+
S3 bucket to use for scanning. The appropriate AWS credentials are also
177+
needed to be set.
178+
179+
Options:
180+
-o, --output-file TEXT Output filepath for CSV. [required]
181+
-s, --sources TEXT Optional comma separated list of sources to include.
182+
Default is all.
183+
-h, --help Show this message and exit.
184+
```

abdiff/cli.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import json
22
import logging
33
from datetime import timedelta
4+
from itertools import chain
45
from time import perf_counter
56

67
import click
8+
import pandas as pd
79
from click.exceptions import ClickException
810

911
from abdiff.config import Config, configure_logger
@@ -17,6 +19,7 @@
1719
)
1820
from abdiff.core import init_job as core_init_job
1921
from abdiff.core.utils import read_job_json
22+
from abdiff.extras.timdex_sources import get_ordered_extracted_files_all_sources
2023
from abdiff.webapp.app import app
2124

2225
logger = logging.getLogger(__name__)
@@ -132,14 +135,30 @@ def init_job(
132135
"--input-files",
133136
type=str,
134137
required=True,
135-
help="Input files to transform.",
138+
help=(
139+
"Input files to transform. This may be a comma separated list of input files, "
140+
"or a local CSV file that provides a list of files."
141+
),
136142
)
137-
def run_diff(job_directory: str, input_files: str) -> None:
143+
@click.option(
144+
"-m",
145+
"--message",
146+
type=str,
147+
required=False,
148+
help="Message to describe Run.",
149+
default="Not provided.",
150+
)
151+
def run_diff(job_directory: str, input_files: str, message: str) -> None:
138152

139153
job_data = read_job_json(job_directory)
140-
run_directory = init_run(job_directory)
154+
run_directory = init_run(job_directory, message=message)
141155

142-
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
156+
# handle CSV file containing input files
157+
if input_files.endswith(".csv"):
158+
input_files_list = pd.read_csv(input_files, header=None)[0].tolist()
159+
# else, assume comma separated list of extract files
160+
else:
161+
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
143162

144163
ab_transformed_file_lists = run_ab_transforms(
145164
run_directory=run_directory,
@@ -181,3 +200,41 @@ def view_job(
181200
app.config.update(JOB_DIRECTORY=job_directory)
182201
logger.info("")
183202
app.run(host=config.webapp_host, port=config.webapp_port)
203+
204+
205+
@main.command()
@click.option(
    "-o",
    "--output-file",
    type=str,
    required=True,
    help="Output filepath for CSV.",
)
@click.option(
    "-s",
    "--sources",
    type=str,
    required=False,
    help="Optional comma separated list of sources to include. Default is all.",
)
def timdex_sources_csv(output_file: str, sources: str) -> None:
    """Generate a CSV of ordered extract files for all, or a subset, of TIMDEX sources.

    This CSV may be passed to CLI command 'run-diff' for the '-i / --input-files'
    argument, serving as the list of input files for the run.

    This command requires that env var 'TIMDEX_BUCKET' is set to establish what S3 bucket
    to use for scanning. The appropriate AWS credentials are also needed to be set.
    """
    # parse the optional comma separated sources; None means "all sources"
    selected = [source.strip() for source in sources.split(",")] if sources else None

    files_by_source = get_ordered_extracted_files_all_sources(sources=selected)

    # flatten the per-source lists into a single ordered column and write it
    # as a header-less, index-less single-column CSV
    flattened = list(chain.from_iterable(files_by_source.values()))
    pd.DataFrame(columns=["input_file"], data=flattened).to_csv(
        output_file, index=False, header=False
    )
    logger.info(f"Created file: {output_file}")

abdiff/config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class Config:
1515
"WEBAPP_PORT",
1616
"TRANSMOGRIFIER_MAX_WORKERS",
1717
"TRANSMOGRIFIER_TIMEOUT",
18+
"TIMDEX_BUCKET",
1819
)
1920

2021
def __getattr__(self, name: str) -> Any: # noqa: ANN401
@@ -45,6 +46,18 @@ def transmogrifier_timeout(self) -> int:
4546
timeout = self.TRANSMOGRIFIER_TIMEOUT or 60 * 60 * 5 # 5 hours default
4647
return int(timeout)
4748

49+
@property
def active_timdex_sources(self) -> list[str]:
    """TIMDEX source names included by default when scanning for extract files."""
    return list(
        (
            "alma",
            "aspace",
            "dspace",
            "gismit",
            "gisogm",
            "libguides",
            "researchdatabases",
        )
    )
60+
4861

4962
def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
5063
if verbose:

abdiff/extras/__init__.py

Whitespace-only changes.

abdiff/extras/timdex_sources.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""abdiff.helpers.timdex_sources"""
2+
3+
import datetime
4+
import logging
5+
import re
6+
7+
import boto3 # type: ignore[import-untyped]
8+
9+
from abdiff.config import Config
10+
11+
logger = logging.getLogger(__name__)
12+
13+
CONFIG = Config()
14+
15+
16+
def get_ordered_extracted_files_all_sources(
    sources: list[str] | None = None,
) -> dict[str, list[str]]:
    """Map each TIMDEX source name to its ordered list of extract file URIs.

    Args:
        sources: source names to include; when omitted or empty, the
            configured active TIMDEX sources are used.

    Returns:
        Dict of source name -> ordered extract file S3 URIs for that source.
    """
    selected = sources or CONFIG.active_timdex_sources
    ordered: dict[str, list[str]] = {}
    for name in selected:
        ordered[name] = get_ordered_extracted_files_since_last_full_run(source=name)
    return ordered
26+
27+
28+
def get_ordered_extracted_files_since_last_full_run(source: str) -> list[str]:
    """Get extract files, from last full run, through all daily runs, for a source.

    The most recent full-extract date is used as a cutoff: all full extract
    files carrying that date come first (sorted), followed by all daily
    extract files dated on or after the cutoff (sorted).
    """
    logger.info(f"Retrieving ordered extracted files for source: '{source}'")
    candidates = get_extracted_files_for_source(source)

    # isolate the full extract files; without any, there is nothing to order
    full_files = [name for name in candidates if _is_full_extract(name)]
    if not full_files:
        logger.warning("No full extracts found.")
        return []

    # cutoff = date of the most recent full extract
    cutoff = max(_extract_date(name) for name in full_files)

    # full extract files stamped with the cutoff date, sorted
    newest_full_files = sorted(
        name for name in full_files if _extract_date(name) == cutoff
    )

    # daily extract files on or after the cutoff date, sorted
    daily_files = sorted(
        name
        for name in candidates
        if _is_daily_extract(name) and _extract_date(name) >= cutoff
    )

    combined = newest_full_files + daily_files
    logger.info(f"Total files retrieved: {len(combined)}")
    return combined
59+
60+
61+
def _is_full_extract(filename: str) -> bool:
    """True when *filename* names a full-extract file (contains '-full-')."""
    return filename.find("-full-") >= 0
63+
64+
65+
def _is_daily_extract(filename: str) -> bool:
    """True when *filename* names a daily-extract file (contains '-daily-')."""
    return filename.find("-daily-") >= 0
67+
68+
69+
def _extract_date(filename: str) -> datetime.datetime:
70+
date_string = re.findall(r".+?(\d{4}-\d{2}-\d{2})", filename)[0]
71+
return datetime.datetime.strptime(date_string, "%Y-%m-%d").astimezone(datetime.UTC)
72+
73+
74+
def get_extracted_files_for_source(
    source: str,
    bucket: str | None = None,
) -> list[str]:
    """List S3 URIs for extract files in TIMDEX S3 bucket for a given source.

    Args:
        source: TIMDEX source name, used as the S3 key prefix for the scan.
        bucket: S3 bucket to scan; defaults to CONFIG.TIMDEX_BUCKET. Resolved
            at call time -- the previous default of ``CONFIG.TIMDEX_BUCKET``
            in the signature was captured once at import time, so the env var
            had to be set before this module was imported.

    Returns:
        S3 URIs whose key contains "extracted", excluding "folder" keys.
    """
    if bucket is None:
        bucket = CONFIG.TIMDEX_BUCKET
    s3_client = boto3.client("s3")
    files = []

    paginator = s3_client.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=source)

    for page in page_iterator:
        # "Contents" is absent for empty result pages
        for obj in page.get("Contents", []):
            if not obj["Key"].endswith("/"):  # skip folders
                s3_uri = f"s3://{bucket}/{obj['Key']}"
                files.append(s3_uri)

    # filter where "extracted" in filename
    return [file for file in files if "extracted" in file]

0 commit comments

Comments
 (0)