Skip to content

Commit 3da47ee

Browse files
committed
Generate CSV of input files and use for run-diff
Why these changes are being introduced: For very large runs, the use of command-line, comma-separated input files for Transmogrifier does not scale. Two things are needed: 1. allow CLI command run-diff to accept a list of input files via another method 2. a tool to help craft lists of input files from S3 based on some criteria How this addresses that need: * A new CLI command is added 'timdex-sources-csv' that generates a list of input files based on the S3 bucket set by env var 'TIMDEX_BUCKET'. This utilizes the helper methods added in a prior commit. The default, for all sources, is to list extract files going back to the last full run, then every subsequent daily run. This can also be limited to a subset of sources. * Update CLI command run-diff to allow the -i / --input-files argument to accept a local CSV file which is parsed and used as the input files list. Side effects of this change: * None really, except being able to more easily support runs with large amounts of input files. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-379
1 parent 517955d commit 3da47ee

File tree

3 files changed

+87
-6
lines changed

3 files changed

+87
-6
lines changed

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ WEBAPP_HOST=# host for flask webapp
9494
WEBAPP_PORT=# port for flask webapp
9595
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
9696
TRANSMOGRIFIER_TIMEOUT=# timeout for a single Transmogrifier container; default is 5 hours
97+
TIMDEX_BUCKET=# when using CLI command 'timdex-sources-csv', this is required to know what TIMDEX bucket to use
9798
```
9899

99100
## CLI commands
@@ -143,7 +144,10 @@ Usage: -c run-diff [OPTIONS]
143144
144145
Options:
145146
-d, --job-directory TEXT Job directory to create. [required]
146-
-i, --input-files TEXT Input files to transform. [required]
147+
-i, --input-files TEXT Input files to transform. This may be a comma
148+
separated list of input files, or a local CSV file
149+
that provides a list of files. [required]
150+
-m, --message TEXT Message to describe Run.
147151
-h, --help Show this message and exit.
148152
```
149153

@@ -158,3 +162,23 @@ Options:
158162
-h, --help Show this message and exit.
159163
```
160164

165+
### `timdex-sources-csv`
166+
```text
167+
Usage: -c timdex-sources-csv [OPTIONS]
168+
169+
Generate a CSV of ordered extract files for all, or a subset, of TIMDEX
170+
sources.
171+
172+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-
173+
files' argument, serving as the list of input files for the run.
174+
175+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what
176+
S3 bucket to use for scanning. The appropriate AWS credentials are also
177+
needed to be set.
178+
179+
Options:
180+
-o, --output-file TEXT Output filepath for CSV. [required]
181+
-s, --sources TEXT Optional comma separated list of sources to include.
182+
Default is all.
183+
-h, --help Show this message and exit.
184+
```

abdiff/cli.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import json
22
import logging
33
from datetime import timedelta
4+
from itertools import chain
45
from time import perf_counter
56

67
import click
8+
import pandas as pd
79
from click.exceptions import ClickException
810

911
from abdiff.config import Config, configure_logger
@@ -17,6 +19,7 @@
1719
)
1820
from abdiff.core import init_job as core_init_job
1921
from abdiff.core.utils import read_job_json
22+
from abdiff.helpers.timdex_sources import get_ordered_extracted_files_all_sources
2023
from abdiff.webapp.app import app
2124

2225
logger = logging.getLogger(__name__)
@@ -132,14 +135,30 @@ def init_job(
132135
"--input-files",
133136
type=str,
134137
required=True,
135-
help="Input files to transform.",
138+
help=(
139+
"Input files to transform. This may be a comma separated list of input files, "
140+
"or a local CSV file that provides a list of files."
141+
),
136142
)
137-
def run_diff(job_directory: str, input_files: str) -> None:
143+
@click.option(
144+
"-m",
145+
"--message",
146+
type=str,
147+
required=False,
148+
help="Message to describe Run.",
149+
default="Not provided.",
150+
)
151+
def run_diff(job_directory: str, input_files: str, message: str) -> None:
138152

139153
job_data = read_job_json(job_directory)
140-
run_directory = init_run(job_directory)
154+
run_directory = init_run(job_directory, message=message)
141155

142-
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
156+
# handle CSV file containing input files
157+
if input_files.endswith(".csv"):
158+
input_files_list = pd.read_csv(input_files, header=None)[0].tolist()
159+
# else, assume comma separated list of extract files
160+
else:
161+
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
143162

144163
ab_transformed_file_lists = run_ab_transforms(
145164
run_directory=run_directory,
@@ -181,3 +200,41 @@ def view_job(
181200
app.config.update(JOB_DIRECTORY=job_directory)
182201
logger.info("")
183202
app.run(host=config.webapp_host, port=config.webapp_port)
203+
204+
205+
@main.command()
206+
@click.option(
207+
"-o",
208+
"--output-file",
209+
type=str,
210+
required=True,
211+
help="Output filepath for CSV.",
212+
)
213+
@click.option(
214+
"-s",
215+
"--sources",
216+
type=str,
217+
required=False,
218+
help="Optional comma separated list of sources to include. Default is all.",
219+
)
220+
def timdex_sources_csv(output_file: str, sources: str) -> None:
221+
"""Generate a CSV of ordered extract files for all, or a subset, of TIMDEX sources.
222+
223+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-files'
224+
argument, serving as the list of input files for the run.
225+
226+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what S3 bucket
227+
to use for scanning. The appropriate AWS credentials are also needed to be set.
228+
"""
229+
sources_list = None
230+
if sources:
231+
sources_list = [source.strip() for source in sources.split(",")]
232+
233+
input_files = get_ordered_extracted_files_all_sources(sources=sources_list)
234+
235+
input_files_df = pd.DataFrame(
236+
columns=["input_file"],
237+
data=[*chain.from_iterable(v for v in input_files.values())],
238+
)
239+
input_files_df.to_csv(output_file, index=False, header=False)
240+
logger.info(f"Created file: {output_file}")

abdiff/helpers/timdex_sources.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def get_extracted_files_for_source(
2929
for obj in page["Contents"]:
3030
if not obj["Key"].endswith("/"): # skip folders
3131
s3_uri = f"s3://{bucket}/{obj['Key']}"
32-
files.append(s3_uri) # noqa: PERF401
32+
files.append(s3_uri)
3333

3434
# filter where "extracted" in filename
3535
return [file for file in files if "extracted" in file]

0 commit comments

Comments
 (0)