Skip to content

Commit fd3a0c8

Browse files
authored
Merge pull request #45 from MITLibraries/TIMX-379-input-files-options-and-helpers
TIMX 379 - CSV for input files support and helpers
2 parents a4b1a13 + 5701e9b commit fd3a0c8

File tree

7 files changed

+229
-8
lines changed

7 files changed

+229
-8
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ types-docker = "*"
1515
types-pygit2 = "*"
1616
flask = "*"
1717
jsondiff = "*"
18+
boto3 = "*"
1819

1920
[dev-packages]
2021
black = "*"

Pipfile.lock

Lines changed: 36 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ WEBAPP_HOST=# host for flask webapp
9494
WEBAPP_PORT=# port for flask webapp
9595
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
9696
TRANSMOGRIFIER_TIMEOUT=# timeout for a single Transmogrifier container; default is 5 hours
97+
TIMDEX_BUCKET=# when using CLI command 'timdex-sources-csv', this is required to know what TIMDEX bucket to use
9798
```
9899

99100
## CLI commands
@@ -143,7 +144,10 @@ Usage: -c run-diff [OPTIONS]
143144
144145
Options:
145146
-d, --job-directory TEXT Job directory to create. [required]
146-
-i, --input-files TEXT Input files to transform. [required]
147+
-i, --input-files TEXT Input files to transform. This may be a comma
148+
separated list of input files, or a local CSV file
149+
that provides a list of files. [required]
150+
-m, --message TEXT Message to describe Run.
147151
-h, --help Show this message and exit.
148152
```
149153

@@ -158,3 +162,23 @@ Options:
158162
-h, --help Show this message and exit.
159163
```
160164

165+
### `timdex-sources-csv`
166+
```text
167+
Usage: -c timdex-sources-csv [OPTIONS]
168+
169+
Generate a CSV of ordered extract files for all, or a subset, of TIMDEX
170+
sources.
171+
172+
This CSV may be passed to CLI command 'run-diff' for the '-i / --input-
173+
files' argument, serving as the list of input files for the run.
174+
175+
This command requires that env var 'TIMDEX_BUCKET' is set to establish what
176+
S3 bucket to use for scanning. The appropriate AWS credentials are also
177+
needed to be set.
178+
179+
Options:
180+
-o, --output-file TEXT Output filepath for CSV. [required]
181+
-s, --sources TEXT Optional comma separated list of sources to include.
182+
Default is all.
183+
-h, --help Show this message and exit.
184+
```

abdiff/cli.py

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import json
22
import logging
33
from datetime import timedelta
4+
from itertools import chain
45
from time import perf_counter
56

67
import click
8+
import pandas as pd
79
from click.exceptions import ClickException
810

911
from abdiff.config import Config, configure_logger
@@ -17,6 +19,7 @@
1719
)
1820
from abdiff.core import init_job as core_init_job
1921
from abdiff.core.utils import read_job_json
22+
from abdiff.extras.timdex_sources import get_ordered_extracted_files_all_sources
2023
from abdiff.webapp.app import app
2124

2225
logger = logging.getLogger(__name__)
@@ -132,14 +135,30 @@ def init_job(
132135
"--input-files",
133136
type=str,
134137
required=True,
135-
help="Input files to transform.",
138+
help=(
139+
"Input files to transform. This may be a comma separated list of input files, "
140+
"or a local CSV file that provides a list of files."
141+
),
136142
)
137-
def run_diff(job_directory: str, input_files: str) -> None:
143+
@click.option(
144+
"-m",
145+
"--message",
146+
type=str,
147+
required=False,
148+
help="Message to describe Run.",
149+
default="Not provided.",
150+
)
151+
def run_diff(job_directory: str, input_files: str, message: str) -> None:
138152

139153
job_data = read_job_json(job_directory)
140-
run_directory = init_run(job_directory)
154+
run_directory = init_run(job_directory, message=message)
141155

142-
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
156+
# handle CSV file containing input files
157+
if input_files.endswith(".csv"):
158+
input_files_list = pd.read_csv(input_files, header=None)[0].tolist()
159+
# else, assume comma separated list of extract files
160+
else:
161+
input_files_list = [filepath.strip() for filepath in input_files.split(",")]
143162

144163
ab_transformed_file_lists = run_ab_transforms(
145164
run_directory=run_directory,
@@ -181,3 +200,41 @@ def view_job(
181200
app.config.update(JOB_DIRECTORY=job_directory)
182201
logger.info("")
183202
app.run(host=config.webapp_host, port=config.webapp_port)
203+
204+
205+
@main.command()
@click.option(
    "-o",
    "--output-file",
    type=str,
    required=True,
    help="Output filepath for CSV.",
)
@click.option(
    "-s",
    "--sources",
    type=str,
    required=False,
    help="Optional comma separated list of sources to include. Default is all.",
)
def timdex_sources_csv(output_file: str, sources: str) -> None:
    """Generate a CSV of ordered extract files for all, or a subset, of TIMDEX sources.

    This CSV may be passed to CLI command 'run-diff' for the '-i / --input-files'
    argument, serving as the list of input files for the run.

    This command requires that env var 'TIMDEX_BUCKET' is set to establish what S3 bucket
    to use for scanning. The appropriate AWS credentials are also needed to be set.
    """
    # parse the optional comma separated sources; None means "all sources"
    selected = [source.strip() for source in sources.split(",")] if sources else None

    files_by_source = get_ordered_extracted_files_all_sources(sources=selected)

    # flatten the per-source lists into a single ordered column and write it
    # as a header-less, index-less single-column CSV
    flattened = list(chain.from_iterable(files_by_source.values()))
    pd.DataFrame(columns=["input_file"], data=flattened).to_csv(
        output_file, index=False, header=False
    )
    logger.info(f"Created file: {output_file}")

abdiff/config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class Config:
1515
"WEBAPP_PORT",
1616
"TRANSMOGRIFIER_MAX_WORKERS",
1717
"TRANSMOGRIFIER_TIMEOUT",
18+
"TIMDEX_BUCKET",
1819
)
1920

2021
def __getattr__(self, name: str) -> Any: # noqa: ANN401
@@ -45,6 +46,18 @@ def transmogrifier_timeout(self) -> int:
4546
timeout = self.TRANSMOGRIFIER_TIMEOUT or 60 * 60 * 5 # 5 hours default
4647
return int(timeout)
4748

49+
@property
def active_timdex_sources(self) -> list[str]:
    """TIMDEX source names included by default when scanning for extract files."""
    return list(
        (
            "alma",
            "aspace",
            "dspace",
            "gismit",
            "gisogm",
            "libguides",
            "researchdatabases",
        )
    )
60+
4861

4962
def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
5063
if verbose:

abdiff/extras/__init__.py

Whitespace-only changes.

abdiff/extras/timdex_sources.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""abdiff.helpers.timdex_sources"""
2+
3+
import datetime
4+
import logging
5+
import re
6+
7+
import boto3 # type: ignore[import-untyped]
8+
9+
from abdiff.config import Config
10+
11+
logger = logging.getLogger(__name__)
12+
13+
CONFIG = Config()
14+
15+
16+
def get_ordered_extracted_files_all_sources(
    sources: list[str] | None = None,
) -> dict[str, list[str]]:
    """Map each TIMDEX source name to its ordered list of extract file URIs.

    Args:
        sources: source names to include; when omitted or empty, the
            configured active TIMDEX sources are used.

    Returns:
        Dict of source name -> ordered extract file S3 URIs for that source.
    """
    selected = sources or CONFIG.active_timdex_sources
    ordered: dict[str, list[str]] = {}
    for name in selected:
        ordered[name] = get_ordered_extracted_files_since_last_full_run(source=name)
    return ordered
26+
27+
28+
def get_ordered_extracted_files_since_last_full_run(source: str) -> list[str]:
    """Get extract files, from last full run, through all daily runs, for a source.

    The most recent full-extract date is used as a cutoff: all full extract
    files carrying that date come first (sorted), followed by all daily
    extract files dated on or after the cutoff (sorted).
    """
    logger.info(f"Retrieving ordered extracted files for source: '{source}'")
    candidates = get_extracted_files_for_source(source)

    # isolate the full extract files; without any, there is nothing to order
    full_files = [name for name in candidates if _is_full_extract(name)]
    if not full_files:
        logger.warning("No full extracts found.")
        return []

    # cutoff = date of the most recent full extract
    cutoff = max(_extract_date(name) for name in full_files)

    # full extract files stamped with the cutoff date, sorted
    newest_full_files = sorted(
        name for name in full_files if _extract_date(name) == cutoff
    )

    # daily extract files on or after the cutoff date, sorted
    daily_files = sorted(
        name
        for name in candidates
        if _is_daily_extract(name) and _extract_date(name) >= cutoff
    )

    combined = newest_full_files + daily_files
    logger.info(f"Total files retrieved: {len(combined)}")
    return combined
59+
60+
61+
def _is_full_extract(filename: str) -> bool:
    """True when *filename* names a full-extract file (contains '-full-')."""
    return filename.find("-full-") >= 0
63+
64+
65+
def _is_daily_extract(filename: str) -> bool:
    """True when *filename* names a daily-extract file (contains '-daily-')."""
    return filename.find("-daily-") >= 0
67+
68+
69+
def _extract_date(filename: str) -> datetime.datetime:
70+
date_string = re.findall(r".+?(\d{4}-\d{2}-\d{2})", filename)[0]
71+
return datetime.datetime.strptime(date_string, "%Y-%m-%d").astimezone(datetime.UTC)
72+
73+
74+
def get_extracted_files_for_source(
    source: str,
    bucket: str | None = None,
) -> list[str]:
    """List S3 URIs for extract files in TIMDEX S3 bucket for a given source.

    Args:
        source: TIMDEX source name, used as the S3 key prefix for the scan.
        bucket: S3 bucket to scan; defaults to CONFIG.TIMDEX_BUCKET. Resolved
            at call time -- the previous default of ``CONFIG.TIMDEX_BUCKET``
            in the signature was captured once at import time, so the env var
            had to be set before this module was imported.

    Returns:
        S3 URIs whose key contains "extracted", excluding "folder" keys.
    """
    if bucket is None:
        bucket = CONFIG.TIMDEX_BUCKET
    s3_client = boto3.client("s3")
    files = []

    paginator = s3_client.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=source)

    for page in page_iterator:
        # "Contents" is absent for empty result pages
        for obj in page.get("Contents", []):
            if not obj["Key"].endswith("/"):  # skip folders
                s3_uri = f"s3://{bucket}/{obj['Key']}"
                files.append(s3_uri)

    # filter where "extracted" in filename
    return [file for file in files if "extracted" in file]

0 commit comments

Comments
 (0)