Skip to content

Commit b1a4438

Browse files
authored
Merge pull request #84 from MITLibraries/TIMX-401-expect-v2-datasets
TIMX 401 - expect and support v2 parquet dataset Transmogrifier output
2 parents b957d65 + a5ca7f4 commit b1a4438

24 files changed

+901
-3188
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,6 @@ repos:
2424
types: ["python"]
2525
- id: safety
2626
name: safety
27-
entry: pipenv check --ignore 70612
27+
entry: pipenv check
2828
language: system
2929
pass_filenames: false

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ ruff: # Run 'ruff' linter and print a preview of errors
4545
pipenv run ruff check .
4646

4747
safety: # Check for security vulnerabilities and verify Pipfile.lock is up-to-date
48-
pipenv check --ignore 70612
48+
pipenv check
4949
pipenv verify
5050

5151
lint-apply: black-apply ruff-apply # Apply changes with 'black' and resolve 'fixable errors' with 'ruff'

Pipfile.lock

Lines changed: 678 additions & 695 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -103,13 +103,9 @@ output/mvp
103103
├── run.json
104104
└── transformed
105105
├── a
106-
│ ├── alma-2023-02-19-daily-transformed-records-to-index.json
107-
│ ├── dspace-2024-10-11-daily-transformed-records-to-index.json
108-
│ └── libguides-2024-04-03-full-transformed-records-to-index.json
106+
│ └── dataset <parquet dataset from Transmogrifier>
109107
├── b
110-
│ ├── alma-2023-02-19-daily-transformed-records-to-index.json
111-
│ ├── dspace-2024-10-11-daily-transformed-records-to-index.json
112-
│ └── libguides-2024-04-03-full-transformed-records-to-index.json
108+
│ └── dataset <parquet dataset from Transmogrifier>
113109
└── logs.txt
114110
```
115111

@@ -175,8 +171,10 @@ Usage: -c init-job [OPTIONS]
175171
Options:
176172
-d, --job-directory TEXT Job directory to create. [required]
177173
-m, --message TEXT Message to describe Job.
174+
-la, --location-a TEXT Location to clone Transmogrifier version 'A'
178175
-a, --commit-sha-a TEXT Transmogrifier commit SHA for version 'A'
179176
[required]
177+
-lb, --location-b TEXT Location to clone Transmogrifier version 'B'
180178
-b, --commit-sha-b TEXT Transmogrifier commit SHA for version 'B'
181179
[required]
182180
-h, --help Show this message and exit.

abdiff/cli.py

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,13 +88,29 @@ def ping() -> None:
8888
help="Message to describe Job.",
8989
default="Not provided.",
9090
)
91+
@click.option(
92+
"-la",
93+
"--location-a",
94+
type=str,
95+
required=False,
96+
default="https://github.com/MITLibraries/transmogrifier.git",
97+
help="Location to clone Transmogrifier version 'A'",
98+
)
9199
@click.option(
92100
"-a",
93101
"--commit-sha-a",
94102
type=str,
95103
required=True,
96104
help="Transmogrifier commit SHA for version 'A'",
97105
)
106+
@click.option(
107+
"-lb",
108+
"--location-b",
109+
type=str,
110+
required=False,
111+
default="https://github.com/MITLibraries/transmogrifier.git",
112+
help="Location to clone Transmogrifier version 'B'",
113+
)
98114
@click.option(
99115
"-b",
100116
"--commit-sha-b",
@@ -106,7 +122,9 @@ def init_job(
106122
job_directory: str,
107123
message: str,
108124
commit_sha_a: str,
125+
location_a: str,
109126
commit_sha_b: str,
127+
location_b: str,
110128
) -> None:
111129
"""Initialize a new Job."""
112130
try:
@@ -119,7 +137,9 @@ def init_job(
119137

120138
build_ab_images(
121139
job_directory,
140+
location_a,
122141
commit_sha_a,
142+
location_b,
123143
commit_sha_b,
124144
)
125145

@@ -179,7 +199,7 @@ def run_diff(
179199
if download_files:
180200
download_input_files(input_files_list)
181201

182-
ab_transformed_file_lists = run_ab_transforms(
202+
ab_transformed_datasets = run_ab_transforms(
183203
run_directory=run_directory,
184204
image_tag_a=job_data["image_tag_a"],
185205
image_tag_b=job_data["image_tag_b"],
@@ -189,7 +209,7 @@ def run_diff(
189209

190210
collated_dataset_path = collate_ab_transforms(
191211
run_directory=run_directory,
192-
ab_transformed_file_lists=ab_transformed_file_lists,
212+
ab_transformed_datasets=ab_transformed_datasets,
193213
)
194214

195215
diffs_dataset_path = calc_ab_diffs(

abdiff/core/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,13 @@
1414
from abdiff.extras.minio.download_input_files import download_input_files
1515

1616
__all__ = [
17-
"init_job",
18-
"init_run",
1917
"build_ab_images",
20-
"download_input_files",
21-
"run_ab_transforms",
22-
"collate_ab_transforms",
2318
"calc_ab_diffs",
2419
"calc_ab_metrics",
20+
"collate_ab_transforms",
2521
"create_final_records",
22+
"download_input_files",
23+
"init_job",
24+
"init_run",
25+
"run_ab_transforms",
2626
]

abdiff/core/build_ab_images.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,30 +15,34 @@
1515

1616
def build_ab_images(
1717
job_directory: str,
18+
location_a: str,
1819
commit_sha_a: str,
20+
location_b: str,
1921
commit_sha_b: str,
2022
docker_client: docker.client.DockerClient | None = None,
2123
) -> tuple[str, str]:
2224
"""Build Docker images based on 2 commit SHAs.
2325
2426
Args:
2527
job_directory: The directory containing all files related to a job.
28+
location_a: Location of Transmogrifier version 'A' to clone from.
2629
commit_sha_a: The SHA of the first commit for comparison.
30+
location_b: Location of Transmogrifier version 'B' to clone from.
2731
commit_sha_b: The SHA of the second commit for comparison.
2832
docker_client: A configured Docker client.
2933
"""
3034
if not docker_client:
3135
docker_client = docker.from_env()
3236

3337
image_tags = []
34-
for commit_sha in [commit_sha_a, commit_sha_b]:
38+
for location, commit_sha in [(location_a, commit_sha_a), (location_b, commit_sha_b)]:
3539
logger.debug(f"Processing commit: {commit_sha}")
3640
image_tag = generate_image_name(commit_sha)
3741
if docker_image_exists(docker_client, image_tag):
3842
logger.debug(f"Docker image already exists with tag: {image_tag}")
3943
image_tags.append(image_tag)
4044
else:
41-
image = build_image(commit_sha, docker_client)
45+
image = build_image(location, commit_sha, docker_client)
4246
image_tags.append(image.tags[0])
4347
logger.debug(f"Finished processing commit: {commit_sha}")
4448

@@ -67,43 +71,44 @@ def docker_image_exists(
6771

6872

6973
def build_image(
74+
location: str,
7075
commit_sha: str,
7176
docker_client: docker.client.DockerClient,
7277
) -> docker.models.images.Image:
7378
"""Clone repo and build Docker image.
7479
7580
Args:
76-
job_directory: The directory containing all files related to a job.
81+
location: Location of Transmogrifier to clone from.
7782
commit_sha: The SHA of the commit.
7883
docker_client: A configured Docker client.
7984
"""
8085
with tempfile.TemporaryDirectory() as clone_directory:
8186
image_tag = generate_image_name(commit_sha)
82-
clone_repo_and_reset_to_commit(clone_directory, commit_sha)
87+
clone_repo_and_reset_to_commit(location, clone_directory, commit_sha)
8388
image, _ = docker_client.images.build(path=clone_directory, tag=image_tag)
8489
logger.debug(f"Docker image created with tag: {image}")
8590
return image
8691

8792

88-
def clone_repo_and_reset_to_commit(clone_directory: str, commit_sha: str) -> None:
93+
def clone_repo_and_reset_to_commit(
94+
location: str,
95+
clone_directory: str,
96+
commit_sha: str,
97+
) -> None:
8998
"""Clone GitHub repo and reset to a specified commit.
9099
91100
Args:
101+
location: Location of Transmogrifier to clone from.
92102
clone_directory: The directory for the cloned repo.
93103
commit_sha: The SHA of a repo commit.
94104
"""
95-
logger.debug(f"Cloning repo to: {clone_directory}")
96-
transmogrifier_url = "https://github.com/MITLibraries/transmogrifier.git"
105+
logger.debug(f"Cloning repo from: {location}, to: {clone_directory}")
97106
repository = clone_repository(
98-
transmogrifier_url,
107+
location,
99108
clone_directory,
100109
)
101-
logger.debug(f"Cloned repo to: {clone_directory}")
102-
103110
try:
104111
repository.reset(commit_sha, ResetMode.HARD)
105112
logger.debug(f"Cloned repo reset to commit: {commit_sha}")
106113
except KeyError as exception:
107-
raise InvalidRepositoryCommitSHAError(
108-
transmogrifier_url, commit_sha
109-
) from exception
114+
raise InvalidRepositoryCommitSHAError(location, commit_sha) from exception

0 commit comments

Comments
 (0)