Skip to content

Commit 37c1d40

Browse files
author
Bruno Grande
authored
Merge pull request #25 from Sage-Bionetworks-Workflows/bgrande/ORCA-167/lazy-staging
[ORCA-167] Stage files lazily
2 parents 70d919f + cceede9 commit 37c1d40

File tree

6 files changed

+74
-74
lines changed

6 files changed

+74
-74
lines changed

src/dcqc/file.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,9 @@ def stage(
334334
if self._local_path is not None:
335335
return self._local_path
336336
else:
337-
destination_str = mkdtemp()
337+
# TODO: This prefix is used by nf-dcqc to easily find the staged file.
338+
# It might be worth using a DCQCTMPDIR to avoid hard-coding this.
339+
destination_str = mkdtemp(prefix="dcqc-staged-")
338340
destination = Path(destination_str)
339341

340342
# By this point, destination is defined (not None)

src/dcqc/main.py

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import sys
22
from csv import DictWriter
33
from pathlib import Path
4-
from typing import List, Optional
4+
from typing import List
55

66
from typer import Argument, Exit, Option, Typer
77

@@ -28,11 +28,8 @@
2828

2929
# Common options
3030
overwrite_opt = Option(False, "--overwrite", "-f", help="Ignore existing files")
31-
required_tests_opt = Option(None, "--required-tests", "-rt", help="Required tests")
32-
skipped_tests_opt = Option(None, "--skipped-tests", "-st", help="Skipped tests")
33-
stage_files_opt = Option(False, "--stage-files", "-sf", help="Stage remote files.")
34-
prt_help = "Update paths to be relative to given directory upon serialization."
35-
paths_relative_to_opt = Option(None, "--paths-relative-to", "-prt", help=prt_help)
31+
required_tests_opt = Option(None, "--required-tests", "-r", help="Required tests")
32+
skipped_tests_opt = Option(None, "--skipped-tests", "-s", help="Skipped tests")
3633

3734

3835
@app.callback()
@@ -48,12 +45,11 @@ def create_targets(
4845
input_csv: Path = input_path_arg,
4946
output_dir: Path = output_dir_path_arg,
5047
overwrite: bool = overwrite_opt,
51-
stage_files: bool = stage_files_opt,
5248
):
5349
"""Create target JSON files from a targets CSV file"""
5450
output_dir.mkdir(parents=True, exist_ok=True)
5551

56-
parser = CsvParser(input_csv, stage_files)
52+
parser = CsvParser(input_csv)
5753
targets = parser.create_targets()
5854

5955
# Naming the targets by index to ensure no clashes
@@ -63,24 +59,6 @@ def create_targets(
6359
report.save_many(named_targets, output_dir.as_posix(), overwrite)
6460

6561

66-
@app.command()
67-
def stage_target(
68-
input_json: Path = input_path_arg,
69-
output_json: str = output_arg,
70-
output_dir: Path = output_dir_path_arg,
71-
overwrite: bool = overwrite_opt,
72-
paths_relative_to: Optional[Path] = paths_relative_to_opt,
73-
):
74-
"""Create local file copies from a target JSON file"""
75-
output_dir.mkdir(parents=True, exist_ok=True)
76-
77-
target = JsonParser.parse_object(input_json, Target)
78-
target.stage(output_dir, overwrite)
79-
80-
report = JsonReport(paths_relative_to)
81-
report.save(target, output_json, overwrite)
82-
83-
8462
@app.command()
8563
def create_tests(
8664
input_json: Path = input_path_arg,
@@ -93,8 +71,8 @@ def create_tests(
9371
output_dir.mkdir(parents=True, exist_ok=True)
9472

9573
# Interpret empty lists from CLI as None (to auto-generate values)
96-
required_tests_maybe = required_tests if len(required_tests) > 0 else None
97-
skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
74+
required_tests_maybe = required_tests if required_tests else None
75+
skipped_tests_maybe = skipped_tests if skipped_tests else None
9876

9977
target = JsonParser.parse_object(input_json, Target)
10078
suite = SuiteABC.from_target(target, required_tests_maybe, skipped_tests_maybe)
@@ -150,8 +128,8 @@ def create_suite(
150128
):
151129
"""Create a suite from a set of test JSON files sharing the same target"""
152130
# Interpret empty lists from CLI as None (to auto-generate values)
153-
required_tests_maybe = required_tests if len(required_tests) > 0 else None
154-
skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
131+
required_tests_maybe = required_tests if required_tests else None
132+
skipped_tests_maybe = skipped_tests if skipped_tests else None
155133

156134
tests = [JsonParser.parse_object(test_json, TestABC) for test_json in input_jsons]
157135
suite = SuiteABC.from_tests(tests, required_tests_maybe, skipped_tests_maybe)

src/dcqc/tests/test_abc.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,40 @@ def get_status(self, compute_ok: bool = True) -> TestStatus:
6969
self._status = self.compute_status()
7070
return self._status
7171

72-
def _get_single_target_file(self) -> File:
73-
files = self.target.files
72+
def get_files(self, staged: bool = True) -> list[File]:
73+
"""Get and stage files for target.
74+
75+
Args:
76+
staged: Whether to make sure that the files are staged.
77+
Defaults to True.
78+
79+
Returns:
80+
Staged target files.
81+
"""
82+
files = []
83+
for file in self.target.files:
84+
if staged:
85+
file.stage()
86+
files.append(file)
87+
return files
88+
89+
def get_file(self, staged: bool = True) -> File:
90+
"""Get and stage file for single-file target.
91+
92+
Args:
93+
staged: Whether to make sure that the files are staged.
94+
Defaults to True.
95+
96+
Raises:
97+
ValueError: If the target has multiple files.
98+
99+
Returns:
100+
Staged target file.
101+
"""
102+
files = self.get_files(staged)
103+
if len(files) != 1:
104+
message = "This method only supports single-file targets."
105+
raise ValueError(message)
74106
return files[0]
75107

76108
@classmethod

src/dcqc/tests/tests.py

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import hashlib
22
import json
3+
from pathlib import Path
34

4-
from dcqc.file import File
55
from dcqc.tests.test_abc import ExternalTestMixin, Process, TestABC, TestStatus
66

77

@@ -11,7 +11,7 @@ class FileExtensionTest(TestABC):
1111

1212
def compute_status(self) -> TestStatus:
1313
status = TestStatus.PASS
14-
for file in self.target.files:
14+
for file in self.get_files(staged=False):
1515
file_type = file.get_file_type()
1616
file_extensions = file_type.file_extensions
1717
if not file.name.endswith(file_extensions):
@@ -26,18 +26,17 @@ class Md5ChecksumTest(TestABC):
2626

2727
def compute_status(self) -> TestStatus:
2828
status = TestStatus.PASS
29-
for file in self.target.files:
29+
for file in self.get_files():
3030
expected_md5 = file.get_metadata("md5_checksum")
31-
actual_md5 = self._compute_md5_checksum(file)
31+
actual_md5 = self._compute_md5_checksum(file.local_path)
3232
if expected_md5 != actual_md5:
3333
status = TestStatus.FAIL
3434
break
3535
return status
3636

37-
def _compute_md5_checksum(self, file: File) -> str:
38-
local_path = file.local_path
37+
def _compute_md5_checksum(self, path: Path) -> str:
3938
hash_md5 = hashlib.md5()
40-
with local_path.open("rb") as infile:
39+
with path.open("rb") as infile:
4140
for chunk in iter(lambda: infile.read(4096), b""):
4241
hash_md5.update(chunk)
4342
actual_md5 = hash_md5.hexdigest()
@@ -50,16 +49,15 @@ class JsonLoadTest(TestABC):
5049

5150
def compute_status(self) -> TestStatus:
5251
status = TestStatus.PASS
53-
for file in self.target.files:
54-
if not self._can_be_loaded(file):
52+
for file in self.get_files():
53+
if not self._can_be_loaded(file.local_path):
5554
status = TestStatus.FAIL
5655
break
5756
return status
5857

59-
def _can_be_loaded(self, file: File) -> bool:
58+
def _can_be_loaded(self, path: Path) -> bool:
6059
success = True
61-
local_path = file.local_path
62-
with local_path.open("r") as infile:
60+
with path.open("r") as infile:
6361
try:
6462
json.load(infile)
6563
except Exception:
@@ -73,19 +71,18 @@ class JsonLdLoadTest(TestABC):
7371

7472
def compute_status(self) -> TestStatus:
7573
status = TestStatus.PASS
76-
for file in self.target.files:
77-
if not self._can_be_loaded(file):
74+
for file in self.get_files():
75+
if not self._can_be_loaded(file.local_path):
7876
status = TestStatus.FAIL
7977
break
8078
return status
8179

82-
def _can_be_loaded(self, file: File) -> bool:
80+
def _can_be_loaded(self, path: Path) -> bool:
8381
rdflib = self.import_module("rdflib")
8482
graph = rdflib.Graph()
8583

8684
success = True
87-
local_path = file.local_path
88-
with local_path.open("r") as infile:
85+
with path.open("r") as infile:
8986
try:
9087
graph.parse(infile, format="json-ld")
9188
except Exception:
@@ -97,9 +94,8 @@ class LibTiffInfoTest(ExternalTestMixin, TestABC):
9794
tier = 2
9895

9996
def generate_process(self) -> Process:
100-
file = self._get_single_target_file()
101-
path = file.local_path.as_posix()
102-
command_args = ["tiffinfo", path]
97+
file = self.get_file()
98+
command_args = ["tiffinfo", file.local_path.as_posix()]
10399
process = Process(
104100
container="quay.io/sagebionetworks/libtiff:2.0",
105101
command_args=command_args,
@@ -111,14 +107,13 @@ class BioFormatsInfoTest(ExternalTestMixin, TestABC):
111107
tier = 2
112108

113109
def generate_process(self) -> Process:
114-
file = self._get_single_target_file()
115-
path = file.local_path.as_posix()
110+
file = self.get_file()
116111
command_args = [
117112
"/opt/bftools/showinf",
118113
"-nopix",
119114
"-novalid",
120115
"-nocore",
121-
path,
116+
file.local_path.as_posix(),
122117
]
123118
process = Process(
124119
container="quay.io/sagebionetworks/bftools:latest",
@@ -131,11 +126,10 @@ class OmeXmlSchemaTest(ExternalTestMixin, TestABC):
131126
tier = 2
132127

133128
def generate_process(self) -> Process:
134-
file = self._get_single_target_file()
135-
path = file.local_path.as_posix()
129+
file = self.get_file()
136130
command_args = [
137131
"/opt/bftools/xmlvalid",
138-
path,
132+
file.local_path.as_posix(),
139133
]
140134
process = Process(
141135
container="quay.io/sagebionetworks/bftools:latest",

tests/test_main.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,27 +55,13 @@ def test_create_targets(get_data, get_output):
5555
assert len(list(output_dir.iterdir())) > 0
5656

5757

58-
def test_stage_target(get_data, get_output):
59-
input_json = get_data("target.json")
60-
output_json = get_output("stage_target/target.staged.json")
61-
output_dir = get_output("stage_target/targets")
62-
output_json.unlink(missing_ok=True)
63-
shutil.rmtree(output_dir, ignore_errors=True)
64-
65-
assert not output_dir.exists()
66-
args = ["stage-target", "-prt", ".", input_json, output_json, output_dir]
67-
result = run_command(args)
68-
check_command_result(result)
69-
assert len(list(output_dir.iterdir())) > 0
70-
71-
7258
def test_create_tests(get_data, get_output):
7359
input_json = get_data("target.json")
7460
output_dir = get_output("create_tests")
7561
shutil.rmtree(output_dir, ignore_errors=True)
7662

7763
assert not output_dir.exists()
78-
args = ["create-tests", "-rt", "Md5ChecksumTest", input_json, output_dir]
64+
args = ["create-tests", "-r", "Md5ChecksumTest", input_json, output_dir]
7965
result = run_command(args)
8066
check_command_result(result)
8167
assert len(list(output_dir.iterdir())) > 0

tests/test_tests.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,11 @@ def test_that_a_process_can_be_serialized_and_deserialized():
190190
process_dict = process.to_dict()
191191
process_from_dict = Process.from_dict(process_dict)
192192
assert process_dict == process_from_dict.to_dict()
193+
194+
195+
def test_for_an_error_when_getting_one_file_from_multi_file_target(test_files):
196+
file = test_files["good"]
197+
target = Target(file, file)
198+
test = tests.FileExtensionTest(target)
199+
with pytest.raises(ValueError):
200+
test.get_file()

0 commit comments

Comments
 (0)