Merge pull request #25 from Sage-Bionetworks-Workflows/bgrande/ORCA-167/lazy-staging

Bruno Grande · web-flow · commit 37c1d40f818b · 2023-04-03T13:14:11.000-07:00
[ORCA-167] Stage files lazily
diff --git a/src/dcqc/file.py b/src/dcqc/file.py
@@ -334,7 +334,9 @@ def stage(
             if self._local_path is not None:
                 return self._local_path
             else:
-                destination_str = mkdtemp()
+                # TODO: This prefix is used by nf-dcqc to easily find the staged file.
+                #       It might be worth using a DCQCTMPDIR to avoid hard-coding this.
+                destination_str = mkdtemp(prefix="dcqc-staged-")
                 destination = Path(destination_str)
 
         # By this point, destination is defined (not None)
diff --git a/src/dcqc/main.py b/src/dcqc/main.py
@@ -1,7 +1,7 @@
 import sys
 from csv import DictWriter
 from pathlib import Path
-from typing import List, Optional
+from typing import List
 
 from typer import Argument, Exit, Option, Typer
 
@@ -28,11 +28,8 @@
 
 # Common options
 overwrite_opt = Option(False, "--overwrite", "-f", help="Ignore existing files")
-required_tests_opt = Option(None, "--required-tests", "-rt", help="Required tests")
-skipped_tests_opt = Option(None, "--skipped-tests", "-st", help="Skipped tests")
-stage_files_opt = Option(False, "--stage-files", "-sf", help="Stage remote files.")
-prt_help = "Update paths to be relative to given directory upon serialization."
-paths_relative_to_opt = Option(None, "--paths-relative-to", "-prt", help=prt_help)
+required_tests_opt = Option(None, "--required-tests", "-r", help="Required tests")
+skipped_tests_opt = Option(None, "--skipped-tests", "-s", help="Skipped tests")
 
 
 @app.callback()
@@ -48,12 +45,11 @@ def create_targets(
     input_csv: Path = input_path_arg,
     output_dir: Path = output_dir_path_arg,
     overwrite: bool = overwrite_opt,
-    stage_files: bool = stage_files_opt,
 ):
     """Create target JSON files from a targets CSV file"""
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    parser = CsvParser(input_csv, stage_files)
+    parser = CsvParser(input_csv)
     targets = parser.create_targets()
 
     # Naming the targets by index to ensure no clashes
@@ -63,24 +59,6 @@ def create_targets(
     report.save_many(named_targets, output_dir.as_posix(), overwrite)
 
 
-@app.command()
-def stage_target(
-    input_json: Path = input_path_arg,
-    output_json: str = output_arg,
-    output_dir: Path = output_dir_path_arg,
-    overwrite: bool = overwrite_opt,
-    paths_relative_to: Optional[Path] = paths_relative_to_opt,
-):
-    """Create local file copies from a target JSON file"""
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    target = JsonParser.parse_object(input_json, Target)
-    target.stage(output_dir, overwrite)
-
-    report = JsonReport(paths_relative_to)
-    report.save(target, output_json, overwrite)
-
-
 @app.command()
 def create_tests(
     input_json: Path = input_path_arg,
@@ -93,8 +71,8 @@ def create_tests(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Interpret empty lists from CLI as None (to auto-generate values)
-    required_tests_maybe = required_tests if len(required_tests) > 0 else None
-    skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
+    required_tests_maybe = required_tests if required_tests else None
+    skipped_tests_maybe = skipped_tests if skipped_tests else None
 
     target = JsonParser.parse_object(input_json, Target)
     suite = SuiteABC.from_target(target, required_tests_maybe, skipped_tests_maybe)
@@ -150,8 +128,8 @@ def create_suite(
 ):
     """Create a suite from a set of test JSON files sharing the same target"""
     # Interpret empty lists from CLI as None (to auto-generate values)
-    required_tests_maybe = required_tests if len(required_tests) > 0 else None
-    skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
+    required_tests_maybe = required_tests if required_tests else None
+    skipped_tests_maybe = skipped_tests if skipped_tests else None
 
     tests = [JsonParser.parse_object(test_json, TestABC) for test_json in input_jsons]
     suite = SuiteABC.from_tests(tests, required_tests_maybe, skipped_tests_maybe)
diff --git a/src/dcqc/tests/test_abc.py b/src/dcqc/tests/test_abc.py
@@ -69,8 +69,40 @@ def get_status(self, compute_ok: bool = True) -> TestStatus:
             self._status = self.compute_status()
         return self._status
 
-    def _get_single_target_file(self) -> File:
-        files = self.target.files
+    def get_files(self, staged: bool = True) -> list[File]:
+        """Get and stage files for target.
+
+        Args:
+            staged: Whether to make sure that the files are staged.
+            Defaults to True.
+
+        Returns:
+            Staged target files.
+        """
+        files = []
+        for file in self.target.files:
+            if staged:
+                file.stage()
+            files.append(file)
+        return files
+
+    def get_file(self, staged: bool = True) -> File:
+        """Get and stage file for single-file target.
+
+        Args:
+            staged: Whether to make sure that the files are staged.
+            Defaults to True.
+
+        Raises:
+            ValueError: If the target has multiple files.
+
+        Returns:
+            Staged target file.
+        """
+        files = self.get_files(staged)
+        if len(files) != 1:
+            message = "This method only supports single-file targets."
+            raise ValueError(message)
         return files[0]
 
     @classmethod
diff --git a/src/dcqc/tests/tests.py b/src/dcqc/tests/tests.py
@@ -1,7 +1,7 @@
 import hashlib
 import json
+from pathlib import Path
 
-from dcqc.file import File
 from dcqc.tests.test_abc import ExternalTestMixin, Process, TestABC, TestStatus
 
 
@@ -11,7 +11,7 @@ class FileExtensionTest(TestABC):
 
     def compute_status(self) -> TestStatus:
         status = TestStatus.PASS
-        for file in self.target.files:
+        for file in self.get_files(staged=False):
             file_type = file.get_file_type()
             file_extensions = file_type.file_extensions
             if not file.name.endswith(file_extensions):
@@ -26,18 +26,17 @@ class Md5ChecksumTest(TestABC):
 
     def compute_status(self) -> TestStatus:
         status = TestStatus.PASS
-        for file in self.target.files:
+        for file in self.get_files():
             expected_md5 = file.get_metadata("md5_checksum")
-            actual_md5 = self._compute_md5_checksum(file)
+            actual_md5 = self._compute_md5_checksum(file.local_path)
             if expected_md5 != actual_md5:
                 status = TestStatus.FAIL
                 break
         return status
 
-    def _compute_md5_checksum(self, file: File) -> str:
-        local_path = file.local_path
+    def _compute_md5_checksum(self, path: Path) -> str:
         hash_md5 = hashlib.md5()
-        with local_path.open("rb") as infile:
+        with path.open("rb") as infile:
             for chunk in iter(lambda: infile.read(4096), b""):
                 hash_md5.update(chunk)
         actual_md5 = hash_md5.hexdigest()
@@ -50,16 +49,15 @@ class JsonLoadTest(TestABC):
 
     def compute_status(self) -> TestStatus:
         status = TestStatus.PASS
-        for file in self.target.files:
-            if not self._can_be_loaded(file):
+        for file in self.get_files():
+            if not self._can_be_loaded(file.local_path):
                 status = TestStatus.FAIL
                 break
         return status
 
-    def _can_be_loaded(self, file: File) -> bool:
+    def _can_be_loaded(self, path: Path) -> bool:
         success = True
-        local_path = file.local_path
-        with local_path.open("r") as infile:
+        with path.open("r") as infile:
             try:
                 json.load(infile)
             except Exception:
@@ -73,19 +71,18 @@ class JsonLdLoadTest(TestABC):
 
     def compute_status(self) -> TestStatus:
         status = TestStatus.PASS
-        for file in self.target.files:
-            if not self._can_be_loaded(file):
+        for file in self.get_files():
+            if not self._can_be_loaded(file.local_path):
                 status = TestStatus.FAIL
                 break
         return status
 
-    def _can_be_loaded(self, file: File) -> bool:
+    def _can_be_loaded(self, path: Path) -> bool:
         rdflib = self.import_module("rdflib")
         graph = rdflib.Graph()
 
         success = True
-        local_path = file.local_path
-        with local_path.open("r") as infile:
+        with path.open("r") as infile:
             try:
                 graph.parse(infile, format="json-ld")
             except Exception:
@@ -97,9 +94,8 @@ class LibTiffInfoTest(ExternalTestMixin, TestABC):
     tier = 2
 
     def generate_process(self) -> Process:
-        file = self._get_single_target_file()
-        path = file.local_path.as_posix()
-        command_args = ["tiffinfo", path]
+        file = self.get_file()
+        command_args = ["tiffinfo", file.local_path.as_posix()]
         process = Process(
             container="quay.io/sagebionetworks/libtiff:2.0",
             command_args=command_args,
@@ -111,14 +107,13 @@ class BioFormatsInfoTest(ExternalTestMixin, TestABC):
     tier = 2
 
     def generate_process(self) -> Process:
-        file = self._get_single_target_file()
-        path = file.local_path.as_posix()
+        file = self.get_file()
         command_args = [
             "/opt/bftools/showinf",
             "-nopix",
             "-novalid",
             "-nocore",
-            path,
+            file.local_path.as_posix(),
         ]
         process = Process(
             container="quay.io/sagebionetworks/bftools:latest",
@@ -131,11 +126,10 @@ class OmeXmlSchemaTest(ExternalTestMixin, TestABC):
     tier = 2
 
     def generate_process(self) -> Process:
-        file = self._get_single_target_file()
-        path = file.local_path.as_posix()
+        file = self.get_file()
         command_args = [
             "/opt/bftools/xmlvalid",
-            path,
+            file.local_path.as_posix(),
         ]
         process = Process(
             container="quay.io/sagebionetworks/bftools:latest",
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -55,27 +55,13 @@ def test_create_targets(get_data, get_output):
     assert len(list(output_dir.iterdir())) > 0
 
 
-def test_stage_target(get_data, get_output):
-    input_json = get_data("target.json")
-    output_json = get_output("stage_target/target.staged.json")
-    output_dir = get_output("stage_target/targets")
-    output_json.unlink(missing_ok=True)
-    shutil.rmtree(output_dir, ignore_errors=True)
-
-    assert not output_dir.exists()
-    args = ["stage-target", "-prt", ".", input_json, output_json, output_dir]
-    result = run_command(args)
-    check_command_result(result)
-    assert len(list(output_dir.iterdir())) > 0
-
-
 def test_create_tests(get_data, get_output):
     input_json = get_data("target.json")
     output_dir = get_output("create_tests")
     shutil.rmtree(output_dir, ignore_errors=True)
 
     assert not output_dir.exists()
-    args = ["create-tests", "-rt", "Md5ChecksumTest", input_json, output_dir]
+    args = ["create-tests", "-r", "Md5ChecksumTest", input_json, output_dir]
     result = run_command(args)
     check_command_result(result)
     assert len(list(output_dir.iterdir())) > 0
diff --git a/tests/test_tests.py b/tests/test_tests.py
@@ -190,3 +190,11 @@ def test_that_a_process_can_be_serialized_and_deserialized():
     process_dict = process.to_dict()
     process_from_dict = Process.from_dict(process_dict)
     assert process_dict == process_from_dict.to_dict()
+
+
+def test_for_an_error_when_getting_one_file_from_multi_file_target(test_files):
+    file = test_files["good"]
+    target = Target(file, file)
+    test = tests.FileExtensionTest(target)
+    with pytest.raises(ValueError):
+        test.get_file()