Skip to content

Commit 5e2ca71

Browse files
author
Bruno Grande
authored
Merge pull request #4 from Sage-Bionetworks-Workflows/bgrande/files-and-targets
Implement `FileType`, `File`, and `Target` to represent the targets of quality control
2 parents 41459ac + 58b09f7 commit 5e2ca71

File tree

8 files changed

+424
-4
lines changed

8 files changed

+424
-4
lines changed

src/dcqc/file.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
from __future__ import annotations
2+
3+
import os
4+
import re
5+
from collections.abc import Collection, Mapping
6+
from copy import deepcopy
7+
from dataclasses import asdict, dataclass
8+
from pathlib import Path, PurePosixPath
9+
from typing import Any, Optional
10+
11+
from dcqc.mixins import SerializableMixin
12+
from dcqc.utils import open_parent_fs
13+
14+
15+
@dataclass
class FileType:
    """Named file type with its file extensions.

    Every instance registers itself in a class-level registry under its
    lower-cased name, so it can later be looked up case-insensitively
    with ``get_file_type()``.

    Args:
        name: Name of the file type.
        file_extensions: File extensions associated with the file type.

    Raises:
        ValueError: If a file type with the same (case-insensitive)
            name has already been registered.
    """

    # Shared registry (class attribute). It is deliberately left without
    # a type hint so that @dataclass doesn't treat it as an instance field.
    _registry = {}  # type: ignore

    # Instance attributes
    name: str
    file_extensions: tuple[str, ...]

    def __init__(self, name: str, file_extensions: Collection[str]):
        self.name = name
        self.file_extensions = tuple(file_extensions)
        self.register_file_type(self)

    @classmethod
    def register_file_type(cls, self):
        """Add a file type to the registry under its lower-cased name.

        Raises:
            ValueError: If the name is already present in the registry.
        """
        name = self.name.lower()
        if name not in cls._registry:
            cls._registry[name] = self
            return
        message = f"File type ({name}) is already registered ({self._registry})."
        raise ValueError(message)

    @classmethod
    def get_file_type(cls, file_type: str) -> FileType:
        """Retrieve a registered file type by name (case-insensitive).

        Raises:
            ValueError: If no file type is registered under that name.
        """
        file_type = file_type.lower()
        if file_type in cls._registry:
            return cls._registry[file_type]
        types = list(cls._registry)
        message = f"File type ({file_type}) not among available options ({types})."
        raise ValueError(message)
47+
48+
49+
# TODO: These file types could be moved to an external file
# Instantiating a FileType is enough: its constructor registers
# the new instance with the FileType class
FileType(name="*", file_extensions=())  # To represent all file types
FileType(name="TXT", file_extensions=(".txt",))
FileType(name="TIFF", file_extensions=(".tif", ".tiff"))
FileType(name="OME-TIFF", file_extensions=(".ome.tif", ".ome.tiff"))
55+
56+
57+
@dataclass
class File(SerializableMixin):
    """Local or remote file paired with its metadata.

    Args:
        url: Local path or URL of the file. Relative local paths are
            interpreted relative to ``relative_to`` and stored relative
            to the current working directory.
        metadata: File metadata. It must contain a ``file_type`` entry,
            which is removed from the metadata and stored in ``type``.
        relative_to: Directory used for interpreting relative local
            paths. Defaults to the current working directory.

    Raises:
        ValueError: If ``file_type`` is missing from the metadata.
    """

    url: str
    metadata: dict[str, Any]
    type: str

    # A URL is considered local if it has no scheme
    # or if its scheme is either `file` or `osfs`
    LOCAL_REGEX = re.compile(r"((file|osfs)://)?/?[^:]+")

    def __init__(
        self,
        url: str,
        metadata: Mapping[str, Any],
        relative_to: Optional[Path] = None,
    ):
        relative_to = relative_to or Path.cwd()
        if self.is_local(url):
            scheme, separator, resource = url.rpartition("://")
            path = Path(resource)
            if not path.is_absolute():
                resource = os.path.relpath(relative_to / resource)
            url = "".join([scheme, separator, resource])
        self.url = str(url)
        self.metadata = dict(metadata)
        self.type = self._pop_file_type()
        self.file_name = self._get_file_name()
        self._fs = None

    @property
    def fs(self):
        """Filesystem of the file's parent location (opened lazily).

        Raises:
            ValueError: If the base name derived from the URL by
                ``open_parent_fs`` differs from ``file_name``.
        """
        if self._fs is None:
            fname = self.file_name
            fs, bname = open_parent_fs(self.url)
            if bname != fname:
                message = f"Inconsistent file names: FS ({bname}) and File ({fname})."
                raise ValueError(message)
            self._fs = fs
        return self._fs

    def _pop_file_type(self) -> str:
        """Remove ``file_type`` from the metadata and return its value."""
        file_type = self.get_metadata("file_type")
        del self.metadata["file_type"]
        return file_type

    def _get_file_name(self) -> str:
        """Return the last path component of the current URL."""
        path = PurePosixPath(self.url)
        return path.name

    def get_file_type(self) -> FileType:
        """Return the registered ``FileType`` matching ``type``."""
        return FileType.get_file_type(self.type)

    def get_metadata(self, key: str) -> Any:
        """Return the metadata value stored under ``key``.

        Raises:
            ValueError: If ``key`` is absent from the metadata.
        """
        if key not in self.metadata:
            url = self.url
            md = self.metadata
            message = f"File ({url}) does not have '{key}' in its metadata ({md})."
            raise ValueError(message)
        return self.metadata[key]

    def is_local(self, url: Optional[str] = None) -> bool:
        """Check whether a URL (by default, the file's own) is local."""
        url = url or self.url
        return self.LOCAL_REGEX.fullmatch(url) is not None

    # TODO: Create a new instance attribute `self._local_path` for keeping
    #       track of the local path instead of overwriting `self.url`
    def get_local_path(self) -> str:
        """Return the local path of the file.

        Raises:
            FileNotFoundError: If the file is not local.
        """
        if not self.is_local():
            message = f"File ({self.url}) should first be downloaded using stage()."
            raise FileNotFoundError(message)
        local_path = self.url
        if local_path.startswith(("osfs://", "file://")):
            local_path = self.fs.getsyspath(self.file_name)
        return local_path

    def stage(self, destination: Optional[str] = None, overwrite: bool = False) -> str:
        """Download remote files and symlink local files.

        A destination is required for remote files.
        Local files aren't moved if a destination is omitted.

        Args:
            destination (Optional[str]): File or folder path
                where to store the file. Defaults to None.
            overwrite (bool): Whether to replace an existing file
                at the target destination. Defaults to False.

        Raises:
            ValueError: If a destination is not specified
                when staging a remote file.
            ValueError: If the parent directory of the
                destination does not exist.
            FileExistsError: If the destination file already
                exists and ``overwrite`` was not enabled.

        Returns:
            str: The updated URL (i.e., location) of the file.
        """
        if not destination:
            if self.is_local():
                return self.url
            message = f"Destination is required for remote files ({self.url})."
            raise ValueError(message)

        # By this point, destination is defined (not None)
        file_name = self._get_file_name()
        destination_path = Path(destination)
        if destination_path.is_dir():
            destination_path = destination_path / file_name
            destination = destination_path.as_posix()

        if not destination_path.parent.exists():
            message = f"Parent folder of destination ({destination}) does not exist."
            raise ValueError(message)

        # `exists()` follows symlinks, so dangling links are checked separately
        already_exists = destination_path.exists() or destination_path.is_symlink()
        if already_exists and not overwrite:
            message = f"Destination ({destination}) already exists. Enable overwrite."
            raise FileExistsError(message)

        if self.is_local():
            # A relative link target would be interpreted relative to the
            # link's folder (not the working directory), hence `resolve()`
            source_path = Path(self.get_local_path()).resolve()
            # Never replace the source file with a link to itself
            if destination_path.resolve() != source_path:
                if already_exists:
                    destination_path.unlink()
                destination_path.symlink_to(source_path)
        else:
            # Remove the old entry first so that downloading never
            # writes through an existing symlink into its target
            if already_exists:
                destination_path.unlink()
            with open(destination, "wb") as dest_file:
                self.fs.download(self.file_name, dest_file)

        self.url = destination
        # Keep the derived state in sync with the new location
        self.file_name = self._get_file_name()
        self._fs = None
        return self.url

    def to_dict(self):
        """Serialize the dataclass fields into a dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, dictionary: dict) -> File:
        """Recreate a file from the output of ``to_dict()``.

        The input dictionary is left untouched (a deep copy is used).
        """
        dictionary = deepcopy(dictionary)
        # The constructor expects the file type inside the metadata
        file_type = dictionary.pop("type")
        dictionary["metadata"]["file_type"] = file_type
        file = cls(**dictionary)
        return file

src/dcqc/mixins.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Any
3+
4+
SerializedObject = dict[str, Any]


class SerializableMixin(ABC):
    """Base class for objects that can be serialized to dictionaries."""

    @classmethod
    def from_dict_prepare(cls, dictionary: SerializedObject) -> SerializedObject:
        """Validate and prepare dictionary for deserialization.

        Args:
            dictionary: Serialized object, including a ``type`` entry
                that must be equal to the name of this class.

        Raises:
            KeyError: If the dictionary has no ``type`` entry.
            ValueError: If the ``type`` entry doesn't match the class name.

        Returns:
            A shallow copy of the dictionary without the ``type`` entry.
        """
        # Operate on a copy so that the caller's dictionary stays
        # intact, including when the validation below fails
        prepared = dict(dictionary)
        type_ = prepared.pop("type")
        if type_ != cls.__name__:
            message = f"Type ({type_}) does not match the class ({cls.__name__})."
            raise ValueError(message)
        return prepared

    @abstractmethod
    def to_dict(self) -> SerializedObject:
        """Serialize the object into a dictionary."""

    # TODO: Uncomment this once the functions are ready
    # @abstractmethod
    # def from_dict(self):
    #     """"""

src/dcqc/target.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from __future__ import annotations
2+
3+
from copy import deepcopy
4+
from dataclasses import asdict, dataclass
5+
6+
from dcqc.file import File
7+
from dcqc.mixins import SerializableMixin
8+
9+
10+
# TODO: Eventually, there might be target-specific metadata
11+
# TODO: Now that Target is much simpler, it might make sense
12+
# to rename the class to FileSet since it currently
13+
# really is just a wrapper for a group of files
14+
# TODO: Maybe the Composite pattern would work here?
15+
@dataclass
class Target(SerializableMixin):
    """Group of one or more files.

    Wrapping files in a target allows single-file and
    multi-file tests to be supported in the same way.

    Args:
        *files (File): The file objects making up the target.
    """

    type: str
    files: list[File]

    def __init__(self, *files: File):
        self.files = [*files]
        # The class name is stored so that it ends up in the serialized form
        self.type = self.__class__.__name__

    def to_dict(self):
        """Serialize the target (and its files) into a dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, dictionary: dict) -> Target:
        """Recreate a target from the output of ``to_dict()``."""
        # A deep copy keeps the caller's dictionary out of reach
        prepared = cls.from_dict_prepare(deepcopy(dictionary))
        return cls(*map(File.from_dict, prepared["files"]))

src/dcqc/utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from fs import open_fs
2+
from fs.base import FS
3+
4+
5+
def open_parent_fs(url: str) -> tuple[FS, str]:
    """Open the filesystem of the parent location of a path or URL.

    Args:
        url: Path or URL of a file. The ``osfs://`` scheme is
            used if the value doesn't include a scheme.

    Returns:
        The filesystem opened with ``open_fs`` on the parent location
        and the base name of the file (i.e., the part after the last "/").
    """
    # Split off prefix to avoid issues with `rpartition("/")`
    scheme, separator, path = url.rpartition("://")
    prefix = scheme + separator if separator else "osfs://"

    # parent_path is "" if there is no "/" in the path
    parent_path, slash, base_name = path.rpartition("/")
    if slash and not parent_path:
        # The only "/" is the leading one (e.g., "/file.txt"): keep it,
        # or else the path is treated like a bare file name ("file.txt")
        parent_path = "/"

    fs = open_fs(prefix + parent_path)
    return fs, base_name

tests/conftest.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99

1010
from datetime import datetime
1111
from getpass import getuser
12-
from pathlib import Path, PurePath, PurePosixPath
12+
from pathlib import Path
1313
from uuid import uuid4
1414

1515
import pytest
1616

17+
from dcqc.file import File
18+
1719
CNFPATH = Path(__file__).resolve()
1820
TESTDIR = CNFPATH.parent
1921
DATADIR = TESTDIR / "data"
@@ -30,12 +32,36 @@ def pytest_configure():
3032

3133
@pytest.fixture
3234
def get_data():
33-
def _get_data(filename: str, as_posix: bool = False) -> PurePath:
35+
def _get_data(filename: str) -> Path:
3436
path = DATADIR / filename
3537
if not path.exists():
3638
raise ValueError(f"Path ({path}) does not exist.")
37-
if as_posix:
38-
path = PurePosixPath(*path.parts) # type: ignore
3939
return path
4040

4141
yield _get_data
42+
43+
44+
@pytest.fixture
def test_files(get_data):
    """Provide a dictionary of ``File`` objects keyed by scenario name."""
    txt_path = get_data("test.txt").as_posix()
    tiff_path = get_data("circuit.tif").as_posix()

    # Shared by the "good" and "synapse" entries
    good_metadata = {
        "file_type": "txt",
        "md5_checksum": "14758f1afd44c09b7992073ccf00b43d",
    }

    yield {
        "good": File(txt_path, good_metadata),
        # The text file paired with a different file type
        # and a placeholder checksum string
        "bad": File(
            txt_path,
            {
                "file_type": "tiff",
                "md5_checksum": "definitelynottherightmd5checksum",
            },
        ),
        "tiff": File(
            tiff_path,
            {
                "file_type": "tiff",
                "md5_checksum": "c7b08f6decb5e7572efbe6074926a843",
            },
        ),
        "synapse": File("syn://syn50555279", good_metadata),
    }

tests/data/circuit.tif

75 KB
Binary file not shown.

0 commit comments

Comments
 (0)