Skip to content

Commit 689b2a2

Browse files
author
Bruno Grande
committed
Separate handling of File URL and local path
1 parent 63d0965 commit 689b2a2

File tree

4 files changed

+97
-57
lines changed

4 files changed

+97
-57
lines changed

setup.cfg

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,6 @@ extensions =
149149
pre_commit
150150

151151
[autoflake]
152-
check=true
153152
in-place=true
154153
remove-all-unused-imports=true
155154
remove-unused-variables=true

src/dcqc/file.py

Lines changed: 86 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from collections.abc import Collection, Mapping
66
from copy import deepcopy
77
from dataclasses import asdict, dataclass
8-
from pathlib import Path, PurePosixPath
8+
from pathlib import Path
9+
from tempfile import mkdtemp
910
from typing import Any, Optional
1011

1112
from fs.base import FS
@@ -58,9 +59,20 @@ def get_file_type(cls, file_type: str) -> FileType:
5859

5960
@dataclass
6061
class File(SerializableMixin):
62+
"""Construct a File object.
63+
64+
Args:
65+
url (str): URL indicating the location of the file.
66+
metadata (Mapping[str, Any]): File metadata.
67+
relative_to (Optional[Path]): Used to update any
68+
local URLs if they are relative to a directory
69+
other than the current working directory (default).
70+
"""
71+
6172
url: str
6273
metadata: dict[str, Any]
6374
type: str
75+
local_path: Optional[str]
6476

6577
LOCAL_REGEX = re.compile(r"((file|osfs)://)?/?[^:]+")
6678

@@ -69,40 +81,72 @@ def __init__(
6981
url: str,
7082
metadata: Mapping[str, Any],
7183
relative_to: Optional[Path] = None,
84+
local_path: Optional[str] = None,
7285
):
73-
relative_to = relative_to or Path.cwd()
74-
if self.is_local(url):
75-
scheme, separator, resource = url.rpartition("://")
76-
path = Path(resource)
77-
if not path.is_absolute():
78-
resource = os.path.relpath(relative_to / resource)
79-
url = "".join([scheme, separator, resource])
80-
self.url = str(url)
86+
self.url = self._relativize_url(url, relative_to)
8187
self.metadata = dict(metadata)
8288
self.type = self._pop_file_type()
83-
self.file_name = self._get_file_name()
89+
8490
self._fs: Optional[FS]
8591
self._fs = None
92+
self._fs_path: Optional[str]
93+
self._fs_path = None
94+
self._name: Optional[str]
95+
self._name = None
8696

87-
@property
88-
def fs(self) -> FS:
89-
if self._fs is None:
90-
fname = self.file_name
91-
fs, bname = open_parent_fs(self.url)
92-
if bname != fname:
93-
message = f"Inconsistent file names: FS ({bname}) and File ({fname})."
94-
raise ValueError(message)
95-
self._fs = fs
96-
return self._fs
97+
self.local_path = local_path or self._init_local_path()
98+
99+
def _relativize_url(self, url: str, relative_to: Optional[Path]) -> str:
100+
"""Update local URLs if relative to a directory other than CWD."""
101+
relative_to = relative_to or Path.cwd()
102+
if self.is_url_local(url):
103+
scheme, separator, resource = url.rpartition("://")
104+
path = Path(resource)
105+
if not path.is_absolute():
106+
resource = os.path.relpath(relative_to / resource)
107+
url = f"{scheme}{separator}{resource}"
108+
return url
97109

98110
def _pop_file_type(self) -> str:
99111
file_type = self.get_metadata("file_type")
100112
del self.metadata["file_type"]
101113
return file_type
102114

103-
def _get_file_name(self) -> str:
104-
path = PurePosixPath(self.url)
105-
return path.name
115+
def _init_local_path(self) -> Optional[str]:
116+
if self.is_url_local():
117+
local_path = self.fs.getsyspath(self.fs_path)
118+
else:
119+
local_path = None
120+
return local_path
121+
122+
def _initialize_fs(self) -> tuple[FS, str]:
123+
"""Retrieve and store parent FS and basename."""
124+
fs, fs_path = open_parent_fs(self.url)
125+
self._fs_path = fs_path
126+
self._fs = fs
127+
return fs, fs_path
128+
129+
@property
130+
def fs(self) -> FS:
131+
fs = self._fs
132+
if fs is None:
133+
fs, _ = self._initialize_fs()
134+
return fs
135+
136+
@property
137+
def fs_path(self) -> str:
138+
fs_path = self._fs_path
139+
if fs_path is None:
140+
_, fs_path = self._initialize_fs()
141+
return fs_path
142+
143+
@property
144+
def name(self) -> str:
145+
name = self._name
146+
if name is None:
147+
info = self.fs.getinfo(self.fs_path)
148+
name = info.name
149+
return name
106150

107151
def get_file_type(self) -> FileType:
108152
return FileType.get_file_type(self.type)
@@ -115,22 +159,20 @@ def get_metadata(self, key: str) -> Any:
115159
raise ValueError(message)
116160
return self.metadata[key]
117161

118-
def is_local(self, url: Optional[str] = None) -> bool:
162+
def is_url_local(self, url: Optional[str] = None) -> bool:
119163
url = url or self.url
120164
return self.LOCAL_REGEX.fullmatch(url) is not None
121165

122-
# TODO: Create a new instance attribute `self._local_path` for keeping
123-
# track of the local path instead of overwriting `self.url`
124-
def get_local_path(self) -> str:
125-
if not self.is_local():
126-
message = f"File ({self.url}) should first be downloaded using stage()."
127-
raise FileNotFoundError(message)
128-
local_path = self.url
129-
if local_path.startswith(("osfs://", "file://")):
130-
local_path = self.fs.getsyspath(self.file_name)
131-
return local_path
166+
def is_file_local(self, url: Optional[str] = None) -> bool:
167+
return self.local_path is not None
168+
169+
def get_local_path(self) -> Path:
170+
if self.local_path is None:
171+
message = "Local path is unavailable. Use stage() to create a local copy."
172+
raise ValueError(message)
173+
return Path(self.local_path)
132174

133-
def stage(self, destination: Optional[str] = None, overwrite: bool = False) -> str:
175+
def stage(self, destination: Optional[str] = None, overwrite: bool = False) -> Path:
134176
"""Download remote files and copy local files.
135177
136178
A destination is required for remote files.
@@ -143,28 +185,24 @@ def stage(self, destination: Optional[str] = None, overwrite: bool = False) -> s
143185
at the target destination. Defaults to False.
144186
145187
Raises:
146-
ValueError: If a destination is not specified
147-
when staging a remote file.
148188
ValueError: If the parent directory of the
149189
destination does not exist.
150190
FileExistsError: If the destination file already
151191
exists and ``overwrite`` was not enabled.
152192
153193
Returns:
154-
str: The updated URL (i.e., location) of the file.
194+
Path: The path of the local copy.
155195
"""
156196
if not destination:
157-
if self.is_local():
158-
return self.url
197+
if self.local_path is not None:
198+
return self.get_local_path()
159199
else:
160-
message = f"Destination is required for remote files ({self.url})."
161-
raise ValueError(message)
200+
destination = mkdtemp()
162201

163202
# By this point, destination is defined (not None)
164-
file_name = self._get_file_name()
165203
destination_path = Path(destination)
166204
if destination_path.is_dir():
167-
destination_path = destination_path / file_name
205+
destination_path = destination_path / self.name
168206
destination = destination_path.as_posix()
169207

170208
if not destination_path.parent.exists():
@@ -175,15 +213,14 @@ def stage(self, destination: Optional[str] = None, overwrite: bool = False) -> s
175213
message = f"Destination ({destination}) already exists. Enable overwrite."
176214
raise FileExistsError(message)
177215

178-
if self.is_local():
216+
if self.is_url_local():
179217
local_path = self.get_local_path()
180218
destination_path.symlink_to(local_path)
181219
else:
182-
with open(destination, "wb") as dest_file:
183-
self.fs.download(self.file_name, dest_file)
220+
with destination_path.open("wb") as dest_file:
221+
self.fs.download(self.fs_path, dest_file)
184222

185-
self.url = destination
186-
return self.url
223+
return destination_path
187224

188225
def to_dict(self) -> SerializedObject:
189226
return asdict(self)

src/dcqc/parsers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class CsvParser:
1313

1414
def __init__(self, path: Path):
1515
self.path = path
16+
self.csv_directory = self.path.parent
1617

1718
def list_rows(self) -> Iterator[tuple[int, dict]]:
1819
with self.path.open(newline="") as file:
@@ -21,14 +22,17 @@ def list_rows(self) -> Iterator[tuple[int, dict]]:
2122
yield index, row
2223

2324
def _row_to_file(self, row: dict[str, str]) -> File:
24-
csv_directory = self.path.parent
2525
url = row.pop("url")
26-
file = File(url, row, relative_to=csv_directory)
26+
file = File(url, row, relative_to=self.csv_directory)
2727
return file
2828

2929
def create_files(self) -> Iterator[File]:
30-
for _, row in self.list_rows():
30+
for index, row in self.list_rows():
3131
file = self._row_to_file(row)
32+
if not file.is_file_local():
33+
destination = self.csv_directory / "staged_files" / f"index_{index}"
34+
destination.mkdir(parents=True, exist_ok=True)
35+
file.stage(destination.as_posix(), overwrite=True)
3236
yield file
3337

3438
def create_targets(self) -> Iterator[Target]:

src/dcqc/tests/tests.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def compute_status(self) -> TestStatus:
3737
def _compute_md5_checksum(self, file: File) -> str:
3838
local_path = file.get_local_path()
3939
hash_md5 = hashlib.md5()
40-
with open(local_path, "rb") as f:
40+
with local_path.open("rb") as f:
4141
for chunk in iter(lambda: f.read(4096), b""):
4242
hash_md5.update(chunk)
4343
actual_md5 = hash_md5.hexdigest()
@@ -49,7 +49,7 @@ class LibTiffInfoTest(ExternalTestMixin, TestABC):
4949

5050
def generate_process(self) -> Process:
5151
file = self._get_single_target_file()
52-
path = file.get_local_path()
52+
path = file.get_local_path().as_posix()
5353
command_args = ["tiffinfo", path]
5454
process = Process(
5555
container="autamus/libtiff:4.4.0",
@@ -63,7 +63,7 @@ class BioFormatsInfoTest(ExternalTestMixin, TestABC):
6363

6464
def generate_process(self) -> Process:
6565
file = self._get_single_target_file()
66-
path = file.get_local_path()
66+
path = file.get_local_path().as_posix()
6767
command_args = [
6868
'export PATH="$PATH:/opt/bftools"',
6969
";",
@@ -85,7 +85,7 @@ class OmeXmlSchemaTest(ExternalTestMixin, TestABC):
8585

8686
def generate_process(self) -> Process:
8787
file = self._get_single_target_file()
88-
path = file.get_local_path()
88+
path = file.get_local_path().as_posix()
8989
command_args = [
9090
'export PATH="$PATH:/opt/bftools"',
9191
";",

0 commit comments

Comments
 (0)