Skip to content

Commit 7bc80a7

Browse files
authored
Fix download logic (#230)
* 👽 Remove workaround now that upstream change has been merged
* ✨ Check dataset status before deciding what to do with it
* ✨ Add simple function to load in chunks using native Anemoi functionality
* ✨ Add an initialise function to wrap Anemoi Init
* ♻️ Replace download Create with download in parts
* ⚰️ Remove load_in_parts and associated tracking functionality as this is handled directly by Anemoi
* ⚰️ Remove init command and CLI
* ⚰️ Remove load command and CLI
* 🚚 Alphabetise DataDownloader methods
* 🚸 Also check whether statistics are ready before finalising
* 🐛 Remove invalid datasets whether or not they appear to be complete
* ⚰️ Removed tests for removed code
* ⚗️ Test whether chunk-checking is sufficient to cause downloads to resume
* 🔧 Default to statistics=False when inspecting
* 🥅 Exit gracefully when inspect fails during download
* 🔧 Do not set progress when detailed is set, otherwise the same information is printed twice
* 🐛 If only the statistics are missing we need to finalise
* 🚨 Perform explicit type checking to avoid linting errors
* 🐛 Finalise when statistics are *not* ready
* 🐛 Ensure that typer.Exit is raised
* 🥅 Raise exception if status cannot be retrieved, rather than using default values
* ⚰️ Removed the unused finalise CLI command
1 parent 6eefed7 commit 7bc80a7

9 files changed

Lines changed: 138 additions & 991 deletions

File tree

icenet_mp/cli/datasets.py

Lines changed: 5 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -32,93 +32,18 @@ def create(
3232

3333
@datasets_cli.command("inspect")
3434
@hydra_adaptor
35-
def inspect(config: DictConfig) -> None:
36-
"""Inspect all datasets."""
37-
factory = DataDownloaderFactory(config)
38-
for downloader in factory.downloaders:
39-
logger.info("Working on %s.", downloader.name)
40-
downloader.inspect()
41-
42-
43-
@datasets_cli.command("init")
44-
@hydra_adaptor
45-
def init(
35+
def inspect(
4636
config: DictConfig,
4737
*,
48-
overwrite: Annotated[
49-
bool, typer.Option(help="Specify whether to overwrite existing datasets")
38+
statistics: Annotated[
39+
bool, typer.Option(help="Specify whether to show dataset statistics")
5040
] = False,
5141
) -> None:
52-
"""Create all datasets."""
53-
factory = DataDownloaderFactory(config)
54-
for downloader in factory.downloaders:
55-
logger.info("Working on %s.", downloader.name)
56-
downloader.init(overwrite=overwrite)
57-
58-
59-
@datasets_cli.command("load")
60-
@hydra_adaptor
61-
def load(
62-
config: DictConfig,
63-
parts: Annotated[str, typer.Option(help="The part to process, specified as 'i/n'")],
64-
) -> None:
65-
"""Load dataset in parts."""
66-
factory = DataDownloaderFactory(config)
67-
for downloader in factory.downloaders:
68-
logger.info("Working on %s.", downloader.name)
69-
downloader.load(parts=parts)
70-
71-
72-
@datasets_cli.command("load_in_parts")
73-
@hydra_adaptor
74-
def load_in_parts(
75-
config: DictConfig,
76-
*,
77-
continue_on_error: Annotated[
78-
bool, typer.Option(help="Continue to next part on error")
79-
] = True,
80-
force_reset: Annotated[
81-
bool,
82-
typer.Option(
83-
help="Clear existing progress part_tracker file and start from part 1"
84-
),
85-
] = False,
86-
dataset: Annotated[
87-
str | None, typer.Option(help="Run only a single dataset by name")
88-
] = None,
89-
total_parts: Annotated[
90-
int, typer.Option(help="Override default total parts (10)")
91-
] = 10,
92-
overwrite: Annotated[
93-
bool,
94-
typer.Option(help="Delete the dataset directory before loading"),
95-
] = False,
96-
) -> None:
97-
"""Load all parts for all datasets in parts, recording progress so runs can be resumed."""
98-
factory = DataDownloaderFactory(config)
99-
for downloader in factory.downloaders:
100-
if dataset and downloader.name != dataset:
101-
logger.info("Not loading %s.", downloader.name)
102-
continue
103-
logger.info("Working on %s.", downloader.name)
104-
downloader.load_in_parts(
105-
continue_on_error=continue_on_error,
106-
force_reset=force_reset,
107-
total_parts=total_parts,
108-
overwrite=overwrite,
109-
)
110-
111-
112-
@datasets_cli.command("finalise")
113-
@hydra_adaptor
114-
def finalise(
115-
config: DictConfig,
116-
) -> None:
117-
"""Finalise loaded dataset."""
42+
"""Inspect all datasets."""
11843
factory = DataDownloaderFactory(config)
11944
for downloader in factory.downloaders:
12045
logger.info("Working on %s.", downloader.name)
121-
downloader.finalise()
46+
downloader.inspect(statistics=statistics)
12247

12348

12449
if __name__ == "__main__":

0 commit comments

Comments
 (0)