From 0074ea6777f49d9f8b3ad75e1099d81187dcc6e9 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Sat, 25 Jan 2025 16:32:12 +0100 Subject: [PATCH 01/11] [main](chore) Remove unneeded test util function --- tests/test_data/utils.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 tests/test_data/utils.py diff --git a/tests/test_data/utils.py b/tests/test_data/utils.py deleted file mode 100644 index 996bd1c..0000000 --- a/tests/test_data/utils.py +++ /dev/null @@ -1,10 +0,0 @@ -import torch - - -def tensor_batch_iterable(n_batches: int, batch_size: int, batch_shape: tuple[int, ...], n_tensors: int): - """ - Testing util for defining batched iterable for Tensors returning given - number of batches with given batch size all with ones tensors - """ - for _ in range(n_batches): - yield tuple([torch.ones((batch_size,) + batch_shape) for _ in range(n_tensors)]) From 71c87e45cec83195c562acd83da5bccf6102eced Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Mon, 27 Jan 2025 19:33:20 +0100 Subject: [PATCH 02/11] [experiment](chore) Remove the unused defaults --- pydentification/experiment/defaults/report.py | 18 ------------------ pydentification/experiment/defaults/train.py | 12 ------------ pydentification/experiment/dumper/__init__.py | 0 3 files changed, 30 deletions(-) delete mode 100644 pydentification/experiment/defaults/report.py delete mode 100644 pydentification/experiment/defaults/train.py create mode 100644 pydentification/experiment/dumper/__init__.py diff --git a/pydentification/experiment/defaults/report.py b/pydentification/experiment/defaults/report.py deleted file mode 100644 index 840e006..0000000 --- a/pydentification/experiment/defaults/report.py +++ /dev/null @@ -1,18 +0,0 @@ -import lightning.pytorch as pl -import torch - -from pydentification.experiment.reporters import report_metrics, report_prediction_plot, report_trainable_parameters -from pydentification.metrics import regression_metrics - - -def report_fn(model: 
pl.LightningModule, trainer: pl.Trainer, dm: pl.LightningDataModule): - """Logs the experiment results to W&B""" - y_hat = trainer.predict(model, datamodule=dm) - y_pred = torch.cat(y_hat).numpy() - y_true = torch.cat([y for _, y in dm.test_dataloader()]).numpy() - - metrics = regression_metrics(y_pred=y_pred.flatten(), y_true=y_true.flatten()) # type: ignore - - report_metrics(metrics, prefix="test") # type: ignore - report_trainable_parameters(model, prefix="config") - report_prediction_plot(predictions=y_pred, targets=y_true, prefix="test") diff --git a/pydentification/experiment/defaults/train.py b/pydentification/experiment/defaults/train.py deleted file mode 100644 index 8f37fc6..0000000 --- a/pydentification/experiment/defaults/train.py +++ /dev/null @@ -1,12 +0,0 @@ -import lightning.pytorch as pl - - -def train_fn( - model: pl.LightningModule, trainer: pl.Trainer, dm: pl.LightningDataModule, checkpoint_path: str | None = None -) -> tuple[pl.LightningModule, pl.Trainer]: - """ - Runs training using pl.Trainer and pl.LightningModule with given LightningDataModule, returns both model and trainer - Can be restarted if checkpoint_path is provided. 
- """ - trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path) - return model, trainer diff --git a/pydentification/experiment/dumper/__init__.py b/pydentification/experiment/dumper/__init__.py new file mode 100644 index 0000000..e69de29 From 2f79a5b85ad24776360e43351df172f3e9500a67 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Tue, 28 Jan 2025 19:22:33 +0100 Subject: [PATCH 03/11] [experiment](chore) Add code for saving only torch.nn.Module --- pydentification/experiment/defaults/save.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pydentification/experiment/defaults/save.py b/pydentification/experiment/defaults/save.py index bf95c72..bd57201 100644 --- a/pydentification/experiment/defaults/save.py +++ b/pydentification/experiment/defaults/save.py @@ -1,11 +1,14 @@ import os +import lightning.pytorch as pl import torch import wandb -def save_fn(name: str, model: torch.nn.Module): +def save_torch_module(name: str, model: pl.LightningModule): + """Saves the torch model from given LightningModule to W&B""" path = f"models/{name}/trained-model.pt" os.makedirs(os.path.dirname(path), exist_ok=True) - torch.save(model, path) + + torch.save(model.module.state_dict(), path) # saves only torch wandb.save(path) From 045880fd76f91e1479d4c20cd6e9d1995cc5bced Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Wed, 29 Jan 2025 21:34:54 +0100 Subject: [PATCH 04/11] [experiment](feat) Add loading with safetensors --- pydentification/experiment/defaults/save.py | 14 +++++++++++--- requirements.txt | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pydentification/experiment/defaults/save.py b/pydentification/experiment/defaults/save.py index bd57201..852f637 100644 --- a/pydentification/experiment/defaults/save.py +++ b/pydentification/experiment/defaults/save.py @@ -1,14 +1,22 @@ import os +from typing import Literal import lightning.pytorch as pl import torch import wandb +from safetensors.torch import save_model -def 
save_torch_module(name: str, model: pl.LightningModule): +def save_torch_module(name: str, model: pl.LightningModule, method: Literal["pt", "safetensors"] = "safetensors"): """Saves the torch model from given LightningModule to W&B""" - path = f"models/{name}/trained-model.pt" + path = f"models/{name}/trained-model.{method}" os.makedirs(os.path.dirname(path), exist_ok=True) - torch.save(model.module.state_dict(), path) # saves only torch + if method == "safetensors": + save_model(model.module, path) + elif method == "pt": + torch.save(model.module.state_dict(), path) # saves only torch + else: + raise ValueError(f"Unknown method: {method}!") + wandb.save(path) diff --git a/requirements.txt b/requirements.txt index 8ab2ccb..7cba2f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ torch==2.4.0 plotly==5.24.1 wandb==0.18.3 h5py==3.12.1 +safetensors==0.5.2 From a955ba177d60d3e492e3436ca79345585483cb43 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Wed, 29 Jan 2025 21:47:38 +0100 Subject: [PATCH 05/11] [experiment](chore) Move saving to new package --- pydentification/experiment/{defaults => dumper}/save.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pydentification/experiment/{defaults => dumper}/save.py (100%) diff --git a/pydentification/experiment/defaults/save.py b/pydentification/experiment/dumper/save.py similarity index 100% rename from pydentification/experiment/defaults/save.py rename to pydentification/experiment/dumper/save.py From 1761c6199c8db6092eea82dc481d66e3a34abd1e Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Thu, 30 Jan 2025 14:11:02 +0100 Subject: [PATCH 06/11] [experiment](feat) Add code snapshot generator --- .../experiment/defaults/__init__.py | 0 pydentification/experiment/dumper/code.py | 60 +++++++++++++++++++ pydentification/measure/coverters/__init__.py | 0 3 files changed, 60 insertions(+) delete mode 100644 pydentification/experiment/defaults/__init__.py create mode 100644 
pydentification/experiment/dumper/code.py delete mode 100644 pydentification/measure/coverters/__init__.py diff --git a/pydentification/experiment/defaults/__init__.py b/pydentification/experiment/defaults/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pydentification/experiment/dumper/code.py b/pydentification/experiment/dumper/code.py new file mode 100644 index 0000000..5487d52 --- /dev/null +++ b/pydentification/experiment/dumper/code.py @@ -0,0 +1,60 @@ +import os +import shutil +import uuid +from pathlib import Path + +PYTHON_EXTENSIONS = frozenset({".py", ".json", ".txt", ".md", ".yaml", ".yml", ".toml", ".ini"}) +FORBIDDEN_PREFIX = frozenset({"venv", ".ipynb_checkpoints", "__pycache__", ".git", ".pytest_cache"}) + + +def load_gitignore() -> set[str]: + """Load .gitignore from default name and root directory as set""" + + def not_comment(line: str) -> bool: + return not (line.startswith("#") or line.isspace() or not line) + + gitignore = Path(".gitignore") + if not gitignore.exists(): + return set() + + with gitignore.open("r") as f: + return set(filter(not_comment, f.read().splitlines())) + + +def save_code_snapshot(name: str, source_dir: str | Path): + """Save only text-based files in a ZIP archive, excluding binary data files.""" + if isinstance(source_dir, str): + source_dir = Path(source_dir) + + source_dir = Path(source_dir).resolve() # ensure absolute path + snapshot_filename = f"source_code_{name}" + temp_dir = Path(f"temp_code_snapshot_{uuid.uuid4()}") # append random UUID to avoid conflicts + + forbidden = FORBIDDEN_PREFIX | load_gitignore() # union of forbidden prefixes and gitignore + + if temp_dir.exists(): + shutil.rmtree(temp_dir) + + temp_dir.mkdir(parents=True, exist_ok=True) + + for root, dirs, files in os.walk(source_dir): + root_path = Path(root) + if ( + str(temp_dir.absolute()) == root # prevent copying the temp directory + or temp_dir in root_path.parents + or any(part.startswith(prefix) for prefix in forbidden 
for part in root_path.parts) + ): + dirs.clear() # prevent descending into this directory + continue # skip to the next directory + + for file in files: + file_path = root_path / file + if file_path.suffix in PYTHON_EXTENSIONS: + relative_path = file_path.relative_to(source_dir) + dest_path = temp_dir / relative_path + + dest_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(file_path, dest_path) + + shutil.make_archive(snapshot_filename, format="zip", root_dir=temp_dir) # archive the directory + shutil.rmtree(temp_dir) diff --git a/pydentification/measure/coverters/__init__.py b/pydentification/measure/coverters/__init__.py deleted file mode 100644 index e69de29..0000000 From 8bbfb3c0c241e39cc1d2503b5d68e12ea17fa4be Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Sun, 2 Feb 2025 13:16:21 +0100 Subject: [PATCH 07/11] [experiment](chore) Simplify exclusion logic --- pydentification/experiment/dumper/code.py | 27 ++++++++++++++++------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/pydentification/experiment/dumper/code.py b/pydentification/experiment/dumper/code.py index 5487d52..d458ab4 100644 --- a/pydentification/experiment/dumper/code.py +++ b/pydentification/experiment/dumper/code.py @@ -4,10 +4,10 @@ from pathlib import Path PYTHON_EXTENSIONS = frozenset({".py", ".json", ".txt", ".md", ".yaml", ".yml", ".toml", ".ini"}) -FORBIDDEN_PREFIX = frozenset({"venv", ".ipynb_checkpoints", "__pycache__", ".git", ".pytest_cache"}) +DEFAULT_FORBIDDEN_PREFIX = frozenset({"venv", ".ipynb_checkpoints", "__pycache__", ".git", ".pytest_cache"}) -def load_gitignore() -> set[str]: +def _load_gitignore() -> set[str]: """Load .gitignore from default name and root directory as set""" def not_comment(line: str) -> bool: @@ -21,8 +21,22 @@ def not_comment(line: str) -> bool: return set(filter(not_comment, f.read().splitlines())) +def _skip_subdir(current: Path, archive_path: Path, forbidden_paths: frozenset[str]) -> bool: + # prevent copying the temp 
directory, where the archive with source code is built + if str(archive_path.absolute()) == current: + return True + # prevent copying anything nested inside the temp directory + elif archive_path in current.parents: + return True + # prevent copying the forbidden paths from defaults and .gitignore + elif any(part.startswith(prefix) for prefix in forbidden_paths for part in current.parts): + return True + return False + + def save_code_snapshot(name: str, source_dir: str | Path): """Save only text-based files in a ZIP archive, excluding binary data files.""" + if isinstance(source_dir, str): source_dir = Path(source_dir) @@ -30,7 +44,8 @@ def save_code_snapshot(name: str, source_dir: str | Path): snapshot_filename = f"source_code_{name}" temp_dir = Path(f"temp_code_snapshot_{uuid.uuid4()}") # append random UUID to avoid conflicts - forbidden = FORBIDDEN_PREFIX | load_gitignore() # union of forbidden prefixes and gitignore + gitignore = _load_gitignore() + forbidden = DEFAULT_FORBIDDEN_PREFIX | gitignore if temp_dir.exists(): shutil.rmtree(temp_dir) @@ -39,11 +54,7 @@ def save_code_snapshot(name: str, source_dir: str | Path): for root, dirs, files in os.walk(source_dir): root_path = Path(root) - if ( - str(temp_dir.absolute()) == root # prevent copying the temp directory - or temp_dir in root_path.parents - or any(part.startswith(prefix) for prefix in forbidden for part in root_path.parts) - ): + if _skip_subdir(root_path, temp_dir, forbidden): dirs.clear() # prevent descending into this directory continue # skip to the next directory From 4bb4003b12f3b6a9533178a685fc95cb03ee5789 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Sun, 2 Feb 2025 13:18:23 +0100 Subject: [PATCH 08/11] [experiment](chore) Rename save.py to models.py --- pydentification/experiment/dumper/{save.py => models.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pydentification/experiment/dumper/{save.py => models.py} (100%) diff --git 
a/pydentification/experiment/dumper/save.py b/pydentification/experiment/dumper/models.py similarity index 100% rename from pydentification/experiment/dumper/save.py rename to pydentification/experiment/dumper/models.py From 3df48dfc50b90c5fca6fa074b9bfd36a3b221155 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Sun, 2 Feb 2025 13:20:48 +0100 Subject: [PATCH 09/11] [experiment](chore) Extract saving torch model from lightning module --- pydentification/experiment/dumper/models.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pydentification/experiment/dumper/models.py b/pydentification/experiment/dumper/models.py index 852f637..f059efb 100644 --- a/pydentification/experiment/dumper/models.py +++ b/pydentification/experiment/dumper/models.py @@ -1,4 +1,4 @@ -import os +from pathlib import Path from typing import Literal import lightning.pytorch as pl @@ -7,16 +7,19 @@ from safetensors.torch import save_model -def save_torch_module(name: str, model: pl.LightningModule, method: Literal["pt", "safetensors"] = "safetensors"): - """Saves the torch model from given LightningModule to W&B""" - path = f"models/{name}/trained-model.{method}" - os.makedirs(os.path.dirname(path), exist_ok=True) - +def save_torch(path: Path, model: torch.nn.Module, method: Literal["pt", "safetensors"] = "safetensors"): if method == "safetensors": - save_model(model.module, path) + save_model(model, path) elif method == "pt": - torch.save(model.module.state_dict(), path) # saves only torch + torch.save(model.state_dict(), path) # saves only torch else: raise ValueError(f"Unknown method: {method}!") + +def save_fn(name: str, model: pl.LightningModule, method: Literal["pt", "safetensors"] = "safetensors"): + """Saves the torch model from given LightningModule to W&B""" + path = Path(f"models/{name}/trained-model.{method}") + path.parent.mkdir(parents=True, exist_ok=True) + + save_torch(path, model=model.module, method=method) # save only the model 
wandb.save(path) From ef617594a027ce939c3cfe8101a2648a13e26265 Mon Sep 17 00:00:00 2001 From: Krzysztof Zajac Date: Sun, 2 Feb 2025 13:31:52 +0100 Subject: [PATCH 10/11] [experiment](feat) Save hyperparameters with JSON --- pydentification/experiment/dumper/models.py | 30 +++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pydentification/experiment/dumper/models.py b/pydentification/experiment/dumper/models.py index f059efb..121a832 100644 --- a/pydentification/experiment/dumper/models.py +++ b/pydentification/experiment/dumper/models.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import Literal @@ -16,10 +17,29 @@ def save_torch(path: Path, model: torch.nn.Module, method: Literal["pt", "safete raise ValueError(f"Unknown method: {method}!") -def save_fn(name: str, model: pl.LightningModule, method: Literal["pt", "safetensors"] = "safetensors"): - """Saves the torch model from given LightningModule to W&B""" - path = Path(f"models/{name}/trained-model.{method}") - path.parent.mkdir(parents=True, exist_ok=True) +def save_json(path: Path, data: dict): + with path.open("w") as f: + json.dump(data, f) # type: ignore + + +def save_fn( + name: str, + model: pl.LightningModule, + method: Literal["pt", "safetensors"] = "safetensors", + save_hparams: bool = False, +): + """ + :param name: name of the parent directory with the model and settings + :param model: PyTorch model + :param method: method of saving the model, either "pt" or "safetensors" + :param save_hparams: whether to save hyperparameters in a JSON file + """ + path = Path(f"models/{name}") + path.mkdir(parents=True, exist_ok=True) + + save_torch(path / f"trained-model.{method}", model=model.module, method=method) # save only the model + + if save_hparams: + save_json((path / "hparams.json"), model.hparams or {}) - save_torch(path, model=model.module, method=method) # save only the model wandb.save(path) From 96fa8892c65b845e6fc3f6259db78542b544a68a Mon Sep 
17 00:00:00 2001 From: Krzysztof Zajac Date: Mon, 3 Feb 2025 20:17:51 +0100 Subject: [PATCH 11/11] [experiment](chore) Extend .gitignore to all venv prefixes --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4fcddaa..56bc5ff 100644 --- a/.gitignore +++ b/.gitignore @@ -124,6 +124,7 @@ celerybeat.pid .venv env/ venv/ +venv*/ ENV/ env.bak/ venv.bak/