Skip to content

Commit e14fb1c

Browse files
authored
Fix/wandb checkpoints (#229)
* fix: save output as files not artifacts * ci: add mypy check of examples Co-authored-by: Albert Sawczyn <[email protected]>
1 parent ed24367 commit e14fb1c

6 files changed: +62 additions, −47 deletions

embeddings/pipeline/lightning_pipeline.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,16 @@ def run(self, run_name: Optional[str] = None) -> EvaluationResult:
4848
self._save_artifacts()
4949
model_result = self.model.execute(data=self.datamodule, run_name=run_name)
5050
result = self.evaluator.evaluate(model_result)
51-
self._finish_logging(run_name)
51+
self._finish_logging()
5252
return result
5353

5454
def _save_artifacts(self) -> None:
5555
srsly.write_json(self.output_path.joinpath("packages.json"), get_installed_packages())
5656

57-
def _finish_logging(self, run_name: Optional[str] = None) -> None:
57+
def _finish_logging(self) -> None:
5858
if self.logging_config.use_wandb():
5959
logger = WandbWrapper()
60-
logger.log_output(self.output_path, run_name)
60+
logger.log_output(
61+
self.output_path, ignore={"wandb", "csv", "tensorboard", "checkpoints"}
62+
)
6163
logger.finish_logging()

embeddings/pipeline/pipelines_metadata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ class FlairClassificationEvaluationPipelineMetadata(FlairEvaluationPipelineMetad
6363
class LightningPipelineMetadata(EmbeddingPipelineBaseMetadata):
6464
embedding_name_or_path: T_path
6565
dataset_name_or_path: T_path
66-
input_column_name: Union[str, Sequence[str]]
6766
target_column_name: str
6867
config: LightningConfig
6968
devices: Optional[Union[List[int], str, int]]
@@ -74,10 +73,11 @@ class LightningPipelineMetadata(EmbeddingPipelineBaseMetadata):
7473

7574

7675
class LightningClassificationPipelineMetadata(LightningPipelineMetadata):
77-
pass
76+
input_column_name: Union[str, Sequence[str]]
7877

7978

8079
class LightningSequenceLabelingPipelineMetadata(LightningPipelineMetadata):
80+
input_column_name: str
8181
evaluation_mode: EvaluationMode
8282
tagging_scheme: Optional[TaggingScheme]
8383

embeddings/utils/loggers.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
from dataclasses import dataclass, field
55
from pathlib import Path
6-
from typing import Any, Dict, List, Optional, Union
6+
from typing import Any, Dict, Iterable, List, Optional, Union
77

88
import wandb
99
from pytorch_lightning import loggers as pl_loggers
@@ -129,12 +129,14 @@ def finish_logging(self) -> None:
129129

130130

131131
class WandbWrapper(ExperimentLogger):
132-
def log_output(self, output_path: T_path, run_name: Optional[str] = None) -> None:
133-
wandb.log_artifact(
134-
str(output_path),
135-
name=run_name,
136-
type="output",
137-
)
132+
def log_output(
133+
self,
134+
output_path: T_path,
135+
ignore: Optional[Iterable[str]] = None,
136+
) -> None:
137+
for entry in os.scandir(output_path):
138+
if not ignore or entry.name not in ignore:
139+
wandb.save(entry.path, output_path)
138140

139141
def finish_logging(self) -> None:
140142
wandb.finish()

examples/hps_lightning_document_classification.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@
22

33
import typer
44

5+
from embeddings.config.lighting_config_space import LightingTextClassificationConfigSpace
56
from embeddings.defaults import RESULTS_PATH
6-
from embeddings.hyperparameter_search.lighting_configspace import (
7-
LightingTextClassificationConfigSpace,
8-
)
97
from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
108
from embeddings.pipeline.lightning_hps_pipeline import OptimizedLightingClassificationPipeline
9+
from embeddings.utils.loggers import LightningLoggingConfig
1110
from embeddings.utils.utils import build_output_path
1211

1312
app = typer.Typer()
@@ -32,20 +31,11 @@ def run(
3231
wandb: bool = typer.Option(False, help="Flag for using wandb."),
3332
tensorboard: bool = typer.Option(False, help="Flag for using tensorboard."),
3433
csv: bool = typer.Option(False, help="Flag for using csv."),
35-
wandb_project: Optional[str] = typer.Option(None, help="Name of wandb project."),
34+
tracking_project_name: Optional[str] = typer.Option(None, help="Name of wandb project."),
3635
wandb_entity: Optional[str] = typer.Option(None, help="Name of entity project"),
3736
) -> None:
3837
if not run_name:
3938
run_name = embedding_name_or_path
40-
41-
logging_kwargs = {
42-
"use_tensorboard": tensorboard,
43-
"use_wandb": wandb,
44-
"use_csv": csv,
45-
"wandb_project": wandb_project,
46-
"wandb_entity": wandb_entity,
47-
}
48-
4939
output_path = build_output_path(root, embedding_name_or_path, dataset_name)
5040
config_space = LightingTextClassificationConfigSpace(
5141
embedding_name_or_path=embedding_name_or_path,
@@ -55,15 +45,30 @@ def run(
5545
dataset_name_or_path=dataset_name,
5646
input_column_name=input_column_name,
5747
target_column_name=target_column_name,
58-
logging_kwargs=logging_kwargs,
48+
logging_config=LightningLoggingConfig.from_flags(
49+
wandb=wandb,
50+
tensorboard=tensorboard,
51+
csv=csv,
52+
tracking_project_name=tracking_project_name,
53+
wandb_entity=wandb_entity,
54+
),
5955
n_trials=n_trials,
6056
).persisting(
6157
best_params_path=output_path.joinpath("best_params.yaml"),
6258
log_path=output_path.joinpath("hps_log.pickle"),
6359
)
6460
df, metadata = pipeline.run(run_name=f"search-{run_name}")
61+
del pipeline
6562

66-
pipeline = LightningClassificationPipeline(
67-
output_path=output_path, logging_kwargs=logging_kwargs, **metadata
63+
metadata["output_path"] = output_path
64+
retrain_pipeline = LightningClassificationPipeline(
65+
logging_config=LightningLoggingConfig.from_flags(
66+
wandb=wandb,
67+
tensorboard=tensorboard,
68+
csv=csv,
69+
tracking_project_name=tracking_project_name,
70+
wandb_entity=wandb_entity,
71+
),
72+
**metadata,
6873
)
69-
pipeline.run(run_name=f"best-params-retrain-{run_name}")
74+
retrain_pipeline.run(run_name=f"best-params-retrain-{run_name}")

examples/hps_lightning_sequence_labeling.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@
22

33
import typer
44

5+
from embeddings.config.lighting_config_space import LightingSequenceLabelingConfigSpace
56
from embeddings.defaults import RESULTS_PATH
6-
from embeddings.hyperparameter_search.lighting_configspace import (
7-
LightingSequenceLabelingConfigSpace,
8-
)
97
from embeddings.pipeline.lightning_hps_pipeline import OptimizedLightingSequenceLabelingPipeline
108
from embeddings.pipeline.lightning_sequence_labeling import LightningSequenceLabelingPipeline
9+
from embeddings.utils.loggers import LightningLoggingConfig
1110
from embeddings.utils.utils import build_output_path
1211

1312
app = typer.Typer()
@@ -32,20 +31,11 @@ def run(
3231
wandb: bool = typer.Option(False, help="Flag for using wandb."),
3332
tensorboard: bool = typer.Option(False, help="Flag for using tensorboard."),
3433
csv: bool = typer.Option(False, help="Flag for using csv."),
35-
wandb_project: Optional[str] = typer.Option(None, help="Name of wandb project."),
34+
tracking_project_name: Optional[str] = typer.Option(None, help="Name of wandb project."),
3635
wandb_entity: Optional[str] = typer.Option(None, help="Name of entity project"),
3736
) -> None:
3837
if not run_name:
3938
run_name = embedding_name_or_path
40-
41-
logging_kwargs = {
42-
"use_tensorboard": tensorboard,
43-
"use_wandb": wandb,
44-
"use_csv": csv,
45-
"wandb_project": wandb_project,
46-
"wandb_entity": wandb_entity,
47-
}
48-
4939
output_path = build_output_path(root, embedding_name_or_path, dataset_name)
5040
config_space = LightingSequenceLabelingConfigSpace(
5141
embedding_name_or_path=embedding_name_or_path,
@@ -55,15 +45,30 @@ def run(
5545
dataset_name_or_path=dataset_name,
5646
input_column_name=input_column_name,
5747
target_column_name=target_column_name,
58-
logging_kwargs=logging_kwargs,
48+
logging_config=LightningLoggingConfig.from_flags(
49+
wandb=wandb,
50+
tensorboard=tensorboard,
51+
csv=csv,
52+
tracking_project_name=tracking_project_name,
53+
wandb_entity=wandb_entity,
54+
),
5955
n_trials=n_trials,
6056
).persisting(
6157
best_params_path=output_path.joinpath("best_params.yaml"),
6258
log_path=output_path.joinpath("hps_log.pickle"),
6359
)
6460
df, metadata = pipeline.run(run_name=f"search-{run_name}")
61+
del pipeline
6562

66-
pipeline = LightningSequenceLabelingPipeline(
67-
output_path=output_path, logging_kwargs=logging_kwargs, **metadata
63+
metadata["output_path"] = output_path
64+
retrain_pipeline = LightningSequenceLabelingPipeline(
65+
logging_config=LightningLoggingConfig.from_flags(
66+
wandb=wandb,
67+
tensorboard=tensorboard,
68+
csv=csv,
69+
tracking_project_name=tracking_project_name,
70+
wandb_entity=wandb_entity,
71+
),
72+
**metadata,
6873
)
69-
pipeline.run(run_name=f"best-params-retrain-{run_name}")
74+
retrain_pipeline.run(run_name=f"best-params-retrain-{run_name}")

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ black_fix = "black ./"
6363
isort_fix = "isort . "
6464
pyflakes = "pyflakes embeddings"
6565
mypy = "mypy -p embeddings"
66+
mypy_examples = "mypy examples"
6667
coverage_test = "coverage run -m pytest"
6768
coverage_report = "coverage report -mi"
6869
test = ["coverage_test", "coverage_report"]
69-
check = ["black", "isort", "mypy", "pyflakes"]
70+
check = ["black", "isort", "pyflakes", "mypy", "mypy_examples"]
7071
fix = ["black_fix", "isort_fix"]
7172
all = ["check", "test"]
7273

0 commit comments

Comments (0)