From 15914a1c7bd84d9de94f3fccad99022d3a26a698 Mon Sep 17 00:00:00 2001
From: Ankur-singh
Date: Thu, 16 Jan 2025 22:44:14 +0000
Subject: [PATCH 1/5] added save_config function and implemented DiskLogger.log_config

---
 torchtune/training/metric_logging.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py
index 42aa1f9d72..2c179a26e2 100644
--- a/torchtune/training/metric_logging.py
+++ b/torchtune/training/metric_logging.py
@@ -22,6 +22,22 @@
 log = get_logger("DEBUG")
 
+def save_config(config):
+    try:
+        output_config_fname = Path(
+            os.path.join(
+                config.output_dir,
+                "torchtune_config.yaml",
+            )
+        )
+        OmegaConf.save(config, output_config_fname)
+        log.info(f"Logging {output_config_fname}")
+    except Exception as e:
+        log.warning(
+            f"Error saving {output_config_fname} to disk.\nError: \n{e}."
+        )
+
+
 class MetricLoggerInterface(Protocol):
     """Abstract metric logger."""
 
@@ -99,6 +115,9 @@ def log(self, name: str, data: Scalar, step: int) -> None:
         self._file.write(f"Step {step} | {name}:{data}\n")
         self._file.flush()
 
+    def log_config(self, config: DictConfig) -> None:
+        save_config(config)
+
     def log_dict(self, payload: Mapping[str, Scalar], step: int) -> None:
         self._file.write(f"Step {step} | ")
         for name, data in payload.items():

From 1885cf3773ec0530a8215eb52e11424f352e7f70 Mon Sep 17 00:00:00 2001
From: Ankur-singh
Date: Thu, 16 Jan 2025 22:57:39 +0000
Subject: [PATCH 2/5] updated log message

---
 torchtune/training/metric_logging.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py
index 2c179a26e2..071cc68d82 100644
--- a/torchtune/training/metric_logging.py
+++ b/torchtune/training/metric_logging.py
@@ -22,6 +22,7 @@
 log = get_logger("DEBUG")
 
+
 def save_config(config):
     try:
         output_config_fname = Path(
@@ -30,13 +31,10 @@ def save_config(config):
                 "torchtune_config.yaml",
             )
         )
+        log.info(f"Writing resolved config to {output_config_fname}")
         OmegaConf.save(config, output_config_fname)
-        log.info(f"Logging {output_config_fname}")
     except Exception as e:
-        log.warning(
-            f"Error saving {output_config_fname} to disk.\nError: \n{e}."
-        )
-
+        log.warning(f"Error saving {output_config_fname} to disk.\nError: \n{e}.")
 
 class MetricLoggerInterface(Protocol):

From 1e9005218810e060e0f0e457e97c716219dc07c2 Mon Sep 17 00:00:00 2001
From: Ankur-singh
Date: Sat, 18 Jan 2025 00:50:06 +0000
Subject: [PATCH 3/5] refactor log_config implementation for each logger class

---
 torchtune/training/metric_logging.py | 86 +++++++++++++++------------
 1 file changed, 47 insertions(+), 39 deletions(-)

diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py
index 071cc68d82..c22bd06b00 100644
--- a/torchtune/training/metric_logging.py
+++ b/torchtune/training/metric_logging.py
@@ -23,18 +23,30 @@
 log = get_logger("DEBUG")
 
 
-def save_config(config):
+def save_config(config: DictConfig) -> Path:
+    """
+    Save the OmegaConf configuration to a YAML file at `{config.output_dir}/torchtune_config.yaml`.
+
+    Args:
+        config (DictConfig): The OmegaConf config object to be saved. It must contain an `output_dir` attribute
+            specifying where the configuration file should be saved.
+
+    Returns:
+        Path: The path to the saved configuration file.
+
+    Note:
+        If the specified `output_dir` does not exist, it will be created.
+    """
     try:
-        output_config_fname = Path(
-            os.path.join(
-                config.output_dir,
-                "torchtune_config.yaml",
-            )
-        )
-        log.info(f"Writing resolved config to {output_config_fname}")
+        output_dir = Path(config.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        output_config_fname = output_dir / "torchtune_config.yaml"
+        log.info(f"Writing config to {output_config_fname}")
         OmegaConf.save(config, output_config_fname)
+        return output_config_fname
     except Exception as e:
-        log.warning(f"Error saving {output_config_fname} to disk.\nError: \n{e}.")
+        log.warning(f"Error saving config to {output_config_fname}.\nError: \n{e}.")
 
 
 class MetricLoggerInterface(Protocol):
@@ -56,7 +68,7 @@ def log(
         pass
 
     def log_config(self, config: DictConfig) -> None:
-        """Logs the config
+        """Logs the config as a file
 
         Args:
             config (DictConfig): config to log
@@ -114,7 +126,7 @@ def log(self, name: str, data: Scalar, step: int) -> None:
         self._file.flush()
 
     def log_config(self, config: DictConfig) -> None:
-        save_config(config)
+        _ = save_config(config)
 
     def log_dict(self, payload: Mapping[str, Scalar], step: int) -> None:
         self._file.write(f"Step {step} | ")
@@ -136,6 +148,9 @@ class StdoutLogger(MetricLoggerInterface):
     def log(self, name: str, data: Scalar, step: int) -> None:
         print(f"Step {step} | {name}:{data}")
 
+    def log_config(self, config: DictConfig) -> None:
+        _ = save_config(config)
+
     def log_dict(self, payload: Mapping[str, Scalar], step: int) -> None:
         print(f"Step {step} | ", end="")
         for name, data in payload.items():
@@ -200,6 +215,10 @@ def __init__(
         # Use dir if specified, otherwise use log_dir.
         self.log_dir = kwargs.pop("dir", log_dir)
 
+        # create log_dir if missing
+        if not os.path.exists(self.log_dir):
+            os.makedirs(self.log_dir)
+
         _, self.rank = get_world_size_and_rank()
 
         if self._wandb.run is None and self.rank == 0:
@@ -236,23 +255,17 @@ def log_config(self, config: DictConfig) -> None:
             self._wandb.config.update(
                 resolved, allow_val_change=self.config_allow_val_change
             )
-            try:
-                output_config_fname = Path(
-                    os.path.join(
-                        config.output_dir,
-                        "torchtune_config.yaml",
-                    )
-                )
-                OmegaConf.save(config, output_config_fname)
-                log.info(f"Logging {output_config_fname} to W&B under Files")
+            # Also try to save the config as a file
+            output_config_fname = save_config(config)
 
+            try:
+                log.info(f"Uploading {output_config_fname} to W&B under Files")
                 self._wandb.save(
                     output_config_fname, base_path=output_config_fname.parent
                 )
-
             except Exception as e:
                 log.warning(
-                    f"Error saving {output_config_fname} to W&B.\nError: \n{e}."
+                    f"Error uploading {output_config_fname} to W&B.\nError: \n{e}."
                     "Don't worry the config will be logged the W&B workspace"
                 )
@@ -322,6 +335,9 @@ def log(self, name: str, data: Scalar, step: int) -> None:
         if self._writer:
             self._writer.add_scalar(name, data, global_step=step, new_style=True)
 
+    def log_config(self, config: DictConfig) -> None:
+        _ = save_config(config)
+
     def log_dict(self, payload: Mapping[str, Scalar], step: int) -> None:
         for name, data in payload.items():
             self.log(name, data, step)
@@ -404,13 +420,15 @@ def __init__(
                 "Alternatively, use the ``StdoutLogger``, which can be specified by setting metric_logger_type='stdout'."
            ) from e
 
+        # Remove 'log_dir' from kwargs as it is not a valid argument for comet_ml.ExperimentConfig
+        del kwargs["log_dir"]
+
         _, self.rank = get_world_size_and_rank()
 
         # Declare it early so further methods don't crash in case of
         # Experiment Creation failure due to mis-named configuration for
         # example
         self.experiment = None
-
         if self.rank == 0:
             self.experiment = comet_ml.start(
                 api_key=api_key,
@@ -438,24 +456,14 @@ def log_config(self, config: DictConfig) -> None:
             self.experiment.log_parameters(resolved)
 
             # Also try to save the config as a file
+            output_config_fname = save_config(config)
             try:
-                self._log_config_as_file(config)
+                log.info(f"Uploading {output_config_fname} to Comet as an asset.")
+                self.experiment.log_asset(
+                    output_config_fname, file_name=output_config_fname.name
+                )
             except Exception as e:
-                log.warning(f"Error saving Config to disk.\nError: \n{e}.")
-                return
-
-    def _log_config_as_file(self, config: DictConfig):
-        output_config_fname = Path(
-            os.path.join(
-                config.checkpointer.checkpoint_dir,
-                "torchtune_config.yaml",
-            )
-        )
-        OmegaConf.save(config, output_config_fname)
-
-        self.experiment.log_asset(
-            output_config_fname, file_name="torchtune_config.yaml"
-        )
+                log.warning(f"Failed to upload config to Comet assets. Error: {e}")
 
     def close(self) -> None:
         if self.experiment is not None:

From 62185831dc262eb3e2686652e39a9a201f536378 Mon Sep 17 00:00:00 2001
From: Ankur-singh
Date: Sat, 18 Jan 2025 03:48:36 +0000
Subject: [PATCH 4/5] fix KeyError in CometLogger and unbound name in save_config warning

---
 torchtune/training/metric_logging.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py
index c22bd06b00..19d3a0ef0e 100644
--- a/torchtune/training/metric_logging.py
+++ b/torchtune/training/metric_logging.py
@@ -46,7 +46,7 @@ def save_config(config: DictConfig) -> Path:
         OmegaConf.save(config, output_config_fname)
         return output_config_fname
     except Exception as e:
-        log.warning(f"Error saving config to {output_config_fname}.\nError: \n{e}.")
+        log.warning(f"Error saving config.\nError: \n{e}.")
 
 
 class MetricLoggerInterface(Protocol):
@@ -421,7 +421,8 @@ def __init__(
         ) from e
 
         # Remove 'log_dir' from kwargs as it is not a valid argument for comet_ml.ExperimentConfig
-        del kwargs["log_dir"]
+        if "log_dir" in kwargs:
+            del kwargs["log_dir"]
 
         _, self.rank = get_world_size_and_rank()

From 2a04a0732ecbe05e422b368777de90cc855dc8b4 Mon Sep 17 00:00:00 2001
From: Ankur-singh
Date: Sun, 19 Jan 2025 11:14:25 -0800
Subject: [PATCH 5/5] remove redundant log messages in save_config and logger classes

---
 torchtune/training/metric_logging.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py
index 19d3a0ef0e..dde1619194 100644
--- a/torchtune/training/metric_logging.py
+++ b/torchtune/training/metric_logging.py
@@ -42,7 +42,6 @@ def save_config(config: DictConfig) -> Path:
         output_dir.mkdir(parents=True, exist_ok=True)
 
         output_config_fname = output_dir / "torchtune_config.yaml"
-        log.info(f"Writing config to {output_config_fname}")
         OmegaConf.save(config, output_config_fname)
         return output_config_fname
     except Exception as e:
@@ -259,7 +258,6 @@ def log_config(self, config: DictConfig) -> None:
             # Also try to save the config as a file
             output_config_fname = save_config(config)
             try:
-                log.info(f"Uploading {output_config_fname} to W&B under Files")
                 self._wandb.save(
                     output_config_fname, base_path=output_config_fname.parent
                 )
@@ -459,7 +457,6 @@ def log_config(self, config: DictConfig) -> None:
             # Also try to save the config as a file
             output_config_fname = save_config(config)
             try:
-                log.info(f"Uploading {output_config_fname} to Comet as an asset.")
                 self.experiment.log_asset(
                     output_config_fname, file_name=output_config_fname.name
                 )
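
A minimal usage sketch of the net behavior after this series. The paths
and config fields below are made up for illustration, and DiskLogger
stands in for any of the logger classes touched above; save_config only
requires that the config carry an `output_dir`.

    from omegaconf import OmegaConf

    from torchtune.training.metric_logging import DiskLogger

    # Hypothetical resolved recipe config; `output_dir` is the only field
    # save_config reads, and the directory is created if it is missing.
    cfg = OmegaConf.create({"output_dir": "/tmp/tt_run", "batch_size": 4})

    logger = DiskLogger(log_dir="/tmp/tt_run/logs")
    logger.log_config(cfg)  # writes /tmp/tt_run/torchtune_config.yaml
    logger.log("loss", 1.0, step=0)  # metric logging is unchanged
    logger.close()

Because save_config wraps the write in a try/except and only logs a
warning on failure, a recipe keeps running even when the config cannot
be written to disk.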