Skip to content

Commit cacfd9f

Browse files
author
The paxml Authors
committed
Add + improve Pax status logging.
PiperOrigin-RevId: 534090287
1 parent 75c674b commit cacfd9f

File tree

5 files changed

+48
-23
lines changed

5 files changed

+48
-23
lines changed

paxml/checkpoint_creators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,7 @@ def _create_checkpointer(
561561
tensorstore_use_ocdbt: bool = False,
562562
) -> checkpoints.TrainingCheckpointer:
563563
"""Creates a checkpoint manager."""
564+
logging.info('[PAX STATUS]: Creating checkpointer.')
564565
checkpoint_dir = _make_checkpoint_dir(job_log_dir)
565566
train_p = task_p.train
566567
max_to_keep = train_p.save_max_to_keep

paxml/executors.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
from praxis import base_layer
3939
from praxis import pax_fiddle
4040
from praxis import py_utils
41-
from praxis import pytypes
4241
import tensorflow.compat.v2 as tf
4342

4443
from paxml import checkpoints # mapped to internal
@@ -161,6 +160,7 @@ def _maybe_create_train_input(
161160
passed to checkpointer.get_model_states(). If set, the checkpointer will
162161
restore its states from checkpoint.
163162
"""
163+
logging.info('[PAX STATUS]: Instantiating train input pipeline.')
164164
if not task_p.train.enable_input_checkpointing:
165165
_maybe_update_latest_model_step(train_input_p, step, task_p)
166166
train_input = instantiate(train_input_p)
@@ -219,6 +219,7 @@ def setup(
219219
'No training input specs available, while enabling '
220220
'`task_p.train.enforce_input_specs` requires it.'
221221
)
222+
logging.info('[PAX STATUS]: Setting up partitioner')
222223
partitioner.setup(
223224
jax_task,
224225
root_prng_key,
@@ -291,6 +292,7 @@ def _create_decode_programs(self, decode_input_ps):
291292
return decode_programs
292293

293294
def start(self):
295+
logging.info('Starting executor.')
294296
is_vars_replicated = self._task.model.ici_mesh_shape is None
295297
_train_and_evaluate_common(
296298
task=self._task,
@@ -313,9 +315,11 @@ def start(self):
313315
)
314316

315317
# Shutdown the programs and run necessary cleanup.
318+
logging.info('[PAX STATUS]: Shutting down executor.')
316319
self._train_program.shutdown()
317320
for program in self._eval_programs:
318321
program.shutdown()
322+
logging.info('[PAX STATUS]: Executor shutdown complete.')
319323

320324

321325
def _get_partition_decode_once_fn(
@@ -454,7 +458,7 @@ def _train_and_evaluate_common(
454458
f' number {initial_global_step} mismatch.'
455459
)
456460

457-
logging.info('Training loop starting...')
461+
logging.info('[PAX STATUS]: Starting training loop.')
458462
with _DecodeSummaryWriters(
459463
job_log_dir, decode_input_names
460464
) as decode_summary_writers:
@@ -497,7 +501,7 @@ def _train_and_evaluate_common(
497501
gc.collect()
498502
gc.freeze()
499503
while True:
500-
logging.debug('step=`%d`: Beginning', step_i)
504+
logging.debug('[PAX STATUS]: Beginning step `%d`.', step_i)
501505
checkpointer.save_if_needed(
502506
step_i,
503507
partitioned_train_state,
@@ -538,13 +542,15 @@ def _train_and_evaluate_common(
538542
train_p.eval_interval_steps
539543
and step_i % train_p.eval_interval_steps == 0
540544
):
541-
logging.debug(' Starting eval_step().')
545+
logging.debug('[PAX STATUS]: Starting eval_step().')
542546
eval_partitioned_train_state = programs.get_eval_train_state(
543547
task, partitioned_train_state
544548
)
545549
# If we have eval test then also evaluate on test.
546550
if eval_programs:
547-
logging.debug(' Performing eval_step() runs on test splits.')
551+
logging.debug(
552+
'[PAX STATUS]: Performing eval_step() runs on test splits.'
553+
)
548554
with py_utils.timeit() as eval_period:
549555
eval_metrics_list, eval_scoring_metrics_list, num_eval_steps = (
550556
eval_lib.run_eval_loop_over_test_splits(
@@ -566,7 +572,8 @@ def _train_and_evaluate_common(
566572
input_names=[prog.eval_input.name for prog in eval_programs],
567573
)
568574
logging.debug(
569-
' Completed eval_step() runs on test splits in %f seconds.',
575+
'[PAX STATUS]: Completed eval_step() runs on test splits in %f'
576+
' seconds.',
570577
eval_period.elapsed,
571578
)
572579

@@ -586,7 +593,9 @@ def _train_and_evaluate_common(
586593
decode_partitioned_train_state = tasks_lib.extract_ema(
587594
partitioned_train_state
588595
)
589-
logging.debug(' Performing decode_once_fn() with ema states.')
596+
logging.debug(
597+
'[PAX STATUS]: Performing decode_once_fn() with EMA states.'
598+
)
590599
else:
591600
decode_partitioned_train_state = partitioned_train_state
592601
decode_metrics = decode_once_fn(
@@ -595,8 +604,7 @@ def _train_and_evaluate_common(
595604
jax.monitoring.record_event_duration_secs(
596605
'/jax/pax/train/interleaved_decode_duration_sec',
597606
decode_period.elapsed)
598-
599-
logging.debug('step=`%d`: End', step_i - 1)
607+
logging.debug('[PAX STATUS]: Step `%d` completed.', step_i - 1)
600608

601609
if early_stopping_fn is not None:
602610
if tuning_lib.should_early_stop(
@@ -633,7 +641,8 @@ def _train_and_evaluate_common(
633641
)
634642
break
635643
gc.unfreeze()
636-
# Save checkpoint for the last step.
644+
645+
logging.info('[PAX STATUS]: Saving checkpoint for final step.')
637646
checkpointer.save_final(
638647
step_i,
639648
partitioned_train_state=partitioned_train_state,
@@ -643,3 +652,4 @@ def _train_and_evaluate_common(
643652
)
644653

645654
checkpointer.wait_until_finished()
655+
logging.info('[PAX STATUS]: Final checkpoint saved.')

paxml/partitioning.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,10 @@ def setup(
347347
)
348348

349349
if train_inputs_shape_dtype:
350+
logging.info('[PAX STATUS]: Getting input shapes from spec.')
350351
self._train_inputs_shape_dtype = train_inputs_shape_dtype
351352
else:
353+
logging.info('[PAX STATUS]: Getting input shapes from first batch.')
352354
self._train_inputs_shape_dtype = self._get_train_inputs_shape_dtype(
353355
train_input_pipeline
354356
)

paxml/programs.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def get_eval_train_state(task: tasks_lib.SingleTask, state: TrainState):
7676
'learner does not seem to have ema enabled'
7777
)
7878
eval_state = tasks_lib.extract_ema(state).to_eval_state()
79-
logging.debug(' Converted train state to eval with ema state.')
79+
logging.info('[PAX STATUS]: Converted train state to eval with EMA state.')
8080
else:
8181
eval_state = state.to_eval_state()
8282
return eval_state
@@ -207,6 +207,7 @@ def setup(
207207
eval_prng_seed: PRNGKey,
208208
init_step: int,
209209
) -> None:
210+
logging.info('[PAX STATUS]: Setting up BaseTrainProgram.')
210211
self._task = task
211212
self._train_input = train_input
212213
self._partitioner = partitioner
@@ -263,7 +264,7 @@ def should_run(self, state: TrainState, step: int) -> bool:
263264
# correspondingly.
264265
def run(self, state: TrainState, step: int) -> ProgramOutput:
265266
train_p = self._task.train
266-
logging.debug(' Retrieving inputs.')
267+
logging.debug('[PAX STATUS]: Retrieving inputs.')
267268

268269
model_inputs = self._train_input.get_next_padded()
269270

@@ -276,7 +277,7 @@ def run(self, state: TrainState, step: int) -> ProgramOutput:
276277
model_inputs, ## First two args can be consolidated
277278
self.train_input_partition_spec(model_inputs),
278279
)
279-
logging.debug(' Retrieved inputs.')
280+
logging.debug('[PAX STATUS]: Retrieved inputs.')
280281

281282
# Waits if it reaches max inflight steps. We do this after retrieving the
282283
# inputs to maximize efficiency.
@@ -287,7 +288,7 @@ def run(self, state: TrainState, step: int) -> ProgramOutput:
287288
if do_profile and step - self._initial_step == profiler_capture_step:
288289
self._profiler.capture_async()
289290

290-
logging.debug(' Performing train_step().')
291+
logging.debug('[PAX STATUS]: Performing train_step().')
291292
with jax.profiler.StepTraceAnnotation('train', step_num=step):
292293
with py_utils.timeit() as train_period:
293294
new_step, new_state, train_outputs = self.train_step(
@@ -297,21 +298,21 @@ def run(self, state: TrainState, step: int) -> ProgramOutput:
297298
model_inputs,
298299
self._train_unpadded_global_batch_size,
299300
)
300-
del state # Unused anymore.
301+
del state # Unused.
301302
jax.monitoring.record_event_duration_secs(
302303
'/jax/pax/train/duration_sec', train_period.elapsed
303304
)
304305
logging.debug(
305-
' Completed train_step() in %f seconds.', train_period.elapsed
306+
'[PAX STATUS]: train_step() took %f seconds.', train_period.elapsed
306307
)
307308
self._pending_train_losses.add_computation(train_outputs.loss)
308309
if step == self._initial_step:
309310
self._first_step_completion_time = time.time()
310311

311312
if do_profile and step - self._initial_step < profiler_capture_step:
312313
self._profiler.update_step_moving_mean(train_period.elapsed)
314+
logging.debug('[PAX STATUS]: Writing summaries (attempt).')
313315
steps_per_sec = self._maybe_write_summaries(step, new_step, train_outputs)
314-
logging.debug(' Writing summaries (attempt).')
315316

316317
# Run eval at regular step interval.
317318
# While the eval ones below are post-model weight updates, hence we use the
@@ -402,7 +403,7 @@ def _maybe_write_summaries(
402403
per_example_out=train_outputs.per_example_out,
403404
steps_per_sec=steps_per_sec,
404405
)
405-
logging.debug(' Wrote summaries (attempted).')
406+
logging.debug('[PAX STATUS]: Wrote summaries (attempted).')
406407
return steps_per_sec
407408

408409
def _compute_steps_per_sec(self, step: int):
@@ -474,7 +475,7 @@ def _maybe_run_eval_train(self, new_state: TrainState, new_step: int):
474475
if self._eval_train_summary_handler.process(
475476
new_step, loss, weighted_scalars, summary_tensors
476477
):
477-
logging.debug(' Wrote eval summaries.')
478+
logging.debug('[PAX STATUS]: Wrote eval summaries.')
478479
eval_train_metrics = metric_utils.as_float_dict(weighted_scalars)
479480
return eval_train_metrics
480481

@@ -680,7 +681,9 @@ def setup(
680681

681682
# Creates the eval input pipeline.
682683
self._input_p = self._partitioner.preprocess_input_config(self._input_p)
683-
logging.debug('Initializing eval_input pipeline : %s', self._input_p)
684+
logging.info(
685+
'[PAX STATUS]: Initializing eval_input pipeline : %s', self._input_p
686+
)
684687
self._eval_input_pipeline = instantiate(self._input_p)
685688
self._name = self.eval_input.name
686689
self._eval_unpadded_global_batch_size = (

paxml/train.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,16 +136,19 @@ def train_and_evaluate(
136136
on-demand checkpoint due to preemption.
137137
"""
138138
jax.monitoring.record_event('/jax/pax/train_and_evaluate/beacon')
139+
logging.info('[PAX STATUS] Starting `train_and_evaluate`')
139140
task_p = experiment_config.task()
140141
task_p = typing.cast(pax_fiddle.Config[tasks_lib.SingleTask], task_p)
141142

142143
# in case the user passed in a string dtype, convert it to an actual dtype
143144
task_p.model.fprop_dtype = jnp.dtype(task_p.model.fprop_dtype)
144145

146+
logging.info('[PAX STATUS] Obtaining and initializing datasets.')
145147
input_p = experiment_config.datasets()
146148
for inp in input_p:
147149
if not isinstance(
148-
inp, (base_input.BaseInput.HParams, base_input.DistributedInputHParams)
150+
inp,
151+
(base_input.BaseInput.HParams, base_input.DistributedInputHParams),
149152
):
150153
raise ValueError(
151154
f'Expecting BaseInput.HParams from datasets(), got: {inp.ToText()}'
@@ -156,6 +159,7 @@ def train_and_evaluate(
156159
f'Expecting exactly one training split. Got `{len(train_input_p)}`.'
157160
)
158161
train_input_p = train_input_p[0]
162+
logging.info('[PAX STATUS]: Done initializing dataset objects')
159163

160164
logging.info('train_input_p:')
161165
for line in base_hyperparams.nested_struct_to_text(
@@ -166,6 +170,7 @@ def train_and_evaluate(
166170
for line in base_hyperparams.nested_struct_to_text(task_p).splitlines(): # pytype: disable=attribute-error
167171
logging.info(' %s', line)
168172

173+
logging.info('[PAX STATUS]: Initializing decoder')
169174
if (
170175
run_decode
171176
and task_p.train.decode_interval_steps is not None
@@ -198,6 +203,7 @@ def train_and_evaluate(
198203
)
199204

200205
# Creates the task.
206+
logging.info('[PAX STATUS]: Creating task')
201207
jax_task = instantiate(task_p)
202208
if jax_task.early_stopping_fn is not None:
203209
if early_stopping_fn is None:
@@ -208,15 +214,16 @@ def train_and_evaluate(
208214
'train_and_evel function parameter.'
209215
)
210216

217+
logging.info('[PAX STATUS]: Initializing partitioner')
211218
# Creates the partitioner, which will be set up later.
212219
partitioner = experiment_config.partitioner()
213220
if not partitioner:
214221
# For the input pipeline on the Pathways client, the inputs are numpy
215222
# arrays. We rely on the Pathways to transfer the inputs, since
216223
# jax.device_put() has a larger performance overhead.
217224
reshard_inputs = (
218-
checkpointer.checkpoint_type != CheckpointType.PERSISTENCE or
219-
train_input_p.experimental_remote_input
225+
checkpointer.checkpoint_type != CheckpointType.PERSISTENCE
226+
or train_input_p.experimental_remote_input
220227
)
221228
partitioner = partitioning.create_partitioner(
222229
jax_task,
@@ -235,9 +242,11 @@ def train_and_evaluate(
235242
eval_programs = experiment_config.eval_programs()
236243

237244
# Creates the executor and run the training pipeline.
245+
logging.info('[PAX STATUS]: Creating executor.')
238246
executor = experiment_config.executor()
239247
if not executor:
240248
executor = executors.DefaultExecutor()
249+
logging.info('[PAX STATUS]: Setting up executor.')
241250
with partitioner.global_mesh or contextlib.nullcontext():
242251
executor.setup(
243252
jax_task,

0 commit comments

Comments (0)