Debug commit: upleveled to latest

gilkeren1 · gilkeren1 · commit acd77f824b0d · 2025-04-30T15:16:41.000-07:00
diff --git a/src/fairseq2/cli/_main.py b/src/fairseq2/cli/_main.py
@@ -8,47 +8,47 @@
 
 import os
 import sys
-from signal import SIG_DFL, SIGINT, raise_signal, signal
+from signal import raise_signal, SIG_DFL, SIGINT, signal
 
 import torch
-from torch.cuda import OutOfMemoryError
 
 from fairseq2 import setup_fairseq2
-from fairseq2.cli.utils.rich import create_rich_progress_reporter
-from fairseq2.error import ContractError, InternalError
-from fairseq2.extensions import ExtensionError
-from fairseq2.logging import LoggingSetupError, log
-from fairseq2.setup import SetupError
-from fairseq2.utils.env import InvalidEnvironmentVariableError, get_rank
 
 # isort: split
 
 from fairseq2.cli._logging import setup_logging
 from fairseq2.cli._setup import setup_cli
+from fairseq2.cli.utils.rich import create_rich_progress_reporter
+from fairseq2.error import ContractError, InternalError
+from fairseq2.extensions import ExtensionError
+from fairseq2.logging import log, LoggingSetupError
+from fairseq2.setup import SetupError
+from fairseq2.utils.env import get_rank, InvalidEnvironmentVariableError
+from torch.cuda import OutOfMemoryError
 
 
 def main() -> None:
     """Runs the command line fairseq2 program."""
     exit_code = 1
 
-    try:
-        exit_code = _run()
-    except KeyboardInterrupt:
-        log.info("Command canceled!")
+    # try:
+    exit_code = _run()
+    # except KeyboardInterrupt:
+    #     log.info("Command canceled!")
 
-        signal(SIGINT, SIG_DFL)
+    #     signal(SIGINT, SIG_DFL)
 
-        raise_signal(SIGINT)
-    except OutOfMemoryError:
-        s = torch.cuda.memory_summary()
+    #     raise_signal(SIGINT)
+    # except OutOfMemoryError:
+    #     s = torch.cuda.memory_summary()
 
-        log.exception("CUDA out of memory. See logged memory stats.\n{}", s)
-    except InternalError:
-        log.exception("Command failed with an unexpected internal error. Please file a bug report.")  # fmt: skip
-    except ContractError:
-        log.exception("Command failed with an unexpected internal error caused by an extension. Please file a bug report to the corresponding extension author.")  # fmt: skip
-    except Exception:
-        log.exception("Command failed with an unexpected error. See the logged stack trace for details.")  # fmt: skip
+    #     log.exception("CUDA out of memory. See logged memory stats.\n{}", s)
+    # except InternalError:
+    #     log.exception("Command failed with an unexpected internal error. Please file a bug report.")  # fmt: skip
+    # except ContractError:
+    #     log.exception("Command failed with an unexpected internal error caused by an extension. Please file a bug report to the corresponding extension author.")  # fmt: skip
+    # except Exception:
+    #     log.exception("Command failed with an unexpected error. See the logged stack trace for details.")  # fmt: skip
 
     sys.exit(exit_code)
 
@@ -84,3 +84,7 @@ def _run() -> int:
         return 1
 
     return cli.run(context)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/fairseq2/nn/utils/module.py b/src/fairseq2/nn/utils/module.py
@@ -13,13 +13,13 @@
 from typing import Protocol, runtime_checkable
 
 import torch
-from torch import Tensor
-from torch.nn import Module, Parameter
-from torch.nn.utils import remove_weight_norm  # type: ignore[attr-defined]
 
 from fairseq2.gang import Gang
 from fairseq2.logging import log
 from fairseq2.typing import CPU, Device
+from torch import Tensor
+from torch.nn import Module, Parameter
+from torch.nn.utils import remove_weight_norm  # type: ignore[attr-defined]
 
 
 @runtime_checkable
@@ -464,6 +464,21 @@ def load_state_dict(
     ``state_dict`` does not contain any keys corresponding to descendants that are set to ``None``
     via :meth:`Module.register_module()`.
     """
+    # Key mapping
+    need_mapping = False
+    sample_key = list(state_dict.keys())[0]
+    if (
+        sample_key.startswith("module.")
+        and not sample_key in module.state_dict().keys()
+    ):
+        mapped_key = sample_key[7:]
+        if mapped_key in module.state_dict().keys():
+            need_mapping = True
+
+    if need_mapping:
+        key_mapping = lambda key: key[7:] if key.startswith("module.") else key
+        state_dict = {key_mapping(key): value for key, value in state_dict.items()}
+
     module.load_state_dict(state_dict, strict=strict)
 
     unexpected_keys = []
diff --git a/src/fairseq2/recipes/_validator.py b/src/fairseq2/recipes/_validator.py
@@ -6,15 +6,14 @@
 
 from __future__ import annotations
 
+import socket
+
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from contextlib import nullcontext
-from typing import Generic, TypeVar, final
+from typing import final, Generic, TypeVar
 
 import torch
-from torch import Tensor
-from torch.profiler import record_function
-from typing_extensions import override
 
 from fairseq2.checkpoint import CheckpointError, CheckpointManager, CheckpointSaveError
 from fairseq2.datasets import DataReader, DataReadError
@@ -25,17 +24,20 @@
 from fairseq2.metrics import MetricBagError, MetricDescriptor
 from fairseq2.metrics.recorders import MetricRecorder, MetricRecordError
 from fairseq2.profilers import Profiler
-from fairseq2.typing import CPU, ContextManager, DataType
-from fairseq2.utils.device_stat import DeviceStatTracker
-from fairseq2.utils.progress import ProgressReporter, ProgressTask
-from fairseq2.utils.rng import RngBag
-from fairseq2.utils.stopwatch import Stopwatch
 
 # isort: split
 
 from fairseq2.recipes._error import RecipeError, UnitError
 from fairseq2.recipes._evaluator import EvalUnit
 from fairseq2.recipes._metrics import extend_batch_metrics
+from fairseq2.typing import ContextManager, CPU, DataType
+from fairseq2.utils.device_stat import DeviceStatTracker
+from fairseq2.utils.progress import ProgressReporter, ProgressTask
+from fairseq2.utils.rng import RngBag
+from fairseq2.utils.stopwatch import Stopwatch
+from torch import Tensor
+from torch.profiler import record_function
+from typing_extensions import override
 
 
 class Validator(ABC):
@@ -243,7 +245,14 @@ def _run_unit(
                     f"The {s} unit has failed. See the nested exception for details."
                 ) from ex
 
+            machine_name = socket.gethostname()
+            if machine_name.startswith("devvm"):
+                _max_num_valid_steps = 5
+            else:
+                _max_num_valid_steps = 50000000000
+            c = 0
             while not eod:
+                log.info(f"s1: Running validation step {c}.")
                 try:
                     self._checkpoint_manager.maybe_complete_async_checkpoint()
                 except CheckpointSaveError as ex:
@@ -252,10 +261,13 @@ def _run_unit(
                     ) from ex
 
                 batches = self._read_next_batches(unit, data_reader)
-                if batches is None:
+                log.info(f"s2: Read batches step {c}.")
+                if batches is None or c == _max_num_valid_steps:
                     eod = True
                 else:
                     self._run_step(unit, batches, progress_task)
+                log.info(f"s7: Done with step {c}.")
+                c += 1
 
             with self._compute_watch:
                 with record_function("finalize"):
diff --git a/src/fairseq2/recipes/asr/_common.py b/src/fairseq2/recipes/asr/_common.py
@@ -7,12 +7,15 @@
 from __future__ import annotations
 
 import math
+import re
 from typing import Any, Dict, final, TextIO
 
 import torch
 
 from fairseq2.data.text.tokenizers import TextTokenDecoder, TextTokenizer
 from fairseq2.gang import Gang
+
+from fairseq2.logging import log
 from fairseq2.metrics import Mean
 from fairseq2.metrics.text import WerMetric
 from fairseq2.models.asr import AsrModel, AsrModelOutput
@@ -57,8 +60,10 @@ def __call__(
                 )
             input_batch = batch
 
+        log.info(f"s3: calling forward")
         output = self._forward(input_batch)
 
+        log.info(f"s4: calling loss")
         loss, extra_metrics = output.compute_loss(
             batch.target_seqs, batch.target_padding_mask
         )
@@ -68,8 +73,11 @@ def __call__(
         metric_bag.update_batch_metrics(batch)
 
         metric_bag.update_extra_metrics(batch, extra_metrics)
+
+        log.info(f"s5: calling scorer")
         if self._scorer is not None:
             self._scorer(batch, output, metric_bag)
+        log.info(f"s6: done scorer")
 
         return loss, batch.batch_size
 
@@ -132,6 +140,11 @@ def __call__(
         refs = [self._text_decoder(s) for s in ref_seqs]
         hyps = [self._text_decoder(s) for s in hyp_seqs]
 
+        for r, h in zip(refs, hyps):
+            if torch.rand([]) < 0.01 or bool(re.search(r"[\u0590-\u05FF]", r)):
+                log.info(f"Reference: {r}")
+                log.info(f"Hypothesis: {h}")
+
         metric_bag.wer.update(
             refs, ref_seqs, ref_padding_mask, hyps, hyp_seqs, hyp_padding_mask
         )
diff --git a/src/fairseq2/recipes/wav2vec2/asr/_train.py b/src/fairseq2/recipes/wav2vec2/asr/_train.py
@@ -122,6 +122,7 @@ class Wav2Vec2AsrTrainConfig:
             validate_after_n_steps=10_000,
             validate_every_n_steps=1_000,
             publish_metrics_every_n_steps=200,
+            keep_last_n_checkpoints=1,
         )
     )
 
@@ -266,7 +267,7 @@ def load_wav2vec2_asr_trainer(
 
     # If we start the training with an empty ASR model, use the weights of a
     # pretrained wav2vec 2.0 model.
-    if model.is_empty_initialized:
+    if model.is_empty_initialized and config.pretrained_model.name:
         pt_model = load_reference_model(
             Wav2Vec2Model,
             context,

Original file line number	Diff line number	Diff line change
`@@ -122,6 +122,7 @@ class Wav2Vec2AsrTrainConfig:`
`122`	`122`	`validate_after_n_steps=10_000,`
`123`	`123`	`validate_every_n_steps=1_000,`
`124`	`124`	`publish_metrics_every_n_steps=200,`
	`125`	`+ keep_last_n_checkpoints=1,`
`125`	`126`	`)`
`126`	`127`	`)`
`127`	`128`
`@@ -266,7 +267,7 @@ def load_wav2vec2_asr_trainer(`
`266`	`267`
`267`	`268`	`# If we start the training with an empty ASR model, use the weights of a`
`268`	`269`	`# pretrained wav2vec 2.0 model.`
`269`		`- if model.is_empty_initialized:`
	`270`	`+ if model.is_empty_initialized and config.pretrained_model.name:`
`270`	`271`	`pt_model = load_reference_model(`
`271`	`272`	`Wav2Vec2Model,`
`272`	`273`	`context,`