NVIDIA-NeMo
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docker/Dockerfile‎
Lines changed: 1 addition & 2 deletions b/‎docker/Dockerfile‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py‎
Lines changed: 143 additions & 29 deletions b/‎examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py‎
Lines changed: 143 additions & 29 deletions
diff --git a/‎examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml‎
Lines changed: 1 addition & 0 deletions b/‎examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/asr/conf/asr_streaming_inference/cache_aware_rnnt.yaml‎
Lines changed: 1 addition & 0 deletions b/‎examples/asr/conf/asr_streaming_inference/cache_aware_rnnt.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo/collections/asr/inference/factory/base_builder.py‎
Lines changed: 27 additions & 1 deletion b/‎nemo/collections/asr/inference/factory/base_builder.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎nemo/collections/asr/inference/factory/buffered_pipeline_builder.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo/collections/asr/inference/factory/buffered_pipeline_builder.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nemo/collections/asr/inference/factory/cache_aware_pipeline_builder.py‎
Lines changed: 1 addition & 0 deletions b/‎nemo/collections/asr/inference/factory/cache_aware_pipeline_builder.py‎
Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ weight checkpoints and demos!
 > The first release of NeMo Speech after NeMo repository split is scheduled for June 2026, as the repo undergoes transformation.
 > For the latest stable released version, please use [the 26.02 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo?version=26.02).
 
+- 2026-06: [Nemotron-3.5-ASR-Streaming-0.6B](https://huggingface.co/nvidia/nemotron-3.5-asr-streaming-0.6b) has been released with support for 40 languages, controllable latency (80ms-1s), and 240-2400 concurrent streams on 1xH100. Built on the cache-aware FastConformer architecture.
 - 2026-04: [Parakeet-unified-en-0.6b](https://huggingface.co/nvidia/parakeet-unified-en-0.6b) has been released with high-quality offline and streaming (with a minimum latency of 160ms) inference in one model for English language with punctuation and capitalization support. 
 - 2026-03: [Nemotron 3 VoiceChat](https://build.nvidia.com/nvidia/nemotron-voicechat/modelcard) is now released in Early Access. Built on the Nemotron Nano v2 LLM backbone with Nemotron speech and TTS decoder, VoiceChat delivers full-duplex, natural, interruptible conversations with low latency. Try out [the demo](https://build.nvidia.com/nvidia/nemotron-voicechat) and apply for [early access](https://developer.nvidia.com/nemotron-voicechat-early-access).
 - 2026-03: [Nemotron-Speech-Streaming v2603](https://huggingface.co/nvidia/nemotron-speech-streaming-en-0.6b) has been
 
@@ -130,7 +130,6 @@ case "$(nemo-cuda-flavor)" in
 esac
 uv pip install --index-url "${torchcodec_index}" torchcodec
 EOF
-COPY nemo /workspace/nemo
 
 FROM base-image AS automodel-deps
 ARG GPU_TARGET=h100plus
@@ -291,7 +290,7 @@ if [ "${INSTALL_FFMPEG}" = "true" ]; then
 fi
 EOF
 
-ENV NEMO_HOME="/home/TestData/nemo_home"
+COPY nemo /workspace/nemo
 
 # NOTICES.txt file points to where the OSS source code is archived
 RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
 
@@ -75,6 +75,9 @@
 from nemo.collections.asr.models import EncDecHybridRNNTCTCModel, EncDecRNNTModel
 from nemo.collections.asr.parts.context_biasing.biasing_multi_model import BiasingRequestItemConfig
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
+from nemo.collections.asr.parts.submodules.rnnt_maes_batched_computer import ModifiedAESBatchedRNNTComputer
+from nemo.collections.asr.parts.submodules.rnnt_malsd_batched_computer import ModifiedALSDBatchedRNNTComputer
+from nemo.collections.asr.parts.submodules.tdt_malsd_batched_computer import ModifiedALSDBatchedTDTComputer
 from nemo.collections.asr.parts.submodules.transducer_decoding.label_looping_base import (
     GreedyBatchedLabelLoopingComputerBase,
 )
@@ -84,6 +87,7 @@
 from nemo.collections.asr.parts.utils.streaming_utils import (
     AudioBatch,
     ContextSize,
+    DynamicLengthTensor,
     SimpleAudioDataset,
     StreamingBatchedAudioBuffer,
 )
@@ -155,8 +159,15 @@ class TranscriptionConfig:
     decoding: RNNTDecodingConfig = field(default_factory=RNNTDecodingConfig)
     # Per-utterance biasing with biasing config in the manifest
     use_per_stream_biasing: bool = False
+    # Simulated decoding (False by default): useful for faster experiments and for experiments
+    # with decoding algorithms that are not yet implemented in streaming mode.
+    # The encoder is evaluated on chunks; its output is concatenated and decoded in one step.
+    # This is expected to provide the same results as streaming decoding if the decoding strategy
+    # supports streaming decoding without additional heuristics (e.g., pruning between steps).
+    simulated: bool = False
 
     timestamps: bool = False  # output timestamps
+    confidence: bool = False  # output word confidence
 
     # Config for word / character error rate calculation
     calculate_wer: bool = True
@@ -229,19 +240,34 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
     asr_model.to(compute_dtype)
 
     use_per_stream_biasing = cfg.use_per_stream_biasing
+    use_simulated_decoding = cfg.simulated
 
     # Change Decoding Config
-    with open_dict(cfg.decoding):
-        if cfg.decoding.strategy != "greedy_batch" or cfg.decoding.greedy.loop_labels is not True:
-            raise NotImplementedError(
-                "This script currently supports only `greedy_batch` strategy with Label-Looping algorithm"
-            )
-        cfg.decoding.tdt_include_token_duration = cfg.timestamps
-        cfg.decoding.greedy.preserve_alignments = False
-        cfg.decoding.fused_batch_size = -1  # temporarily stop fused batch during inference.
-        cfg.decoding.beam.return_best_hypothesis = True  # return and write the best hypothsis only
-        if use_per_stream_biasing:
-            cfg.decoding.greedy.enable_per_stream_biasing = use_per_stream_biasing
+    if use_simulated_decoding:
+        # simulated decoding: any config allowed, do not change config
+        with open_dict(cfg.decoding):
+            if cfg.decoding.strategy != "greedy_batch" or cfg.decoding.greedy.loop_labels is not True:
+                logging.warning(
+                    f"Using {cfg.decoding.strategy} in simulated decoding."
+                    " Only greedy_batch with label-looping fully supports"
+                    " non-simulated streaming decoding for now."
+                )
+    else:
+        with open_dict(cfg.decoding):
+            if cfg.decoding.strategy == "greedy_batch" and cfg.decoding.greedy.loop_labels is not True:
+                raise NotImplementedError(
+                    "This script supports `greedy_batch` strategy only with Label-Looping algorithm"
+                )
+            cfg.decoding.tdt_include_token_duration = cfg.timestamps
+            cfg.decoding.greedy.preserve_alignments = False
+            cfg.decoding.fused_batch_size = -1  # temporarily stop fused batch during inference.
+            cfg.decoding.beam.return_best_hypothesis = True  # return and write the best hypothsis only
+            if use_per_stream_biasing:
+                cfg.decoding.greedy.enable_per_stream_biasing = use_per_stream_biasing
+            if cfg.confidence:
+                cfg.decoding.greedy.preserve_frame_confidence = True
+                cfg.decoding.confidence_cfg.preserve_frame_confidence = True
+                cfg.decoding.confidence_cfg.preserve_word_confidence = True
 
     # Setup decoding strategy
     if hasattr(asr_model, 'change_decoding_strategy'):
@@ -278,7 +304,20 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
     asr_model.preprocessor.featurizer.pad_to = 0
     asr_model.eval()
 
-    decoding_computer: GreedyBatchedLabelLoopingComputerBase = asr_model.decoding.decoding.decoding_computer
+    try:
+        if cfg.decoding.strategy == "greedy_batch":
+            decoding_computer: GreedyBatchedLabelLoopingComputerBase = asr_model.decoding.decoding.decoding_computer
+        elif cfg.decoding.strategy == "malsd_batch":
+            decoding_computer = asr_model.decoding.decoding.decoding_computer
+        elif cfg.decoding.strategy == "maes_batch":
+            decoding_computer: ModifiedAESBatchedRNNTComputer = asr_model.decoding.decoding.decoding_computer
+        else:
+            raise ValueError(f"Unsupported decoding strategy: {cfg.decoding.strategy}")
+    except AttributeError:
+        decoding_computer = None
+
+    if (not use_simulated_decoding) or use_per_stream_biasing:
+        assert decoding_computer is not None
 
     audio_sample_rate = model_cfg.preprocessor['sample_rate']
 
@@ -393,6 +432,12 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                 device=device,
             )
             rest_audio_lengths = audio_batch_lengths.clone()
+            encoder_output_aggregated: DynamicLengthTensor | None = None
+
+            is_beam_search = isinstance(
+                decoding_computer,
+                (ModifiedALSDBatchedRNNTComputer, ModifiedAESBatchedRNNTComputer, ModifiedALSDBatchedTDTComputer),
+            )
 
             # iterate over audio samples
             while left_sample < audio_batch.shape[1]:
@@ -423,36 +468,97 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                 encoder_context_batch = buffer.context_size_batch.subsample(factor=encoder_frame2audio_samples)
                 # remove left context
                 encoder_output = encoder_output[:, encoder_context.left :]
-
-                # decode only chunk frames
-                chunk_batched_hyps, _, state = decoding_computer(
-                    x=encoder_output,
-                    out_len=torch.where(
-                        is_last_chunk_batch,
-                        encoder_output_len - encoder_context_batch.left,
-                        encoder_context_batch.chunk,
-                    ),
-                    prev_batched_state=state,
-                    multi_biasing_ids=multi_biasing_ids,
+                encoder_output_len_to_decode = torch.where(
+                    is_last_chunk_batch,
+                    encoder_output_len - encoder_context_batch.left,
+                    encoder_context_batch.chunk,
                 )
-                # merge hyps with previous hyps
-                if current_batched_hyps is None:
-                    current_batched_hyps = chunk_batched_hyps
+
+                if use_simulated_decoding:
+                    # store encoder output (accumulate)
+                    if encoder_output_aggregated is None:
+                        encoder_output_aggregated = DynamicLengthTensor(
+                            batch_size=batch_size,
+                            init_length=encoder_output.shape[1],
+                            dim_shape=encoder_output.shape[2],
+                            device=device,
+                            dtype=compute_dtype,
+                        )
+                    encoder_output_aggregated.append_(data=encoder_output, lengths=encoder_output_len_to_decode)
                 else:
-                    current_batched_hyps.merge_(chunk_batched_hyps)
+                    if not is_beam_search:
+                        # decode only chunk frames
+                        chunk_batched_hyps, state = decoding_computer(
+                            x=encoder_output,
+                            out_len=encoder_output_len_to_decode,
+                            prev_batched_state=state,
+                            multi_biasing_ids=multi_biasing_ids,
+                        )
+
+                        # merge hyps with previous hyps
+                        if current_batched_hyps is None:
+                            current_batched_hyps = chunk_batched_hyps
+                        else:
+                            current_batched_hyps.merge_(chunk_batched_hyps)
+                    else:
+                        chunk_batched_hyps, state = decoding_computer(
+                            x=encoder_output,
+                            out_len=encoder_output_len_to_decode,
+                            prev_batched_state=state,
+                        )
+                        # flatten_ to flatten the prefix tree and link beams to prior chunks in merge_ using root_ptrs.
+                        chunk_root_ptrs = chunk_batched_hyps.flatten_()
+                        if current_batched_hyps is None:
+                            current_batched_hyps = chunk_batched_hyps
+                        else:
+                            current_batched_hyps.merge_(
+                                chunk_batched_hyps,
+                                is_chunk_continuation=True,
+                                boundary_prev_ptr=chunk_root_ptrs,
+                            )
 
                 # move to next sample
                 rest_audio_lengths -= chunk_lengths_batch
                 left_sample = right_sample
                 right_sample = min(right_sample + context_samples.chunk, audio_batch.shape[1])  # add next chunk
 
+            if use_simulated_decoding:
+                # decode aggregated streaming encoder output
+                if decoding_computer is not None:
+                    if not is_beam_search:
+                        current_batched_hyps, _ = decoding_computer(
+                            x=encoder_output_aggregated.data,
+                            out_len=encoder_output_aggregated.lengths,
+                            prev_batched_state=state,
+                            multi_biasing_ids=multi_biasing_ids,
+                        )
+                        all_hyps.extend(batched_hyps_to_hypotheses(current_batched_hyps, batch_size=batch_size))
+                    else:
+                        current_batched_hyps, _ = decoding_computer(
+                            x=encoder_output_aggregated.data,
+                            out_len=encoder_output_aggregated.lengths,
+                            prev_batched_state=state,
+                        )
+                        all_hyps.extend(current_batched_hyps.to_hyps_list(score_norm=True))
+                else:
+                    # no decoding computer, fallback to `asr_model.decoding.decoding`
+                    (cur_hyps,) = asr_model.decoding.decoding(
+                        encoder_output=encoder_output_aggregated.data.transpose(1, 2),
+                        encoded_lengths=encoder_output_aggregated.lengths,
+                    )
+                    all_hyps.extend(cur_hyps)
+            else:
+                if not is_beam_search:
+                    all_hyps.extend(batched_hyps_to_hypotheses(current_batched_hyps, batch_size=batch_size))
+                else:
+                    all_hyps.extend(current_batched_hyps.to_hyps_list(score_norm=True))
+
             # remove biasing requests from the decoder
             if use_per_stream_biasing and audio_data.biasing_requests is not None:
                 for request in audio_data.biasing_requests:
                     if request is not None and request.multi_model_id is not None:
                         decoding_computer.biasing_multi_model.remove_model(request.multi_model_id)
                         request.multi_model_id = None
-            all_hyps.extend(batched_hyps_to_hypotheses(current_batched_hyps, None, batch_size=batch_size))
         timer.stop(device=map_location)
 
     # convert text
@@ -466,6 +572,8 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                 window_stride=asr_model.cfg['preprocessor']['window_stride'],
             )
             all_hyps[i] = hyp
+    if cfg.confidence:
+        all_hyps = asr_model.decoding.compute_confidence(all_hyps)
 
     if cfg.sort_by_duration:
         # restore order for all_hyps and records (all_hyps are consistent with records)
@@ -475,7 +583,13 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
         records, all_hyps = map(list, zip(*order_restored))
 
     output_filename, pred_text_attr_name = write_transcription(
-        all_hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=cfg.timestamps
+        all_hyps,
+        cfg,
+        model_name,
+        filepaths=filepaths,
+        compute_langs=False,
+        timestamps=cfg.timestamps,
+        confidence=cfg.confidence,
     )
     logging.info(f"Finished writing predictions to {output_filename}!")
 
 
@@ -14,6 +14,7 @@ asr:
     greedy:
       use_cuda_graph_decoder: true
       enable_per_stream_biasing: true  # Per-stream biasing in decoder
+      preserve_frame_confidence: false  # Set to true to calculate confidence (token/word etc.)
       max_symbols: 10
       # n-gram LM
       ngram_lm_model: null  # The path to built '.nemo' NGPU-LM model
 
@@ -14,6 +14,7 @@ asr:
     greedy:
       use_cuda_graph_decoder: false  # Disabled due to issues with decoding
       enable_per_stream_biasing: true  # Per-stream biasing in decoder
+      preserve_frame_confidence: false  # Set to true to calculate confidence (token/word etc.)
       max_symbols: 10
       # n-gram LM
       ngram_lm_model: null  # The path to built '.nemo' NGPU-LM model
 
@@ -16,7 +16,7 @@
 
 from typing import TYPE_CHECKING, Any
 
-from omegaconf import open_dict
+from omegaconf import OmegaConf, open_dict
 from omegaconf.dictconfig import DictConfig
 
 from nemo.collections.asr.inference.model_wrappers.cache_aware_ctc_inference_wrapper import (
@@ -72,6 +72,32 @@ def _build_nmt(cls, cfg: DictConfig) -> LLMTranslator | None:
             logging.info(f"NMT model `{cfg.nmt.model_name}` loaded")
         return nmt_model
 
+    @staticmethod
+    def _apply_confidence_cfg(cfg: DictConfig, decoding_cfg: RNNTDecodingConfig) -> None:
+        """
+        Wire the separately-stored `confidence` block into the RNNT decoding confidence config so the
+        greedy decoder computes per-token confidence with the configured method. The streaming pipelines
+        only support non-blank confidence (`confidence.exclude_blank=true`).
+        Args:
+            cfg: (DictConfig) Full pipeline config (provides the top-level `confidence` block).
+            decoding_cfg: (RNNTDecodingConfig) Decoding config to update in place.
+        """
+        if not decoding_cfg.greedy.get("preserve_frame_confidence", False):
+            return
+        confidence_cfg = cfg.get("confidence", None)
+        if confidence_cfg is None:
+            return
+        if not confidence_cfg.get("exclude_blank", True):
+            raise ValueError(
+                "Streaming confidence supports only non-blank confidence (`confidence.exclude_blank=true`)."
+            )
+        decoding_cfg.confidence_cfg.preserve_frame_confidence = True
+        decoding_cfg.confidence_cfg.exclude_blank = True
+        decoding_cfg.confidence_cfg.aggregation = confidence_cfg.get("aggregation", "mean")
+        decoding_cfg.confidence_cfg.method_cfg = OmegaConf.merge(
+            decoding_cfg.confidence_cfg.method_cfg, confidence_cfg.method_cfg
+        )
+
     @classmethod
     def _build_asr(cls, cfg: DictConfig, decoding_cfg: CTCDecodingConfig | RNNTDecodingConfig | None) -> Any:
         """
 
@@ -62,6 +62,7 @@ def get_rnnt_decoding_cfg(cls, cfg: DictConfig) -> RNNTDecodingConfig:
         base_cfg_structured = OmegaConf.structured(RNNTDecodingConfig)
         base_cfg = OmegaConf.create(OmegaConf.to_container(base_cfg_structured))
         decoding_cfg = OmegaConf.merge(base_cfg, cfg.asr.decoding)
+        cls._apply_confidence_cfg(cfg, decoding_cfg)
         return decoding_cfg
 
     @classmethod
 
@@ -57,6 +57,7 @@ def get_rnnt_decoding_cfg(cls, cfg: DictConfig) -> RNNTDecodingConfig:
         base_cfg_structured = OmegaConf.structured(RNNTDecodingConfig)
         base_cfg = OmegaConf.create(OmegaConf.to_container(base_cfg_structured))
         decoding_cfg = OmegaConf.merge(base_cfg, cfg.asr.decoding)
+        cls._apply_confidence_cfg(cfg, decoding_cfg)
         return decoding_cfg
 
     @classmethod