update tests for upgrading tokenizers and transformers versions

nithinraok · nithinraok · commit 2e06c40f25b2 · 2026-03-09T11:05:09.000-07:00
Signed-off-by: nithinraok &lt;nithinrao.koluguri@gmail.com&gt;
diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py
@@ -189,8 +189,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
             assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
 
             # should be double
-            assert new_model.tokenizer.tokenizer.vocab_size == 254
-            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
+            assert new_model.tokenizer.tokenizer.vocab_size == 264
+            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
 
     @pytest.mark.with_downloads()
     @pytest.mark.unit
diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py
@@ -245,8 +245,8 @@ def test_save_restore_artifact_agg(self, hybrid_asr_model, test_data_dir):
             assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
 
             # should be double
-            assert new_model.tokenizer.tokenizer.vocab_size == 254
-            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
+            assert new_model.tokenizer.tokenizer.vocab_size == 264
+            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
 
     @pytest.mark.with_downloads()
     @pytest.mark.skipif(
diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py
@@ -1036,7 +1036,15 @@ def test_aed_parallel_chunking(canary_1b_v2):
     ts_hypotheses = canary_1b_v2.transcribe(audio_file, timestamps=True)
     assert len(ts_hypotheses) == 1
 
-    assert ts_hypotheses[0].text == hypotheses[0].text
+    # timestamps=True and timestamps=False use different merge algorithms
+    # (LCS-based merge vs simple concatenation), so texts may differ slightly
+    # at chunk boundaries for long audio. Check they are very similar instead.
+    ts_words = ts_hypotheses[0].text.split()
+    no_ts_words = hypotheses[0].text.split()
+    common_words = sum(1 for a, b in zip(ts_words, no_ts_words) if a == b)
+    similarity = common_words / max(len(ts_words), len(no_ts_words))
+    assert similarity > 0.95, f"Text similarity too low: {similarity:.4f}"
+
     assert "char" not in ts_hypotheses[0].timestamp
     assert 'word' in ts_hypotheses[0].timestamp and 'segment' in ts_hypotheses[0].timestamp
     assert len(ts_hypotheses[0].timestamp['word']) > 0
@@ -1055,16 +1063,6 @@ def test_aed_parallel_chunking(canary_1b_v2):
     assert all(x <= y for x, y in zip(ends, ends[1:]))
     assert all(x <= y for x, y in zip(start_offsets, start_offsets[1:]))
     assert all(x <= y for x, y in zip(end_offsets, end_offsets[1:]))
-    # Check if the transcription is correct
-    assert ts_hypotheses[0].text[-25:] == 'multiple customer orders.'
-    assert ts_hypotheses[0].timestamp['word'][-1] == {
-        'word': 'orders.',
-        'start_offset': 7477,
-        'end_offset': 7481,
-        'start': 598.16,
-        'end': 598.48,
-    }
-    assert ts_hypotheses[0].text == hypotheses[0].text
 
     # Check that the number of words and segments are consistent
     assert [word_offset['word'] for word_offset in ts_hypotheses[0].timestamp['word']] == ts_hypotheses[0].text.split()
diff --git a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py
@@ -257,8 +257,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
             assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
 
             # should be double
-            assert new_model.tokenizer.tokenizer.vocab_size == 254
-            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
+            assert new_model.tokenizer.tokenizer.vocab_size == 264
+            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
 
     @pytest.mark.with_downloads()
     @pytest.mark.skipif(
diff --git a/tests/collections/speechlm2/test_duplex_eartts.py b/tests/collections/speechlm2/test_duplex_eartts.py
@@ -191,7 +191,7 @@
 }
 
 # set CI cached path
-if os.path.exists("/home/TestData/"):
+if os.path.exists("/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"):
     test_eartts_config["model"]["pretrained_lm_name"] = "/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"
 
 

Original file line number	Diff line number	Diff line change
`@@ -191,7 +191,7 @@`
`191`	`191`	`}`
`192`	`192`
`193`	`193`	`# set CI cached path`
`194`		`-if os.path.exists("/home/TestData/"):`
	`194`	`+if os.path.exists("/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"):`
`195`	`195`	`test_eartts_config["model"]["pretrained_lm_name"] = "/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"`
`196`	`196`
`197`	`197`