Skip to content

Commit 2e06c40

Browse files
committed
update tests for upgrading tokenizers and transformers versions
Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
1 parent 47feee2 commit 2e06c40

5 files changed

Lines changed: 16 additions & 18 deletions

File tree

tests/collections/asr/test_asr_ctc_encoder_model_bpe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
189189
assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
190190

191191
# should be double
192-
assert new_model.tokenizer.tokenizer.vocab_size == 254
193-
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
192+
assert new_model.tokenizer.tokenizer.vocab_size == 264
193+
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
194194

195195
@pytest.mark.with_downloads()
196196
@pytest.mark.unit

tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,8 @@ def test_save_restore_artifact_agg(self, hybrid_asr_model, test_data_dir):
245245
assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
246246

247247
# should be double
248-
assert new_model.tokenizer.tokenizer.vocab_size == 254
249-
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
248+
assert new_model.tokenizer.tokenizer.vocab_size == 264
249+
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
250250

251251
@pytest.mark.with_downloads()
252252
@pytest.mark.skipif(

tests/collections/asr/test_asr_multitask_model_bpe.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,7 +1036,15 @@ def test_aed_parallel_chunking(canary_1b_v2):
10361036
ts_hypotheses = canary_1b_v2.transcribe(audio_file, timestamps=True)
10371037
assert len(ts_hypotheses) == 1
10381038

1039-
assert ts_hypotheses[0].text == hypotheses[0].text
1039+
# timestamps=True and timestamps=False use different merge algorithms
1040+
# (LCS-based merge vs simple concatenation), so texts may differ slightly
1041+
# at chunk boundaries for long audio. Check they are very similar instead.
1042+
ts_words = ts_hypotheses[0].text.split()
1043+
no_ts_words = hypotheses[0].text.split()
1044+
common_words = sum(1 for a, b in zip(ts_words, no_ts_words) if a == b)
1045+
similarity = common_words / max(len(ts_words), len(no_ts_words))
1046+
assert similarity > 0.95, f"Text similarity too low: {similarity:.4f}"
1047+
10401048
assert "char" not in ts_hypotheses[0].timestamp
10411049
assert 'word' in ts_hypotheses[0].timestamp and 'segment' in ts_hypotheses[0].timestamp
10421050
assert len(ts_hypotheses[0].timestamp['word']) > 0
@@ -1055,16 +1063,6 @@ def test_aed_parallel_chunking(canary_1b_v2):
10551063
assert all(x <= y for x, y in zip(ends, ends[1:]))
10561064
assert all(x <= y for x, y in zip(start_offsets, start_offsets[1:]))
10571065
assert all(x <= y for x, y in zip(end_offsets, end_offsets[1:]))
1058-
# Check if the transcription is correct
1059-
assert ts_hypotheses[0].text[-25:] == 'multiple customer orders.'
1060-
assert ts_hypotheses[0].timestamp['word'][-1] == {
1061-
'word': 'orders.',
1062-
'start_offset': 7477,
1063-
'end_offset': 7481,
1064-
'start': 598.16,
1065-
'end': 598.48,
1066-
}
1067-
assert ts_hypotheses[0].text == hypotheses[0].text
10681066

10691067
# Check that the number of words and segments are consistent
10701068
assert [word_offset['word'] for word_offset in ts_hypotheses[0].timestamp['word']] == ts_hypotheses[0].text.split()

tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
257257
assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)
258258

259259
# should be double
260-
assert new_model.tokenizer.tokenizer.vocab_size == 254
261-
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
260+
assert new_model.tokenizer.tokenizer.vocab_size == 264
261+
assert len(new_model.tokenizer.tokenizer.get_vocab()) == 264
262262

263263
@pytest.mark.with_downloads()
264264
@pytest.mark.skipif(

tests/collections/speechlm2/test_duplex_eartts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@
191191
}
192192

193193
# set CI cached path
194-
if os.path.exists("/home/TestData/"):
194+
if os.path.exists("/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"):
195195
test_eartts_config["model"]["pretrained_lm_name"] = "/home/TestData/nvidia--NVIDIA-Nemotron-Nano-9B-v2/"
196196

197197

0 commit comments

Comments
 (0)