Skip to content

Commit 538f260

Browse files
authored
【ASR】whisper large v3 (#4101)
* whisper large v3
* add convert.py
* mv nlp tokenizer to tiktoken.
* fix bug
* remove convert.py
* add new model file.
* fix
* fix version number
* fix version number
* fix some bug
* fix bug
1 parent 8f367b0 commit 538f260

File tree

6 files changed

+635
-275
lines changed

6 files changed

+635
-275
lines changed

paddlespeech/cli/whisper/infer.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -74,10 +74,9 @@ def __init__(self):
7474
self.parser.add_argument(
7575
'--size',
7676
type=str,
77-
default='large',
78-
choices=['large', 'medium', 'base', 'small', 'tiny'],
79-
help='Choose model size. now only support large, large:[whisper-large-16k]'
80-
)
77+
default='turbo',
78+
choices=['large', 'medium', 'base', 'small', 'tiny', 'turbo'],
79+
help='Choose model size.')
8180
self.parser.add_argument(
8281
'--language',
8382
type=str,
@@ -141,7 +140,7 @@ def _init_from_path(self,
141140
model_type: str='whisper',
142141
lang: str='',
143142
task: str='transcribe',
144-
size: str='large',
143+
size: str='turbo',
145144
language: str='None',
146145
sample_rate: int=16000,
147146
cfg_path: Optional[os.PathLike]=None,
@@ -200,6 +199,7 @@ def _init_from_path(self,
200199
# load model
201200
model_dict = paddle.load(self.ckpt_path)
202201
dims = ModelDimensions(**model_dict["dims"])
202+
self.dims = dims
203203
self.model = Whisper(dims)
204204
self.model.load_dict(model_dict)
205205
self.model.eval()
@@ -251,8 +251,11 @@ def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
251251

252252
logger.debug(f"audio shape: {audio.shape}")
253253
# fbank
254-
audio = log_mel_spectrogram(audio, resource_path=self.resource_path)
255-
254+
audio = log_mel_spectrogram(
255+
audio,
256+
resource_path=self.resource_path,
257+
n_mels=self.dims.n_mels,
258+
padding=480000)
256259
audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
257260

258261
self._inputs["audio"] = audio
@@ -275,7 +278,6 @@ def infer(self, model_type: str):
275278
cfg.temperature_increment_on_fallback))
276279
else:
277280
temperature = [cfg.temperature]
278-
279281
self._outputs["result"] = self.model.transcribe(
280282
audio,
281283
verbose=cfg.verbose,

paddlespeech/resource/pretrained_models.py

Lines changed: 182 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -617,6 +617,24 @@
617617
'resource_data_md5':
618618
'37a0a8abdb3641a51194f79567a93b61',
619619
},
620+
'1.5': {
621+
'url':
622+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-large-model.tar.gz',
623+
'md5':
624+
'9ebbd228fa07ca4557e5da863dac2982',
625+
'cfg_path':
626+
'whisper.yaml',
627+
'ckpt_path':
628+
'whisper-large-model',
629+
'model':
630+
'whisper-large-model.pdparams',
631+
'params':
632+
'whisper-large-model.pdparams',
633+
'resource_data':
634+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
635+
'resource_data_md5':
636+
'dd61d092d362f1fdbae6ede08282e177',
637+
},
620638
},
621639
"whisper-base-en-16k": {
622640
'1.3': {
@@ -637,6 +655,24 @@
637655
'resource_data_md5':
638656
'37a0a8abdb3641a51194f79567a93b61',
639657
},
658+
'1.5': {
659+
'url':
660+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-base-en-model.tar.gz',
661+
'md5':
662+
'376617a9c5f36404f50dde3708bac0c6',
663+
'cfg_path':
664+
'whisper.yaml',
665+
'ckpt_path':
666+
'whisper-base-en-model',
667+
'model':
668+
'whisper-base-en-model.pdparams',
669+
'params':
670+
'whisper-base-en-model.pdparams',
671+
'resource_data':
672+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
673+
'resource_data_md5':
674+
'dd61d092d362f1fdbae6ede08282e177',
675+
},
640676
},
641677
"whisper-base-16k": {
642678
'1.3': {
@@ -657,6 +693,24 @@
657693
'resource_data_md5':
658694
'37a0a8abdb3641a51194f79567a93b61',
659695
},
696+
'1.5': {
697+
'url':
698+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-base-model.tar.gz',
699+
'md5':
700+
'61836cb29c93048621f83364d83b532b',
701+
'cfg_path':
702+
'whisper.yaml',
703+
'ckpt_path':
704+
'whisper-base-model',
705+
'model':
706+
'whisper-base-model.pdparams',
707+
'params':
708+
'whisper-base-model.pdparams',
709+
'resource_data':
710+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
711+
'resource_data_md5':
712+
'dd61d092d362f1fdbae6ede08282e177',
713+
},
660714
},
661715
"whisper-medium-en-16k": {
662716
'1.3': {
@@ -677,6 +731,24 @@
677731
'resource_data_md5':
678732
'37a0a8abdb3641a51194f79567a93b61',
679733
},
734+
'1.5': {
735+
'url':
736+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-medium-en-model.tar.gz',
737+
'md5':
738+
'ac01145c5de962f1416f3d98171be559',
739+
'cfg_path':
740+
'whisper.yaml',
741+
'ckpt_path':
742+
'whisper-medium-en-model',
743+
'model':
744+
'whisper-medium-en-model.pdparams',
745+
'params':
746+
'whisper-medium-en-model.pdparams',
747+
'resource_data':
748+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
749+
'resource_data_md5':
750+
'dd61d092d362f1fdbae6ede08282e177',
751+
},
680752
},
681753
"whisper-medium-16k": {
682754
'1.3': {
@@ -697,6 +769,24 @@
697769
'resource_data_md5':
698770
'37a0a8abdb3641a51194f79567a93b61',
699771
},
772+
'1.5': {
773+
'url':
774+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-medium-model.tar.gz',
775+
'md5':
776+
'07770819961d1fe795facd3666f8db17',
777+
'cfg_path':
778+
'whisper.yaml',
779+
'ckpt_path':
780+
'whisper-medium-model',
781+
'model':
782+
'whisper-medium-model.pdparams',
783+
'params':
784+
'whisper-medium-model.pdparams',
785+
'resource_data':
786+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
787+
'resource_data_md5':
788+
'dd61d092d362f1fdbae6ede08282e177',
789+
},
700790
},
701791
"whisper-small-en-16k": {
702792
'1.3': {
@@ -717,6 +807,24 @@
717807
'resource_data_md5':
718808
'37a0a8abdb3641a51194f79567a93b61',
719809
},
810+
'1.5': {
811+
'url':
812+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-small-en-model.tar.gz',
813+
'md5':
814+
'67af14156b93f49ae738a17204189e46',
815+
'cfg_path':
816+
'whisper.yaml',
817+
'ckpt_path':
818+
'whisper-small-en-model',
819+
'model':
820+
'whisper-small-en-model.pdparams',
821+
'params':
822+
'whisper-small-en-model.pdparams',
823+
'resource_data':
824+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
825+
'resource_data_md5':
826+
'dd61d092d362f1fdbae6ede08282e177',
827+
},
720828
},
721829
"whisper-small-16k": {
722830
'1.3': {
@@ -737,6 +845,24 @@
737845
'resource_data_md5':
738846
'37a0a8abdb3641a51194f79567a93b61',
739847
},
848+
'1.5': {
849+
'url':
850+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-small-model.tar.gz',
851+
'md5':
852+
'db53c4bf39a9ad46ef77e6f9a37200b6',
853+
'cfg_path':
854+
'whisper.yaml',
855+
'ckpt_path':
856+
'whisper-small-model',
857+
'model':
858+
'whisper-small-model.pdparams',
859+
'params':
860+
'whisper-small-model.pdparams',
861+
'resource_data':
862+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
863+
'resource_data_md5':
864+
'dd61d092d362f1fdbae6ede08282e177',
865+
},
740866
},
741867
"whisper-tiny-en-16k": {
742868
'1.3': {
@@ -757,6 +883,24 @@
757883
'resource_data_md5':
758884
'37a0a8abdb3641a51194f79567a93b61',
759885
},
886+
'1.5': {
887+
'url':
888+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-tiny-en-model.tar.gz',
889+
'md5':
890+
'f91f8447d8b37ed13f4327ef6565b094',
891+
'cfg_path':
892+
'whisper.yaml',
893+
'ckpt_path':
894+
'whisper-tiny-en-model',
895+
'model':
896+
'whisper-tiny-en-model.pdparams',
897+
'params':
898+
'whisper-tiny-en-model.pdparams',
899+
'resource_data':
900+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
901+
'resource_data_md5':
902+
'dd61d092d362f1fdbae6ede08282e177',
903+
},
760904
},
761905
"whisper-tiny-16k": {
762906
'1.3': {
@@ -777,6 +921,44 @@
777921
'resource_data_md5':
778922
'37a0a8abdb3641a51194f79567a93b61',
779923
},
924+
'1.5': {
925+
'url':
926+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-tiny-model.tar.gz',
927+
'md5':
928+
'6f2209ac656ff12de085c824363316e2',
929+
'cfg_path':
930+
'whisper.yaml',
931+
'ckpt_path':
932+
'whisper-tiny-model',
933+
'model':
934+
'whisper-tiny-model.pdparams',
935+
'params':
936+
'whisper-tiny-model.pdparams',
937+
'resource_data':
938+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
939+
'resource_data_md5':
940+
'dd61d092d362f1fdbae6ede08282e177',
941+
},
942+
},
943+
"whisper-turbo-16k": {
944+
'1.5': {
945+
'url':
946+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/whisper-turbo-model.tar.gz',
947+
'md5':
948+
'fe2dd1a1d6eb8e6d017aafc7d5f62336',
949+
'cfg_path':
950+
'whisper.yaml',
951+
'ckpt_path':
952+
'whisper-turbo-model',
953+
'model':
954+
'whisper-turbo-model.pdparams',
955+
'params':
956+
'whisper-turbo-model.pdparams',
957+
'resource_data':
958+
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20250825/assets.tar',
959+
'resource_data_md5':
960+
'dd61d092d362f1fdbae6ede08282e177',
961+
},
780962
},
781963
}
782964

paddlespeech/s2t/exps/whisper/test_wav.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ def __init__(self, config, args):
4545
model_dict = paddle.load(self.config.model_file)
4646
config.pop("model_file")
4747
dims = ModelDimensions(**model_dict["dims"])
48+
self.dims = dims
4849
self.model = Whisper(dims)
4950
self.model.load_dict(model_dict)
5051

@@ -64,8 +65,10 @@ def run(self):
6465

6566
#load audio
6667
mel = log_mel_spectrogram(
67-
args.audio_file, resource_path=config.resource_path)
68-
68+
args.audio_file,
69+
resource_path=config.resource_path,
70+
n_mels=self.dims.n_mels,
71+
padding=480000)
6972
result = transcribe(
7073
self.model, mel, temperature=temperature, **config)
7174
if args.result_file is not None:

0 commit comments

Comments
 (0)