Skip to content

Commit 78a952a

Browse files
authored
Feat: npu supported for default model (#4084)
* Feat: npu supported for default model * Feat: npu supported for 'vector' * typo
1 parent 563217a commit 78a952a

File tree

5 files changed

+49
-8
lines changed

5 files changed

+49
-8
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
build
1717
*output/
1818
.history
19+
.idea
1920

2021
audio/dist/
2122
audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
5152
speechx/fc_patch/
5253

5354
third_party/ctc_decoders/paddlespeech_ctcdecoders.py
55+
56+
kernel_meta/

paddlespeech/audio/compliance/kaldi.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,15 @@ def _get_window(waveform: Tensor,
167167
energy_floor) # (m)
168168

169169
if preemphasis_coefficient != 0.0:
170+
# npu only supports mode='constant' right now
171+
if paddle.get_device().startswith('npu'):
172+
mode = 'constant'
173+
else:
174+
mode = 'replicate'
175+
170176
offset_strided_input = paddle.nn.functional.pad(
171-
strided_input.unsqueeze(0), (1, 0),
172-
data_format='NCL',
173-
mode='replicate').squeeze(0) # (m, window_size + 1)
177+
strided_input.unsqueeze(0), (1, 0), data_format='NCL',
178+
mode=mode).squeeze(0) # (m, window_size + 1)
174179
strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
175180
-1]
176181

paddlespeech/cli/cls/infer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ def preprocess(self, audio_file: Union[str, os.PathLike]):
144144
if isinstance(audio_file, (str, os.PathLike)):
145145
logger.debug("Preprocessing audio_file:" + audio_file)
146146

147+
# set 'pad_mode' to 'constant' when the device is npu; otherwise keep the default 'pad_mode' value
148+
if paddle.get_device().startswith('npu'):
149+
pad_mode_kwarg = {"pad_mode": "constant"}
150+
else:
151+
pad_mode_kwarg = {}
152+
147153
# Feature extraction
148154
feature_extractor = LogMelSpectrogram(
149155
sr=feat_conf['sample_rate'],
@@ -153,7 +159,8 @@ def preprocess(self, audio_file: Union[str, os.PathLike]):
153159
win_length=feat_conf['window_length'],
154160
f_min=feat_conf['f_min'],
155161
f_max=feat_conf['f_max'],
156-
n_mels=feat_conf['n_mels'], )
162+
n_mels=feat_conf['n_mels'],
163+
**pad_mode_kwarg, )
157164
feats = feature_extractor(
158165
paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
159166
self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(

paddlespeech/t2s/exps/syn_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,12 +451,27 @@ def get_voc_inference(
451451
voc_name = voc[:voc.rindex('_')]
452452
voc_class = dynamic_import(voc_name, model_alias)
453453
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
454+
455+
# npu only supports mode='constant' right now
456+
# this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'
457+
npu_pad_mode = {
458+
"mode": "constant"
459+
} if paddle.get_device().startswith('npu') else {}
460+
454461
if voc_name != 'wavernn':
462+
if npu_pad_mode:
463+
voc_config["generator_params"].setdefault("pad_params", {})
464+
voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
465+
455466
voc = voc_class(**voc_config["generator_params"])
456467
voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
457468
voc.remove_weight_norm()
458469
voc.eval()
459470
else:
471+
if npu_pad_mode:
472+
voc_config["model"].setdefault("pad_params", {})
473+
voc_config["model"]["pad_params"].update(npu_pad_mode)
474+
460475
voc = voc_class(**voc_config["model"])
461476
voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
462477
voc.eval()

paddlespeech/vector/models/ecapa_tdnn.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,12 @@ def __init__(
6666
self.stride = stride
6767
self.dilation = dilation
6868
self.padding = padding
69-
self.padding_mode = padding_mode
69+
70+
# padding_mode is forced to 'constant' on the npu device because npu only supports mode='constant' right now
71+
if paddle.get_device().startswith('npu'):
72+
self.padding_mode = 'constant'
73+
else:
74+
self.padding_mode = padding_mode
7075

7176
self.conv = nn.Conv1D(
7277
in_channels,
@@ -335,10 +340,16 @@ def _compute_statistics(x, m, axis=2, eps=self.eps):
335340
# Apply layers
336341
attn = self.conv(self.tanh(self.tdnn(attn)))
337342

343+
if paddle.get_device().startswith('npu'):
344+
# This approach works around the 'Broadcast dimension mismatch' error
345+
# that occurs when using the npu device and setting padding_mode to 'constant'.
346+
inf_tensor = paddle.full_like(attn, float("-inf"))
347+
else:
348+
# the default way
349+
inf_tensor = paddle.ones_like(attn) * float("-inf")
350+
338351
# Filter out zero-paddings
339-
attn = paddle.where(
340-
mask.tile((1, C, 1)) == 0,
341-
paddle.ones_like(attn) * float("-inf"), attn)
352+
attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
342353

343354
attn = F.softmax(attn, axis=2)
344355
mean, std = _compute_statistics(x, attn)

0 commit comments

Comments
 (0)