Feat: npu supported for default model (#4084)

yzztin · web-flow · commit 78a952a66e11 · 2025-06-10T15:59:28.000+08:00
* Feat: npu supported for default model

* Feat: npu supported for 'vector'

* typo
diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,7 @@
 build
 *output/
 .history
+.idea
 
 audio/dist/
 audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
 speechx/fc_patch/
 
 third_party/ctc_decoders/paddlespeech_ctcdecoders.py
+
+kernel_meta/
diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py
@@ -167,10 +167,15 @@ def _get_window(waveform: Tensor,
                                             energy_floor)  # (m)
 
     if preemphasis_coefficient != 0.0:
+        # NPU only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            mode = 'constant'
+        else:
+            mode = 'replicate'
+
         offset_strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (1, 0),
-            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
+            strided_input.unsqueeze(0), (1, 0), data_format='NCL',
+            mode=mode).squeeze(0)  # (m, window_size + 1)
         strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                        -1]
 
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
@@ -144,6 +144,12 @@ def preprocess(self, audio_file: Union[str, os.PathLike]):
         if isinstance(audio_file, (str, os.PathLike)):
             logger.debug("Preprocessing audio_file:" + audio_file)
 
+        # Set 'pad_mode' to 'constant' when the device is an NPU; otherwise keep the default 'pad_mode' value
+        if paddle.get_device().startswith('npu'):
+            pad_mode_kwarg = {"pad_mode": "constant"}
+        else:
+            pad_mode_kwarg = {}
+
         # Feature extraction
         feature_extractor = LogMelSpectrogram(
             sr=feat_conf['sample_rate'],
@@ -153,7 +159,8 @@ def preprocess(self, audio_file: Union[str, os.PathLike]):
             win_length=feat_conf['window_length'],
             f_min=feat_conf['f_min'],
             f_max=feat_conf['f_max'],
-            n_mels=feat_conf['n_mels'], )
+            n_mels=feat_conf['n_mels'],
+            **pad_mode_kwarg, )
         feats = feature_extractor(
             paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
         self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
@@ -451,12 +451,27 @@ def get_voc_inference(
     voc_name = voc[:voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
+
+    # NPU only supports mode='constant' right now.
+    # This code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'.
+    npu_pad_mode = {
+        "mode": "constant"
+    } if paddle.get_device().startswith('npu') else {}
+
     if voc_name != 'wavernn':
+        if npu_pad_mode:
+            voc_config["generator_params"].setdefault("pad_params", {})
+            voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
+
         voc = voc_class(**voc_config["generator_params"])
         voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
         voc.remove_weight_norm()
         voc.eval()
     else:
+        if npu_pad_mode:
+            voc_config["model"].setdefault("pad_params", {})
+            voc_config["model"]["pad_params"].update(npu_pad_mode)
+
         voc = voc_class(**voc_config["model"])
         voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
         voc.eval()
diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -66,7 +66,12 @@ def __init__(
         self.stride = stride
         self.dilation = dilation
         self.padding = padding
-        self.padding_mode = padding_mode
+
+        # padding_mode is forced to 'constant' on NPU devices because the NPU only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            self.padding_mode = 'constant'
+        else:
+            self.padding_mode = padding_mode
 
         self.conv = nn.Conv1D(
             in_channels,
@@ -335,10 +340,16 @@ def _compute_statistics(x, m, axis=2, eps=self.eps):
         # Apply layers
         attn = self.conv(self.tanh(self.tdnn(attn)))
 
+        if paddle.get_device().startswith('npu'):
+            # The following way is designed to fix the 'Broadcast dimension mismatch' error
+            # that occurs when using the npu device and setting padding_mode to 'constant'.
+            inf_tensor = paddle.full_like(attn, float("-inf"))
+        else:
+            # the default way
+            inf_tensor = paddle.ones_like(attn) * float("-inf")
+
         # Filter out zero-paddings
-        attn = paddle.where(
-            mask.tile((1, C, 1)) == 0,
-            paddle.ones_like(attn) * float("-inf"), attn)
+        attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
 
         attn = F.softmax(attn, axis=2)
         mean, std = _compute_statistics(x, attn)