Merge pull request #1077 from icecoins/master

w-okada · web-flow · commit 927bba646753 · 2024-01-18T06:32:12.000+09:00
Implement FCPE pitch extraction in RVC
diff --git a/client/demo/dist/assets/gui_settings/GUI.json b/client/demo/dist/assets/gui_settings/GUI.json
@@ -21,7 +21,7 @@
             {
                 "name": "configArea",
                 "options": {
-                    "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"],
+                    "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe" ],
                     "inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384]
                 }
             }
diff --git a/client/demo/public/assets/gui_settings/GUI.json b/client/demo/public/assets/gui_settings/GUI.json
@@ -21,7 +21,7 @@
             {
                 "name": "configArea",
                 "options": {
-                    "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"],
+                    "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe"],
                     "inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384]
                 }
             }
diff --git a/client/lib/src/const.ts b/client/lib/src/const.ts
@@ -56,6 +56,7 @@ export const F0Detector = {
     crepe_tiny: "crepe_tiny",
     rmvpe: "rmvpe",
     rmvpe_onnx: "rmvpe_onnx",
+    fcpe: "fcpe",
 } as const;
 export type F0Detector = (typeof F0Detector)[keyof typeof F0Detector];
 
diff --git a/server/const.py b/server/const.py
@@ -82,6 +82,7 @@ class EnumInferenceTypes(Enum):
     "crepe_tiny",
     "rmvpe",
     "rmvpe_onnx",
+    "fcpe",
 ]
 
 ServerAudioDeviceType: TypeAlias = Literal["audioinput", "audiooutput"]
diff --git a/server/requirements.txt b/server/requirements.txt
@@ -27,3 +27,4 @@ websockets==11.0.2
 sounddevice==0.4.6
 dataclasses_json==0.5.7
 onnxsim==0.4.28
+torchfcpe
diff --git a/server/voice_changer/RVC/pitchExtractor/FcpePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/FcpePitchExtractor.py
@@ -0,0 +1,44 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import torchfcpe
+
+class FcpePitchExtractor(PitchExtractor):
+
+    def __init__(self, gpu: int):
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "fcpe"
+        self.device = DeviceManager.get_instance().getDevice(gpu)
+        self.fcpe = torchfcpe.spawn_bundled_infer_model(self.device)
+
+    # I merge the code of Voice-Changer-CrepePitchExtractor and RVC-fcpe-infer, sry I don't know how to optimize the function.
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0 = self.fcpe.infer(
+            audio.to(self.device).unsqueeze(0).float(),
+            sr=16000,
+            decoder_mode="local_argmax",
+            threshold=0.006,
+        )
+        f0 = f0.squeeze()
+
+        f0 *= pow(2, f0_up_key / 12)
+        pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+        return pitch_coarse, pitchf
diff --git a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
@@ -43,6 +43,9 @@ def loadPitchExtractor(
             return RMVPEPitchExtractor(cls.params.rmvpe, gpu)
         elif pitchExtractorType == "rmvpe_onnx":
             return RMVPEOnnxPitchExtractor(cls.params.rmvpe_onnx, gpu)
+        elif pitchExtractorType == "fcpe":
+            # add the FcpePitchExtractor
+            return FcpePitchExtractor(gpu)
         else:
             # return hubert as default
             print("[Voice Changer] PitchExctractor not found", pitchExtractorType)

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`{`
`22`	`22`	`"name": "configArea",`
`23`	`23`	`"options": {`
`24`		`- "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"],`
	`24`	`+ "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe" ],`
`25`	`25`	`"inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384]`
`26`	`26`	`}`
`27`	`27`	`}`
Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ class EnumInferenceTypes(Enum):`
`82`	`82`	`"crepe_tiny",`
`83`	`83`	`"rmvpe",`
`84`	`84`	`"rmvpe_onnx",`
	`85`	`+ "fcpe",`
`85`	`86`	`]`
`86`	`87`
`87`	`88`	`ServerAudioDeviceType: TypeAlias = Literal["audioinput", "audiooutput"]`