diff --git a/README.md b/README.md index dec512d2..00e728cf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners) ## 👉🏻 CosyVoice 👈🏻 -**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B) +**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B); [Replicate Demo and API](https://replicate.com/chenxwh/cosyvoice2-0.5b) **CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M) diff --git a/cog.yaml b/cog.yaml new file mode 100644 index 00000000..b6a103f8 --- /dev/null +++ b/cog.yaml @@ -0,0 +1,55 @@ +# Configuration for Cog ⚙️ +# Reference: https://cog.run/yaml + +build: + # set to true if your model requires a GPU + gpu: true + + # a list of ubuntu apt packages to install + system_packages: + - "libgl1-mesa-glx" + - "libglib2.0-0" + - "libsox-dev" + - "sox" + + # python version in the form '3.11' or '3.11.4' + python_version: "3.11" + + # a list of packages in the format <package-name>==<version> + python_packages: + - ipython + - pynini + - torch==2.3.1 + - torchaudio==2.3.1 + - transformers==4.40.1 + - conformer==0.3.2 + - diffusers==0.27.2 + - gdown==5.1.0 + - grpcio==1.57.0 + - grpcio-tools==1.57.0 + - huggingface-hub==0.23.5 + - hydra-core==1.3.2 + - HyperPyYAML==1.2.2 + - inflect==7.3.1 + - librosa==0.10.2 + - lightning==2.2.4 + - 
# ---------------------------------------------------------------------------
# The source lines this block replaces also carried the tail of the cog.yaml
# hunk and the predict.py diff header. They are preserved verbatim below (as
# comments) so that no content is lost:
#
#   matplotlib==3.7.5
#   + - modelscope==1.15.0
#   + - networkx==3.1
#   + - omegaconf==2.3.0
#   + - onnx==1.16.0
#   + - onnxruntime-gpu
#   + - openai-whisper==20231117
#   + - protobuf==4.25
#   + - rich==13.7.1
#   + - soundfile==0.12.1
#   + - uvicorn==0.30.0
#   + - wget==3.2
#   + - fastapi==0.111.0
#   + - fastapi-cli==0.0.4
#   + - WeTextProcessing==1.0.3
#   +
#   + run:
#   + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
#   +
#   +predict: "predict.py:Predictor"
#
#   diff --git a/predict.py b/predict.py
#   new file mode 100644
#   index 00000000..313b6ebf
#   --- /dev/null
#   +++ b/predict.py
#   @@ -0,0 +1,88 @@
# ---------------------------------------------------------------------------

# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import sys
import subprocess
import time
from cog import BasePredictor, Input, Path
import torch
import torchaudio

# Matcha-TTS is vendored under third_party/; it has to be on sys.path before
# the cosyvoice imports below are executed.
sys.path.insert(0, os.path.abspath("third_party/Matcha-TTS"))

from cosyvoice.cli.cosyvoice import CosyVoice2  # noqa: E402
from cosyvoice.utils.file_utils import load_wav  # noqa: E402


MODEL_CACHE = "pretrained_models"
MODEL_URL = (
    "https://weights.replicate.delivery/default/FunAudioLLM/CosyVoice/model_cache.tar"
)
# Directory of the CosyVoice2 checkpoint inside the weight cache.
MODEL_DIR = os.path.join(MODEL_CACHE, "CosyVoice2-0.5B")

# Task labels exposed to the user; shared by the `choices` list and the
# dispatch in `predict` so the two cannot drift apart.
TASK_ZERO_SHOT = "zero-shot voice clone"
TASK_CROSS_LINGUAL = "cross-lingual voice clone"
TASK_INSTRUCT = "Instructed Voice Generation"


def download_weights(url, dest):
    """Download and extract a weights archive with ``pget``.

    Args:
        url: Location of the archive to fetch.
        dest: Destination path handed to ``pget -x``.

    Raises:
        subprocess.CalledProcessError: If ``pget`` exits with a non-zero status.
    """
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    """Cog predictor that wraps a CosyVoice2 model."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        # Fetch the weights only when the cache directory is not there yet.
        if not os.path.exists(MODEL_CACHE):
            print("downloading")
            download_weights(MODEL_URL, MODEL_CACHE)

        self.cosyvoice = CosyVoice2(
            MODEL_DIR,
            load_jit=True,
            load_onnx=False,
            load_trt=False,
        )

    def predict(
        self,
        source_audio: Path = Input(description="Source audio"),
        source_transcript: str = Input(
            description="Transcript of the source audio, you can use models such as whisper to transcribe first"
        ),
        tts_text: str = Input(description="Text of the audio to generate"),
        task: str = Input(
            choices=[
                TASK_ZERO_SHOT,
                TASK_CROSS_LINGUAL,
                TASK_INSTRUCT,
            ],
            default=TASK_ZERO_SHOT,
        ),
        instruction: str = Input(
            description="Instruction for Instructed Voice Generation task", default=""
        ),
    ) -> Path:
        """Run a single prediction on the model.

        Args:
            source_audio: Prompt audio file; loaded via ``load_wav(..., 16000)``.
            source_transcript: Transcript of ``source_audio``; only passed to
                the model for the zero-shot task.
            tts_text: Text to synthesise.
            task: One of the three task labels offered in ``choices``.
            instruction: Instruction text; required (non-blank) for the
                instructed-generation task and ignored otherwise.

        Returns:
            Path of the generated WAV file.

        Raises:
            ValueError: If the instructed task is selected without an
                instruction.
            RuntimeError: If the model yields no audio at all.
        """
        # Explicit raise instead of `assert`: assertions are removed under
        # `python -O`, which would let an empty instruction through.
        if task == TASK_INSTRUCT and not (instruction or "").strip():
            raise ValueError("Please specify the instruction.")

        prompt_speech_16k = load_wav(str(source_audio), 16000)

        if task == TASK_ZERO_SHOT:
            output = self.cosyvoice.inference_zero_shot(
                tts_text, source_transcript, prompt_speech_16k, stream=False
            )
        elif task == TASK_CROSS_LINGUAL:
            output = self.cosyvoice.inference_cross_lingual(
                tts_text, prompt_speech_16k, stream=False
            )
        else:
            output = self.cosyvoice.inference_instruct2(
                tts_text, instruction, prompt_speech_16k, stream=False
            )

        # The inference call is consumed as an iterable. Keeping only its
        # first item would silently drop any further chunks, so gather all of
        # them and join them into a single waveform.
        chunks = [item["tts_speech"] for item in output]
        if not chunks:
            raise RuntimeError("The model produced no audio for the given text.")
        # NOTE(review): assumes every chunk is laid out (channels, samples),
        # i.e. time is the last axis as torchaudio.save expects by default —
        # confirm against CosyVoice2's output.
        speech = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=-1)

        out_path = "/tmp/out.wav"
        torchaudio.save(out_path, speech, self.cosyvoice.sample_rate)
        return Path(out_path)