Skip to content

Commit 9ae46c8

Browse files
committed
Replace the jieba package with rjieba — a Python binding for jieba-rs
1 parent: 3eecd94 · commit: 9ae46c8

File tree

6 files changed

+9
-13
lines changed

6 files changed

+9
-13
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "f5-tts"
7-
version = "1.1.9"
7+
version = "1.1.10"
88
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
99
readme = "README.md"
1010
license = {text = "MIT License"}
@@ -22,13 +22,13 @@ dependencies = [
2222
"ema_pytorch>=0.5.2",
2323
"gradio>=5.0.0",
2424
"hydra-core>=1.3.0",
25-
"jieba",
2625
"librosa",
2726
"matplotlib",
2827
"numpy<=1.26.4; python_version<='3.10'",
2928
"pydantic<=2.10.6",
3029
"pydub",
3130
"pypinyin",
31+
"rjieba",
3232
"safetensors",
3333
"soundfile",
3434
"tomli",

src/f5_tts/model/utils.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from collections import defaultdict
88
from importlib.resources import files
99

10-
import jieba
10+
import rjieba
1111
import torch
1212
from pypinyin import Style, lazy_pinyin
1313
from torch.nn.utils.rnn import pad_sequence
@@ -146,10 +146,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
146146

147147

148148
def convert_char_to_pinyin(text_list, polyphone=True):
149-
if jieba.dt.initialized is False:
150-
jieba.default_logger.setLevel(50) # CRITICAL
151-
jieba.initialize()
152-
153149
final_text_list = []
154150
custom_trans = str.maketrans(
155151
{";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
@@ -163,7 +159,7 @@ def is_chinese(c):
163159
for text in text_list:
164160
char_list = []
165161
text = text.translate(custom_trans)
166-
for seg in jieba.cut(text):
162+
for seg in rjieba.cut(text):
167163
seg_byte_len = len(bytes(seg, "UTF-8"))
168164
if seg_byte_len == len(seg): # if pure alphabets and symbols
169165
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
FROM nvcr.io/nvidia/tritonserver:24.12-py3
2-
RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
2+
RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 rjieba pypinyin librosa vocos
33
WORKDIR /workspace

src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import json
2727
import os
2828

29-
import jieba
29+
import rjieba
3030
import torch
3131
import torchaudio
3232
import triton_python_backend_utils as pb_utils
@@ -66,7 +66,7 @@ def is_chinese(c):
6666
for text in reference_target_texts_list:
6767
char_list = []
6868
text = text.translate(custom_trans)
69-
for seg in jieba.cut(text):
69+
for seg in rjieba.cut(text):
7070
seg_byte_len = len(bytes(seg, "UTF-8"))
7171
if seg_byte_len == len(seg): # if pure alphabets and symbols
7272
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":

src/f5_tts/train/datasets/prepare_emilia.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,5 +225,5 @@ def main():
225225
# bad zh asr cnt 230435 (samples)
226226
# bad eh asr cnt 37217 (samples)
227227

228-
# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
228+
# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
229229
# please be careful if using pretrained model, make sure the vocab.txt is same

src/f5_tts/train/datasets/prepare_wenetspeech4tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,5 +122,5 @@ def main():
122122
# - - 1459 (polyphone)
123123
# char vocab size 5264 5219 5042
124124

125-
# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
125+
# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
126126
# please be careful if using pretrained model, make sure the vocab.txt is same

0 commit comments

Comments (0)