Skip to content

Commit 9ae46c8

Browse files
committed
Replace the jieba package with rjieba — a Python binding for jieba-rs
1 parent: 3eecd94 · commit: 9ae46c8

File tree

6 files changed

+9
-13
lines changed

6 files changed

+9
-13
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "f5-tts"
7-
version = "1.1.9"
7+
version = "1.1.10"
88
description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
99
readme = "README.md"
1010
license = {text = "MIT License"}
@@ -22,13 +22,13 @@ dependencies = [
2222
"ema_pytorch>=0.5.2",
2323
"gradio>=5.0.0",
2424
"hydra-core>=1.3.0",
25-
"jieba",
2625
"librosa",
2726
"matplotlib",
2827
"numpy<=1.26.4; python_version<='3.10'",
2928
"pydantic<=2.10.6",
3029
"pydub",
3130
"pypinyin",
31+
"rjieba",
3232
"safetensors",
3333
"soundfile",
3434
"tomli",

src/f5_tts/model/utils.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from collections import defaultdict
88
from importlib.resources import files
99

10-
import jieba
10+
import rjieba
1111
import torch
1212
from pypinyin import Style, lazy_pinyin
1313
from torch.nn.utils.rnn import pad_sequence
@@ -146,10 +146,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
146146

147147

148148
def convert_char_to_pinyin(text_list, polyphone=True):
149-
if jieba.dt.initialized is False:
150-
jieba.default_logger.setLevel(50) # CRITICAL
151-
jieba.initialize()
152-
153149
final_text_list = []
154150
custom_trans = str.maketrans(
155151
{";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
@@ -163,7 +159,7 @@ def is_chinese(c):
163159
for text in text_list:
164160
char_list = []
165161
text = text.translate(custom_trans)
166-
for seg in jieba.cut(text):
162+
for seg in rjieba.cut(text):
167163
seg_byte_len = len(bytes(seg, "UTF-8"))
168164
if seg_byte_len == len(seg): # if pure alphabets and symbols
169165
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
FROM nvcr.io/nvidia/tritonserver:24.12-py3
2-
RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
2+
RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 rjieba pypinyin librosa vocos
33
WORKDIR /workspace

src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import json
2727
import os
2828

29-
import jieba
29+
import rjieba
3030
import torch
3131
import torchaudio
3232
import triton_python_backend_utils as pb_utils
@@ -66,7 +66,7 @@ def is_chinese(c):
6666
for text in reference_target_texts_list:
6767
char_list = []
6868
text = text.translate(custom_trans)
69-
for seg in jieba.cut(text):
69+
for seg in rjieba.cut(text):
7070
seg_byte_len = len(bytes(seg, "UTF-8"))
7171
if seg_byte_len == len(seg): # if pure alphabets and symbols
7272
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":

src/f5_tts/train/datasets/prepare_emilia.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,5 +225,5 @@ def main():
225225
# bad zh asr cnt 230435 (samples)
226226
# bad eh asr cnt 37217 (samples)
227227

228-
# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
228+
# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
229229
# please be careful if using pretrained model, make sure the vocab.txt is same

src/f5_tts/train/datasets/prepare_wenetspeech4tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,5 +122,5 @@ def main():
122122
# - - 1459 (polyphone)
123123
# char vocab size 5264 5219 5042
124124

125-
# vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
125+
# vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
126126
# please be careful if using pretrained model, make sure the vocab.txt is same

0 commit comments

Comments (0)