Skip to content
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ __pycache__/
*.py[cod]
*$py.class

# data
terminology/

# C extensions
*.so

Expand Down
18 changes: 17 additions & 1 deletion book_maker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from book_maker.translator import MODEL_DICT
from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -89,6 +88,21 @@ def main():
default="p",
help="example --translate-tags p,blockquote",
)
parser.add_argument(
"--terminology",
dest="terminology_filename",
type=str,
default="",
help="terminology file name",
)

parser.add_argument(
"--professional_field",
dest="professional_field",
type=str,
default="",
help="professional field",
)

options = parser.parse_args()
PROXY = options.proxy
Expand Down Expand Up @@ -129,6 +143,8 @@ def main():
is_test=options.test,
test_num=options.test_num,
translate_tags=options.translate_tags,
terminology_filename=options.terminology_filename,
Professional_field=options.professional_field,
)
e.make_bilingual_book()

Expand Down
7 changes: 5 additions & 2 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from .base_loader import BaseBookLoader


class EPUBBookLoader(BaseBookLoader):
def __init__(
self,
Expand All @@ -24,10 +23,14 @@ def __init__(
is_test=False,
test_num=5,
translate_tags="p",
terminology_filename="terminology.txt",
Professional_field="",
):
self.epub_name = epub_name
self.new_epub = epub.EpubBook()
self.translate_model = model(key, language, model_api_base)

self.terminology_filename=terminology_filename
self.translate_model = model(key, language, terminology_filename, model_api_base)
self.is_test = is_test
self.test_num = test_num
self.translate_tags = translate_tags
Expand Down
4 changes: 3 additions & 1 deletion book_maker/translator/base_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@


class Base(ABC):
def __init__(self, key, language):
def __init__(self, key, language,terminology_filename,Professional_field):
self.keys = itertools.cycle(key.split(","))
self.language = language
self.terminology_filename=terminology_filename
self.Professional_field=Professional_field

@abstractmethod
def rotate_key(self):
Expand Down
19 changes: 15 additions & 4 deletions book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,38 @@

from .base_translator import Base

from .terminology_translator import build_terminology, terminology_prompt

class ChatGPTAPI(Base):
def __init__(self, key, language, api_base=None):
super().__init__(key, language)
def __init__(self, key, language, terminology_filename, Professional_field="medical", api_base=None):
super().__init__(key, language, terminology_filename,Professional_field)
self.key_len = len(key.split(","))
if api_base:
openai.api_base = api_base
self.terminology=build_terminology(self.terminology_filename)

def rotate_key(self):
openai.api_key = next(self.keys)

def translate(self, text):
print(text)
self.rotate_key()
# Professional_field="medical"
Professional_prompt=""
if self.Professional_field !="":
Professional_prompt= f"It is {self.Professional_field} contents, and when translating, attention should be paid to using {self.Professional_field} professional terms and expressions. "
positive_prompt="Keep the meaning same, but make them more literary and easier to understand. "
nagative_prompt="Please do not translate numbers and abbreviations, such as '123' '4.00' 'ACD' or 'IOL', If the text is too short, or consists only of numbers or abbreviations that are difficult to translate, then returning the original text is sufficient and there is no need to translate it."

try:
term_prompt=terminology_prompt(text, self.terminology)
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
# english prompt here to save tokens
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
"content": f"I want you to act as a translator, Please help me to translate to {self.language}, {positive_prompt} {nagative_prompt} {term_prompt} {Professional_prompt} Please return only translated content not include the origin text. The content that needs to be translated is \n\n `{text}` ",
}
],
)
Expand All @@ -42,12 +52,13 @@ def translate(self, text):
time.sleep(sleep_time)
print(e, f"will sleep {sleep_time} seconds")
self.rotate_key()
term_prompt=terminology_prompt(text, self.terminology)
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
"content": f"I want you to act as a translator, Please help me to translate to {self.language}, {positive_prompt} {nagative_prompt} {term_prompt} {Professional_prompt} Please return only translated content not include the origin text. The content that needs to be translated is \n\n `{text}` ",
}
],
)
Expand Down
72 changes: 72 additions & 0 deletions book_maker/translator/terminology_translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
from openai.embeddings_utils import get_embedding
import os

def read_terminology(terminology_filename):
    """Load a terminology list from a UTF-8 text file, one term per line.

    Surrounding whitespace (including the trailing newline) is stripped from
    every line and blank lines are ignored, so the resulting terms can be
    joined into a prompt or embedded without stray newlines or empty entries.

    Args:
        terminology_filename: Path of the terminology file.

    Returns:
        A ``pandas.DataFrame`` with a single ``"term"`` column, or ``None``
        when the path is not a regular file or the file contains no terms.
    """
    # isfile() rather than exists(): a directory (or the empty default path)
    # means "no terminology" instead of an exception from open().
    if not os.path.isfile(terminology_filename):
        return None
    with open(terminology_filename, "r", encoding="utf-8") as f:
        stripped_lines = [line.strip() for line in f]
    terms = [term for term in stripped_lines if term]
    if not terms:
        return None
    return pd.DataFrame(terms, columns=["term"])

def read_reference_abstract(ref_filename):
    """Extract unique sentences from the titles and abstracts of a reference export.

    Per the original author's note, the input is produced from CNKI: search
    by keyword, sort by citation count, select all, export the references in
    the CNKI research-note format, copy to the clipboard and paste into a
    text file.

    Only lines starting with ``Title-题名`` or ``Summary-摘要`` are used. The
    field prefixes, the section labels (目的/方法/结果/结论), colons, spaces
    and ideographic spaces are removed, then each line is split into
    sentences on the full stop ``。``.

    Args:
        ref_filename: Path of the UTF-8 text file holding the export.

    Returns:
        A ``pandas.DataFrame`` with a single ``"term"`` column containing
        each distinct non-empty sentence once, in order of first appearance.
    """
    # Removed one after another, in this order, from every kept line.
    noise_tokens = (
        "Title-题名: ",
        "Summary-摘要: ",
        "\n",
        "目的",
        "方法",
        "结果",
        "结论",
        ":",
        " ",
        "\u3000",
    )
    sentences = []
    with open(ref_filename, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith(("Title-题名", "Summary-摘要")):
                continue
            for token in noise_tokens:
                line = line.replace(token, "")
            sentences.extend(line.split("。"))
    # dict.fromkeys de-duplicates while keeping first-seen order; a set()
    # would make the row order change from run to run (hash randomisation).
    unique_sentences = [s for s in dict.fromkeys(sentences) if s != ""]
    return pd.DataFrame(unique_sentences, columns=["term"])

def read_reference(ref_filename):
    """Split every line of a reference file into sentences.

    The ``Title-题名: `` and ``Summary-摘要: `` prefixes and the line break
    are removed from each line, then the line is split on the full stop
    ``。``. The per-line pieces are flattened into one list; without the
    flattening, building a one-column frame fails as soon as any line holds
    more than one sentence. Empty fragments (blank lines, the piece after a
    trailing ``。``) are dropped.

    Args:
        ref_filename: Path of the UTF-8 text file to read.

    Returns:
        A ``pandas.DataFrame`` with a single ``"term"`` column, one sentence
        per row, in file order.
    """
    sentences = []
    with open(ref_filename, "r", encoding="utf-8") as f:
        for line in f:
            cleaned = (
                line.replace("Title-题名: ", "")
                .replace("Summary-摘要: ", "")
                .replace("\n", "")
            )
            sentences.extend(part for part in cleaned.split("。") if part)
    return pd.DataFrame(sentences, columns=["term"])

def get_embedding_from_terminology(
    terminology_filename, embedding_model="text-embedding-ada-002"
):
    """Read the terminology file and attach an embedding to every term.

    Args:
        terminology_filename: Path handed to ``read_terminology``.
        embedding_model: Model/engine name passed to ``get_embedding``.

    Returns:
        The frame from ``read_terminology`` with an added ``"embedding"``
        column (one ``get_embedding`` result per term), or ``None`` when
        ``read_terminology`` returned ``None``.
    """

    def embed(term):
        # One get_embedding call per term.
        return get_embedding(term, embedding_model)

    terms = read_terminology(terminology_filename)
    if terms is not None:
        terms["embedding"] = terms["term"].apply(embed)
    return terms


def build_terminology(
    terminology_filename,
    embedding_model="text-embedding-ada-002",
):
    """Build the embedded terminology table for a terminology file.

    Args:
        terminology_filename: Path of the terminology file.
        embedding_model: Model/engine name used for the embeddings.

    Returns:
        Whatever ``get_embedding_from_terminology`` returns for these
        arguments.
    """
    # Original author's note: just rebuild every time, whether or not one
    # already exists; it is presumably cheap.
    return get_embedding_from_terminology(terminology_filename, embedding_model)



def terminology_prompt(text, terminology,
                       term_candidate_n=5,
                       embedding_model="text-embedding-ada-002"):
    """Build a prompt fragment listing the terms most similar to ``text``.

    The text is embedded with ``get_embedding``, compared against every
    stored term embedding with ``cosine_similarity``, and the
    ``term_candidate_n`` best-scoring terms are listed in the fragment.

    Args:
        text: The passage that is about to be translated.
        terminology: DataFrame with ``"term"`` and ``"embedding"`` columns
            (as produced by ``build_terminology``), or ``None``.
        term_candidate_n: How many top-ranked terms to include.
        embedding_model: Model/engine name passed to ``get_embedding``.

    Returns:
        The prompt fragment, or ``""`` when ``terminology`` is ``None`` or
        has no rows (in which case no embedding is requested at all).
    """
    if terminology is None or terminology.empty:
        return ""

    text_embedding = get_embedding(text, engine=embedding_model)
    similarity = terminology["embedding"].apply(
        lambda term_embedding: cosine_similarity(term_embedding, text_embedding)
    )
    # Rank on a temporary frame: the caller's terminology table is shared
    # across every translate() call and must not gain a scratch column.
    ranked = terminology[["term"]].assign(similarity=similarity).sort_values(
        "similarity", ascending=False, ignore_index=True
    )
    best_terms = ranked["term"].head(term_candidate_n)
    terminology_list = ", ".join(best_terms.to_list())
    return f"and use the following terminology list if necessary: [{terminology_list}], but please do NOT show the terminology directly in the results. "