-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopenai_translator.py
More file actions
288 lines (239 loc) · 12.7 KB
/
openai_translator.py
File metadata and controls
288 lines (239 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
import json
import requests
import time
import re
from typing import Optional, List
import tiktoken
class OpenAITranslator:
    """Translate text through an OpenAI-style ``/chat/completions`` endpoint.

    Inputs whose token count exceeds ``chunk_size`` are split at
    ``\\section`` headers, then paragraphs, then sentences, translated chunk
    by chunk and re-joined with blank lines. ``XMATHX_*`` / ``XMATHXBS``
    markers are swapped for neutral tokens before the request and restored
    in the response.
    """

    # Splits on \section{...} headers; the capture group keeps each header
    # (with its trailing whitespace) as its own element of the split result.
    _SECTION_RE = re.compile(r'(\\section\{[^}]+\}\s*)')
    # Splits after sentence-ending punctuation followed by whitespace.
    _SENTENCE_RE = re.compile(r'(?<=[.!?])\s+')
    # Bare math placeholders.
    _PLACEHOLDER_RE = re.compile(r'XMATHX_\d+(?:_\d+)*|XMATHXBS')
    # An XMATHXBS marker together with the command words / brace groups that
    # follow it, e.g. "XMATHXBS bibliographystyle {IEEEtran}".
    _EXTENDED_PLACEHOLDER_RE = re.compile(
        r'XMATHXBS\s+\w+(?:\s*\{[^}]*\})*(?:\s+\w+(?:\s*\{[^}]*\})*)*'
    )

    _SYSTEM_PROMPT = (
        "You are a precise academic translator specializing in mathematical "
        "and technical documents. Your primary directive is absolute "
        "faithfulness to the source text. NEVER add, remove, or modify "
        "information that is not explicitly part of the translation process. "
        "Preserve all formatting, mathematical notation, and technical terms "
        "exactly as written. Your role is translation ONLY, not explanation "
        "or elaboration."
    )

    _PROMPT_RULES = (
        "1. Translate EXACTLY what is provided - DO NOT add any content that is not in the original text",
        "2. DO NOT add explanations, examples, or clarifications that are not in the source",
        "3. DO NOT omit any information from the original text",
        "4. Preserve all LaTeX commands, mathematical formulas, and technical notation exactly as they appear",
        "5. Maintain the original structure, formatting, and paragraph breaks",
        "6. Translate ONLY the text content, keeping all non-text elements unchanged",
        "7. If you encounter ambiguous terms, choose the most literal translation rather than adding explanatory context",
        '8. SPECIAL INSTRUCTION: Any text that looks like "__MATH_PLACEHOLDER_N__" must be preserved exactly as-is. These are protected mathematical formula markers.',
    )

    def __init__(self, api_key: str, base_url: str = "https://api.openai.com/v1",
                 model: str = "gpt-3.5-turbo", max_tokens: int = 2000,
                 temperature: float = 0.3, chunk_size: int = 3000):
        """Store the request settings and open an authenticated HTTP session.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            base_url: API root; ``/chat/completions`` is appended to it.
            model: Model name sent with every request.
            max_tokens: ``max_tokens`` value sent with every request.
            temperature: ``temperature`` value sent with every request.
            chunk_size: Token threshold above which input text is split.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("OpenAI API key must be provided")

        self.api_key = api_key
        self.base_url = base_url
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.chunk_size = chunk_size  # Maximum tokens per chunk

        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        })

        # Token counting prefers tiktoken; if the encoding cannot be loaded,
        # count_tokens() falls back to a character-based estimate.
        try:
            self.tokenizer = tiktoken.get_encoding("cl100k_base")
        except Exception:
            self.tokenizer = None
            print("Warning: tiktoken not available, using approximate token counting")

    def count_tokens(self, text: str) -> int:
        """Return the token count of ``text``.

        Uses the tokenizer when one was loaded, otherwise estimates one token
        per four characters (integer division).
        """
        if self.tokenizer:
            return len(self.tokenizer.encode(text))
        return len(text) // 4

    def _build_prompt(self, text: str, target_language: str, source_language: str) -> str:
        """Return the user prompt wrapping ``text`` in the translation rules.

        Shared by translate_chunk() and the token budgeting in
        split_text_into_chunks() so both always see the same template.
        """
        lines = [
            "You are a professional academic translator. Your task is to translate "
            f"the following text from {source_language} to {target_language}.",
            "CRITICAL REQUIREMENTS:",
            *self._PROMPT_RULES,
            "Text to translate:",
            text,
            "Translated text (strictly faithful to original):",
        ]
        return "\n".join(lines)

    def split_text_into_chunks(self, text: str, target_language: str, source_language: str = "en") -> List[str]:
        """Split ``text`` into chunks that fit the per-request token budget.

        The budget is ``chunk_size`` minus the prompt template size minus a
        100-token safety margin (or 1000 if that would not be positive).
        Text is split at ``\\section`` headers when any are present, otherwise
        by paragraphs.

        Args:
            text: Text to split.
            target_language: Target language, used only to size the prompt.
            source_language: Source language, used only to size the prompt.

        Returns:
            The chunks in document order; ``[text]`` when it already fits.
        """
        template_tokens = self.count_tokens(
            self._build_prompt("", target_language, source_language)
        )
        available_tokens = self.chunk_size - template_tokens - 100  # 100 token safety margin
        if available_tokens <= 0:
            print("Warning: Chunk size too small for prompt template")
            available_tokens = 1000

        if self.count_tokens(text) <= available_tokens:
            return [text]

        parts = self._SECTION_RE.split(text)
        if len(parts) <= 1:
            # No section headers: fall back to paragraph splitting.
            return self.split_paragraphs(text, available_tokens)
        return self._pack_sections(parts, available_tokens)

    def _pack_sections(self, parts: List[str], available_tokens: int) -> List[str]:
        """Pack alternating content/header ``parts`` into budget-sized chunks.

        ``parts`` is the result of splitting on ``_SECTION_RE``: even indexes
        hold content, odd indexes hold section headers.
        """
        chunks: List[str] = []
        current = ""
        # True while `current` ends with a paragraph group returned by
        # split_paragraphs(). Those groups are stripped, so the blank line
        # that followed them in the source has to be re-inserted before
        # anything else is appended.
        needs_break = False

        for index, part in enumerate(parts):
            is_header = index % 2 == 1
            if is_header or self.count_tokens(current + part) <= available_tokens:
                pieces, stripped = [part], False
            else:
                # Content does not fit alongside what is already pending:
                # break it into paragraph groups and pack those instead.
                pieces, stripped = self.split_paragraphs(part, available_tokens), True

            for piece in pieces:
                candidate = current + ("\n\n" if needs_break else "") + piece
                if current and self.count_tokens(candidate) > available_tokens:
                    chunks.append(current.strip())
                    current = piece
                else:
                    current = candidate
                needs_break = stripped

        if current:
            chunks.append(current.strip())
        return chunks

    def split_paragraphs(self, text: str, max_tokens: int) -> List[str]:
        """Group blank-line-separated paragraphs into chunks of ``max_tokens``.

        A paragraph that is too large on its own is split into sentences. A
        single sentence larger than ``max_tokens`` is kept whole, so a chunk
        can still exceed the limit in that case.

        Args:
            text: Text to split.
            max_tokens: Token limit per chunk, as measured by count_tokens().

        Returns:
            Whitespace-stripped chunks in document order.
        """
        chunks: List[str] = []
        current = ""

        for paragraph in text.split('\n\n'):
            # Measure exactly what would be stored, separator included.
            candidate = f"{current}\n\n{paragraph}" if current else paragraph
            if self.count_tokens(candidate) <= max_tokens:
                current = candidate
                continue

            if current:
                chunks.append(current.strip())

            if self.count_tokens(paragraph) > max_tokens:
                groups = self._group_sentences(paragraph, max_tokens)
                chunks.extend(group.strip() for group in groups[:-1])
                # The last group stays open so following paragraphs can join it.
                current = groups[-1]
            else:
                current = paragraph

        if current:
            chunks.append(current.strip())
        return chunks

    def _group_sentences(self, paragraph: str, max_tokens: int) -> List[str]:
        """Split ``paragraph`` into sentence groups of at most ``max_tokens``.

        Always returns at least one element; the last one is the group that
        was still being filled when the paragraph ended.
        """
        groups: List[str] = []
        pending = ""
        for sentence in self._SENTENCE_RE.split(paragraph):
            candidate = f"{pending} {sentence}" if pending else sentence
            if self.count_tokens(candidate) <= max_tokens:
                pending = candidate
            else:
                if pending:
                    groups.append(pending)
                pending = sentence
        groups.append(pending)
        return groups

    @classmethod
    def _protect_placeholders(cls, chunk: str) -> tuple:
        """Replace math placeholders in ``chunk`` with neutral tokens.

        Returns:
            ``(protected_text, mapping)`` where ``mapping`` maps each
            ``__MATH_PLACEHOLDER_<i>__`` token to the text it replaced.
        """
        found = set(cls._PLACEHOLDER_RE.findall(chunk))
        found.update(cls._EXTENDED_PLACEHOLDER_RE.findall(chunk))
        # Longest first so a short placeholder never clobbers part of a longer
        # one; the value tie-break keeps the numbering stable between runs.
        ordered = sorted(found, key=lambda item: (-len(item), item))

        mapping = {}
        protected = chunk
        for index, placeholder in enumerate(ordered):
            safe_token = f"__MATH_PLACEHOLDER_{index}__"
            mapping[safe_token] = placeholder
            protected = protected.replace(placeholder, safe_token)
        return protected, mapping

    @staticmethod
    def _restore_placeholders(text: str, mapping: dict) -> str:
        """Undo _protect_placeholders() on ``text`` using ``mapping``."""
        for safe_token, original in mapping.items():
            text = text.replace(safe_token, original)
        return text

    @staticmethod
    def _is_auth_error(error: Exception, message: str) -> bool:
        """Return True when ``error`` represents an HTTP 401 response.

        The status code of the attached response is authoritative. The
        message is inspected only when no response is attached, so an
        unrelated "401" in a URL or port number is not misread.
        """
        # Fetch status_code via getattr rather than truth-testing the
        # response: requests' Response is falsy for error statuses.
        response = getattr(error, "response", None)
        status = getattr(response, "status_code", None)
        if status is not None:
            return status == 401
        return "401" in message or "Unauthorized" in message

    def translate_chunk(self, chunk: str, target_language: str, source_language: str = "en") -> str:
        """Translate one chunk with a single chat-completions request.

        Args:
            chunk: Text to translate.
            target_language: Language to translate into.
            source_language: Language of ``chunk``.

        Returns:
            The stripped translation with placeholders restored. Returns
            ``chunk`` unchanged when it is blank, when the request fails for
            a non-authentication reason, or when the response is unusable.

        Raises:
            requests.exceptions.RequestException: Re-raised for
                authentication (401) failures, which retrying cannot fix.
        """
        if not chunk.strip():
            return chunk

        protected_chunk, placeholder_map = self._protect_placeholders(chunk)
        prompt = self._build_prompt(protected_chunk, target_language, source_language)
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }

        try:
            response = self.session.post(
                f"{self.base_url}/chat/completions",
                json=payload,
                timeout=60,
            )
            response.raise_for_status()
            data = response.json()

            if "choices" in data and len(data["choices"]) > 0:
                translated_text = data["choices"][0]["message"]["content"].strip()
                return self._restore_placeholders(translated_text, placeholder_map)
            raise ValueError("Invalid response format from OpenAI API")
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            print(f"OpenAI API request failed: {error_msg}")
            if self._is_auth_error(e, error_msg):
                print("Authentication error: Please check if your OpenAI API key is correct")
                # A bad key fails every chunk, so surface it instead of
                # silently returning untranslated text.
                raise
            # Other API errors: fall back to the original text.
            return chunk
        except (KeyError, ValueError) as e:
            print(f"Failed to parse OpenAI API response: {e}")
            return chunk
        except Exception as e:
            print(f"Unexpected error during translation: {e}")
            return chunk

    def translate(self, text: str, target_language: str, source_language: str = "en") -> str:
        """Translate ``text``, chunking it first when it exceeds ``chunk_size``.

        Args:
            text: Text to translate.
            target_language: Language to translate into.
            source_language: Language of ``text``.

        Returns:
            The translation; for chunked input, the translated chunks joined
            by blank lines. Blank input is returned unchanged.
        """
        if not text.strip():
            return text

        token_count = self.count_tokens(text)
        if token_count <= self.chunk_size:
            # Small text, translate directly
            return self.translate_chunk(text, target_language, source_language)

        print(f"Text too large ({token_count} tokens), splitting into chunks...")
        chunks = self.split_text_into_chunks(text, target_language, source_language)
        total_chunks = len(chunks)
        print(f"Split into {total_chunks} chunks")

        translated_chunks = []
        for i, chunk in enumerate(chunks):
            print(f"Translating chunk {i+1}/{total_chunks}...")
            # Pause between requests to avoid rate limiting.
            if i > 0:
                time.sleep(1)
            translated_chunks.append(
                self.translate_chunk(chunk, target_language, source_language)
            )

        result = "\n\n".join(translated_chunks)
        print("All chunks translated successfully")
        return result

    def __call__(self, text: str, target_language: str, source_language: str = "en") -> str:
        """Make the translator callable; equivalent to translate()."""
        return self.translate(text, target_language, source_language)