RapidLatex/openai_translator.py at main · RapidAI/RapidLatex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
import json
import requests
import time
import re
from typing import Optional, List
import tiktoken

class OpenAITranslator:
    """Translate (LaTeX) text through an OpenAI-compatible chat completions API.

    Long inputs are split into chunks (at ``\\section`` headers, then blank
    lines, then sentence ends), each chunk is translated with a separate
    request, and the translated chunks are joined with blank lines.

    ``XMATHX_*`` / ``XMATHXBS`` placeholders found in a chunk are swapped for
    neutral ``__MATH_PLACEHOLDER_N__`` tokens before the request and swapped
    back in the response.
    """

    def __init__(self, api_key: str, base_url: str = "https://api.openai.com/v1",
                 model: str = "gpt-3.5-turbo", max_tokens: int = 2000,
                 temperature: float = 0.3, chunk_size: int = 3000):
        """Store the configuration and prepare the HTTP session and tokenizer.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            base_url: API root; ``/chat/completions`` is appended to it.
            model: Value of the ``model`` field of each request.
            max_tokens: Value of the ``max_tokens`` field of each request.
            temperature: Value of the ``temperature`` field of each request.
            chunk_size: Token budget per chunk (prompt template included).

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        # Ensure API key is provided
        if not api_key:
            raise ValueError("OpenAI API key must be provided")

        self.api_key = api_key
        # Drop trailing slashes so the endpoint URL never contains "//"
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.chunk_size = chunk_size  # Maximum tokens per chunk
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        })

        # Initialize tokenizer for token counting
        try:
            self.tokenizer = tiktoken.get_encoding("cl100k_base")
        except Exception:
            # Fall back to approximate counting if the encoding cannot be loaded
            self.tokenizer = None
            print("Warning: tiktoken not available, using approximate token counting")

    def count_tokens(self, text: str) -> int:
        """Return the token count of ``text``.

        Uses the tokenizer when one was loaded, otherwise approximates with
        four characters per token.
        """
        if self.tokenizer:
            return len(self.tokenizer.encode(text))
        # Approximate token counting (roughly 4 characters per token)
        return len(text) // 4

    def split_text_into_chunks(self, text: str, target_language: str, source_language: str = "en") -> List[str]:
        """Split ``text`` into chunks that fit the per-chunk token budget.

        The budget is ``chunk_size`` minus the size of the prompt template
        minus a 100-token safety margin (1000 if that is not positive).
        Text that fits is returned as a single, unmodified chunk. Otherwise
        the text is split at ``\\section{...}`` headers; oversized section
        bodies are split further with :meth:`split_paragraphs`. Text without
        section headers is split with :meth:`split_paragraphs` directly.

        Args:
            text: Text to split.
            target_language: Only used to size the prompt template.
            source_language: Only used to size the prompt template.

        Returns:
            The non-empty chunks, in document order.
        """
        # Reserve tokens for the prompt template and overhead
        prompt_template = f"""You are a professional academic translator. Your task is to translate the following text from {source_language} to {target_language}.

CRITICAL REQUIREMENTS:
1. Translate EXACTLY what is provided - DO NOT add any content that is not in the original text
2. DO NOT add explanations, examples, or clarifications that are not in the source
3. DO NOT omit any information from the original text
4. Preserve all LaTeX commands, mathematical formulas, and technical notation exactly as they appear
5. Maintain the original structure, formatting, and paragraph breaks
6. Translate ONLY the text content, keeping all non-text elements unchanged
7. If you encounter ambiguous terms, choose the most literal translation rather than adding explanatory context

Text to translate:
{{text}}

Translated text (strictly faithful to original):"""

        template_tokens = self.count_tokens(prompt_template)
        available_tokens = self.chunk_size - template_tokens - 100  # 100 token safety margin

        if available_tokens <= 0:
            print("Warning: Chunk size too small for prompt template")
            available_tokens = 1000

        # If text is small enough, return as single chunk
        if self.count_tokens(text) <= available_tokens:
            return [text]

        # For LaTeX documents, try to split at logical boundaries.
        # The capturing group keeps each header (odd indices) in the result.
        section_pattern = r'(\\section\{[^}]+\}\s*)'
        sections = re.split(section_pattern, text)

        if len(sections) <= 1:
            # If no sections, split by paragraphs
            return [piece for piece in self.split_paragraphs(text, available_tokens) if piece]

        chunks: List[str] = []
        current_chunk = ""
        # True while current_chunk ends with a piece from split_paragraphs.
        # Those pieces are stripped, so whatever is appended next needs an
        # explicit blank line to avoid gluing paragraphs/headers together.
        tail_stripped = False

        for i, section in enumerate(sections):
            if i % 2 == 1:  # This is a section header
                separator = "\n\n" if tail_stripped else ""
                candidate = current_chunk + separator + section
                if current_chunk and self.count_tokens(candidate) > available_tokens:
                    chunks.append(current_chunk.strip())
                    current_chunk = section
                else:
                    current_chunk = candidate
                tail_stripped = False
            elif self.count_tokens(current_chunk + section) > available_tokens:
                # Section content does not fit: split it further
                for sub_chunk in self.split_paragraphs(section, available_tokens):
                    if not sub_chunk:
                        continue
                    separator = "\n\n" if tail_stripped else ""
                    candidate = current_chunk + separator + sub_chunk
                    if current_chunk and self.count_tokens(candidate) > available_tokens:
                        chunks.append(current_chunk.strip())
                        current_chunk = sub_chunk
                    else:
                        current_chunk = candidate
                    tail_stripped = True
            else:
                current_chunk += section

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Whitespace-only chunks carry nothing to translate
        return [piece for piece in chunks if piece]

    def split_paragraphs(self, text: str, max_tokens: int) -> List[str]:
        """Pack blank-line separated paragraphs into chunks of ``max_tokens``.

        Paragraphs are accumulated greedily. A paragraph that alone exceeds
        ``max_tokens`` is split after sentence-ending punctuation
        (``.``, ``!``, ``?``) and its sentences are packed the same way; a
        single sentence longer than ``max_tokens`` is kept whole.

        Args:
            text: Text to split.
            max_tokens: Token budget per chunk.

        Returns:
            The chunks, each stripped of surrounding whitespace.
        """
        chunks: List[str] = []
        current_chunk = ""

        for paragraph in text.split('\n\n'):
            # Count the separator too, since it ends up in the chunk
            candidate = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
            if self.count_tokens(candidate) <= max_tokens:
                current_chunk = candidate
                continue

            if current_chunk:
                chunks.append(current_chunk.strip())

            if self.count_tokens(paragraph) <= max_tokens:
                current_chunk = paragraph
                continue

            # If single paragraph is too long, split it into sentences
            temp_chunk = ""
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                joined = temp_chunk + " " + sentence if temp_chunk else sentence
                if self.count_tokens(joined) <= max_tokens:
                    temp_chunk = joined
                else:
                    if temp_chunk:
                        chunks.append(temp_chunk.strip())
                    temp_chunk = sentence

            # The last sentence group stays open so following paragraphs
            # can still be packed into it
            current_chunk = temp_chunk if temp_chunk else paragraph

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    @staticmethod
    def _protect_placeholders(chunk: str):
        """Replace math placeholders in ``chunk`` with neutral tokens.

        Returns ``(protected_chunk, mapping)`` where ``mapping`` maps each
        ``__MATH_PLACEHOLDER_N__`` token back to the text it replaced.
        """
        # Protect math formula placeholders (XMATHX_* and XMATHXBS)
        placeholder_pattern = r'XMATHX_\d+(?:_\d+)*|XMATHXBS'
        # Also protect entire LaTeX command structures that follow XMATHXBS,
        # e.g. "XMATHXBS bibliographystyle {IEEEtran}", including several
        # commands in a row
        extended_pattern = r'XMATHXBS\s+\w+(?:\s*\{[^}]*\})*(?:\s+\w+(?:\s*\{[^}]*\})*)*'

        found = set(re.findall(placeholder_pattern, chunk))
        found.update(re.findall(extended_pattern, chunk))

        # Longest first so a short placeholder never clobbers part of a longer
        # one; the text tie-break keeps token numbering deterministic
        ordered = sorted(found, key=lambda item: (-len(item), item))

        mapping = {}
        protected_chunk = chunk
        for index, placeholder in enumerate(ordered):
            safe_token = f"__MATH_PLACEHOLDER_{index}__"
            mapping[safe_token] = placeholder
            protected_chunk = protected_chunk.replace(placeholder, safe_token)
        return protected_chunk, mapping

    @staticmethod
    def _is_auth_error(error: Exception) -> bool:
        """Return True when ``error`` reports an HTTP 401 / Unauthorized failure."""
        # Compare against None explicitly: the response attached to the
        # exception may itself be falsy for error statuses
        response = getattr(error, "response", None)
        if getattr(response, "status_code", None) == 401:
            return True
        message = str(error)
        return "401" in message or "Unauthorized" in message

    def translate_chunk(self, chunk: str, target_language: str, source_language: str = "en") -> str:
        """Translate a single chunk using OpenAI API.

        Args:
            chunk: Text to translate; blank text is returned unchanged.
            target_language: Language to translate into.
            source_language: Language of ``chunk``.

        Returns:
            The translated text with math placeholders restored, or the
            original ``chunk`` when the request or response parsing fails.

        Raises:
            requests.exceptions.RequestException: Re-raised when the failure
                is an authentication error (HTTP 401), since retrying or
                falling back cannot succeed with a bad key.
        """
        if not chunk.strip():
            return chunk

        protected_chunk, placeholder_map = self._protect_placeholders(chunk)

        # Construct the prompt with strict faithfulness requirements
        prompt = f"""You are a professional academic translator. Your task is to translate the following text from {source_language} to {target_language}.

CRITICAL REQUIREMENTS:
1. Translate EXACTLY what is provided - DO NOT add any content that is not in the original text
2. DO NOT add explanations, examples, or clarifications that are not in the source
3. DO NOT omit any information from the original text
4. Preserve all LaTeX commands, mathematical formulas, and technical notation exactly as they appear
5. Maintain the original structure, formatting, and paragraph breaks
6. Translate ONLY the text content, keeping all non-text elements unchanged
7. If you encounter ambiguous terms, choose the most literal translation rather than adding explanatory context
8. SPECIAL INSTRUCTION: Any text that looks like "__MATH_PLACEHOLDER_N__" must be preserved exactly as-is. These are protected mathematical formula markers.

Text to translate:
{protected_chunk}

Translated text (strictly faithful to original):"""

        try:
            # Make API request with enhanced prompt
            response = self.session.post(
                f"{self.base_url}/chat/completions",
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": "You are a precise academic translator specializing in mathematical and technical documents. Your primary directive is absolute faithfulness to the source text. NEVER add, remove, or modify information that is not explicitly part of the translation process. Preserve all formatting, mathematical notation, and technical terms exactly as written. Your role is translation ONLY, not explanation or elaboration."},
                        {"role": "user", "content": prompt}
                    ],
                    "max_tokens": self.max_tokens,
                    "temperature": self.temperature
                },
                timeout=60
            )

            response.raise_for_status()
            data = response.json()

            if "choices" not in data or len(data["choices"]) == 0:
                raise ValueError("Invalid response format from OpenAI API")

            translated_text = data["choices"][0]["message"]["content"].strip()

            # Restore original placeholders from protected tokens
            for safe_token, original_placeholder in placeholder_map.items():
                translated_text = translated_text.replace(safe_token, original_placeholder)

            return translated_text

        except requests.exceptions.RequestException as e:
            print(f"OpenAI API request failed: {e}")

            # Special handling for authentication errors
            if self._is_auth_error(e):
                print("Authentication error: Please check if your OpenAI API key is correct")
                # For authentication errors, we shouldn't fall back - we should exit
                raise

            # Fallback to original text for other API errors
            return chunk
        except (KeyError, ValueError) as e:
            print(f"Failed to parse OpenAI API response: {e}")
            return chunk
        except Exception as e:
            # Deliberate best effort: an untranslated chunk beats a lost document
            print(f"Unexpected error during translation: {e}")
            return chunk

    def translate(self, text: str, target_language: str, source_language: str = "en") -> str:
        """Translate text using OpenAI API with chunking support.

        Text of at most ``chunk_size`` tokens is sent in one request. Longer
        text is split with :meth:`split_text_into_chunks`, translated chunk by
        chunk with a one second pause between requests, and the results are
        joined with blank lines.

        Args:
            text: Text to translate; blank text is returned unchanged.
            target_language: Language to translate into.
            source_language: Language of ``text``.

        Returns:
            The translated text. Chunks whose translation failed are included
            untranslated (see :meth:`translate_chunk`).
        """
        if not text.strip():
            return text

        # Check if text needs to be chunked
        token_count = self.count_tokens(text)
        if token_count <= self.chunk_size:
            # Small text, translate directly
            return self.translate_chunk(text, target_language, source_language)

        # Large text, split into chunks and translate
        print(f"Text too large ({token_count} tokens), splitting into chunks...")
        chunks = self.split_text_into_chunks(text, target_language, source_language)
        print(f"Split into {len(chunks)} chunks")

        translated_chunks = []
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            print(f"Translating chunk {i+1}/{total_chunks}...")

            # Add delay to avoid rate limiting
            if i > 0:
                time.sleep(1)

            translated_chunks.append(
                self.translate_chunk(chunk, target_language, source_language)
            )

        # Combine translated chunks
        result = "\n\n".join(translated_chunks)
        print("All chunks translated successfully")

        return result

    def __call__(self, text: str, target_language: str, source_language: str = "en") -> str:
        """Make the translator callable; same as :meth:`translate`."""
        return self.translate(text, target_language, source_language)