1+ from transformers import MarianMTModel , MarianTokenizer
2+ import torch
3+
def translate_text(input_text: str, target_lang: str) -> str:
    """Translate ``input_text`` into ``target_lang`` with a Helsinki-NLP MarianMT model.

    The checkpoint is selected from the target language alone; the source
    language is never detected, so the caller must supply text in the
    language the chosen checkpoint expects:

    - Any target other than ``'en'`` loads the English-source checkpoint
      ``Helsinki-NLP/opus-mt-en-<target>``, so the input should be English.
    - Target ``'en'`` always loads the generic
      ``Helsinki-NLP/opus-mt-ROMANCE-en`` checkpoint, so the input should be
      in a Romance language (Spanish, French, Italian, Portuguese, Romanian,
      Catalan, ...). Other sources (German, Russian, Chinese, ...) are not
      routed to a dedicated model by this function.

    Supported target codes:
    es, fr, de, it, pt, nl, pl, ru, zh, ja, ko, ar, hi, tr, en.

    Args:
        input_text: Text to translate.
        target_lang: Target language code. Surrounding whitespace and letter
            case are ignored.

    Returns:
        The translated text. An empty string is returned for blank input
        without loading any model. If loading the model or generating the
        translation fails, the string ``"Translation error: <message>"`` is
        returned instead of raising.

    Raises:
        ValueError: If ``target_lang`` is not one of the supported codes.
    """
    # Target language code -> checkpoint name.
    # NOTE(review): checkpoint names are carried over unchanged and have not
    # been checked against the Hugging Face Hub here (the Japanese entry uses
    # the suffix 'jap', not 'ja') -- confirm each one exists.
    translation_models = {
        # English to other languages
        'es': 'Helsinki-NLP/opus-mt-en-es',
        'fr': 'Helsinki-NLP/opus-mt-en-fr',
        'de': 'Helsinki-NLP/opus-mt-en-de',
        'it': 'Helsinki-NLP/opus-mt-en-it',
        'pt': 'Helsinki-NLP/opus-mt-en-pt',
        'nl': 'Helsinki-NLP/opus-mt-en-nl',
        'pl': 'Helsinki-NLP/opus-mt-en-pl',
        'ru': 'Helsinki-NLP/opus-mt-en-ru',
        'zh': 'Helsinki-NLP/opus-mt-en-zh',
        'ja': 'Helsinki-NLP/opus-mt-en-jap',
        'ko': 'Helsinki-NLP/opus-mt-en-ko',
        'ar': 'Helsinki-NLP/opus-mt-en-ar',
        'hi': 'Helsinki-NLP/opus-mt-en-hi',
        'tr': 'Helsinki-NLP/opus-mt-en-tr',
        # Romance languages to English (generic multi-source checkpoint)
        'en': 'Helsinki-NLP/opus-mt-ROMANCE-en',
    }

    target_lang = target_lang.strip().lower()
    model_name = translation_models.get(target_lang)
    if model_name is None:
        available_langs = ', '.join(translation_models)
        raise ValueError(
            f"Target language '{target_lang}' not supported. "
            f"Available: {available_langs}"
        )

    # Nothing to translate: skip the expensive model load. The language is
    # validated first so a bad code is still reported for blank text.
    if isinstance(input_text, str) and not input_text.strip():
        return ""

    try:
        # The tokenizer and model are loaded on every call.
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)

        inputs = tokenizer(input_text, return_tensors="pt", padding=True)
        with torch.no_grad():  # inference only, no gradients needed
            translated = model.generate(**inputs)

        # Only the first generated sequence is decoded and returned.
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort contract: failures are reported to the
        # caller as text rather than propagated.
        return f"Translation error: {e}"
98+
# Google Colab interactive input
# try:
#     user_input = input("Enter text and target language (separated by comma): ")
#     if ',' not in user_input:
#         print("Please use format: 'your text,language_code' (e.g., 'Hello world,es')")
#     else:
#         text_part, lang_part = user_input.rsplit(',', 1)  # Split on last comma to handle text with commas
#         result = translate_text(text_part.strip(), lang_part.strip())
#         print(f"\nTranslated text: {result}")
#
# except KeyboardInterrupt:
#     print("\nOperation cancelled by user.")
# except Exception as e:
#     print(f"Error: {e}")
0 commit comments