@@ -14,21 +14,12 @@ def get_installed_languages() -> list:
1414 logger .error (f"Error getting installed languages: { str (e )} " )
1515 return ['eng' ]
1616
17+
1718def get_supported_languages () -> Dict [str , str ]:
1819 """Returns a dictionary of supported languages and their codes"""
1920 all_languages = {
2021 'English' : 'eng' ,
21- 'Spanish' : 'spa' ,
22- 'French' : 'fra' ,
23- 'German' : 'deu' ,
24- 'Italian' : 'ita' ,
25- 'Portuguese' : 'por' ,
26- 'Chinese (Simplified)' : 'chi_sim' ,
27- 'Japanese' : 'jpn' ,
28- 'Korean' : 'kor' ,
29- 'Russian' : 'rus' ,
30- 'Arabic' : 'ara' ,
31- 'Hindi' : 'hin'
22+ 'Hindi' : 'hin' ,
3223 }
3324
3425 installed_langs = get_installed_languages ()
@@ -42,19 +33,66 @@ def validate_language(lang: str) -> str:
4233 valid_langs = [l for l in requested_langs if l in installed_langs ]
4334 return '+' .join (valid_langs ) if valid_langs else 'eng'
4435
def detect_script(image: Union[str, bytes]) -> str:
    """
    Detects the script of the text in the image using Tesseract's
    orientation-and-script-detection (OSD) output.

    Args:
        image: Preprocessed image

    Returns:
        str: Detected script name as reported in the OSD output
            (e.g., 'Latin', 'Devanagari'). Falls back to 'Latin' when
            detection raises or the output contains no script entry.
    """
    try:
        # Call image_to_osd with its defaults. The previous extra config
        # ("--psm 3 -l script") was appended after the OSD settings that
        # pytesseract supplies itself, replacing both the OSD page
        # segmentation mode and the OSD model, so detection errored out and
        # the fallback below was always returned.
        osd_report = pytesseract.image_to_osd(image)

        # The report is a series of "Key: value" lines; match the key exactly
        # so "Script confidence: ..." is never mistaken for the script name.
        for line in osd_report.splitlines():
            label, _, value = line.partition(":")
            if label.strip() == "Script" and value.strip():
                return value.strip()

        # Routed through the handler below so the failure is logged once.
        raise ValueError("no 'Script' entry in OSD output")
    except Exception as e:
        logger.error(f"Error detecting script: {str(e)}")
        return "Latin"  # Fallback to Latin script (English)
59+
def map_script_to_language(script: str) -> str:
    """
    Maps a detected script to the corresponding Tesseract language code.

    Args:
        script: Detected script name (e.g., 'Latin', 'Devanagari')

    Returns:
        str: Tesseract language code (e.g., 'eng', 'hin'). Unknown scripts
            fall back to 'eng'.
    """
    # A dict can hold only one value per key. The earlier literal listed
    # 'Devanagari' twice, so the later 'mar' entry silently replaced 'hin'
    # and Hindi text was routed to the Marathi model. Devanagari is shared by
    # Hindi and Marathi; Hindi is kept as the single mapping because the
    # script alone cannot tell the two languages apart.
    script_to_lang = {
        'Latin': 'eng',       # English
        'Devanagari': 'hin',  # Hindi (Marathi uses the same script)
        'Gujarati': 'guj',    # Gujarati
        'Gurmukhi': 'pan',    # Punjabi
    }
    return script_to_lang.get(script, 'eng')  # Fallback to English
4578def extract_text (image : Union [str , bytes ], options : Dict ) -> str :
4679 """
4780 Extracts text from the preprocessed image using pytesseract OCR.
81+ Automatically detects the script and adjusts the language configuration.
4882
4983 Args:
5084 image: Preprocessed image
5185 options: Dictionary containing OCR options including:
5286 - psm: Page segmentation mode
53- - language: Language code(s) for OCR
87+ - language: Language code(s) for OCR (optional)
5488 """
5589 try :
56- # Get and validate language
57- lang = validate_language (options .get ('language' , 'eng' ))
90+ # Detect script and map to language
91+ detected_script = detect_script (image )
92+ detected_lang = map_script_to_language (detected_script )
93+
94+ # Override the language setting if script detection is successful
95+ options ['language' ] = detected_lang
5896
5997 # Configure OCR settings
6098 config = f"--oem 3 --psm { options ['psm' ]} preserve_interword_spaces=1"
@@ -63,7 +101,7 @@ def extract_text(image: Union[str, bytes], options: Dict) -> str:
63101 text = pytesseract .image_to_string (
64102 image ,
65103 config = config ,
66- lang = lang
104+ lang = options [ 'language' ]
67105 )
68106
69107 return text .strip ()
0 commit comments