Update text_extraction.py

PhoenixAlpha23 · web-flow · commit ec7ed23cb5b7 · 2025-01-29T11:21:10.000+05:30
Added OSD (orientation and script detection) function for Hindi, English, Gujarati, Marathi and Punjabi.
diff --git a/utils/text_extraction.py b/utils/text_extraction.py
@@ -14,21 +14,12 @@ def get_installed_languages() -> list:
         logger.error(f"Error getting installed languages: {str(e)}")
         return ['eng']
 
+
 def get_supported_languages() -> Dict[str, str]:
     """Returns a dictionary of supported languages and their codes"""
     all_languages = {
         'English': 'eng',
-        'Spanish': 'spa',
-        'French': 'fra',
-        'German': 'deu',
-        'Italian': 'ita',
-        'Portuguese': 'por',
-        'Chinese (Simplified)': 'chi_sim',
-        'Japanese': 'jpn',
-        'Korean': 'kor',
-        'Russian': 'rus',
-        'Arabic': 'ara',
-        'Hindi': 'hin'
+        'Hindi': 'hin',
     }
     
     installed_langs = get_installed_languages()
@@ -42,19 +33,66 @@ def validate_language(lang: str) -> str:
     valid_langs = [l for l in requested_langs if l in installed_langs]
     return '+'.join(valid_langs) if valid_langs else 'eng'
 
+def detect_script(image: Union[str, bytes]) -> str:
+    """
+    Detects the script of the text in the image using Tesseract OSD.
+
+    Args:
+        image: Preprocessed image
+
+    Returns:
+        str: Detected script name (e.g., 'Latin', 'Devanagari'); 'Latin' on any failure.
+    """
+    try:
+        # image_to_osd already selects "-l osd --psm 0"; the previous extra
+        # "--psm 3 -l script" overrode both and made every call fail.
+        osd_report = pytesseract.image_to_osd(image)
+
+        # Match the "Script: <name>" line only, not "Script confidence: <n>".
+        for report_line in osd_report.splitlines():
+            if report_line.strip().startswith("Script:"):
+                return report_line.split(":", 1)[1].strip()
+        raise ValueError("no 'Script:' line in OSD output")
+    except Exception as e:
+        logger.error(f"Error detecting script: {str(e)}")
+        return "Latin"  # Fallback to Latin script (English)
+
+def map_script_to_language(script: str) -> str:
+    """
+    Maps a detected script to the corresponding Tesseract language code(s).
+
+    Args:
+        script: Detected script name (e.g., 'Latin', 'Devanagari')
+
+    Returns:
+        str: Tesseract language spec (e.g., 'eng', 'hin+mar'); 'eng' if unknown
+    """
+    script_to_lang = {
+        'Latin': 'eng',            # English
+        # Hindi and Marathi share Devanagari, so request both ('mar' used to overwrite 'hin').
+        'Devanagari': 'hin+mar',
+        'Gujarati': 'guj',         # Gujarati
+        'Gurmukhi': 'pan',         # Punjabi
+    }
+    return script_to_lang.get(script, 'eng')  # Fallback to English
 def extract_text(image: Union[str, bytes], options: Dict) -> str:
     """
     Extracts text from the preprocessed image using pytesseract OCR.
+    Automatically detects the script and adjusts the language configuration.
     
     Args:
         image: Preprocessed image
         options: Dictionary containing OCR options including:
             - psm: Page segmentation mode
-            - language: Language code(s) for OCR
+            - language: Language code(s) for OCR (optional)
     """
     try:
-        # Get and validate language
-        lang = validate_language(options.get('language', 'eng'))
+        # Detect script and map to language
+        detected_script = detect_script(image)
+        detected_lang = map_script_to_language(detected_script)
+        
+        # Override the language setting if script detection is successful
+        options['language'] = detected_lang
         
         # Configure OCR settings
         config = f"--oem 3 --psm {options['psm']} preserve_interword_spaces=1"
@@ -63,7 +101,7 @@ def extract_text(image: Union[str, bytes], options: Dict) -> str:
         text = pytesseract.image_to_string(
             image,
             config=config,
-            lang=lang
+            lang=options['language']
         )
         
         return text.strip()