Skip to content

Commit ec7ed23

Browse files
Update text_extraction.py
Added OSD (orientation and script detection) function for Hindi, English, Gujarati, Marathi, and Punjabi.
1 parent 1573b07 commit ec7ed23

File tree

1 file changed

+53
-15
lines changed

1 file changed

+53
-15
lines changed

utils/text_extraction.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,12 @@ def get_installed_languages() -> list:
1414
logger.error(f"Error getting installed languages: {str(e)}")
1515
return ['eng']
1616

17+
1718
def get_supported_languages() -> Dict[str, str]:
1819
"""Returns a dictionary of supported languages and their codes"""
1920
all_languages = {
2021
'English': 'eng',
21-
'Spanish': 'spa',
22-
'French': 'fra',
23-
'German': 'deu',
24-
'Italian': 'ita',
25-
'Portuguese': 'por',
26-
'Chinese (Simplified)': 'chi_sim',
27-
'Japanese': 'jpn',
28-
'Korean': 'kor',
29-
'Russian': 'rus',
30-
'Arabic': 'ara',
31-
'Hindi': 'hin'
22+
'Hindi': 'hin',
3223
}
3324

3425
installed_langs = get_installed_languages()
@@ -42,19 +33,66 @@ def validate_language(lang: str) -> str:
4233
valid_langs = [l for l in requested_langs if l in installed_langs]
4334
return '+'.join(valid_langs) if valid_langs else 'eng'
4435

36+
def detect_script(image: Union[str, bytes]) -> str:
    """
    Detects the script of the text in the image using Tesseract's OSD mode.

    Args:
        image: Preprocessed image

    Returns:
        str: Detected script name (e.g., 'Latin', 'Devanagari', etc.).
            Falls back to 'Latin' when detection fails or the OSD output
            contains no script line.
    """
    try:
        # image_to_osd() already runs Tesseract with "-l osd --psm 0".
        # Passing "--psm 3 -l script" on top of that overrides both flags
        # (the last occurrence wins on the command line), which asks for a
        # non-existent "script" traineddata and leaves OSD-only mode, so the
        # call failed and the fallback was always returned.
        script_info = pytesseract.image_to_osd(image)

        # The OSD report has both "Script: <name>" and "Script confidence: <n>"
        # lines; match the exact "Script:" prefix so only the name is picked.
        for line in script_info.splitlines():
            label, sep, value = line.partition(":")
            if sep and label.strip() == "Script":
                script_name = value.strip()
                if script_name:
                    return script_name

        raise ValueError("no 'Script:' entry in OSD output")
    except Exception as e:
        # Best-effort detection: any failure degrades to the Latin default
        # instead of aborting text extraction.
        logger.error(f"Error detecting script: {str(e)}")
        return "Latin"  # Fallback to Latin script (English)
59+
60+
def map_script_to_language(script: str) -> str:
    """
    Maps a detected script to the corresponding Tesseract language code.

    Script detection cannot tell apart languages that share a script, so each
    script maps to exactly one default language. Hindi and Marathi are both
    written in Devanagari; Hindi is used as the default for that script.

    Args:
        script: Detected script name (e.g., 'Latin', 'Devanagari')

    Returns:
        str: Tesseract language code (e.g., 'eng', 'hin'). Unknown scripts
            fall back to 'eng'.
    """
    # NOTE: a dict literal keeps only the LAST value for a repeated key. The
    # previous version listed 'Devanagari' twice ('hin' then 'mar'), so Hindi
    # was silently replaced by Marathi. Each script appears once here.
    script_to_lang = {
        'Latin': 'eng',       # English
        'Devanagari': 'hin',  # Hindi (Marathi shares this script)
        'Gujarati': 'guj',    # Gujarati
        'Gurmukhi': 'pan',    # Punjabi
    }
    return script_to_lang.get(script, 'eng')  # Fallback to English
4578
def extract_text(image: Union[str, bytes], options: Dict) -> str:
4679
"""
4780
Extracts text from the preprocessed image using pytesseract OCR.
81+
Automatically detects the script and adjusts the language configuration.
4882
4983
Args:
5084
image: Preprocessed image
5185
options: Dictionary containing OCR options including:
5286
- psm: Page segmentation mode
53-
- language: Language code(s) for OCR
87+
- language: Language code(s) for OCR (optional)
5488
"""
5589
try:
56-
# Get and validate language
57-
lang = validate_language(options.get('language', 'eng'))
90+
# Detect script and map to language
91+
detected_script = detect_script(image)
92+
detected_lang = map_script_to_language(detected_script)
93+
94+
# Override the language setting if script detection is successful
95+
options['language'] = detected_lang
5896

5997
# Configure OCR settings
6098
config = f"--oem 3 --psm {options['psm']} preserve_interword_spaces=1"
@@ -63,7 +101,7 @@ def extract_text(image: Union[str, bytes], options: Dict) -> str:
63101
text = pytesseract.image_to_string(
64102
image,
65103
config=config,
66-
lang=lang
104+
lang=options['language']
67105
)
68106

69107
return text.strip()

0 commit comments

Comments
 (0)