77
88from utils .image_processing import preprocess_image
99from utils .pdf_processing import process_pdf
10- from utils .text_extraction import extract_text
11-
12- from utils .text_extraction import get_supported_languages
10+ from utils .text_extraction import extract_text , get_supported_languages
1311
1412def setup_page_config ():
1513 """Configure Streamlit page settings."""
@@ -30,6 +28,32 @@ def initialize_session_state():
3028
3129def create_sidebar_options ():
3230 """Create user-friendly OCR processing options."""
31+ st .sidebar .header ("OCR Settings" )
32+
33+ # Language Selection
34+ st .sidebar .subheader ("Language Settings" )
35+ available_languages = get_supported_languages ()
36+ default_lang = 'English'
37+
38+ # Primary language selection
39+ primary_lang = st .sidebar .selectbox (
40+ "Primary Language" ,
41+ options = list (available_languages .keys ()),
42+ index = list (available_languages .keys ()).index (default_lang ),
43+ help = "Select the main language of your document"
44+ )
45+
46+ # Additional languages selection
47+ additional_langs = st .sidebar .multiselect (
48+ "Additional Languages (Optional)" ,
49+ options = [lang for lang in available_languages .keys () if lang != primary_lang ],
50+ help = "Select additional languages if your document contains multiple languages"
51+ )
52+
53+ # Combine selected languages
54+ selected_langs = [primary_lang ] + additional_langs
55+ lang_codes = '+' .join ([available_languages [lang ] for lang in selected_langs ])
56+
3357 st .sidebar .header ("Image Enhancement Options" )
3458 return {
3559 'apply_threshold' : st .sidebar .checkbox (
@@ -63,9 +87,9 @@ def create_sidebar_options():
6387 12 : "Word by Word"
6488 }[x ],
6589 help = "Choose how the system should read your document's layout. Automatic[3] is best for most documents."
66- )
90+ ),
91+ 'language' : lang_codes
6792 }
68-
6993def display_processed_image (original_image , processed_image ):
7094 """
7195 Display original and processed images side by side
@@ -74,12 +98,10 @@ def display_processed_image(original_image, processed_image):
7498 original_image (numpy.ndarray): Original input image
7599 processed_image (numpy.ndarray): Preprocessed image
76100 """
77- # Create two columns for display
78101 col1 , col2 = st .columns (2 )
79102
80103 with col1 :
81104 st .subheader ("Original Image" )
82- # Convert OpenCV image (BGR) to RGB for correct color display
83105 st .image (cv2 .cvtColor (original_image , cv2 .COLOR_BGR2RGB ),
84106 use_container_width = True )
85107
@@ -88,7 +110,6 @@ def display_processed_image(original_image, processed_image):
88110 st .image (cv2 .cvtColor (processed_image , cv2 .COLOR_BGR2RGB ),
89111 use_container_width = True )
90112
91-
92113def process_uploaded_files (uploaded_files , options ):
93114 """
94115 Modified function to show processed images
@@ -100,12 +121,17 @@ def process_uploaded_files(uploaded_files, options):
100121 Returns:
101122 tuple: Lists of all text and individual texts
102123 """
124+ print ("Uploaded Files:" ,uploaded_files )
103125 all_text = []
104126 individual_texts = {}
105127
106128 # Progress bar for multiple file processing
107129 progress_bar = st .progress (0 )
108130
131+ # Make sure uploaded_files is not None and is iterable
132+ if not uploaded_files :
133+ return all_text , individual_texts
134+
109135 for i , uploaded_file in enumerate (uploaded_files ):
110136 try :
111137 # Update progress bar
@@ -123,9 +149,6 @@ def process_uploaded_files(uploaded_files, options):
123149 # Show original image before processing
124150 st .subheader (f"Processing: { uploaded_file .name } " )
125151
126- # Display original image
127- st .image (image , caption = "Original Image" , use_container_width = True )
128-
129152 # Preprocess image
130153 processed_image = preprocess_image (image_np , options )
131154
@@ -148,7 +171,7 @@ def process_uploaded_files(uploaded_files, options):
148171
149172def create_text_downloads (all_text , individual_texts ):
150173 """
151- Create download buttons for extracted texts .
174+ download buttons for extracted text(s) .
152175
153176 Args:
154177 all_text (list): Combined extracted texts
@@ -189,12 +212,14 @@ def main():
189212 # App title and description
190213 st .title ("Text Extraction using Tesseract OCR" )
191214 st .markdown ('## Upload multiple images or PDF files to extract text from.' )
192- st .write ('You can use documents in the following languages\n ' )
193- # Print available languages
194- print (get_supported_languages ())
195- st .write ('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:' )
215+
216+ # Display supported languages
217+ available_languages = get_supported_languages ()
218+ st .write ('Supported Languages:' , ', ' .join (available_languages .keys ()))
219+
220+ st .write ('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:' )
196221 st .write (""" Automatic detection works fine for most documents,\n
197- You can also Choose a different one based on your document's structure from the list.\n """ )
222+ You can also Choose a different one based on your document's structure from the list.\n """ )
198223
199224 # File uploader
200225 uploaded_files = st .file_uploader (
@@ -203,7 +228,7 @@ def main():
203228 type = ["png" , "jpg" , "jpeg" , "pdf" ]
204229 )
205230
206- # Create OCR options
231+ #OCR options
207232 options = create_sidebar_options ()
208233
209234 # Process files when uploaded
@@ -215,9 +240,8 @@ def main():
215240 if all_text :
216241 st .text_area ("Extracted Text" , value = "\n " .join (all_text ), height = 300 )
217242
218- # Create download buttons
243+ # download buttons
219244 create_text_downloads (all_text , individual_texts )
220245
221- # This ensures the app runs automatically when accessed
222246if __name__ == "__main__" :
223247 main ()
0 commit comments