Skip to content

Commit 61b19ca

Browse files
Update app.py
multilingual version Jan 11
1 parent cc52696 commit 61b19ca

File tree

1 file changed

+44
-20
lines changed

1 file changed

+44
-20
lines changed

app.py

Lines changed: 44 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,7 @@
77

88
from utils.image_processing import preprocess_image
99
from utils.pdf_processing import process_pdf
10-
from utils.text_extraction import extract_text
11-
12-
from utils.text_extraction import get_supported_languages
10+
from utils.text_extraction import extract_text, get_supported_languages
1311

1412
def setup_page_config():
1513
"""Configure Streamlit page settings."""
@@ -30,6 +28,32 @@ def initialize_session_state():
3028

3129
def create_sidebar_options():
3230
"""Create user-friendly OCR processing options."""
31+
st.sidebar.header("OCR Settings")
32+
33+
# Language Selection
34+
st.sidebar.subheader("Language Settings")
35+
available_languages = get_supported_languages()
36+
default_lang = 'English'
37+
38+
# Primary language selection
39+
primary_lang = st.sidebar.selectbox(
40+
"Primary Language",
41+
options=list(available_languages.keys()),
42+
index=list(available_languages.keys()).index(default_lang),
43+
help="Select the main language of your document"
44+
)
45+
46+
# Additional languages selection
47+
additional_langs = st.sidebar.multiselect(
48+
"Additional Languages (Optional)",
49+
options=[lang for lang in available_languages.keys() if lang != primary_lang],
50+
help="Select additional languages if your document contains multiple languages"
51+
)
52+
53+
# Combine selected languages
54+
selected_langs = [primary_lang] + additional_langs
55+
lang_codes = '+'.join([available_languages[lang] for lang in selected_langs])
56+
3357
st.sidebar.header("Image Enhancement Options")
3458
return {
3559
'apply_threshold': st.sidebar.checkbox(
@@ -63,9 +87,9 @@ def create_sidebar_options():
6387
12: "Word by Word"
6488
}[x],
6589
help="Choose how the system should read your document's layout. Automatic[3] is best for most documents."
66-
)
90+
),
91+
'language': lang_codes
6792
}
68-
6993
def display_processed_image(original_image, processed_image):
7094
"""
7195
Display original and processed images side by side
@@ -74,12 +98,10 @@ def display_processed_image(original_image, processed_image):
7498
original_image (numpy.ndarray): Original input image
7599
processed_image (numpy.ndarray): Preprocessed image
76100
"""
77-
# Create two columns for display
78101
col1, col2 = st.columns(2)
79102

80103
with col1:
81104
st.subheader("Original Image")
82-
# Convert OpenCV image (BGR) to RGB for correct color display
83105
st.image(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB),
84106
use_container_width=True)
85107

@@ -88,7 +110,6 @@ def display_processed_image(original_image, processed_image):
88110
st.image(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB),
89111
use_container_width=True)
90112

91-
92113
def process_uploaded_files(uploaded_files, options):
93114
"""
94115
Modified function to show processed images
@@ -100,12 +121,17 @@ def process_uploaded_files(uploaded_files, options):
100121
Returns:
101122
tuple: Lists of all text and individual texts
102123
"""
124+
print("Uploaded Files:",uploaded_files)
103125
all_text = []
104126
individual_texts = {}
105127

106128
# Progress bar for multiple file processing
107129
progress_bar = st.progress(0)
108130

131+
# Make sure uploaded_files is not None and is iterable
132+
if not uploaded_files:
133+
return all_text, individual_texts
134+
109135
for i, uploaded_file in enumerate(uploaded_files):
110136
try:
111137
# Update progress bar
@@ -123,9 +149,6 @@ def process_uploaded_files(uploaded_files, options):
123149
# Show original image before processing
124150
st.subheader(f"Processing: {uploaded_file.name}")
125151

126-
# Display original image
127-
st.image(image, caption="Original Image", use_container_width=True)
128-
129152
# Preprocess image
130153
processed_image = preprocess_image(image_np, options)
131154

@@ -148,7 +171,7 @@ def process_uploaded_files(uploaded_files, options):
148171

149172
def create_text_downloads(all_text, individual_texts):
150173
"""
151-
Create download buttons for extracted texts.
174+
download buttons for extracted text(s).
152175
153176
Args:
154177
all_text (list): Combined extracted texts
@@ -189,12 +212,14 @@ def main():
189212
# App title and description
190213
st.title("Text Extraction using Tesseract OCR")
191214
st.markdown('## Upload multiple images or PDF files to extract text from.')
192-
st.write('You can use documents in the following languages\n')
193-
# Print available languages
194-
print(get_supported_languages())
195-
st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
215+
216+
# Display supported languages
217+
available_languages = get_supported_languages()
218+
st.write('Supported Languages:', ', '.join(available_languages.keys()))
219+
220+
st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
196221
st.write(""" Automatic detection works fine for most documents,\n
197-
You can also Choose a different one based on your document's structure from the list.\n""")
222+
You can also Choose a different one based on your document's structure from the list.\n""")
198223

199224
# File uploader
200225
uploaded_files = st.file_uploader(
@@ -203,7 +228,7 @@ def main():
203228
type=["png", "jpg", "jpeg", "pdf"]
204229
)
205230

206-
# Create OCR options
231+
#OCR options
207232
options = create_sidebar_options()
208233

209234
# Process files when uploaded
@@ -215,9 +240,8 @@ def main():
215240
if all_text:
216241
st.text_area("Extracted Text", value="\n".join(all_text), height=300)
217242

218-
# Create download buttons
243+
# download buttons
219244
create_text_downloads(all_text, individual_texts)
220245

221-
# This ensures the app runs automatically when accessed
222246
if __name__ == "__main__":
223247
main()

0 commit comments

Comments
 (0)