Skip to content

Commit cc52696

Browse files
Update app.py
reverting back to Jan 11 version
1 parent 344ca78 commit cc52696

File tree

1 file changed

+19 additions, -61 deletions (lines changed)

1 file changed

+19 additions, -61 deletions (lines changed)

app.py

Lines changed: 19 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -7,26 +7,10 @@
77

88
from utils.image_processing import preprocess_image
99
from utils.pdf_processing import process_pdf
10-
from utils.text_extraction import extract_text, get_supported_languages
10+
from utils.text_extraction import extract_text
1111

12-
#Added for script detection functionality by using script.tessdata on streamlit cloud
13-
import os
14-
import urllib.request
15-
import pytesseract
16-
# Use `/tmp/tessdata` (writable directory)
17-
tessdata_dir = "/tmp/tessdata"
18-
os.makedirs(tessdata_dir, exist_ok=True) # This should now work
12+
from utils.text_extraction import get_supported_languages
1913

20-
#To Download traineddata if not present
21-
traineddata_path = os.path.join(tessdata_dir, "script.traineddata")
22-
if not os.path.exists(traineddata_path):
23-
urllib.request.urlretrieve("URL_TO_SCRIPT.TRAINEDDATA", traineddata_path)
24-
25-
# Set Tesseract environment variable
26-
os.environ["TESSDATA_PREFIX"] = "/tmp/"
27-
config = f"--tessdata-dir {tessdata_dir}"
28-
29-
# Below remains the same
3014
def setup_page_config():
3115
"""Configure Streamlit page settings."""
3216
st.set_page_config(
@@ -46,32 +30,6 @@ def initialize_session_state():
4630

4731
def create_sidebar_options():
4832
"""Create user-friendly OCR processing options."""
49-
st.sidebar.header("OCR Settings")
50-
51-
# Language Selection
52-
st.sidebar.subheader("Language Settings")
53-
available_languages = get_supported_languages()
54-
default_lang = 'English'
55-
56-
# Primary language selection
57-
primary_lang = st.sidebar.selectbox(
58-
"Primary Language",
59-
options=list(available_languages.keys()),
60-
index=list(available_languages.keys()).index(default_lang),
61-
help="Select the main language of your document"
62-
)
63-
64-
# Additional languages selection
65-
additional_langs = st.sidebar.multiselect(
66-
"Additional Languages (Optional)",
67-
options=[lang for lang in available_languages.keys() if lang != primary_lang],
68-
help="Select additional languages if your document contains multiple languages"
69-
)
70-
71-
# Combine selected languages
72-
selected_langs = [primary_lang] + additional_langs
73-
lang_codes = '+'.join([available_languages[lang] for lang in selected_langs])
74-
7533
st.sidebar.header("Image Enhancement Options")
7634
return {
7735
'apply_threshold': st.sidebar.checkbox(
@@ -105,9 +63,9 @@ def create_sidebar_options():
10563
12: "Word by Word"
10664
}[x],
10765
help="Choose how the system should read your document's layout. Automatic[3] is best for most documents."
108-
),
109-
'language': lang_codes
66+
)
11067
}
68+
11169
def display_processed_image(original_image, processed_image):
11270
"""
11371
Display original and processed images side by side
@@ -116,10 +74,12 @@ def display_processed_image(original_image, processed_image):
11674
original_image (numpy.ndarray): Original input image
11775
processed_image (numpy.ndarray): Preprocessed image
11876
"""
77+
# Create two columns for display
11978
col1, col2 = st.columns(2)
12079

12180
with col1:
12281
st.subheader("Original Image")
82+
# Convert OpenCV image (BGR) to RGB for correct color display
12383
st.image(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB),
12484
use_container_width=True)
12585

@@ -128,6 +88,7 @@ def display_processed_image(original_image, processed_image):
12888
st.image(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB),
12989
use_container_width=True)
13090

91+
13192
def process_uploaded_files(uploaded_files, options):
13293
"""
13394
Modified function to show processed images
@@ -139,17 +100,12 @@ def process_uploaded_files(uploaded_files, options):
139100
Returns:
140101
tuple: Lists of all text and individual texts
141102
"""
142-
print("Uploaded Files:",uploaded_files)
143103
all_text = []
144104
individual_texts = {}
145105

146106
# Progress bar for multiple file processing
147107
progress_bar = st.progress(0)
148108

149-
# Make sure uploaded_files is not None and is iterable
150-
if not uploaded_files:
151-
return all_text, individual_texts
152-
153109
for i, uploaded_file in enumerate(uploaded_files):
154110
try:
155111
# Update progress bar
@@ -167,6 +123,9 @@ def process_uploaded_files(uploaded_files, options):
167123
# Show original image before processing
168124
st.subheader(f"Processing: {uploaded_file.name}")
169125

126+
# Display original image
127+
st.image(image, caption="Original Image", use_container_width=True)
128+
170129
# Preprocess image
171130
processed_image = preprocess_image(image_np, options)
172131

@@ -189,7 +148,7 @@ def process_uploaded_files(uploaded_files, options):
189148

190149
def create_text_downloads(all_text, individual_texts):
191150
"""
192-
download buttons for extracted text(s).
151+
Create download buttons for extracted texts.
193152
194153
Args:
195154
all_text (list): Combined extracted texts
@@ -230,14 +189,12 @@ def main():
230189
# App title and description
231190
st.title("Text Extraction using Tesseract OCR")
232191
st.markdown('## Upload multiple images or PDF files to extract text from.')
233-
234-
# Display supported languages
235-
available_languages = get_supported_languages()
236-
st.write('Supported Languages:', ', '.join(available_languages.keys()))
237-
238-
st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
192+
st.write('You can use documents in the following languages\n')
193+
# Print available languages
194+
print(get_supported_languages())
195+
st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
239196
st.write(""" Automatic detection works fine for most documents,\n
240-
You can also Choose a different one based on your document's structure from the list.\n""")
197+
You can also Choose a different one based on your document's structure from the list.\n""")
241198

242199
# File uploader
243200
uploaded_files = st.file_uploader(
@@ -246,7 +203,7 @@ def main():
246203
type=["png", "jpg", "jpeg", "pdf"]
247204
)
248205

249-
#OCR options
206+
# Create OCR options
250207
options = create_sidebar_options()
251208

252209
# Process files when uploaded
@@ -258,8 +215,9 @@ def main():
258215
if all_text:
259216
st.text_area("Extracted Text", value="\n".join(all_text), height=300)
260217

261-
# download buttons
218+
# Create download buttons
262219
create_text_downloads(all_text, individual_texts)
263220

221+
# This ensures the app runs automatically when accessed
264222
if __name__ == "__main__":
265223
main()

0 commit comments

Comments
 (0)