Update app.py

PhoenixAlpha23 · web-flow · commit cc52696980ba · 2025-02-03T22:34:01.000+05:30
Revert app.py to the Jan 11 version: drops the runtime tessdata download and the sidebar language selection.
diff --git a/app.py b/app.py
@@ -7,26 +7,10 @@
 
 from utils.image_processing import preprocess_image
 from utils.pdf_processing import process_pdf
-from utils.text_extraction import extract_text, get_supported_languages
+from utils.text_extraction import extract_text
 
-#Added for script detection functionality by using script.tessdata on streamlit cloud
-import os
-import urllib.request
-import pytesseract
-# Use `/tmp/tessdata` (writable directory)
-tessdata_dir = "/tmp/tessdata"
-os.makedirs(tessdata_dir, exist_ok=True)  # This should now work
+from utils.text_extraction import get_supported_languages
 
-#To Download traineddata if not present
-traineddata_path = os.path.join(tessdata_dir, "script.traineddata")
-if not os.path.exists(traineddata_path):
-    urllib.request.urlretrieve("URL_TO_SCRIPT.TRAINEDDATA", traineddata_path)
-
-# Set Tesseract environment variable
-os.environ["TESSDATA_PREFIX"] = "/tmp/"
-config = f"--tessdata-dir {tessdata_dir}"
-
-# Below remains the same
 def setup_page_config():
     """Configure Streamlit page settings."""
     st.set_page_config(
@@ -46,32 +30,6 @@ def initialize_session_state():
 
 def create_sidebar_options():
     """Create user-friendly OCR processing options."""
-    st.sidebar.header("OCR Settings")
-    
-    # Language Selection
-    st.sidebar.subheader("Language Settings")
-    available_languages = get_supported_languages()
-    default_lang = 'English'
-    
-    # Primary language selection
-    primary_lang = st.sidebar.selectbox(
-        "Primary Language",
-        options=list(available_languages.keys()),
-        index=list(available_languages.keys()).index(default_lang),
-        help="Select the main language of your document"
-    )
-    
-    # Additional languages selection
-    additional_langs = st.sidebar.multiselect(
-        "Additional Languages (Optional)",
-        options=[lang for lang in available_languages.keys() if lang != primary_lang],
-        help="Select additional languages if your document contains multiple languages"
-    )
-    
-    # Combine selected languages
-    selected_langs = [primary_lang] + additional_langs
-    lang_codes = '+'.join([available_languages[lang] for lang in selected_langs])
-
     st.sidebar.header("Image Enhancement Options")
     return {
         'apply_threshold': st.sidebar.checkbox(
@@ -105,9 +63,9 @@ def create_sidebar_options():
                 12: "Word by Word"
             }[x],
             help="Choose how the system should read your document's layout. Automatic[3] is best for most documents."
-        ),
-        'language': lang_codes  
+        )
     }
+
 def display_processed_image(original_image, processed_image):
     """
     Display original and processed images side by side
@@ -116,10 +74,12 @@ def display_processed_image(original_image, processed_image):
         original_image (numpy.ndarray): Original input image
         processed_image (numpy.ndarray): Preprocessed image
     """
+    # Create two columns for display
     col1, col2 = st.columns(2)
     
     with col1:
         st.subheader("Original Image")
+        # Convert OpenCV image (BGR) to RGB for correct color display
         st.image(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB), 
                  use_container_width=True)
     
@@ -128,6 +88,7 @@ def display_processed_image(original_image, processed_image):
         st.image(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB), 
                  use_container_width=True)
 
+
 def process_uploaded_files(uploaded_files, options):
     """
     Modified function to show processed images
@@ -139,17 +100,12 @@ def process_uploaded_files(uploaded_files, options):
     Returns:
         tuple: Lists of all text and individual texts
     """
-    print("Uploaded Files:",uploaded_files)
     all_text = []
     individual_texts = {}
     
     # Progress bar for multiple file processing
     progress_bar = st.progress(0)
     
-    # Make sure uploaded_files is not None and is iterable
-    if not uploaded_files:
-        return all_text, individual_texts
-        
     for i, uploaded_file in enumerate(uploaded_files):
         try:
             # Update progress bar
@@ -167,6 +123,9 @@ def process_uploaded_files(uploaded_files, options):
                 # Show original image before processing
                 st.subheader(f"Processing: {uploaded_file.name}")
                 
+                # Display original image
+                st.image(image, caption="Original Image", use_container_width=True)
+                
                 # Preprocess image
                 processed_image = preprocess_image(image_np, options)
                 
@@ -189,7 +148,7 @@ def process_uploaded_files(uploaded_files, options):
 
 def create_text_downloads(all_text, individual_texts):
     """
-   download buttons for extracted text(s).
+    Create download buttons for extracted texts.
     
     Args:
         all_text (list): Combined extracted texts
@@ -230,14 +189,12 @@ def main():
     # App title and description
     st.title("Text Extraction using Tesseract OCR")
     st.markdown('## Upload multiple images or PDF files to extract text from.')
-    
-    # Display supported languages
-    available_languages = get_supported_languages()
-    st.write('Supported Languages:', ', '.join(available_languages.keys()))
-    
-    st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
+    st.write('You can use documents in the following languages\n')
+    # NOTE(review): print() writes to the server console, not the page; use st.write() to show the languages to users
+    print(get_supported_languages())
+    st.write('From the list of Tesseract Page Segmentation Modes (PSM)  on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
     st.write(""" Automatic detection works fine for most documents,\n
-    You can also Choose a different one based on your document's structure from the list.\n""")
+    You can  also Choose a different one based on your document's structure from the list.\n""")
     
     # File uploader
     uploaded_files = st.file_uploader(
@@ -246,7 +203,7 @@ def main():
         type=["png", "jpg", "jpeg", "pdf"]
     )
     
-    #OCR options
+    # Create OCR options
     options = create_sidebar_options()
     
     # Process files when uploaded
@@ -258,8 +215,9 @@ def main():
         if all_text:
             st.text_area("Extracted Text", value="\n".join(all_text), height=300)
             
-            # download buttons
+            # Create download buttons
             create_text_downloads(all_text, individual_texts)
 
+# Run main() only when this file is executed as a script, not when it is imported
 if __name__ == "__main__":
     main()