Update EasyOCR loader (BytesIO sources, config validation, vision-mode stubs) and tests

meghana1090 · meghana1090 · commit f59efe75114b · 2025-06-04T10:53:58.000-07:00
diff --git a/extract_thinker/document_loader/document_loader_easy_ocr.py b/extract_thinker/document_loader/document_loader_easy_ocr.py
@@ -1,4 +1,5 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
+from io import BytesIO
 from PIL import Image
 import numpy as np
 from dataclasses import dataclass, field
@@ -15,19 +16,23 @@ class EasyOCRConfig:
     """Configuration for EasyOCR loader.
 
     Args:
-        lang_list: List of languages to use for OCR
-        gpu: Whether to use GPU acceleration
-        download_enabled: Whether to download models automatically
-        cache_ttl: Time-to-live for cache in seconds
+        lang_list: List of languages to use for OCR. Defaults to ['en'].
+        gpu: Whether to use GPU acceleration. Defaults to True.
+        download_enabled: Whether to download models automatically. Defaults to True.
+        cache_ttl: Time-to-live for cache in seconds. Defaults to 300.
     """
     lang_list: List[str] = field(default_factory=lambda: ['en'])
     gpu: bool = True
     download_enabled: bool = True
     cache_ttl: int = 300
 
     def __post_init__(self):
+        """Validate the configuration, then initialize the EasyOCR reader."""
         if not self.lang_list:
-            self.lang_list = ['en']
+            raise ValueError("lang_list must contain at least one language code.")
+        if self.cache_ttl < 0:
+            raise ValueError("cache_ttl must be non-negative.")
+
         self.reader = easyocr.Reader(
             lang_list=self.lang_list,
             gpu=self.gpu,
@@ -39,29 +44,63 @@ class DocumentLoaderEasyOCR(CachedDocumentLoader):
     SUPPORTED_FORMATS = ["png", "jpg", "jpeg", "tiff", "tif", "webp"]
 
     def __init__(self, config: EasyOCRConfig):
+        """Initialize the EasyOCR document loader.
+
+        Args:
+            config: Configuration object for EasyOCR settings
+        """
         super().__init__()
         self.config = config
         self.cache = TTLCache(maxsize=128, ttl=self.config.cache_ttl)
+        self.vision_mode = False
+
+    def can_handle(self, source: Union[str, BytesIO]) -> bool:
+        """Check if the loader can handle the given source.
+
+        Args:
+            source: Path to a file or BytesIO stream
 
-    def can_handle(self, source: str) -> bool:
-        if not isinstance(source, str) or '.' not in source:
-            return False
-        ext = source.split('.')[-1].lower()
-        return ext in self.SUPPORTED_FORMATS
+        Returns:
+            bool: True if source is supported, False otherwise
+        """
+        # Check if source is a BytesIO stream
+        if isinstance(source, BytesIO):
+            return True
+        # Check if source is a file path and has a valid extension
+        if isinstance(source, str) and '.' in source:
+            # Extract the file extension (after the last '.') and convert to lowercase
+            ext = source.split('.')[-1].lower()
+            return ext in self.SUPPORTED_FORMATS
+        return False
 
-    @cachedmethod(cache=attrgetter('cache'), key=lambda _, path: hashkey(path))
-    def load(self, image_path: str) -> List[List[Dict[str, Any]]]:
-        """Load and process an image using EasyOCR.
+    @cachedmethod(cache=attrgetter('cache'), key=lambda self, source: hashkey(source) if isinstance(source, str) else None)
+    def load(self, source: Union[str, BytesIO]) -> List[List[Dict[str, Any]]]:
+        """Load and process an image (file path or BytesIO) using EasyOCR.
 
         Args:
-            image_path: Path to the image file
+            source: Image file path or in-memory image stream (BytesIO)
 
         Returns:
             List of pages, where each page contains a list of OCR results.
-            Each OCR result is a dictionary with 'text', 'probability', and 'bbox' keys.
+            Each OCR result is a dictionary with:
+                - text: The extracted text
+                - probability: Confidence score
+                - bbox: Bounding box coordinates
         """
-        with Image.open(image_path).convert("RGB") as img:
-            ocr_result = self.config.reader.readtext(np.array(img))
+        # Open the image at the given file path and convert it to an RGB numpy array
+        if isinstance(source, str):
+            with Image.open(source).convert("RGB") as img:
+                image_array = np.array(img)
+        # Rewind the in-memory stream, then convert its image to an RGB numpy array
+        elif isinstance(source, BytesIO):
+            source.seek(0)
+            with Image.open(source).convert("RGB") as img:
+                image_array = np.array(img)
+        else:
+            raise ValueError("Unsupported source type. Expected str or BytesIO.")
+
+        ocr_result = self.config.reader.readtext(image_array)
+        # Loop through OCR results and structure them into a dictionary format
         page_data = []
         for bbox, text, prob in ocr_result:
             page_data.append({
@@ -70,3 +109,13 @@ def load(self, image_path: str) -> List[List[Dict[str, Any]]]:
                 "probability": prob
             })
         return [page_data]
+
+    def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
+        """EasyOCR currently doesn't support vision mode in this loader."""
+        return False
+
+    def set_vision_mode(self, enabled: bool = True):
+        """Reject attempts to enable vision mode; it is not supported by this loader."""
+        if enabled:
+            raise ValueError("Vision mode is not supported in EasyOCR loader.")
+      
diff --git a/tests/test_document_loader_easyocr.py b/tests/test_document_loader_easyocr.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+from io import BytesIO
 import numpy as np
 from extract_thinker.document_loader.document_loader_easy_ocr import DocumentLoaderEasyOCR, EasyOCRConfig
 from .test_document_loader_base import BaseDocumentLoaderTest
@@ -25,45 +26,75 @@ def test_file_path(self):
         return os.path.join(current_dir, "test_images", "invoice.png")
 
     def test_load_content(self, loader, test_file_path):
+        """Test that the loader can process an image file and return OCR results
+        in the expected structure."""
         content = loader.load(test_file_path)
         assert isinstance(content, list) and len(content) > 0
         for page in content:
+            # Each page should be a list of OCR results
             assert isinstance(page, list)
             for item in page:
+                # Each OCR result should be a dictionary
                 assert isinstance(item, dict)
                 assert all(key in item for key in ['text', 'probability', 'bbox'])
                 assert isinstance(item['text'], str)
                 assert isinstance(item['probability'], (float, np.float64))
                 assert isinstance(item['bbox'], (list, tuple))
 
-    def test_can_handle_formats(self, loader, tmp_path):
-        for fmt in loader.SUPPORTED_FORMATS:
-            test_file = tmp_path / f"test.{fmt}"
-            test_file.touch()
-            assert loader.can_handle(str(test_file))
+    def test_load_from_bytesio(self, loader, test_file_path):
+        """Tests that the loader can process an image provided as a BytesIO stream."""
+        with open(test_file_path, "rb") as f:
+            image_bytes = BytesIO(f.read())
+        content = loader.load(image_bytes)
+        assert isinstance(content, list) and len(content) > 0
+
+    def test_can_handle(self, loader, tmp_path):
+        """Tests that the loader correctly identifies supported and unsupported file formats"""
+        # Supported extensions
+        for ext in loader.SUPPORTED_FORMATS:
+            f = tmp_path / f"file.{ext}"
+            f.touch()
+            assert loader.can_handle(str(f))
+        # Unsupported extension
+        assert not loader.can_handle(str(tmp_path / "file.abc"))
+        # Missing extension
+        assert not loader.can_handle(str(tmp_path / "file"))
+        # BytesIO stream
+        assert loader.can_handle(BytesIO(b"data"))
 
-        bad_file = tmp_path / "test.xyz"
-        bad_file.touch()
-        assert not loader.can_handle(str(bad_file))
+    def test_vision_mode(self, loader):
+        """Test that vision mode is not supported"""
+        # Vision mode should be disabled by default
+        assert loader.vision_mode is False
+        
+        # Attempting to enable vision mode should raise an error
+        with pytest.raises(ValueError, match="Vision mode is not supported"):
+            loader.set_vision_mode(True)
+        
+        # Vision mode should still be False after failed attempt
+        assert loader.vision_mode is False
+        
+        # can_handle_vision should always return False
+        assert loader.can_handle_vision("test.txt") is False
 
     def test_language_configuration(self, test_file_path):
+        """Test that the loader works with English only and with English plus Spanish."""
         loader = DocumentLoaderEasyOCR(EasyOCRConfig(lang_list=['en']))
         pages = loader.load(test_file_path)
         assert len(pages) > 0
-
+        """Test that the loader can handle multiple languages (English and Spanish)."""
         loader = DocumentLoaderEasyOCR(EasyOCRConfig(lang_list=['en', 'es']))
         pages = loader.load(test_file_path)
         assert len(pages) > 0
 
-    def test_simple_initialization_easyocr(self):
-        config = EasyOCRConfig(lang_list=["en"])
-        loader = DocumentLoaderEasyOCR(config)
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        test_file = os.path.join(current_dir, "test_images", "invoice.png")
-        pages = loader.load(test_file)
-        assert isinstance(pages, list)
-        assert len(pages) > 0
-        assert isinstance(pages[0], list)
-        assert isinstance(pages[0][0], dict)
-        assert "text" in pages[0][0]
-        assert isinstance(pages[0][0]["text"], str)
+    def test_easyocr_config_validation(self):
+        """Test EasyOCRConfig validation"""
+        # raise error if lang_list is empty
+        with pytest.raises(ValueError, match="lang_list must contain at least one"):
+            EasyOCRConfig(lang_list=[])
+        # raise error if cache_ttl is negative
+        with pytest.raises(ValueError, match="cache_ttl must be non-negative"):
+            EasyOCRConfig(cache_ttl=-1)
+
+
+