Skip to content

Commit 7d3cccb

Browse files
authored
Merge pull request #337 from meghana1090/issue334
Added EasyOCR Loader
2 parents db5a590 + 0b22e02 commit 7d3cccb

File tree

5 files changed

+1566
-28
lines changed

5 files changed

+1566
-28
lines changed

extract_thinker/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
)
3434
from .warning import filter_pydantic_v2_warnings
3535
from .document_loader.document_loader_mistral_ocr import DocumentLoaderMistralOCR, MistralOCRConfig
36+
from .document_loader.document_loader_easy_ocr import EasyOCRConfig, DocumentLoaderEasyOCR
3637
from .markdown.markdown_converter import MarkdownConverter, PageContent
3738
filter_pydantic_v2_warnings()
3839

@@ -82,6 +83,8 @@
8283
'BatchJob',
8384
'DocumentLoaderMistralOCR',
8485
'MistralOCRConfig',
86+
'EasyOCRConfig',
87+
'DocumentLoaderEasyOCR',
8588
'MarkdownConverter',
8689
'PageContent',
87-
]
90+
]
extract_thinker/document_loader/document_loader_easy_ocr.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from typing import Any, Dict, List, Union
2+
from io import BytesIO
3+
from PIL import Image
4+
import numpy as np
5+
from dataclasses import dataclass, field
6+
from cachetools import cachedmethod, TTLCache
7+
from cachetools.keys import hashkey
8+
from operator import attrgetter
9+
import easyocr
10+
11+
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
12+
13+
14+
@dataclass
class EasyOCRConfig:
    """Settings used to build the EasyOCR reader for the loader.

    Attributes:
        lang_list: Language codes handed to EasyOCR. Defaults to ['en'].
        gpu: Whether EasyOCR should use GPU acceleration. Defaults to True.
        download_enabled: Whether EasyOCR may download models automatically.
            Defaults to True.
        cache_ttl: Time-to-live, in seconds, for cached results. Defaults
            to 300.

    After the fields are validated, an ``easyocr.Reader`` built from them is
    stored on the instance as ``reader``.

    Raises:
        ValueError: If ``lang_list`` is empty or ``cache_ttl`` is not positive.
    """

    lang_list: List[str] = field(default_factory=lambda: ["en"])
    gpu: bool = True
    download_enabled: bool = True
    cache_ttl: int = 300

    def __post_init__(self):
        """Validate the settings, then construct the EasyOCR reader."""
        self._validate()

        reader_options = {
            "lang_list": self.lang_list,
            "gpu": self.gpu,
            "download_enabled": self.download_enabled,
        }
        self.reader = easyocr.Reader(**reader_options)

    def _validate(self):
        """Reject an empty language list or a non-positive cache TTL."""
        # The language list is checked first so that, when both values are
        # invalid, the language error is the one reported.
        if not self.lang_list:
            raise ValueError("lang_list must contain at least one language code.")
        if self.cache_ttl <= 0:
            raise ValueError("cache_ttl must be positive.")
41+
42+
43+
class DocumentLoaderEasyOCR(CachedDocumentLoader):
    """Document loader that runs EasyOCR over a single image.

    Accepts either a file path or an in-memory ``BytesIO`` stream, converts
    the image to RGB, and returns the recognised text fragments. Results are
    memoised in a per-instance ``TTLCache`` whose lifetime comes from
    ``config.cache_ttl``.
    """

    SUPPORTED_FORMATS = ["png", "jpg", "jpeg", "tiff", "tif", "webp"]

    def __init__(self, config: EasyOCRConfig):
        """Initialize the EasyOCR document loader.

        Args:
            config: Configuration object for EasyOCR settings. Its ``reader``
                attribute is used to run OCR and its ``cache_ttl`` sets the
                lifetime of cached results.
        """
        super().__init__()
        self.config = config
        self.cache = TTLCache(maxsize=128, ttl=self.config.cache_ttl)
        self.vision_mode = False

    def can_handle(self, source: Union[str, BytesIO]) -> bool:
        """Check if the loader can handle the given source.

        Args:
            source: Path to a file or BytesIO stream

        Returns:
            bool: True for any BytesIO stream, or for a string whose text
            after the last '.' (case-insensitive) is one of
            ``SUPPORTED_FORMATS``; False otherwise.
        """
        if isinstance(source, BytesIO):
            return True
        if isinstance(source, str) and '.' in source:
            # Extension is whatever follows the last '.', compared lowercase.
            ext = source.split('.')[-1].lower()
            return ext in self.SUPPORTED_FORMATS
        return False

    def _cache_key(self, source):
        """Build the cache key used by ``load`` for *source*.

        File paths are keyed by the path string. BytesIO streams are keyed by
        a digest of their contents so that two different in-memory images
        never share a cache entry (previously every stream mapped to the same
        ``None`` key and the first result was returned for all of them).
        Unsupported types map to ``None``; ``load`` raises for them, so
        nothing is ever stored under that key.
        """
        if isinstance(source, str):
            return hashkey(source)
        if isinstance(source, BytesIO):
            # Local import: hashlib is only needed here and keeps this block
            # independent of the module's import section.
            import hashlib

            digest = hashlib.sha256(source.getvalue()).hexdigest()
            # The extra tag keeps stream keys distinct from path keys.
            return hashkey("bytes", digest)
        return None

    @cachedmethod(cache=attrgetter('cache'), key=_cache_key)
    def load(self, source: Union[str, BytesIO]) -> List[List[Dict[str, Any]]]:
        """Load and process an image (file path or BytesIO) using EasyOCR.

        Args:
            source: Image file path or in-memory image stream (BytesIO)

        Returns:
            List of pages, where each page contains a list of OCR results.
            This loader always returns exactly one page. Each OCR result is
            a dictionary with:
                - text: The extracted text
                - probability: Confidence score
                - bbox: Bounding box coordinates

        Raises:
            ValueError: If ``source`` is neither a str nor a BytesIO.
        """
        if isinstance(source, BytesIO):
            # Rewind so a stream that was already read is decoded from the start.
            source.seek(0)
        elif not isinstance(source, str):
            raise ValueError("Unsupported source type. Expected str or BytesIO.")

        # Manage the image returned by Image.open itself, so the underlying
        # file handle is closed; convert() returns a separate in-memory copy.
        with Image.open(source) as img:
            image_array = np.array(img.convert("RGB"))

        ocr_result = self.config.reader.readtext(image_array)

        # Each EasyOCR result is unpacked as (bbox, text, probability).
        page_data = [
            {"bbox": bbox, "text": text, "probability": prob}
            for bbox, text, prob in ocr_result
        ]
        return [page_data]

    def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
        """EasyOCR currently doesn't support vision mode in this loader."""
        return False

    def set_vision_mode(self, enabled: bool = True):
        """Reject attempts to enable vision mode; disabling is a no-op.

        Raises:
            ValueError: If ``enabled`` is truthy.
        """
        if enabled:
            raise ValueError("Vision mode is not supported in EasyOCR loader.")
121+

0 commit comments

Comments
 (0)