Skip to content

Commit 7775559

Browse files
committed
feat: use paddle ocr v4 instead
1 parent 2b6cbee commit 7775559

File tree

4 files changed

+85
-148
lines changed

4 files changed

+85
-148
lines changed

docker-compose.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,6 @@ services:
129129
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
130130
- MINIO_USE_SSL=${MINIO_USE_SSL}
131131
- WEB_PROXY=${WEB_PROXY}
132-
- GRPC_ENABLE_FORK_SUPPORT=1
133-
- GRPC_WORKER_PROCESSES=1
134-
- GRPC_MAX_WORKERS=4
135132
networks:
136133
- WeKnora-network
137134
restart: unless-stopped

services/docreader/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ urllib3
1313
markdownify
1414
mistletoe
1515
goose3[all]
16-
paddleocr==3.2.0
16+
paddleocr>=2.10.0,<3.0.0
1717
markdown
1818
pypdf
1919
cos-python-sdk-v5
@@ -25,7 +25,7 @@ ollama
2525
pdfplumber
2626

2727
--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
28-
paddlepaddle==3.2.0
28+
paddlepaddle>=3.0.0,<4.0.0
2929

3030
# --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
3131
# paddlepaddle-gpu==3.0.0

services/docreader/src/parser/ocr_engine.py

Lines changed: 51 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,40 @@ def __init__(self, **kwargs):
3333
"""Initialize PaddleOCR backend"""
3434
self.ocr = None
3535
try:
36+
import os
37+
import paddle
38+
39+
# Set PaddlePaddle to use CPU and disable GPU
40+
os.environ['CUDA_VISIBLE_DEVICES'] = ''
41+
paddle.set_device('cpu')
42+
3643
from paddleocr import PaddleOCR
37-
# Default OCR configuration
44+
# Simplified OCR configuration
3845
ocr_config = {
39-
"text_det_limit_type": "max", # Change from 'min' to 'max'
40-
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
41-
"use_doc_orientation_classify": False, # Do not use document image orientation classification
42-
"use_doc_unwarping": False, # Do not use document unwarping
43-
"use_textline_orientation": False, # Do not use textline orientation classification
44-
"text_recognition_model_name": "PP-OCRv5_server_rec",
45-
"text_detection_model_name": "PP-OCRv5_server_det",
46-
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
47-
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
48-
"text_det_thresh": 0.3, # Text detection pixel threshold
49-
"text_det_box_thresh": 0.6, # Text detection box threshold
50-
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
51-
"text_rec_score_thresh": 0.0, # Text recognition confidence threshold
52-
"ocr_version": "PP-OCRv5", # Switch to PP-OCRv4 here to compare
46+
"use_gpu": False,
47+
"text_det_limit_type": "max",
48+
"text_det_limit_side_len": 960,
49+
"use_doc_orientation_classify": False,
50+
"use_doc_unwarping": False,
51+
"use_textline_orientation": False,
52+
"text_recognition_model_name": "PP-OCRv4_server_rec",
53+
"text_detection_model_name": "PP-OCRv4_server_det",
54+
"text_det_thresh": 0.3,
55+
"text_det_box_thresh": 0.6,
56+
"text_det_unclip_ratio": 1.5,
57+
"text_rec_score_thresh": 0.0,
58+
"ocr_version": "PP-OCRv4",
5359
"lang": "ch",
60+
"show_log": False,
61+
"use_dilation": True, # improves accuracy
62+
"det_db_score_mode": "slow", # improves accuracy
5463
}
5564

5665
self.ocr = PaddleOCR(**ocr_config)
5766
logger.info("PaddleOCR engine initialized successfully")
58-
except ImportError:
59-
logger.error("Failed to import paddleocr. Please install it with 'pip install paddleocr'")
67+
68+
except ImportError as e:
69+
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
6070
except Exception as e:
6171
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
6272

@@ -71,50 +81,39 @@ def predict(self, image):
7181
"""
7282
try:
7383
# Ensure image is in RGB format
74-
if hasattr(image, "convert"):
75-
if image.mode == "RGBA":
76-
img_for_ocr = image.convert("RGB") # Try converting to RGB
77-
logger.info(f"Converted image from RGBA to RGB format for OCR.")
78-
elif image.mode != "RGB": # If it is neither RGBA nor RGB, also try converting to RGB
79-
img_for_ocr = image.convert("RGB")
80-
logger.info(f"Converted image from {image.mode} to RGB format for OCR.")
81-
else:
82-
img_for_ocr = image
83-
logger.info(f"Image already in RGB format.")
84-
else:
85-
img_for_ocr = image
86-
logger.info(f"Image is not a PIL.Image object, assuming it's already suitable for OCR.")
84+
if hasattr(image, "convert") and image.mode != "RGB":
85+
image = image.convert("RGB")
8786

88-
# Convert to numpy array if not already
89-
if hasattr(img_for_ocr, "convert"):
90-
image_array = np.array(img_for_ocr)
87+
# Convert to numpy array if needed
88+
if hasattr(image, "convert"):
89+
image_array = np.array(image)
9190
else:
92-
image_array = img_for_ocr
91+
image_array = image
9392

94-
ocr_result = self.ocr.predict(image_array)
93+
# Perform OCR
94+
ocr_result = self.ocr.ocr(image_array, cls=False)
9595

9696
# Extract text
97-
if ocr_result and any(ocr_result):
98-
ocr_text = ""
99-
for image_result in ocr_result:
100-
ocr_text = ocr_text + " ".join(image_result["rec_texts"])
101-
text_length = len(ocr_text)
102-
if text_length > 0:
103-
logger.info(f"OCR extracted {text_length} characters")
104-
logger.info(
105-
f"OCR text sample: {ocr_text[:100]}..."
106-
if text_length > 100
107-
else f"OCR text: {ocr_text}"
108-
)
109-
return ocr_text
110-
else:
111-
logger.warning("OCR returned empty result")
97+
ocr_text = ""
98+
if ocr_result and ocr_result[0]:
99+
for line in ocr_result[0]:
100+
if line and len(line) >= 2:
101+
text = line[1][0] if line[1] else ""
102+
if text:
103+
ocr_text += text + " "
104+
105+
text_length = len(ocr_text.strip())
106+
if text_length > 0:
107+
logger.info(f"OCR extracted {text_length} characters")
108+
return ocr_text.strip()
112109
else:
113-
logger.warning("OCR did not return any result")
114-
return ""
110+
logger.warning("OCR returned empty result")
111+
return ""
112+
115113
except Exception as e:
116114
logger.error(f"OCR recognition error: {str(e)}")
117115
return ""
116+
118117
class NanonetsOCRBackend(OCRBackend):
119118
"""Nanonets OCR backend implementation using OpenAI API format"""
120119

services/docreader/src/server/server.py

Lines changed: 32 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
import traceback
66
import grpc
77
import uuid
8-
9-
# Enable gRPC fork support to avoid multiprocessing issues
10-
os.environ.setdefault('GRPC_ENABLE_FORK_SUPPORT', '1')
8+
import atexit
119

1210
# Add parent directory to Python path
1311
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -327,104 +325,47 @@ def init_ocr_engine(ocr_backend, ocr_config):
327325
logger.error(f"Error initializing OCR engine: {str(e)}")
328326
return False
329327

328+
330329
def serve():
330+
331331
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
332332
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
333333
})
334-
# Set max number of worker threads and processes
334+
335+
# Set max number of worker threads
335336
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
336-
# Force single process mode to avoid gRPC multiprocessing issues
337-
worker_processes = 1
338-
logger.info(f"Starting DocReader service, max worker threads per process: {max_workers}, "
339-
f"processes: {worker_processes} (forced single process mode)")
337+
logger.info(f"Starting DocReader service with {max_workers} worker threads")
340338

341339
# Get port number
342340
port = os.environ.get("GRPC_PORT", "50051")
343341

344-
# Multi-process mode (disabled due to gRPC fork issues)
345-
if False and worker_processes > 1:
346-
import multiprocessing
347-
processes = []
348-
349-
def run_server():
350-
# Create server
351-
server = grpc.server(
352-
futures.ThreadPoolExecutor(max_workers=max_workers),
353-
options=[
354-
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
355-
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
356-
('grpc.enable_fork_support', 1),
357-
('grpc.so_reuseport', 1),
358-
],
359-
)
360-
361-
# Register service
362-
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
363-
364-
# Set listen address
365-
server.add_insecure_port(f"[::]:{port}")
366-
367-
# Start service
368-
server.start()
369-
370-
logger.info(f"Worker process {os.getpid()} started on port {port}")
371-
372-
try:
373-
# Wait for service termination
374-
server.wait_for_termination()
375-
except KeyboardInterrupt:
376-
logger.info(f"Worker process {os.getpid()} received termination signal")
377-
server.stop(0)
378-
379-
# Start specified number of worker processes
380-
for i in range(worker_processes):
381-
process = multiprocessing.Process(target=run_server)
382-
processes.append(process)
383-
process.start()
384-
logger.info(f"Started worker process {process.pid} ({i+1}/{worker_processes})")
385-
386-
# Wait for all processes to complete
387-
try:
388-
for process in processes:
389-
process.join()
390-
except KeyboardInterrupt:
391-
logger.info("Master process received termination signal")
392-
for process in processes:
393-
if process.is_alive():
394-
logger.info(f"Terminating worker process {process.pid}")
395-
process.terminate()
342+
# Create server
343+
server = grpc.server(
344+
futures.ThreadPoolExecutor(max_workers=max_workers),
345+
options=[
346+
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
347+
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
348+
],
349+
)
396350

397-
# Single-process mode
398-
else:
399-
# Create server
400-
server = grpc.server(
401-
futures.ThreadPoolExecutor(max_workers=max_workers),
402-
options=[
403-
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
404-
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
405-
('grpc.enable_fork_support', 1),
406-
('grpc.so_reuseport', 1),
407-
],
408-
)
409-
410-
# Register service
411-
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
412-
413-
# Set listen address
414-
server.add_insecure_port(f"[::]:{port}")
415-
416-
# Start service
417-
server.start()
418-
419-
logger.info(f"Server started on port {port} (single process mode)")
420-
logger.info("Server is ready to accept connections")
421-
422-
try:
423-
# Wait for service termination
424-
server.wait_for_termination()
425-
except KeyboardInterrupt:
426-
logger.info("Received termination signal, shutting down server")
427-
server.stop(0)
351+
# Register service
352+
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
353+
354+
# Set listen address
355+
server.add_insecure_port(f"[::]:{port}")
356+
357+
# Start service
358+
server.start()
359+
360+
logger.info(f"Server started on port {port}")
361+
logger.info("Server is ready to accept connections")
362+
363+
try:
364+
# Wait for service termination
365+
server.wait_for_termination()
366+
except KeyboardInterrupt:
367+
logger.info("Received termination signal, shutting down server")
368+
server.stop(0)
428369

429370
if __name__ == "__main__":
430371
serve()

0 commit comments

Comments
 (0)