Skip to content

Commit b053b8b

Browse files
authored
Merge pull request #25 from RapidAI/develop
feat: add page_num_list params
2 parents b97232f + 31c253a commit b053b8b

File tree

10 files changed

+128
-41
lines changed

10 files changed

+128
-41
lines changed

README.md

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717

1818
本仓库依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库,快速提取PDF中文字,包括扫描版PDF、加密版PDF、可直接复制文字版PDF。
1919

20-
🔥🔥🔥 版式还原参见项目:[RapidLayoutRecover](https://github.com/RapidAI/RapidLayoutRecover)
21-
2220
### 整体流程
2321

2422
```mermaid
@@ -40,32 +38,40 @@ pip install rapidocr_pdf
4038

4139
#### 脚本使用
4240

43-
`rapidocr_pdf>=0.2.0`中,已经适配`rapidocr>=2.0.0`版本,可以通过参数来使用不同OCR推理引擎来提速。
41+
⚠️注意:`rapidocr_pdf>=0.2.0`中,已经适配`rapidocr>=2.0.0`版本,可以通过参数来使用不同OCR推理引擎来提速。
4442
下面的`ocr_params`为示例参数,详细请参见RapidOCR官方文档:[docs](https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#_4)
4543

44+
⚠️注意:在`rapidocr_pdf>=0.3.0`中,支持了`page_num_list`参数,默认为None,全部提取。**如果指定,页码从0开始**
45+
4646
```python
4747
from rapidocr_pdf import RapidOCRPDF
4848

4949
pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
5050

5151
pdf_path = "tests/test_files/direct_and_image.pdf"
52-
texts = pdf_extracter(pdf_path, force_ocr=False)
52+
53+
# page_num_list=[1]: 仅提取第2页
54+
texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[1])
5355
print(texts)
5456
```
5557

5658
#### 命令行使用
5759

5860
```bash
5961
$ rapidocr_pdf -h
60-
usage: rapidocr_pdf [-h] [-path FILE_PATH] [-f]
62+
usage: rapidocr_pdf [-h] [--dpi DPI] [-f] [--page_num_list [PAGE_NUM_LIST ...]] pdf_path
6163

62-
optional arguments:
64+
positional arguments:
65+
pdf_path
66+
67+
options:
6368
-h, --help show this help message and exit
64-
-path FILE_PATH, --file_path FILE_PATH
65-
File path, PDF or images
69+
--dpi DPI
6670
-f, --force_ocr Whether to use ocr for all pages.
71+
--page_num_list [PAGE_NUM_LIST ...]
72+
Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.
6773

68-
$ rapidocr_pdf -path tests/test_files/direct_and_image.pdf
74+
$ rapidocr_pdf tests/test_files/direct_and_image.pdf --page_num_list 1
6975
```
7076
7177
### 输入输出说明
@@ -76,9 +82,7 @@ $ rapidocr_pdf -path tests/test_files/direct_and_image.pdf
7682
7783
```python
7884
[
79-
['0', '人之初,性本善。性相近,习相远。', 0.8969868],
80-
['1', 'Men at their birth, are naturally good.', 0.8969868],
85+
[0, '人之初,性本善。性相近,习相远。', 0.8969868],
86+
[1, 'Men at their birth, are naturally good.', 0.8969868],
8187
]
8288
```
83-
84-
### [更新日志](https://github.com/RapidAI/RapidOCRPDF/releases)

demo.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
# @Contact: [email protected]
44
from rapidocr_pdf import RapidOCRPDF
55

6-
pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
6+
pdf_extracter = RapidOCRPDF()
77

88
pdf_path = "tests/test_files/direct_and_image.pdf"
9-
texts = pdf_extracter(pdf_path, force_ocr=False)
9+
texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[2])
1010
print(texts)

rapidocr_pdf/main.py

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,23 @@
1010
import numpy as np
1111
from rapidocr import RapidOCR
1212

13-
from .logger import Logger
14-
from .utils import which_type
13+
from .utils.logger import Logger
14+
from .utils.utils import error_log, which_type
15+
16+
logger = Logger(logger_name=__name__).get_log()
1517

1618

1719
class RapidOCRPDF:
1820
def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
1921
self.dpi = dpi
2022
self.ocr_engine = RapidOCR(params=ocr_params)
2123
self.empty_list = []
22-
self.logger = Logger(logger_name=__name__).get_log()
2324

2425
def __call__(
25-
self, content: Union[str, Path, bytes], force_ocr: bool = False
26+
self,
27+
content: Union[str, Path, bytes],
28+
force_ocr: bool = False,
29+
page_num_list: Optional[List[int]] = None,
2630
) -> List[List[Union[str, str, str]]]:
2731
try:
2832
file_type = which_type(content)
@@ -35,10 +39,12 @@ def __call__(
3539
try:
3640
pdf_data = self.load_pdf(content)
3741
except RapidOCRPDFError as e:
38-
self.logger.error(e)
42+
logger.error("%s\n%s", e, error_log())
3943
return self.empty_list
4044

41-
txts_dict, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr)
45+
txts_dict, need_ocr_idxs = self.extract_texts(
46+
pdf_data, force_ocr, page_num_list
47+
)
4248

4349
ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
4450

@@ -60,21 +66,41 @@ def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
6066

6167
raise RapidOCRPDFError(f"{type(pdf_content)} is not in [str, Path, bytes].")
6268

63-
def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
69+
def extract_texts(
70+
self, pdf_data: bytes, force_ocr: bool, page_num_list: Optional[List[int]]
71+
) -> Tuple[Dict, List]:
6472
texts, need_ocr_idxs = {}, []
6573
with fitz.open(stream=pdf_data) as doc:
74+
page_num_list = self.get_page_num_range(page_num_list, doc.page_count)
6675
for i, page in enumerate(doc):
76+
if page_num_list is not None and i not in page_num_list:
77+
continue
78+
6779
if force_ocr:
6880
need_ocr_idxs.append(i)
6981
continue
7082

7183
text = page.get_text("text", sort=True)
7284
if text:
73-
texts[str(i)] = text
85+
texts[i] = text
7486
else:
7587
need_ocr_idxs.append(i)
7688
return texts, need_ocr_idxs
7789

90+
@staticmethod
def get_page_num_range(
    page_num_list: Optional[List[int]], page_count: int
) -> Optional[List[int]]:
    """Validate the requested zero-based page numbers against the document.

    Args:
        page_num_list: Zero-based page indices to extract, or ``None`` to
            select every page.
        page_count: Total number of pages in the opened document.

    Returns:
        ``None`` when no selection was given, otherwise ``page_num_list``
        unchanged.

    Raises:
        RapidOCRPDFError: If the list is empty or contains an index outside
            ``[0, page_count - 1]``.
    """
    if page_num_list is None:
        return None

    # An empty selection (e.g. ``--page_num_list`` given with no values)
    # would otherwise surface as a bare ValueError from max().
    if not page_num_list:
        raise RapidOCRPDFError(
            "page_num_list is empty. Pass None to extract all pages."
        )

    # Negative indices never match an enumerated page index, so they would
    # be skipped silently; reject them together with too-large values.
    invalid = [num for num in page_num_list if not 0 <= num < page_count]
    if invalid:
        raise RapidOCRPDFError(
            f"Page numbers {invalid} in {page_num_list} are out of range: "
            f"valid values are 0 to {page_count - 1} "
            f"(total page nums: {page_count})"
        )

    return page_num_list
103+
78104
def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
79105
def convert_img(page):
80106
pix = page.get_pixmap(dpi=self.dpi)
@@ -96,7 +122,7 @@ def convert_img(page):
96122
sum(preds.scores) / len(preds.scores) if preds.scores else 0.0
97123
)
98124

99-
ocr_res[str(i)] = {
125+
ocr_res[i] = {
100126
"text": "\n".join(preds.txts),
101127
"avg_confidence": avg_score,
102128
}
@@ -121,27 +147,36 @@ class RapidOCRPDFError(Exception):
121147
pass
122148

123149

124-
def main():
150+
def parse_args(arg_list: Optional[List[str]] = None):
125151
parser = argparse.ArgumentParser()
126-
parser.add_argument(
127-
"-path", "--file_path", type=str, help="File path, PDF or images"
128-
)
152+
parser.add_argument("pdf_path", type=str)
153+
parser.add_argument("--dpi", type=int, default=200)
129154
parser.add_argument(
130155
"-f",
131156
"--force_ocr",
132157
action="store_true",
133158
default=False,
134159
help="Whether to use ocr for all pages.",
135160
)
136-
args = parser.parse_args()
161+
parser.add_argument(
162+
"--page_num_list",
163+
type=int,
164+
nargs="*",
165+
default=None,
166+
help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
167+
)
168+
args = parser.parse_args(arg_list)
169+
return args
137170

138-
pdf_extracter = RapidOCRPDF()
139171

172+
def main(arg_list: Optional[List[str]] = None):
173+
args = parse_args(arg_list)
174+
pdf_extracter = RapidOCRPDF(args.dpi)
140175
try:
141-
result = pdf_extracter(args.file_path, args.force_ocr)
176+
result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
142177
print(result)
143178
except Exception as e:
144-
print(f"[ERROR] {e}")
179+
logger.error("%s\n%s", e, error_log())
145180

146181

147182
if __name__ == "__main__":

rapidocr_pdf/utils/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# -*- encoding: utf-8 -*-
2+
# @Author: SWHL
3+
# @Contact: [email protected]
File renamed without changes.

rapidocr_pdf/utils.py renamed to rapidocr_pdf/utils/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22
# @Author: SWHL
33
# @Contact: [email protected]
44
import importlib
5+
import traceback
56
from pathlib import Path
67
from typing import Union
78

89
import filetype
910

1011

12+
def error_log():
    """Return the formatted traceback of the exception currently being handled."""
    formatted = traceback.format_exc()
    return formatted
14+
15+
1116
def import_package(name, package=None):
1217
try:
1318
module = importlib.import_module(name, package=package)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
filetype>=1.2.0
22
pymupdf
3-
rapidocr
3+
rapidocr>=2.0.7
44
colorlog

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_readme():
5656
author_email="[email protected]",
5757
url="https://github.com/RapidAI/RapidOCRPDF",
5858
license="Apache-2.0",
59-
packages=[MODULE_NAME],
59+
packages=setuptools.find_packages(),
6060
install_requires=read_txt("requirements.txt"),
6161
keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"],
6262
classifiers=[

tests/test_files/1.jpg

-48.9 KB
Binary file not shown.

tests/test_main.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,66 @@
11
# -*- encoding: utf-8 -*-
22
# @Author: SWHL
33
# @Contact: [email protected]
4+
import ast
5+
import shlex
46
import sys
57
from pathlib import Path
68

7-
root_dir = Path(__file__).resolve().parent.parent
9+
cur_dir = Path(__file__).resolve().parent
10+
root_dir = cur_dir.parent
811
sys.path.append(str(root_dir))
912

1013
import pytest
1114

1215
from rapidocr_pdf import RapidOCRPDF, RapidOCRPDFError
16+
from rapidocr_pdf.main import main
17+
18+
test_dir = cur_dir / "test_files"
19+
20+
pdf_path = test_dir / "direct_and_image.pdf"
1321

14-
test_file_dir = Path(__file__).resolve().parent / "test_files"
1522
extracter = RapidOCRPDF()
1623

1724

25+
@pytest.mark.parametrize(
    "command, expected_output",
    [
        (
            f"{pdf_path} --page_num_list 0",
            "ABCNet: Real-time Scene Text Spotting with Adaptive Bezier-Curve Network∗",
        )
    ],
)
def test_cli(capsys, command, expected_output):
    """Run the CLI entry point and compare the first extracted text line."""
    main(shlex.split(command))

    # main() prints the result list; parse it back from the captured stdout.
    printed = capsys.readouterr().out.rstrip()
    records = ast.literal_eval(printed)

    first_line = records[0][1].split("\n")[0]
    assert first_line.strip() == expected_output
39+
40+
41+
def test_page_num():
    """Extract only page index 0 and check the first line of its text."""
    records = extracter(test_dir / "direct_extract.pdf", page_num_list=[0])

    first_line = records[0][1].split("\n")[0].strip()
    assert first_line == "Defending Ukraine: Early Lessons from the Cyber War"
49+
50+
51+
def test_error_page_num():
    """A page index rejected by the extractor must raise ``RapidOCRPDFError``."""
    pdf_path = test_dir / "direct_extract.pdf"
    # The call is expected to raise, so its return value is never bound.
    with pytest.raises(RapidOCRPDFError) as exc_info:
        extracter(pdf_path, page_num_list=[1])
    # pytest.raises also accepts subclasses; pin the exact exception type.
    assert exc_info.type is RapidOCRPDFError
56+
57+
1858
@pytest.mark.parametrize(
1959
"pdf_content, result1, result2",
2060
[
21-
(test_file_dir / "direct_extract.pdf", 3214, "Defend"),
22-
(test_file_dir / "image.pdf", 3400, "Kurbas"),
23-
(test_file_dir / "direct_and_image.pdf", 3710, "ABCNet"),
61+
(test_dir / "direct_extract.pdf", 4858, " "),
62+
(test_dir / "image.pdf", 3478, "Kurbas"),
63+
(test_dir / "direct_and_image.pdf", 4848, " "),
2464
],
2565
)
2666
def test_different_pdf(pdf_content, result1, result2):
@@ -30,7 +70,7 @@ def test_different_pdf(pdf_content, result1, result2):
3070

3171

3272
def test_input_bytes():
33-
pdf_content = test_file_dir / "image.pdf"
73+
pdf_content = test_dir / "image.pdf"
3474
with open(pdf_content, "rb") as f:
3575
data = f.read()
3676

@@ -41,7 +81,7 @@ def test_input_bytes():
4181

4282

4383
def test_force_ocr():
44-
pdf_content = test_file_dir / "image.pdf"
84+
pdf_content = test_dir / "image.pdf"
4585
with open(pdf_content, "rb") as f:
4686
data = f.read()
4787

0 commit comments

Comments
 (0)