Skip to content

Commit bd03a51

Browse files
authored
Merge pull request #2372 from arc53/fast-ebook
feat: faster ebook parsing
2 parents e787c89 + fcdb4fb commit bd03a51

File tree

3 files changed: 31 additions and 131 deletions.

3 files changed: 31 additions and 131 deletions.

application/parser/file/epub_parser.py

Lines changed: 4 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -19,25 +19,10 @@ def _init_parser(self) -> Dict:
1919
def parse_file(self, file: Path, errors: str = "ignore") -> str:
2020
"""Parse file."""
2121
try:
22-
import ebooklib
23-
from ebooklib import epub
22+
from fast_ebook import epub
2423
except ImportError:
25-
raise ValueError("`EbookLib` is required to read Epub files.")
26-
try:
27-
import html2text
28-
except ImportError:
29-
raise ValueError("`html2text` is required to parse Epub files.")
30-
31-
text_list = []
32-
book = epub.read_epub(file, options={"ignore_ncx": True})
33-
34-
# Iterate through all chapters.
35-
for item in book.get_items():
36-
# Chapters are typically located in epub documents items.
37-
if item.get_type() == ebooklib.ITEM_DOCUMENT:
38-
text_list.append(
39-
html2text.html2text(item.get_content().decode("utf-8"))
40-
)
24+
raise ValueError("`fast-ebook` is required to read Epub files.")
4125

42-
text = "\n".join(text_list)
26+
book = epub.read_epub(file)
27+
text = book.to_markdown()
4328
return text

application/requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ rapidocr>=1.4.0
1111
onnxruntime>=1.19.0
1212
docx2txt==0.9
1313
ddgs>=8.0.0
14-
ebooklib==0.20
14+
fast-ebook
1515
elevenlabs==2.41.0
1616
Flask==3.1.3
1717
faiss-cpu==1.13.2
@@ -23,7 +23,6 @@ google-auth-httplib2==0.3.1
2323
google-auth-oauthlib==1.3.1
2424
gTTS==2.5.4
2525
gunicorn==25.3.0
26-
html2text==2025.4.15
2726
jinja2==3.1.6
2827
jiter==0.13.0
2928
jmespath==1.1.0

tests/parser/file/test_epub_parser.py

Lines changed: 26 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -20,133 +20,49 @@ def test_epub_init_parser():
2020
assert parser.parser_config_set
2121

2222

23-
def test_epub_parser_ebooklib_import_error(epub_parser):
24-
"""Test that ImportError is raised when ebooklib is not available."""
25-
with patch.dict(sys.modules, {"ebooklib": None}):
26-
with pytest.raises(ValueError, match="`EbookLib` is required to read Epub files"):
23+
def test_epub_parser_fast_ebook_import_error(epub_parser):
24+
"""Test that ImportError is raised when fast-ebook is not available."""
25+
with patch.dict(sys.modules, {"fast_ebook": None}):
26+
with pytest.raises(ValueError, match="`fast-ebook` is required to read Epub files"):
2727
epub_parser.parse_file(Path("test.epub"))
2828

2929

30-
def test_epub_parser_html2text_import_error(epub_parser):
31-
"""Test that ImportError is raised when html2text is not available."""
32-
fake_ebooklib = types.ModuleType("ebooklib")
33-
fake_epub = types.ModuleType("ebooklib.epub")
34-
fake_ebooklib.epub = fake_epub
35-
36-
with patch.dict(sys.modules, {"ebooklib": fake_ebooklib, "ebooklib.epub": fake_epub}):
37-
with patch.dict(sys.modules, {"html2text": None}):
38-
with pytest.raises(ValueError, match="`html2text` is required to parse Epub files"):
39-
epub_parser.parse_file(Path("test.epub"))
40-
41-
4230
def test_epub_parser_successful_parsing(epub_parser):
4331
"""Test successful parsing of an epub file."""
32+
fake_fast_ebook = types.ModuleType("fast_ebook")
33+
fake_epub = types.ModuleType("fast_ebook.epub")
34+
fake_fast_ebook.epub = fake_epub
4435

45-
fake_ebooklib = types.ModuleType("ebooklib")
46-
fake_epub = types.ModuleType("ebooklib.epub")
47-
fake_html2text = types.ModuleType("html2text")
48-
49-
# Mock ebooklib constants
50-
fake_ebooklib.ITEM_DOCUMENT = "document"
51-
fake_ebooklib.epub = fake_epub
52-
53-
mock_item1 = MagicMock()
54-
mock_item1.get_type.return_value = "document"
55-
mock_item1.get_content.return_value = b"<h1>Chapter 1</h1><p>Content 1</p>"
56-
57-
mock_item2 = MagicMock()
58-
mock_item2.get_type.return_value = "document"
59-
mock_item2.get_content.return_value = b"<h1>Chapter 2</h1><p>Content 2</p>"
60-
61-
mock_item3 = MagicMock()
62-
mock_item3.get_type.return_value = "other" # Should be ignored
63-
mock_item3.get_content.return_value = b"<p>Other content</p>"
64-
6536
mock_book = MagicMock()
66-
mock_book.get_items.return_value = [mock_item1, mock_item2, mock_item3]
67-
37+
mock_book.to_markdown.return_value = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
38+
6839
fake_epub.read_epub = MagicMock(return_value=mock_book)
69-
70-
def mock_html2text_func(html_content):
71-
if "Chapter 1" in html_content:
72-
return "# Chapter 1\n\nContent 1\n"
73-
elif "Chapter 2" in html_content:
74-
return "# Chapter 2\n\nContent 2\n"
75-
return "Other content\n"
76-
77-
fake_html2text.html2text = mock_html2text_func
78-
40+
7941
with patch.dict(sys.modules, {
80-
"ebooklib": fake_ebooklib,
81-
"ebooklib.epub": fake_epub,
82-
"html2text": fake_html2text
42+
"fast_ebook": fake_fast_ebook,
43+
"fast_ebook.epub": fake_epub,
8344
}):
8445
result = epub_parser.parse_file(Path("test.epub"))
85-
86-
expected_result = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
87-
assert result == expected_result
88-
89-
# Verify epub.read_epub was called with correct parameters
90-
fake_epub.read_epub.assert_called_once_with(Path("test.epub"), options={"ignore_ncx": True})
46+
47+
assert result == "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
48+
fake_epub.read_epub.assert_called_once_with(Path("test.epub"))
9149

9250

9351
def test_epub_parser_empty_book(epub_parser):
94-
"""Test parsing an epub file with no document items."""
95-
# Create mock modules
96-
fake_ebooklib = types.ModuleType("ebooklib")
97-
fake_epub = types.ModuleType("ebooklib.epub")
98-
fake_html2text = types.ModuleType("html2text")
99-
100-
fake_ebooklib.ITEM_DOCUMENT = "document"
101-
fake_ebooklib.epub = fake_epub
102-
103-
# Create mock book with no document items
52+
"""Test parsing an epub file with no content."""
53+
fake_fast_ebook = types.ModuleType("fast_ebook")
54+
fake_epub = types.ModuleType("fast_ebook.epub")
55+
fake_fast_ebook.epub = fake_epub
56+
10457
mock_book = MagicMock()
105-
mock_book.get_items.return_value = []
106-
58+
mock_book.to_markdown.return_value = ""
59+
10760
fake_epub.read_epub = MagicMock(return_value=mock_book)
108-
fake_html2text.html2text = MagicMock()
109-
61+
11062
with patch.dict(sys.modules, {
111-
"ebooklib": fake_ebooklib,
112-
"ebooklib.epub": fake_epub,
113-
"html2text": fake_html2text
63+
"fast_ebook": fake_fast_ebook,
64+
"fast_ebook.epub": fake_epub,
11465
}):
11566
result = epub_parser.parse_file(Path("empty.epub"))
116-
assert result == ""
11767

118-
fake_html2text.html2text.assert_not_called()
119-
120-
121-
def test_epub_parser_non_document_items_ignored(epub_parser):
122-
"""Test that non-document items are ignored during parsing."""
123-
fake_ebooklib = types.ModuleType("ebooklib")
124-
fake_epub = types.ModuleType("ebooklib.epub")
125-
fake_html2text = types.ModuleType("html2text")
126-
127-
fake_ebooklib.ITEM_DOCUMENT = "document"
128-
fake_ebooklib.epub = fake_epub
129-
130-
mock_doc_item = MagicMock()
131-
mock_doc_item.get_type.return_value = "document"
132-
mock_doc_item.get_content.return_value = b"<p>Document content</p>"
133-
134-
mock_other_item = MagicMock()
135-
mock_other_item.get_type.return_value = "image" # Not a document
136-
137-
mock_book = MagicMock()
138-
mock_book.get_items.return_value = [mock_other_item, mock_doc_item]
139-
140-
fake_epub.read_epub = MagicMock(return_value=mock_book)
141-
fake_html2text.html2text = MagicMock(return_value="Document content\n")
142-
143-
with patch.dict(sys.modules, {
144-
"ebooklib": fake_ebooklib,
145-
"ebooklib.epub": fake_epub,
146-
"html2text": fake_html2text
147-
}):
148-
result = epub_parser.parse_file(Path("test.epub"))
149-
150-
assert result == "Document content\n"
151-
152-
fake_html2text.html2text.assert_called_once_with("<p>Document content</p>")
68+
assert result == ""

0 commit comments

Comments
 (0)