Skip to content

Commit 341a182

Browse files
refactor: streamline JSON conversion functions and improve error handling for unsupported file types
1 parent 2369b93 commit 341a182

File tree

2 files changed

+78
-138
lines changed

2 files changed

+78
-138
lines changed

audiobook/main.py

+1 -12
Original file line number | Diff line number | Diff line change
@@ -6,17 +6,9 @@
66

77
from audiobook.config import speed_dict
88
from audiobook.utils import (
9-
docs_to_json,
10-
epub_to_json,
11-
html_to_json,
129
load_json,
13-
mobi_to_json,
14-
odt_to_json,
15-
pdf_to_json,
1610
speak_text,
17-
txt_to_json,
1811
write_json_file,
19-
rtf_to_json
2012
)
2113
from audiobook.utils import get_json_metadata
2214

@@ -73,10 +65,7 @@ def create_json_book(self, input_book_path, password=None, load_from_library=Fal
7365
metadata["pages"] = len(json_book)
7466
return json_book, metadata
7567

76-
if json_book:
77-
json_book, metadata = get_json_metadata(input_book_path=input_book_path, password=password)
78-
else:
79-
raise NotImplementedError("Only PDF, TXT, EPUB, MOBI, ODT, HTTP, RTF, DOCX and DOC files are supported")
68+
json_book, metadata = get_json_metadata(input_book_path=input_book_path, password=password)
8069

8170
write_json_file(json_book, output_file_path)
8271

audiobook/utils.py

+77 -126
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import json
22
import os
33
import re
4-
54
import docx2txt
65
import ebooklib
76
import html2text
@@ -11,235 +10,187 @@
1110
from odf import text, teletype
1211
from odf.opendocument import load
1312
from striprtf.striprtf import rtf_to_text
14-
1513
from audiobook.doc_parser.web_parser import ArticleWebScraper
1614
from audiobook.doc_parser.pdf_parser import PyPDF2DocParser
17-
# from audiobook.doc_parser.pdf_parser import PdfMinerDocParser
18-
1915

16+
# Helper function to load JSON data from a file
2017
def load_json(filename):
2118
with open(filename, "r") as fp:
2219
return json.load(fp)
2320

24-
21+
# Helper function to write JSON data to a file
2522
def write_json_file(json_data, filename):
2623
with open(filename, "w") as fp:
2724
json.dump(json_data, fp)
2825

29-
26+
# Text preprocessing: removes unwanted characters and extra spaces
3027
def text_preprocessing(input_text):
31-
"""function to preprocess text"""
32-
regex = re.compile(r"[\n\r\t]")
33-
preprocessed_text = regex.sub("", input_text)
34-
preprocessed_text = re.sub(" +", " ", preprocessed_text)
35-
preprocessed_text = preprocessed_text.strip()
28+
preprocessed_text = re.sub(r"[\n\r\t]", "", input_text)
29+
preprocessed_text = re.sub(" +", " ", preprocessed_text).strip()
3630
return preprocessed_text
3731

38-
32+
# Extract text content from HTML, preprocess it
3933
def response_to_text(chapter):
40-
"""fuction to convert response to text
41-
42-
required for epub files
43-
maybe required for html files
44-
"""
4534
soup = BeautifulSoup(chapter, "html.parser")
46-
extracted_text = [para.get_text() for para in soup.find_all("p")]
47-
extracted_text = " ".join(extracted_text)
48-
preprocessed_text = text_preprocessing(extracted_text)
49-
return preprocessed_text
50-
35+
extracted_text = " ".join([para.get_text() for para in soup.find_all("p")])
36+
return text_preprocessing(extracted_text)
5137

38+
# Speak the given text using the engine
5239
def speak_text(engine, text, display=True):
53-
"""function to speak text and display it"""
5440
if display:
5541
print(text)
5642
engine.say(text)
5743
engine.runAndWait()
5844

59-
45+
# Helper function to convert mobi files to JSON format
6046
def mobi_to_json(input_book_path):
61-
"""sub method to create json book from mobi file"""
6247
metadata = {}
6348
json_book = {}
6449
book_name = os.path.basename(input_book_path).split(".")[0]
6550
tempdir, filepath = mobi.extract(input_book_path)
51+
6652
with open(filepath, "r", encoding="utf-8") as fp:
6753
content = fp.read()
54+
6855
book_data = html2text.html2text(content)
6956
book_data = text_preprocessing(book_data)
7057

71-
for i in range(0, len(book_data), 2000):
72-
page_num = i // 2000
73-
json_book[str(page_num)] = book_data[i: i + 2000]
74-
58+
# Split content into chunks of 2000 characters
59+
json_book = {str(i // 2000): book_data[i:i + 2000] for i in range(0, len(book_data), 2000)}
60+
7561
metadata["pages"] = len(json_book)
7662
metadata["book_name"] = book_name
7763
return json_book, metadata
7864

79-
65+
# Helper function to convert PDF to JSON format
8066
def pdf_to_json(input_book_path, password=None):
81-
"""sub method to create json book from pdf file"""
82-
metadata = {}
8367
json_book = {}
68+
metadata = {}
8469
basename = os.path.basename(input_book_path).split(".")[0]
8570

86-
# removed pdf parser selection(only pydf2 is supported now)
87-
# if extraction_engine is None or extraction_engine == "pdfminer":
88-
# print("Using pdfminer")
89-
# pdf_parser = PdfMinerDocParser()
90-
# elif extraction_engine == "pypdf2":
91-
# print("Using pypdf2")
92-
# pdf_parser = PyPDF2DocParser()
93-
# else:
94-
# raise NotImplementedError("Only pdfminer and pypdf2 are supported")
95-
9671
pdf_parser = PyPDF2DocParser()
9772
text = pdf_parser.get_text(input_book_path, password=password)
9873
text = text_preprocessing(text)
9974

100-
for i in range(0, len(text), 2000):
101-
page_num = i // 2000
102-
json_book[str(page_num)] = text[i: i + 2000]
103-
104-
metadata['book_name'] = basename
105-
metadata['pages'] = len(json_book)
75+
json_book = {str(i // 2000): text[i:i + 2000] for i in range(0, len(text), 2000)}
76+
77+
metadata["book_name"] = basename
78+
metadata["pages"] = len(json_book)
10679
return json_book, metadata
10780

108-
81+
# Helper function to convert ODT files to JSON format
10982
def odt_to_json(input_book_path):
110-
"""sub method to create json book from odt file"""
111-
metadata = {}
11283
json_book = {}
84+
metadata = {}
11385
basename = os.path.basename(input_book_path).split(".")[0]
11486

11587
textdoc = load(input_book_path)
116-
allparas = textdoc.getElementsByType(text.P)
117-
output_text = ""
118-
for i in range(len(allparas)):
119-
output_text += " " + teletype.extractText(allparas[i])
88+
output_text = " ".join([teletype.extractText(para) for para in textdoc.getElementsByType(text.P)])
12089
output_text = text_preprocessing(output_text)
12190

122-
for i in range(0, len(output_text), 2000):
123-
page_num = i // 2000
124-
json_book[str(page_num)] = output_text[i: i + 2000]
125-
126-
metadata['book_name'] = basename
127-
metadata['pages'] = len(json_book)
128-
91+
json_book = {str(i // 2000): output_text[i:i + 2000] for i in range(0, len(output_text), 2000)}
92+
93+
metadata["book_name"] = basename
94+
metadata["pages"] = len(json_book)
12995
return json_book, metadata
13096

131-
97+
# Helper function to convert TXT files to JSON format
13298
def txt_to_json(input_book_path):
133-
"""sub method to create json book from txt file"""
13499
json_book = {}
135100
metadata = {}
136101
book_name = os.path.basename(input_book_path).split(".")[0]
102+
137103
with open(input_book_path, "r") as fp:
138104
file_txt_data = fp.read()
105+
139106
file_txt_data = text_preprocessing(file_txt_data)
140-
141-
for i in range(0, len(file_txt_data), 2000):
142-
page_num = i // 2000
143-
json_book[str(page_num)] = file_txt_data[i: i + 2000]
144-
107+
json_book = {str(i // 2000): file_txt_data[i:i + 2000] for i in range(0, len(file_txt_data), 2000)}
108+
145109
metadata["pages"] = len(json_book)
146110
metadata["book_name"] = book_name
147111
return json_book, metadata
148112

149-
113+
# Helper function to convert RTF files to JSON format
150114
def rtf_to_json(input_book_path):
151-
"""sub method to create json book from rtf file"""
152115
json_book = {}
153116
metadata = {}
154117
book_name = os.path.basename(input_book_path).split(".")[0]
118+
155119
with open(input_book_path, "r") as fp:
156120
file_rtf_data = fp.read()
157-
file_txt_data = rtf_to_text(file_rtf_data , errors="ignore")
121+
122+
file_txt_data = rtf_to_text(file_rtf_data, errors="ignore")
158123
file_txt_data = text_preprocessing(file_txt_data)
159124

160-
for i in range(0, len(file_txt_data), 2000):
161-
page_num = i // 2000
162-
json_book[str(page_num)] = file_txt_data[i: i + 2000]
163-
125+
json_book = {str(i // 2000): file_txt_data[i:i + 2000] for i in range(0, len(file_txt_data), 2000)}
126+
164127
metadata["pages"] = len(json_book)
165128
metadata["book_name"] = book_name
166129
return json_book, metadata
167130

168-
131+
# Helper function to convert DOCX files to JSON format
169132
def docs_to_json(input_book_path):
170-
"""sub method to create json book from docs file"""
171-
metadata = {}
172133
json_book = {}
134+
metadata = {}
173135
book_name = os.path.basename(input_book_path).split(".")[0]
136+
174137
book_data = docx2txt.process(input_book_path)
175-
for i in range(0, len(book_data), 2000):
176-
page_num = i // 2000
177-
json_book[str(page_num)] = book_data[i: i + 2000]
178-
138+
json_book = {str(i // 2000): book_data[i:i + 2000] for i in range(0, len(book_data), 2000)}
139+
179140
metadata["pages"] = len(json_book)
180141
metadata["book_name"] = book_name
181142
return json_book, metadata
182143

183-
144+
# Helper function to convert EPUB files to JSON format
184145
def epub_to_json(input_book_path):
185-
metadata = {}
186146
json_book = {}
147+
metadata = {}
187148
book_name = os.path.basename(input_book_path).split(".")[0]
149+
188150
book = epub.read_epub(input_book_path)
189-
text = " ".join(
190-
[
191-
response_to_text(chapter.get_body_content())
192-
for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
193-
]
194-
)
195-
for i in range(1, len(text) + 1, 2000):
196-
page_num = i // 2000
197-
json_book[str(page_num)] = text[i: i + 2000]
198-
151+
text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
152+
153+
json_book = {str(i // 2000): text[i:i + 2000] for i in range(1, len(text) + 1, 2000)}
154+
199155
metadata["pages"] = len(json_book)
200156
metadata["book_name"] = book_name
201157
return json_book, metadata
202158

203-
159+
# Helper function to convert HTML (web) content to JSON format
204160
def html_to_json(url):
205-
"""method to create json book from web article"""
206161
metadata = {}
207162
json_book = {}
208163
book_name = os.path.basename(url).split(".")[0]
164+
209165
article_scraper = ArticleWebScraper(url)
210166
page_data = article_scraper.get_page_data()
211167
page_data = text_preprocessing(page_data)
212-
for i in range(0, len(page_data), 2000):
213-
page_num = i // 2000
214-
json_book[str(page_num)] = page_data[i: i + 2000]
215-
168+
169+
json_book = {str(i // 2000): page_data[i:i + 2000] for i in range(0, len(page_data), 2000)}
170+
216171
metadata["pages"] = len(json_book)
217172
metadata["book_name"] = book_name
218173
return json_book, metadata
219174

220-
221-
def get_json_metadata(input_book_path, password):
222-
""" helper function to call the function based on the file type """
223-
# get the file extension
224-
json_book = {}
225-
metadata = {}
226-
file_extension = input_book_path.split(".")[-1]
227-
228-
if file_extension == "odt":
229-
json_book, metadata = odt_to_json(input_book_path)
230-
elif file_extension == "pdf":
231-
json_book, metadata = pdf_to_json(input_book_path, password)
232-
elif file_extension == "txt":
233-
json_book, metadata = txt_to_json(input_book_path)
234-
elif file_extension == "epub":
235-
json_book, metadata = epub_to_json(input_book_path)
236-
elif file_extension == "mobi":
237-
json_book, metadata = mobi_to_json(input_book_path)
238-
elif input_book_path.startswith(("http", "https")):
239-
json_book, metadata = html_to_json(input_book_path)
240-
elif input_book_path.endswith((".docx", ".doc")):
241-
json_book, metadata = docs_to_json(input_book_path)
242-
elif file_extension == "rtf":
243-
json_book, metadata = rtf_to_json(input_book_path)
244-
175+
# Main function to determine the file type and call respective methods
176+
def get_json_metadata(input_book_path, password=None):
177+
file_extension = input_book_path.split(".")[-1].lower()
178+
json_book, metadata = {}, {}
179+
180+
file_to_json = {
181+
"odt": odt_to_json,
182+
"pdf": pdf_to_json,
183+
"txt": txt_to_json,
184+
"epub": epub_to_json,
185+
"mobi": mobi_to_json,
186+
"html": html_to_json,
187+
"docx": docs_to_json,
188+
"rtf": rtf_to_json
189+
}
190+
191+
if file_extension in file_to_json:
192+
json_book, metadata = file_to_json[file_extension](input_book_path, password)
193+
else:
194+
raise NotImplementedError(f"Unsupported file type: {file_extension}")
195+
245196
return json_book, metadata

0 commit comments

Comments
 (0)