|
1 | 1 | import json
|
2 | 2 | import os
|
3 | 3 | import re
|
4 |
| - |
5 | 4 | import docx2txt
|
6 | 5 | import ebooklib
|
7 | 6 | import html2text
|
|
11 | 10 | from odf import text, teletype
|
12 | 11 | from odf.opendocument import load
|
13 | 12 | from striprtf.striprtf import rtf_to_text
|
14 |
| - |
15 | 13 | from audiobook.doc_parser.web_parser import ArticleWebScraper
|
16 | 14 | from audiobook.doc_parser.pdf_parser import PyPDF2DocParser
|
17 |
| -# from audiobook.doc_parser.pdf_parser import PdfMinerDocParser |
18 |
| - |
19 | 15 |
|
| 16 | +# Helper function to load JSON data from a file |
def load_json(filename):
    """Read a JSON document from ``filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(filename, "r", encoding="utf-8") as fp:
        return json.load(fp)
|
23 | 20 |
|
24 |
| - |
| 21 | +# Helper function to write JSON data to a file |
def write_json_file(json_data, filename):
    """Serialise ``json_data`` as JSON into ``filename``.

    The target file is opened in write mode, so existing content is replaced.

    Args:
        json_data: Object to serialise with ``json.dump``.
        filename: Path of the file to write.
    """
    with open(filename, "w") as out_file:
        json.dump(json_data, out_file)
|
28 | 25 |
|
29 |
| - |
| 26 | +# Text preprocessing: removes unwanted characters and extra spaces |
def text_preprocessing(input_text):
    """Normalise whitespace in ``input_text``.

    Newline, carriage-return and tab characters are deleted (not replaced by
    a space, so words separated only by a line break end up joined), runs of
    spaces are collapsed to one space, and the result is stripped.

    Args:
        input_text: Text to clean.

    Returns:
        The cleaned string.
    """
    # Delete the three control characters in a single pass.
    without_breaks = input_text.translate(str.maketrans("", "", "\n\r\t"))
    single_spaced = re.sub(" +", " ", without_breaks)
    return single_spaced.strip()
|
37 | 31 |
|
38 |
| - |
| 32 | +# Extract text content from HTML, preprocess it |
def response_to_text(chapter):
    """Extract the paragraph text from an HTML chapter.

    The markup is parsed with BeautifulSoup's ``html.parser``; the text of
    every ``<p>`` element is joined with single spaces and then passed through
    ``text_preprocessing``.

    Args:
        chapter: HTML markup of one chapter.

    Returns:
        The preprocessed paragraph text.
    """
    parsed = BeautifulSoup(chapter, "html.parser")
    paragraph_texts = [node.get_text() for node in parsed.find_all("p")]
    joined = " ".join(paragraph_texts)
    return text_preprocessing(joined)
51 | 37 |
|
| 38 | +# Speak the given text using the engine |
def speak_text(engine, text, display=True):
    """Speak ``text`` through ``engine``, optionally printing it first.

    Args:
        engine: Object exposing ``say(text)`` and ``runAndWait()``.
        text: The text to speak.
        display: When truthy, ``text`` is printed before it is spoken.
    """
    if display:
        print(text)

    # Queue the utterance, then block until the engine has processed it.
    engine.say(text)
    engine.runAndWait()
|
58 | 44 |
|
59 |
| - |
| 45 | +# Helper function to convert mobi files to JSON format |
def mobi_to_json(input_book_path):
    """Convert a MOBI file into a paged JSON book.

    The book is unpacked with ``mobi.extract``, the extracted file is read as
    UTF-8 and converted to plain text with ``html2text``, cleaned with
    ``text_preprocessing`` and cut into pages of at most 2000 characters.
    The temporary directory created by ``mobi.extract`` is removed once the
    content has been read.

    Args:
        input_book_path: Path of the ``.mobi`` file.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    import shutil  # local import: only needed to clean up the extraction dir

    book_name = os.path.basename(input_book_path).split(".")[0]
    tempdir, filepath = mobi.extract(input_book_path)
    try:
        # NOTE(review): assumes the extracted file is UTF-8 HTML — confirm
        # for books that mobi unpacks to another format.
        with open(filepath, "r", encoding="utf-8") as fp:
            content = fp.read()
    finally:
        # The extraction directory is ours to delete; don't leak it.
        shutil.rmtree(tempdir, ignore_errors=True)

    book_data = text_preprocessing(html2text.html2text(content))

    # Split content into chunks of 2000 characters
    json_book = {
        str(start // 2000): book_data[start:start + 2000]
        for start in range(0, len(book_data), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
78 | 64 |
|
79 |
| - |
| 65 | +# Helper function to convert PDF to JSON format |
def pdf_to_json(input_book_path, password=None):
    """Convert a PDF file into a paged JSON book.

    Text is obtained from ``PyPDF2DocParser.get_text``, cleaned with
    ``text_preprocessing`` and cut into pages of at most 2000 characters.

    Args:
        input_book_path: Path of the PDF file.
        password: Forwarded to the parser's ``get_text`` as ``password``.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"book_name"`` and ``"pages"``.
    """
    basename = os.path.basename(input_book_path).split(".")[0]

    extracted = PyPDF2DocParser().get_text(input_book_path, password=password)
    cleaned = text_preprocessing(extracted)

    json_book = {}
    for start in range(0, len(cleaned), 2000):
        json_book[str(start // 2000)] = cleaned[start:start + 2000]

    metadata = {"book_name": basename, "pages": len(json_book)}
    return json_book, metadata
|
107 | 80 |
|
108 |
| - |
| 81 | +# Helper function to convert ODT files to JSON format |
def odt_to_json(input_book_path):
    """Convert an ODT document into a paged JSON book.

    The document is loaded with ``odf.opendocument.load``; the text of every
    ``text.P`` element is joined with single spaces, cleaned with
    ``text_preprocessing`` and cut into pages of at most 2000 characters.

    Args:
        input_book_path: Path of the ``.odt`` file.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"book_name"`` and ``"pages"``.
    """
    basename = os.path.basename(input_book_path).split(".")[0]

    document = load(input_book_path)
    paragraph_texts = []
    for paragraph in document.getElementsByType(text.P):
        paragraph_texts.append(teletype.extractText(paragraph))
    cleaned = text_preprocessing(" ".join(paragraph_texts))

    json_book = {}
    for start in range(0, len(cleaned), 2000):
        json_book[str(start // 2000)] = cleaned[start:start + 2000]

    metadata = {"book_name": basename, "pages": len(json_book)}
    return json_book, metadata
|
130 | 96 |
|
131 |
| - |
| 97 | +# Helper function to convert TXT files to JSON format |
def txt_to_json(input_book_path):
    """Convert a plain-text file into a paged JSON book.

    The file is read in text mode (no explicit encoding is given, so the
    platform default applies), cleaned with ``text_preprocessing`` and cut
    into pages of at most 2000 characters.

    Args:
        input_book_path: Path of the ``.txt`` file.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    with open(input_book_path, "r") as source:
        cleaned = text_preprocessing(source.read())

    json_book = {}
    for start in range(0, len(cleaned), 2000):
        json_book[str(start // 2000)] = cleaned[start:start + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
148 | 112 |
|
149 |
| - |
| 113 | +# Helper function to convert RTF files to JSON format |
def rtf_to_json(input_book_path):
    """Convert an RTF file into a paged JSON book.

    The file is read in text mode, converted to plain text with
    ``rtf_to_text(..., errors="ignore")``, cleaned with ``text_preprocessing``
    and cut into pages of at most 2000 characters.

    Args:
        input_book_path: Path of the ``.rtf`` file.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    with open(input_book_path, "r") as source:
        raw_rtf = source.read()

    plain_text = rtf_to_text(raw_rtf, errors="ignore")
    cleaned = text_preprocessing(plain_text)

    json_book = {}
    for start in range(0, len(cleaned), 2000):
        json_book[str(start // 2000)] = cleaned[start:start + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
167 | 130 |
|
168 |
| - |
| 131 | +# Helper function to convert DOCX files to JSON format |
def docs_to_json(input_book_path):
    """Convert a Word document into a paged JSON book.

    Text is extracted with ``docx2txt.process`` and cut into pages of at most
    2000 characters.

    NOTE(review): unlike the other converters in this module, the extracted
    text is not passed through ``text_preprocessing`` — confirm this is
    intended.

    Args:
        input_book_path: Path of the document.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    extracted = docx2txt.process(input_book_path)

    json_book = {}
    for start in range(0, len(extracted), 2000):
        json_book[str(start // 2000)] = extracted[start:start + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
182 | 143 |
|
183 |
| - |
| 144 | +# Helper function to convert EPUB files to JSON format |
def epub_to_json(input_book_path):
    """Convert an EPUB file into a paged JSON book.

    Every ``ebooklib.ITEM_DOCUMENT`` item of the book is converted with
    ``response_to_text``; the chapter texts are joined with single spaces and
    cut into pages of at most 2000 characters.

    Args:
        input_book_path: Path of the ``.epub`` file.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]

    book = epub.read_epub(input_book_path)
    book_text = " ".join(
        response_to_text(chapter.get_body_content())
        for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )

    # Paging starts at offset 0, like the other converters. Starting at 1
    # dropped the first character and could yield an empty final page.
    json_book = {
        str(start // 2000): book_text[start:start + 2000]
        for start in range(0, len(book_text), 2000)
    }

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
202 | 158 |
|
203 |
| - |
| 159 | +# Helper function to convert HTML (web) content to JSON format |
def html_to_json(url):
    """Convert a web article into a paged JSON book.

    The page content is obtained from ``ArticleWebScraper(url).get_page_data()``,
    cleaned with ``text_preprocessing`` and cut into pages of at most 2000
    characters.

    Args:
        url: Address of the article; its basename (up to the first dot) is
            used as the book name.

    Returns:
        A ``(json_book, metadata)`` tuple. ``json_book`` maps page numbers
        (as strings, starting at ``"0"``) to page text; ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(url).split(".")[0]

    scraper = ArticleWebScraper(url)
    cleaned = text_preprocessing(scraper.get_page_data())

    json_book = {}
    for start in range(0, len(cleaned), 2000):
        json_book[str(start // 2000)] = cleaned[start:start + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata
|
219 | 174 |
|
220 |
| - |
221 |
def get_json_metadata(input_book_path, password=None):
    """Pick the converter matching the input and return its result.

    Args:
        input_book_path: Local file path or an ``http://`` / ``https://`` URL.
        password: Forwarded to ``pdf_to_json`` only. The other converters
            take a single argument, so it is not passed to them.

    Returns:
        The ``(json_book, metadata)`` tuple produced by the selected converter.

    Raises:
        NotImplementedError: If the input is not a URL and its extension has
            no registered converter.
    """
    # Web addresses go to the HTML converter whatever their trailing
    # "extension" looks like.
    if input_book_path.startswith(("http://", "https://")):
        return html_to_json(input_book_path)

    file_extension = input_book_path.split(".")[-1].lower()

    # Only the PDF converter accepts a password.
    if file_extension == "pdf":
        return pdf_to_json(input_book_path, password)

    file_to_json = {
        "odt": odt_to_json,
        "txt": txt_to_json,
        "epub": epub_to_json,
        "mobi": mobi_to_json,
        # NOTE(review): assumes html_to_json can handle a local .html path —
        # confirm against ArticleWebScraper.
        "html": html_to_json,
        "docx": docs_to_json,
        "doc": docs_to_json,
        "rtf": rtf_to_json,
    }

    converter = file_to_json.get(file_extension)
    if converter is None:
        raise NotImplementedError(f"Unsupported file type: {file_extension}")
    return converter(input_book_path)
|
0 commit comments