forked from openzim/gutenberg
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgutenberg2zim
More file actions
executable file
·231 lines (200 loc) · 9.4 KB
/
gutenberg2zim
File metadata and controls
executable file
·231 lines (200 loc) · 9.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
from __future__ import (unicode_literals, absolute_import,
division, print_function)
import os
import sys
from docopt import docopt
from path import Path as path
import shutil
from gutenbergtozim import logger, VERSION
from gutenbergtozim.database import setup_database
from gutenbergtozim.rdf import setup_rdf_folder, parse_and_fill
from gutenbergtozim.download import download_all_books
from gutenbergtozim.export import export_all_books, export_skeleton
from gutenbergtozim.urls import setup_urls
from gutenbergtozim.zim import build_zimfile
from gutenbergtozim.checkdeps import check_dependencies
# docopt interface description: the first (single, long) line is the usage
# pattern, the "Options:" section defines every option.  Each option line
# needs at least two spaces between the option spec and its description,
# otherwise docopt treats the description words as part of the spec.
# Adjacent string literals are used only to wrap long lines in the source;
# they are concatenated into single lines of help text.
# NOTE: the name shadows the ``help`` builtin but is kept because the
# entry point at the bottom of the file refers to it.
help = ("""Usage: gutenberg2zim [-y] [-F] [-l LANGS] [-f FORMATS] """
        """[-r RDF_FOLDER] [-d CACHE_PATH] [-e STATIC_PATH] """
        """[-z ZIM_PATH] [-u RDF_URL] [-b BOOKS] """
        """[-t ZIM_TITLE] [-n ZIM_DESC] """
        """[-c CONCURRENCY] [--dlc CONCURRENCY] [--no-index] """
        """[--check] [--prepare] [--parse] [--download] [--export] """
        """[--dev] [--zim] [--complete] [-m ONE_LANG_ONE_ZIM_FOLDER] """
        """[--title-search] [--bookshelves] """
        """

Options:
  -h --help                           Display this help message
  -y --wipe-db                        Empty cached book metadata
  -F --force                          Redo step even if target already exist
  -l --languages=<list>               Comma-separated list of lang codes to filter"""
        """ export to (preferably ISO 639-1, else ISO 639-3)
  -f --formats=<list>                 Comma-separated list of formats to filter """
        """export to (epub, html, pdf, all)
  -r --rdf-folder=<folder>            Don't download rdf-files.tar.bz2 and use """
        """extracted folder instead
  -e --static-folder=<folder>         Use-as/Write-to this folder static HTML
  -z --zim-file=<file>                Write ZIM into this file path
  -t --zim-title=<title>              Set ZIM title
  -n --zim-desc=<description>         Set ZIM description
  -d --dl-folder=<folder>             Folder to use/write-to downloaded ebooks
  -u --rdf-url=<url>                  Alternative rdf-files.tar.bz2 URL
  -b --books=<ids>                    Execute the processes for specific books, """
        """separated by commas, or dashes for intervals
  -c --concurrency=<nb>               Number of concurrent process for download """
        """and parsing tasks
     --dlc=<nb>                       Number of concurrent *download* process for """
        """download (overwrites --concurrency). """
        """if server blocks high rate requests
  -m --one-language-one-zim=<folder>  When more than 1 language, do one zim for each """
        """ language (and one with all)
     --no-index                       Do NOT create full-text index within ZIM file
     --check                          Check dependencies
     --prepare                        Download & extract rdf-files.tar.bz2
     --parse                          Parse all RDF files and fill-up the DB
     --download                       Download ebooks based on filters
     --export                         Export downloaded content to zim-friendly """
        """static HTML
     --dev                            Exports *just* Home+JS+CSS files """
        """(overwritten by --zim step)
     --zim                            Create a ZIM file
     --complete                       Run all steps: check, prepare, parse, """
        """download, export and zim (default when no step is given)
     --title-search                   Add title search parameter
     --bookshelves                    Add bookshelves

This script is used to produce a ZIM file (and any intermediate state)
of Gutenberg repository using a mirror.""")
def _parse_book_ids(spec):
    """Expand a ``--books`` value into a sorted list of unique book IDs.

    ``spec`` is a comma-separated string; each item is either one ID
    (``"12"``) or an inclusive interval written with a dash (``"10-15"``).
    Whitespace around numbers is ignored and non-numeric pieces are
    skipped.  Interval bounds may be given in either order.  An empty
    string yields an empty list.

    Raises ValueError if a piece passes ``isdigit()`` but cannot be
    converted by ``int()``.
    """
    book_ids = set()
    for item in spec.split(','):
        pieces = (piece.strip() for piece in item.split('-'))
        bounds = [int(piece) for piece in pieces if piece.isdigit()]
        if len(bounds) > 1:
            # only the first two numbers of an item define the interval
            low, high = sorted(bounds[:2])
            book_ids.update(range(low, high + 1))
        else:
            book_ids.update(bounds)
    return sorted(book_ids)


def _languages_per_zim(languages, one_zim_per_language):
    """Return the list of language filters, one entry per ZIM to produce.

    Without ``one_zim_per_language`` a single ZIM is planned for
    ``languages`` as a whole.  With it, one ZIM is planned per language
    plus one combining them all; when ``languages`` is empty the distinct
    languages are read from the ``Book`` table and the combined entry is
    the empty filter.
    """
    if not one_zim_per_language:
        return [languages]

    if not languages:
        # imported here, as in the original code, rather than at file level
        from gutenbergtozim.database import Book
        plan = [[book.language]
                for book in Book.select(Book.language).distinct()]
        plan.append([])
        return plan

    if len(languages) == 1:
        # the per-language ZIM and the combined ZIM would be identical
        return [languages]

    return [[lang] for lang in languages] + [languages]


def main(arguments):
    """Run the gutenberg2zim steps selected by the parsed CLI arguments.

    ``arguments`` is the mapping produced by docopt from the usage text
    (option name -> value).  The selected steps run in this order:
    dependency check, RDF preparation, database setup (always), RDF
    parsing, download, then for each planned ZIM: export, dev skeleton
    export and ZIM build.  When no step is requested, or ``--complete``
    is given, every step except ``--dev`` is enabled.

    Calls ``sys.exit(1)`` when the dependency check fails or when the
    ZIM step is requested and ``check_dependencies()[1]`` is falsy.
    """
    # actions constants
    DO_PREPARE = arguments.get('--prepare', False)
    DO_PARSE = arguments.get('--parse', False)
    DO_DOWNLOAD = arguments.get('--download', False)
    DO_EXPORT = arguments.get('--export', False)
    DO_DEV = arguments.get('--dev', False)
    DO_ZIM = arguments.get('--zim', False)
    DO_CHECKDEPS = arguments.get('--check', False)
    COMPLETE_DUMP = arguments.get('--complete', False)

    # options, with their defaults
    ONE_LANG_ONE_ZIM_FOLDER = arguments.get('--one-language-one-zim') or None
    RDF_FOLDER = arguments.get('--rdf-folder') or 'rdf-files'
    STATIC_FOLDER = arguments.get('--static-folder') or 'static'
    ZIM_NAME = arguments.get('--zim-file')
    WIPE_DB = arguments.get('--wipe-db') or False
    RDF_URL = arguments.get('--rdf-url') \
        or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    DL_CACHE = arguments.get('--dl-folder') or 'dl-cache'
    ZIM_TITLE = arguments.get('--zim-title')
    ZIM_DESC = arguments.get('--zim-desc')
    CONCURRENCY = int(arguments.get('--concurrency') or 16)
    # downloads use the general concurrency unless --dlc is given
    DL_CONCURRENCY = int(arguments.get('--dlc') or CONCURRENCY)
    FORCE = arguments.get('--force', False)
    NO_INDEX = arguments.get('--no-index', False)
    TITLE_SEARCH = arguments.get('--title-search', False)
    BOOKSHELVES = arguments.get('--bookshelves', False)

    # create tmp dir
    path('tmp').mkdir_p()

    LANGUAGES = [code.strip().lower()
                 for code in (arguments.get('--languages') or '').split(',')
                 if code.strip()]

    # special shortcuts for "all"
    requested_formats = arguments.get('--formats')
    if requested_formats in ('all', None):
        FORMATS = ['epub', 'pdf', 'html']
    else:
        FORMATS = sorted({fmt.strip().lower()
                          for fmt in requested_formats.split(',')
                          if fmt.strip()})

    try:
        BOOKS = _parse_book_ids(arguments.get('--books') or '')
    except ValueError as exc:
        # same fallback as before: log and continue without a book filter
        logger.error(exc)
        BOOKS = []

    # no action requested, default to --complete; --check counts as an
    # action so that it can be used alone to only verify dependencies
    requested_steps = (DO_CHECKDEPS, DO_PREPARE, DO_PARSE, DO_DOWNLOAD,
                       DO_EXPORT, DO_ZIM, DO_DEV)
    if not any(requested_steps):
        COMPLETE_DUMP = True

    if COMPLETE_DUMP:
        DO_CHECKDEPS = DO_PREPARE = DO_PARSE = \
            DO_DOWNLOAD = DO_EXPORT = DO_ZIM = True

    if DO_CHECKDEPS:
        logger.info("CHECKING for dependencies on the system")
        if not check_dependencies()[0]:
            logger.error("Exiting...")
            sys.exit(1)

    if DO_PREPARE:
        logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER, force=FORCE)

    if WIPE_DB:
        logger.info("RESETING DATABASE IF EXISTS")
    logger.info("SETTING UP DATABASE")
    setup_database(wipe=WIPE_DB)

    if DO_PARSE:
        logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
        parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS,
                       concurrency=CONCURRENCY, force=FORCE)
        logger.info("Add possible url to db")
        # NOTE: the setup_urls() call is commented out at this point, so
        # only the log message above is emitted.
        # setup_urls()

    if DO_DOWNLOAD:
        logger.info("DOWNLOADING ebooks from mirror using filters")
        download_all_books(download_cache=DL_CACHE,
                           concurrency=DL_CONCURRENCY,
                           languages=LANGUAGES,
                           formats=FORMATS,
                           only_books=BOOKS,
                           force=FORCE)

    # computed after the DB is set up: the plan may query the Book table
    zims = _languages_per_zim(LANGUAGES, ONE_LANG_ONE_ZIM_FOLDER)

    for zim_lang in zims:
        if DO_EXPORT:
            logger.info("EXPORTING ebooks to static folder (and JSON)")
            export_all_books(static_folder=STATIC_FOLDER,
                             download_cache=DL_CACHE,
                             concurrency=CONCURRENCY,
                             languages=zim_lang,
                             formats=FORMATS,
                             only_books=BOOKS,
                             force=FORCE,
                             title_search=TITLE_SEARCH,
                             add_bookshelves=BOOKSHELVES)

        if DO_DEV:
            logger.info("EXPORTING HTML/JS/CSS to static folder")
            export_skeleton(static_folder=STATIC_FOLDER,
                            dev_mode=True,
                            languages=zim_lang,
                            formats=FORMATS,
                            only_books=BOOKS,
                            title_search=TITLE_SEARCH,
                            add_bookshelves=BOOKSHELVES)

        if DO_ZIM:
            if not check_dependencies()[1]:
                logger.error("You don't have zimwriterfs installed.")
                sys.exit(1)
            logger.info("BUILDING ZIM off static folder {}"
                        .format(STATIC_FOLDER))
            build_zimfile(static_folder=STATIC_FOLDER,
                          output_folder=path(
                              ONE_LANG_ONE_ZIM_FOLDER or '.').abspath(),
                          zim_name=path(ZIM_NAME).name if ZIM_NAME else None,
                          languages=zim_lang, formats=FORMATS,
                          only_books=BOOKS,
                          title=ZIM_TITLE, description=ZIM_DESC,
                          create_index=not NO_INDEX,
                          force=FORCE,
                          title_search=TITLE_SEARCH,
                          add_bookshelves=BOOKSHELVES)
            # Drop the static folder once the ZIM is built so the next
            # language starts from a clean export.
            # NOTE(review): assumed to belong to the --zim step; at loop
            # level it would delete a fresh --export result. Confirm.
            shutil.rmtree(STATIC_FOLDER)
# Script entry point: let docopt parse the command line against the usage
# text (it also handles --help and --version), then run the pipeline.
if __name__ == '__main__':
    cli_arguments = docopt(help, version=VERSION)
    main(cli_arguments)