doctool/proc_geektime.py at master · apachecn/doctool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
from os import path
import sys
import re
from pyquery import PyQuery as pq
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from EpubCrawler.img import process_img

def repl_pre(m):
    """Clean one matched ``<pre>...</pre>`` fragment (``re.sub`` callback).

    Elements matching ``.richcontent-pre-copy`` or ``.hljs-ln-numbers`` are
    removed, spaces in the text are turned into ``&nbsp;`` and any table
    markup inside the fragment is stripped while its contents are kept.

    :param m: regex match whose ``group(0)`` is the HTML fragment.
    :return: the rewritten HTML fragment as a string.
    """
    rt = pq(m.group(0))
    rt('.richcontent-pre-copy, .hljs-ln-numbers').remove()
    # Only spaces in text may become &nbsp;. A blanket str.replace() also
    # rewrites the space between a tag name and its attributes, yielding
    # markup such as <span&nbsp;class="..."> that is no longer a valid tag.
    s = re.sub(r'(<[^>]*>)| ', lambda t: t.group(1) or '&nbsp;', str(rt))
    # \b keeps the tag-name match exact, so e.g. <track> is not taken for <tr>.
    s = re.sub(r'</?(table|thead|tbody|tr|th|td)\b[^>]*>', '', s)
    return s

def repl_div_pre(m):
    """Turn one matched div-based code block into a ``<pre>`` element.

    ``re.sub`` callback: every opening/closing ``div`` and ``span`` tag is
    dropped from the matched text, one leading ``<br/>`` (optionally
    preceded by whitespace) is removed, and the remainder is wrapped in
    ``<pre>...</pre>``.

    :param m: regex match whose ``group(0)`` is the HTML fragment.
    :return: the ``<pre>`` markup as a string.
    """
    without_wrappers = re.sub(r'</?(div|span)[^>]*>', '', m.group(0))
    # The caller puts a <br/> in front of every code line; the first one
    # would otherwise show up as an empty line at the top of the block.
    body = re.sub(r'^\s*<br/>', '', without_wrappers)
    return '<pre>' + body + '</pre>'


def get_article(html):
    """Extract the title and body markup from one saved article page.

    The page is first normalised by ``proc_html``. The title is the
    stripped text of the first ``<h1>``; the content is the inner HTML of
    the first element matching ``[data-slate-editor="true"]`` or
    ``._29HP61GA_0``. All ``<style>`` elements are removed before the
    content is read, so no stylesheet is carried into the result.

    :param html: raw HTML text of the page.
    :return: dict with the keys ``'title'`` and ``'content'``.
    """
    doc = pq(proc_html(html))
    heading = doc('h1').eq(0).text().strip()
    doc('style').remove()
    body = doc('[data-slate-editor="true"], ._29HP61GA_0').eq(0)
    return {
        'title': heading,
        'content': body.html(),
    }


def main():
    """Build one e-book per numbered group of saved HTML pages.

    The directory is taken from ``sys.argv[1]``. Its ``.html`` files are
    grouped by the integer formed by the digits at the start of the file
    name (files without leading digits are skipped), and every group is
    passed to ``tr_proc_book`` on a pool of 16 worker threads.

    :raises SystemExit: if no directory argument was given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: proc_geektime.py <dir>')
    src_dir = sys.argv[1]

    group = {}
    # os.listdir() returns names in arbitrary order; sorting makes the page
    # order inside each book (and the book title, taken from the first file)
    # deterministic across platforms and runs.
    for f in sorted(os.listdir(src_dir)):
        if not f.endswith('.html'):
            continue
        m = re.search(r'^(\d+)', f)
        if not m:
            continue
        group.setdefault(int(m.group(1)), []).append(f)

    # The context manager shuts the pool down even if a worker raised.
    with ThreadPoolExecutor(16) as pool:
        hdls = [
            pool.submit(tr_proc_book, src_dir, book_files)
            for book_files in group.values()
        ]
        # result() re-raises any exception from the worker thread.
        for h in hdls:
            h.result()


def tr_proc_book(dir, fnames):
    """Convert one group of saved pages into a single e-book.

    The book title is the part of the first file name before the first
    full-width colon (``：``); it becomes a leading article with empty
    content. Each file then contributes one article from ``get_article``,
    whose content is passed through ``process_img`` together with the
    shared ``imgs`` dict. Finally ``gen_epub`` is called with all articles
    and ``imgs``.

    :param dir: directory that contains the files.
    :param fnames: names of the files (relative to *dir*) that make up the
        book; must contain at least one entry.
    """
    title = fnames[0].split('：')[0]
    print(title)
    articles = [{
        'title': title,
        'content': '',
    }]
    imgs = {}
    for f in fnames:
        print(f)
        # Close the handle right away instead of leaving it to the garbage
        # collector; many books are processed concurrently.
        with open(path.join(dir, f), encoding='utf8') as fp:
            html = fp.read()
        art = get_article(html)
        articles.append(art)
        art['content'] = process_img(art['content'], imgs, img_prefix='../Images/')
    gen_epub(articles, imgs)


def _text_spaces_to_nbsp(markup):
    """Replace spaces with ``&nbsp;`` in text while leaving tags untouched."""
    return re.sub(r'(<[^>]*>)| ', lambda t: t.group(1) or '&nbsp;', markup)


def _retag(rt, selector, tag):
    """Replace each element under *rt* matching *selector* by ``<tag>`` with the same inner HTML."""
    for el in rt.find(selector):
        el = pq(el)
        new_el = pq(f'<{tag}></{tag}>')
        new_el.html(el.html())
        el.replace_with(new_el)


def proc_html(html):
    """Normalise the markup of a saved page into plain HTML elements.

    Steps, in order:

    1. every ``<pre>...</pre>`` fragment is rewritten by ``repl_pre``;
    2. a ``<br/>`` is inserted before every ``div`` carrying
       ``data-slate-type="code-line"``;
    3. each ``data-slate-type="pre"`` ``div``, up to a ``</div>`` that is
       followed by another ``data-slate-type`` ``div`` other than a
       code line, is rewritten by ``repl_div_pre``;
    4. elements marked with ``data-slate-type`` are replaced by ``<p>``,
       ``<blockquote>``, ``<b>``, ``<ol>``, ``<li>`` and ``<code>``;
    5. remaining ``data-slate-type="pre"`` elements that contain a
       ``[data-origin="pm_code_preview"]`` node become ``<pre>`` holding
       that node's inner HTML, with text spaces turned into ``&nbsp;``.

    :param html: HTML text of the page.
    :return: the serialized, rewritten document.
    """
    html = re.sub(r'<pre[^>]*>[\s\S]+?</pre>', repl_pre, html)
    html = re.sub(r'(?=<div[^>]*data-slate-type="code-line"[^>]*>)', '<br/>', html)
    html = re.sub(r'<div[^>]*data-slate-type="pre"[^>]*>[\s\S]+?</div>(?=<div[^>]*data-slate-type="[\w\-]+"[^>]*>)(?!<div[^>]*data-slate-type="code-line"[^>]*>)', repl_div_pre, html)

    rt = pq(html)

    # Order matters: each pass queries the tree as left by the previous one.
    for selector, tag in (
        ('[data-slate-type="paragraph"], [data-slate-type="image"]', 'p'),
        ('[data-slate-type="block-quote"]', 'blockquote'),
        ('[data-slate-type="bold"]', 'b'),
        ('[data-slate-type="list"]', 'ol'),
        ('[data-slate-type="list-line"]', 'li'),
    ):
        _retag(rt, selector, tag)

    # Inline code keeps only its text; nested markup is dropped.
    for el in rt.find('[data-slate-type="code"]'):
        el = pq(el)
        elcode = pq('<code></code>')
        elcode.text(el.text())
        el.replace_with(elcode)

    for el in rt.find('[data-slate-type="pre"]'):
        el = pq(el)
        el_inner = el.find('[data-origin="pm_code_preview"]')
        if not el_inner:
            continue
        elpre = pq('<pre></pre>')
        # Convert only text spaces: a blanket replace would also turn the
        # space between a tag name and its attributes into &nbsp; and so
        # corrupt nested tags.
        elpre.html(_text_spaces_to_nbsp(el_inner.html()))
        el.replace_with(elpre)

    return str(rt)

# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()