1
- from bs4 import PageElement , Tag
1
+ from dataclasses import dataclass , field
2
+ from typing import Any , List , Tuple
3
+ from bs4 import BeautifulSoup , Tag
2
4
3
5
from .options import Options
4
6
from .utils .soup_util import clone_element
5
7
6
8
7
- def make_indexes (soup : PageElement , options : Options ) -> None :
9
+ _MAX_HEADER_LEVEL = 6 # <h1> ... <h6>
10
+
11
+
12
@dataclass
class _HeaderTree:
    """One node of the normalized tree of document headers.

    A level that the document skipped is represented by a placeholder node
    whose `element` is `None`, so every node always hangs off a parent that
    is exactly one level above it.
    """
    # Heading tag this node stands for, or `None` for a missed level.
    element: Tag | None
    # Direct sub-headers of this node; a fresh list is created per instance.
    subheaders: List['_HeaderTree'] = field(default_factory=list)
17
+
18
+
19
+ def make_indexes (soup : BeautifulSoup , options : Options ) -> None :
8
20
""" Generate ordered chapter number and TOC of document.
9
21
10
22
Arguments:
11
23
soup {BeautifulSoup} -- DOM object of Document.
12
24
options {Options} -- The options of this sequence.
13
25
"""
14
26
15
- # Step 1: (re)ordered headdings
27
+ # Step 1: (re)ordered headings
16
28
_inject_heading_order (soup , options )
17
29
18
30
# Step 2: generate toc page
19
- level = options .toc_level
20
- if level < 1 or level > 3 :
31
+ start_level = 1 if options .ignore_top_header else 0
32
+ stop_level = options .toc_level
33
+ if stop_level <= start_level :
21
34
return
35
+ if stop_level > _MAX_HEADER_LEVEL :
36
+ options .logger .warning (f'Ignore `toc_level` value { stop_level } . Use max possible { _MAX_HEADER_LEVEL } instead' )
37
+ stop_level = _MAX_HEADER_LEVEL
22
38
23
- options .logger .info (
24
- f'Generate a table of contents up to heading level { level } .' )
39
+ options .logger .info (f'Generate a table of contents from h{ start_level + 1 } to h{ stop_level } ' )
25
40
26
- h1li = None
27
- h2ul = h2li = h3ul = None
28
- exclude_lv2 = exclude_lv3 = False
29
-
30
- def makeLink (h : Tag ) -> Tag :
41
+ def make_link (h : Tag ) -> Tag :
31
42
li = soup .new_tag ('li' )
32
43
ref = h .get ('id' , '' )
33
44
a = soup .new_tag ('a' , href = f'#{ ref } ' )
@@ -40,98 +51,118 @@ def makeLink(h: Tag) -> Tag:
40
51
options .logger .debug (f"| [{ h .get_text (separator = ' ' )} ]({ ref } )" )
41
52
return li
42
53
54
def create_toc(headers: List[_HeaderTree], parent: Tag):
    """Append a `<ul>` describing `headers` to `parent`, recursing into subheaders.

    Every node yields one `<li>`: a link built by `make_link` when the node
    has a real heading element, or an empty `<li>` (plus a warning) when the
    node is a placeholder for a missed level.
    """
    listing = soup.new_tag('ul')
    parent.append(listing)
    for node in headers:
        if node.element is None:
            options.logger.warning('Adding missed header to TOC')
            item = soup.new_tag('li')
        else:
            item = make_link(node.element)
        listing.append(item)
        # Nested levels become a nested list inside this item.
        if node.subheaders:
            create_toc(node.subheaders, item)
66
+
67
+ top_headers = _collect_headers (soup , options , start_level , stop_level )
68
+
43
69
toc = soup .new_tag ('article' , id = 'doc-toc' )
44
70
title = soup .new_tag ('h1' )
45
71
title .append (soup .new_string (options .toc_title ))
46
72
toc .append (title )
47
73
48
- h1ul = soup .new_tag ('ul' )
49
- toc .append (h1ul )
50
-
51
- headings = soup .find_all (['h1' , 'h2' , 'h3' ])
52
- for h in headings :
74
+ create_toc (top_headers , toc )
75
+ soup .body .insert (0 , toc )
53
76
54
- if h .name == 'h1' :
55
77
56
- h1li = makeLink ( h )
57
- h1ul . append ( h1li )
58
- h2ul = h2li = h3ul = None
78
+ def _set_list_elements ( l : List [ Any ], value : Any , start : int , end : int | None = None ) -> None :
79
+ for i in range ( start , end if end is not None else len ( l )):
80
+ l [ i ] = value
59
81
60
- exclude_lv2 = _is_exclude (h .get ('id' , None ), options )
61
82
62
- elif not exclude_lv2 and h .name == 'h2' and level >= 2 :
83
def _collect_headers(soup: BeautifulSoup, options: Options, start_level: int, stop_level: int) -> List[_HeaderTree]:
    """Collect the document headers into a tree.

    Levels are counted from zero, i.e. level 0 corresponds to ``<h1>``.
    Headings of levels ``start_level`` (inclusive) up to ``stop_level``
    (exclusive) are looked up in document order.

    Returns the list of top headers (level ``start_level``) together with
    their subheaders. When the document skips a level, a placeholder node
    with ``element`` set to ``None`` is inserted so that each header is
    attached to a parent exactly one level above it.
    """
    assert 0 <= start_level < stop_level
    assert 0 < stop_level <= _MAX_HEADER_LEVEL

    roots: List[_HeaderTree] = []

    # Most recent node seen at each level of the branch currently being built.
    current: List[_HeaderTree | None] = [None] * stop_level
    # Exclusion flag of the most recent heading seen at each level.
    excluded: List[bool] = [False] * stop_level

    tag_names = [f'h{number}' for number in range(start_level + 1, stop_level + 1)]
    for tag in soup.find_all(tag_names):
        depth = int(tag.name[1:]) - 1

        excluded[depth] = _is_exclude(tag.get('id', None), options)
        # A new heading starts a new branch: deeper flags no longer apply.
        _set_list_elements(excluded, False, depth + 1)

        # Headings below an excluded ancestor are left out of the tree.
        if any(excluded[:depth]):
            continue

        node = _HeaderTree(tag)

        if depth == start_level:
            roots.append(node)
        else:
            if current[depth - 1] is None:
                # Add placeholders for every skipped level above this heading.
                for gap in range(start_level, depth):
                    if current[gap] is not None:
                        continue

                    placeholder = _HeaderTree(None)
                    if gap == start_level:
                        roots.append(placeholder)
                    else:
                        owner = current[gap - 1]
                        assert owner is not None
                        owner.subheaders.append(placeholder)
                    current[gap] = placeholder

            parent = current[depth - 1]
            assert parent is not None
            parent.subheaders.append(node)

        current[depth] = node
        # Deeper levels belong to the previous branch; forget them.
        _set_list_elements(current, None, depth + 1)

    return roots
126
136
127
- else :
128
- continue
129
137
130
- options .logger .debug (f"| [{ prefix } { h .text } ]({ h .get ('id' , '(none)' )} )" )
138
def _inject_heading_order(soup: BeautifulSoup, options: Options) -> None:
    """Prefix the document headings with hierarchical chapter numbers.

    Headings from h1 (or h2 when `options.ignore_top_header` is set) down to
    `options.ordered_chapter_level` are collected with `_collect_headers`.
    Each heading receives a `<span class="pdf-order">` as its first child,
    holding the dot-joined 1-based positions of the heading and its
    ancestors followed by a space.

    Nothing is done when `ordered_chapter_level` does not reach the first
    numbered level; values above `_MAX_HEADER_LEVEL` are clamped with a
    warning.
    """
    start_level = 1 if options.ignore_top_header else 0
    stop_level = options.ordered_chapter_level
    if stop_level <= start_level:
        return
    if stop_level > _MAX_HEADER_LEVEL:
        options.logger.warning(f'Ignore `ordered_chapter_level` value {stop_level}. Use max possible {_MAX_HEADER_LEVEL} instead')
        stop_level = _MAX_HEADER_LEVEL

    options.logger.info(f'Number headers from h{start_level + 1} to h{stop_level}')

    def inject_order(headers: List[_HeaderTree], numbers_prefix: Tuple[int, ...] = ()) -> None:
        # An immutable tuple default avoids sharing one list object across calls.
        assert len(numbers_prefix) < _MAX_HEADER_LEVEL
        for position, header in enumerate(headers, start=1):
            prefix = (*numbers_prefix, position)
            prefix_str = '.'.join(str(n) for n in prefix)
            if header.element is not None:
                # Log the heading text, not the whole tag markup, in the same
                # "[text](id)" form used for the TOC links.
                options.logger.debug(
                    f"| [{prefix_str} {header.element.get_text(separator=' ')}]"
                    f"({header.element.get('id', '(none)')})")
                nm_tag = soup.new_tag('span', **{'class': 'pdf-order'})
                nm_tag.append(prefix_str + ' ')
                header.element.insert(0, nm_tag)
            else:
                # Placeholder for a skipped level: it still consumes a number
                # so that its children keep a consistent prefix.
                options.logger.warning(f'Assigned number for a missed header {prefix_str}')
            if header.subheaders:
                inject_order(header.subheaders, prefix)

    top_headers = _collect_headers(soup, options, start_level, stop_level)
    inject_order(top_headers)
135
166
136
167
137
168
def _is_exclude (url : str , options : Options ) -> bool :
0 commit comments