6
6
from .utils import BaseParser
7
7
8
8
9
# Pre-compiled patterns shared by Parser._find_any_text().  Compiling once
# at module load avoids re-parsing the patterns for every table cell.
HTML_TAG_RE = re.compile(r'(<[^>]+>)')      # any HTML tag, e.g. "<td class=x>"
HTML_SPACE_SQUASH_RE = re.compile(r'\s+')   # run of whitespace (squash mode)
HTML_SPACE_RE = re.compile(r'\s')           # single whitespace character
9
14
class Parser (BaseParser ):
10
15
"""Extract text from html file using beautifulsoup4. Filter text to
11
16
only show the visible parts of the page. Inspiration from `here
12
17
<http://stackoverflow.com/a/1983219/564709>`_.
18
+ By default it preserves spaces and tries to render tables with ASCII
19
+ symbols '|' and '-'. It may be useless if you want to, for example,
20
+ extract text and put it to some full text search engine.
21
+ To replace several spaces with single one add option
22
+ `squash_html_spaces=True` to `textract.process` function.
23
+ To not render tables (just extract text) add an argument
24
+ `strip_html_tables=True` to `textract.process`.
13
25
"""
14
26
15
27
_disallowed_names = [
@@ -41,18 +53,23 @@ def _inline(self, element):
41
53
return True
42
54
return False
43
55
44
def _find_any_text(self, tag, squash_spaces=False):
    """Return the human-readable text found anywhere inside *tag*.

    The tag is serialized to text, markup is stripped with the
    module-level compiled patterns, and whitespace is normalized.

    :param tag: a BeautifulSoup tag (or ``None``).
    :param squash_spaces: when ``True``, collapse each run of
        whitespace into a single space; otherwise replace each
        whitespace character one-for-one with a space (preserving
        original spacing width).
    :returns: the stripped text, or ``''`` when *tag* is ``None``.
    """
    text = ''
    if tag is not None:
        text = six.text_type(tag)
        # Drop markup first, then normalize whitespace.  Calling .sub on
        # the compiled pattern is equivalent to re.sub(pattern, ...).
        text = HTML_TAG_RE.sub('', text)
        if squash_spaces:
            text = HTML_SPACE_SQUASH_RE.sub(' ', text)
        else:
            text = HTML_SPACE_RE.sub(' ', text)
        text = text.strip()
    return text
54
71
55
- def _parse_tables (self , soup ):
72
+ def _parse_tables (self , soup , squash_spaces ):
56
73
"""Returns array containing basic informations about tables for ASCII
57
74
replacement (look: _replace_tables()).
58
75
"""
@@ -66,7 +83,9 @@ def _parse_tables(self, soup):
66
83
tds = tr .find_all ('th' ) + tr .find_all ('td' )
67
84
if len (tds ) > 0 :
68
85
for i , td in enumerate (tds ):
69
- td_text = self ._find_any_text (td )
86
+ td_text = self ._find_any_text (
87
+ td , squash_spaces = squash_spaces
88
+ )
70
89
length = len (td_text )
71
90
if i in t_dict ['col_width' ]:
72
91
t_dict ['col_width' ][i ] = max (
@@ -85,10 +104,21 @@ def _parse_tables(self, soup):
85
104
tables .append (t_dict )
86
105
return tables
87
106
88
- def _replace_tables (self , soup , v_separator = ' | ' , h_separator = '-' ):
107
def _strip_tables(self, soup, squash_spaces=False):
    """Replace every <table> in *soup* with a plain-text <div>.

    Unlike :meth:`_replace_tables`, no ASCII table drawing is done:
    each row becomes one newline-terminated line of space-separated
    cell texts.  Useful when the extracted text feeds a full-text
    search engine and layout is irrelevant.

    :param soup: the BeautifulSoup document to mutate in place.
    :param squash_spaces: forwarded to the cell-text extraction.
    :returns: the (mutated) *soup*.
    """
    tables = self._parse_tables(soup, squash_spaces)
    for t in tables:
        # Build the whole replacement text with a single join instead of
        # repeated string concatenation.
        text = u''.join(
            u' '.join(td['text'] for td in tr) + u'\n'
            for tr in t['trs']
        )
        new_table = soup.new_tag('div')
        new_table.string = text
        t['table'].replace_with(new_table)
    return soup
117
+
118
+ def _replace_tables (self , soup , squash_spaces = False , v_separator = ' | ' , h_separator = '-' ):
89
119
"""Replaces <table> elements with its ASCII equivalent.
90
120
"""
91
- tables = self ._parse_tables (soup )
121
+ tables = self ._parse_tables (soup , squash_spaces )
92
122
v_sep_len = len (v_separator )
93
123
v_left_sep = v_separator .lstrip ()
94
124
for t in tables :
@@ -124,12 +154,21 @@ def _join_inlines(self, soup):
124
154
elem .unwrap ()
125
155
return soup
126
156
127
- def extract (self , filename , ** kwargs ):
157
+ def extract (
158
+ self ,
159
+ filename ,
160
+ strip_html_tables = False ,
161
+ squash_html_spaces = False ,
162
+ ** kwargs
163
+ ):
128
164
with open (filename , "rb" ) as stream :
129
165
soup = BeautifulSoup (stream , 'lxml' )
130
166
131
167
# Convert tables to ASCII ones
132
- soup = self ._replace_tables (soup )
168
+ if strip_html_tables :
169
+ soup = self ._strip_tables (soup , squash_spaces = squash_html_spaces )
170
+ else :
171
+ soup = self ._replace_tables (soup , squash_spaces = squash_html_spaces )
133
172
134
173
# Join inline elements
135
174
soup = self ._join_inlines (soup )
@@ -141,7 +180,9 @@ def extract(self, filename, **kwargs):
141
180
for elem in elements :
142
181
string = elem .string
143
182
if string is None :
144
- string = self ._find_any_text (elem )
183
+ string = self ._find_any_text (
184
+ elem , squash_spaces = squash_html_spaces
185
+ )
145
186
string = string .strip ()
146
187
if len (string ) > 0 :
147
188
html += "\n " + string + "\n "
0 commit comments