Skip to content

Commit dbb4d21

Browse files
committed
Add options to minimize parsed html text
1 parent e500168 commit dbb4d21

File tree

1 file changed

+49
-8
lines changed

1 file changed

+49
-8
lines changed

Diff for: textract/parsers/html_parser.py

+49-8
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,22 @@
66
from .utils import BaseParser
77

88

9+
HTML_TAG_RE = re.compile(r'(<[^>]+>)')
10+
HTML_SPACE_SQUASH_RE = re.compile(r'\s+')
11+
HTML_SPACE_RE = re.compile(r'\s')
12+
13+
914
class Parser(BaseParser):
1015
"""Extract text from html file using beautifulsoup4. Filter text to
1116
only show the visible parts of the page. Inspiration from `here
1217
<http://stackoverflow.com/a/1983219/564709>`_.
18+
By default it preserves spaces and tries to render tables with ASCII
19+
symbols '|' and '-'. This may be unnecessary if you only want to, for example,
20+
extract the text and feed it into a full-text search engine.
21+
To replace several spaces with single one add option
22+
`squash_html_spaces=True` to `textract.process` function.
23+
To not render tables (just extract text) add an argument
24+
`strip_html_tables=True` to `textract.process`.
1325
"""
1426

1527
_disallowed_names = [
@@ -41,18 +53,23 @@ def _inline(self, element):
4153
return True
4254
return False
4355

44-
def _find_any_text(self, tag):
56+
def _find_any_text(self, tag, squash_spaces=False):
4557
"""Looks for any possible text within given tag.
4658
"""
4759
text = ''
4860
if tag is not None:
4961
text = six.text_type(tag)
5062
text = re.sub(r'(<[^>]+>)', '', text)
5163
text = re.sub(r'\s', ' ', text)
64+
text = re.sub(HTML_TAG_RE, '', text)
65+
if squash_spaces:
66+
text = re.sub(HTML_SPACE_SQUASH_RE, ' ', text)
67+
else:
68+
text = re.sub(HTML_SPACE_RE, ' ', text)
5269
text = text.strip()
5370
return text
5471

55-
def _parse_tables(self, soup):
72+
def _parse_tables(self, soup, squash_spaces):
5673
"""Returns array containing basic informations about tables for ASCII
5774
replacement (look: _replace_tables()).
5875
"""
@@ -66,7 +83,9 @@ def _parse_tables(self, soup):
6683
tds = tr.find_all('th') + tr.find_all('td')
6784
if len(tds) > 0:
6885
for i, td in enumerate(tds):
69-
td_text = self._find_any_text(td)
86+
td_text = self._find_any_text(
87+
td, squash_spaces=squash_spaces
88+
)
7089
length = len(td_text)
7190
if i in t_dict['col_width']:
7291
t_dict['col_width'][i] = max(
@@ -85,10 +104,21 @@ def _parse_tables(self, soup):
85104
tables.append(t_dict)
86105
return tables
87106

88-
def _replace_tables(self, soup, v_separator=' | ', h_separator='-'):
107+
def _strip_tables(self, soup, squash_spaces=False):
108+
tables = self._parse_tables(soup, squash_spaces)
109+
for t in tables:
110+
html = ''
111+
for tr in t['trs']:
112+
html += u'{0}\n'.format(u' '.join(td['text'] for td in tr))
113+
new_table = soup.new_tag('div')
114+
new_table.string = html
115+
t['table'].replace_with(new_table)
116+
return soup
117+
118+
def _replace_tables(self, soup, squash_spaces=False, v_separator=' | ', h_separator='-'):
89119
"""Replaces <table> elements with its ASCII equivalent.
90120
"""
91-
tables = self._parse_tables(soup)
121+
tables = self._parse_tables(soup, squash_spaces)
92122
v_sep_len = len(v_separator)
93123
v_left_sep = v_separator.lstrip()
94124
for t in tables:
@@ -124,12 +154,21 @@ def _join_inlines(self, soup):
124154
elem.unwrap()
125155
return soup
126156

127-
def extract(self, filename, **kwargs):
157+
def extract(
158+
self,
159+
filename,
160+
strip_html_tables=False,
161+
squash_html_spaces=False,
162+
**kwargs
163+
):
128164
with open(filename, "rb") as stream:
129165
soup = BeautifulSoup(stream, 'lxml')
130166

131167
# Convert tables to ASCII ones
132-
soup = self._replace_tables(soup)
168+
if strip_html_tables:
169+
soup = self._strip_tables(soup, squash_spaces=squash_html_spaces)
170+
else:
171+
soup = self._replace_tables(soup, squash_spaces=squash_html_spaces)
133172

134173
# Join inline elements
135174
soup = self._join_inlines(soup)
@@ -141,7 +180,9 @@ def extract(self, filename, **kwargs):
141180
for elem in elements:
142181
string = elem.string
143182
if string is None:
144-
string = self._find_any_text(elem)
183+
string = self._find_any_text(
184+
elem, squash_spaces=squash_html_spaces
185+
)
145186
string = string.strip()
146187
if len(string) > 0:
147188
html += "\n" + string + "\n"

0 commit comments

Comments
 (0)