Skip to content

Commit 5f4b22a

Browse files
committed
Drop unneeded text replacement in HTML parser
1 parent dbb4d21 commit 5f4b22a

File tree

1 file changed

+4
-6
lines changed

1 file changed

+4
-6
lines changed

textract/parsers/html_parser.py

+4 -6
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@
77

88

99
HTML_TAG_RE = re.compile(r'(<[^>]+>)')
10-
HTML_SPACE_SQUASH_RE = re.compile(r'\s+')
11-
HTML_SPACE_RE = re.compile(r'\s')
10+
SPACE_SQUASH_RE = re.compile(r'\s+')
11+
SPACE_RE = re.compile(r'\s')
1212

1313

1414
class Parser(BaseParser):
@@ -59,13 +59,11 @@ def _find_any_text(self, tag, squash_spaces=False):
5959
text = ''
6060
if tag is not None:
6161
text = six.text_type(tag)
62-
text = re.sub(r'(<[^>]+>)', '', text)
63-
text = re.sub(r'\s', ' ', text)
6462
text = re.sub(HTML_TAG_RE, '', text)
6563
if squash_spaces:
66-
text = re.sub(HTML_SPACE_SQUASH_RE, ' ', text)
64+
text = re.sub(SPACE_SQUASH_RE, ' ', text)
6765
else:
68-
text = re.sub(HTML_SPACE_RE, ' ', text)
66+
text = re.sub(SPACE_RE, ' ', text)
6967
text = text.strip()
7068
return text
7169

0 commit comments

Comments
 (0)