Skip to content

Commit

Permalink
ENH: Updated read_html to add option
Browse files Browse the repository at this point in the history
Adds an optional boolean parameter "remove_whitespace" to read_html that lets callers skip the whitespace cleanup normally applied to table cell text. It defaults to True for backward compatibility. See pandas-dev#24766

Co-authored-by: Romain Lebbadi-Breteau <[email protected]>
  • Loading branch information
Derekt2 and RomainL972 committed Feb 19, 2022
1 parent 0011846 commit b32a624
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored
remove_whitespace : bool
Whether table row values should have all whitespace replaced with a space.
.. versionadded:: 1.5.0
Attributes
----------
io : str or file-like
Expand All @@ -198,6 +202,10 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored
remove_whitespace : bool
Whether table row values should have all whitespace replaced with a space.
.. versionadded:: 1.5.0
Notes
-----
To subclass this class effectively you must override the following methods:
Expand All @@ -221,12 +229,14 @@ def __init__(
attrs: dict[str, str] | None,
encoding: str,
displayed_only: bool,
remove_whitespace: bool
):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only
self.remove_whitespace = remove_whitespace

def parse_tables(self):
"""
Expand Down Expand Up @@ -480,7 +490,10 @@ def _expand_colspan_rowspan(self, rows):
index += 1

# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
if self.remove_whitespace:
text = _remove_whitespace(self._text_getter(td))
else:
text = self._text_getter(td)
rowspan = int(self._attr_getter(td, "rowspan") or 1)
colspan = int(self._attr_getter(td, "colspan") or 1)

Expand Down Expand Up @@ -906,14 +919,14 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
def _parse(flavor, io, match, attrs, encoding, displayed_only, remove_whitespace, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding, displayed_only)
p = parser(io, compiled_match, attrs, encoding, displayed_only, remove_whitespace)

try:
tables = p.parse_tables()
Expand Down Expand Up @@ -964,6 +977,7 @@ def read_html(
na_values=None,
keep_default_na: bool = True,
displayed_only: bool = True,
remove_whitespace: bool = True,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
Expand Down Expand Up @@ -1058,6 +1072,10 @@ def read_html(
displayed_only : bool, default True
Whether elements with "display: none" should be parsed.
remove_whitespace : bool, default True
Whether table row values should have all whitespace replaced with a space.
.. versionadded:: 1.5.0
Returns
-------
dfs
Expand Down Expand Up @@ -1126,4 +1144,5 @@ def read_html(
na_values=na_values,
keep_default_na=keep_default_na,
displayed_only=displayed_only,
remove_whitespace=remove_whitespace,
)

0 comments on commit b32a624

Please sign in to comment.