Skip to content

Commit b32a624

Browse files
Derekt2 and RomainL972 committed
ENH: Updated read_html to add option
Adds an optional boolean parameter "remove_whitespace" to read_html that lets callers skip the whitespace-removal step applied to table cell text. Defaults to True for backward compatibility. See pandas-dev#24766. Co-authored-by: Romain Lebbadi-Breteau <[email protected]>
1 parent 0011846 commit b32a624

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

pandas/io/html.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ class _HtmlFrameParser:
180180
displayed_only : bool
181181
Whether or not items with "display:none" should be ignored
182182
183+
remove_whitespace : bool
184+
Whether table row values should have all whitespace replaced with a space.
185+
.. versionadded:: 1.5.0
186+
183187
Attributes
184188
----------
185189
io : str or file-like
@@ -198,6 +202,10 @@ class _HtmlFrameParser:
198202
displayed_only : bool
199203
Whether or not items with "display:none" should be ignored
200204
205+
remove_whitespace : bool
206+
Whether table row values should have all whitespace replaced with a space.
207+
.. versionadded:: 1.5.0
208+
201209
Notes
202210
-----
203211
To subclass this class effectively you must override the following methods:
@@ -221,12 +229,14 @@ def __init__(
221229
attrs: dict[str, str] | None,
222230
encoding: str,
223231
displayed_only: bool,
232+
remove_whitespace: bool
224233
):
225234
self.io = io
226235
self.match = match
227236
self.attrs = attrs
228237
self.encoding = encoding
229238
self.displayed_only = displayed_only
239+
self.remove_whitespace = remove_whitespace
230240

231241
def parse_tables(self):
232242
"""
@@ -480,7 +490,10 @@ def _expand_colspan_rowspan(self, rows):
480490
index += 1
481491

482492
# Append the text from this <td>, colspan times
483-
text = _remove_whitespace(self._text_getter(td))
493+
if self.remove_whitespace:
494+
text = _remove_whitespace(self._text_getter(td))
495+
else:
496+
text = self._text_getter(td)
484497
rowspan = int(self._attr_getter(td, "rowspan") or 1)
485498
colspan = int(self._attr_getter(td, "colspan") or 1)
486499

@@ -906,14 +919,14 @@ def _validate_flavor(flavor):
906919
return flavor
907920

908921

909-
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
922+
def _parse(flavor, io, match, attrs, encoding, displayed_only, remove_whitespace, **kwargs):
910923
flavor = _validate_flavor(flavor)
911924
compiled_match = re.compile(match) # you can pass a compiled regex here
912925

913926
retained = None
914927
for flav in flavor:
915928
parser = _parser_dispatch(flav)
916-
p = parser(io, compiled_match, attrs, encoding, displayed_only)
929+
p = parser(io, compiled_match, attrs, encoding, displayed_only, remove_whitespace)
917930

918931
try:
919932
tables = p.parse_tables()
@@ -964,6 +977,7 @@ def read_html(
964977
na_values=None,
965978
keep_default_na: bool = True,
966979
displayed_only: bool = True,
980+
remove_whitespace: bool = True,
967981
) -> list[DataFrame]:
968982
r"""
969983
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1072,10 @@ def read_html(
10581072
displayed_only : bool, default True
10591073
Whether elements with "display: none" should be parsed.
10601074
1075+
remove_whitespace : bool, default True
1076+
Whether table row values should have all whitespace replaced with a space.
1077+
.. versionadded:: 1.5.0
1078+
10611079
Returns
10621080
-------
10631081
dfs
@@ -1126,4 +1144,5 @@ def read_html(
11261144
na_values=na_values,
11271145
keep_default_na=keep_default_na,
11281146
displayed_only=displayed_only,
1147+
remove_whitespace=remove_whitespace,
11291148
)

0 commit comments

Comments
 (0)