@@ -180,6 +180,10 @@ class _HtmlFrameParser:
180
180
displayed_only : bool
181
181
Whether or not items with "display:none" should be ignored
182
182
183
+ remove_whitespace : bool
184
+ Whether table row values should have all whitespace replaced with a space.
185
+ .. versionadded:: 1.5.0
186
+
183
187
Attributes
184
188
----------
185
189
io : str or file-like
@@ -198,6 +202,10 @@ class _HtmlFrameParser:
198
202
displayed_only : bool
199
203
Whether or not items with "display:none" should be ignored
200
204
205
+ remove_whitespace : bool
206
+ Whether table row values should have all whitespace replaced with a space.
207
+ .. versionadded:: 1.5.0
208
+
201
209
Notes
202
210
-----
203
211
To subclass this class effectively you must override the following methods:
@@ -221,12 +229,14 @@ def __init__(
221
229
attrs : dict [str , str ] | None ,
222
230
encoding : str ,
223
231
displayed_only : bool ,
232
+ remove_whitespace : bool
224
233
):
225
234
self .io = io
226
235
self .match = match
227
236
self .attrs = attrs
228
237
self .encoding = encoding
229
238
self .displayed_only = displayed_only
239
+ self .remove_whitespace = remove_whitespace
230
240
231
241
def parse_tables (self ):
232
242
"""
@@ -480,7 +490,10 @@ def _expand_colspan_rowspan(self, rows):
480
490
index += 1
481
491
482
492
# Append the text from this <td>, colspan times
483
- text = _remove_whitespace (self ._text_getter (td ))
493
+ if self .remove_whitespace :
494
+ text = _remove_whitespace (self ._text_getter (td ))
495
+ else :
496
+ text = self ._text_getter (td )
484
497
rowspan = int (self ._attr_getter (td , "rowspan" ) or 1 )
485
498
colspan = int (self ._attr_getter (td , "colspan" ) or 1 )
486
499
@@ -906,14 +919,14 @@ def _validate_flavor(flavor):
906
919
return flavor
907
920
908
921
909
- def _parse (flavor , io , match , attrs , encoding , displayed_only , ** kwargs ):
922
+ def _parse (flavor , io , match , attrs , encoding , displayed_only , remove_whitespace , ** kwargs ):
910
923
flavor = _validate_flavor (flavor )
911
924
compiled_match = re .compile (match ) # you can pass a compiled regex here
912
925
913
926
retained = None
914
927
for flav in flavor :
915
928
parser = _parser_dispatch (flav )
916
- p = parser (io , compiled_match , attrs , encoding , displayed_only )
929
+ p = parser (io , compiled_match , attrs , encoding , displayed_only , remove_whitespace )
917
930
918
931
try :
919
932
tables = p .parse_tables ()
@@ -964,6 +977,7 @@ def read_html(
964
977
na_values = None ,
965
978
keep_default_na : bool = True ,
966
979
displayed_only : bool = True ,
980
+ remove_whitespace : bool = True ,
967
981
) -> list [DataFrame ]:
968
982
r"""
969
983
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1072,10 @@ def read_html(
1058
1072
displayed_only : bool, default True
1059
1073
Whether elements with "display: none" should be parsed.
1060
1074
1075
+ remove_whitespace : bool, default True
1076
+ Whether table row values should have all whitespace replaced with a space.
1077
+ .. versionadded:: 1.5.0
1078
+
1061
1079
Returns
1062
1080
-------
1063
1081
dfs
@@ -1126,4 +1144,5 @@ def read_html(
1126
1144
na_values = na_values ,
1127
1145
keep_default_na = keep_default_na ,
1128
1146
displayed_only = displayed_only ,
1147
+ remove_whitespace = remove_whitespace ,
1129
1148
)
0 commit comments