Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
tables = pd.read_html("https://en.wikipedia.org/wiki/Kyrgyz_alphabets")
Issue Description
The problem arises because of a malformed `colspan` attribute found in a Wikipedia table, which has a value of `2;` instead of `2`.
The issue stems from this line:
Line 514 in 85be99e
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[16], line 1
----> 1 tables = pd.read_html("https://en.wikipedia.org/wiki/Kyrgyz_alphabets", converters=defaultdict(lambda: str))
File ~/.local/lib/python3.12/site-packages/pandas/io/html.py:1240, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend, storage_options)
1224 if isinstance(io, str) and not any(
1225 [
1226 is_file_like(io),
(...)
1230 ]
1231 ):
1232 warnings.warn(
1233 "Passing literal html to 'read_html' is deprecated and "
1234 "will be removed in a future version. To read from a "
(...)
1237 stacklevel=find_stack_level(),
1238 )
-> 1240 return _parse(
1241 flavor=flavor,
1242 io=io,
1243 match=match,
1244 header=header,
1245 index_col=index_col,
1246 skiprows=skiprows,
1247 parse_dates=parse_dates,
1248 thousands=thousands,
1249 attrs=attrs,
1250 encoding=encoding,
1251 decimal=decimal,
1252 converters=converters,
1253 na_values=na_values,
1254 keep_default_na=keep_default_na,
1255 displayed_only=displayed_only,
1256 extract_links=extract_links,
1257 dtype_backend=dtype_backend,
1258 storage_options=storage_options,
1259 )
File ~/.local/lib/python3.12/site-packages/pandas/io/html.py:1006, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, storage_options, **kwargs)
1003 raise retained
1005 ret = []
-> 1006 for table in tables:
1007 try:
1008 df = _data_to_frame(data=table, **kwargs)
File ~/.local/lib/python3.12/site-packages/pandas/io/html.py:250, in <genexpr>(.0)
242 """
243 Parse and return all tables from the DOM.
244
(...)
247 list of parsed (header, body, footer) tuples from tables.
248 """
249 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
--> 250 return (self._parse_thead_tbody_tfoot(table) for table in tables)
File ~/.local/lib/python3.12/site-packages/pandas/io/html.py:465, in _HtmlFrameParser._parse_thead_tbody_tfoot(self, table_html)
462 header_rows.append(body_rows.pop(0))
464 header = self._expand_colspan_rowspan(header_rows, section="header")
--> 465 body = self._expand_colspan_rowspan(body_rows, section="body")
466 footer = self._expand_colspan_rowspan(footer_rows, section="footer")
468 return header, body, footer
File ~/.local/lib/python3.12/site-packages/pandas/io/html.py:521, in _HtmlFrameParser._expand_colspan_rowspan(self, rows, section)
519 text = (text, href)
520 rowspan = int(self._attr_getter(td, "rowspan") or 1)
--> 521 colspan = int(self._attr_getter(td, "colspan") or 1)
523 for _ in range(colspan):
524 texts.append(text)
ValueError: invalid literal for int() with base 10: '2;'
Expected Behavior
The expected behavior is to sanitize the malformed `colspan` value and proceed with `2`.
Installed Versions
INSTALLED VERSIONS
commit : d9cdd2e
python : 3.12.5.final.0
python-bits : 64
OS : Linux
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.2.2
numpy : 1.26.4
pytz : 2024.1
dateutil : 2.9.0
setuptools : 69.5.1
pip : 24.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 5.2.1
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.1.4
IPython : 8.24.0
pandas_datareader : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
bottleneck : None
dataframe-api-compat : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.9.1
numba : None
numexpr : None
odfpy : None
openpyxl : 3.1.5
pandas_gbq : None
pyarrow : None
pyreadstat : None
python-calamine : None
pyxlsb : None
s3fs : None
scipy : 1.14.0
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : 2024.1
qtpy : 2.4.1
pyqt5 : None