Skip to content

Commit f283484

Browse files
Eugenio-BAYE, PeterStaar-IBM, Copilot
authored
fix: handle empty CSV file without crashing (#3196)
* fix: handle empty CSV file without crashing

  Fixes #3195

  Accessing self.csv_data[0] on an empty CSV raised an IndexError. Moved the column uniformity check inside the existing if/else on csv_data so empty files log a warning and return an empty document.

  Signed-off-by: Eugenio-BAYE <baye.eugenio.egnb@gmail.com>

* Apply suggestions from code review

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
  Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

---------

Signed-off-by: Eugenio-BAYE <baye.eugenio.egnb@gmail.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Co-authored-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 53412ed commit f283484

File tree

2 files changed

+27
-12
lines changed

2 files changed

+27
-12
lines changed

docling/backend/csv_backend.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,6 @@ def convert(self) -> DoclingDocument:
7777
self.csv_data = list(result)
7878
_log.info(f"Detected {len(self.csv_data)} lines")
7979

80-
# Ensure uniform column length
81-
expected_length = len(self.csv_data[0])
82-
is_uniform = all(len(row) == expected_length for row in self.csv_data)
83-
if not is_uniform:
84-
warnings.warn(
85-
f"Inconsistent column lengths detected in CSV data. "
86-
f"Expected {expected_length} columns, but found rows with varying lengths. "
87-
f"Ensure all rows have the same number of columns."
88-
)
89-
9080
# Parse the CSV into a structured document model
9181
origin = DocumentOrigin(
9282
filename=self.file.name or "file.csv",
@@ -98,7 +88,18 @@ def convert(self) -> DoclingDocument:
9888

9989
if self.is_valid():
10090
# Convert CSV data to table
101-
if self.csv_data:
91+
if not self.csv_data:
92+
_log.warning("CSV file is empty, returning empty document.")
93+
else:
94+
expected_length = len(self.csv_data[0])
95+
is_uniform = all(len(row) == expected_length for row in self.csv_data)
96+
if not is_uniform:
97+
warnings.warn(
98+
f"Inconsistent column lengths detected in CSV data. "
99+
f"Expected {expected_length} columns, but found rows with varying lengths. "
100+
f"Ensure all rows have the same number of columns."
101+
)
102+
102103
num_rows = len(self.csv_data)
103104
num_cols = max(len(row) for row in self.csv_data)
104105

tests/test_backend_csv.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1+
from io import BytesIO
12
from pathlib import Path
23

34
from pytest import warns
45

5-
from docling.datamodel.base_models import InputFormat
6+
from docling.datamodel.base_models import DocumentStream, InputFormat
67
from docling.datamodel.document import ConversionResult, DoclingDocument
78
from docling.document_converter import DocumentConverter
89

@@ -85,3 +86,16 @@ def test_e2e_invalid_csv_conversions():
8586
print(f"converting {csv_inconsistent_header}")
8687
with warns(UserWarning, match="Inconsistent column lengths"):
8788
converter.convert(csv_inconsistent_header)
89+
90+
91+
def test_empty_csv():
92+
"""Regression test: converting an empty CSV file should not raise an IndexError."""
93+
conv_result = get_converter().convert(
94+
DocumentStream(name="empty.csv", stream=BytesIO(b"")),
95+
raises_on_error=True,
96+
)
97+
doc = conv_result.document
98+
assert doc is not None
99+
# The empty CSV should result in an empty document (no tables and no texts).
100+
assert len(getattr(doc, "tables", [])) == 0
101+
assert len(getattr(doc, "texts", [])) == 0

0 commit comments

Comments
 (0)