Skip to content

Commit 1ccb321

Browse files
committed
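CSVReader: should bring speed improvements (bulk buffer scanning in readRow(), default buffer raised from 8192 to 64*1024 chars); adds CSVReaderTest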
1 parent ef9477c commit 1ccb321

2 files changed

Lines changed: 250 additions & 65 deletions

File tree

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package com.github.oeuvres.alix.util;
2+
3+
import static org.junit.jupiter.api.Assertions.*;
4+
5+
import java.io.StringReader;
6+
7+
import org.junit.jupiter.api.Test;
8+
9+
class CSVReaderTest {
10+
11+
@Test
12+
void readsSimpleUnquotedRow() throws Exception {
13+
CSVReader csv = new CSVReader(new StringReader("a,b,c\n"));
14+
15+
assertTrue(csv.readRow());
16+
assertEquals(3, csv.getCellCount());
17+
assertEquals("a", csv.getCellAsString(0));
18+
assertEquals("b", csv.getCellAsString(1));
19+
assertEquals("c", csv.getCellAsString(2));
20+
21+
assertFalse(csv.readRow());
22+
}
23+
24+
@Test
25+
void readsLastRowWithoutTrailingNewline() throws Exception {
26+
CSVReader csv = new CSVReader(new StringReader("a,b,c"));
27+
28+
assertTrue(csv.readRow());
29+
assertEquals(3, csv.getCellCount());
30+
assertEquals("a", csv.getCellAsString(0));
31+
assertEquals("b", csv.getCellAsString(1));
32+
assertEquals("c", csv.getCellAsString(2));
33+
34+
assertFalse(csv.readRow());
35+
}
36+
37+
@Test
38+
void handlesEmptyCells() throws Exception {
39+
CSVReader csv = new CSVReader(new StringReader(",,\n"));
40+
41+
assertTrue(csv.readRow());
42+
assertEquals(3, csv.getCellCount());
43+
assertEquals("", csv.getCellAsString(0));
44+
assertEquals("", csv.getCellAsString(1));
45+
assertEquals("", csv.getCellAsString(2));
46+
}
47+
48+
@Test
49+
void handlesQuotedSeparatorAndEmbeddedNewline() throws Exception {
50+
// second cell contains an embedded newline
51+
CSVReader csv = new CSVReader(new StringReader("\"a,b\",\"c\nd\"\n"));
52+
53+
assertTrue(csv.readRow());
54+
assertEquals(2, csv.getCellCount());
55+
assertEquals("a,b", csv.getCellAsString(0));
56+
assertEquals("c\nd", csv.getCellAsString(1));
57+
}
58+
59+
@Test
60+
void handlesEscapedQuotesInsideQuotedField() throws Exception {
61+
CSVReader csv = new CSVReader(new StringReader("\"a\"\"b\",x\n"));
62+
63+
assertTrue(csv.readRow());
64+
assertEquals(2, csv.getCellCount());
65+
assertEquals("a\"b", csv.getCellAsString(0));
66+
assertEquals("x", csv.getCellAsString(1));
67+
}
68+
69+
@Test
70+
void supportsCRLFAndCRLineEndings() throws Exception {
71+
CSVReader csv = new CSVReader(new StringReader("a,b\r\nc,d\re,f\n"));
72+
73+
assertTrue(csv.readRow());
74+
assertEquals(2, csv.getCellCount());
75+
assertEquals("a", csv.getCellAsString(0));
76+
assertEquals("b", csv.getCellAsString(1));
77+
78+
assertTrue(csv.readRow());
79+
assertEquals(2, csv.getCellCount());
80+
assertEquals("c", csv.getCellAsString(0));
81+
assertEquals("d", csv.getCellAsString(1));
82+
83+
assertTrue(csv.readRow());
84+
assertEquals(2, csv.getCellCount());
85+
assertEquals("e", csv.getCellAsString(0));
86+
assertEquals("f", csv.getCellAsString(1));
87+
88+
assertFalse(csv.readRow());
89+
}
90+
91+
@Test
92+
void skipsUtf8BomAtStartOfStream() throws Exception {
93+
CSVReader csv = new CSVReader(new StringReader("\uFEFFa,b\n"));
94+
95+
assertTrue(csv.readRow());
96+
assertEquals(2, csv.getCellCount());
97+
assertEquals("a", csv.getCellAsString(0));
98+
assertEquals("b", csv.getCellAsString(1));
99+
}
100+
101+
@Test
102+
void getCellAsStringIsSnapshotNotAffectedByNextReadRow() throws Exception {
103+
CSVReader csv = new CSVReader(new StringReader("a,b\nc,d\n"));
104+
105+
assertTrue(csv.readRow());
106+
String a0 = csv.getCellAsString(0); // snapshot
107+
assertEquals("a", a0);
108+
109+
assertTrue(csv.readRow());
110+
assertEquals("c", csv.getCellAsString(0));
111+
112+
// Snapshot must remain unchanged
113+
assertEquals("a", a0);
114+
}
115+
116+
@Test
117+
void getCellThrowsOnInvalidIndex() throws Exception {
118+
CSVReader csv = new CSVReader(new StringReader("a,b\n"));
119+
120+
assertTrue(csv.readRow());
121+
assertThrows(IndexOutOfBoundsException.class, () -> csv.getCell(-1));
122+
assertThrows(IndexOutOfBoundsException.class, () -> csv.getCell(2));
123+
}
124+
125+
@Test
126+
void cellMaxLimitsReturnedCells() throws Exception {
127+
// Intended semantics: return only first 2 cells, ignore the rest of the row.
128+
CSVReader csv = new CSVReader(new StringReader("a,b,c\n"), ',', 2);
129+
130+
assertTrue(csv.readRow());
131+
assertEquals(2, csv.getCellCount());
132+
assertEquals("a", csv.getCellAsString(0));
133+
assertEquals("b", csv.getCellAsString(1));
134+
}
135+
}

util/src/java/com/github/oeuvres/alix/util/CSVReader.java

Lines changed: 115 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ public CSVReader(Reader in, char separator)
216216
*/
217217
public CSVReader(Reader in, char separator, final int cellMax)
218218
{
219-
this(in, separator, cellMax, '"', 8192);
219+
this(in, separator, cellMax, '"', 64*1024);
220220
}
221221

222222
/**
@@ -281,13 +281,10 @@ public CSVReader(final Reader in, final char separator, final int cellMax, final
281281
* @throws IOException if an I/O error occurs while reading from the underlying
282282
* {@link Reader}
283283
*/
284-
public boolean readRow() throws IOException
285-
{
286-
if (eof) {
287-
return false;
288-
}
284+
public boolean readRow() throws IOException {
285+
if (eof) return false;
289286

290-
// Reset row state
287+
// Ensure we have at least one cell buffer
291288
cellCount = 1;
292289
ensureCellCapacity(1);
293290
StringBuilder cell = cells.get(0);
@@ -297,97 +294,150 @@ public boolean readRow() throws IOException
297294
boolean atCellStart = true;
298295
boolean sawAny = false;
299296

300-
for (;;) {
297+
// Handle BOM once at stream start (faster than checking per char)
298+
if (atStart) {
299+
atStart = false;
300+
if (bufPos >= bufLen && !fill()) {
301+
eof = true;
302+
cellCount = 0;
303+
return false;
304+
}
305+
if (bufLen > 0 && buf[bufPos] == '\uFEFF') {
306+
bufPos++;
307+
}
308+
}
301309

310+
for (;;) {
302311
if (bufPos >= bufLen) {
303312
if (!fill()) {
304-
// Reached EOF
305313
eof = true;
306314
if (!sawAny && atCellStart) {
307-
// No data at all for a new row
308315
cellCount = 0;
309316
return false;
310317
}
311-
// Return last (possibly unterminated) row
312-
return true;
318+
return true; // last unterminated row
313319
}
314320
}
315321

316-
char c = buf[bufPos++];
322+
if (inQuotes) {
323+
// Scan until next quote; append chunks in bulk
324+
int start = bufPos;
325+
while (true) {
326+
while (bufPos < bufLen && buf[bufPos] != quote) {
327+
bufPos++;
328+
}
317329

318-
// Handle optional BOM only on first character of stream
319-
if (atStart) {
320-
atStart = false;
321-
if (c == '\uFEFF') {
322-
// Skip BOM and continue with the next character
323-
continue;
324-
}
325-
}
330+
// append chunk before quote
331+
if (cell != null && bufPos > start) {
332+
cell.append(buf, start, bufPos - start);
333+
}
334+
if (bufPos < bufLen) sawAny = true; // we saw something in this buffer
335+
336+
if (bufPos >= bufLen) {
337+
// need more data
338+
if (!fill()) {
339+
eof = true;
340+
// treat EOF as end-of-field/row
341+
return true;
342+
}
343+
start = bufPos;
344+
continue;
345+
}
326346

327-
sawAny = true;
347+
// buf[bufPos] is quote
348+
bufPos++; // consume quote
328349

329-
if (inQuotes) {
330-
// Inside a quoted field
331-
if (c == quote) {
332-
// Possible closing quote or escaped quote ""
333-
if (bufPos >= bufLen && !fill()) {
334-
// Treat as closing quote at EOF
335-
inQuotes = false;
336-
continue;
350+
// Lookahead for escaped quote
351+
if (bufPos >= bufLen) {
352+
if (!fill()) {
353+
eof = true;
354+
inQuotes = false;
355+
return true;
356+
}
337357
}
338358
if (bufPos < bufLen && buf[bufPos] == quote) {
339-
// Escaped double-quote: "" -> "
340-
cell.append(quote);
341-
bufPos++;
342-
} else {
343-
// Closing quote
344-
inQuotes = false;
359+
// escaped quote
360+
if (cell != null) cell.append(quote);
361+
bufPos++; // consume second quote
362+
start = bufPos;
363+
continue;
345364
}
346-
} else {
347-
cell.append(c);
365+
366+
// closing quote
367+
inQuotes = false;
368+
atCellStart = false;
369+
break;
348370
}
349371
continue;
350372
}
351373

352374
// Outside quotes
353-
if (c == '\n') {
354-
// Linefeed: end of row
355-
return true;
356-
} else if (c == '\r') {
357-
// Carriage return: may be CRLF
358-
if (bufPos >= bufLen && !fill()) {
359-
// CR at EOF
360-
eof = true;
361-
return true;
362-
}
363-
if (bufPos < bufLen && buf[bufPos] == '\n') {
364-
// Consume the LF in CRLF
365-
bufPos++;
366-
}
367-
return true;
368-
} else if (c == quote && atCellStart) {
369-
// Opening quote at the start of a cell
375+
char c = buf[bufPos];
376+
377+
// Opening quote only if it's the first char of the cell
378+
if (atCellStart && c == quote) {
370379
inQuotes = true;
380+
bufPos++;
381+
sawAny = true;
371382
atCellStart = false;
372-
}
373-
if (cellMax > 0 && cellMax > cellCount) {
374-
// fixed count of cells, no more cells needed, let loop till endOfLine
375383
continue;
376-
} else if (c == separator) {
377-
// End of current cell; start a new cell
384+
}
385+
386+
// Scan unquoted run until separator or newline
387+
int start = bufPos;
388+
while (bufPos < bufLen) {
389+
c = buf[bufPos];
390+
if (c == separator || c == '\n' || c == '\r') break;
391+
bufPos++;
392+
}
393+
if (cell != null && bufPos > start) {
394+
cell.append(buf, start, bufPos - start);
395+
}
396+
if (bufPos > start) sawAny = true;
397+
atCellStart = false;
398+
399+
if (bufPos >= bufLen) continue; // refill and continue
400+
401+
// Consume delimiter
402+
c = buf[bufPos++];
403+
sawAny = true;
404+
405+
if (c == separator) {
406+
atCellStart = true;
407+
408+
// Enforce "return at most cellMax cells"
409+
if (cellMax > 0 && cellCount >= cellMax) {
410+
cell = null; // discard subsequent cells' content
411+
inQuotes = false; // quote handling continues via state machine
412+
continue;
413+
}
414+
378415
ensureCellCapacity(cellCount + 1);
379416
cell = cells.get(cellCount);
380417
cell.setLength(0);
381418
cellCount++;
382-
atCellStart = true;
383-
} else {
384-
// Regular character in an unquoted cell
385-
cell.append(c);
386-
atCellStart = false;
419+
continue;
420+
}
421+
422+
if (c == '\n') {
423+
return true;
387424
}
425+
426+
// c == '\r': handle CRLF
427+
if (bufPos >= bufLen) {
428+
if (!fill()) {
429+
eof = true;
430+
return true;
431+
}
432+
}
433+
if (bufPos < bufLen && buf[bufPos] == '\n') {
434+
bufPos++;
435+
}
436+
return true;
388437
}
389438
}
390439

440+
391441
/**
392442
* Returns the number of cells in the last row successfully read by
393443
* {@link #readRow()}.

0 commit comments

Comments
 (0)