Skip to content

Commit 79b544f

Browse files
Merge pull request #168 from data-integrations/cherrypick-schema-issue
[Cherrypick] [PLUGIN-1775] Changes done for schema detection bug while using enable quotes with csv format.
2 parents a83c587 + 16e510c commit 79b544f

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

src/main/java/io/cdap/plugin/http/source/common/DelimitedSchemaDetector.java

+23-1
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@
2020
import io.cdap.cdap.etl.api.FailureCollector;
2121
import io.cdap.plugin.format.delimited.common.DataTypeDetectorStatusKeeper;
2222
import io.cdap.plugin.format.delimited.common.DataTypeDetectorUtils;
23+
import io.cdap.plugin.format.delimited.input.SplitQuotesIterator;
2324
import io.cdap.plugin.http.source.batch.HttpBatchSourceConfig;
2425

26+
import java.util.ArrayList;
2527
import java.util.HashMap;
28+
import java.util.Iterator;
2629
import java.util.List;
2730

2831
/**
@@ -39,7 +42,7 @@ public static Schema detectSchema(HttpBatchSourceConfig config, String delimiter
3942
try {
4043
for (int rowIndex = 0; rowIndex < sampleSize && rawStringPerLine.hasNext(); rowIndex++) {
4144
line = rawStringPerLine.next();
42-
rowValue = line.split(delimiter, -1);
45+
rowValue = getRowValues(line, config.getEnableQuotesValues(), delimiter);
4346
if (rowIndex == 0) {
4447
columnNames = DataTypeDetectorUtils.setColumnNames(line, config.getCsvSkipFirstRow(),
4548
config.getEnableQuotesValues(), delimiter);
@@ -61,4 +64,23 @@ public static Schema detectSchema(HttpBatchSourceConfig config, String delimiter
6164
new HashMap<>(), columnNames, dataTypeDetectorStatusKeeper);
6265
return Schema.recordOf("text", fields);
6366
}
67+
68+
/**
69+
* @param rawLine line to parse and find out the exact number of columns in a row.
70+
* @param enableQuotedValues flag whether file can contain Quoted values.
71+
* @param delimiter delimiter for the file
72+
* @return Array of all the column values within the provided row.
73+
*/
74+
public static String[] getRowValues(String rawLine, boolean enableQuotedValues, String delimiter) {
75+
if (!enableQuotedValues) {
76+
return rawLine.split(delimiter, -1);
77+
} else {
78+
Iterator<String> splitsIterator = new SplitQuotesIterator(rawLine, delimiter, null, false);
79+
List<String> rowValues = new ArrayList<>();
80+
while (splitsIterator.hasNext()) {
81+
rowValues.add(splitsIterator.next());
82+
}
83+
return rowValues.toArray(new String[rowValues.size()]);
84+
}
85+
}
6486
}

src/test/java/io/cdap/plugin/http/source/common/DelimitedSchemaDetectorTest.java

+23
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public class DelimitedSchemaDetectorTest {
3535
RawStringPerLine rawStringPerLineIterator;
3636
HttpBatchSourceConfig configSkipHeaderTrue;
3737
HttpBatchSourceConfig configSkipHeaderFalse;
38+
HttpBatchSourceConfig configSkipHeaderTrueAndQuotesEnabled;
3839
String csvDelimiter = ",";
3940
String tsvDelimiter = "\t";
4041

@@ -46,6 +47,8 @@ public void setUp() {
4647
MockitoAnnotations.initMocks(this);
4748
configSkipHeaderTrue = HttpBatchSourceConfig.builder().setCsvSkipFirstRow("true").build();
4849
configSkipHeaderFalse = HttpBatchSourceConfig.builder().setCsvSkipFirstRow("false").build();
50+
configSkipHeaderTrueAndQuotesEnabled = HttpBatchSourceConfig.builder().setCsvSkipFirstRow("true")
51+
.setEnableQuotesValues(true).build();
4952
expectedSchemaWithHeaders = Schema.recordOf("text",
5053
Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
5154
Schema.Field.of("age", Schema.of(Schema.Type.INT)),
@@ -101,4 +104,24 @@ public void testDetectSchemaTsvHeader() throws IOException {
101104
Assert.assertEquals(expectedSchemaWithHeaders, schema);
102105
}
103106

107+
@Test
108+
public void testDetectSchemaWithQuotesEnabled() throws IOException {
109+
String[] lines = new String[]{"name,age,isIndian,country", "\"raj,singh\",29,true,india", "rahul,30,false,"};
110+
Mockito.when(rawStringPerLineIterator.hasNext()).thenReturn(true, true, true, false);
111+
Mockito.when(rawStringPerLineIterator.next()).thenReturn(lines[0], lines[1], lines[2]);
112+
Schema schema = DelimitedSchemaDetector.detectSchema(
113+
configSkipHeaderTrueAndQuotesEnabled, csvDelimiter, rawStringPerLineIterator, null);
114+
Assert.assertEquals(expectedSchemaWithHeaders, schema);
115+
}
116+
117+
@Test
118+
public void testDetectSchemaWithQuotesDisabled() throws IOException {
119+
String[] lines = new String[]{"name,age,isIndian,country", "\"raj,singh\",29,true,india", "rahul,30,false,"};
120+
Mockito.when(rawStringPerLineIterator.hasNext()).thenReturn(true, true, true, false);
121+
Mockito.when(rawStringPerLineIterator.next()).thenReturn(lines[0], lines[1], lines[2]);
122+
Schema schema = DelimitedSchemaDetector.detectSchema(
123+
configSkipHeaderTrue, csvDelimiter, rawStringPerLineIterator, null);
124+
Assert.assertNotEquals(expectedSchemaWithHeaders, schema);
125+
}
126+
104127
}

0 commit comments

Comments
 (0)