Skip to content

Commit c776af5

Browse files
authored
fix: distinguish quoted and unquoted empty and null string when loading CSV (#19207)
* feat: CSV support options QUOTED_EMPTY_FIELD_AS and ALLOW_QUOTED_NULLS * feat: unload CSV support NULL_DISPLAY. * feat: `EMPTY_FIELD_AS=String` try set NULL for non-string columns. * ci: update tpcds logic test. * ci: update logic test. * skip compat test case
1 parent d397fe9 commit c776af5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+663
-455
lines changed

Cargo.lock

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ cookie = "0.18.1"
197197
crc32fast = "1.3.2"
198198
cron = "0.12.0"
199199
crossbeam-channel = "0.5.6"
200-
csv-core = "0.1.11"
200+
csv-core = "0.1.13"
201201
ctor = "0.2"
202202
ctrlc = { version = "3.2.3", features = ["termination"] }
203203
dashmap = { version = "6.1.0", features = ["serde"] }
@@ -572,6 +572,7 @@ async-backtrace = { git = "https://github.com/datafuse-extras/async-backtrace.gi
572572
async-recursion = { git = "https://github.com/datafuse-extras/async-recursion.git", rev = "a353334" }
573573
backtrace = { git = "https://github.com/rust-lang/backtrace-rs.git", rev = "72265be" }
574574
color-eyre = { git = "https://github.com/eyre-rs/eyre.git", rev = "e5d92c3" }
575+
csv-core = { git = "https://github.com/youngsofun/rust-csv.git", rev = "346f18c" }
575576
databend-base = { git = "https://github.com/databendlabs/databend-base", tag = "v0.1.0" }
576577
deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "9954bff" }
577578
jsonb = { git = "https://github.com/databendlabs/jsonb", rev = "1868abf" }

src/meta/app/src/principal/file_format.rs

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ const OPT_ESCAPE: &str = "escape";
4444
const OPT_QUOTE: &str = "quote";
4545
const OPT_ROW_TAG: &str = "row_tag";
4646
const OPT_ERROR_ON_COLUMN_COUNT_MISMATCH: &str = "error_on_column_count_mismatch";
47+
const OPT_ALLOW_QUOTED_NULLS: &str = "allow_quoted_nulls";
48+
const OPT_QUOTED_EMPTY_FIELD_AS: &str = "quoted_empty_field_as";
4749
const MISSING_FIELD_AS: &str = "missing_field_as";
4850
const NULL_FIELD_AS: &str = "null_field_as";
4951
const NULL_IF: &str = "null_if";
@@ -162,7 +164,10 @@ impl FileFormatParams {
162164
pub fn need_field_default(&self) -> bool {
163165
match self {
164166
FileFormatParams::Parquet(v) => v.missing_field_as == NullAs::FieldDefault,
165-
FileFormatParams::Csv(v) => v.empty_field_as == EmptyFieldAs::FieldDefault,
167+
FileFormatParams::Csv(v) => {
168+
v.empty_field_as == EmptyFieldAs::FieldDefault
169+
|| v.quoted_empty_field_as == EmptyFieldAs::FieldDefault
170+
}
166171
FileFormatParams::NdJson(v) => {
167172
v.null_field_as == NullAs::FieldDefault
168173
|| v.missing_field_as == NullAs::FieldDefault
@@ -257,6 +262,14 @@ impl FileFormatParams {
257262
OPT_ERROR_ON_COLUMN_COUNT_MISMATCH,
258263
default.error_on_column_count_mismatch,
259264
)?;
265+
let allow_quoted_nulls =
266+
reader.take_bool(OPT_ALLOW_QUOTED_NULLS, default.allow_quoted_nulls)?;
267+
let quoted_empty_field_as = reader
268+
.options
269+
.remove(OPT_QUOTED_EMPTY_FIELD_AS)
270+
.map(|s| EmptyFieldAs::from_str(&s))
271+
.transpose()?
272+
.unwrap_or(default.quoted_empty_field_as.clone());
260273
let output_header = reader.take_bool(OPT_OUTPUT_HEADER, default.output_header)?;
261274
FileFormatParams::Csv(CsvFileFormatParams {
262275
compression,
@@ -268,6 +281,8 @@ impl FileFormatParams {
268281
escape,
269282
quote,
270283
error_on_column_count_mismatch,
284+
allow_quoted_nulls,
285+
quoted_empty_field_as,
271286
empty_field_as,
272287
binary_format,
273288
output_header,
@@ -457,16 +472,20 @@ pub struct CsvFileFormatParams {
457472
pub quote: String,
458473
pub error_on_column_count_mismatch: bool,
459474

475+
// load only options
476+
pub allow_quoted_nulls: bool,
477+
pub empty_field_as: EmptyFieldAs,
478+
pub quoted_empty_field_as: EmptyFieldAs,
479+
460480
// header
461481
pub headers: u64,
462482
pub output_header: bool,
463483

464-
// field
465-
pub binary_format: BinaryFormat,
466-
pub null_display: String,
484+
// field encoding/decoding
467485
pub nan_display: String,
468-
pub empty_field_as: EmptyFieldAs,
486+
pub binary_format: BinaryFormat,
469487
pub geometry_format: GeometryDataType,
488+
pub null_display: String,
470489
}
471490

472491
impl Default for CsvFileFormatParams {
@@ -476,15 +495,17 @@ impl Default for CsvFileFormatParams {
476495
headers: 0,
477496
field_delimiter: ",".to_string(),
478497
record_delimiter: "\n".to_string(),
479-
null_display: NULL_BYTES_ESCAPE.to_string(),
480-
nan_display: "NaN".to_string(),
481498
escape: "".to_string(),
482499
quote: "\"".to_string(),
483500
error_on_column_count_mismatch: true,
501+
allow_quoted_nulls: false,
484502
empty_field_as: Default::default(),
503+
quoted_empty_field_as: EmptyFieldAs::String,
485504
output_header: false,
486505
binary_format: Default::default(),
487506
geometry_format: GeometryDataType::default(),
507+
nan_display: "NaN".to_string(),
508+
null_display: NULL_BYTES_ESCAPE.to_string(),
488509
}
489510
}
490511
}
@@ -850,7 +871,7 @@ impl Display for FileFormatParams {
850871
FIELD_DELIMITER = '{}' RECORD_DELIMITER = '{}' QUOTE = '{}' ESCAPE = '{}' \
851872
SKIP_HEADER= {} OUTPUT_HEADER= {} \
852873
NULL_DISPLAY = '{}' NAN_DISPLAY = '{}' EMPTY_FIELD_AS = {} BINARY_FORMAT = {} \
853-
ERROR_ON_COLUMN_COUNT_MISMATCH = {}",
874+
ERROR_ON_COLUMN_COUNT_MISMATCH = {} ALLOW_QUOTED_NULLS = {} QUOTED_EMPTY_FIELD_AS = {}",
854875
params.compression,
855876
escape_string(&params.field_delimiter),
856877
escape_string(&params.record_delimiter),
@@ -863,6 +884,8 @@ impl Display for FileFormatParams {
863884
params.empty_field_as,
864885
params.binary_format,
865886
params.error_on_column_count_mismatch,
887+
params.allow_quoted_nulls,
888+
params.quoted_empty_field_as,
866889
)
867890
}
868891
FileFormatParams::Tsv(params) => {

src/meta/proto-conv/src/file_format_from_to_protobuf_impl.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -481,11 +481,7 @@ impl FromToProto for mt::principal::CsvFileFormatParams {
481481
Incompatible::new(format!("invalid StageFileCompression: {}", p.compression))
482482
})?,
483483
)?;
484-
let null_display = if p.null_display.is_empty() {
485-
"\\N".to_string()
486-
} else {
487-
p.null_display
488-
};
484+
let null_display = p.null_display.unwrap_or_else(|| "\\N".to_string());
489485

490486
let empty_field_as = p
491487
.empty_field_as
@@ -506,6 +502,12 @@ impl FromToProto for mt::principal::CsvFileFormatParams {
506502
.map_err(|e| Incompatible::new(format!("{:?}", e)))?
507503
.unwrap_or_default();
508504

505+
let quoted_empty_field_as = if let Some(value) = p.quoted_empty_field_as {
506+
EmptyFieldAs::from_str(&value).map_err(|e| Incompatible::new(format!("{:?}", e)))?
507+
} else {
508+
EmptyFieldAs::String
509+
};
510+
509511
Ok(Self {
510512
compression,
511513
headers: p.headers,
@@ -516,7 +518,9 @@ impl FromToProto for mt::principal::CsvFileFormatParams {
516518
nan_display: p.nan_display,
517519
null_display,
518520
error_on_column_count_mismatch: !p.allow_column_count_mismatch,
521+
allow_quoted_nulls: p.allow_quoted_nulls,
519522
empty_field_as,
523+
quoted_empty_field_as,
520524
binary_format,
521525
output_header: p.output_header,
522526
geometry_format,
@@ -536,8 +540,10 @@ impl FromToProto for mt::principal::CsvFileFormatParams {
536540
quote: self.quote.clone(),
537541
escape: self.escape.clone(),
538542
nan_display: self.nan_display.clone(),
539-
null_display: self.null_display.clone(),
543+
null_display: Some(self.null_display.clone()),
540544
allow_column_count_mismatch: !self.error_on_column_count_mismatch,
545+
allow_quoted_nulls: self.allow_quoted_nulls,
546+
quoted_empty_field_as: Some(self.quoted_empty_field_as.to_string()),
541547
empty_field_as: Some(self.empty_field_as.to_string()),
542548
binary_format: Some(self.binary_format.to_string()),
543549
output_header: self.output_header,

src/meta/proto-conv/src/util.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ const META_CHANGE_LOG: &[(u64, &str)] = &[
193193
(161, "2025-12-04: Add: Grant/OwnershipRowAccessPolicyObject and row access policy privileges"),
194194
(162, "2025-12-16: Add: tag.proto"),
195195
(163, "2025-12-31: Add: SnapshotRef"),
196+
(164, "2026-01-22: Update: user.proto/CsvFileFormatParams add allow_quoted_nulls and quoted_empty_field_as, change null_display to optional", ),
196197
// Dear developer:
197198
// If you're gonna add a new metadata version, you'll have to add a test for it.
198199
// You could just copy an existing test file(e.g., `../tests/it/v024_table_meta.rs`)

src/meta/proto-conv/tests/it/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,4 @@ mod v160_udf_update_on;
155155
mod v161_grant_object_row_access_policy;
156156
mod v162_tag;
157157
mod v163_snapshot_ref;
158+
mod v164_csv_format_params;

src/meta/proto-conv/tests/it/v032_file_format_params.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use databend_common_meta_app as mt;
1616
use databend_common_meta_app::principal::CsvFileFormatParams;
17+
use databend_common_meta_app::principal::EmptyFieldAs;
1718
use databend_common_meta_app::principal::NullAs;
1819
use databend_common_meta_app::principal::StageFileCompression;
1920
use databend_common_meta_app::principal::TsvFileFormatParams;
@@ -52,7 +53,9 @@ fn test_decode_v32_csv_file_format_params() -> anyhow::Result<()> {
5253
escape: "\\".to_string(),
5354
quote: "\'".to_string(),
5455
error_on_column_count_mismatch: true,
56+
allow_quoted_nulls: false,
5557
empty_field_as: Default::default(),
58+
quoted_empty_field_as: EmptyFieldAs::String,
5659
binary_format: Default::default(),
5760
geometry_format: Default::default(),
5861
})

src/meta/proto-conv/tests/it/v053_csv_format_params.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use databend_common_meta_app as mt;
1616
use databend_common_meta_app::principal::CsvFileFormatParams;
17+
use databend_common_meta_app::principal::EmptyFieldAs;
1718
use databend_common_meta_app::principal::StageFileCompression;
1819
use fastrace::func_name;
1920

@@ -46,7 +47,9 @@ fn test_decode_v53_csv_file_format_params() -> anyhow::Result<()> {
4647
escape: "\\".to_string(),
4748
quote: "\'".to_string(),
4849
error_on_column_count_mismatch: true,
50+
allow_quoted_nulls: false,
4951
empty_field_as: Default::default(),
52+
quoted_empty_field_as: EmptyFieldAs::String,
5053
binary_format: Default::default(),
5154
geometry_format: Default::default(),
5255
})

src/meta/proto-conv/tests/it/v059_csv_format_params.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use databend_common_meta_app as mt;
1616
use databend_common_meta_app::principal::CsvFileFormatParams;
17+
use databend_common_meta_app::principal::EmptyFieldAs;
1718
use databend_common_meta_app::principal::StageFileCompression;
1819
use fastrace::func_name;
1920

@@ -46,7 +47,9 @@ fn test_decode_v59_csv_file_format_params() -> anyhow::Result<()> {
4647
escape: "\\".to_string(),
4748
quote: "\'".to_string(),
4849
error_on_column_count_mismatch: false,
50+
allow_quoted_nulls: false,
4951
empty_field_as: Default::default(),
52+
quoted_empty_field_as: EmptyFieldAs::String,
5053
binary_format: Default::default(),
5154
geometry_format: Default::default(),
5255
})

src/meta/proto-conv/tests/it/v072_csv_format_params.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ fn test_decode_v72_csv_file_format_params() -> anyhow::Result<()> {
4848
escape: "|".to_string(),
4949
quote: "\'".to_string(),
5050
error_on_column_count_mismatch: false,
51+
allow_quoted_nulls: false,
5152
empty_field_as: EmptyFieldAs::String,
53+
quoted_empty_field_as: EmptyFieldAs::String,
5254
binary_format: Default::default(),
5355
geometry_format: Default::default(),
5456
})

0 commit comments

Comments
 (0)