Commit 2ecf2b4

vendor: Update vendored sources to duckdb/duckdb@1f0067f
[CSV Reader] Detect SQLNULL types for schema merging, use schema merging in CSV relations, and add the files_to_sniff option (duckdb/duckdb#17467).
[Python Dev] Fix failing tests for the Python SQLLogicTester (duckdb/duckdb#17529).
1 parent 3734c0a commit 2ecf2b4
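
For context, here is a minimal sketch (not part of the commit) of how the new files_to_sniff option added here can be exercised through DuckDB's embedded C++ API. The glob "data/*.csv" is a hypothetical path used only for illustration.

// Sketch: reading a glob of CSV files and letting the sniffer inspect all of them.
#include "duckdb.hpp"
#include <cstdio>

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	// files_to_sniff = -1 sniffs every file before settling on a schema;
	// the default introduced by this commit is 10 files.
	auto result = con.Query("SELECT * FROM read_csv('data/*.csv', files_to_sniff = -1)");
	if (result->HasError()) {
		// A schema mismatch now suggests union_by_name=true or a higher files_to_sniff.
		std::printf("%s\n", result->GetError().c_str());
	} else {
		result->Print();
	}
	return 0;
}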

11 files changed: 78 additions and 31 deletions


src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp

Lines changed: 14 additions & 1 deletion
@@ -113,6 +113,14 @@ vector<LogicalType> CSVSchema::GetTypes() const {
 	return types;
 }
 
+void CSVSchema::ReplaceNullWithVarchar() {
+	for (auto &column : columns) {
+		if (column.type.id() == LogicalTypeId::SQLNULL) {
+			column.type = LogicalType::VARCHAR;
+		}
+	}
+}
+
 bool CSVSchema::Empty() const {
 	return columns.empty();
 }
@@ -207,7 +215,12 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_resul
 	}
 
 	// Lets suggest some potential fixes
-	error << "Potential Fix: Since your schema has a mismatch, consider setting union_by_name=true.";
+	error << "Potential Fixes "
+	      << "\n";
+	error << "* Consider setting union_by_name=true."
+	      << "\n";
+	error << "* Consider setting files_to_sniff to a higher value (e.g., files_to_sniff = -1)"
+	      << "\n";
 	if (!match) {
 		error_message = error.str();
 	}
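
The reworked error message now suggests two remedies instead of one. A hedged sketch (not from the commit) of applying both through the embedded C++ API; "logs/*.csv" is a hypothetical glob whose files disagree on their schema.

// Sketch: the two remedies suggested by the new "Potential Fixes" message.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	// Remedy 1: unify mismatched schemas by column name.
	con.Query("SELECT * FROM read_csv('logs/*.csv', union_by_name = true)");

	// Remedy 2: let the sniffer look at every file before fixing the schema.
	con.Query("SELECT * FROM read_csv('logs/*.csv', files_to_sniff = -1)");
	return 0;
}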

src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp

Lines changed: 3 additions & 6 deletions
@@ -45,7 +45,7 @@ bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const Logica
 
 void CSVSniffer::RefineTypes() {
 	auto &sniffing_state_machine = best_candidate->GetStateMachine();
-	// if data types were provided, exit here if number of columns does not match
+	// if data types were provided, exit here if the number of columns does not match
 	detected_types.assign(sniffing_state_machine.dialect_options.num_cols, LogicalType::VARCHAR);
 	if (sniffing_state_machine.options.all_varchar) {
 		// return all types varchar
@@ -59,10 +59,6 @@ void CSVSniffer::RefineTypes() {
 		detected_types.clear();
 		for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
 			LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
-			if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
-			    sniffing_state_machine.options.auto_type_candidates.size()) {
-				d_type = LogicalType::VARCHAR;
-			}
 			detected_types.push_back(d_type);
 		}
 		return;
@@ -98,7 +94,8 @@ void CSVSniffer::RefineTypes() {
 		LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
 		if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
 		        best_candidate->GetStateMachine().options.auto_type_candidates.size() &&
-		    default_null_to_varchar) {
+		    default_null_to_varchar && !best_candidate->FinishedFile()) {
+			// We only default SQLNull to Varchar if we haven't finished the file yet.
 			d_type = LogicalType::VARCHAR;
 		}
 		detected_types.push_back(d_type);

src/duckdb/src/execution/operator/csv_scanner/table_function/csv_multi_file_info.cpp

Lines changed: 22 additions & 13 deletions
@@ -59,9 +59,10 @@ unique_ptr<TableFunctionData> CSVMultiFileInfo::InitializeBindData(MultiFileBind
 }
 
 //! Function to do schema discovery over one CSV file or a list/glob of CSV files
-void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptions &options,
-                     const MultiFileOptions &file_options, vector<LogicalType> &return_types, vector<string> &names,
-                     MultiFileList &multi_file_list) {
+CSVSchema CSVSchemaDiscovery::SchemaDiscovery(ClientContext &context, shared_ptr<CSVBufferManager> &buffer_manager,
+                                              CSVReaderOptions &options, const MultiFileOptions &file_options,
+                                              vector<LogicalType> &return_types, vector<string> &names,
+                                              MultiFileList &multi_file_list) {
 	vector<CSVSchema> schemas;
 	const auto option_og = options;
 
@@ -75,17 +76,17 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 	idx_t current_file = 0;
 	options.file_path = file_paths[current_file].path;
 
-	result.buffer_manager = make_shared_ptr<CSVBufferManager>(context, options, options.file_path, false);
+	buffer_manager = make_shared_ptr<CSVBufferManager>(context, options, options.file_path, false);
 	idx_t only_header_or_empty_files = 0;
 
 	{
-		CSVSniffer sniffer(options, file_options, result.buffer_manager, CSVStateMachineCache::Get(context));
+		CSVSniffer sniffer(options, file_options, buffer_manager, CSVStateMachineCache::Get(context));
 		auto sniffer_result = sniffer.SniffCSV();
 		idx_t rows_read = sniffer.LinesSniffed() -
 		                  (options.dialect_options.skip_rows.GetValue() + options.dialect_options.header.GetValue());
 
 		schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, file_paths[0].path, rows_read,
-		                     result.buffer_manager->GetBuffer(0)->actual_size == 0);
+		                     buffer_manager->GetBuffer(0)->actual_size == 0);
 		total_number_of_rows += sniffer.LinesSniffed();
 		current_file++;
 		if (sniffer.EmptyOrOnlyHeader()) {
@@ -94,19 +95,22 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 	}
 
 	// We do a copy of the options to not pollute the options of the first file.
-	constexpr idx_t max_files_to_sniff = 10;
+	idx_t max_files_to_sniff = static_cast<idx_t>(options.files_to_sniff == -1)
+	                               ? NumericLimits<idx_t>::Maximum()
+	                               : static_cast<idx_t>(options.files_to_sniff);
 	idx_t files_to_sniff = file_paths.size() > max_files_to_sniff ? max_files_to_sniff : file_paths.size();
 	while (total_number_of_rows < required_number_of_lines && current_file < files_to_sniff) {
 		auto option_copy = option_og;
 		option_copy.file_path = file_paths[current_file].path;
-		auto buffer_manager = make_shared_ptr<CSVBufferManager>(context, option_copy, option_copy.file_path, false);
+		auto file_buffer_manager =
+		    make_shared_ptr<CSVBufferManager>(context, option_copy, option_copy.file_path, false);
 		// TODO: We could cache the sniffer to be reused during scanning. Currently that's an exercise left to the
 		// reader
-		CSVSniffer sniffer(option_copy, file_options, buffer_manager, CSVStateMachineCache::Get(context));
+		CSVSniffer sniffer(option_copy, file_options, file_buffer_manager, CSVStateMachineCache::Get(context));
 		auto sniffer_result = sniffer.SniffCSV();
 		idx_t rows_read = sniffer.LinesSniffed() - (option_copy.dialect_options.skip_rows.GetValue() +
 		                                            option_copy.dialect_options.header.GetValue());
-		if (buffer_manager->GetBuffer(0)->actual_size == 0) {
+		if (file_buffer_manager->GetBuffer(0)->actual_size == 0) {
 			schemas.emplace_back(true);
 		} else {
 			schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, option_copy.file_path, rows_read);
@@ -125,14 +129,17 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 			// A schema is bettah than no schema
 			best_schema = schema;
 		} else if (best_schema.GetRowsRead() == 0) {
-			// If the best-schema has no data-rows, that's easy, we just take the new schema
+			// If the best-schema has no data-rows, that's easy; we just take the new schema
 			best_schema = schema;
 		} else if (schema.GetRowsRead() != 0) {
 			// We might have conflicting-schemas, we must merge them
 			best_schema.MergeSchemas(schema, options.null_padding);
		}
	}
 
+	// At this point, replace a sqlnull with varchar for the type
+	best_schema.ReplaceNullWithVarchar();
+
 	if (names.empty()) {
 		names = best_schema.GetNames();
 		return_types = best_schema.GetTypes();
@@ -151,7 +158,7 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 			return_types[i] = LogicalType::VARCHAR;
		}
	}
-	result.csv_schema = best_schema;
+	return best_schema;
 }
 
 void CSVMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
@@ -161,7 +168,9 @@ void CSVMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &r
 	auto &options = csv_data.options;
 	if (!bind_data.file_options.union_by_name) {
 		if (options.auto_detect) {
-			SchemaDiscovery(context, csv_data, options, bind_data.file_options, return_types, names, multi_file_list);
+			csv_data.csv_schema =
+			    CSVSchemaDiscovery::SchemaDiscovery(context, csv_data.buffer_manager, options, bind_data.file_options,
+			                                        return_types, names, multi_file_list);
 		} else {
 			// If we are not running the sniffer, the columns must be set!
 			if (!options.columns_set) {
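
The heart of the files_to_sniff change is the capping logic above: -1 lifts the old hard limit of 10 files, any other value becomes the cap. A standalone sketch (an assumption-level simplification outside of DuckDB, not the vendored code) of that mapping:

// Sketch: how files_to_sniff = -1 means "sniff all files" while a positive
// value caps the number of files the sniffer inspects.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

static uint64_t FilesToSniff(int64_t files_to_sniff_option, uint64_t file_count) {
	const uint64_t max_files_to_sniff = files_to_sniff_option == -1
	                                        ? std::numeric_limits<uint64_t>::max()
	                                        : static_cast<uint64_t>(files_to_sniff_option);
	return std::min(file_count, max_files_to_sniff);
}

int main() {
	assert(FilesToSniff(10, 3) == 3);   // fewer files than the cap: sniff them all
	assert(FilesToSniff(10, 50) == 10); // cap at the configured value (default 10)
	assert(FilesToSniff(-1, 50) == 50); // -1 removes the cap entirely
	return 0;
}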

src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp

Lines changed: 6 additions & 0 deletions
@@ -744,6 +744,12 @@ void CSVReaderOptions::ParseOption(ClientContext &context, const string &key, co
 		}
 	} else if (loption == "all_varchar") {
 		all_varchar = GetBooleanValue(loption, val);
+	} else if (loption == "files_to_sniff") {
+		files_to_sniff = ParseInteger(val, loption);
+		if (files_to_sniff < 1 && files_to_sniff != -1) {
+			throw BinderException(
+			    "Unsupported parameter for files_to_sniff: value must be -1 for all files or higher than one.");
+		}
 	} else if (loption == "normalize_names") {
 		normalize_names = GetBooleanValue(loption, val);
 	} else {
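
A standalone sketch (assumption: a simplified re-statement of the check above, not DuckDB code) of which files_to_sniff values the option parser accepts: -1 for "all files" or any value of at least 1; anything else raises a BinderException.

#include <cassert>
#include <cstdint>

// Sketch: the acceptance predicate behind the new files_to_sniff validation.
static bool FilesToSniffIsValid(int64_t files_to_sniff) {
	return files_to_sniff == -1 || files_to_sniff >= 1;
}

int main() {
	assert(FilesToSniffIsValid(-1)); // sniff all files
	assert(FilesToSniffIsValid(1));  // lower bound of the explicit range
	assert(FilesToSniffIsValid(10)); // the default in CSVReaderOptions
	assert(!FilesToSniffIsValid(0)); // rejected by the parser
	assert(!FilesToSniffIsValid(-5));
	return 0;
}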

src/duckdb/src/function/table/read_csv.cpp

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi
 	table_function.named_parameters["encoding"] = LogicalType::VARCHAR;
 	table_function.named_parameters["strict_mode"] = LogicalType::BOOLEAN;
 	table_function.named_parameters["thousands"] = LogicalType::VARCHAR;
+	table_function.named_parameters["files_to_sniff"] = LogicalType::BIGINT;
 
 	MultiFileReader::AddParameters(table_function);
 }

src/duckdb/src/function/table/sniff_csv.cpp

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,7 @@
 #include "duckdb/main/client_context.hpp"
 #include "duckdb/function/table/range.hpp"
 #include "duckdb/execution/operator/csv_scanner/csv_file_handle.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp"
 #include "duckdb/function/table/read_csv.hpp"
 
 namespace duckdb {
@@ -169,6 +170,11 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p,
 			sniffer_result.return_types[i] = LogicalType::VARCHAR;
		}
	}
+	for (auto &type : sniffer_result.return_types) {
+		if (type.id() == LogicalTypeId::SQLNULL) {
+			type = LogicalType::VARCHAR;
+		}
+	}
 	string str_opt;
 	string separator = ", ";
 	// Set output
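
With this change, sniff_csv no longer reports a SQLNULL type for a column that never holds a value. A minimal sketch (not part of the commit) of observing that from the embedded C++ API; "empty_col.csv" is a hypothetical file used only for illustration.

// Sketch: sniffing a file with an all-empty column now yields VARCHAR for it.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	auto result = con.Query("SELECT * FROM sniff_csv('empty_col.csv')");
	if (!result->HasError()) {
		result->Print(); // the all-empty column is listed with type VARCHAR
	}
	return 0;
}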

src/duckdb/src/function/table/version/pragma_version.cpp

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "0-dev3621"
+#define DUCKDB_PATCH_VERSION "0-dev3631"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 3
@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.3.0-dev3621"
+#define DUCKDB_VERSION "v1.3.0-dev3631"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "4075e3394f"
+#define DUCKDB_SOURCE_ID "1f0067f1a5"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp

Lines changed: 9 additions & 0 deletions
@@ -10,6 +10,8 @@
 
 #include "duckdb/common/multi_file/multi_file_function.hpp"
 #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_schema.hpp"
 
 namespace duckdb {
 
@@ -22,6 +24,13 @@ class CSVFileReaderOptions : public BaseFileReaderOptions {
 	CSVReaderOptions options;
 };
 
+struct CSVSchemaDiscovery {
+	static CSVSchema SchemaDiscovery(ClientContext &context, shared_ptr<CSVBufferManager> &buffer_manager,
+	                                 CSVReaderOptions &options, const MultiFileOptions &file_options,
+	                                 vector<LogicalType> &return_types, vector<string> &names,
+	                                 MultiFileList &multi_file_list);
+};
+
 struct CSVMultiFileInfo : public MultiFileReaderInterface {
 	static unique_ptr<MultiFileReaderInterface> InitializeInterface(ClientContext &context, MultiFileReader &reader,
 	                                                                MultiFileList &file_list);

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp

Lines changed: 4 additions & 0 deletions
@@ -104,6 +104,10 @@ struct CSVReaderOptions {
 	//! Result size of sniffing phases
 	static constexpr idx_t sniff_size = 2048;
 
+	//! In case this is a glob or list of multiple files, how many shall be used to sniff.
+	//! -1 means all
+	int64_t files_to_sniff = 10;
+
 	//! Number of sample chunks used in auto-detection
 	idx_t sample_size_chunks = 20480 / sniff_size;
 	//! Consider all columns to be of type varchar

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ struct CSVSchema {
 	//! Get a vector with types
 	vector<LogicalType> GetTypes() const;
 
+	//! Replace any SQLNull types with Varchar
+	void ReplaceNullWithVarchar();
+
 private:
 	//! If a type can be cast to another
 	static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination);
