Commit 2ecf2b4

vendor: Update vendored sources to duckdb/duckdb@1f0067f
[CSV Reader] Detect SQLNULL types for schema merging, use schema merging in CSV relations, and add the files_to_sniff option (duckdb/duckdb#17467).
[Python Dev] Fix failing tests for the Python SQLLogicTester (duckdb/duckdb#17529).
1 parent 3734c0a commit 2ecf2b4
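
For context, here is a minimal sketch (not part of the commit) of how the new files_to_sniff option added here can be exercised through DuckDB's embedded C++ API. The glob "data/*.csv" is a hypothetical path used only for illustration.

// Sketch: reading a glob of CSV files and letting the sniffer inspect all of them.
#include "duckdb.hpp"
#include <cstdio>

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	// files_to_sniff = -1 sniffs every file before settling on a schema;
	// the default introduced by this commit is 10 files.
	auto result = con.Query("SELECT * FROM read_csv('data/*.csv', files_to_sniff = -1)");
	if (result->HasError()) {
		// A schema mismatch now suggests union_by_name=true or a higher files_to_sniff.
		std::printf("%s\n", result->GetError().c_str());
	} else {
		result->Print();
	}
	return 0;
}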

11 files changed: 78 additions and 31 deletions


src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp

Lines changed: 14 additions & 1 deletion
@@ -113,6 +113,14 @@ vector<LogicalType> CSVSchema::GetTypes() const {
 	return types;
 }
 
+void CSVSchema::ReplaceNullWithVarchar() {
+	for (auto &column : columns) {
+		if (column.type.id() == LogicalTypeId::SQLNULL) {
+			column.type = LogicalType::VARCHAR;
+		}
+	}
+}
+
 bool CSVSchema::Empty() const {
 	return columns.empty();
 }
@@ -207,7 +215,12 @@ bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_resul
 	}
 
 	// Lets suggest some potential fixes
-	error << "Potential Fix: Since your schema has a mismatch, consider setting union_by_name=true.";
+	error << "Potential Fixes "
+	      << "\n";
+	error << "* Consider setting union_by_name=true."
+	      << "\n";
+	error << "* Consider setting files_to_sniff to a higher value (e.g., files_to_sniff = -1)"
+	      << "\n";
 	if (!match) {
 		error_message = error.str();
 	}
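
The reworked error message now suggests two remedies instead of one. A hedged sketch (not from the commit) of applying both through the embedded C++ API; "logs/*.csv" is a hypothetical glob whose files disagree on their schema.

// Sketch: the two remedies suggested by the new "Potential Fixes" message.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	// Remedy 1: unify mismatched schemas by column name.
	con.Query("SELECT * FROM read_csv('logs/*.csv', union_by_name = true)");

	// Remedy 2: let the sniffer look at every file before fixing the schema.
	con.Query("SELECT * FROM read_csv('logs/*.csv', files_to_sniff = -1)");
	return 0;
}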

src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp

Lines changed: 3 additions & 6 deletions
@@ -45,7 +45,7 @@ bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const Logica
 
 void CSVSniffer::RefineTypes() {
 	auto &sniffing_state_machine = best_candidate->GetStateMachine();
-	// if data types were provided, exit here if number of columns does not match
+	// if data types were provided, exit here if the number of columns does not match
 	detected_types.assign(sniffing_state_machine.dialect_options.num_cols, LogicalType::VARCHAR);
 	if (sniffing_state_machine.options.all_varchar) {
 		// return all types varchar
@@ -59,10 +59,6 @@ void CSVSniffer::RefineTypes() {
 		detected_types.clear();
 		for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
 			LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
-			if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
-			    sniffing_state_machine.options.auto_type_candidates.size()) {
-				d_type = LogicalType::VARCHAR;
-			}
 			detected_types.push_back(d_type);
 		}
 		return;
@@ -98,7 +94,8 @@ void CSVSniffer::RefineTypes() {
 		LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
 		if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
 		        best_candidate->GetStateMachine().options.auto_type_candidates.size() &&
-		    default_null_to_varchar) {
+		    default_null_to_varchar && !best_candidate->FinishedFile()) {
+			// We only default SQLNull to Varchar if we haven't finished the file yet.
 			d_type = LogicalType::VARCHAR;
 		}
 		detected_types.push_back(d_type);

src/duckdb/src/execution/operator/csv_scanner/table_function/csv_multi_file_info.cpp

Lines changed: 22 additions & 13 deletions
@@ -59,9 +59,10 @@ unique_ptr<TableFunctionData> CSVMultiFileInfo::InitializeBindData(MultiFileBind
 }
 
 //! Function to do schema discovery over one CSV file or a list/glob of CSV files
-void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptions &options,
-                     const MultiFileOptions &file_options, vector<LogicalType> &return_types, vector<string> &names,
-                     MultiFileList &multi_file_list) {
+CSVSchema CSVSchemaDiscovery::SchemaDiscovery(ClientContext &context, shared_ptr<CSVBufferManager> &buffer_manager,
+                                              CSVReaderOptions &options, const MultiFileOptions &file_options,
+                                              vector<LogicalType> &return_types, vector<string> &names,
+                                              MultiFileList &multi_file_list) {
 	vector<CSVSchema> schemas;
 	const auto option_og = options;
 
@@ -75,17 +76,17 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 	idx_t current_file = 0;
 	options.file_path = file_paths[current_file].path;
 
-	result.buffer_manager = make_shared_ptr<CSVBufferManager>(context, options, options.file_path, false);
+	buffer_manager = make_shared_ptr<CSVBufferManager>(context, options, options.file_path, false);
 	idx_t only_header_or_empty_files = 0;
 
 	{
-		CSVSniffer sniffer(options, file_options, result.buffer_manager, CSVStateMachineCache::Get(context));
+		CSVSniffer sniffer(options, file_options, buffer_manager, CSVStateMachineCache::Get(context));
 		auto sniffer_result = sniffer.SniffCSV();
 		idx_t rows_read = sniffer.LinesSniffed() -
 		                  (options.dialect_options.skip_rows.GetValue() + options.dialect_options.header.GetValue());
 
 		schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, file_paths[0].path, rows_read,
-		                     result.buffer_manager->GetBuffer(0)->actual_size == 0);
+		                     buffer_manager->GetBuffer(0)->actual_size == 0);
 		total_number_of_rows += sniffer.LinesSniffed();
 		current_file++;
 		if (sniffer.EmptyOrOnlyHeader()) {
@@ -94,19 +95,22 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 	}
 
 	// We do a copy of the options to not pollute the options of the first file.
-	constexpr idx_t max_files_to_sniff = 10;
+	idx_t max_files_to_sniff = static_cast<idx_t>(options.files_to_sniff == -1)
+	                               ? NumericLimits<idx_t>::Maximum()
+	                               : static_cast<idx_t>(options.files_to_sniff);
 	idx_t files_to_sniff = file_paths.size() > max_files_to_sniff ? max_files_to_sniff : file_paths.size();
 	while (total_number_of_rows < required_number_of_lines && current_file < files_to_sniff) {
 		auto option_copy = option_og;
 		option_copy.file_path = file_paths[current_file].path;
-		auto buffer_manager = make_shared_ptr<CSVBufferManager>(context, option_copy, option_copy.file_path, false);
+		auto file_buffer_manager =
+		    make_shared_ptr<CSVBufferManager>(context, option_copy, option_copy.file_path, false);
 		// TODO: We could cache the sniffer to be reused during scanning. Currently that's an exercise left to the
 		// reader
-		CSVSniffer sniffer(option_copy, file_options, buffer_manager, CSVStateMachineCache::Get(context));
+		CSVSniffer sniffer(option_copy, file_options, file_buffer_manager, CSVStateMachineCache::Get(context));
 		auto sniffer_result = sniffer.SniffCSV();
 		idx_t rows_read = sniffer.LinesSniffed() - (option_copy.dialect_options.skip_rows.GetValue() +
 		                                            option_copy.dialect_options.header.GetValue());
-		if (buffer_manager->GetBuffer(0)->actual_size == 0) {
+		if (file_buffer_manager->GetBuffer(0)->actual_size == 0) {
 			schemas.emplace_back(true);
 		} else {
 			schemas.emplace_back(sniffer_result.names, sniffer_result.return_types, option_copy.file_path, rows_read);
@@ -125,14 +129,17 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 			// A schema is bettah than no schema
 			best_schema = schema;
 		} else if (best_schema.GetRowsRead() == 0) {
-			// If the best-schema has no data-rows, that's easy, we just take the new schema
+			// If the best-schema has no data-rows, that's easy; we just take the new schema
 			best_schema = schema;
 		} else if (schema.GetRowsRead() != 0) {
 			// We might have conflicting-schemas, we must merge them
 			best_schema.MergeSchemas(schema, options.null_padding);
		}
	}
 
+	// At this point, replace a sqlnull with varchar for the type
+	best_schema.ReplaceNullWithVarchar();
+
 	if (names.empty()) {
 		names = best_schema.GetNames();
 		return_types = best_schema.GetTypes();
@@ -151,7 +158,7 @@ void SchemaDiscovery(ClientContext &context, ReadCSVData &result, CSVReaderOptio
 			return_types[i] = LogicalType::VARCHAR;
		}
	}
-	result.csv_schema = best_schema;
+	return best_schema;
 }
 
 void CSVMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
@@ -161,7 +168,9 @@ void CSVMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &r
 	auto &options = csv_data.options;
 	if (!bind_data.file_options.union_by_name) {
 		if (options.auto_detect) {
-			SchemaDiscovery(context, csv_data, options, bind_data.file_options, return_types, names, multi_file_list);
+			csv_data.csv_schema =
+			    CSVSchemaDiscovery::SchemaDiscovery(context, csv_data.buffer_manager, options, bind_data.file_options,
+			                                        return_types, names, multi_file_list);
 		} else {
 			// If we are not running the sniffer, the columns must be set!
 			if (!options.columns_set) {
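
The heart of the files_to_sniff change is the capping logic above: -1 lifts the old hard limit of 10 files, any other value becomes the cap. A standalone sketch (an assumption-level simplification outside of DuckDB, not the vendored code) of that mapping:

// Sketch: how files_to_sniff = -1 means "sniff all files" while a positive
// value caps the number of files the sniffer inspects.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

static uint64_t FilesToSniff(int64_t files_to_sniff_option, uint64_t file_count) {
	const uint64_t max_files_to_sniff = files_to_sniff_option == -1
	                                        ? std::numeric_limits<uint64_t>::max()
	                                        : static_cast<uint64_t>(files_to_sniff_option);
	return std::min(file_count, max_files_to_sniff);
}

int main() {
	assert(FilesToSniff(10, 3) == 3);   // fewer files than the cap: sniff them all
	assert(FilesToSniff(10, 50) == 10); // cap at the configured value (default 10)
	assert(FilesToSniff(-1, 50) == 50); // -1 removes the cap entirely
	return 0;
}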

src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp

Lines changed: 6 additions & 0 deletions
@@ -744,6 +744,12 @@ void CSVReaderOptions::ParseOption(ClientContext &context, const string &key, co
 		}
 	} else if (loption == "all_varchar") {
 		all_varchar = GetBooleanValue(loption, val);
+	} else if (loption == "files_to_sniff") {
+		files_to_sniff = ParseInteger(val, loption);
+		if (files_to_sniff < 1 && files_to_sniff != -1) {
+			throw BinderException(
+			    "Unsupported parameter for files_to_sniff: value must be -1 for all files or higher than one.");
+		}
 	} else if (loption == "normalize_names") {
 		normalize_names = GetBooleanValue(loption, val);
 	} else {
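
A standalone sketch (assumption: a simplified re-statement of the check above, not DuckDB code) of which files_to_sniff values the option parser accepts: -1 for "all files" or any value of at least 1; anything else raises a BinderException.

#include <cassert>
#include <cstdint>

// Sketch: the acceptance predicate behind the new files_to_sniff validation.
static bool FilesToSniffIsValid(int64_t files_to_sniff) {
	return files_to_sniff == -1 || files_to_sniff >= 1;
}

int main() {
	assert(FilesToSniffIsValid(-1)); // sniff all files
	assert(FilesToSniffIsValid(1));  // lower bound of the explicit range
	assert(FilesToSniffIsValid(10)); // the default in CSVReaderOptions
	assert(!FilesToSniffIsValid(0)); // rejected by the parser
	assert(!FilesToSniffIsValid(-5));
	return 0;
}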

src/duckdb/src/function/table/read_csv.cpp

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi
 	table_function.named_parameters["encoding"] = LogicalType::VARCHAR;
 	table_function.named_parameters["strict_mode"] = LogicalType::BOOLEAN;
 	table_function.named_parameters["thousands"] = LogicalType::VARCHAR;
+	table_function.named_parameters["files_to_sniff"] = LogicalType::BIGINT;
 
 	MultiFileReader::AddParameters(table_function);
 }

src/duckdb/src/function/table/sniff_csv.cpp

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,7 @@
 #include "duckdb/main/client_context.hpp"
 #include "duckdb/function/table/range.hpp"
 #include "duckdb/execution/operator/csv_scanner/csv_file_handle.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp"
 #include "duckdb/function/table/read_csv.hpp"
 
 namespace duckdb {
@@ -169,6 +170,11 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p,
 			sniffer_result.return_types[i] = LogicalType::VARCHAR;
		}
	}
+	for (auto &type : sniffer_result.return_types) {
+		if (type.id() == LogicalTypeId::SQLNULL) {
+			type = LogicalType::VARCHAR;
+		}
+	}
 	string str_opt;
 	string separator = ", ";
 	// Set output
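
With this change, sniff_csv no longer reports a SQLNULL type for a column that never holds a value. A minimal sketch (not part of the commit) of observing that from the embedded C++ API; "empty_col.csv" is a hypothetical file used only for illustration.

// Sketch: sniffing a file with an all-empty column now yields VARCHAR for it.
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);

	auto result = con.Query("SELECT * FROM sniff_csv('empty_col.csv')");
	if (!result->HasError()) {
		result->Print(); // the all-empty column is listed with type VARCHAR
	}
	return 0;
}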

src/duckdb/src/function/table/version/pragma_version.cpp

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "0-dev3621"
+#define DUCKDB_PATCH_VERSION "0-dev3631"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 3
@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.3.0-dev3621"
+#define DUCKDB_VERSION "v1.3.0-dev3631"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "4075e3394f"
+#define DUCKDB_SOURCE_ID "1f0067f1a5"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp

Lines changed: 9 additions & 0 deletions
@@ -10,6 +10,8 @@
 
 #include "duckdb/common/multi_file/multi_file_function.hpp"
 #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/csv_scanner/csv_schema.hpp"
 
 namespace duckdb {
 
@@ -22,6 +24,13 @@ class CSVFileReaderOptions : public BaseFileReaderOptions {
 	CSVReaderOptions options;
 };
 
+struct CSVSchemaDiscovery {
+	static CSVSchema SchemaDiscovery(ClientContext &context, shared_ptr<CSVBufferManager> &buffer_manager,
+	                                 CSVReaderOptions &options, const MultiFileOptions &file_options,
+	                                 vector<LogicalType> &return_types, vector<string> &names,
+	                                 MultiFileList &multi_file_list);
+};
+
 struct CSVMultiFileInfo : public MultiFileReaderInterface {
 	static unique_ptr<MultiFileReaderInterface> InitializeInterface(ClientContext &context, MultiFileReader &reader,
 	                                                                MultiFileList &file_list);

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp

Lines changed: 4 additions & 0 deletions
@@ -104,6 +104,10 @@ struct CSVReaderOptions {
 	//! Result size of sniffing phases
 	static constexpr idx_t sniff_size = 2048;
 
+	//! In case this is a glob or list of multiple files, how many shall be used to sniff.
+	//! -1 means all
+	int64_t files_to_sniff = 10;
+
 	//! Number of sample chunks used in auto-detection
 	idx_t sample_size_chunks = 20480 / sniff_size;
 	//! Consider all columns to be of type varchar

src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ struct CSVSchema {
 	//! Get a vector with types
 	vector<LogicalType> GetTypes() const;
 
+	//! Replace any SQLNull types with Varchar
+	void ReplaceNullWithVarchar();
+
 private:
 	//! If a type can be cast to another
 	static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination);
