5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -2,15 +2,16 @@ cmake_minimum_required(VERSION 3.5...3.29)

# Set extension name here
set(TARGET_NAME iceberg)
project(${TARGET_NAME})

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard")
set(CMAKE_CXX_STANDARD_REQUIRED True)

set(EXTENSION_NAME ${TARGET_NAME}_extension)
include_directories(src/include)


set(EXTENSION_SOURCES
src/iceberg_extension.cpp
src/iceberg_functions.cpp
12 changes: 6 additions & 6 deletions extension_config.cmake
@@ -1,3 +1,9 @@
# Extension from this repo
duckdb_extension_load(iceberg
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

# This file is included by DuckDB's build system. It specifies which extensions to load
if (NOT EMSCRIPTEN)
duckdb_extension_load(avro
@@ -7,12 +13,6 @@ duckdb_extension_load(avro
)
endif()

# Extension from this repo
duckdb_extension_load(iceberg
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

if (NOT EMSCRIPTEN)
duckdb_extension_load(tpch)
duckdb_extension_load(icu)
2 changes: 2 additions & 0 deletions scripts/data_generators/generate_data.py
@@ -31,6 +31,8 @@
print(f"Generating for '{target}'")
for test in tests:
print(f"Generating test '{test.table}'")
if test.table == "spark_written_upper_lower_bounds":
continue
test.generate(con)

if __name__ == "__main__":
@@ -0,0 +1,52 @@
from scripts.data_generators.tests.base import IcebergTest
import pathlib
from pyspark.sql import SparkSession
import pyspark
import pyspark.sql
from pyspark import SparkContext
import os

SPARK_RUNTIME_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'iceberg-spark-runtime-3.5_2.12-1.9.0.jar')

@IcebergTest.register()
class Test(IcebergTest):
def __init__(self):
path = pathlib.PurePath(__file__)
super().__init__(path.parent.name)

def setup(self, con):
os.environ["PYSPARK_SUBMIT_ARGS"] = (
"--packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0 pyspark-shell"
)
os.environ["AWS_REGION"] = "us-east-1"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
if SparkContext._active_spark_context is not None:
SparkContext._active_spark_context.stop()

spark = (
SparkSession.builder.appName("DuckDB REST Integration test")
.master("local[1]")
.config(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
.config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
.config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
.config("spark.sql.catalog.demo.type", "rest")
.config("spark.sql.catalog.demo.uri", "http://127.0.0.1:8181")
.config("spark.sql.catalog.demo.warehouse", "s3://warehouse/wh/")
.config("spark.sql.catalog.demo.s3.endpoint", "http://127.0.0.1:9000")
.config("spark.sql.catalog.demo.s3.path-style-access", "true")
.config('spark.driver.memory', '10g')
.config('spark.sql.session.timeZone', 'UTC')
.config("spark.sql.catalogImplementation", "in-memory")
.config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
.config('spark.jars', SPARK_RUNTIME_PATH)
.getOrCreate()
)
spark.sql("USE demo")
spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
con.con = spark

@@ -0,0 +1,21 @@
CREATE OR REPLACE TABLE default.spark_written_upper_lower_bounds (
int_type INTEGER,
bigint_type BIGINT,
varchar_type VARCHAR(100),
bool_type BOOLEAN,
float_type FLOAT,
double_type DOUBLE,
decimal_type_18_3 DECIMAL(18, 3),
date_type DATE,
-- time_type TIME,
timestamp_type TIMESTAMP,
-- timestamp_tz_type TIMESTAMPTZ,
-- uuid_type UUID,
binary_type BINARY
)
USING ICEBERG
TBLPROPERTIES (
'format-version' = '2',
'write.update.mode' = 'copy-on-write',
'write.target-file-size-bytes' = '2132'
);
@@ -0,0 +1,46 @@
INSERT INTO default.spark_written_upper_lower_bounds VALUES
-- Lower bounds
(
-2147483648, -- int_type (Integer min)
-9223372036854775808, -- bigint_type (Long min)
'', -- varchar_type (empty string as lower bound)
false, -- bool_type
-3.4028235E38, -- float_type (Float min)
-1.7976931348623157E308, -- double_type (Double min)
-9999999999999.999, -- decimal(18,3) lower bound
DATE '0001-01-01', -- date_type (Spark's min date)
-- TIME '00:00:00', -- time_type
TIMESTAMP '0001-01-01 00:00:00', -- timestamp_type
-- TIMESTAMPTZ '0001-01-01 00:00:00+00',-- timestamp_tz_type
-- UUID '00000000-0000-0000-0000-000000000000', -- uuid_type
X'' -- binary_type (empty binary)
),
-- Upper bounds
(
2147483647, -- int_type (Integer max)
9223372036854775807, -- bigint_type (Long max)
RPAD('Z', 100, 'Z'), -- varchar_type (max-length string)
true, -- bool_type
3.4028235E38, -- float_type (Float max)
1.7976931348623157E308, -- double_type (Double max)
9999999999999.999, -- decimal(18,3) upper bound
DATE '9999-12-31', -- date_type
-- TIME '23:59:59.999999', -- time_type (microsecond max)
TIMESTAMP '9999-12-31 23:59:59.999999', -- timestamp_type
-- TIMESTAMPTZ '9999-12-31 23:59:59.999999+00', -- timestamp_tz_type
-- UUID 'ffffffff-ffff-ffff-ffff-ffffffffffff', -- uuid_type
X'FFFFFFFF' -- binary_type (example max-ish binary)
),
-- NULL values
(
NULL, -- int_type
NULL, -- bigint_type
NULL, -- varchar_type
NULL, -- bool_type
NULL, -- float_type
NULL, -- double_type
NULL, -- decimal_type_18_3
NULL, -- date_type
NULL, -- timestamp_type
NULL -- binary_type
);
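
As a quick sanity check of what Spark actually wrote, the bounds and null counts recorded per data file can be inspected through Iceberg's standard "files" metadata table. This is a hypothetical verification query, not part of the generator, and it assumes the demo REST catalog session configured in the test setup above is still active:

-- Sketch: inspect per-file column statistics via the Iceberg "files" metadata table
SELECT
    file_path,
    record_count,
    lower_bounds,       -- map of field id -> serialized lower bound
    upper_bounds,       -- map of field id -> serialized upper bound
    null_value_counts   -- map of field id -> number of NULLs in that column
FROM demo.default.spark_written_upper_lower_bounds.files;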
6 changes: 1 addition & 5 deletions src/catalog_api.cpp
@@ -1,6 +1,4 @@
#include "catalog_api.hpp"
#include "include/catalog_api.hpp"

#include "catalog_utils.hpp"
#include "iceberg_logging.hpp"
#include "storage/irc_catalog.hpp"
@@ -14,9 +12,7 @@
#include "duckdb/common/error_data.hpp"
#include "duckdb/common/http_util.hpp"
#include "duckdb/common/exception/http_exception.hpp"
#include "include/storage/irc_authorization.hpp"
#include "include/storage/irc_catalog.hpp"

#include "storage/irc_authorization.hpp"
#include "rest_catalog/objects/list.hpp"

using namespace duckdb_yyjson;
55 changes: 55 additions & 0 deletions src/iceberg_manifest.cpp
@@ -3,6 +3,7 @@

#include "duckdb/storage/caching_file_system.hpp"
#include "catalog_utils.hpp"
#include "iceberg_value.hpp"
#include "storage/iceberg_table_information.hpp"

namespace duckdb {
@@ -51,8 +52,21 @@
for (auto &child : upper_bounds) {
upper_bounds_values.push_back(Value::STRUCT({{"key", child.first}, {"value", child.second}}));
}

children.push_back(Value::MAP(LogicalType::STRUCT(bounds_types), upper_bounds_values));

child_list_t<LogicalType> null_value_count_types;
null_value_count_types.emplace_back("key", LogicalType::INTEGER);
null_value_count_types.emplace_back("value", LogicalType::BIGINT);

vector<Value> null_value_counts_values;
// null value counts: map<121: int, 122: long> - 110
for (auto &child : null_value_counts) {
null_value_counts_values.push_back(Value::STRUCT({{"key", child.first}, {"value", child.second}}));
}

children.push_back(Value::MAP(LogicalType::STRUCT(null_value_count_types), null_value_counts_values));

return Value::STRUCT(type, children);
}

@@ -220,7 +234,7 @@
yyjson_mut_obj_add_strcpy(doc, partition_struct, "type", "struct");
//! NOTE: this has to be populated with the fields of the partition spec when we support INSERT into a
//! partitioned table
[[maybe_unused]] auto partition_fields = yyjson_mut_obj_add_arr(doc, partition_struct, "fields");

Check warning on line 237 in src/iceberg_manifest.cpp (GitHub Actions / Build extension binaries / Windows, windows_amd64): attribute [[maybe_unused]] requires at least '/std:c++17'; ignored
}

{
@@ -335,6 +349,47 @@
yyjson_mut_obj_add_uint(doc, val_obj, "id", UPPER_BOUNDS_VALUE);
}

// null value counts struct
child_list_t<LogicalType> null_value_counts_fields;
null_value_counts_fields.emplace_back("key", LogicalType::INTEGER);
null_value_counts_fields.emplace_back("value", LogicalType::BIGINT);
{
// null_value_counts: map<121: int, 122: long> - 110
children.emplace_back("null_value_counts", LogicalType::MAP(LogicalType::STRUCT(null_value_counts_fields)));

child_list_t<Value> null_values_counts_record_field_ids;
null_values_counts_record_field_ids.emplace_back("__duckdb_field_id", Value::INTEGER(NULL_VALUE_COUNTS));
null_values_counts_record_field_ids.emplace_back("key", Value::INTEGER(NULL_VALUE_COUNTS_KEY));
null_values_counts_record_field_ids.emplace_back("value", Value::INTEGER(NULL_VALUE_COUNTS_VALUE));

data_file_field_ids.emplace_back("null_value_counts", Value::STRUCT(null_values_counts_record_field_ids));
auto field_obj = yyjson_mut_arr_add_obj(doc, child_fields_arr);
yyjson_mut_obj_add_uint(doc, field_obj, "id", NULL_VALUE_COUNTS);
yyjson_mut_obj_add_strcpy(doc, field_obj, "name", "null_value_counts");
yyjson_mut_obj_add_bool(doc, field_obj, "required", false);

auto null_value_counts_type_struct = yyjson_mut_obj_add_obj(doc, field_obj, "type");
yyjson_mut_obj_add_strcpy(doc, null_value_counts_type_struct, "type", "array");
auto items_obj = yyjson_mut_obj_add_obj(doc, null_value_counts_type_struct, "items");
yyjson_mut_obj_add_strcpy(doc, items_obj, "type", "record");
yyjson_mut_obj_add_strcpy(
doc, items_obj, "name",
StringUtil::Format("k%d_k%d", NULL_VALUE_COUNTS_KEY, NULL_VALUE_COUNTS_VALUE).c_str());
auto record_fields_arr = yyjson_mut_obj_add_arr(doc, items_obj, "fields");

auto key_obj = yyjson_mut_arr_add_obj(doc, record_fields_arr);
yyjson_mut_obj_add_strcpy(doc, key_obj, "name", "key");
yyjson_mut_obj_add_strcpy(doc, key_obj, "type", "int");
yyjson_mut_obj_add_uint(doc, key_obj, "id", NULL_VALUE_COUNTS_KEY);

auto val_obj = yyjson_mut_arr_add_obj(doc, record_fields_arr);
yyjson_mut_obj_add_strcpy(doc, val_obj, "name", "value");
yyjson_mut_obj_add_strcpy(doc, val_obj, "type", "binary");
yyjson_mut_obj_add_uint(doc, val_obj, "id", NULL_VALUE_COUNTS_VALUE);
}

{
// data_file: struct(...) - 2
names.push_back("data_file");