5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -2,15 +2,16 @@ cmake_minimum_required(VERSION 3.5...3.29)

# Set extension name here
set(TARGET_NAME iceberg)
project(${TARGET_NAME})

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard")
set(CMAKE_CXX_STANDARD_REQUIRED True)

set(EXTENSION_NAME ${TARGET_NAME}_extension)
include_directories(src/include)


set(EXTENSION_SOURCES
src/iceberg_extension.cpp
src/iceberg_functions.cpp
12 changes: 6 additions & 6 deletions extension_config.cmake
@@ -1,3 +1,9 @@
# Extension from this repo
duckdb_extension_load(iceberg
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

# This file is included by DuckDB's build system. It specifies which extensions to load
if (NOT EMSCRIPTEN)
duckdb_extension_load(avro
@@ -7,12 +13,6 @@ duckdb_extension_load(avro
)
endif()

# Extension from this repo
duckdb_extension_load(iceberg
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

if (NOT EMSCRIPTEN)
duckdb_extension_load(tpch)
duckdb_extension_load(icu)
2 changes: 2 additions & 0 deletions scripts/data_generators/generate_data.py
@@ -31,6 +31,8 @@
print(f"Generating for '{target}'")
for test in tests:
print(f"Generating test '{test.table}'")
if test.table == "spark_written_upper_lower_bounds":
continue
test.generate(con)

if __name__ == "__main__":
@@ -0,0 +1,52 @@
from scripts.data_generators.tests.base import IcebergTest
import pathlib
from pyspark.sql import SparkSession
import pyspark
import pyspark.sql
from pyspark import SparkContext
import os

SPARK_RUNTIME_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'iceberg-spark-runtime-3.5_2.12-1.9.0.jar')

@IcebergTest.register()
class Test(IcebergTest):
def __init__(self):
path = pathlib.PurePath(__file__)
super().__init__(path.parent.name)

def setup(self, con):
os.environ["PYSPARK_SUBMIT_ARGS"] = (
"--packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.apache.iceberg:iceberg-aws-bundle:1.9.0 pyspark-shell"
)
os.environ["AWS_REGION"] = "us-east-1"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
if SparkContext._active_spark_context is not None:
SparkContext._active_spark_context.stop()

spark = (
SparkSession.builder.appName("DuckDB REST Integration test")
.master("local[1]")
.config(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
.config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
.config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
.config("spark.sql.catalog.demo.type", "rest")
.config("spark.sql.catalog.demo.uri", "http://127.0.0.1:8181")
.config("spark.sql.catalog.demo.warehouse", "s3://warehouse/wh/")
.config("spark.sql.catalog.demo.s3.endpoint", "http://127.0.0.1:9000")
.config("spark.sql.catalog.demo.s3.path-style-access", "true")
.config('spark.driver.memory', '10g')
.config('spark.sql.session.timeZone', 'UTC')
.config("spark.sql.catalogImplementation", "in-memory")
.config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
.config('spark.jars', SPARK_RUNTIME_PATH)
.getOrCreate()
)
spark.sql("USE demo")
spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
con.con = spark

@@ -0,0 +1,21 @@
CREATE OR REPLACE TABLE default.spark_written_upper_lower_bounds (
int_type INTEGER,
bigint_type BIGINT,
varchar_type VARCHAR(100),
bool_type BOOLEAN,
float_type FLOAT,
double_type DOUBLE,
decimal_type_18_3 DECIMAL(18, 3),
date_type DATE,
-- time_type TIME,
timestamp_type TIMESTAMP,
-- timestamp_tz_type TIMESTAMPTZ,
-- uuid_type UUID,
binary_type BINARY
)
USING ICEBERG
TBLPROPERTIES (
'format-version' = '2',
'write.update.mode' = 'copy-on-write',
'write.target-file-size-bytes' = '2132'
);
@@ -0,0 +1,46 @@
INSERT INTO default.spark_written_upper_lower_bounds VALUES
-- Lower bounds
(
-2147483648, -- int_type (Integer min)
-9223372036854775808, -- bigint_type (Long min)
'', -- varchar_type (empty string as lower bound)
false, -- bool_type
-3.4028235E38, -- float_type (Float min)
-1.7976931348623157E308, -- double_type (Double min)
-9999999999999.999, -- decimal(18,3) lower bound
DATE '0001-01-01', -- date_type (Spark's min date)
-- TIME '00:00:00', -- time_type
TIMESTAMP '0001-01-01 00:00:00', -- timestamp_type
-- TIMESTAMPTZ '0001-01-01 00:00:00+00',-- timestamp_tz_type
-- UUID '00000000-0000-0000-0000-000000000000', -- uuid_type
X'' -- binary_type (empty binary)
),
-- Upper bounds
(
2147483647, -- int_type (Integer max)
9223372036854775807, -- bigint_type (Long max)
RPAD('Z', 100, 'Z'), -- varchar_type (max-length string)
true, -- bool_type
3.4028235E38, -- float_type (Float max)
1.7976931348623157E308, -- double_type (Double max)
9999999999999.999, -- decimal(18,3) upper bound
DATE '9999-12-31', -- date_type
-- TIME '23:59:59.999999', -- time_type (microsecond max)
TIMESTAMP '9999-12-31 23:59:59.999999', -- timestamp_type
-- TIMESTAMPTZ '9999-12-31 23:59:59.999999+00', -- timestamp_tz_type
-- UUID 'ffffffff-ffff-ffff-ffff-ffffffffffff', -- uuid_type
X'FFFFFFFF' -- binary_type (example max-ish binary)
),
-- NULL values
(
NULL, -- int_type
NULL, -- bigint_type
NULL, -- varchar_type
NULL, -- bool_type
NULL, -- float_type
NULL, -- double_type
NULL, -- decimal_type_18_3
NULL, -- date_type
NULL, -- timestamp_type
NULL -- binary_type
);
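
As a quick sanity check of what Spark actually wrote, the bounds and null counts recorded per data file can be inspected through Iceberg's standard "files" metadata table. This is a hypothetical verification query, not part of the generator, and it assumes the demo REST catalog session configured in the test setup above is still active:

-- Sketch: inspect per-file column statistics via the Iceberg "files" metadata table
SELECT
    file_path,
    record_count,
    lower_bounds,       -- map of field id -> serialized lower bound
    upper_bounds,       -- map of field id -> serialized upper bound
    null_value_counts   -- map of field id -> number of NULLs in that column
FROM demo.default.spark_written_upper_lower_bounds.files;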
6 changes: 1 addition & 5 deletions src/catalog_api.cpp
@@ -1,6 +1,4 @@
#include "catalog_api.hpp"
#include "include/catalog_api.hpp"

#include "catalog_utils.hpp"
#include "iceberg_logging.hpp"
#include "storage/irc_catalog.hpp"
@@ -14,9 +12,7 @@
#include "duckdb/common/error_data.hpp"
#include "duckdb/common/http_util.hpp"
#include "duckdb/common/exception/http_exception.hpp"
#include "include/storage/irc_authorization.hpp"
#include "include/storage/irc_catalog.hpp"

#include "storage/irc_authorization.hpp"
#include "rest_catalog/objects/list.hpp"

using namespace duckdb_yyjson;
55 changes: 55 additions & 0 deletions src/iceberg_manifest.cpp
@@ -3,6 +3,7 @@

#include "duckdb/storage/caching_file_system.hpp"
#include "catalog_utils.hpp"
#include "iceberg_value.hpp"
#include "storage/iceberg_table_information.hpp"

namespace duckdb {
@@ -51,8 +52,21 @@
for (auto &child : upper_bounds) {
upper_bounds_values.push_back(Value::STRUCT({{"key", child.first}, {"value", child.second}}));
}

children.push_back(Value::MAP(LogicalType::STRUCT(bounds_types), upper_bounds_values));

child_list_t<LogicalType> null_value_count_types;
null_value_count_types.emplace_back("key", LogicalType::INTEGER);
null_value_count_types.emplace_back("value", LogicalType::BIGINT);

vector<Value> null_value_counts_values;
// null value counts: map<121: int, 122: long> - 110
for (auto &child : null_value_counts) {
null_value_counts_values.push_back(Value::STRUCT({{"key", child.first}, {"value", child.second}}));
}

children.push_back(Value::MAP(LogicalType::STRUCT(null_value_count_types), null_value_counts_values));

return Value::STRUCT(type, children);
}

@@ -220,7 +234,7 @@
yyjson_mut_obj_add_strcpy(doc, partition_struct, "type", "struct");
//! NOTE: this has to be populated with the fields of the partition spec when we support INSERT into a
//! partitioned table
[[maybe_unused]] auto partition_fields = yyjson_mut_obj_add_arr(doc, partition_struct, "fields");

Check warning on line 237 in src/iceberg_manifest.cpp (GitHub Actions / Build extension binaries / Windows, windows_amd64): attribute [[maybe_unused]] requires at least '/std:c++17'; ignored
}

{
@@ -335,6 +349,47 @@
yyjson_mut_obj_add_uint(doc, val_obj, "id", UPPER_BOUNDS_VALUE);
}

// null value counts struct
child_list_t<LogicalType> null_value_counts_fields;
null_value_counts_fields.emplace_back("key", LogicalType::INTEGER);
null_value_counts_fields.emplace_back("value", LogicalType::BIGINT);
{
// null_value_counts: map<121: int, 122: long> - 110
children.emplace_back("null_value_counts", LogicalType::MAP(LogicalType::STRUCT(null_value_counts_fields)));

child_list_t<Value> null_values_counts_record_field_ids;
null_values_counts_record_field_ids.emplace_back("__duckdb_field_id", Value::INTEGER(NULL_VALUE_COUNTS));
null_values_counts_record_field_ids.emplace_back("key", Value::INTEGER(NULL_VALUE_COUNTS_KEY));
null_values_counts_record_field_ids.emplace_back("value", Value::INTEGER(NULL_VALUE_COUNTS_VALUE));

data_file_field_ids.emplace_back("null_value_counts", Value::STRUCT(null_values_counts_record_field_ids));
auto field_obj = yyjson_mut_arr_add_obj(doc, child_fields_arr);
yyjson_mut_obj_add_uint(doc, field_obj, "id", NULL_VALUE_COUNTS);
yyjson_mut_obj_add_strcpy(doc, field_obj, "name", "null_value_counts");
yyjson_mut_obj_add_bool(doc, field_obj, "required", false);

auto null_value_counts_type_struct = yyjson_mut_obj_add_obj(doc, field_obj, "type");
yyjson_mut_obj_add_strcpy(doc, null_value_counts_type_struct, "type", "array");
auto items_obj = yyjson_mut_obj_add_obj(doc, null_value_counts_type_struct, "items");
yyjson_mut_obj_add_strcpy(doc, items_obj, "type", "record");
yyjson_mut_obj_add_strcpy(
doc, items_obj, "name",
StringUtil::Format("k%d_k%d", NULL_VALUE_COUNTS_KEY, NULL_VALUE_COUNTS_VALUE).c_str());
auto record_fields_arr = yyjson_mut_obj_add_arr(doc, items_obj, "fields");

auto key_obj = yyjson_mut_arr_add_obj(doc, record_fields_arr);
yyjson_mut_obj_add_strcpy(doc, key_obj, "name", "key");
yyjson_mut_obj_add_strcpy(doc, key_obj, "type", "int");
yyjson_mut_obj_add_uint(doc, key_obj, "id", NULL_VALUE_COUNTS_KEY);

auto val_obj = yyjson_mut_arr_add_obj(doc, record_fields_arr);
yyjson_mut_obj_add_strcpy(doc, val_obj, "name", "value");
yyjson_mut_obj_add_strcpy(doc, val_obj, "type", "binary");
yyjson_mut_obj_add_uint(doc, val_obj, "id", NULL_VALUE_COUNTS_VALUE);
}

{
// data_file: struct(...) - 2
names.push_back("data_file");