@@ -1032,8 +1032,16 @@ def test_data_file_uuid_array_written_as_string(
10321032 "WITH (compatibility_mode = 'snowflake');" ,
10331033 pg_conn ,
10341034 )
1035+ # a populated row, a whole-array NULL, a NULL element, and an empty array:
1036+ # none may introduce a uuid leaf, and the NULLs must survive the round trip.
10351037 run_command (
1036- f"INSERT INTO compat_df_arr VALUES (1, ARRAY['{ u1 } ','{ u2 } ']::uuid[]);" ,
1038+ f"""
1039+ INSERT INTO compat_df_arr VALUES
1040+ (1, ARRAY['{ u1 } ','{ u2 } ']::uuid[]),
1041+ (2, NULL),
1042+ (3, ARRAY['{ u1 } ', NULL]::uuid[]),
1043+ (4, ARRAY[]::uuid[]);
1044+ """ ,
10371045 pg_conn ,
10381046 )
10391047 pg_conn .commit ()
@@ -1043,6 +1051,14 @@ def test_data_file_uuid_array_written_as_string(
10431051 assert ("element" , "BYTE_ARRAY" ) in _parquet_leaf_types (
10441052 superuser_conn , "compat_df_arr"
10451053 )
1054+ # the NULLs round-trip: whole-array NULL stays NULL, the NULL element is
1055+ # preserved inside the (now text) array, and uuids read back as text.
1056+ # Read on pg_conn (the owner) so the scan's lock doesn't block the DROP.
1057+ rows = run_query ("SELECT id, us FROM compat_df_arr ORDER BY id" , pg_conn )
1058+ assert rows [0 ][1 ] == [u1 , u2 ]
1059+ assert rows [1 ][1 ] is None
1060+ assert rows [2 ][1 ] == [u1 , None ]
1061+ assert rows [3 ][1 ] == []
10461062 finally :
10471063 run_command ("DROP TABLE compat_df_arr" , pg_conn )
10481064 pg_conn .commit ()
@@ -1062,17 +1078,32 @@ def test_data_file_composite_and_map_written_as_string(
10621078 pg_conn ,
10631079 )
10641080 converted_map = _col_format_type (pg_conn , "compat_df_comp" , "m" )
1081+ # populated; whole composite + map NULL; composite with NULL uuid fields +
1082+ # empty map; composite with a NULL array element + NULL map value.
10651083 run_command (
10661084 f"""
10671085 INSERT INTO compat_df_comp VALUES
10681086 (1, ROW('{ u } ', ARRAY['{ u } ']::uuid[], 'hi'),
1069- ARRAY[('k', '{ u } ')]::{ converted_map } );
1087+ ARRAY[('k', '{ u } ')]::{ converted_map } ),
1088+ (2, NULL, NULL),
1089+ (3, ROW(NULL, NULL, 'no-ids'), ARRAY[]::{ converted_map } ),
1090+ (4, ROW('{ u } ', ARRAY['{ u } ', NULL]::uuid[], NULL),
1091+ ARRAY[('k', NULL)]::{ converted_map } );
10701092 """ ,
10711093 pg_conn ,
10721094 )
10731095 pg_conn .commit ()
10741096 try :
10751097 _assert_data_files_uuid_free (superuser_conn , "compat_df_comp" )
1098+ # NULLs survive: whole composite/map NULL, NULL fields, and the NULL
1099+ # array element inside the (now text) composite all round-trip.
1100+ rows = run_query (
1101+ "SELECT id, (c).cid, (c).cids, m FROM compat_df_comp ORDER BY id" ,
1102+ pg_conn ,
1103+ )
1104+ assert rows [1 ][1 ] is None and rows [1 ][2 ] is None and rows [1 ][3 ] is None
1105+ assert rows [2 ][1 ] is None and rows [2 ][2 ] is None
1106+ assert rows [3 ][2 ] == [u , None ]
10761107 finally :
10771108 run_command ("DROP TABLE compat_df_comp" , pg_conn )
10781109 pg_conn .commit ()
@@ -1105,13 +1136,15 @@ def test_data_file_deeply_nested_written_as_string(
11051136 # array-of-composite structure at the schema level.
11061137 leaf = f"ROW('{ u } ', ARRAY['{ u } '], 'l')"
11071138 mid = f"ROW({ leaf } , NULL, '{ u } ')"
1139+ # a populated deep row; an all-NULL row; and a row with NULLs at every
1140+ # intermediate level (NULL leaf, NULL array element, NULL mid_id).
1141+ mid_nulls = f"ROW(NULL, NULL, NULL)"
11081142 run_command (
11091143 f"""
1110- INSERT INTO compat_df_deep VALUES (
1111- 1, '{ u } ', ARRAY['{ u } '],
1112- { mid } ,
1113- NULL
1114- );
1144+ INSERT INTO compat_df_deep VALUES
1145+ (1, '{ u } ', ARRAY['{ u } '], { mid } , NULL),
1146+ (2, NULL, NULL, NULL, NULL),
1147+ (3, '{ u } ', ARRAY['{ u } ', NULL], { mid_nulls } , NULL);
11151148 """ ,
11161149 pg_conn ,
11171150 )
@@ -1150,11 +1183,15 @@ def test_data_file_copy_from_uuid_parquet_written_as_string(
11501183 src = f"s3://{ TEST_BUCKET } /compat_df_src/u.parquet"
11511184 duck = create_duckdb_conn ()
11521185 try :
1186+ # source rows: populated, whole-array NULL, NULL element, empty array
11531187 duck .execute (
11541188 f"""
1155- COPY (SELECT 1 AS id,
1156- ['{ u1 } '::uuid, '{ u2 } '::uuid] AS us)
1157- TO '{ src } ' (FORMAT parquet);
1189+ COPY (
1190+ SELECT 1 AS id, ['{ u1 } '::uuid, '{ u2 } '::uuid] AS us
1191+ UNION ALL SELECT 2, NULL
1192+ UNION ALL SELECT 3, ['{ u1 } '::uuid, NULL]
1193+ UNION ALL SELECT 4, []::uuid[]
1194+ ) TO '{ src } ' (FORMAT parquet);
11581195 """
11591196 )
11601197 # sanity: the source really is a uuid leaf (proves the test can detect it)
@@ -1174,6 +1211,69 @@ def test_data_file_copy_from_uuid_parquet_written_as_string(
11741211 assert ("element" , "BYTE_ARRAY" ) in _parquet_leaf_types (
11751212 superuser_conn , "compat_df_copy"
11761213 )
1214+ # NULLs from the source parquet round-trip through the cast
1215+ rows = run_query ("SELECT id, us FROM compat_df_copy ORDER BY id" , pg_conn )
1216+ assert rows [0 ][1 ] == [u1 , u2 ]
1217+ assert rows [1 ][1 ] is None
1218+ assert rows [2 ][1 ] == [u1 , None ]
1219+ assert rows [3 ][1 ] == []
11771220 finally :
11781221 run_command ("DROP TABLE compat_df_copy" , pg_conn )
11791222 pg_conn .commit ()
1223+
1224+
def test_data_file_all_null_nested_column_still_string(
    s3, pg_conn, superuser_conn, extension, with_default_location
):
    """
    All-NULL nested uuid columns must still be written with string leaves.

    Creates a snowflake-compatibility iceberg table with a top-level uuid
    column, a uuid[] column and a composite holding uuid / uuid[] fields, then
    inserts two rows in which every one of those columns is NULL. With no
    concrete uuid value in any row, only the declared column type is available
    to decide how each leaf is serialized.

    Checks performed on the written data files:
      * no uuid leaf is reported other than one named "top_id" (a "top_id"
        uuid leaf is tolerated by the filter; this test does not require it);
      * ("element", "BYTE_ARRAY") appears among the table's parquet leaf types.

    The table and the composite type are dropped afterwards, whether or not
    the checks pass.
    """
    create_sql = """
        CREATE TYPE df_null_comp AS (cid uuid, cids uuid[], note text);
        CREATE TABLE compat_df_allnull (
            id int,
            top_id uuid, -- top-level: stays uuid
            us uuid[], -- nested: -> text[]
            c df_null_comp -- nested: cid / cids -> text
        ) USING iceberg WITH (compatibility_mode = 'snowflake');
        """
    run_command(create_sql, pg_conn)

    # every uuid-bearing column is NULL in every row
    insert_sql = """
        INSERT INTO compat_df_allnull VALUES
            (1, NULL, NULL, NULL),
            (2, NULL, NULL, NULL);
        """
    run_command(insert_sql, pg_conn)
    pg_conn.commit()

    try:
        data_files = _data_file_paths(superuser_conn, "compat_df_allnull")
        duckdb_conn = create_duckdb_conn()
        try:
            for data_file in data_files:
                # collect every reported uuid leaf except the top-level column
                unexpected = []
                for name, second, third in _parquet_uuid_leaves(
                    duckdb_conn, data_file
                ):
                    if name == "top_id":
                        continue
                    unexpected.append((name, second, third))
                assert (
                    not unexpected
                ), f"all-NULL file serialized uuid leaves: {unexpected}"
        finally:
            duckdb_conn.close()

        # positive check: the rewritten array element is a string leaf
        leaf_types = _parquet_leaf_types(superuser_conn, "compat_df_allnull")
        assert ("element", "BYTE_ARRAY") in leaf_types
    finally:
        run_command("DROP TABLE compat_df_allnull", pg_conn)
        run_command("DROP TYPE df_null_comp", pg_conn)
        pg_conn.commit()
0 commit comments