Skip to content

Commit 6715bd0

Browse files
pg_lake_iceberg: cover NULLs in compatibility_mode data-file tests
The data-file assertions only inserted fully-populated rows. Add NULL coverage to every case -- whole-value NULL, NULL array element, empty array, NULL composite/map fields -- since a column-oriented writer can only get a rewritten leaf's type from the schema (not an observed value). New test_data_file_all_null_nested_column_still_string drives the extreme: every row NULL, so nothing but the column type can force the string leaf. Read-backs run on the owning connection so the scan's lock cannot block the teardown DROP. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 04d0bbd commit 6715bd0

1 file changed

Lines changed: 110 additions & 10 deletions

File tree

pg_lake_table/tests/pytests/test_compatibility_mode.py

Lines changed: 110 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,8 +1032,16 @@ def test_data_file_uuid_array_written_as_string(
10321032
"WITH (compatibility_mode = 'snowflake');",
10331033
pg_conn,
10341034
)
1035+
# a populated row, a whole-array NULL, a NULL element, and an empty array:
1036+
# none may introduce a uuid leaf, and the NULLs must survive the round trip.
10351037
run_command(
1036-
f"INSERT INTO compat_df_arr VALUES (1, ARRAY['{u1}','{u2}']::uuid[]);",
1038+
f"""
1039+
INSERT INTO compat_df_arr VALUES
1040+
(1, ARRAY['{u1}','{u2}']::uuid[]),
1041+
(2, NULL),
1042+
(3, ARRAY['{u1}', NULL]::uuid[]),
1043+
(4, ARRAY[]::uuid[]);
1044+
""",
10371045
pg_conn,
10381046
)
10391047
pg_conn.commit()
@@ -1043,6 +1051,14 @@ def test_data_file_uuid_array_written_as_string(
10431051
assert ("element", "BYTE_ARRAY") in _parquet_leaf_types(
10441052
superuser_conn, "compat_df_arr"
10451053
)
1054+
# the NULLs round-trip: whole-array NULL stays NULL, the NULL element is
1055+
# preserved inside the (now text) array, and uuids read back as text.
1056+
# Read on pg_conn (the owner) so the scan's lock doesn't block the DROP.
1057+
rows = run_query("SELECT id, us FROM compat_df_arr ORDER BY id", pg_conn)
1058+
assert rows[0][1] == [u1, u2]
1059+
assert rows[1][1] is None
1060+
assert rows[2][1] == [u1, None]
1061+
assert rows[3][1] == []
10461062
finally:
10471063
run_command("DROP TABLE compat_df_arr", pg_conn)
10481064
pg_conn.commit()
@@ -1062,17 +1078,32 @@ def test_data_file_composite_and_map_written_as_string(
10621078
pg_conn,
10631079
)
10641080
converted_map = _col_format_type(pg_conn, "compat_df_comp", "m")
1081+
# populated; whole composite + map NULL; composite with NULL uuid fields +
1082+
# empty map; composite with a NULL array element + NULL map value.
10651083
run_command(
10661084
f"""
10671085
INSERT INTO compat_df_comp VALUES
10681086
(1, ROW('{u}', ARRAY['{u}']::uuid[], 'hi'),
1069-
ARRAY[('k', '{u}')]::{converted_map});
1087+
ARRAY[('k', '{u}')]::{converted_map}),
1088+
(2, NULL, NULL),
1089+
(3, ROW(NULL, NULL, 'no-ids'), ARRAY[]::{converted_map}),
1090+
(4, ROW('{u}', ARRAY['{u}', NULL]::uuid[], NULL),
1091+
ARRAY[('k', NULL)]::{converted_map});
10701092
""",
10711093
pg_conn,
10721094
)
10731095
pg_conn.commit()
10741096
try:
10751097
_assert_data_files_uuid_free(superuser_conn, "compat_df_comp")
1098+
# NULLs survive: whole composite/map NULL, NULL fields, and the NULL
1099+
# array element inside the (now text) composite all round-trip.
1100+
rows = run_query(
1101+
"SELECT id, (c).cid, (c).cids, m FROM compat_df_comp ORDER BY id",
1102+
pg_conn,
1103+
)
1104+
assert rows[1][1] is None and rows[1][2] is None and rows[1][3] is None
1105+
assert rows[2][1] is None and rows[2][2] is None
1106+
assert rows[3][2] == [u, None]
10761107
finally:
10771108
run_command("DROP TABLE compat_df_comp", pg_conn)
10781109
pg_conn.commit()
@@ -1105,13 +1136,15 @@ def test_data_file_deeply_nested_written_as_string(
11051136
# array-of-composite structure at the schema level.
11061137
leaf = f"ROW('{u}', ARRAY['{u}'], 'l')"
11071138
mid = f"ROW({leaf}, NULL, '{u}')"
1139+
# a populated deep row; an all-NULL row; and a row with NULLs at every
1140+
# intermediate level (NULL leaf, NULL array element, NULL mid_id).
1141+
mid_nulls = f"ROW(NULL, NULL, NULL)"
11081142
run_command(
11091143
f"""
1110-
INSERT INTO compat_df_deep VALUES (
1111-
1, '{u}', ARRAY['{u}'],
1112-
{mid},
1113-
NULL
1114-
);
1144+
INSERT INTO compat_df_deep VALUES
1145+
(1, '{u}', ARRAY['{u}'], {mid}, NULL),
1146+
(2, NULL, NULL, NULL, NULL),
1147+
(3, '{u}', ARRAY['{u}', NULL], {mid_nulls}, NULL);
11151148
""",
11161149
pg_conn,
11171150
)
@@ -1150,11 +1183,15 @@ def test_data_file_copy_from_uuid_parquet_written_as_string(
11501183
src = f"s3://{TEST_BUCKET}/compat_df_src/u.parquet"
11511184
duck = create_duckdb_conn()
11521185
try:
1186+
# source rows: populated, whole-array NULL, NULL element, empty array
11531187
duck.execute(
11541188
f"""
1155-
COPY (SELECT 1 AS id,
1156-
['{u1}'::uuid, '{u2}'::uuid] AS us)
1157-
TO '{src}' (FORMAT parquet);
1189+
COPY (
1190+
SELECT 1 AS id, ['{u1}'::uuid, '{u2}'::uuid] AS us
1191+
UNION ALL SELECT 2, NULL
1192+
UNION ALL SELECT 3, ['{u1}'::uuid, NULL]
1193+
UNION ALL SELECT 4, []::uuid[]
1194+
) TO '{src}' (FORMAT parquet);
11581195
"""
11591196
)
11601197
# sanity: the source really is a uuid leaf (proves the test can detect it)
@@ -1174,6 +1211,69 @@ def test_data_file_copy_from_uuid_parquet_written_as_string(
11741211
assert ("element", "BYTE_ARRAY") in _parquet_leaf_types(
11751212
superuser_conn, "compat_df_copy"
11761213
)
1214+
# NULLs from the source parquet round-trip through the cast
1215+
rows = run_query("SELECT id, us FROM compat_df_copy ORDER BY id", pg_conn)
1216+
assert rows[0][1] == [u1, u2]
1217+
assert rows[1][1] is None
1218+
assert rows[2][1] == [u1, None]
1219+
assert rows[3][1] == []
11771220
finally:
11781221
run_command("DROP TABLE compat_df_copy", pg_conn)
11791222
pg_conn.commit()
1223+
1224+
1225+
def test_data_file_all_null_nested_column_still_string(
    s3, pg_conn, superuser_conn, extension, with_default_location
):
    """
    The hardest case for a column-oriented writer: when EVERY row's nested value
    is NULL the writer never observes a concrete uuid, so nothing but the column
    type can force the leaf to string. The data file must still serialize the
    rewritten leaves as BYTE_ARRAY (never uuid), and the top-level uuid stays
    uuid even when it too is all-NULL.

    Checks performed against the written data files:
      * at least one data file exists (so the per-file checks are not vacuous);
      * no uuid leaf other than ``top_id`` is present in any file;
      * ``top_id`` itself is still reported as a uuid leaf;
      * the rewritten array element shows up as an ("element", "BYTE_ARRAY") leaf.
    """
    run_command(
        """
        CREATE TYPE df_null_comp AS (cid uuid, cids uuid[], note text);
        CREATE TABLE compat_df_allnull (
            id int,
            top_id uuid,          -- top-level: stays uuid
            us uuid[],            -- nested: -> text[]
            c df_null_comp        -- nested: cid / cids -> text
        ) USING iceberg WITH (compatibility_mode = 'snowflake');
        """,
        pg_conn,
    )
    run_command(
        """
        INSERT INTO compat_df_allnull VALUES
            (1, NULL, NULL, NULL),
            (2, NULL, NULL, NULL);
        """,
        pg_conn,
    )
    pg_conn.commit()
    try:
        # no rewritten leaf may be a uuid, even with no concrete value to infer
        paths = _data_file_paths(superuser_conn, "compat_df_allnull")
        # without a data file the loop below would pass without checking anything
        assert paths, "no data files were written for compat_df_allnull"

        top_id_seen_as_uuid = False
        duck = create_duckdb_conn()
        try:
            for path in paths:
                # NOTE(review): the first field of each leaf is taken to be the
                # leaf's name, as the "top_id" filter below relies on -- confirm
                # against _parquet_uuid_leaves.
                uuid_leaves = list(_parquet_uuid_leaves(duck, path))
                offenders = [
                    leaf
                    for leaf in uuid_leaves
                    if leaf[0] != "top_id"  # the top-level uuid legitimately stays uuid
                ]
                assert (
                    not offenders
                ), f"all-NULL file serialized uuid leaves: {offenders}"
                if any(leaf[0] == "top_id" for leaf in uuid_leaves):
                    top_id_seen_as_uuid = True
        finally:
            duck.close()

        # the docstring's second promise: the top-level uuid column must remain
        # a uuid leaf even though every value in it is NULL
        assert (
            top_id_seen_as_uuid
        ), "top-level top_id column was not written as a uuid leaf"

        # positively: the rewritten array element is a string leaf
        assert ("element", "BYTE_ARRAY") in _parquet_leaf_types(
            superuser_conn, "compat_df_allnull"
        )
    finally:
        run_command("DROP TABLE compat_df_allnull", pg_conn)
        run_command("DROP TYPE df_null_comp", pg_conn)
        pg_conn.commit()

0 commit comments

Comments
 (0)