Use partition_keys MAP instead of path parsing for partitioned pushdown writes

sfc-gh-mslot · sfc-gh-mslot · commit ab93e6b74158 · 2026-04-21T10:25:44.000Z
Replace ParsePartitionValuesFromPath with ParsePartitionValuesFromPartitionKeys
which reads partition values directly from DuckDB's partition_keys MAP(VARCHAR,
VARCHAR) column in COPY TO return_stats. This eliminates URL-decoding of
Hive-style paths and correctly handles values containing special characters
like forward slashes.

Additional fixes from PR review:
- Add UTC timezone conversion for timestamptz in year/month/day/hour transforms
  (DuckDB's temporal functions use session timezone, Iceberg spec requires UTC)
- Gate hour transform pushdown to TIMESTAMP/TIMESTAMPTZ only (TIME/TIMETZ fall
  back to row-by-row)
- Whitelist identity partition types to those with compatible DuckDB-to-PG text
  representations (excludes bytea whose BLOB-to-VARCHAR format PG cannot parse)
- Pre-create MAP(TEXT,TEXT) type in extension SQL to avoid runtime creation
- Export GetOrCreatePGMapType with PGDLLEXPORT for cross-library visibility

Signed-off-by: Marco Slot <marco.slot@snowflake.com>
diff --git a/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h b/pg_lake_engine/include/pg_lake/data_file/data_file_stats.h
@@ -92,6 +92,9 @@ typedef struct DataFileStats
 
 	/* for a new data file with row IDs, the start of the range */
 	int64		rowIdStart;
+
+	/* partition key values from COPY TO return_stats (NULL if not partitioned) */
+	char	   *partitionKeysText;
 }			DataFileStats;
 
 typedef struct StatsCollector
diff --git a/pg_lake_engine/include/pg_lake/pgduck/map.h b/pg_lake_engine/include/pg_lake/pgduck/map.h
@@ -19,7 +19,7 @@
 
 #include "pg_lake/pgduck/type.h"
 
-Oid			GetOrCreatePGMapType(const char *name);
+extern PGDLLEXPORT Oid GetOrCreatePGMapType(const char *name);
 char	   *GetDuckDBMapDefinitionForPGType(Oid postgresTypeId,
 											CopyDataFormat format);
 
diff --git a/pg_lake_engine/pg_lake_engine--3.3--3.4.sql b/pg_lake_engine/pg_lake_engine--3.3--3.4.sql
@@ -1 +1,5 @@
 -- Upgrade script for pg_lake_engine from 3.3 to 3.4
+
+-- Pre-create MAP(TEXT,TEXT) type for partition_keys parsing in partitioned writes.
+-- This avoids runtime type creation during DML operations.
+SELECT map_type.create('text'::regtype, 'text'::regtype);
diff --git a/pg_lake_engine/src/data_file/data_file_stats.c b/pg_lake_engine/src/data_file/data_file_stats.c
@@ -192,6 +192,11 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch
 			{
 				fileStats->dataFilePath = pstrdup(resultValue);
 			}
+			else if (strcmp(resultColName, "partition_keys") == 0)
+			{
+				if (!PQgetisnull(result, resultRowIndex, resultColIndex))
+					fileStats->partitionKeysText = pstrdup(resultValue);
+			}
 		}
 
 		statsList = lappend(statsList, fileStats);
diff --git a/pg_lake_table/include/pg_lake/fdw/partition_pushdown.h b/pg_lake_table/include/pg_lake/fdw/partition_pushdown.h
@@ -24,4 +24,5 @@
 
 extern PGDLLEXPORT bool AllPartitionTransformsPushdownable(List *transforms);
 extern List *GetPartitionByExpressions(List *transforms);
-extern Partition * ParsePartitionValuesFromPath(char *filePath, List *transforms);
+extern Partition * ParsePartitionValuesFromPartitionKeys(char *partitionKeysText,
+														 List *transforms);
diff --git a/pg_lake_table/src/fdw/partition_pushdown.c b/pg_lake_table/src/fdw/partition_pushdown.c
@@ -24,13 +24,16 @@
  */
 #include "postgres.h"
 
+#include "executor/executor.h"
+#include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 
 #include "pg_lake/fdw/partition_pushdown.h"
 #include "pg_lake/fdw/partition_transform.h"
 #include "pg_lake/iceberg/api/partitioning.h"
 #include "pg_lake/iceberg/manifest_spec.h"
+#include "pg_lake/pgduck/map.h"
 
 
 static char *PartitionTransformToDuckDBExpression(IcebergPartitionTransform * transform);
@@ -85,34 +88,90 @@ PartitionTransformToDuckDBExpression(IcebergPartitionTransform * transform)
 		case PARTITION_TRANSFORM_IDENTITY:
 			{
 				/*
-				 * Identity partitions use the column value directly for
-				 * non-temporal types. For date/timestamp types, we produce
-				 * epoch integers to avoid DuckDB text formatting issues (e.g.
-				 * BC dates formatted as "4713-01-01 (BC)").
+				 * Only push down identity partitions for types whose
+				 * DuckDB VARCHAR representation can be parsed by PG's
+				 * type input function. Types like bytea are excluded
+				 * because DuckDB's BLOB-to-VARCHAR cast uses a format
+				 * PG cannot parse (same issue as column_statistics,
+				 * which skips bytea via ShouldSkipStatistics).
 				 *
-				 * ParsePartitionValuesFromPath uses
-				 * DeserializePartitionValueFromEpochInteger to convert epoch
-				 * integers back to Iceberg binary.
+				 * For date/timestamp types, we produce epoch integers
+				 * to avoid DuckDB text formatting issues (e.g. BC
+				 * dates formatted as "4713-01-01 (BC)").
+				 *
+				 * ParsePartitionValuesFromPartitionKeys uses
+				 * DeserializePartitionValueFromEpochInteger to convert
+				 * epoch integers back to Iceberg binary.
 				 */
 				if (typeOid == DATEOID)
 					return psprintf("datediff('day', date '1970-01-01', %s::date)", col);
 				else if (typeOid == TIMESTAMPOID || typeOid == TIMESTAMPTZOID)
 					return psprintf("epoch_us(%s)", col);
-				else
+				else if (typeOid == INT2OID || typeOid == INT4OID ||
+						 typeOid == INT8OID || typeOid == FLOAT4OID ||
+						 typeOid == FLOAT8OID || typeOid == NUMERICOID ||
+						 typeOid == BOOLOID || typeOid == TEXTOID ||
+						 typeOid == VARCHAROID || typeOid == BPCHAROID ||
+						 typeOid == UUIDOID || typeOid == TIMEOID ||
+						 typeOid == TIMETZOID)
 					return psprintf("%s", col);
+				else
+					return NULL;
 			}
 
 		case PARTITION_TRANSFORM_YEAR:
-			return psprintf("(year(%s) - 1970)", col);
+			{
+				/*
+				 * Iceberg spec requires UTC for timestamptz. PG stores
+				 * timestamptz internally in UTC, so the non-pushdown path
+				 * works correctly. In DuckDB, year() uses session timezone,
+				 * so we must convert to UTC first.
+				 */
+				if (typeOid == TIMESTAMPTZOID)
+					return psprintf("(year(timezone('UTC', %s)) - 1970)", col);
+				else
+					return psprintf("(year(%s) - 1970)", col);
+			}
 
 		case PARTITION_TRANSFORM_MONTH:
-			return psprintf("((year(%s) - 1970) * 12 + month(%s) - 1)", col, col);
+			{
+				if (typeOid == TIMESTAMPTZOID)
+					return psprintf("((year(timezone('UTC', %s)) - 1970) * 12 + "
+									"month(timezone('UTC', %s)) - 1)", col, col);
+				else
+					return psprintf("((year(%s) - 1970) * 12 + month(%s) - 1)",
+									col, col);
+			}
 
 		case PARTITION_TRANSFORM_DAY:
-			return psprintf("datediff('day', date '1970-01-01', %s::date)", col);
+			{
+				/*
+				 * Iceberg spec requires UTC for day transforms. For
+				 * timestamptz, convert to UTC before computing the day.
+				 */
+				if (typeOid == TIMESTAMPTZOID)
+					return psprintf("datediff('day', date '1970-01-01', "
+									"timezone('UTC', %s)::date)", col);
+				else
+					return psprintf("datediff('day', date '1970-01-01', %s::date)", col);
+			}
 
 		case PARTITION_TRANSFORM_HOUR:
-			return psprintf("datediff('hour', timestamp '1970-01-01', %s::timestamp)", col);
+			{
+				/*
+				 * Only TIMESTAMP and TIMESTAMPTZ are pushdownable for hour
+				 * transforms. TIME/TIMETZ fall back to row-by-row processing.
+				 * Iceberg spec requires UTC for timestamptz.
+				 */
+				if (typeOid == TIMESTAMPTZOID)
+					return psprintf("datediff('hour', timestamp '1970-01-01', "
+									"timezone('UTC', %s)::timestamp)", col);
+				else if (typeOid == TIMESTAMPOID)
+					return psprintf("datediff('hour', timestamp '1970-01-01', "
+									"%s::timestamp)", col);
+				else
+					return NULL;
+			}
 
 		case PARTITION_TRANSFORM_BUCKET:
 		case PARTITION_TRANSFORM_TRUNCATE:
@@ -150,59 +209,6 @@ GetPartitionByExpressions(List *transforms)
 }
 
 
-/*
- * HexDigitToInt converts a hex character ('0'-'9', 'A'-'F', 'a'-'f') to its
- * integer value (0-15). Returns -1 for invalid characters.
- */
-static int
-HexDigitToInt(char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 10;
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 10;
-	return -1;
-}
-
-
-/*
- * UrlDecodePartitionValue decodes percent-encoded characters in a Hive-style
- * partition value (e.g. "1e%2B20" -> "1e+20").
- *
- * DuckDB percent-encodes special characters when writing partition directory
- * names. We must decode them before parsing the value.
- */
-static char *
-UrlDecodePartitionValue(const char *encoded)
-{
-	int			len = strlen(encoded);
-	char	   *decoded = palloc(len + 1);
-	int			j = 0;
-
-	for (int i = 0; i < len; i++)
-	{
-		if (encoded[i] == '%' && i + 2 < len)
-		{
-			int			hi = HexDigitToInt(encoded[i + 1]);
-			int			lo = HexDigitToInt(encoded[i + 2]);
-
-			if (hi >= 0 && lo >= 0)
-			{
-				decoded[j++] = (char) (hi * 16 + lo);
-				i += 2;
-				continue;
-			}
-		}
-		decoded[j++] = encoded[i];
-	}
-
-	decoded[j] = '\0';
-	return decoded;
-}
-
-
 /*
  * NormalizeDuckDBTextToPGText converts a DuckDB text representation of a value
  * to PostgreSQL's canonical text format by roundtripping through PG's type I/O.
@@ -230,62 +236,100 @@ NormalizeDuckDBTextToPGText(const char *duckdbText, Oid resultTypeOid,
 
 
 /*
- * ParsePartitionValuesFromPath extracts partition values from the Hive-style
- * directory path produced by DuckDB COPY TO with PARTITION_BY.
+ * ParsePartitionValuesFromPartitionKeys extracts partition values from the
+ * partition_keys MAP(VARCHAR, VARCHAR) returned by DuckDB's COPY TO with
+ * return_stats.
  *
- * A path like:
- *   s3://bucket/data/abc123/__part_0=54/__part_1=us-east/data_0.parquet
+ * The partition_keys map has entries like:
+ *   {__part_0=54, __part_1=us-east}
  *
- * is parsed to extract __part_0=54 and __part_1=us-east, which are then
- * converted to the proper Iceberg binary format using the partition transforms.
+ * Each value is converted to the proper Iceberg binary format using the
+ * partition transforms.
  */
 Partition *
-ParsePartitionValuesFromPath(char *filePath, List *transforms)
+ParsePartitionValuesFromPartitionKeys(char *partitionKeysText, List *transforms)
 {
 	int			numTransforms = list_length(transforms);
 	Partition  *partition = palloc0(sizeof(Partition));
 
 	partition->fields = palloc0(sizeof(PartitionField) * numTransforms);
 	partition->fields_length = numTransforms;
 
-	for (int partIndex = 0; partIndex < numTransforms; partIndex++)
+	/* parse the MAP(TEXT,TEXT) text into a datum */
+	Oid			mapTypeOid = GetOrCreatePGMapType("MAP(TEXT,TEXT)");
+	Oid			typoinput;
+	Oid			typioparam;
+
+	getTypeInputInfo(mapTypeOid, &typoinput, &typioparam);
+	Datum		mapDatum = OidInputFunctionCall(typoinput, partitionKeysText,
+												typioparam, -1);
+
+	/*
+	 * Build an array of value texts indexed by partition index. We iterate the
+	 * map entries and match __part_N keys to their indices.
+	 */
+	char	  **valueTexts = palloc0(sizeof(char *) * numTransforms);
+	bool	   *valueIsNull = palloc0(sizeof(bool) * numTransforms);
+
+	ArrayType  *elementsArray = DatumGetArrayTypeP(mapDatum);
+	ArrayIterator arrayIterator = array_create_iterator(elementsArray, 0, NULL);
+	Datum		elemDatum;
+	bool		isNull = false;
+
+	while (array_iterate(arrayIterator, &elemDatum, &isNull))
 	{
-		IcebergPartitionTransform *transform = list_nth(transforms, partIndex);
+		if (isNull)
+			continue;
+
+		HeapTupleHeader tupleHeader = DatumGetHeapTupleHeader(elemDatum);
+		bool		keyIsNull = false;
+		bool		valIsNull = false;
+
+		Datum		keyDatum = GetAttributeByNum(tupleHeader, 1, &keyIsNull);
+		Datum		valDatum = GetAttributeByNum(tupleHeader, 2, &valIsNull);
 
-		/* build the search key: "__part_N=" */
-		char	   *searchKey = psprintf("__part_%d=", partIndex);
-		int			searchKeyLen = strlen(searchKey);
+		if (keyIsNull)
+			continue;
 
-		/* find this key in the path */
-		char	   *found = strstr(filePath, searchKey);
+		char	   *key = TextDatumGetCString(keyDatum);
 
-		if (found == NULL)
+		/* parse __part_N to get the partition index */
+		if (strncmp(key, "__part_", 7) != 0)
+			continue;
+
+		int			partIndex = pg_strtoint32(key + 7);
+
+		if (partIndex < 0 || partIndex >= numTransforms)
 		{
 			ereport(ERROR,
 					(errcode(ERRCODE_INTERNAL_ERROR),
-					 errmsg("could not find partition key %s in path %s",
-							searchKey, filePath)));
+					 errmsg("unexpected partition key %s (expected 0..%d)",
+							key, numTransforms - 1)));
 		}
 
-		/* extract the value (from after '=' up to the next '/' or end) */
-		char	   *valueStart = found + searchKeyLen;
-		char	   *valueEnd = strchr(valueStart, '/');
-		int			valueLen = (valueEnd != NULL) ?
-			(valueEnd - valueStart) : strlen(valueStart);
-
-		char	   *valueText = pnstrdup(valueStart, valueLen);
+		if (valIsNull)
+		{
+			valueIsNull[partIndex] = true;
+		}
+		else
+		{
+			valueTexts[partIndex] = TextDatumGetCString(valDatum);
+		}
+	}
 
-		/* URL-decode (DuckDB percent-encodes special chars in Hive paths) */
-		valueText = UrlDecodePartitionValue(valueText);
+	array_free_iterator(arrayIterator);
 
-		/* populate the partition field */
+	/* convert each partition value to Iceberg binary format */
+	for (int partIndex = 0; partIndex < numTransforms; partIndex++)
+	{
+		IcebergPartitionTransform *transform = list_nth(transforms, partIndex);
 		PartitionField *field = &partition->fields[partIndex];
 
 		field->field_id = transform->partitionFieldId;
 		field->field_name = pstrdup(transform->partitionFieldName);
 		field->value_type = GetTransformResultAvroType(transform);
 
-		if (strcmp(valueText, "NULL") == 0)
+		if (valueIsNull[partIndex] || valueTexts[partIndex] == NULL)
 		{
 			/* NULL partition value */
 			field->value = NULL;
@@ -297,25 +341,28 @@ ParsePartitionValuesFromPath(char *filePath, List *transforms)
 				  transform->pgType.postgresTypeOid == TIMESTAMPTZOID))
 		{
 			/*
-			 * Identity temporal types use epoch integers in the path (days
-			 * for date, microseconds for timestamp).
+			 * Identity temporal types use epoch integers (days for date,
+			 * microseconds for timestamp).
 			 */
 			field->value = DeserializePartitionValueFromEpochInteger(
-																	 transform, valueText, &field->value_length);
+																	 transform, valueTexts[partIndex],
+																	 &field->value_length);
 		}
 		else
 		{
 			/*
-			 * Normalize DuckDB text to PG canonical format (e.g. "1.0" -> "1"
-			 * for numeric) so the roundtrip assertion in
+			 * Normalize DuckDB text to PG canonical format (e.g. "1.0" ->
+			 * "1" for numeric) so the roundtrip assertion in
 			 * DeserializePartitionValueFromPGText passes.
 			 */
-			valueText = NormalizeDuckDBTextToPGText(valueText,
-													transform->resultPgType.postgresTypeOid,
-													transform->resultPgType.postgresTypeMod);
+			char	   *normalizedText =
+				NormalizeDuckDBTextToPGText(valueTexts[partIndex],
+										   transform->resultPgType.postgresTypeOid,
+										   transform->resultPgType.postgresTypeMod);
 
 			field->value = DeserializePartitionValueFromPGText(
-															   transform, valueText, &field->value_length);
+															   transform, normalizedText,
+															   &field->value_length);
 		}
 	}
 
diff --git a/pg_lake_table/src/fdw/writable_table.c b/pg_lake_table/src/fdw/writable_table.c
diff --git a/pg_lake_table/tests/pytests/test_partitioned_pushdown.py b/pg_lake_table/tests/pytests/test_partitioned_pushdown.py

Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,11 @@ GetDataFileStatsListFromPGResult(PGresult *result, List *leafFields, DataFileSch`
`192`	`192`	`{`
`193`	`193`	`fileStats->dataFilePath = pstrdup(resultValue);`
`194`	`194`	`}`
	`195`	`+ else if (strcmp(resultColName, "partition_keys") == 0)`
	`196`	`+ {`
	`197`	`+ if (!PQgetisnull(result, resultRowIndex, resultColIndex))`
	`198`	`+ fileStats->partitionKeysText = pstrdup(resultValue);`
	`199`	`+ }`
`195`	`200`	`}`
`196`	`201`
`197`	`202`	`statsList = lappend(statsList, fileStats);`