Skip to content

Commit 4dd6e54

Browse files
Replace per-column TupleDesc array with HTAB cache in CopyToStateData
Previously the TupleDesc cache was a flat TupleDesc* array indexed by attnum, pre-populated only for top-level composite columns. Replace it with an HTAB (Oid -> TupleDescCacheEntry) that: - Is allocated in StartCopyTo and destroyed in EndCopyTo. - Is pre-populated with the Oids of all top-level composite columns (same as before). - Is passed recursively through PGDuckSerialize / StructOutForPGDuck / ArrayOutForPGDuck so that nested composite types discovered during serialization are looked up once and inserted into the cache on first encounter, avoiding repeated lookup_rowtype_tupdesc calls for every row. TupleDescCacheEntry (typid key + tupdesc value) is defined in serialize.h alongside the updated PGDuckSerialize signature. All existing NULL-passing call sites (rewrite_query, map_conversion, etc.) are unaffected. Signed-off-by: David Christensen <david.christensen@snowflake.com>
1 parent 7b919df commit 4dd6e54

File tree

7 files changed

+90
-55
lines changed

7 files changed

+90
-55
lines changed

pg_lake_engine/include/pg_lake/pgduck/array_conversion.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919

2020
#include "access/tupdesc.h"
2121
#include "utils/array.h"
22+
#include "utils/hsearch.h"
2223
#include "pg_lake/copy/copy_format.h"
2324

2425
#define ARRAY_OUT_OID (751)
2526

2627
extern PGDLLEXPORT char *ArrayOutForPGDuck(ArrayType *array, CopyDataFormat format,
27-
TupleDesc cachedElemTupleDesc);
28+
HTAB *tupdesc_cache);

pg_lake_engine/include/pg_lake/pgduck/serialize.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,22 @@
2323
#include "access/tupdesc.h"
2424
#include "pg_lake/parquet/field.h"
2525
#include "pg_lake/pgduck/type.h"
26+
#include "utils/hsearch.h"
2627

2728
#define BYTEA_OUT_OID 31
2829

30+
/*
31+
* Entry in the Oid -> TupleDesc hash table used to cache composite-type
32+
* descriptors across rows. typid is the hash key and must be first.
33+
*/
34+
typedef struct TupleDescCacheEntry
35+
{
36+
Oid typid;
37+
TupleDesc tupdesc;
38+
} TupleDescCacheEntry;
39+
2940
extern PGDLLEXPORT char *PGDuckSerialize(FmgrInfo *flinfo, Oid typeOid, Datum value,
30-
CopyDataFormat format, TupleDesc cachedTupleDesc);
41+
CopyDataFormat format, HTAB *tupdesc_cache);
3142
extern PGDLLEXPORT char *PGDuckOnlySerialize(Oid typeOid, Datum value);
3243
extern PGDLLEXPORT bool IsPGDuckSerializeRequired(PGType postgresType);
3344
extern PGDLLEXPORT char *IntervalOutForPGDuck(Datum value);

pg_lake_engine/include/pg_lake/pgduck/struct_conversion.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@
1919

2020
#include "access/tupdesc.h"
2121
#include "pg_lake/copy/copy_format.h"
22+
#include "utils/hsearch.h"
2223

2324
#define RECORD_OUT_OID 2291
2425

2526
extern PGDLLEXPORT char *StructOutForPGDuck(Datum myStruct, CopyDataFormat format,
26-
TupleDesc cachedTupleDesc);
27+
HTAB *tupdesc_cache);

pg_lake_engine/src/csv/csv_writer.c

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "pg_lake/extensions/postgis.h"
3939
#include "pg_lake/pgduck/numeric.h"
4040
#include "pg_lake/pgduck/serialize.h"
41+
#include "pg_lake/pgduck/struct_conversion.h"
4142
#include "pg_lake/util/numeric.h"
4243
#include "executor/executor.h"
4344
#include "mb/pg_wchar.h"
@@ -106,11 +107,12 @@ typedef struct CopyToStateData
106107
FmgrInfo *out_functions; /* lookup info for output functions */
107108

108109
/*
109-
* Per-column TupleDesc cache for composite-type columns. NULL means the
110-
* column is not composite (or is an anonymous RECORD column). A non-NULL
111-
* value is a pinned TupleDesc looked up once during StartCopyTo.
110+
* Oid -> TupleDesc hash table for composite-type columns. Pre-populated
111+
* during StartCopyTo with the top-level composite column types; lazily
112+
* extended with nested composite types encountered during serialization.
113+
* Each entry holds a pinned TupleDesc released in EndCopy.
112114
*/
113-
TupleDesc *composite_tupdescs;
115+
HTAB *tupdesc_cache;
114116
MemoryContext rowcontext; /* per-row evaluation context */
115117
uint64 bytes_processed; /* number of bytes processed so far */
116118

@@ -297,18 +299,15 @@ EndCopy(CopyToState cstate)
297299
}
298300
}
299301

300-
/* Release pinned TupleDescs from the composite type cache */
302+
/* Release all pinned TupleDescs from the cache */
301303
{
302-
ListCell *cur;
304+
HASH_SEQ_STATUS seq;
305+
TupleDescCacheEntry *entry;
303306

304-
foreach(cur, cstate->attnumlist)
305-
{
306-
int attnum = lfirst_int(cur);
307-
TupleDesc cached = cstate->composite_tupdescs[attnum - 1];
308-
309-
if (cached != NULL)
310-
ReleaseTupleDesc(cached);
311-
}
307+
hash_seq_init(&seq, cstate->tupdesc_cache);
308+
while ((entry = (TupleDescCacheEntry *) hash_seq_search(&seq)) != NULL)
309+
ReleaseTupleDesc(entry->tupdesc);
310+
hash_destroy(cstate->tupdesc_cache);
312311
}
313312

314313
MemoryContextDelete(cstate->rowcontext);
@@ -477,7 +476,19 @@ StartCopyTo(CopyToState cstate, TupleDesc tupDesc)
477476

478477
/* Get info about the columns we need to process. */
479478
cstate->out_functions = (FmgrInfo *) palloc(num_phys_attrs * sizeof(FmgrInfo));
480-
cstate->composite_tupdescs = (TupleDesc *) palloc0(num_phys_attrs * sizeof(TupleDesc));
479+
480+
/* Create the Oid -> TupleDesc cache for composite-type columns. */
481+
{
482+
HASHCTL hctl;
483+
484+
memset(&hctl, 0, sizeof(hctl));
485+
hctl.keysize = sizeof(Oid);
486+
hctl.entrysize = sizeof(TupleDescCacheEntry);
487+
hctl.hcxt = cstate->copycontext;
488+
cstate->tupdesc_cache = hash_create("TupleDesc cache", 16, &hctl,
489+
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
490+
}
491+
481492
foreach(cur, cstate->attnumlist)
482493
{
483494
int attnum = lfirst_int(cur);
@@ -496,9 +507,9 @@ StartCopyTo(CopyToState cstate, TupleDesc tupDesc)
496507
fmgr_info(out_func_oid, &cstate->out_functions[attnum - 1]);
497508

498509
/*
499-
* Pre-populate the TupleDesc cache for composite columns and
500-
* arrays of composite elements. Anonymous RECORD types are skipped
501-
* because their structure is not known until row data is inspected.
510+
* Pre-populate the TupleDesc cache for composite columns and arrays
511+
* of composite elements. Anonymous RECORD types are skipped because
512+
* their structure is not known until row data is inspected.
502513
*/
503514
{
504515
Oid elemTypeOid = get_element_type(attr->atttypid);
@@ -507,8 +518,15 @@ StartCopyTo(CopyToState cstate, TupleDesc tupDesc)
507518

508519
if (get_typtype(baseTypeOid) == TYPTYPE_COMPOSITE &&
509520
baseTypeOid != RECORDOID)
510-
cstate->composite_tupdescs[attnum - 1] =
511-
lookup_rowtype_tupdesc(baseTypeOid, baseTypmod);
521+
{
522+
bool found;
523+
TupleDescCacheEntry *entry = (TupleDescCacheEntry *)
524+
hash_search(cstate->tupdesc_cache, &baseTypeOid,
525+
HASH_ENTER, &found);
526+
527+
if (!found)
528+
entry->tupdesc = lookup_rowtype_tupdesc(baseTypeOid, baseTypmod);
529+
}
512530
}
513531
}
514532

@@ -812,29 +830,11 @@ CopyOneRowTo(CopyToState cstate, TupleTableSlot *slot)
812830

813831
if (ShouldUseDuckSerialization(cstate->targetFormat, MakePGType(attr->atttypid, attr->atttypmod)))
814832
{
815-
/*
816-
* Since we are at the top-level when emitting an
817-
* attribute in CopyOneRowTo(), we are not inside a
818-
* composite type.
819-
*
820-
* For composite-type columns, cache the TupleDesc so
821-
* we avoid repeating the expensive lookup_rowtype_tupdesc
822-
* call on every row.
823-
*/
824-
/*
825-
* Pass the pre-populated TupleDesc (non-NULL for
826-
* composite columns and arrays of composite elements)
827-
* so PGDuckSerialize can skip the per-row
828-
* lookup_rowtype_tupdesc call.
829-
*/
830-
TupleDesc cachedTupleDesc =
831-
cstate->composite_tupdescs[attnum - 1];
832-
833833
string = PGDuckSerialize(&out_functions[attnum - 1],
834834
attr->atttypid,
835835
value,
836836
cstate->targetFormat,
837-
cachedTupleDesc);
837+
cstate->tupdesc_cache);
838838
}
839839
else
840840
string = OutputFunctionCall(&out_functions[attnum - 1],

pg_lake_engine/src/pgduck/array_conversion.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
*/
3737
char *
3838
ArrayOutForPGDuck(ArrayType *array, CopyDataFormat format,
39-
TupleDesc cachedElemTupleDesc)
39+
HTAB *tupdesc_cache)
4040
{
4141
Oid element_type = ARR_ELEMTYPE(array);
4242
char *p,
@@ -115,7 +115,7 @@ ArrayOutForPGDuck(ArrayType *array, CopyDataFormat format,
115115
else
116116
{
117117
values[i] = PGDuckSerialize(&my_extra->proc, element_type, itemvalue,
118-
format, cachedElemTupleDesc);
118+
format, tupdesc_cache);
119119

120120
/* count data plus backslashes; detect chars needing quotes */
121121
needquote = !IsSerializedAsContainer(element_type, format);

pg_lake_engine/src/pgduck/serialize.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,19 +125,19 @@ ConvertISOYearToBCIfNeeded(const char *dateTimestampString)
125125
*/
126126
char *
127127
PGDuckSerialize(FmgrInfo *flinfo, Oid columnType, Datum value,
128-
CopyDataFormat format, TupleDesc cachedTupleDesc)
128+
CopyDataFormat format, HTAB *tupdesc_cache)
129129
{
130130
if (flinfo->fn_oid == ARRAY_OUT_OID)
131131
{
132132
/* maps are a type of array */
133133
if (IsMapTypeOid(columnType))
134134
return MapOutForPGDuck(value, format);
135135

136-
return ArrayOutForPGDuck(DatumGetArrayTypeP(value), format, cachedTupleDesc);
136+
return ArrayOutForPGDuck(DatumGetArrayTypeP(value), format, tupdesc_cache);
137137
}
138138

139139
if (flinfo->fn_oid == RECORD_OUT_OID)
140-
return StructOutForPGDuck(value, format, cachedTupleDesc);
140+
return StructOutForPGDuck(value, format, tupdesc_cache);
141141

142142
/*
143143
* For Iceberg, intervals are serialized as struct(months, days,

pg_lake_engine/src/pgduck/struct_conversion.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@
2020

2121
#include "access/htup_details.h"
2222
#include "utils/builtins.h"
23-
#include "pg_lake/pgduck/serialize.h"
24-
#include "pg_lake/pgduck/struct_conversion.h"
23+
#include "utils/hsearch.h"
2524
#include "utils/lsyscache.h"
2625
#include "utils/typcache.h"
26+
#include "pg_lake/pgduck/serialize.h"
27+
#include "pg_lake/pgduck/struct_conversion.h"
2728

2829
/*
2930
* structure to cache metadata needed for record I/O
@@ -52,12 +53,13 @@ typedef struct RecordIOData
5253
*/
5354

5455
char *
55-
StructOutForPGDuck(Datum myStruct, CopyDataFormat format, TupleDesc cachedTupleDesc)
56+
StructOutForPGDuck(Datum myStruct, CopyDataFormat format, HTAB *tupdesc_cache)
5657
{
5758
HeapTupleHeader rec = DatumGetHeapTupleHeader(myStruct);
5859
Oid tupType;
5960
int32 tupTypmod;
6061
TupleDesc tupdesc;
62+
bool tupdesc_from_cache = false;
6163
HeapTupleData tuple;
6264
RecordIOData *my_extra;
6365
bool needComma = false;
@@ -72,8 +74,28 @@ StructOutForPGDuck(Datum myStruct, CopyDataFormat format, TupleDesc cachedTupleD
7274
/* Extract type info from the tuple itself */
7375
tupType = HeapTupleHeaderGetTypeId(rec);
7476
tupTypmod = HeapTupleHeaderGetTypMod(rec);
75-
if (cachedTupleDesc != NULL)
76-
tupdesc = cachedTupleDesc;
77+
78+
if (tupdesc_cache != NULL && tupType != RECORDOID)
79+
{
80+
bool found;
81+
TupleDescCacheEntry *entry = (TupleDescCacheEntry *)
82+
hash_search(tupdesc_cache, &tupType, HASH_FIND, &found);
83+
84+
if (found)
85+
{
86+
tupdesc = entry->tupdesc;
87+
tupdesc_from_cache = true;
88+
}
89+
else
90+
{
91+
/* New nested composite type; fetch, pin, and add to cache */
92+
tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
93+
entry = (TupleDescCacheEntry *)
94+
hash_search(tupdesc_cache, &tupType, HASH_ENTER, &found);
95+
entry->tupdesc = tupdesc;
96+
tupdesc_from_cache = true;
97+
}
98+
}
7799
else
78100
tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
79101
ncolumns = tupdesc->natts;
@@ -155,7 +177,7 @@ StructOutForPGDuck(Datum myStruct, CopyDataFormat format, TupleDesc cachedTupleD
155177
}
156178

157179
attr = values[i];
158-
value = PGDuckSerialize(&column_info->proc, column_type, attr, format, NULL);
180+
value = PGDuckSerialize(&column_info->proc, column_type, attr, format, tupdesc_cache);
159181

160182
/* Detect whether we need double quotes for this value */
161183
bool needQuotes = !IsContainerType(column_type);
@@ -180,8 +202,8 @@ StructOutForPGDuck(Datum myStruct, CopyDataFormat format, TupleDesc cachedTupleD
180202
pfree(values);
181203
pfree(nulls);
182204

183-
/* Only release if we looked it up ourselves */
184-
if (cachedTupleDesc == NULL)
205+
/* Only release if we looked it up outside the cache */
206+
if (!tupdesc_from_cache)
185207
ReleaseTupleDesc(tupdesc);
186208

187209
return buf.data;

0 commit comments

Comments
 (0)