
Commit 6a8c8de

yaowangm authored and jiaqizho committed
Add explain analyze detailed info of hash agg (#15917)
After upgrading to GPDB7, we lost the detailed CDB executor instrumentation of hashagg, because the code base changed substantially. This patch re-implements the EXPLAIN ANALYZE related code for hashagg to provide more critical information for troubleshooting.
1 parent 1cbab9b commit 6a8c8de

File tree

7 files changed: +532 -1 lines changed

src/backend/commands/explain.c

+8 -1

@@ -4265,8 +4265,15 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 		appendStringInfoChar(es->str, '\n');
 	}
 
+	/*
+	 * Greenplum outputs hash aggregate information in "Extra Text" via
+	 * cdbexplainbuf; hash_agg_update_metrics() is never called on the QD.
+	 */
+	if (Gp_role != GP_ROLE_UTILITY || !es->analyze)
+		return;
+
 	/* Display stats for each parallel worker */
-	if (es->analyze && aggstate->shared_info != NULL)
+	if (aggstate->shared_info != NULL)
 	{
 		for (int n = 0; n < aggstate->shared_info->num_workers; n++)
 		{

src/backend/executor/nodeAgg.c

+145
@@ -481,6 +481,8 @@ static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
 
 static void ExecEagerFreeAgg(AggState *node);
 
+void agg_hash_explain_extra_message(AggState *aggstate);
+
 /*
  * Select the current grouping set; affects current_set and
  * curaggcontext.
@@ -1540,6 +1542,16 @@ build_hash_tables(AggState *aggstate)
 										   memory);
 
 		build_hash_table(aggstate, setno, nbuckets);
+
+		/* initialize some statistic info of the hash table */
+		perhash->num_output_groups = 0;
+		perhash->num_spill_parts = 0;
+		perhash->num_expansions = 0;
+		perhash->bucket_total = 0;
+		perhash->bucket_used = 0;
+		perhash->chain_count = 0;
+		perhash->chain_length_total = 0;
+		perhash->chain_length_max = 0;
 	}
 
 	aggstate->hash_ngroups_current = 0;
@@ -1966,6 +1978,7 @@ hash_agg_enter_spill_mode(AggState *aggstate)
 			hashagg_spill_init(aggstate, spill, aggstate->hash_tapeinfo, 0,
 							   perhash->aggnode->numGroups,
 							   aggstate->hashentrysize);
+			perhash->num_spill_parts += spill->npartitions;
 		}
 
 		if (aggstate->ss.ps.instrument)
@@ -2019,6 +2032,12 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	}
 
 	/* update hashentrysize estimate based on contents */
+	/*
+	 * Greenplum doesn't use hashentrysize in the instrumentation; it
+	 * calculates the hash table chain length to get an accurate number.
+	 *
+	 * See the code below that collects hash table statistic info.
+	 */
 	if (aggstate->hash_ngroups_current > 0)
 	{
 		aggstate->hashentrysize =
@@ -2031,6 +2050,47 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 		Instrumentation *instrument = aggstate->ss.ps.instrument;
 
 		instrument->workmemused = aggstate->hash_mem_peak;
+
+		/*
+		 * workmemwanted is how much memory would be needed to avoid
+		 * scratch I/O, i.e. to load everything into the hash table at
+		 * once:
+		 *
+		 * 1. add meta_mem only when from_tape is false, because when we
+		 *    are reading from a tape/spilled file we can reuse the
+		 *    existing hash table's metadata.
+		 * 2. add hash_mem every time.
+		 * 3. don't add buffer_mem, since it's unnecessary when we can
+		 *    load everything into memory at once.
+		 */
+		if (!from_tape)
+			instrument->workmemwanted += meta_mem;
+		instrument->workmemwanted += hashkey_mem;
+
+		/* Scan all perhashes and collect hash table statistic info */
+		for (int setno = 0; setno < aggstate->num_hashes; setno++)
+		{
+			AggStatePerHash perhash = &aggstate->perhash[setno];
+			tuplehash_hash *hashtab = perhash->hashtable->hashtab;
+
+			Assert(hashtab);
+
+			perhash->num_expansions += hashtab->num_expansions;
+			perhash->bucket_total += hashtab->size;
+			perhash->bucket_used += hashtab->members;
+			if (hashtab->members > 0)
+			{
+				uint32 perht_chain_length_total = 0;
+				uint32 perht_chain_count = 0;
+
+				/* collect statistic info of chain length per hash table */
+				tuplehash_coll_stat(hashtab,
+									&(perhash->chain_length_max),
+									&perht_chain_length_total,
+									&perht_chain_count);
+				perhash->chain_count += perht_chain_count;
+				perhash->chain_length_total += perht_chain_length_total;
+			}
+		}
 	}
 }
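The accounting rule in the hunk above is easy to check with numbers: metadata memory counts once per hash table (a refill pass reuses the existing table), while key/entry memory counts on every fill pass. A minimal standalone sketch of the same rule; update_workmemwanted is a hypothetical helper and the byte counts are made up, standing in for the meta_mem and hashkey_mem totals that nodeAgg.c derives from its memory contexts:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical accumulator, standing in for instrument->workmemwanted */
static int64_t workmemwanted = 0;

static void
update_workmemwanted(int64_t meta_mem, int64_t hashkey_mem, bool from_tape)
{
	/* table metadata counts once: a refill reuses the existing table */
	if (!from_tape)
		workmemwanted += meta_mem;
	/* entry/key memory counts on every pass: all groups must fit at once */
	workmemwanted += hashkey_mem;
}

int
main(void)
{
	update_workmemwanted(1 << 20, 8 << 20, false);	/* initial fill */
	update_workmemwanted(1 << 20, 6 << 20, true);	/* refill from spill tape */
	printf("workmemwanted = %lld bytes\n", (long long) workmemwanted);
	return 0;
}

With these inputs the result is 1MB + 8MB + 6MB: meta_mem from the refill pass is deliberately excluded.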

@@ -2225,6 +2285,7 @@ lookup_hash_entries(AggState *aggstate)
 				hashagg_spill_init(aggstate, spill, aggstate->hash_tapeinfo, 0,
 								   perhash->aggnode->numGroups,
 								   aggstate->hashentrysize);
+			perhash->num_spill_parts += spill->npartitions;
 
 			hashagg_spill_tuple(aggstate, spill, slot, hash);
 			pergroup[setno] = NULL;
@@ -2277,6 +2338,13 @@ ExecAgg(PlanState *pstate)
 			return result;
 	}
 
+	/* Save statistics into the cdbexplainbuf for EXPLAIN ANALYZE */
+	if (node->ss.ps.instrument &&
+		(node->ss.ps.instrument)->need_cdb &&
+		(node->phase->aggstrategy == AGG_HASHED ||
+		 node->phase->aggstrategy == AGG_MIXED))
+		agg_hash_explain_extra_message(node);
+
 	return NULL;
 }
@@ -2807,6 +2875,7 @@ agg_refill_hash_table(AggState *aggstate)
 				spill_initialized = true;
 				hashagg_spill_init(aggstate, &spill, tapeinfo, batch->used_bits,
 								   batch->input_card, aggstate->hashentrysize);
+				aggstate->perhash[aggstate->current_set].num_spill_parts += spill.npartitions;
 			}
 			/* no memory for a new group, spill */
 			hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
@@ -2981,6 +3050,8 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 			}
 		}
 
+		perhash->num_output_groups++;
+
 		/*
 		 * Clear the per-output-tuple context for each group
 		 *
@@ -5139,3 +5210,77 @@ ReuseHashTable(AggState *node)
 			!node->streaming &&
 			!bms_overlap(node->ss.ps.chgParam, aggnode->aggParams));
 }
+
+/*
+ * Save statistics into the cdbexplainbuf for EXPLAIN ANALYZE
+ */
+void
+agg_hash_explain_extra_message(AggState *aggstate)
+{
+	/*
+	 * See cdbexplain_depositStatsToNode(): Greenplum only saves extra
+	 * message text for the most interesting winning qExecs.
+	 */
+	StringInfo	hbuf = aggstate->ss.ps.cdbexplainbuf;
+	uint64		sum_num_expansions = 0;
+	uint64		sum_output_groups = 0;
+	uint64		sum_spill_parts = 0;
+	uint64		sum_chain_length_total = 0;
+	uint64		sum_chain_count = 0;
+	uint32		chain_length_max = 0;
+	uint64		sum_bucket_used = 0;
+	uint64		sum_bucket_total = 0;
+
+	Assert(hbuf);
+
+	appendStringInfo(hbuf, "hash table(s): %d", aggstate->num_hashes);
+
+	/* Scan all perhashes and collect statistic info */
+	for (int setno = 0; setno < aggstate->num_hashes; setno++)
+	{
+		AggStatePerHash perhash = &aggstate->perhash[setno];
+
+		/* spill statistic info */
+		if (aggstate->hash_ever_spilled)
+		{
+			sum_output_groups += perhash->num_output_groups;
+			sum_spill_parts += perhash->num_spill_parts;
+		}
+
+		/* inner hash table statistic info */
+		if (perhash->chain_count > 0)
+		{
+			sum_chain_length_total += perhash->chain_length_total;
+			sum_chain_count += perhash->chain_count;
+			if (perhash->chain_length_max > chain_length_max)
+				chain_length_max = perhash->chain_length_max;
+			sum_bucket_used += perhash->bucket_used;
+			sum_bucket_total += perhash->bucket_total;
+			sum_num_expansions += perhash->num_expansions;
+		}
+	}
+
+	if (aggstate->hash_ever_spilled)
+	{
+		appendStringInfo(hbuf,
+						 "; " UINT64_FORMAT " groups total in %d batches, " UINT64_FORMAT
+						 " spill partitions; disk usage: " INT64_FORMAT "KB",
+						 sum_output_groups,
+						 aggstate->hash_batches_used,
+						 sum_spill_parts,
+						 aggstate->hash_disk_used);
+	}
+
+	if (sum_chain_count > 0)
+	{
+		appendStringInfo(hbuf,
+						 "; chain length %.1f avg, %d max;"
+						 " using " INT64_FORMAT " of " INT64_FORMAT " buckets;"
+						 " total " INT64_FORMAT " expansions.\n",
+						 (double) sum_chain_length_total / sum_chain_count,
+						 chain_length_max,
+						 sum_bucket_used,
+						 sum_bucket_total,
+						 sum_num_expansions);
+	}
+}
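Assembling the appendStringInfo() format strings above, the "Extra Text" emitted for a spilling hash aggregate under EXPLAIN ANALYZE (e.g. EXPLAIN ANALYZE SELECT c1, count(*) FROM t GROUP BY c1, on a segment whose instrumentation has need_cdb set) could look roughly like the following; the numbers are purely illustrative, not from a real run:

hash table(s): 1; 100000 groups total in 4 batches, 16 spill partitions; disk usage: 2048KB; chain length 1.3 avg, 5 max; using 65536 of 131072 buckets; total 3 expansions.

The spill clause appears only when hash_ever_spilled is set, and the chain/bucket clause only when at least one chain was counted.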

src/include/executor/nodeAgg.h

+23
@@ -311,6 +311,29 @@ typedef struct AggStatePerHashData
 	AttrNumber *hashGrpColIdxInput;	/* hash col indices in input slot */
 	AttrNumber *hashGrpColIdxHash;	/* indices in hash table tuples */
 	Agg		   *aggnode;		/* original Agg node, for numGroups etc. */
+
+	/*
+	 * Some statistic info of the hash table, used for EXPLAIN ANALYZE.
+	 * Note that these are accumulated values and are not reset even
+	 * when the hash table is reset.
+	 */
+
+	/* number of groups/entries output by the iterator */
+	uint64		num_output_groups;
+	/* number of spilled partitions */
+	uint64		num_spill_parts;
+	/* number of hash table expansions */
+	uint32		num_expansions;
+	/* total number of buckets */
+	uint64		bucket_total;
+	/* number of used buckets */
+	uint64		bucket_used;
+	/* number of all chains */
+	uint64		chain_count;
+	/* total length of all chains */
+	uint64		chain_length_total;
+	/* max chain length */
+	uint32		chain_length_max;
 } AggStatePerHashData;

src/include/lib/simplehash.h

+81
@@ -126,6 +126,7 @@
 #define SH_ALLOCATE SH_MAKE_NAME(allocate)
 #define SH_FREE SH_MAKE_NAME(free)
 #define SH_STAT SH_MAKE_NAME(stat)
+#define SH_COLL_STAT SH_MAKE_NAME(coll_stat)
 
 /* internal helper functions (no externally visible prototypes) */
 #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
@@ -169,6 +170,14 @@ typedef struct SH_TYPE
 
 	/* user defined data, useful for callbacks */
 	void	   *private_data;
+
+	/*
+	 * number of times the hash table has been expanded
+	 *
+	 * Since the max size of the hash table is UINT32_MAX, uint32 is
+	 * large enough to count the expansions.
+	 */
+	uint32		num_expansions;
 } SH_TYPE;
 
 typedef enum SH_STATUS
@@ -244,6 +253,12 @@ SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
 /* void <prefix>_stat(<prefix>_hash *tb) */
 SH_SCOPE void SH_STAT(SH_TYPE * tb);
 
+SH_SCOPE void SH_COLL_STAT(SH_TYPE * tb,
+						   uint32 * max_chain_length,
+						   uint32 * total_chain_length,
+						   uint32 * chain_count);
+
 #endif							/* SH_DECLARE */
@@ -450,6 +465,7 @@ SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
 	SH_COMPUTE_PARAMETERS(tb, size);
 
 	tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
+	tb->num_expansions = 0;
 
 	return tb;
 }
@@ -468,6 +484,7 @@ SH_RESET(SH_TYPE * tb)
 {
 	memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
 	tb->members = 0;
+	tb->num_expansions = 0;
 }
 
 /*
@@ -580,6 +597,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize)
 		}
 	}
 
+	tb->num_expansions++;
 	SH_FREE(tb, olddata);
 }
 
@@ -1120,6 +1138,69 @@ SH_STAT(SH_TYPE * tb)
 			 total_collisions, max_collisions, avg_collisions);
 }
 
+/*
+ * Greenplum specific
+ *
+ * Collect some statistics about the state of the hashtable. Most of this
+ * code was copied from SH_STAT(), with some modifications to stay
+ * consistent with GPDB6.
+ */
+SH_SCOPE void
+SH_COLL_STAT(SH_TYPE * tb,
+			 uint32 * max_chain_length,
+			 uint32 * total_chain_length,
+			 uint32 * chain_count)
+{
+	uint32		last_dist = 0;
+
+	*total_chain_length = 0;
+	*chain_count = 0;
+
+	for (int i = 0; i < tb->size; i++)
+	{
+		uint32		hash;
+		uint32		optimal;
+		uint32		dist;
+		SH_ELEMENT_TYPE *elem;
+
+		elem = &tb->data[i];
+
+		if (elem->status != SH_STATUS_IN_USE)
+			continue;
+
+		hash = SH_ENTRY_HASH(tb, elem);
+		optimal = SH_INITIAL_BUCKET(tb, hash);
+		dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
+
+		/*
+		 * Different from SH_STAT(), always count chain length starting
+		 * from 1 rather than 0, e.g. when there is only one element in a
+		 * bucket, the length is 1.
+		 */
+		dist++;
+
+		/*
+		 * Within the same chain, dist is always increasing. If dist <
+		 * last_dist, we must have hit a new chain; take the length of
+		 * the old chain into account.
+		 */
+		if (dist < last_dist)
+		{
+			if (last_dist > *max_chain_length)
+				*max_chain_length = last_dist;
+			*total_chain_length += last_dist;
+			(*chain_count)++;
+		}
+		last_dist = dist;
+	}
+
+	/* Count the last chain. */
+	if (last_dist != 0)
+	{
+		if (last_dist > *max_chain_length)
+			*max_chain_length = last_dist;
+		*total_chain_length += last_dist;
+		(*chain_count)++;
+	}
+}
+
 #endif							/* SH_DEFINE */
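The boundary test in SH_COLL_STAT() leans on simplehash's robin-hood layout: within a single collision chain, each element's distance from its optimal bucket keeps growing, so a drop in the 1-based distance marks the start of a new chain. A self-contained sketch of that counting rule over a made-up distance sequence (the array is hypothetical, not derived from any real table):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * 1-based chain positions as SH_COLL_STAT() computes them (dist + 1);
	 * a value smaller than its predecessor starts a new chain.
	 */
	uint32_t dists[] = {1, 2, 3, 1, 2, 1};
	uint32_t max_len = 0, total_len = 0, chains = 0, last = 0;

	for (size_t i = 0; i < sizeof(dists) / sizeof(dists[0]); i++)
	{
		if (dists[i] < last)	/* distance dropped: previous chain ended */
		{
			if (last > max_len)
				max_len = last;
			total_len += last;
			chains++;
		}
		last = dists[i];
	}
	if (last != 0)				/* count the trailing chain */
	{
		if (last > max_len)
			max_len = last;
		total_len += last;
		chains++;
	}

	/* Expect: chains=3 (lengths 3, 2, 1), total=6, max=3 */
	printf("chains=%u total=%u max=%u\n", chains, total_len, max_len);
	return 0;
}

From these counters the average chain length reported in the extra message is total_chain_length / chain_count, here 6 / 3 = 2.0.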
