Skip to content

Commit 1237fd6

Browse files
committed
feature: asynchronously update file statistics during vacuum
Previously, statistics (min-max, sum, count, etc.) were computed synchronously during data insertion, causing significant slowdowns due to heavy computational overhead. This change introduces an asynchronous approach to maintaining statistics:

- Add a GUC parameter to control statistics collection during writes (disabled by default)
- Skip statistics computation during INSERT to ensure fast writes
- Update statistics asynchronously during VACUUM on PAX tables by scanning file metadata
- Re-read files and refresh statistics only when metadata indicates they are stale
- Vacuum data files that have been marked for deletion

```
create table t1(c1 int, c2 int, c3 int, c4 int, c5 int, c6 int) using pax with(minmax_columns='c1,c2,c3,c4,c5,c6');
set pax.enable_sync_collect_stats to on; -- collect stats synchronously
insert into t1 select i,i,i,i,i,i from generate_series(1,1000000) i;
INSERT 0 1000000
Time: 2733.731 ms (00:02.734)

create table t2(c1 int, c2 int, c3 int, c4 int, c5 int, c6 int) using pax;
insert into t2 select i,i,i,i,i,i from generate_series(1,1000000) i;
INSERT 0 1000000
Time: 1816.836 ms (00:01.817)
```
1 parent da0b9d4 commit 1237fd6

File tree

81 files changed

+1657
-676
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

81 files changed

+1657
-676
lines changed

contrib/pax_storage/expected/cluster.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
set pax.max_tuples_per_file to 131072;
2+
set pax.enable_sync_collect_stats = on;
23
-- cluster table using index
34
-- start_ignore
45
drop table if EXISTS t_index_cluster;
@@ -295,3 +296,4 @@ select ptblockname,ptstatistics,ptisclustered from get_pax_aux_table('t_lexical_
295296
(6 rows)
296297

297298
drop table t_lexical_cluster;
299+
reset pax.enable_sync_collect_stats;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
alter system set autovacuum = off;
2+
select gp_segment_id, pg_reload_conf() from gp_id union select gp_segment_id, pg_reload_conf() from gp_dist_random('gp_id');
3+
gp_segment_id | pg_reload_conf
4+
---------------+----------------
5+
2 | t
6+
1 | t
7+
0 | t
8+
-1 | t
9+
(4 rows)
10+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
alter system set autovacuum = on;
2+
select gp_segment_id, pg_reload_conf() from gp_id union select gp_segment_id, pg_reload_conf() from gp_dist_random('gp_id');
3+
gp_segment_id | pg_reload_conf
4+
---------------+----------------
5+
2 | t
6+
1 | t
7+
0 | t
8+
-1 | t
9+
(4 rows)
10+

contrib/pax_storage/expected/filter.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
set pax.enable_debug to on;
22
set pax.enable_sparse_filter = on;
3+
set pax.enable_sync_collect_stats = on;
34
create table pax_test.null_test_t(a int, b int, c text) using pax;
45
insert into pax_test.null_test_t(a) select null from generate_series(1,2)i;
56
insert into pax_test.null_test_t select 1, i, 'cc_' || i from generate_series(1,2)i;
@@ -223,3 +224,4 @@ kind group, filter rate: 0 / 1
223224
reset client_min_messages;
224225
drop table pax_test.in_test_t;
225226
reset pax.enable_sparse_filter;
227+
reset pax.enable_sync_collect_stats;

contrib/pax_storage/expected/filter_1.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
set pax.enable_debug to on;
22
set pax.enable_sparse_filter = on;
3+
set pax.enable_sync_collect_stats = on;
34
create table pax_test.null_test_t(a int, b int, c text) using pax;
45
insert into pax_test.null_test_t(a) select null from generate_series(1,2)i;
56
insert into pax_test.null_test_t select 1, i, 'cc_' || i from generate_series(1,2)i;
@@ -223,3 +224,4 @@ kind group, filter rate: 0 / 1
223224
reset client_min_messages;
224225
drop table pax_test.in_test_t;
225226
reset pax.enable_sparse_filter;
227+
reset pax.enable_sync_collect_stats;

contrib/pax_storage/expected/filter_tree.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set default_table_access_method to pax;
88
set pax.enable_debug to on;
99
set pax.enable_sparse_filter to on;
10+
set pax.enable_sync_collect_stats = on;
1011
create or replace function intrc(iint int)
1112
returns int as $$
1213
begin return iint; end;
@@ -735,6 +736,7 @@ LOG: statement: select count(*) from t1 where coalesce(v1, 2) != 1;
735736

736737
reset client_min_messages;
737738
LOG: statement: reset client_min_messages;
739+
reset pax.enable_sync_collect_stats;
738740
drop table t1;
739741
drop table t2;
740742
drop table t_allnull;

contrib/pax_storage/expected/filter_tree_1.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set default_table_access_method to pax;
88
set pax.enable_debug to on;
99
set pax.enable_sparse_filter to on;
10+
set pax.enable_sync_collect_stats = on;
1011
create or replace function intrc(iint int)
1112
returns int as $$
1213
begin return iint; end;
@@ -742,6 +743,7 @@ LOG: statement: select count(*) from t1 where coalesce(v1, 2) != 1;
742743

743744
reset client_min_messages;
744745
LOG: statement: reset client_min_messages;
746+
reset pax.enable_sync_collect_stats;
745747
drop table t1;
746748
drop table t2;
747749
drop table t_allnull;

contrib/pax_storage/expected/filter_tree_arithmetic.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set default_table_access_method to pax;
88
set pax.enable_debug to on;
99
set pax.enable_sparse_filter to on;
10+
set pax.enable_sync_collect_stats = on;
1011
create table t_arithmetic(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3');
1112
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table.
1213
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
@@ -1063,5 +1064,6 @@ LOG: kind file, filter rate: 0 / 1
10631064

10641065
reset client_min_messages;
10651066
LOG: statement: reset client_min_messages;
1067+
reset pax.enable_sync_collect_stats;
10661068
drop table t_arithmetic;
10671069
drop table ta_mul;

contrib/pax_storage/expected/filter_tree_arithmetic_1.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set default_table_access_method to pax;
88
set pax.enable_debug to on;
99
set pax.enable_sparse_filter to on;
10+
set pax.enable_sync_collect_stats = on;
1011
create table t_arithmetic(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3');
1112
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table.
1213
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
@@ -938,5 +939,6 @@ LOG: statement: select count(*) from ta_mul where v2 * v2 >= 10000;
938939

939940
reset client_min_messages;
940941
LOG: statement: reset client_min_messages;
942+
reset pax.enable_sync_collect_stats;
941943
drop table t_arithmetic;
942944
drop table ta_mul;

contrib/pax_storage/expected/filter_tree_optimizer.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
set default_table_access_method to pax;
88
set pax.enable_debug to on;
99
set pax.enable_sparse_filter to on;
10+
set pax.enable_sync_collect_stats = on;
1011
create or replace function intrc(iint int)
1112
returns int as $$
1213
begin return iint; end;
@@ -738,6 +739,7 @@ LOG: statement: select count(*) from t1 where coalesce(v1, 2) != 1;
738739

739740
reset client_min_messages;
740741
LOG: statement: reset client_min_messages;
742+
reset pax.enable_sync_collect_stats;
741743
drop table t1;
742744
drop table t2;
743745
drop table t_allnull;

0 commit comments

Comments
 (0)