Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ groupby.syntax.dict = {list(
"sum v1:v3 by id6" = "SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM tbl GROUP BY id6",
"median v3 sd v3 by id4 id5" = "SELECT id4, id5, quantile_cont(v3, 0.5) AS median_v3, stddev(v3) AS sd_v3 FROM tbl GROUP BY id4, id5",
"max v1 - min v2 by id3" = "SELECT id3, max(v1)-min(v2) AS range_v1_v2 FROM tbl GROUP BY id3",
"largest two v3 by id6" = "SELECT id6, v3 AS largest2_v3 FROM (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM x WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2",
"largest two v3 by id6" = "SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6",
"regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM tbl GROUP BY id2, id4",
"sum v3 count by id1:id6" = "SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count(*) AS count FROM tbl GROUP BY id1, id2, id3, id4, id5, id6"
)},
Expand All @@ -232,7 +232,7 @@ groupby.syntax.dict = {list(
"sum v1:v3 by id6" = "SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM tbl GROUP BY id6",
"median v3 sd v3 by id4 id5" = "SELECT id4, id5, quantile_cont(v3, 0.5) AS median_v3, stddev(v3) AS sd_v3 FROM tbl GROUP BY id4, id5",
"max v1 - min v2 by id3" = "SELECT id3, max(v1)-min(v2) AS range_v1_v2 FROM tbl GROUP BY id3",
"largest two v3 by id6" = "SELECT id6, v3 AS largest2_v3 FROM (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM x WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2",
"largest two v3 by id6" = "SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6",
"regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM tbl GROUP BY id2, id4",
"sum v3 count by id1:id6" = "SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count(*) AS count FROM tbl GROUP BY id1, id2, id3, id4, id5, id6"
)},
Expand Down
1 change: 0 additions & 1 deletion _control/skipped_benchmarks.csv
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,4 @@ pandas,join,J1_1e9_NA_0_0,c6id.4xlarge
polars,join,J1_1e9_NA_0_0,c6id.4xlarge
pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge
spark,join,J1_1e9_NA_0_0,c6id.4xlarge
clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge

2 changes: 1 addition & 1 deletion _run/run_large.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ echo "Running all solutions on large (50GB) datasets"
./run.sh


###
##
echo "done..."
echo "removing data files"
rm data/*.csv
Expand Down
2 changes: 1 addition & 1 deletion clickhouse/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
25.1.3.23
25.9.3.48
3 changes: 2 additions & 1 deletion clickhouse/setup-clickhouse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ sudo service clickhouse-server start ||:

# modify clickhouse settings so data is stored on the mount.
sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/
sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount
sudo chown -R clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount
sudo chown -R clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount/store

# set up clickhouse tmp space
sudo mkdir -p /var/lib/mount/clickhouse-tmp/
Expand Down
2 changes: 1 addition & 1 deletion clickhouse/ver-clickhouse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ ch_installed && clickhouse-client --version-clean > clickhouse/VERSION && echo "
if [[ $TEST_RUN != "true" ]]; then
sudo chown ubuntu:ubuntu clickhouse/VERSION
sudo chown ubuntu:ubuntu clickhouse/REVISION
fi
fi
6 changes: 4 additions & 2 deletions collapse/setup-collapse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ set -e

# install stable collapse
mkdir -p ./collapse/r-collapse
Rscript -e 'install.packages(c("Rcpp", "collapse"), lib="./collapse/r-collapse", repos = "http://cloud.r-project.org")'
ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
MAKEFLAGS="-j$ncores" Rscript -e 'install.packages(c("Rcpp", "collapse"), lib="./collapse/r-collapse", repos = "http://cloud.r-project.org")'

./collapse/ver-collapse.sh

./collapse/ver-collapse.sh
2 changes: 1 addition & 1 deletion datafusion/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
50.0.0
50.1.0
2 changes: 1 addition & 1 deletion datatable/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.16.99
1.17.99
2 changes: 1 addition & 1 deletion duckdb/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.4.0
1.4.1
82 changes: 39 additions & 43 deletions duckdb/groupby-duckdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ if (on_disk) {

table_type = "TEMP"
if (machine_type == 'c6id.4xlarge' && on_disk) {
dbExecute(con, "pragma memory_limit='25G'")
dbExecute(con, "pragma memory_limit='20G'")
}

dbExecute(con, "SET enable_progress_bar = false;")
Expand Down Expand Up @@ -106,27 +106,25 @@ print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))


# q3 benchmark: "sum v1 mean v3 by id3".
# The whole question is skipped when the run is on-disk on a c6id.4xlarge machine.
# The query is executed twice (run=1L, then run=2L); each run is timed, logged
# through write.log, and followed by dropping the result table `ans`.
if (!(machine_type == 'c6id.4xlarge' && on_disk)) {
question = "sum v1 mean v3 by id3" # q3
# Run 1: the timed region covers creating `ans` plus reading back its row and column counts.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM x GROUP BY id3", table_type))
# nr / nc are assigned inside print() so the dimensions are both shown and kept for the log entry.
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this chunk; presumably reports current memory use in GB (logged as mem_gb) - TODO confirm.
m = memory_usage()
# Checksum over the result columns, timed separately from the main query.
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(v1) AS v1, sum(v3) AS v3 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: identical query and measurements, logged with run=2L.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM x GROUP BY id3", table_type))
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(v1) AS v1, sum(v3) AS v3 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
# Show the first and last rows of the second run's result (outside the timed region) before dropping it.
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
}
# q3 benchmark: "sum v1 mean v3 by id3".
# The query is executed twice (run=1L, then run=2L) with no machine-type guard;
# each run is timed, logged through write.log, and followed by dropping the
# result table `ans`.
question = "sum v1 mean v3 by id3" # q3
# Run 1: the timed region covers creating `ans` plus reading back its row and column counts.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM x GROUP BY id3", table_type))
# nr / nc are assigned inside print() so the dimensions are both shown and kept for the log entry.
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this chunk; presumably reports current memory use in GB (logged as mem_gb) - TODO confirm.
m = memory_usage()
# Checksum over the result columns, timed separately from the main query.
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(v1) AS v1, sum(v3) AS v3 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: identical query and measurements, logged with run=2L.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM x GROUP BY id3", table_type))
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(v1) AS v1, sum(v3) AS v3 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
# Show the first and last rows of the second run's result (outside the timed region) before dropping it.
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

question = "mean v1:v3 by id4" # q4
t = system.time({
Expand Down Expand Up @@ -209,27 +207,25 @@ print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))


# q8 benchmark: "largest two v3 by id6".
# The whole question is skipped when the run is on-disk on a c6id.4xlarge machine.
# The query is executed twice (run=1L, then run=2L); each run is timed, logged
# through write.log, and followed by dropping the result table `ans`.
if (!(machine_type == 'c6id.4xlarge' && on_disk)) {
question = "largest two v3 by id6" # q8
# Run 1: the timed region covers creating `ans` plus reading back its row and column counts.
# Rows with NULL v3 are filtered out before grouping.
# NOTE(review): unnest(max(v3, 2)) presumably relies on DuckDB's top-n form of max() returning a
# list that unnest expands to one row per value - confirm against the DuckDB documentation.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6", table_type))
# nr / nc are assigned inside print() so the dimensions are both shown and kept for the log entry.
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this chunk; presumably reports current memory use in GB (logged as mem_gb) - TODO confirm.
m = memory_usage()
# Checksum over the result column, timed separately from the main query.
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(largest2_v3) AS largest2_v3 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: identical query and measurements, logged with run=2L.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6", table_type))
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(largest2_v3) AS largest2_v3 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
# Show the first and last rows of the second run's result (outside the timed region) before dropping it.
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
}
# q8 benchmark: "largest two v3 by id6".
# The query is executed twice (run=1L, then run=2L) with no machine-type guard;
# each run is timed, logged through write.log, and followed by dropping the
# result table `ans`.
question = "largest two v3 by id6" # q8
# Run 1: the timed region covers creating `ans` plus reading back its row and column counts.
# Rows with NULL v3 are filtered out before grouping.
# NOTE(review): unnest(max(v3, 2)) presumably relies on DuckDB's top-n form of max() returning a
# list that unnest expands to one row per value - confirm against the DuckDB documentation.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6", table_type))
# nr / nc are assigned inside print() so the dimensions are both shown and kept for the log entry.
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
# memory_usage() is defined outside this chunk; presumably reports current memory use in GB (logged as mem_gb) - TODO confirm.
m = memory_usage()
# Checksum over the result column, timed separately from the main query.
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(largest2_v3) AS largest2_v3 FROM ans"))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))
# Run 2: identical query and measurements, logged with run=2L.
t = system.time({
dbExecute(con, sprintf("CREATE %s TABLE ans AS SELECT id6, unnest(max(v3, 2)) largest2_v3 FROM x WHERE v3 IS NOT NULL GROUP BY id6", table_type))
print(c(nr<-dbGetQuery(con, "SELECT count(*) AS cnt FROM ans")$cnt, nc<-ncol(dbGetQuery(con, "SELECT * FROM ans LIMIT 0"))))
})[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-dbGetQuery(con, "SELECT sum(largest2_v3) AS largest2_v3 FROM ans"))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=in_nr, question=question, out_rows=nr, out_cols=nc, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
# Show the first and last rows of the second run's result (outside the timed region) before dropping it.
print(dbGetQuery(con, "SELECT * FROM ans LIMIT 3")) ## head
print(dbGetQuery(con, "SELECT * FROM ans WHERE ROWID > (SELECT count(*) FROM ans) - 4")) ## tail
invisible(dbExecute(con, "DROP TABLE IF EXISTS ans"))

question = "regression v1 v2 by id2 id4" # q9
t = system.time({
Expand Down
Loading
Loading