Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

[15721] Hash Join Cost Update #1344

Open
wants to merge 47 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
b8b5ad4
Fork sync
Mar 23, 2018
6d619a8
Initial
Mar 24, 2018
52b7f53
Plan creation working
Mar 25, 2018
9bb6cc3
Able to get table names
Mar 26, 2018
c6badc0
Refactored plan selection test and added sort test and another join o…
nappelson Mar 28, 2018
d96f02f
Add analyze statements to plan selection test
nappelson Mar 28, 2018
a1d6f4d
Fix to optimizer rule bitmap
Mar 29, 2018
368c201
Added test case
Mar 29, 2018
ba827a9
Merge in tests branch
Apr 2, 2018
7bc5434
Support for generating worst plan
Apr 2, 2018
1c13695
Add timer output to plan selection tests
nappelson Apr 3, 2018
1d61adb
Merge branch 'master' into master
pervazea Apr 3, 2018
0c2e9e6
add simple worst case plan selection test
nappelson Apr 4, 2018
9b6fd0d
refactor Cost Model instantiation (make it a parameter to optimizer c…
nappelson Apr 9, 2018
b17f5cb
Initial
Mar 24, 2018
60f5193
Plan creation working
Mar 25, 2018
b689891
Able to get table names
Mar 26, 2018
9ef318f
Refactored plan selection test and added sort test and another join o…
nappelson Mar 28, 2018
a112af1
Add analyze statements to plan selection test
nappelson Mar 28, 2018
dc89ec2
Support for generating worst plan
Apr 2, 2018
966b231
Add timer output to plan selection tests
nappelson Apr 3, 2018
393661e
add simple worst case plan selection test
nappelson Apr 4, 2018
14220e3
refactor Cost Model instantiation (make it a parameter to optimizer c…
nappelson Apr 9, 2018
17d4f1a
Merge branch 'tests' of github.com:GustavoAngulo/peloton into project_3
nappelson Apr 10, 2018
4f1505e
add example of bad cost model using new optimizer cost model refactor
nappelson Apr 10, 2018
48a09dd
reverse failed unit test order to make tests pass (need to fix commun…
nappelson Apr 12, 2018
d604638
Reverted changes from another branch
Apr 12, 2018
df8f4a5
Fix to optimizer rule bitmap
Mar 29, 2018
3f086f2
Added test case
Mar 29, 2018
ff8af6e
Fix to memo table
Apr 11, 2018
f95cc30
Merge branch 'gus_master' into project_3
nappelson Apr 23, 2018
20facd1
Fix for PrintPlan bug
Apr 23, 2018
a4e9f5e
Reset the child idx and total cost before we return if it is the firs…
nappelson Apr 24, 2018
b22603a
Merge branch 'tests' of github.com:GustavoAngulo/peloton into project_3
nappelson Apr 24, 2018
e4674b2
Fixed bug where stats not properly generated
May 3, 2018
9f36c6a
Initial commit for updated hash join cost
May 5, 2018
1eaa236
added flag that allows users to specify different cost calculators w…
nappelson May 6, 2018
a33e8fc
Moved new postgresesque hash join cost model code into new postgres c…
nappelson May 6, 2018
d4e2331
Addressing comments from PR 1
May 6, 2018
79f9ff2
Moved new postgresesque hash join cost model code into new postgres c…
nappelson May 6, 2018
750773c
add cost model evaluator
nappelson May 12, 2018
5a8c7a4
Merge branch 'tests' of github.com:GustavoAngulo/peloton into project…
nappelson May 12, 2018
49ec363
add some documentation'
nappelson May 12, 2018
9ba572e
Fix to highest freq calculation
May 13, 2018
5731859
add table printing info to explain statement
nappelson May 13, 2018
670c633
Merge branch 'tests' of github.com:GustavoAngulo/peloton into project…
nappelson May 13, 2018
e48873c
fix query count in cost model evaluator
nappelson May 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions script/testing/cost_model/cost_model_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import argparse
import psycopg2
import re
import socket
import subprocess
import time

from collections import namedtuple

# TODO add more stats
ExecutionStats = namedtuple('ExecutionStats', ['average'])


class Peloton(object):
def __init__(self, peloton_path, port, cost_calculator=None):
self.peloton_path = peloton_path
self.peloton_port = port
self.cost_calculator = cost_calculator
self.peloton_process = None
self.peloton_output_fd = None
self.conn = None

def run(self):
outfile = "/tmp/peloton_log.txt"
args = [self.peloton_path, "-port", str(self.peloton_port)]
if self.cost_calculator:
args += ["-cost_calculator", self.cost_calculator]
args += ['-codegen', 'false']
args += ['-hash_join_bloom_filter', 'false']
self.peloton_output_fd = open(outfile, "w+")
self.peloton_process = subprocess.Popen(args, stdout=self.peloton_output_fd, stderr=self.peloton_output_fd)
self.wait()
self.conn = psycopg2.connect(
"dbname=default_database user=postgres password=postgres host=localhost port={}".format(self.peloton_port))

def stop(self):
if not self.peloton_process:
raise Exception("No peloton process to stop")
self.peloton_process.poll()
if self.peloton_process.returncode is not None:
# Peloton terminated already
self.peloton_output_fd.close()
msg = "Peloton terminated with return code {}".format(self.peloton_process.returncode)
raise RuntimeError(msg)

# still(correctly) running, terminate it
self.conn.close()
self.peloton_process.terminate()
return

def wait(self):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# max wait of 10s in 0.1s increments
for i in xrange(100):
try:
s.connect(('localhost', self.peloton_port))
s.close()
print("connected to server in {} seconds".format(i * 0.1))
return
except:
time.sleep(0.1)
continue
return

def process_query(self, query):
cur = self.conn.cursor()
rows = None
if query:
cur.execute(query)
try:
rows = cur.fetchall()
except Exception:
pass
self.conn.commit()
cur.close()
return rows


def get_query_list(sql_string_file):
with open(sql_string_file, 'r') as infile:
sql_string = infile.read()
sql_string = sql_string.replace('\n', '')
sql_string = re.sub(' +', ' ', sql_string)
query_list = [x for x in sql_string.split(';') if x]
return query_list


def execute_sql_statements(data_path, peloton):
query_list = get_query_list(data_path)
for query in query_list:
peloton.process_query(query)


def execute_sql_statements_with_stats(data_path, peloton, execution_count):
execution_stat_list = []
explain_result_list = []
results = []
query_list = get_query_list(data_path)
for i, query in enumerate(query_list):
sum_time = 0
for _ in xrange(execution_count):
start_time = time.time()
rows = peloton.process_query(query)
end_time = time.time()
sum_time += (end_time - start_time)
result = {
'num': i + 1,
'query': query,
'num_rows': len(rows) if rows else 0,
'execution_stats': ExecutionStats(average=sum_time / execution_count),
'explain_result': peloton.process_query("".join(["EXPLAIN ", query]))
}
results.append(result)
return results


def analyze(peloton):
# Note this doesn't actually do anything. When https://github.com/cmu-db/peloton/issues/1360 is resolved, this will work
peloton.process_query("ANALYZE;")


def run_pipeline(args):
peloton = Peloton(args.peloton_path, args.port, args.cost_model)
try:
peloton = Peloton(args.peloton_path, args.port, args.cost_model)
peloton.run()
execute_sql_statements(args.data_load_path, peloton)
analyze(peloton)
results = execute_sql_statements_with_stats(args.data_query_path, peloton, args.query_count)
for result in results:
print 'Query Num: {}'.format(result['num'])
print 'Query: {}'.format(result['query'])
print 'Num Result Rows: {}'.format(result['num_rows'])
print result['execution_stats']
for row in result['explain_result']:
print row
peloton.stop()
except Exception as e:
print e
peloton.stop()


def main():
parser = argparse.ArgumentParser(description="Evaluate the provided cost model")
parser.add_argument("--cost-model")
parser.add_argument("--port", default=15721, help="Optional port override if you aren't using 15721")
parser.add_argument("--peloton-path", default="peloton",
help="Optional path to peloton binary if peloton is not on your path")
parser.add_argument('--query-count', default=500, type=int,
help="Number of times to run the query defined in the data query path")
# For now, data load path needs to include analyze statements at end. we don't support analyze on all tables
parser.add_argument("data_load_path")
parser.add_argument("data_query_path")
args = parser.parse_args()
run_pipeline(args)


if __name__ == "__main__":
main()
29 changes: 29 additions & 0 deletions src/include/optimizer/abstract_cost_calculator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// abstract_cost_calculator.h
//
// Identification: src/include/optimizer/abstract_cost_calculator.h
//
// Copyright (c) 2015-18, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/operator_visitor.h"

namespace peloton {
namespace optimizer {

class Memo;

class AbstractCostCalculator : public OperatorVisitor {
public:
virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn) = 0;
};

} // namespace optimizer
} // namespace peloton
58 changes: 30 additions & 28 deletions src/include/optimizer/cost_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,40 +12,42 @@

#pragma once

#include "optimizer/operator_visitor.h"
#include "optimizer/abstract_cost_calculator.h"

namespace peloton {
namespace optimizer {

class Memo;
// Derive cost for a physical group expressionh
class CostCalculator : public OperatorVisitor {
// Derive cost for a physical group expression
class CostCalculator : public AbstractCostCalculator {
public:
double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn);

void Visit(const DummyScan *) override;
void Visit(const PhysicalSeqScan *) override;
void Visit(const PhysicalIndexScan *) override;
void Visit(const QueryDerivedScan *) override;
void Visit(const PhysicalOrderBy *) override;
void Visit(const PhysicalLimit *) override;
void Visit(const PhysicalInnerNLJoin *) override;
void Visit(const PhysicalLeftNLJoin *) override;
void Visit(const PhysicalRightNLJoin *) override;
void Visit(const PhysicalOuterNLJoin *) override;
void Visit(const PhysicalInnerHashJoin *) override;
void Visit(const PhysicalLeftHashJoin *) override;
void Visit(const PhysicalRightHashJoin *) override;
void Visit(const PhysicalOuterHashJoin *) override;
void Visit(const PhysicalInsert *) override;
void Visit(const PhysicalInsertSelect *) override;
void Visit(const PhysicalDelete *) override;
void Visit(const PhysicalUpdate *) override;
void Visit(const PhysicalHashGroupBy *) override;
void Visit(const PhysicalSortGroupBy *) override;
void Visit(const PhysicalDistinct *) override;
void Visit(const PhysicalAggregate *) override;
CostCalculator(){};

virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn) override;

virtual void Visit(const DummyScan *) override;
virtual void Visit(const PhysicalSeqScan *) override;
virtual void Visit(const PhysicalIndexScan *) override;
virtual void Visit(const QueryDerivedScan *) override;
virtual void Visit(const PhysicalOrderBy *) override;
virtual void Visit(const PhysicalLimit *) override;
virtual void Visit(const PhysicalInnerNLJoin *) override;
virtual void Visit(const PhysicalLeftNLJoin *) override;
virtual void Visit(const PhysicalRightNLJoin *) override;
virtual void Visit(const PhysicalOuterNLJoin *) override;
virtual void Visit(const PhysicalInnerHashJoin *) override;
virtual void Visit(const PhysicalLeftHashJoin *) override;
virtual void Visit(const PhysicalRightHashJoin *) override;
virtual void Visit(const PhysicalOuterHashJoin *) override;
virtual void Visit(const PhysicalInsert *) override;
virtual void Visit(const PhysicalInsertSelect *) override;
virtual void Visit(const PhysicalDelete *) override;
virtual void Visit(const PhysicalUpdate *) override;
virtual void Visit(const PhysicalHashGroupBy *) override;
virtual void Visit(const PhysicalSortGroupBy *) override;
virtual void Visit(const PhysicalDistinct *) override;
virtual void Visit(const PhysicalAggregate *) override;

private:
double HashCost();
Expand Down
31 changes: 31 additions & 0 deletions src/include/optimizer/cost_calculator_factory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// cost_model_factory.h
//
// Identification: src/include/optimizer/cost_model_factory.h
//
// Copyright (c) 2015-18, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once
#include "optimizer/cost_calculator.h"

#include "common/exception.h"

namespace peloton {
namespace optimizer {

class CostCalculatorFactory {
public:
/*
* Creates the respective cost calculator given a cost calculator name
*/
static std::unique_ptr<AbstractCostCalculator> CreateCostCalculator(
const std::string &cost_model_name);
};

} // namespace peloton
} // namespace optimizer
26 changes: 17 additions & 9 deletions src/include/optimizer/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <memory>

#include "optimizer/abstract_optimizer.h"
#include "optimizer/abstract_cost_calculator.h"
#include "optimizer/property_set.h"
#include "optimizer/optimizer_metadata.h"

Expand All @@ -38,9 +39,9 @@ class TransactionContext;
}

namespace test {
class OptimizerRuleTests_SimpleAssociativeRuleTest_Test;
class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test;
}
class OptimizerRuleTests_SimpleAssociativeRuleTest_Test;
class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test;
}

namespace optimizer {

Expand All @@ -60,16 +61,19 @@ class Optimizer : public AbstractOptimizer {
friend class BindingIterator;
friend class GroupBindingIterator;

friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest_Test;
friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest2_Test;
friend class ::peloton::test::
OptimizerRuleTests_SimpleAssociativeRuleTest_Test;
friend class ::peloton::test::
OptimizerRuleTests_SimpleAssociativeRuleTest2_Test;

public:
Optimizer(const Optimizer &) = delete;
Optimizer &operator=(const Optimizer &) = delete;
Optimizer(Optimizer &&) = delete;
Optimizer &operator=(Optimizer &&) = delete;

Optimizer();
Optimizer(){};
Optimizer(std::unique_ptr<AbstractCostCalculator> cost_calculator);

std::shared_ptr<planner::AbstractPlan> BuildPelotonPlanTree(
const std::unique_ptr<parser::SQLStatementList> &parse_tree,
Expand All @@ -83,14 +87,16 @@ class Optimizer : public AbstractOptimizer {

OptimizerMetadata &GetMetadata() { return metadata_; }

AbstractCostCalculator *GetCostCalculator() { return cost_calculator_.get(); }

/* For test purposes only */
std::shared_ptr<GroupExpression> TestInsertQueryTree(parser::SQLStatement *tree,
concurrency::TransactionContext *txn) {
std::shared_ptr<GroupExpression> TestInsertQueryTree(
parser::SQLStatement *tree, concurrency::TransactionContext *txn) {
return InsertQueryTree(tree, txn);
}
/* For test purposes only */
void TestExecuteTaskStack(OptimizerTaskStack &task_stack, int root_group_id,
std::shared_ptr<OptimizeContext> root_context) {
std::shared_ptr<OptimizeContext> root_context) {
return ExecuteTaskStack(task_stack, root_group_id, root_context);
}

Expand Down Expand Up @@ -153,6 +159,8 @@ class Optimizer : public AbstractOptimizer {
//////////////////////////////////////////////////////////////////////////////
/// Metadata
OptimizerMetadata metadata_;
/// Cost Model
std::unique_ptr<AbstractCostCalculator> cost_calculator_;
};

} // namespace optimizer
Expand Down
Loading