-
Notifications
You must be signed in to change notification settings - Fork 618
[15721] Index Suggestion #1347
base: master
Are you sure you want to change the base?
[15721] Index Suggestion #1347
Changes from 150 commits
d18033d
5fdadea
ec6c94b
492b95f
8410136
96eadf4
9087931
0908588
5e2cbff
fcfe058
04e49f8
d62462b
2e19c1c
ac653aa
4d44009
371fd38
c23cc36
4d694ec
a51fe84
d043128
32f9040
324e430
5978d32
a24ded7
11bc159
e0cac79
83c1b44
1e5925c
4b463dc
96a41b1
12a343a
e98461a
1ec6f55
d23d0dc
a94cac9
11adba0
4c8dce7
6f67e0c
aa63a5f
f8a8180
b619333
d01d018
d9d0cfc
d984e89
11fdce2
afa1582
3178695
5f4a822
fd2de46
3db49a7
b7c4f9c
756ecb8
0d336d0
f58cf77
213a351
e846956
85705dd
920083a
93b2214
e3b43d0
c907ef3
342f6a3
c54f4e0
39259fb
f323ed9
6330ab6
b291f58
f4ce787
c6915f7
49b95df
a6da36d
01c994e
e1dad43
90e7d65
57c1c83
4b4e256
61786ae
fa1dbba
6bbaa94
5591755
5d0d2b8
28e818b
8fd0bf4
3f394f7
8f1b897
40576fe
10843ca
3085a58
1e9b959
55354b9
96f500b
eb3da24
2657e76
a564372
9f5bdc5
e290797
57955b4
ecec9ce
4e3370c
818c583
e4865c4
53c1101
ae3e26b
7152d46
0062cc5
fee2bea
4642b34
490677f
51d7f56
fc0d60e
a48e085
a3ac507
f6b18d0
eb5239f
693516b
6017790
b024304
8b2169c
f718511
8639124
3a5227a
aeabd94
7ee9b0f
99be940
1e3cd9c
bd4593b
5fe0108
a8af555
273b89b
7091c7f
67ff655
51139e6
f9b2c5e
cb8d209
3c3559e
2da21af
71d4213
7d6fc37
6d48e80
d22b7bb
1060627
0b12801
5029ed1
1e31d2a
8b937da
f8262cd
f4bca42
4c37855
38757ac
4792d91
5460082
3b757f1
8bc5170
51f5a1a
5c322c1
3ef9128
146100d
d250fbe
3230ec3
dc424ea
43b742b
c422a63
27a0df0
a06189a
332543f
9d0a005
11d2f3e
59ee8d3
6817300
3546f6a
e2e4578
4f48831
4dc06ac
65d5a06
480ae4d
81420e7
28483e5
e1bd8ba
f8e6eda
597e798
b99312a
50db015
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Peloton | ||
// | ||
// index_selection_context.cpp | ||
// | ||
// Identification: src/brain/index_selection_context.cpp | ||
// | ||
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "brain/index_selection_context.h" | ||
#include "common/logger.h" | ||
|
||
namespace peloton { | ||
namespace brain { | ||
|
||
// Construct a selection context that stores the tuning knobs consulted by
// the index selection algorithm (copied by value; the context owns its copy).
IndexSelectionContext::IndexSelectionContext(IndexSelectionKnobs knobs)
    : knobs_(knobs) {}
|
||
} // namespace brain | ||
} // namespace peloton |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Peloton | ||
// | ||
// index_selection_job.cpp | ||
// | ||
// Identification: src/brain/index_selection_job.cpp | ||
// | ||
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "brain/index_selection_util.h" | ||
#include "brain/index_selection_job.h" | ||
#include "brain/index_selection.h" | ||
#include "catalog/query_history_catalog.h" | ||
#include "catalog/system_catalogs.h" | ||
#include "optimizer/stats/stats_storage.h" | ||
|
||
namespace peloton { | ||
namespace brain { | ||
|
||
#define BRAIN_SUGGESTED_INDEX_MAGIC_STR "brain_suggested_index_" | ||
|
||
/**
 * @brief Periodic brain job: fetch the recent query workload, run index
 * selection over it, create the suggested indexes via RPC, and drop
 * previously brain-suggested indexes that are no longer recommended.
 *
 * The whole pass runs inside a single transaction; it aborts early if
 * column stats cannot be generated.
 */
void IndexSelectionJob::OnJobInvocation(BrainEnvironment *env) {
  auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance();
  auto txn = txn_manager.BeginTransaction();
  LOG_INFO("Started Index Suggestion Task");

  // Generate column stats for all the tables before we begin.
  // TODO[vamshi]:
  // Instead of collecting stats for every table, collect them only for the
  // tables we are analyzing i.e. tables that are referenced in the current
  // workload.
  optimizer::StatsStorage *stats_storage =
      optimizer::StatsStorage::GetInstance();
  ResultType result = stats_storage->AnalyzeStatsForAllTables(txn);
  if (result != ResultType::SUCCESS) {
    LOG_ERROR(
        "Cannot generate stats for table columns. Not performing index "
        "suggestion...");
    txn_manager.AbortTransaction(txn);
    return;
  }

  // Query the catalog for new SQL queries.
  // New SQL queries are the queries that were added to the system
  // after last_timestamp_.
  auto query_catalog = &catalog::QueryHistoryCatalog::GetInstance(txn);
  auto query_history =
      query_catalog->GetQueryStringsAfterTimestamp(last_timestamp_, txn);
  if (query_history->size() > num_queries_threshold_) {
    LOG_INFO("Tuning threshold has crossed. Time to tune the DB!");

    // Run the index selection.
    std::vector<std::string> queries;
    queries.reserve(query_history->size());
    // Iterate by const reference: copying each (timestamp, query) pair would
    // duplicate every query string.
    for (const auto &query_pair : *query_history) {
      queries.push_back(query_pair.second);
    }

    // TODO: Handle multiple databases
    brain::Workload workload(queries, DEFAULT_DB_NAME, txn);
    brain::IndexSelection is = {workload, env->GetIndexSelectionKnobs(), txn};
    brain::IndexConfiguration best_config;
    is.GetBestIndexes(best_config);

    if (best_config.IsEmpty()) {
      LOG_INFO("Best config is empty");
    }

    // Get the existing indexes and drop the brain-suggested ones that are
    // no longer part of the best configuration.
    // TODO: Handle multiple databases
    auto database_object = catalog::Catalog::GetInstance()->GetDatabaseObject(
        DEFAULT_DB_NAME, txn);
    auto pg_index = catalog::Catalog::GetInstance()
                        ->GetSystemCatalogs(database_object->GetDatabaseOid())
                        ->GetIndexCatalog();
    auto indexes = pg_index->GetIndexObjects(txn);
    for (const auto &index : indexes) {
      auto index_name = index.second->GetIndexName();
      // TODO [vamshi]:
      // This is a hack for now. Add a boolean to the index catalog to
      // find out if an index is a brain suggested index/user created index.
      if (index_name.find(BRAIN_SUGGESTED_INDEX_MAGIC_STR) !=
          std::string::npos) {
        bool found = false;
        for (const auto &installed_index : best_config.GetIndexes()) {
          if ((index.second->GetTableOid() == installed_index->table_oid) &&
              (index.second->GetKeyAttrs() == installed_index->column_oids)) {
            found = true;
            // Stop scanning once a match is found (the original kept
            // iterating over the remaining suggested indexes).
            break;
          }
        }
        // Drop only indexes which are not suggested this time.
        if (!found) {
          LOG_DEBUG("Dropping Index: %s", index_name.c_str());
          DropIndexRPC(database_object->GetDatabaseOid(), index.second.get());
        }
      }
    }

    for (const auto &index : best_config.GetIndexes()) {
      // Create RPC for index creation on the server side.
      CreateIndexRPC(index.get());
    }

    // Update last_timestamp_ to be the latest query's timestamp in the
    // current workload, so that we fetch only the new queries next time.
    // TODO[vamshi]: Make this efficient. Currently assuming that the latest
    // query can be anywhere in the vector. If the latest query is always at
    // the end, then we can avoid the scan over all the queries.
    last_timestamp_ = GetLatestQueryTimestamp(query_history.get());
  } else {
    LOG_INFO("Tuning - not this time");
  }
  txn_manager.CommitTransaction(txn);
}
|
||
/**
 * @brief Send a create-index request for a suggested index to the Peloton
 * server over Cap'n Proto RPC.
 *
 * The index name encodes BRAIN_SUGGESTED_INDEX_MAGIC_STR plus the db oid,
 * table oid and column oids so OnJobInvocation can later recognize
 * brain-created indexes by name.
 *
 * @param index the hypothetical index to materialize; must have at least
 *              one column.
 */
void IndexSelectionJob::CreateIndexRPC(brain::HypotheticalIndexObject *index) {
  // Validate the input before building the request (the original asserted
  // only after the RPC had already been sent).
  PELOTON_ASSERT(index->column_oids.size() > 0);

  // TODO: Remove hardcoded database name and server end point.
  capnp::EzRpcClient client("localhost:15445");
  PelotonService::Client peloton_service = client.getMain<PelotonService>();

  // Create the index name: concat - db_id, table_id, col_ids.
  std::stringstream sstream;
  sstream << BRAIN_SUGGESTED_INDEX_MAGIC_STR << ":" << index->db_oid << ":"
          << index->table_oid << ":";
  // NOTE: the original also accumulated the columns into an unused local
  // vector (col_oid_vector); that dead variable has been removed.
  for (auto col : index->column_oids) {
    sstream << col << ",";
  }
  auto index_name = sstream.str();

  auto request = peloton_service.createIndexRequest();
  request.getRequest().setDatabaseOid(index->db_oid);
  request.getRequest().setTableOid(index->table_oid);
  request.getRequest().setIndexName(index_name);
  request.getRequest().setUniqueKeys(false);

  auto col_list =
      request.getRequest().initKeyAttrOids(index->column_oids.size());
  for (auto i = 0UL; i < index->column_oids.size(); i++) {
    col_list.set(i, index->column_oids[i]);
  }

  auto response = request.send().wait(client.getWaitScope());
  // TODO(review): inspect `response` and log a warning if the server-side
  // index creation did not succeed.
  (void)response;
}
|
||
/**
 * @brief Ask the Peloton server, via Cap'n Proto RPC, to drop the given
 * index from the given database.
 *
 * @param database_oid oid of the database owning the index.
 * @param index        catalog entry of the index to drop.
 */
void IndexSelectionJob::DropIndexRPC(oid_t database_oid,
                                     catalog::IndexCatalogObject *index) {
  // TODO: Remove hardcoded database name and server end point.
  capnp::EzRpcClient client("localhost:15445");
  auto peloton_service = client.getMain<PelotonService>();

  // Build the drop request: the server identifies the index by
  // (database oid, index oid).
  auto request = peloton_service.dropIndexRequest();
  auto payload = request.getRequest();
  payload.setDatabaseOid(database_oid);
  payload.setIndexOid(index->GetIndexOid());

  // Block until the server has processed the drop.
  auto response = request.send().wait(client.getWaitScope());
}
|
||
/**
 * @brief Return the largest timestamp among the given (timestamp, query)
 * pairs, or 0 if the vector is empty.
 *
 * @param queries history entries to scan; must not be null.
 * @return the latest (maximum) timestamp seen.
 */
uint64_t IndexSelectionJob::GetLatestQueryTimestamp(
    std::vector<std::pair<uint64_t, std::string>> *queries) {
  uint64_t latest_time = 0;
  // Iterate by const reference: the original copied every pair, which
  // duplicated each query string just to read its timestamp.
  for (const auto &query : *queries) {
    if (query.first > latest_time) {
      latest_time = query.first;
    }
  }
  return latest_time;
}
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Peloton | ||
// | ||
// index_selection_util.cpp | ||
// | ||
// Identification: src/brain/index_selection_util.cpp | ||
// | ||
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "brain/index_selection_util.h" | ||
#include "common/logger.h" | ||
|
||
namespace peloton { | ||
namespace brain { | ||
|
||
//===--------------------------------------------------------------------===// | ||
// IndexObject | ||
//===--------------------------------------------------------------------===// | ||
|
||
const std::string HypotheticalIndexObject::ToString() const { | ||
std::stringstream str_stream; | ||
str_stream << "Database: " << db_oid << "\n"; | ||
str_stream << "Table: " << table_oid << "\n"; | ||
str_stream << "Columns: "; | ||
for (auto col : column_oids) { | ||
str_stream << col << ", "; | ||
} | ||
str_stream << "\n"; | ||
return str_stream.str(); | ||
} | ||
|
||
bool HypotheticalIndexObject::operator==( | ||
const HypotheticalIndexObject &obj) const { | ||
return (db_oid == obj.db_oid && table_oid == obj.table_oid && | ||
column_oids == obj.column_oids); | ||
} | ||
|
||
bool HypotheticalIndexObject::IsCompatible( | ||
std::shared_ptr<HypotheticalIndexObject> index) const { | ||
return (db_oid == index->db_oid) && (table_oid == index->table_oid); | ||
} | ||
|
||
// Build a new index covering this index's columns followed by the other
// index's columns that are not already present. Relative column order is
// preserved; neither operand is modified.
HypotheticalIndexObject HypotheticalIndexObject::Merge(
    std::shared_ptr<HypotheticalIndexObject> index) {
  HypotheticalIndexObject merged;
  merged.db_oid = db_oid;
  merged.table_oid = table_oid;
  merged.column_oids = column_oids;
  for (auto col : index->column_oids) {
    // Membership test is against *this* index's original columns, matching
    // the original implementation.
    const bool already_present =
        std::find(column_oids.begin(), column_oids.end(), col) !=
        column_oids.end();
    if (!already_present) {
      merged.column_oids.push_back(col);
    }
  }
  return merged;
}
|
||
//===--------------------------------------------------------------------===// | ||
// IndexConfiguration | ||
//===--------------------------------------------------------------------===// | ||
|
||
// Union the other configuration's indexes into this one.
// Fix: GetIndexes() returns a const reference; the original bound it with
// plain `auto`, copying the entire set before iterating. Bind by const
// reference and use the ranged insert instead of a manual loop.
void IndexConfiguration::Merge(IndexConfiguration &config) {
  const auto &indexes = config.GetIndexes();
  indexes_.insert(indexes.begin(), indexes.end());
}
|
||
// Replace this configuration's contents with the other configuration's
// indexes.
// Fix: the original copied the whole source set via `auto indexes = ...`;
// bind GetIndexes()'s const reference instead and bulk-insert.
void IndexConfiguration::Set(IndexConfiguration &config) {
  indexes_.clear();
  const auto &indexes = config.GetIndexes();
  indexes_.insert(indexes.begin(), indexes.end());
}
|
||
// Remove the given index from this configuration (no-op if absent).
// NOTE(review): std::set::erase here matches by shared_ptr value (pointer
// identity), not by index equality — only a pointer obtained from the same
// IndexObjectPool will be found. Passing by const reference would avoid a
// refcount bump; confirm against the header before changing the signature.
void IndexConfiguration::RemoveIndexObject(
    std::shared_ptr<HypotheticalIndexObject> index_info) {
  indexes_.erase(index_info);
}
|
||
// Add the given index to this configuration (no-op if the same shared_ptr
// is already present; uniqueness is by pointer identity, so indexes should
// come from a single IndexObjectPool).
// NOTE(review): a const reference parameter would avoid the shared_ptr
// refcount bump; confirm against the header before changing the signature.
void IndexConfiguration::AddIndexObject(
    std::shared_ptr<HypotheticalIndexObject> index_info) {
  indexes_.insert(index_info);
}
|
||
// Number of indexes currently in this configuration.
size_t IndexConfiguration::GetIndexCount() const { return indexes_.size(); }
|
||
// True when the configuration contains no indexes.
bool IndexConfiguration::IsEmpty() const { return indexes_.empty(); }
|
||
// Read-only view of the underlying index set. Callers should bind the
// result by const reference to avoid copying the set.
const std::set<std::shared_ptr<HypotheticalIndexObject>>
    &IndexConfiguration::GetIndexes() const {
  return indexes_;
}
|
||
const std::string IndexConfiguration::ToString() const { | ||
std::stringstream str_stream; | ||
str_stream << "Num of indexes: " << GetIndexCount() << "\n"; | ||
for (auto index : indexes_) { | ||
str_stream << index->ToString() << " "; | ||
} | ||
return str_stream.str(); | ||
} | ||
|
||
// Two configurations are equal iff their index sets are equal.
// Fix: the original copied the other configuration's entire set into a
// local before comparing; compare against the const reference directly.
// Note: set equality here is shared_ptr (pointer) equality, which is
// correct as long as all indexes are interned via the same IndexObjectPool.
bool IndexConfiguration::operator==(const IndexConfiguration &config) const {
  return indexes_ == config.GetIndexes();
}
|
||
// Set difference: the indexes present in *this but not in `config`.
// Fix: bind the other configuration's set by const reference — the original
// copied the whole set before running set_difference.
IndexConfiguration IndexConfiguration::operator-(
    const IndexConfiguration &config) {
  const auto &other_indexes = config.GetIndexes();

  std::set<std::shared_ptr<HypotheticalIndexObject>> result;
  std::set_difference(indexes_.begin(), indexes_.end(), other_indexes.begin(),
                      other_indexes.end(),
                      std::inserter(result, result.end()));
  return IndexConfiguration(result);
}
|
||
// Remove all indexes from this configuration.
void IndexConfiguration::Clear() { indexes_.clear(); }
|
||
//===--------------------------------------------------------------------===// | ||
// IndexObjectPool | ||
//===--------------------------------------------------------------------===// | ||
|
||
// Look up the canonical pooled instance equal to `obj`.
// Returns nullptr when no equal index has been interned yet.
std::shared_ptr<HypotheticalIndexObject> IndexObjectPool::GetIndexObject(
    HypotheticalIndexObject &obj) {
  auto entry = map_.find(obj);
  return (entry == map_.end()) ? nullptr : entry->second;
}
|
||
// Intern `obj` in the pool: return the existing shared instance when an
// equal index is already present, otherwise store a copy and return it.
// Fix: use std::make_shared instead of a raw `new` plus copy-assignment —
// one allocation, no naked owning pointer.
std::shared_ptr<HypotheticalIndexObject> IndexObjectPool::PutIndexObject(
    HypotheticalIndexObject &obj) {
  auto existing = GetIndexObject(obj);
  if (existing != nullptr) return existing;
  auto index_s_ptr = std::make_shared<HypotheticalIndexObject>(obj);
  // Key the map by the pooled copy so future lookups find this instance.
  map_[*index_s_ptr] = index_s_ptr;
  return index_s_ptr;
}
|
||
//===--------------------------------------------------------------------===// | ||
// Workload | ||
//===--------------------------------------------------------------------===// | ||
|
||
/**
 * @brief Build a workload by parsing and binding every query string,
 * keeping only DML statements (INSERT/DELETE/UPDATE/SELECT).
 *
 * @param queries       SQL query strings to analyze.
 * @param database_name database the queries are bound against.
 * @param txn           transaction used for binding.
 */
Workload::Workload(std::vector<std::string> &queries, std::string database_name,
                   concurrency::TransactionContext *txn)
    : database_name(database_name) {
  LOG_TRACE("Initializing workload with %zu queries", queries.size());
  std::unique_ptr<binder::BindNodeVisitor> binder(
      new binder::BindNodeVisitor(txn, database_name));

  // Parse and bind every query. Store the results in the workload vector.
  // Iterate by const reference to avoid copying each query string.
  for (const auto &query : queries) {
    LOG_DEBUG("Query: %s", query.c_str());

    // Own the parse result so it is freed at the end of this iteration.
    auto stmt_list = std::unique_ptr<parser::SQLStatementList>(
        parser::PostgresParser::ParseSQLString(query));
    PELOTON_ASSERT(stmt_list->is_valid);
    // TODO[vamshi]: Only one query for now.
    PELOTON_ASSERT(stmt_list->GetNumStatements() == 1);

    // Transfer the statement out of the list into a shared_ptr, because it
    // will be referenced by multiple objects later.
    auto stmt = stmt_list->PassOutStatement(0);
    auto stmt_shared = std::shared_ptr<parser::SQLStatement>(stmt.release());
    PELOTON_ASSERT(stmt_shared->GetType() != StatementType::INVALID);

    // Bind the query.
    binder->BindNameToNode(stmt_shared.get());

    // Only take the DML queries from the workload.
    switch (stmt_shared->GetType()) {
      case StatementType::INSERT:
      case StatementType::DELETE:
      case StatementType::UPDATE:
      case StatementType::SELECT:
        AddQuery(stmt_shared);
        // Fix: the original had no break here, so every accepted DML
        // statement also fell through into the default branch.
        break;
      default:
        // Ignore other queries.
        // Fix: the original wrote `"Ignoring query: %s" + stmt->GetInfo()
        // .c_str()` — pointer arithmetic instead of a format argument — and
        // dereferenced `stmt` after it had been release()'d above. Use the
        // still-owning stmt_shared and proper printf-style arguments.
        LOG_TRACE("Ignoring query: %s", stmt_shared->GetInfo().c_str());
        break;
    }
  }
}
|
||
} // namespace brain | ||
} // namespace peloton |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be addressed, at least by adding a wrapper function that takes the database name as an argument. Handling multiple databases is important, especially after the catalog refactor.
-- Tianyu, Justin & Tianyi