diff --git a/src/expression/abstract_expression.cpp b/src/expression/abstract_expression.cpp index b5bcd936f87..d09e297378d 100644 --- a/src/expression/abstract_expression.cpp +++ b/src/expression/abstract_expression.cpp @@ -113,6 +113,13 @@ bool AbstractExpression::operator==(const AbstractExpression &rhs) const { if (exp_type_ != rhs.exp_type_ || children_.size() != rhs.children_.size()) return false; + // TODO: Try sorting the children + // TODO: Extend this to other comparison predicates + if (exp_type_ == ExpressionType::COMPARE_EQUAL && children_.size() == 2 && rhs.children_.size() == 2) { + return (*children_[0] == *rhs.children_[0] && *children_[1] == *rhs.children_[1]) || + (*children_[0] == *rhs.children_[1] && *children_[1] == *rhs.children_[0]); + } + for (unsigned i = 0; i < children_.size(); i++) { if (*children_[i].get() != *rhs.children_[i].get()) return false; } diff --git a/src/include/optimizer/cost_calculator.h b/src/include/optimizer/cost_calculator.h deleted file mode 100644 index 8ef40330d6b..00000000000 --- a/src/include/optimizer/cost_calculator.h +++ /dev/null @@ -1,63 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// cost_and_stats_calculator.h -// -// Identification: src/include/optimizer/cost_calculator.h -// -// Copyright (c) 2015-16, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "optimizer/operator_visitor.h" - -namespace peloton { -namespace optimizer { - -class Memo; -// Derive cost for a physical group expressionh -class CostCalculator : public OperatorVisitor { - public: - double CalculateCost(GroupExpression *gexpr, Memo *memo, - concurrency::TransactionContext *txn); - - void Visit(const DummyScan *) override; - void Visit(const PhysicalSeqScan *) override; - void Visit(const PhysicalIndexScan *) override; - void Visit(const ExternalFileScan *) override; - 
void Visit(const QueryDerivedScan *) override; - void Visit(const PhysicalOrderBy *) override; - void Visit(const PhysicalLimit *) override; - void Visit(const PhysicalInnerNLJoin *) override; - void Visit(const PhysicalLeftNLJoin *) override; - void Visit(const PhysicalRightNLJoin *) override; - void Visit(const PhysicalOuterNLJoin *) override; - void Visit(const PhysicalInnerHashJoin *) override; - void Visit(const PhysicalLeftHashJoin *) override; - void Visit(const PhysicalRightHashJoin *) override; - void Visit(const PhysicalOuterHashJoin *) override; - void Visit(const PhysicalInsert *) override; - void Visit(const PhysicalInsertSelect *) override; - void Visit(const PhysicalDelete *) override; - void Visit(const PhysicalUpdate *) override; - void Visit(const PhysicalHashGroupBy *) override; - void Visit(const PhysicalSortGroupBy *) override; - void Visit(const PhysicalDistinct *) override; - void Visit(const PhysicalAggregate *) override; - - private: - double HashCost(); - double SortCost(); - double GroupByCost(); - - GroupExpression *gexpr_; - Memo *memo_; - concurrency::TransactionContext *txn_; - double output_cost_ = 0; -}; - -} // namespace optimizer -} // namespace peloton diff --git a/src/include/optimizer/cost_model/abstract_cost_model.h b/src/include/optimizer/cost_model/abstract_cost_model.h new file mode 100644 index 00000000000..95a593f04d9 --- /dev/null +++ b/src/include/optimizer/cost_model/abstract_cost_model.h @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// abstract_cost_model.h +// +// Identification: src/include/optimizer/cost_model/abstract_cost_model.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "optimizer/operator_visitor.h" + +namespace peloton { +namespace optimizer { + +class Memo; + +// Default cost when cost model 
cannot compute correct cost. +static constexpr double DEFAULT_COST = 1; + +// Estimate the cost of processing each row during a query. +static constexpr double DEFAULT_TUPLE_COST = 0.01; + +// Estimate the cost of processing each index entry during an index scan. +static constexpr double DEFAULT_INDEX_TUPLE_COST = 0.005; + +// Estimate the cost of processing each operator or function executed during a +// query. +static constexpr double DEFAULT_OPERATOR_COST = 0.0025; + +class AbstractCostModel : public OperatorVisitor { + public: + virtual double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) = 0; +}; + +} // namespace optimizer +} // namespace peloton + diff --git a/src/include/optimizer/cost_model/default_cost_model.h b/src/include/optimizer/cost_model/default_cost_model.h new file mode 100644 index 00000000000..a92cb091db7 --- /dev/null +++ b/src/include/optimizer/cost_model/default_cost_model.h @@ -0,0 +1,161 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// default_cost_model.h +// +// Identification: src/include/optimizer/cost_model/default_cost_model.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "optimizer/cost_model/abstract_cost_model.h" +#include "expression/tuple_value_expression.h" +#include "catalog/table_catalog.h" +#include "optimizer/memo.h" +#include "optimizer/operators.h" +#include "optimizer/stats/stats_storage.h" +#include "optimizer/stats/table_stats.h" + +namespace peloton { +namespace optimizer { + +class Memo; +// Derive cost for a physical group expression +class DefaultCostModel : public AbstractCostModel { + public: + DefaultCostModel(){}; + + double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) { + gexpr_ = gexpr; + memo_ = memo; + txn_ = txn; + 
gexpr_->Op().Accept(this); + return output_cost_; + } + + void Visit(UNUSED_ATTRIBUTE const DummyScan *op) { + output_cost_ = 0.f; + } + void Visit(const PhysicalSeqScan *op) { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0) { + output_cost_ = 1.f; + return; + } + output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) { + output_cost_ = 0.f; + return; + } + // Index search cost + scan cost + output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST + + memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * + DEFAULT_TUPLE_COST; + } + + void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) { + output_cost_ = 0.f; + } + + void Visit(const PhysicalOrderBy *) { output_cost_ = SortCost(); } + + void Visit(const PhysicalLimit *op) { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + output_cost_ = + std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) { + auto left_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + auto right_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); + + output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) { + auto left_child_rows = + 
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + auto right_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); + // TODO(boweic): Build (left) table should have different cost to probe table + output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {} + void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) { + // TODO(boweic): Integrate hash in groupby may cause us to miss the + // opportunity to further optimize some query where the child output is + // already hashed by the GroupBy key, we'll do a hash anyway + output_cost_ = HashCost() + GroupByCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) { + // Sort group by does not sort the tuples, it requires input columns to be + // sorted + output_cost_ = GroupByCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) { + output_cost_ = HashCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) { + // TODO(boweic): Ditto, separate groupby operator and implementation(e.g. 
+ // hash, sort) may enable opportunity for further optimization + output_cost_ = HashCost() + GroupByCost(); + } + + private: + + double HashCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; + } + + double SortCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + if (child_num_rows == 0) { + return 1.0f; + } + // O(tuple * log(tuple)) + return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST; + } + + double GroupByCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; + } + + GroupExpression *gexpr_; + Memo *memo_; + concurrency::TransactionContext *txn_; + double output_cost_ = 0; +}; + +} // namespace optimizer +} // namespace peloton diff --git a/src/include/optimizer/cost_model/postgres_cost_model.h b/src/include/optimizer/cost_model/postgres_cost_model.h new file mode 100644 index 00000000000..2632a247a39 --- /dev/null +++ b/src/include/optimizer/cost_model/postgres_cost_model.h @@ -0,0 +1,282 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// postgres_cost_model.h +// +// Identification: src/include/optimizer/cost_model/postgres_cost_model.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + + +#pragma once + +#include "optimizer/cost_model/abstract_cost_model.h" +#include "expression/tuple_value_expression.h" +#include "catalog/table_catalog.h" +#include "optimizer/memo.h" +#include "optimizer/operators.h" +#include "optimizer/stats/stats_storage.h" +#include "optimizer/stats/table_stats.h" + +// TODO: This is not fully reflective of the postgres cost model. 
Currently we +// are attempting +// to emulate their hash join cost model + +namespace peloton { +namespace optimizer { + +class Memo; +// Derive cost for a physical group expression +class PostgresCostModel : public AbstractCostModel { + public: + PostgresCostModel(){}; + + double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) override { + gexpr_ = gexpr; + memo_ = memo; + txn_ = txn; + gexpr_->Op().Accept(this); + return output_cost_; + }; + + void Visit(UNUSED_ATTRIBUTE const DummyScan *op) override { + output_cost_ = 0.f; + } + + void Visit(const PhysicalSeqScan *op) override { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0) { // We have no table stats + output_cost_ = 1.f; + return; + } + output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST; + } + + void Visit(const PhysicalIndexScan *op) override { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) { + output_cost_ = 0.f; + return; + } + // Index search cost + scan cost + output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST + + memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * + DEFAULT_TUPLE_COST; + } + + void Visit( + UNUSED_ATTRIBUTE const QueryDerivedScan *op) override { + output_cost_ = 0.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalOrderBy *) override { + output_cost_ = SortCost(); + } + + void Visit(const PhysicalLimit *op) override { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + output_cost_ = + std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) override { + auto 
left_child_rows = + std::max(0, memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows()); + auto right_child_rows = + std::max(0, memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows()); + output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST; + LOG_DEBUG("----------NL Join Output--------"); + LOG_DEBUG("Left: %s | Rows: %d", GetTableName(op->left_keys).c_str(), left_child_rows); + LOG_DEBUG("Right: %s | Rows: %d", GetTableName(op->right_keys).c_str(), right_child_rows); + LOG_DEBUG("Cost: %f", output_cost_); + LOG_DEBUG("--------------------------------"); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) override {} + + /* The main idea of this cost estimate is that the comparisons done is the outer + * table (probe side) times tuples + * bucket. Thus, we estimate attempt to estimate the bucket size in a similar + * manner to postgres. 
+ */ + void Visit(const PhysicalInnerHashJoin *op) override { + auto bucket_size_frac = 1.0; + + // Assuming you build table on right relation + if (IsBaseTable(op->right_keys)) { + auto right_group = memo_->GetGroupByID(gexpr_->GetChildGroupId(1)); + + // Iterate over all keys, take the largest fraction (smallest bucket sizes) + // TODO: Add more estimate adjustments from postgres + for (auto &expr : op->right_keys) { + auto tv_expr = + reinterpret_cast(expr.get()); + auto stats = right_group->GetStats(tv_expr->GetColFullName()); + + if (stats == nullptr) continue; + + // TODO: A new hash join PR uses 256 as default so we're using this for + // now and hardcoding it here + auto num_buckets = 256.0; + + double frac_est; + + if (stats->cardinality > num_buckets) { + frac_est = 1.0 / num_buckets; + } else { + frac_est = 1.0 / std::max(stats->cardinality, 1.0); + } + + /* Average frequency of values, taken from Postgres */ + auto avgfreq = (1.0 - stats->frac_null) / stats->cardinality; + + // Adjust for skew (Highest freq / avg freq) + if (avgfreq > 0.0 && !stats->most_common_vals.empty() && + !stats->most_common_freqs.empty() && + (stats->most_common_freqs[0] / stats->num_rows) > avgfreq) { + frac_est *= (stats->most_common_freqs[0] / stats->num_rows) / avgfreq; + } + + // Clamp the bucket frac estimate (taken from postgres) + if (frac_est < 1.0e-6) { + frac_est = 1.0e-6; + } else if (frac_est > 1.0) { + frac_est = 1.0; + } + bucket_size_frac = std::min(bucket_size_frac, frac_est); + LOG_DEBUG("Bucket_size_frac: %f", bucket_size_frac); + } + } + + auto left_child_rows = + std::max(0, memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows()); + auto right_child_rows = + std::max(0, memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows()); + + + output_cost_ = (left_child_rows + (right_child_rows * bucket_size_frac)) * DEFAULT_TUPLE_COST; + LOG_DEBUG("---------Hash Join Output-------"); + LOG_DEBUG("Left: %s | Rows: %d", GetTableName(op->left_keys).c_str(), 
left_child_rows); + LOG_DEBUG("Right: %s | Rows: %d", GetTableName(op->right_keys).c_str(), right_child_rows); + LOG_DEBUG("Cost: %f", output_cost_); + LOG_DEBUG("--------------------------------"); + + + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) override{} + void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) override {} + + void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) override { + // TODO(boweic): Integrate hash in groupby may cause us to miss the + // opportunity to further optimize some query where the child output is + // already hashed by the GroupBy key, we'll do a hash anyway + output_cost_ = HashCost() + GroupByCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) override { + // Sort group by does not sort the tuples, it requires input columns to be + // sorted + output_cost_ = GroupByCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) override { + output_cost_ = HashCost(); + } + void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) override { + // TODO(boweic): Ditto, separate groupby operator and implementation(e.g. 
+ // hash, sort) may enable opportunity for further optimization + output_cost_ = HashCost() + GroupByCost(); + } + + private: + double HashCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; + } + + double SortCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + if (child_num_rows == 0) { + return 1.0f; + } + // O(tuple * log(tuple)) + return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST; + } + + double GroupByCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; + } + + + GroupExpression *gexpr_; + Memo *memo_; + concurrency::TransactionContext *txn_; + double output_cost_ = 0; + + /* Checks if keys for a join child only reference one table */ + bool IsBaseTable( + const std::vector> &keys) { + std::unordered_set seen_set; + + for (auto &expr : keys) { + if (expr->GetExpressionType() != ExpressionType::VALUE_TUPLE) continue; + + auto tv_expr = + reinterpret_cast(expr.get()); + seen_set.insert(tv_expr->GetTableName()); + } + return seen_set.size() == 1; + } + + // Returns string of tables, used for debugging + std::string GetTableName(const std::vector> &keys) { + std::unordered_set table_set; + for (auto &expr : keys) { + if (expr->GetExpressionType() != ExpressionType::VALUE_TUPLE) continue; + + auto tv_expr = + reinterpret_cast(expr.get()); + table_set.insert(tv_expr->GetTableName()); + } + + std::stringstream stream; + if (table_set.size() == 1) { + stream << *table_set.begin(); + } else { + for (auto table : table_set) { + if (!stream.str().empty()) { + stream << ","; + } + stream << table; + } + } + + return stream.str(); + } + +}; + +} // namespace optimizer +} // namespace peloton \ No newline at end of file diff --git a/src/include/optimizer/cost_model/trivial_cost_model.h 
b/src/include/optimizer/cost_model/trivial_cost_model.h new file mode 100644 index 00000000000..2c5994ee728 --- /dev/null +++ b/src/include/optimizer/cost_model/trivial_cost_model.h @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// trivial_cost_model.h +// +// Identification: src/include/optimizer/cost_model/trivial_cost_model.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + + +#pragma once + +#include "optimizer/cost_model/abstract_cost_model.h" +#include "abstract_cost_model.h" + +#include "expression/tuple_value_expression.h" +#include "catalog/table_catalog.h" +#include "optimizer/memo.h" +#include "optimizer/operators.h" +#include "optimizer/stats/stats_storage.h" +#include "optimizer/stats/table_stats.h" + +// This cost model is meant to just be a trivial cost model. The decisions it makes are as follows +// * Always choose index scan (cost of 0) over sequential scan (cost of 1) +// * Choose NL if left rows is a single record (for single record lookup queries), else choose hash join +// * Choose hash group by over sort group by + +namespace peloton { +namespace optimizer { + +class Memo; +class TrivialCostModel : public AbstractCostModel { + public: + TrivialCostModel(){}; + + double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) override { + gexpr_ = gexpr; + memo_ = memo; + txn_ = txn; + gexpr_->Op().Accept(this); + return output_cost_; + }; + + void Visit(UNUSED_ATTRIBUTE const DummyScan *op) override { + output_cost_ = 0.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalSeqScan *op) override { + output_cost_ = 1.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) override { + output_cost_ = 0.f; + } + + void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) override { + output_cost_ = 0.f; + } + + void 
Visit(UNUSED_ATTRIBUTE const PhysicalOrderBy *) override { + output_cost_ = 0.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalLimit *op) override { + output_cost_ = 0.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) override { + auto left_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + if (left_child_rows == 1) { + output_cost_ = 0.f; + } else { + output_cost_ = 2.f; + } + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) override {} + + void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) override { + output_cost_ = 1.f; + } + + void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) override {} + void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) override{} + void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) override {} + + void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) override { + output_cost_ = 0.f; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) override { + output_cost_ = 1.f; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) override { + output_cost_ = 0.f; + } + void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) override { + output_cost_ = 0.f; + } + + private: + GroupExpression *gexpr_; + Memo *memo_; + concurrency::TransactionContext *txn_; + double output_cost_ = 0; +}; + +} // namespace optimizer +} // namespace peloton \ No newline at end of file diff --git a/src/include/optimizer/group_expression.h b/src/include/optimizer/group_expression.h index f3ce5580251..303ebaf036e 100644 --- 
a/src/include/optimizer/group_expression.h +++ b/src/include/optimizer/group_expression.h @@ -13,7 +13,7 @@ #pragma once #include "optimizer/operator_node.h" -#include "optimizer/stats.h" +#include "optimizer/stats/stats.h" #include "optimizer/util.h" #include "optimizer/property_set.h" #include "common/internal_types.h" diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index b10d01c3acd..ebf82d625b4 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -15,6 +15,7 @@ #include #include "optimizer/abstract_optimizer.h" +#include "optimizer/cost_model/abstract_cost_model.h" #include "optimizer/property_set.h" #include "optimizer/optimizer_metadata.h" @@ -53,6 +54,8 @@ struct QueryInfo { std::shared_ptr physical_props; }; +enum CostModels {DEFAULT, POSTGRES, TRIVIAL}; + //===--------------------------------------------------------------------===// // Optimizer //===--------------------------------------------------------------------===// @@ -71,7 +74,7 @@ class Optimizer : public AbstractOptimizer { Optimizer(Optimizer &&) = delete; Optimizer &operator=(Optimizer &&) = delete; - Optimizer(); + Optimizer(const CostModels cost_model = CostModels::DEFAULT); std::shared_ptr BuildPelotonPlanTree( const std::unique_ptr &parse_tree_list, @@ -164,6 +167,7 @@ class Optimizer : public AbstractOptimizer { ////////////////////////////////////////////////////////////////////////////// /// Metadata OptimizerMetadata metadata_; + std::unique_ptr cost_model_; }; } // namespace optimizer diff --git a/src/include/optimizer/optimizer_metadata.h b/src/include/optimizer/optimizer_metadata.h index 8606d8a3dae..3f33e3ee8b1 100644 --- a/src/include/optimizer/optimizer_metadata.h +++ b/src/include/optimizer/optimizer_metadata.h @@ -2,9 +2,9 @@ // // Peloton // -// optimizer.h +// optimizer_metadata.h // -// Identification: src/include/optimizer/optimizer.h +// Identification: src/include/optimizer/optimizer_metadata.h // // 
Copyright (c) 2015-2018, Carnegie Mellon University Database Group // @@ -13,6 +13,7 @@ #pragma once #include "common/timer.h" +#include "optimizer/cost_model/default_cost_model.h" #include "optimizer/memo.h" #include "optimizer/group_expression.h" #include "optimizer/rule.h" @@ -30,14 +31,16 @@ class RuleSet; class OptimizerMetadata { public: - OptimizerMetadata() - : timeout_limit(settings::SettingsManager::GetInt( - settings::SettingId::task_execution_timeout)), + + OptimizerMetadata(std::unique_ptr cost_model) + : cost_model(std::move(cost_model)), timeout_limit(settings::SettingsManager::GetInt( + settings::SettingId::task_execution_timeout)), timer(Timer()) {} Memo memo; RuleSet rule_set; OptimizerTaskPool *task_pool; + std::unique_ptr cost_model; catalog::CatalogCache *catalog_cache; unsigned int timeout_limit; Timer timer; diff --git a/src/include/optimizer/child_stats_deriver.h b/src/include/optimizer/stats/child_stats_deriver.h similarity index 100% rename from src/include/optimizer/child_stats_deriver.h rename to src/include/optimizer/stats/child_stats_deriver.h diff --git a/src/include/optimizer/stats/column_stats.h b/src/include/optimizer/stats/column_stats.h index e3851688425..c464b0615d9 100644 --- a/src/include/optimizer/stats/column_stats.h +++ b/src/include/optimizer/stats/column_stats.h @@ -1,106 +1,108 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// column_stats.h -// -// Identification: src/include/optimizer/stats/column_stats.h -// -// Copyright (c) 2015-16, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#include "common/macros.h" -#include "common/internal_types.h" - -namespace peloton { -namespace optimizer { - -//===--------------------------------------------------------------------===// -// ColumnStats 
-//===--------------------------------------------------------------------===// -class ColumnStats { - public: - ColumnStats(oid_t database_id, oid_t table_id, oid_t column_id, - const std::string column_name, bool has_index, size_t num_rows, - double cardinality, double frac_null, - std::vector most_common_vals, - std::vector most_common_freqs, - std::vector histogram_bounds) - : database_id(database_id), - table_id(table_id), - column_id(column_id), - column_name(column_name), - has_index(has_index), - num_rows(num_rows), - cardinality(cardinality), - frac_null(frac_null), - most_common_vals(most_common_vals), - most_common_freqs(most_common_freqs), - histogram_bounds(histogram_bounds), - is_basetable{true} {} - - oid_t database_id; - oid_t table_id; - oid_t column_id; - std::string column_name; - bool has_index; - - size_t num_rows; - double cardinality; - double frac_null; - std::vector most_common_vals; - std::vector most_common_freqs; - std::vector histogram_bounds; - - bool is_basetable; - - std::string ToString() { - std::ostringstream os; - os << "column_id :" << column_id << "\n" - << "column_name :" << column_name << "\n" - << "num_rows :" << num_rows << "\n"; - return os.str(); - } - - // vector of double to comma seperated string - std::string VectorToString(const std::vector& vec) { - std::ostringstream os; - for (auto v : vec) { - os << v << ", "; - } - std::string res = os.str(); - if (res.size() > 0) { - res.pop_back(); - } - return res; - } - - std::string ToCSV() { - std::ostringstream os; - os << column_id << "|" << column_name << "|" << num_rows << "|" << has_index - << "|" << cardinality << "|" << frac_null << "|" - << VectorToString(most_common_vals) << "|" - << VectorToString(most_common_freqs) << "|" - << VectorToString(histogram_bounds) << "\n"; - return os.str(); - } - - void UpdateJoinStats(size_t table_num_rows, size_t sample_size, - size_t sample_card) { - num_rows = table_num_rows; - - // FIX ME: for now using samples's cardinality * 
samples size / number of
-    // rows to ensure the same selectivity among samples and the whole table
-    size_t estimated_card =
-        (size_t)(sample_card * num_rows / (double)sample_size);
-    cardinality = cardinality < estimated_card ? cardinality : estimated_card;
-  }
-};
-
-}  // namespace optimizer
-}  // namespace peloton
+//===----------------------------------------------------------------------===//
+//
+//                         Peloton
+//
+// column_stats.h
+//
+// Identification: src/include/optimizer/stats/column_stats.h
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common/macros.h"
+#include "common/internal_types.h"
+
+namespace peloton {
+namespace optimizer {
+
+//===--------------------------------------------------------------------===//
+// ColumnStats
+//
+// Plain holder for the statistics of a single column. All members are public
+// and are stored exactly as handed to the constructor; is_basetable always
+// starts out as true.
+//===--------------------------------------------------------------------===//
+class ColumnStats {
+ public:
+  // Stores every argument in the member of the same name. The three vectors
+  // are taken by value, so they are moved into place instead of being copied
+  // a second time.
+  ColumnStats(oid_t database_id, oid_t table_id, oid_t column_id,
+              const std::string column_name, bool has_index, size_t num_rows,
+              double cardinality, double frac_null,
+              std::vector<double> most_common_vals,
+              std::vector<double> most_common_freqs,
+              std::vector<double> histogram_bounds)
+      : database_id(database_id),
+        table_id(table_id),
+        column_id(column_id),
+        column_name(column_name),
+        has_index(has_index),
+        num_rows(num_rows),
+        cardinality(cardinality),
+        frac_null(frac_null),
+        most_common_vals(std::move(most_common_vals)),
+        most_common_freqs(std::move(most_common_freqs)),
+        histogram_bounds(std::move(histogram_bounds)),
+        is_basetable{true} {}
+
+  oid_t database_id;
+  oid_t table_id;
+  oid_t column_id;
+  std::string column_name;
+  bool has_index;
+
+  size_t num_rows;
+  double cardinality;
+  double frac_null;
+  std::vector<double> most_common_vals;
+  std::vector<double> most_common_freqs;
+  std::vector<double> histogram_bounds;
+
+  // Set to true by the constructor; nothing in this class changes it.
+  bool is_basetable;
+
+  // Returns the scalar statistics as text, one labelled field per line.
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "column_id :" << column_id << "\n"
+       << "column_name :" << column_name << "\n"
+       << "num_rows :" << num_rows << "\n"
+       << "cardinality: " << cardinality << "\n"
+       << "frac_null: " << frac_null << "\n";
+    return os.str();
+  }
+
+  // Joins the values with ", ". The separator is written *before* every
+  // element except the first, so the result never ends in a dangling comma
+  // (the previous version only trimmed the trailing space). An empty vector
+  // yields an empty string.
+  std::string VectorToString(const std::vector<double> &vec) const {
+    std::ostringstream os;
+    const char *separator = "";
+    for (const double v : vec) {
+      os << separator << v;
+      separator = ", ";
+    }
+    return os.str();
+  }
+
+  // Returns one '|'-delimited record terminated by '\n'. The three vectors
+  // are rendered with VectorToString().
+  std::string ToCSV() const {
+    std::ostringstream os;
+    os << column_id << "|" << column_name << "|" << num_rows << "|" << has_index
+       << "|" << cardinality << "|" << frac_null << "|"
+       << VectorToString(most_common_vals) << "|"
+       << VectorToString(most_common_freqs) << "|"
+       << VectorToString(histogram_bounds) << "\n";
+    return os.str();
+  }
+
+  // Overwrites num_rows with table_num_rows and lowers cardinality to the
+  // estimate derived from the sample; cardinality is never raised.
+  void UpdateJoinStats(size_t table_num_rows, size_t sample_size,
+                       size_t sample_card) {
+    num_rows = table_num_rows;
+
+    // With no samples there is nothing to scale. Dividing by zero below would
+    // produce inf/NaN, and converting that to size_t is undefined behaviour,
+    // so leave the cardinality untouched.
+    if (sample_size == 0) {
+      return;
+    }
+
+    // FIXME: for now the sample's cardinality is scaled by
+    // num_rows / sample_size so that the samples and the whole table keep the
+    // same selectivity.
+    size_t estimated_card = static_cast<size_t>(
+        sample_card * num_rows / static_cast<double>(sample_size));
+    cardinality = cardinality < estimated_card ? cardinality : estimated_card;
+  }
+};
+
+}  // namespace optimizer
+}  // namespace peloton
diff --git a/src/include/optimizer/stats/cost.h b/src/include/optimizer/stats/cost.h
deleted file mode 100644
index 6e887f1fb24..00000000000
--- a/src/include/optimizer/stats/cost.h
+++ /dev/null
@@ -1,192 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                         Peloton
-//
-// cost.h
-//
-// Identification: src/include/optimizer/stats/cost.h
-//
-// Copyright (c) 2015-16, Carnegie Mellon University Database Group
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include
-#include
-#include
-
-#include "common/logger.h"
-#include "common/macros.h"
-#include "table_stats.h"
-#include "value_condition.h"
-
-namespace peloton {
-namespace optimizer {
-// Default cost when cost model cannot compute correct cost.
-static constexpr double DEFAULT_COST = 1;
-
-// Estimate the cost of processing each row during a query.
-static constexpr double DEFAULT_TUPLE_COST = 0.01;
-
-// Estimate the cost of processing each index entry during an index scan.
-static constexpr double DEFAULT_INDEX_TUPLE_COST = 0.005;
-
-// Estimate the cost of processing each operator or function executed during a
-// query.
-static constexpr double DEFAULT_OPERATOR_COST = 0.0025; - -//===----------------------------------------------------------------------===// -// Cost -//===----------------------------------------------------------------------===// -class Cost { - // private: - // // Default cost of sorting n elements - // static double default_sorting_cost(size_t n) { return n * std::log2(n); } - // - // // Default number of index tuple to access for n elements - // static double default_index_height(size_t n) { return std::log2(n); } - // - // public: - /* - * Cost of seq scan for the whole table - */ - // static inline double NoConditionSeqScanCost( - // const std::shared_ptr& input_stats) { - // return input_stats->num_rows * DEFAULT_TUPLE_COST; - // } - /* - * Cost of scan for single condition. For scan with multiple conditions, - * you should use CombineConjunctionStats to combine output_stats. - */ - // static double SingleConditionSeqScanCost( - // const std::shared_ptr& input_stats, - // const ValueCondition& condition, - // std::shared_ptr& output_stats); - // - // static double SingleConditionIndexScanCost( - // const std::shared_ptr& input_stats, - // const ValueCondition& condition, - // std::shared_ptr& output_stats); - // - /* - * Combine two stats with conjunction clause. - * ExpressionType type can be CONJUNCTION_AND / CONJUNCTION_OR - */ - // static void CombineConjunctionStats( - // const std::shared_ptr& lhs, - // const std::shared_ptr& rhs, const size_t num_rows, - // const ExpressionType type, std::shared_ptr& output_stats); - // - /* - * Cost of GROUP BY. - */ - // static double SortGroupByCost(const std::shared_ptr& input_stats, - // std::vector columns, - // std::shared_ptr& output_stats); - // - // static double HashGroupByCost(const std::shared_ptr& input_stats, - // std::vector columns, - // std::shared_ptr& output_stats); - // - /* - * Aggregation (SUM, COUNT, etc) cost = cost of scan input table. - * Note it does not update table stats. 
- */ - // static inline double AggregateCost( - // const std::shared_ptr& input_stats) { - // PELOTON_ASSERT(input_stats != nullptr); - // return input_stats->num_rows * DEFAULT_TUPLE_COST; - // } - // - /* - * Cost of DISTINCT = cost of building hash table. - */ - // static double DistinctCost(const std::shared_ptr& input_stats, - // std::string column_name, - // std::shared_ptr& output_stats); - // - /* - * Cost of projection = full table scan. - */ - // static double ProjectCost(const std::shared_ptr& input_stats, - // std::vector columns, - // std::shared_ptr& output_stats); - // - /* - * Cost of LIMIT = limit * tuple_cost - */ - // static double LimitCost(const std::shared_ptr& input_stats, - // size_t limit, - // std::shared_ptr& output_stats); - // - /* - * Cost of ORDER BY = cost of sorting or 1 if column has index. - * Note right only first column is taken into consideration. - */ - // static double OrderByCost(const std::shared_ptr& input_stats, - // const std::vector& columns, - // const std::vector& orders, - // std::shared_ptr& output_stats); - // - /* - * Join - */ - // static double NLJoinCost( - // const std::shared_ptr& left_input_stats, - // const std::shared_ptr& right_input_stats, - // std::shared_ptr& output_stats, - // const std::shared_ptr predicate, - // JoinType join_type, bool enable_sampling = false); - // - // static double HashJoinCost( - // const std::shared_ptr& left_input_stats, - // const std::shared_ptr& right_input_stats, - // std::shared_ptr& output_stats, - // const std::shared_ptr predicate, - // JoinType join_type, bool enable_sampling = false); - // - // static std::vector GenerateJoinSamples( - // const std::shared_ptr& left_input_stats, - // const std::shared_ptr& right_input_stats, - // std::shared_ptr& output_stats, - // const std::string& left_column_name, const std::string& right_column_name, - // bool& enable_sampling); - // - // static bool UpdateJoinOutputStatsWithSampling( - // const std::shared_ptr& 
left_input_stats, - // const std::shared_ptr& right_input_stats, - // std::shared_ptr& output_stats, - // const std::string& left_column_name, - // const std::string& right_column_name); - /* - * Update output statistics given left, right input statistics and predicate - * for join operators - */ - // static void UpdateJoinOutputStats( - // const std::shared_ptr& left_input_stats, - // const std::shared_ptr& right_input_stats, - // std::shared_ptr& output_stats, - // const std::shared_ptr predicate, - // JoinType join_type, bool enable_sampling); - // - /* - * Update output statistics given input table and one condition. - * Updated stats will be placed in output_stats. - */ - // static void UpdateConditionStats( - // const std::shared_ptr& input_stats, - // const ValueCondition& condition, - // std::shared_ptr& output_stats); - // - /* - * Return estimated number of rows after group by operation. - * This function is used by HashGroupBy and SortGroupBy. - */ - // static size_t GetEstimatedGroupByRows( - // const std::shared_ptr& input_stats, - // std::vector& columns); -}; - -} // namespace optimizer -} // namespace peloton diff --git a/src/include/optimizer/stats.h b/src/include/optimizer/stats/stats.h similarity index 94% rename from src/include/optimizer/stats.h rename to src/include/optimizer/stats/stats.h index 8eb85eed3ed..bcfd8a15098 100644 --- a/src/include/optimizer/stats.h +++ b/src/include/optimizer/stats/stats.h @@ -12,7 +12,7 @@ #pragma once -#include "optimizer/tuple_sample.h" +#include "optimizer/stats/tuple_sample.h" namespace peloton { namespace optimizer { diff --git a/src/include/optimizer/stats_calculator.h b/src/include/optimizer/stats/stats_calculator.h similarity index 100% rename from src/include/optimizer/stats_calculator.h rename to src/include/optimizer/stats/stats_calculator.h diff --git a/src/include/optimizer/stats/stats_util.h b/src/include/optimizer/stats/stats_util.h index bec5974b08d..3dd55bfd284 100644 --- 
a/src/include/optimizer/stats/stats_util.h
+++ b/src/include/optimizer/stats/stats_util.h
@@ -55,7 +55,7 @@ class StatsUtil {
       case type::TypeId::VARCHAR:
       case type::TypeId::VARBINARY: {
         const char* key = value.GetData();
-        MurmurHash3_x64_128(key, (uint64_t)strlen(key), 0, hash);
+        MurmurHash3_x64_128(key, value.GetLength(), 0, hash);
       } break;
       case type::TypeId::BOOLEAN:
       case type::TypeId::TINYINT: {
@@ -90,7 +90,9 @@ class StatsUtil {
         // Hack for other data types.
         std::string value_str = value.ToString();
         const char* key = value_str.c_str();
-        MurmurHash3_x64_128(key, (uint64_t)strlen(key), 0, hash);
+        // key points into value_str, so the byte count must come from that
+        // string; value.GetLength() does not describe this buffer.
+        MurmurHash3_x64_128(key, value_str.length(), 0, hash);
     }
     return hash[0];
   }
diff --git a/src/include/optimizer/stats/table_stats.h b/src/include/optimizer/stats/table_stats.h
index 0017d673658..89a0f6eeeff 100644
--- a/src/include/optimizer/stats/table_stats.h
+++ b/src/include/optimizer/stats/table_stats.h
@@ -16,7 +16,7 @@
 
 #include "common/macros.h"
 #include "index/index.h"
-#include "optimizer/stats.h"
+#include "optimizer/stats/stats.h"
 #include "common/internal_types.h"
 #include "optimizer/stats/tuple_sampler.h"
 
diff --git a/src/include/optimizer/tuple_sample.h b/src/include/optimizer/stats/tuple_sample.h
similarity index 100%
rename from src/include/optimizer/tuple_sample.h
rename to src/include/optimizer/stats/tuple_sample.h
diff --git a/src/optimizer/cost_calculator.cpp b/src/optimizer/cost_calculator.cpp
deleted file mode 100644
index 56cbbecc64e..00000000000
--- a/src/optimizer/cost_calculator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-//                         Peloton
-//
-// cost_and_stats_calculator.h
-//
-// Identification: src/optimizer/cost_calculator.cpp
-//
-// Copyright (c) 2015-16, Carnegie Mellon University Database Group
-//
-//===----------------------------------------------------------------------===//
-
-#include "optimizer/cost_calculator.h"
-
-#include
-
-#include
"catalog/table_catalog.h" -#include "optimizer/memo.h" -#include "optimizer/operators.h" -#include "optimizer/stats/cost.h" -#include "optimizer/stats/stats_storage.h" -#include "optimizer/stats/table_stats.h" - -namespace peloton { -namespace optimizer { - -double CostCalculator::CalculateCost(GroupExpression *gexpr, Memo *memo, - concurrency::TransactionContext *txn) { - gexpr_ = gexpr; - memo_ = memo; - txn_ = txn; - gexpr_->Op().Accept(this); - return output_cost_; -} - -void CostCalculator::Visit(UNUSED_ATTRIBUTE const DummyScan *op) { - output_cost_ = 0.f; -} -void CostCalculator::Visit(const PhysicalSeqScan *op) { - auto table_stats = std::dynamic_pointer_cast( - StatsStorage::GetInstance()->GetTableStats( - op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); - if (table_stats->GetColumnCount() == 0) { - output_cost_ = 1.f; - return; - } - output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST; -} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) { - auto table_stats = std::dynamic_pointer_cast( - StatsStorage::GetInstance()->GetTableStats( - op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); - if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) { - output_cost_ = 0.f; - return; - } - // Index search cost + scan cost - output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST + - memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * - DEFAULT_TUPLE_COST; -} - -void CostCalculator::Visit(UNUSED_ATTRIBUTE const ExternalFileScan *) { - output_cost_ = 0.0; -} - -void CostCalculator::Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) { - output_cost_ = 0.f; -} - -void CostCalculator::Visit(const PhysicalOrderBy *) { SortCost(); } - -void CostCalculator::Visit(const PhysicalLimit *op) { - auto child_num_rows = - memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); - - output_cost_ = - std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST; -} 
-void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) { - auto left_child_rows = - memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); - auto right_child_rows = - memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); - - output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST; -} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) { - auto left_child_rows = - memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); - auto right_child_rows = - memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); - // TODO(boweic): Build (left) table should have different cost to probe table - output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST; -} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) { - // TODO(boweic): Integrate hash in groupby may cause us to miss the - // opportunity to further optimize some query where the child output is - // already hashed by the GroupBy key, we'll do a hash anyway - output_cost_ = HashCost() + GroupByCost(); -} -void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) { - // Sort group by does not sort the tuples, it 
requires input columns to be
-  // sorted
-  output_cost_ = GroupByCost();
-}
-void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) {
-  output_cost_ = HashCost();
-}
-void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) {
-  // TODO(boweic): Ditto, separate groupby operator and implementation(e.g.
-  // hash, sort) may enable opportunity for further optimization
-  output_cost_ = HashCost() + GroupByCost();
-}
-
-double CostCalculator::HashCost() {
-  auto child_num_rows =
-      memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
-  // O(tuple)
-  return child_num_rows * DEFAULT_TUPLE_COST;
-}
-
-double CostCalculator::SortCost() {
-  auto child_num_rows =
-      memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
-
-  if (child_num_rows == 0) {
-    return 1.0f;
-  }
-  // O(tuple * log(tuple))
-  return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST;
-}
-
-double CostCalculator::GroupByCost() {
-  auto child_num_rows =
-      memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
-  // O(tuple)
-  return child_num_rows * DEFAULT_TUPLE_COST;
-}
-}  // namespace optimizer
-}  // namespace peloton
diff --git a/src/optimizer/group_expression.cpp b/src/optimizer/group_expression.cpp
index b1f76b5b3c7..498c949b583 100644
--- a/src/optimizer/group_expression.cpp
+++ b/src/optimizer/group_expression.cpp
@@ -85,19 +85,7 @@ hash_t GroupExpression::Hash() const {
 }
 
 bool GroupExpression::operator==(const GroupExpression &r) {
-  bool eq = (op == r.Op());
-
-  auto left_groups = child_groups;
-  auto right_groups = r.child_groups;
-
-  std::sort(left_groups.begin(), left_groups.end());
-
-  std::sort(right_groups.begin(), right_groups.end());
-  for (size_t i = 0; i < left_groups.size(); ++i) {
-    eq = eq && (left_groups[i] == right_groups[i]);
-  }
-
-  return eq;
+  return (op == r.Op()) && (child_groups == r.child_groups);  // equal iff same operator and same child groups; children are compared as stored, without the sorting the old code did
 }
 
 void GroupExpression::SetRuleExplored(Rule *rule) {
diff --git a/src/optimizer/memo.cpp
b/src/optimizer/memo.cpp
index 2a6cc49bf4c..ca68a52c1d0 100644
--- a/src/optimizer/memo.cpp
+++ b/src/optimizer/memo.cpp
@@ -13,7 +13,7 @@
 #include "optimizer/group_expression.h"
 #include "optimizer/memo.h"
 #include "optimizer/operators.h"
-#include "optimizer/stats_calculator.h"
+#include "optimizer/stats/stats_calculator.h"
 
 namespace peloton {
 namespace optimizer {
diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp
index 0cbe61b5c9d..83bcadde4de 100644
--- a/src/optimizer/optimizer.cpp
+++ b/src/optimizer/optimizer.cpp
@@ -20,6 +20,9 @@
 
 #include "common/exception.h"
 
+#include "optimizer/cost_model/default_cost_model.h"
+#include "optimizer/cost_model/postgres_cost_model.h"
+#include "optimizer/cost_model/trivial_cost_model.h"
 #include "optimizer/binding.h"
 #include "optimizer/input_column_deriver.h"
 #include "optimizer/operator_visitor.h"
@@ -58,7 +61,25 @@ namespace optimizer {
 //===--------------------------------------------------------------------===//
 // Optimizer
 //===--------------------------------------------------------------------===//
-Optimizer::Optimizer() {}
+Optimizer::Optimizer(const CostModels cost_model) : metadata_(nullptr) {
+  // Replace the placeholder metadata_ with one owning the cost model selected by cost_model; any other enum value throws OptimizerException.
+  switch (cost_model) {
+    case CostModels::DEFAULT: {
+      metadata_ = OptimizerMetadata(std::unique_ptr(new DefaultCostModel));
+      break;
+    }
+    case CostModels::POSTGRES: {
+      metadata_ = OptimizerMetadata(std::unique_ptr(new PostgresCostModel));
+      break;
+    }
+    case CostModels::TRIVIAL: {
+      metadata_ = OptimizerMetadata(std::unique_ptr(new TrivialCostModel));
+      break;
+    }
+    default:
+      throw OptimizerException("Invalid cost model");
+  }
+}
 
 void Optimizer::OptimizeLoop(int root_group_id,
                              std::shared_ptr required_props) {
@@ -136,7 +157,9 @@ shared_ptr Optimizer::BuildPelotonPlanTree(
   }
 }
 
-void Optimizer::Reset() { metadata_ = OptimizerMetadata(); }
+void Optimizer::Reset() {  // rebuild metadata_ from scratch, handing the existing cost model over to the new instance
+  metadata_ = OptimizerMetadata(std::move(metadata_.cost_model));
+}
 
 unique_ptr Optimizer::HandleDDLStatement(
     parser::SQLStatement
*tree, bool &is_ddl_stmt, diff --git a/src/optimizer/optimizer_task.cpp b/src/optimizer/optimizer_task.cpp index 87f4809d461..e1cfac5643d 100644 --- a/src/optimizer/optimizer_task.cpp +++ b/src/optimizer/optimizer_task.cpp @@ -16,9 +16,8 @@ #include "optimizer/optimizer_metadata.h" #include "optimizer/binding.h" #include "optimizer/child_property_deriver.h" -#include "optimizer/cost_calculator.h" -#include "optimizer/stats_calculator.h" -#include "optimizer/child_stats_deriver.h" +#include "optimizer/stats/stats_calculator.h" +#include "optimizer/stats/child_stats_deriver.h" namespace peloton { namespace optimizer { @@ -41,7 +40,6 @@ void OptimizerTask::ConstructValidRules( if (root_pattern_mismatch || already_explored || child_pattern_mismatch) { continue; } - auto promise = rule->Promise(group_expr, context); if (promise > 0) valid_rules.emplace_back(rule.get(), promise); } @@ -98,7 +96,7 @@ void OptimizeExpression::execute() { GetRuleSet().GetImplementationRules(), valid_rules); std::sort(valid_rules.begin(), valid_rules.end()); - LOG_TRACE("OptimizeExpression::execute() op %d, valid rules : %lu", + LOG_DEBUG("OptimizeExpression::execute() op %d, valid rules : %lu", static_cast(group_expr_->Op().GetType()), valid_rules.size()); // Apply rule for (auto &r : valid_rules) { @@ -171,7 +169,7 @@ void ExploreExpression::execute() { // ApplyRule //===--------------------------------------------------------------------===// void ApplyRule::execute() { - LOG_TRACE("ApplyRule::execute() "); + LOG_TRACE("ApplyRule::execute() for rule: %d", rule_->GetRuleIdx()); if (group_expr_->HasRuleExplored(rule_)) return; GroupExprBindingIterator iterator(GetMemo(), group_expr_, @@ -295,8 +293,7 @@ void OptimizeInputs::execute() { // Compute the cost of the root operator // 1. Collect stats needed and cache them in the group // 2. 
Calculate cost based on children's stats - CostCalculator cost_calculator; - cur_total_cost_ += cost_calculator.CalculateCost( + cur_total_cost_ += context_->metadata->cost_model->CalculateCost( group_expr_, &context_->metadata->memo, context_->metadata->txn); } @@ -369,8 +366,7 @@ void OptimizeInputs::execute() { // Cost the enforced expression auto extended_prop_set = std::make_shared(extended_output_properties); - CostCalculator cost_calculator; - cur_total_cost_ += cost_calculator.CalculateCost( + cur_total_cost_ += context_->metadata->cost_model->CalculateCost( memo_enforced_expr, &context_->metadata->memo, context_->metadata->txn); diff --git a/src/optimizer/child_stats_deriver.cpp b/src/optimizer/stats/child_stats_deriver.cpp similarity index 98% rename from src/optimizer/child_stats_deriver.cpp rename to src/optimizer/stats/child_stats_deriver.cpp index 9c8f650d512..d320547915c 100644 --- a/src/optimizer/child_stats_deriver.cpp +++ b/src/optimizer/stats/child_stats_deriver.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "optimizer/child_stats_deriver.h" +#include "optimizer/stats/child_stats_deriver.h" #include "expression/expression_util.h" #include "expression/tuple_value_expression.h" diff --git a/src/optimizer/stats/cost.cpp b/src/optimizer/stats/cost.cpp deleted file mode 100644 index 15b9c75ae5d..00000000000 --- a/src/optimizer/stats/cost.cpp +++ /dev/null @@ -1,461 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// cost.cpp -// -// Identification: src/optimizer/stats/cost.cpp -// -// Copyright (c) 2015-16, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "optimizer/stats/cost.h" -#include "expression/comparison_expression.h" -#include "expression/tuple_value_expression.h" -#include "optimizer/stats/selectivity.h" -#include 
"type/value.h" - -#include - -namespace peloton { -namespace optimizer { -// -// //===----------------------------------------------------------------------===// -// // Scan -// //===----------------------------------------------------------------------===// -// double Cost::SingleConditionSeqScanCost( -// const std::shared_ptr &input_stats, -// const ValueCondition &condition, -// std::shared_ptr &output_stats) { -// // PELOTON_ASSERT(input_stats != nullptr); -// // PELOTON_ASSERT(condition != nullptr); -// -// UpdateConditionStats(input_stats, condition, output_stats); -// -// return input_stats->num_rows * DEFAULT_TUPLE_COST; -// } -// -// double Cost::SingleConditionIndexScanCost( -// const std::shared_ptr &input_stats, -// const ValueCondition &condition, -// std::shared_ptr &output_stats) { -// double index_height = default_index_height(input_stats->num_rows); -// double index_cost = index_height * DEFAULT_INDEX_TUPLE_COST; -// -// double selectivity = Selectivity::ComputeSelectivity(input_stats, condition); -// double scan_cost = selectivity * DEFAULT_TUPLE_COST; -// -// UpdateConditionStats(input_stats, condition, output_stats); -// -// return index_cost + scan_cost; -// } -// -// void Cost::CombineConjunctionStats(const std::shared_ptr &lhs, -// const std::shared_ptr &rhs, -// const size_t num_rows, -// const ExpressionType type, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(lhs != nullptr); -// PELOTON_ASSERT(rhs != nullptr); -// PELOTON_ASSERT(num_rows > 0); -// -// size_t num_tuples = 1; -// double sel1 = lhs->num_rows / static_cast(num_rows); -// double sel2 = rhs->num_rows / static_cast(num_rows); -// LOG_TRACE("Conjunction sel1[%f] sel2[%f]", sel1, sel2); -// switch (type) { -// case ExpressionType::CONJUNCTION_AND: -// // (sel1 * sel2) * num_rows -// num_tuples = static_cast(num_rows * sel1 * sel2); -// break; -// case ExpressionType::CONJUNCTION_OR: -// // (sel1 + sel2 - sel1 * sel2) * num_rows -// num_tuples = static_cast((sel1 + sel2 - 
sel1 * sel2) * num_rows); -// break; -// default: -// LOG_WARN("Cost model conjunction on expression type %s not supported", -// ExpressionTypeToString(type).c_str()); -// } -// if (output_stats != nullptr) { -// output_stats->num_rows = num_tuples; -// } -// } -// -// //===----------------------------------------------------------------------===// -// // GROUP BY -// //===----------------------------------------------------------------------===// -// -// double Cost::SortGroupByCost(const std::shared_ptr &input_stats, -// std::vector columns, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats); -// PELOTON_ASSERT(columns.size() > 0); -// -// // if (output_stats != nullptr) { -// if (false) { -// output_stats->num_rows = GetEstimatedGroupByRows(input_stats, columns); -// } -// -// double cost = -// default_sorting_cost(input_stats->num_rows) * DEFAULT_TUPLE_COST; -// -// // Update cost to trivial if first group by column has index. -// // TODO: use more complicated cost when group by multiple columns when -// // primary index operator is supported. -// if (!columns.empty() && input_stats->HasPrimaryIndex(columns[0])) { -// // underestimation of group by with index. -// cost = DEFAULT_OPERATOR_COST; -// } -// -// return cost; -// } -// -// double Cost::HashGroupByCost(const std::shared_ptr &input_stats, -// std::vector columns, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats); -// -// if (output_stats != nullptr) { -// output_stats->num_rows = GetEstimatedGroupByRows(input_stats, columns); -// } -// -// // Directly hash tuple -// return input_stats->num_rows * DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // DISTINCT -// //===----------------------------------------------------------------------===// -// // TODO: support multiple distinct columns -// // what if the column has index? 
-// double Cost::DistinctCost(const std::shared_ptr &input_stats, -// std::string column_name, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats); -// -// if (output_stats != nullptr) { -// // update number of rows to be number of unique element of column -// output_stats->num_rows = input_stats->GetCardinality(column_name); -// } -// return input_stats->num_rows * DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // Project -// //===----------------------------------------------------------------------===// -// double Cost::ProjectCost(const std::shared_ptr &input_stats, -// UNUSED_ATTRIBUTE std::vector columns, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats); -// -// if (output_stats != nullptr) { -// // update column information for output_stats table -// } -// -// return input_stats->num_rows * DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // LIMIT -// //===----------------------------------------------------------------------===// -// double Cost::LimitCost(const std::shared_ptr &input_stats, -// size_t limit, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats != nullptr); -// if (output_stats != nullptr) { -// output_stats->num_rows = std::max(input_stats->num_rows, limit); -// } -// return limit * DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // ORDER BY -// //===----------------------------------------------------------------------===// -// double Cost::OrderByCost(const std::shared_ptr &input_stats, -// const std::vector &columns, -// const std::vector &orders, -// std::shared_ptr &output_stats) { -// PELOTON_ASSERT(input_stats); -// // Invalid case. 
-// if (columns.size() == 0 || columns.size() != orders.size()) { -// return DEFAULT_COST; -// } -// std::string column = columns[0]; -// bool order = orders[0]; // CmpTrue is ASC, CmpFalse is DESC -// double cost = DEFAULT_COST; -// // Special case when first column has index. -// if (input_stats->HasPrimaryIndex(column)) { -// if (order) { // ascending -// // No cost for order by for now. We might need to take -// // cardinality of first column into account in the future. -// cost = DEFAULT_OPERATOR_COST; -// } else { // descending -// // Reverse sequence. -// cost = input_stats->num_rows * DEFAULT_TUPLE_COST; -// } -// } else { -// cost = default_sorting_cost(input_stats->num_rows) * DEFAULT_TUPLE_COST; -// } -// if (output_stats != nullptr) { -// output_stats->num_rows = input_stats->num_rows; -// // Also set HasPrimaryIndex for first column to true. -// } -// return cost; -// } -// -// //===----------------------------------------------------------------------===// -// // NL JOIN -// //===----------------------------------------------------------------------===// -// double Cost::NLJoinCost( -// const std::shared_ptr &left_input_stats, -// const std::shared_ptr &right_input_stats, -// std::shared_ptr &output_stats, -// const std::shared_ptr predicate, -// JoinType join_type, bool enable_sampling) { -// UpdateJoinOutputStats(left_input_stats, right_input_stats, output_stats, -// predicate, join_type, enable_sampling); -// -// return left_input_stats->num_rows * right_input_stats->num_rows * -// DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // HASH JOIN -// //===----------------------------------------------------------------------===// -// double Cost::HashJoinCost( -// const std::shared_ptr &left_input_stats, -// const std::shared_ptr &right_input_stats, -// std::shared_ptr &output_stats, -// const std::shared_ptr predicate, -// JoinType join_type, bool enable_sampling) { -// 
UpdateJoinOutputStats(left_input_stats, right_input_stats, output_stats, -// predicate, join_type, enable_sampling); -// return (left_input_stats->num_rows + right_input_stats->num_rows) * -// DEFAULT_TUPLE_COST; -// } -// -// //===----------------------------------------------------------------------===// -// // Helper functions -// //===----------------------------------------------------------------------===// -// bool Cost::UpdateJoinOutputStatsWithSampling( -// const std::shared_ptr &left_input_stats, -// const std::shared_ptr &right_input_stats, -// std::shared_ptr &output_stats, -// const std::string &left_column_name, const std::string &right_column_name) { -// std::string left = left_column_name, right = right_column_name; -// if (!left_input_stats->HasColumnStats(left_column_name)) { -// left = right_column_name; -// right = left_column_name; -// } -// bool enable_sampling = true; -// auto column_ids = -// GenerateJoinSamples(left_input_stats, right_input_stats, output_stats, -// left, right, enable_sampling); -// if (column_ids.empty()) { -// return enable_sampling; -// } -// output_stats->UpdateJoinColumnStats(column_ids); -// return true; -// } -// -// // Helper function for generating join samples for output stats and calculate -// // and update num_rows. 
-// std::vector Cost::GenerateJoinSamples( -// const std::shared_ptr &left_input_stats, -// const std::shared_ptr &right_input_stats, -// std::shared_ptr &output_stats, -// const std::string &left_column_name, const std::string &right_column_name, -// bool &enable_sampling) { -// std::vector column_ids; -// auto sample_stats = left_input_stats, index_stats = right_input_stats; -// auto sample_column = left_column_name, index_column = right_column_name; -// if (!right_input_stats->IsBaseTable() || -// right_input_stats->GetIndex(right_column_name) == nullptr) { -// sample_stats = right_input_stats; -// index_stats = left_input_stats; -// sample_column = right_column_name; -// index_column = left_column_name; -// } -// -// auto index = index_stats->GetIndex(index_column); -// // No index available or sample_stats doesn't have samples available -// if (index == nullptr || !sample_stats->GetSampler()) { -// enable_sampling = false; -// return column_ids; -// } -// // index_stats should be base table and have non-null sampler -// PELOTON_ASSERT(index_stats->GetSampler() != nullptr); -// -// // Already have tuple sampled, copy the sampled tuples -// if (!index_stats->GetSampler()->GetSampledTuples().empty()) { -// output_stats->SetTupleSampler(index_stats->GetSampler()); -// return column_ids; -// } -// -// if (sample_stats->IsBaseTable() && -// sample_stats->GetSampler()->GetSampledTuples().empty()) { -// sample_stats->SampleTuples(); -// } -// auto column_id = sample_stats->GetColumnStats(sample_column)->column_id; -// auto &sample_tuples = sample_stats->GetSampler()->GetSampledTuples(); -// if (sample_tuples.empty()) { -// enable_sampling = false; -// return column_ids; -// } -// int cnt = 0; -// std::vector> matched_tuples; -// for (size_t i = 0; i < sample_tuples.size(); i++) { -// auto key = sample_tuples.at(i)->GetValue(column_id); -// -// std::vector fetched_tuples; -// auto schema = sample_tuples.at(i)->GetSchema(); -// std::vector key_attrs; -// 
key_attrs.push_back(column_id); -// std::unique_ptr key_schema( -// catalog::Schema::CopySchema(schema, key_attrs)); -// auto key_tuple = std::make_shared(key_schema.get(), true); -// type::Value fetched_value = (sample_tuples.at(i)->GetValue(column_id)); -// key_tuple->SetValue(0, fetched_value); -// index->ScanKey(key_tuple.get(), fetched_tuples); -// matched_tuples.push_back(fetched_tuples); -// cnt += fetched_tuples.size(); -// } -// -// if (cnt == 0) { -// enable_sampling = false; -// return column_ids; -// } -// -// index_stats->GetSampler()->AcquireSampleTuplesForIndexJoin( -// sample_tuples, matched_tuples, cnt); -// output_stats->SetTupleSampler(index_stats->GetSampler()); -// output_stats->num_rows = -// (size_t)(sample_stats->num_rows * cnt / (double)sample_tuples.size()); -// -// oid_t column_offset = sample_tuples.at(0)->GetColumnCount(); -// for (oid_t i = 0; i < output_stats->GetColumnCount(); i++) { -// auto column_stats = output_stats->GetColumnStats(i); -// if (index_stats->HasColumnStats(column_stats->column_name)) { -// column_stats->column_id = column_stats->column_id + column_offset; -// } -// column_ids.push_back(column_stats->column_id); -// } -// return column_ids; -// } -// -// void Cost::UpdateJoinOutputStats( -// const std::shared_ptr &left_input_stats, -// const std::shared_ptr &right_input_stats, -// std::shared_ptr &output_stats, -// const std::shared_ptr predicate, -// JoinType join_type, bool enable_sampling) { -// size_t adjustment; -// switch (join_type) { -// case JoinType::INNER: -// adjustment = 0; -// case JoinType::LEFT: -// adjustment = left_input_stats->num_rows; -// case JoinType::RIGHT: -// adjustment = right_input_stats->num_rows; -// case JoinType::OUTER: -// adjustment = left_input_stats->num_rows + right_input_stats->num_rows; -// default: -// adjustment = 0; -// } -// size_t default_join_size = -// left_input_stats->num_rows * right_input_stats->num_rows + adjustment; -// if (predicate == nullptr) { -// 
output_stats->num_rows = default_join_size; -// } else if (predicate->GetExpressionType() == ExpressionType::COMPARE_EQUAL) { -// // consider only A.a = B.a case here -// if (predicate->GetChildrenSize() != 2 || -// predicate->GetChild(0)->GetExpressionType() != -// ExpressionType::VALUE_TUPLE || -// predicate->GetChild(1)->GetExpressionType() != -// ExpressionType::VALUE_TUPLE) { -// output_stats->num_rows = default_join_size; -// LOG_ERROR("Join predicate not supported %s", -// predicate->GetInfo().c_str()); -// return; -// } -// -// auto left_child = -// reinterpret_cast( -// predicate->GetChild(0)); -// auto right_child = -// reinterpret_cast( -// predicate->GetChild(1)); -// std::string left_column_name = -// left_child->GetTableName() + "." + left_child->GetColumnName(); -// std::string right_column_name = -// right_child->GetTableName() + "." + right_child->GetColumnName(); -// -// if (!enable_sampling || -// !UpdateJoinOutputStatsWithSampling(left_input_stats, right_input_stats, -// output_stats, left_column_name, -// right_column_name)) { -// double left_cardinality, right_cardinality; -// if (left_input_stats->HasColumnStats(left_column_name)) { -// left_cardinality = left_input_stats->GetCardinality(left_column_name); -// } else if (right_input_stats->HasColumnStats(left_column_name)) { -// left_cardinality = right_input_stats->GetCardinality(left_column_name); -// } else { -// left_cardinality = 0; -// LOG_ERROR("join column %s not found", left_column_name.c_str()); -// } -// -// if (left_input_stats->HasColumnStats(right_column_name)) { -// right_cardinality = left_input_stats->GetCardinality(right_column_name); -// } else if (right_input_stats->HasColumnStats(right_column_name)) { -// right_cardinality = -// right_input_stats->GetCardinality(right_column_name); -// } else { -// right_cardinality = 0; -// LOG_ERROR("join column %s not found", right_column_name.c_str()); -// } -// if (left_cardinality == 0 || right_cardinality == 0) { -// 
output_stats->num_rows = default_join_size; -// } else { -// // n_l * n_r / sqrt(V(A, l) * V(A, r)) -// output_stats->num_rows = -// (size_t)(left_input_stats->num_rows * right_input_stats->num_rows / -// std::max(left_cardinality, right_cardinality)) + -// adjustment; -// } -// } -// } else { -// // conjunction predicates -// output_stats->num_rows = default_join_size; -// } -// } -// -// void Cost::UpdateConditionStats(const std::shared_ptr &input_stats, -// const ValueCondition &condition, -// std::shared_ptr &output_stats) { -// if (output_stats != nullptr) { -// double selectivity = -// Selectivity::ComputeSelectivity(input_stats, condition); -// output_stats->num_rows = input_stats->num_rows * selectivity; -// } -// } -// -// size_t Cost::GetEstimatedGroupByRows( -// const std::shared_ptr &input_stats, -// std::vector &columns) { -// // Idea is to assume each column is uniformaly network and get an -// // overestimation. -// // Then use max cardinality among all columns as underestimation. -// // And combine them together. 
-// double rows = 1; -// double max_cardinality = 0; -// for (auto column : columns) { -// double cardinality = input_stats->GetCardinality(column); -// max_cardinality = std::max(max_cardinality, cardinality); -// rows *= cardinality; -// } -// return static_cast(rows + max_cardinality / 2); -// } -// -} // namespace optimizer -} // namespace peloton diff --git a/src/optimizer/stats.cpp b/src/optimizer/stats/stats.cpp similarity index 94% rename from src/optimizer/stats.cpp rename to src/optimizer/stats/stats.cpp index b114990ed9d..8549246214c 100644 --- a/src/optimizer/stats.cpp +++ b/src/optimizer/stats/stats.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "optimizer/stats.h" +#include "optimizer/stats/stats.h" namespace peloton { namespace optimizer { diff --git a/src/optimizer/stats_calculator.cpp b/src/optimizer/stats/stats_calculator.cpp similarity index 99% rename from src/optimizer/stats_calculator.cpp rename to src/optimizer/stats/stats_calculator.cpp index 1389bca0919..d086938a817 100644 --- a/src/optimizer/stats_calculator.cpp +++ b/src/optimizer/stats/stats_calculator.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "optimizer/stats_calculator.h" +#include "optimizer/stats/stats_calculator.h" #include diff --git a/src/optimizer/tuple_sample.cpp b/src/optimizer/stats/tuple_sample.cpp similarity index 94% rename from src/optimizer/tuple_sample.cpp rename to src/optimizer/stats/tuple_sample.cpp index 90af47eadba..62be8defb67 100644 --- a/src/optimizer/tuple_sample.cpp +++ b/src/optimizer/stats/tuple_sample.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "optimizer/tuple_sample.h" +#include "optimizer/stats/tuple_sample.h" namespace peloton { namespace optimizer { diff --git a/src/traffic_cop/traffic_cop.cpp b/src/traffic_cop/traffic_cop.cpp index 
bbf0846ac9a..c6f8df81fda 100644 --- a/src/traffic_cop/traffic_cop.cpp +++ b/src/traffic_cop/traffic_cop.cpp @@ -6,7 +6,7 @@ // // Identification: src/traffic_cop/traffic_cop.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -30,11 +30,11 @@ namespace tcop { TrafficCop::TrafficCop() : is_queuing_(false), rows_affected_(0), - optimizer_(new optimizer::Optimizer()), + optimizer_(new optimizer::Optimizer(optimizer::CostModels::TRIVIAL)), single_statement_txn_(true) {} TrafficCop::TrafficCop(void (*task_callback)(void *), void *task_callback_arg) - : optimizer_(new optimizer::Optimizer()), + : optimizer_(new optimizer::Optimizer(optimizer::CostModels::TRIVIAL)), single_statement_txn_(true), task_callback_(task_callback), task_callback_arg_(task_callback_arg) {} diff --git a/test/optimizer/cost_test.cpp b/test/optimizer/cost_test.cpp deleted file mode 100644 index 2c1531ab367..00000000000 --- a/test/optimizer/cost_test.cpp +++ /dev/null @@ -1,195 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// cost_test.cpp -// -// Identification: test/optimizer/cost_test.cpp -// -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "common/harness.h" - -#include -#include -#include -#include - -#include "catalog/catalog.h" -#include "catalog/column_catalog.h" -#include "common/logger.h" -#include "concurrency/transaction_manager_factory.h" -#include "executor/testing_executor_util.h" -#include "expression/tuple_value_expression.h" -#include "expression/expression_util.h" -#include "expression/star_expression.h" -#include "optimizer/stats/cost.h" -#include "optimizer/stats/stats_storage.h" -#include 
"optimizer/stats/table_stats.h" -#include "optimizer/stats/value_condition.h" -#include "sql/testing_sql_util.h" -#include "common/internal_types.h" -#include "type/value.h" -#include "type/value_factory.h" -#include "optimizer/properties.h" - -namespace peloton { -namespace test { - -using namespace optimizer; - -// const int N_ROW = 100; - -class CostTests : public PelotonTest {}; - -// tablename: test -// database name: DEFAULT_DB_NAME -// void CreateAndLoadTable(const std::string& table_name = {"test"}) { -// TestingSQLUtil::ExecuteSQLQuery( -// "CREATE TABLE " + table_name + " (id INT PRIMARY KEY, name VARCHAR, salary DECIMAL);"); -// for (int i = 1; i <= N_ROW; i++) { -// std::stringstream ss; -// ss << "INSERT INTO " << table_name << " VALUES (" << i << ", 'name', 1.1);"; -// TestingSQLUtil::ExecuteSQLQuery(ss.str()); -// } -// } -// -// std::shared_ptr GetPropertyColumns() { -// std::vector> cols; -// auto star_expr = std::shared_ptr(new expression::StarExpression()); -// cols.push_back(star_expr); -// return std::make_shared(cols); -// } -// -// std::shared_ptr GetTableStatsWithName( -// std::string table_name, concurrency::TransactionContext *txn) { -// auto catalog = catalog::Catalog::GetInstance(); -// auto database = catalog->GetDatabaseWithName(DEFAULT_DB_NAME, txn); -// auto table = catalog->GetTableWithName(DEFAULT_DB_NAME, table_name, txn); -// oid_t db_id = database->GetOid(); -// oid_t table_id = table->GetOid(); -// auto stats_storage = StatsStorage::GetInstance(); -// return stats_storage->GetTableStats(db_id, table_id); -// } -// -// -// std::shared_ptr GetTableStatsForJoin( -// std::string table_name, concurrency::Transaction *txn) { -// auto catalog = catalog::Catalog::GetInstance(); -// auto database = catalog->GetDatabaseWithName(DEFAULT_DB_NAME, txn); -// auto table = catalog->GetTableWithName(DEFAULT_DB_NAME, table_name, txn); -// oid_t db_id = database->GetOid(); -// oid_t table_id = table->GetOid(); -// auto stats_storage = 
StatsStorage::GetInstance(); -// auto table_stats = stats_storage->GetTableStats(db_id, table_id); -// table_stats->SetTupleSampler(std::make_shared(table)); -// auto column_prop = GetPropertyColumns(); -// return generateOutputStat(table_stats, column_prop.get(), table); -// } -// -// TEST_F(CostTests, ScanCostTest) { -// auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); -// auto txn = txn_manager.BeginTransaction(); -// catalog::Catalog::GetInstance()->CreateDatabase(DEFAULT_DB_NAME, txn); -// txn_manager.CommitTransaction(txn); -// -// // create table with name test -// CreateAndLoadTable(); -// -// // Collect stats -// TestingSQLUtil::ExecuteSQLQuery("ANALYZE test"); -// -// txn = txn_manager.BeginTransaction(); -// auto table_stats = GetTableStatsWithName("test", txn); -// txn_manager.CommitTransaction(txn); -// EXPECT_NE(table_stats, nullptr); -// EXPECT_EQ(table_stats->num_rows, N_ROW); -// -// // condition1: id < 1000 -// type::Value value1 = type::ValueFactory::GetIntegerValue(1000); -// ValueCondition condition1{0, "id", ExpressionType::COMPARE_LESSTHAN, value1}; -// std::shared_ptr output_stats(new TableStats{}); -// double cost1 = -// Cost::SingleConditionSeqScanCost(table_stats, condition1, output_stats); -// LOG_INFO("cost for condition 1 is %f", cost1); -// EXPECT_GE(cost1, 0); -// // EXPECT_EQ(output_stats->num_rows, 1000); -// -// // condition2: id = 1000 -// ValueCondition condition2{0, "id", ExpressionType::COMPARE_EQUAL, value1}; -// output_stats->ClearColumnStats(); -// double cost2 = -// Cost::SingleConditionSeqScanCost(table_stats, condition2, output_stats); -// LOG_INFO("cost for condition 2 is: %f", cost2); -// EXPECT_GE(cost2, 0); -// // EXPECT_EQ(output_stats->num_rows, 1); -// -// // Two seq scan cost should be the same -// EXPECT_EQ(cost1, cost2); -// -// // Free the database -// txn = txn_manager.BeginTransaction(); -// catalog::Catalog::GetInstance()->DropDatabaseWithName(DEFAULT_DB_NAME, txn); -// 
txn_manager.CommitTransaction(txn); -// } -// -// TEST_F(CostTests, ConjunctionTest) { -// std::shared_ptr lhs(new TableStats{8080}); -// std::shared_ptr rhs(new TableStats{3695}); -// std::shared_ptr output(new TableStats{}); -// int n_rows = 200000; -// Cost::CombineConjunctionStats(lhs, rhs, n_rows, -// ExpressionType::CONJUNCTION_AND, output); -// EXPECT_GE(output->num_rows, 149); -// EXPECT_LE(output->num_rows, 150); -// Cost::CombineConjunctionStats(lhs, rhs, n_rows, -// ExpressionType::CONJUNCTION_OR, output); -// EXPECT_GE(output->num_rows, 11625); -// EXPECT_LE(output->num_rows, 11626); -// } -// -// TEST_F(CostTests, JoinTest) { -// auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); -// auto txn = txn_manager.BeginTransaction(); -// catalog::Catalog::GetInstance()->CreateDatabase(DEFAULT_DB_NAME, txn); -// txn_manager.CommitTransaction(txn); -// -// // create table with name test1 and test2 -// CreateAndLoadTable("test1"); -// CreateAndLoadTable("test2"); -// -// // Collect stats -// TestingSQLUtil::ExecuteSQLQuery("ANALYZE test1"); -// TestingSQLUtil::ExecuteSQLQuery("ANALYZE test2"); -// -// txn = txn_manager.BeginTransaction(); -// auto left_table_stats = GetTableStatsForJoin("test1", txn); -// auto right_table_stats = GetTableStatsForJoin("test2", txn); -// -// txn_manager.CommitTransaction(txn); -// -// auto expr1 = new expression::TupleValueExpression("id", "test1"); -// auto expr2 = new expression::TupleValueExpression("id", "test2"); -// auto predicate = std::shared_ptr(expression::ExpressionUtil::ComparisonFactory( -// ExpressionType::COMPARE_EQUAL, expr1, expr2)); -// -// -// auto column_prop = GetPropertyColumns(); -// -// std::shared_ptr output_stats = generateOutputStatFromTwoTable( -// left_table_stats, -// right_table_stats, -// column_prop.get()); -// -// double cost = Cost::NLJoinCost(left_table_stats, right_table_stats, output_stats, predicate, JoinType::INNER, true); -// LOG_INFO("Estimated output size %lu", 
output_stats->num_rows); -// EXPECT_EQ(cost, 100); -// EXPECT_EQ(output_stats->GetSampler()->GetSampledTuples().size(), 100); -// -// -// -// } -} // namespace test -} // namespace peloton diff --git a/test/optimizer/optimizer_test_util.cpp b/test/optimizer/optimizer_test_util.cpp index 5c650084993..b159fb0cb78 100644 --- a/test/optimizer/optimizer_test_util.cpp +++ b/test/optimizer/optimizer_test_util.cpp @@ -16,8 +16,7 @@ #include "common/harness.h" #include "sql/testing_sql_util.h" #include "parser/postgresparser.h" - - +#include "planner/abstract_scan_plan.h" namespace peloton { namespace test { @@ -44,6 +43,10 @@ class OptimizerTestUtil : public PelotonTest { PelotonTest::TearDown(); } + void SetCostModel(optimizer::CostModels cost_model) { + optimizer_ = std::unique_ptr(new optimizer::Optimizer(cost_model)); + } + // Creates the following table: table_name(a INT PRIMARY KEY, b DECIMAL, c VARCHAR) void CreateTable(const std::string &table_name) { std::stringstream ss; @@ -87,6 +90,80 @@ class OptimizerTestUtil : public PelotonTest { return GeneratePlanHelper(optimizer_, query, txn); } + + std::string CreateTwoWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &column_1, + const std::string &column_2) { + return CreateTwoWayJoinQuery(table_1, table_2, column_1, column_2, "", ""); + } + + std::string CreateTwoWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &column_1, + const std::string &column_2, + const std::string &order_by_table, + const std::string &order_by_column) { + std::stringstream ss; + ss << "SELECT * FROM " << table_1 << ", " << table_2 << " WHERE " << table_1 + << "." << column_1 << " = " << table_2 << "." << column_2; + if (!order_by_column.empty() and !order_by_table.empty()) { + ss << " ORDER BY " << order_by_table << "." 
<< order_by_column; + } + ss << ";"; + return ss.str(); + } + + std::string CreateThreeWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &table_3, + const std::string &column_1, + const std::string &column_2, + const std::string &column_3) { + return CreateThreeWayJoinQuery(table_1, table_2, table_3, column_1, + column_2, column_3, "", ""); + } + + std::string CreateThreeWayJoinQuery( + const std::string &table_1, const std::string &table_2, + const std::string &table_3, const std::string &column_1, + const std::string &column_2, const std::string &column_3, + const std::string &order_by_table, const std::string &order_by_column) { + std::stringstream ss; + ss << "SELECT * FROM " << table_1 << ", " << table_2 << "," << table_3 + << " WHERE " << table_1 << "." << column_1 << " = " << table_2 << "." + << column_2 << " AND " << table_2 << "." << column_2 << " = " << table_3 + << "." << column_3; + if (!order_by_column.empty() and !order_by_table.empty()) { + ss << " ORDER BY " << order_by_table << "." 
<< order_by_column; + } + ss << ";"; + return ss.str(); + } + + void PrintPlan(std::shared_ptr plan, int level = 0) { + PrintPlan(plan.get(), level); + } + + void PrintPlan(planner::AbstractPlan *plan, int level = 0) { + auto spacing = std::string(level, '\t'); + + if (plan->GetPlanNodeType() == PlanNodeType::SEQSCAN) { + auto scan = dynamic_cast(plan); + (void)scan; /* Used to avoid unused variable warning */ + LOG_DEBUG("%s%s(%s)", spacing.c_str(), scan->GetInfo().c_str(), + scan->GetTable()->GetName().c_str()); + } else { + LOG_DEBUG("%s%s", spacing.c_str(), plan->GetInfo().c_str()); + } + + for (size_t i = 0; i < plan->GetChildren().size(); i++) { + PrintPlan(plan->GetChildren()[i].get(), level + 1); + } + + return; + } + private: void CreateDatabase() { diff --git a/test/optimizer/plan_test.cpp b/test/optimizer/plan_test.cpp new file mode 100644 index 00000000000..b39255be172 --- /dev/null +++ b/test/optimizer/plan_test.cpp @@ -0,0 +1,286 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// plan_test.cpp +// +// Identification: test/optimizer/plan_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + + +#include "optimizer_test_util.cpp" +#include "planner/abstract_scan_plan.h" + +namespace peloton { +namespace test { + +class PlanTest : public OptimizerTestUtil {}; + + +// Tests cost model outputs identical plans regardless of table orderings +TEST_F(PlanTest, PlanEqualityTest) { + + // Set cost model to postgres cost model + OptimizerTestUtil::SetCostModel(optimizer::CostModels::POSTGRES); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + int test1_table_size = 10; + int test2_table_size = 100; + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, 
test2_table_size); + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + + // Generate queries with names in different order + auto query1 = "SELECT test1.a, test2.a FROM test1,test2 WHERE test1.a = test2.a"; + auto query2 = "SELECT test1.a, test2.a FROM test2,test1 WHERE test1.a = test2.a"; + + // Generate two plans + auto plan1 = OptimizerTestUtil::GeneratePlan(query1); + auto plan2 = OptimizerTestUtil::GeneratePlan(query2); + + EXPECT_EQ(*plan1, *plan2); +} + + +TEST_F(PlanTest, PostgresTwoJoinOrderTestSmall) { + + // Set cost model to postgres cost model + OptimizerTestUtil::SetCostModel(optimizer::CostModels::POSTGRES); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + int test1_table_size = 10; + int test2_table_size = 100; + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, test2_table_size); + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + + + // Generate query + auto query = OptimizerTestUtil::CreateTwoWayJoinQuery(test1_table_name, test2_table_name, "a", "a"); + + auto plan = OptimizerTestUtil::GeneratePlan(query); + + EXPECT_EQ(PlanNodeType::HASHJOIN, plan->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren().size()); + + + // Get Left Scan + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + auto left_scan = dynamic_cast(plan->GetChildren()[0].get()); + + // Get Right Scan + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1]->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, right_scan->GetPlanNodeType()); + + // Check we build hash table on larger table (right 
table), and probe with smaller table (left table) + EXPECT_EQ(test1_table_name, left_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test2_table_name, right_scan->GetTable()->GetName().c_str()); +} + +// With trivial model, ordering of tables in join should be reversed as both orderings have the same cost, however +// test2 x test1 is explored after test1 x test2, so we keep the previously explored one. +TEST_F(PlanTest, TrivialTwoJoinOrderTestSmall) { + + // Set cost model to trivial cost model + OptimizerTestUtil::SetCostModel(optimizer::CostModels::TRIVIAL); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + int test1_table_size = 10; + int test2_table_size = 100; + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, test2_table_size); + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + + + // Generate query + auto query = OptimizerTestUtil::CreateTwoWayJoinQuery(test2_table_name, test1_table_name, "a", "a"); + + auto plan = OptimizerTestUtil::GeneratePlan(query); + + EXPECT_EQ(PlanNodeType::HASHJOIN, plan->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren().size()); + + + // Get Left Scan + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + auto left_scan = dynamic_cast(plan->GetChildren()[0].get()); + + // Get Right Scan + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1]->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, right_scan->GetPlanNodeType()); + + // Check we build hash table on larger table (right table), and probe with smaller table (left table) + EXPECT_EQ(test1_table_name,
left_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test2_table_name, right_scan->GetTable()->GetName().c_str()); +} + + +// Tests that since the left table is a single tuple, we just use nested loop +TEST_F(PlanTest, TrivialTwoJoinOrderTestSmall2) { + + // Set cost model to trivial cost model + OptimizerTestUtil::SetCostModel(optimizer::CostModels::TRIVIAL); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + int test1_table_size = 1; + int test2_table_size = 100; + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, test2_table_size); + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + + + // Generate query + auto query = OptimizerTestUtil::CreateTwoWayJoinQuery(test1_table_name, test2_table_name, "a", "a"); + + auto plan = OptimizerTestUtil::GeneratePlan(query); + + EXPECT_EQ(PlanNodeType::NESTLOOP, plan->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren().size()); + + + // Get Left Scan + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + auto left_scan = dynamic_cast(plan->GetChildren()[0].get()); + + // Get Right Scan + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[1]->GetPlanNodeType()); + EXPECT_EQ(0, plan->GetChildren()[1]->GetChildren().size()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, right_scan->GetPlanNodeType()); + + // Check the single-tuple table (test1) is the left input and the larger table (test2) is the right input of the nested loop join + EXPECT_EQ(test1_table_name, left_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test2_table_name, right_scan->GetTable()->GetName().c_str()); +} + +TEST_F(PlanTest, PostgresTwoJoinOrderTestLarge) { + + // Set cost model to postgres cost model +
OptimizerTestUtil::SetCostModel(optimizer::CostModels::POSTGRES); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + int test1_table_size = 10000; + int test2_table_size = 1000; + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, test2_table_size); + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + + + // Generate query + auto query = OptimizerTestUtil::CreateTwoWayJoinQuery(test1_table_name, test2_table_name, "a", "a"); + + auto plan = OptimizerTestUtil::GeneratePlan(query); + + EXPECT_EQ(PlanNodeType::HASHJOIN, plan->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren().size()); + + + // Get Left Scan + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + auto left_scan = dynamic_cast(plan->GetChildren()[0].get()); + + // Get Right Scan + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1]->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, right_scan->GetPlanNodeType()); + + // Check we build hash table on larger table (right table), and probe with smaller table (left table) + EXPECT_EQ(test2_table_name, left_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test1_table_name, right_scan->GetTable()->GetName().c_str()); +} + +TEST_F(PlanTest, PostgresThreeJoinOrderTestSmall) { + + // Set cost model to postgres cost model + OptimizerTestUtil::SetCostModel(optimizer::CostModels::POSTGRES); + + // Populate Tables and run Analyze + std::string test1_table_name = "test1"; + std::string test2_table_name = "test2"; + std::string test3_table_name = "test3"; + int test1_table_size = 10; + int test2_table_size = 100; + int test3_table_size = 
1000; + + OptimizerTestUtil::CreateTable(test1_table_name, test1_table_size); + OptimizerTestUtil::CreateTable(test2_table_name, test2_table_size); + OptimizerTestUtil::CreateTable(test3_table_name, test3_table_size); + + OptimizerTestUtil::AnalyzeTable(test1_table_name); + OptimizerTestUtil::AnalyzeTable(test2_table_name); + OptimizerTestUtil::AnalyzeTable(test3_table_name); + + + // Generate query + auto query = OptimizerTestUtil::CreateThreeWayJoinQuery(test2_table_name, + test3_table_name, + test1_table_name, + "a", "a", "a"); + + auto plan = OptimizerTestUtil::GeneratePlan(query); + + EXPECT_EQ(PlanNodeType::NESTLOOP, plan->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren().size()); + + + OptimizerTestUtil::PrintPlan(plan); + + + // Get Left Scan + auto left_scan = dynamic_cast(plan->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, left_scan->GetPlanNodeType()); + + // Get right join Scan + EXPECT_EQ(PlanNodeType::HASHJOIN, plan->GetChildren()[1]->GetPlanNodeType()); + EXPECT_EQ(2, plan->GetChildren()[1]->GetChildren().size()); + auto right_join = plan->GetChildren()[1].get(); + + // Get Middle Scan + auto middle_scan = dynamic_cast(right_join->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, middle_scan->GetPlanNodeType()); + + // Get Right Scan + EXPECT_EQ(PlanNodeType::HASH, right_join->GetChildren()[1]->GetPlanNodeType()); + auto right_scan = dynamic_cast(right_join->GetChildren()[1]->GetChildren()[0].get()); + EXPECT_EQ(PlanNodeType::SEQSCAN, right_scan->GetPlanNodeType()); + + // Optimal Should be: (test2) x (test1 x test3) + EXPECT_EQ(test2_table_name, left_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test1_table_name, middle_scan->GetTable()->GetName().c_str()); + EXPECT_EQ(test3_table_name, right_scan->GetTable()->GetName().c_str()); +} + + +} +} \ No newline at end of file