diff --git a/script/testing/cost_model/cost_model_evaluator.py b/script/testing/cost_model/cost_model_evaluator.py new file mode 100644 index 00000000000..b0e42869f23 --- /dev/null +++ b/script/testing/cost_model/cost_model_evaluator.py @@ -0,0 +1,159 @@ +import argparse +import psycopg2 +import re +import socket +import subprocess +import time + +from collections import namedtuple + +# TODO add more stats +ExecutionStats = namedtuple('ExecutionStats', ['average']) + + +class Peloton(object): + def __init__(self, peloton_path, port, cost_calculator=None): + self.peloton_path = peloton_path + self.peloton_port = port + self.cost_calculator = cost_calculator + self.peloton_process = None + self.peloton_output_fd = None + self.conn = None + + def run(self): + outfile = "/tmp/peloton_log.txt" + args = [self.peloton_path, "-port", str(self.peloton_port)] + if self.cost_calculator: + args += ["-cost_calculator", self.cost_calculator] + args += ['-codegen', 'false'] + args += ['-hash_join_bloom_filter', 'false'] + self.peloton_output_fd = open(outfile, "w+") + self.peloton_process = subprocess.Popen(args, stdout=self.peloton_output_fd, stderr=self.peloton_output_fd) + self.wait() + self.conn = psycopg2.connect( + "dbname=default_database user=postgres password=postgres host=localhost port={}".format(self.peloton_port)) + + def stop(self): + if not self.peloton_process: + raise Exception("No peloton process to stop") + self.peloton_process.poll() + if self.peloton_process.returncode is not None: + # Peloton terminated already + self.peloton_output_fd.close() + msg = "Peloton terminated with return code {}".format(self.peloton_process.returncode) + raise RuntimeError(msg) + + # still(correctly) running, terminate it + self.conn.close() + self.peloton_process.terminate() + return + + def wait(self): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # max wait of 10s in 0.1s increments + for i in xrange(100): + try: + s.connect(('localhost', self.peloton_port)) + s.close() + print("connected to server in {} seconds".format(i * 0.1)) + return + except: + time.sleep(0.1) + continue + return + + def process_query(self, query): + cur = self.conn.cursor() + rows = None + if query: + cur.execute(query) + try: + rows = cur.fetchall() + except Exception: + pass + self.conn.commit() + cur.close() + return rows + + +def get_query_list(sql_string_file): + with open(sql_string_file, 'r') as infile: + sql_string = infile.read() + sql_string = sql_string.replace('\n', '') + sql_string = re.sub(' +', ' ', sql_string) + query_list = [x for x in sql_string.split(';') if x] + return query_list + + +def execute_sql_statements(data_path, peloton): + query_list = get_query_list(data_path) + for query in query_list: + peloton.process_query(query) + + +def execute_sql_statements_with_stats(data_path, peloton, execution_count): + execution_stat_list = [] + explain_result_list = [] + results = [] + query_list = get_query_list(data_path) + for i, query in enumerate(query_list): + sum_time = 0 + for _ in xrange(execution_count): + start_time = time.time() + rows = peloton.process_query(query) + end_time = time.time() + sum_time += (end_time - start_time) + result = { + 'num': i + 1, + 'query': query, + 'num_rows': len(rows) if rows else 0, + 'execution_stats': ExecutionStats(average=sum_time / execution_count), + 'explain_result': peloton.process_query("".join(["EXPLAIN ", query])) + } + results.append(result) + return results + + +def analyze(peloton): + # Note this doesn't actually do anything. When https://github.com/cmu-db/peloton/issues/1360 is resolved, this will work + peloton.process_query("ANALYZE;") + + +def run_pipeline(args): + peloton = Peloton(args.peloton_path, args.port, args.cost_model) + try: + peloton = Peloton(args.peloton_path, args.port, args.cost_model) + peloton.run() + execute_sql_statements(args.data_load_path, peloton) + analyze(peloton) + results = execute_sql_statements_with_stats(args.data_query_path, peloton, args.query_count) + for result in results: + print 'Query Num: {}'.format(result['num']) + print 'Query: {}'.format(result['query']) + print 'Num Result Rows: {}'.format(result['num_rows']) + print result['execution_stats'] + for row in result['explain_result']: + print row + peloton.stop() + except Exception as e: + print e + peloton.stop() + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate the provided cost model") + parser.add_argument("--cost-model") + parser.add_argument("--port", default=15721, help="Optional port override if you aren't using 15721") + parser.add_argument("--peloton-path", default="peloton", + help="Optional path to peloton binary if peloton is not on your path") + parser.add_argument('--query-count', default=500, type=int, + help="Number of times to run the query defined in the data query path") + # For now, data load path needs to include analyze statements at end. we don't support analyze on all tables + parser.add_argument("data_load_path") + parser.add_argument("data_query_path") + args = parser.parse_args() + run_pipeline(args) + + +if __name__ == "__main__": + main() diff --git a/src/include/optimizer/abstract_cost_calculator.h b/src/include/optimizer/abstract_cost_calculator.h new file mode 100644 index 00000000000..1b2c78f2b8b --- /dev/null +++ b/src/include/optimizer/abstract_cost_calculator.h @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// abstract_cost_calculator.h +// +// Identification: src/include/optimizer/abstract_cost_calculator.h +// +// Copyright (c) 2015-18, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "optimizer/operator_visitor.h" + +namespace peloton { +namespace optimizer { + +class Memo; + +class AbstractCostCalculator : public OperatorVisitor { + public: + virtual double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) = 0; +}; + +} // namespace optimizer +} // namespace peloton \ No newline at end of file diff --git a/src/include/optimizer/cost_calculator.h b/src/include/optimizer/cost_calculator.h index 442f386fc5f..868f3c4a383 100644 --- a/src/include/optimizer/cost_calculator.h +++ b/src/include/optimizer/cost_calculator.h @@ -12,40 +12,42 @@ #pragma once -#include "optimizer/operator_visitor.h" +#include "optimizer/abstract_cost_calculator.h" namespace peloton { namespace optimizer { class Memo; -// Derive cost for a physical group expressionh -class CostCalculator : public OperatorVisitor { +// Derive cost for a physical group expression +class CostCalculator : public AbstractCostCalculator { public: - double CalculateCost(GroupExpression *gexpr, Memo *memo, - concurrency::TransactionContext *txn); - - void Visit(const DummyScan *) override; - void Visit(const PhysicalSeqScan *) override; - void Visit(const PhysicalIndexScan *) override; - void Visit(const QueryDerivedScan *) override; - void Visit(const PhysicalOrderBy *) override; - void Visit(const PhysicalLimit *) override; - void Visit(const PhysicalInnerNLJoin *) override; - void Visit(const PhysicalLeftNLJoin *) override; - void Visit(const PhysicalRightNLJoin *) override; - void Visit(const PhysicalOuterNLJoin *) override; - void Visit(const PhysicalInnerHashJoin *) override; - void Visit(const PhysicalLeftHashJoin *) override; - void Visit(const PhysicalRightHashJoin *) override; - void Visit(const PhysicalOuterHashJoin *) override; - void Visit(const PhysicalInsert *) override; - void Visit(const PhysicalInsertSelect *) override; - void Visit(const PhysicalDelete *) override; - void Visit(const PhysicalUpdate *) override; - void Visit(const PhysicalHashGroupBy *) override; - void Visit(const PhysicalSortGroupBy *) override; - void Visit(const PhysicalDistinct *) override; - void Visit(const PhysicalAggregate *) override; + CostCalculator(){}; + + virtual double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) override; + + virtual void Visit(const DummyScan *) override; + virtual void Visit(const PhysicalSeqScan *) override; + virtual void Visit(const PhysicalIndexScan *) override; + virtual void Visit(const QueryDerivedScan *) override; + virtual void Visit(const PhysicalOrderBy *) override; + virtual void Visit(const PhysicalLimit *) override; + virtual void Visit(const PhysicalInnerNLJoin *) override; + virtual void Visit(const PhysicalLeftNLJoin *) override; + virtual void Visit(const PhysicalRightNLJoin *) override; + virtual void Visit(const PhysicalOuterNLJoin *) override; + virtual void Visit(const PhysicalInnerHashJoin *) override; + virtual void Visit(const PhysicalLeftHashJoin *) override; + virtual void Visit(const PhysicalRightHashJoin *) override; + virtual void Visit(const PhysicalOuterHashJoin *) override; + virtual void Visit(const PhysicalInsert *) override; + virtual void Visit(const PhysicalInsertSelect *) override; + virtual void Visit(const PhysicalDelete *) override; + virtual void Visit(const PhysicalUpdate *) override; + virtual void Visit(const PhysicalHashGroupBy *) override; + virtual void Visit(const PhysicalSortGroupBy *) override; + virtual void Visit(const PhysicalDistinct *) override; + virtual void Visit(const PhysicalAggregate *) override; private: double HashCost(); diff --git a/src/include/optimizer/cost_calculator_factory.h b/src/include/optimizer/cost_calculator_factory.h new file mode 100644 index 00000000000..59bc95a999b --- /dev/null +++ b/src/include/optimizer/cost_calculator_factory.h @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// cost_model_factory.h +// +// Identification: src/include/optimizer/cost_model_factory.h +// +// Copyright (c) 2015-18, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once +#include "optimizer/cost_calculator.h" + +#include "common/exception.h" + +namespace peloton { +namespace optimizer { + +class CostCalculatorFactory { + public: + /* + * Creates the respective cost calculator given a cost calculator name + */ + static std::unique_ptr CreateCostCalculator( + const std::string &cost_model_name); +}; + +} // namespace peloton +} // namespace optimizer diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 6eafa8eb26f..0e2a7393962 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -15,6 +15,7 @@ #include #include "optimizer/abstract_optimizer.h" +#include "optimizer/abstract_cost_calculator.h" #include "optimizer/property_set.h" #include "optimizer/optimizer_metadata.h" @@ -38,9 +39,9 @@ class TransactionContext; } namespace test { - class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; -} +class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; +class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; +} namespace optimizer { @@ -60,8 +61,10 @@ class Optimizer : public AbstractOptimizer { friend class BindingIterator; friend class GroupBindingIterator; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; public: Optimizer(const Optimizer &) = delete; @@ -69,7 +72,8 @@ class Optimizer : public AbstractOptimizer { Optimizer(Optimizer &&) = delete; Optimizer &operator=(Optimizer &&) = delete; - Optimizer(); + Optimizer(){}; + Optimizer(std::unique_ptr cost_calculator); std::shared_ptr BuildPelotonPlanTree( const std::unique_ptr &parse_tree, @@ -83,14 +87,16 @@ class Optimizer : public AbstractOptimizer { OptimizerMetadata &GetMetadata() { return metadata_; } + AbstractCostCalculator *GetCostCalculator() { return cost_calculator_.get(); } + /* For test purposes only */ - std::shared_ptr TestInsertQueryTree(parser::SQLStatement *tree, - concurrency::TransactionContext *txn) { + std::shared_ptr TestInsertQueryTree( + parser::SQLStatement *tree, concurrency::TransactionContext *txn) { return InsertQueryTree(tree, txn); } /* For test purposes only */ void TestExecuteTaskStack(OptimizerTaskStack &task_stack, int root_group_id, - std::shared_ptr root_context) { + std::shared_ptr root_context) { return ExecuteTaskStack(task_stack, root_group_id, root_context); } @@ -153,6 +159,8 @@ class Optimizer : public AbstractOptimizer { ////////////////////////////////////////////////////////////////////////////// /// Metadata OptimizerMetadata metadata_; + /// Cost Model + std::unique_ptr cost_calculator_; }; } // namespace optimizer diff --git a/src/include/optimizer/optimizer_metadata.h b/src/include/optimizer/optimizer_metadata.h index 37232da63a4..81386bcb405 100644 --- a/src/include/optimizer/optimizer_metadata.h +++ b/src/include/optimizer/optimizer_metadata.h @@ -13,6 +13,8 @@ #pragma once #include "common/timer.h" +#include "optimizer/cost_calculator.h" +#include "optimizer/cost_calculator_factory.h" #include "optimizer/memo.h" #include "optimizer/group_expression.h" #include "optimizer/rule.h" @@ -27,20 +29,32 @@ namespace optimizer { class OptimizerTaskPool; class RuleSet; +using SettingsManager = settings::SettingsManager; +using SettingId = settings::SettingId; + class OptimizerMetadata { public: OptimizerMetadata() - : timeout_limit(settings::SettingsManager::GetInt( + : cost_calculator(CostCalculatorFactory::CreateCostCalculator( + SettingsManager::GetString(SettingId::cost_calculator))), + timeout_limit( + SettingsManager::GetInt(SettingId::task_execution_timeout)), + timer(Timer()) {} + + OptimizerMetadata(std::unique_ptr cost_calculator) + : cost_calculator(std::move(cost_calculator)), + timeout_limit(settings::SettingsManager::GetInt( settings::SettingId::task_execution_timeout)), timer(Timer()) {} Memo memo; RuleSet rule_set; OptimizerTaskPool *task_pool; + std::unique_ptr cost_calculator; catalog::CatalogCache *catalog_cache; unsigned int timeout_limit; Timer timer; - concurrency::TransactionContext* txn; + concurrency::TransactionContext *txn; void SetTaskPool(OptimizerTaskPool *task_pool) { this->task_pool = task_pool; diff --git a/src/include/optimizer/optimizer_task.h b/src/include/optimizer/optimizer_task.h index b51239cb928..507fd4f9880 100644 --- a/src/include/optimizer/optimizer_task.h +++ b/src/include/optimizer/optimizer_task.h @@ -222,8 +222,7 @@ class OptimizeInputs : public OptimizerTask { */ class DeriveStats : public OptimizerTask { public: - DeriveStats(GroupExpression *gexpr, - ExprSet required_cols, + DeriveStats(GroupExpression *gexpr, ExprSet required_cols, std::shared_ptr context) : OptimizerTask(context, OptimizerTaskType::DERIVE_STATS), gexpr_(gexpr), diff --git a/src/include/optimizer/postgres_cost_calculator.h b/src/include/optimizer/postgres_cost_calculator.h new file mode 100644 index 00000000000..1dc0a1d510b --- /dev/null +++ b/src/include/optimizer/postgres_cost_calculator.h @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// cost_and_stats_calculator.h +// +// Identification: src/include/optimizer/cost_calculator.h +// +// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "optimizer/abstract_cost_calculator.h" + +// TODO: This is not fully reflective of the postgres cost model. Currently we +// are attempting +// to emulate their hash join cost model + +namespace peloton { +namespace optimizer { + +class Memo; +// Derive cost for a physical group expression +class PostgresCostCalculator : public AbstractCostCalculator { + public: + double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) override; + + void Visit(const DummyScan *) override; + void Visit(const PhysicalSeqScan *) override; + void Visit(const PhysicalIndexScan *) override; + void Visit(const QueryDerivedScan *) override; + void Visit(const PhysicalOrderBy *) override; + void Visit(const PhysicalLimit *) override; + void Visit(const PhysicalInnerNLJoin *) override; + void Visit(const PhysicalLeftNLJoin *) override; + void Visit(const PhysicalRightNLJoin *) override; + void Visit(const PhysicalOuterNLJoin *) override; + void Visit(const PhysicalInnerHashJoin *) override; + void Visit(const PhysicalLeftHashJoin *) override; + void Visit(const PhysicalRightHashJoin *) override; + void Visit(const PhysicalOuterHashJoin *) override; + void Visit(const PhysicalInsert *) override; + void Visit(const PhysicalInsertSelect *) override; + void Visit(const PhysicalDelete *) override; + void Visit(const PhysicalUpdate *) override; + void Visit(const PhysicalHashGroupBy *) override; + void Visit(const PhysicalSortGroupBy *) override; + void Visit(const PhysicalDistinct *) override; + void Visit(const PhysicalAggregate *) override; + + private: + double HashCost(); + double SortCost(); + double GroupByCost(); + + /* Checks if keys for a join child only reference one table */ + bool IsBaseTable( + const std::vector> &keys); + + GroupExpression *gexpr_; + Memo *memo_; + concurrency::TransactionContext *txn_; + double output_cost_ = 0; +}; + +} // namespace optimizer +} // namespace peloton diff --git a/src/include/optimizer/stats/column_stats.h b/src/include/optimizer/stats/column_stats.h index e3851688425..8861d5df90d 100644 --- a/src/include/optimizer/stats/column_stats.h +++ b/src/include/optimizer/stats/column_stats.h @@ -59,11 +59,24 @@ class ColumnStats { bool is_basetable; - std::string ToString() { + std::string ToString(bool verbose = false) { std::ostringstream os; os << "column_id :" << column_id << "\n" << "column_name :" << column_name << "\n" << "num_rows :" << num_rows << "\n"; + + if (verbose) { + os << "cardinality: " << cardinality << "\n" + << "frac_null: " << frac_null << "\n"; + + if (!most_common_vals.empty()) { + os << "most common value: " << most_common_vals[0] << "\n"; + } + + if (!most_common_freqs.empty()) { + os << "most common freq: " << most_common_freqs[0] << "\n"; + } + } return os.str(); } diff --git a/src/include/planner/plan_util.h b/src/include/planner/plan_util.h index 52210386d75..f91f81653f3 100644 --- a/src/include/planner/plan_util.h +++ b/src/include/planner/plan_util.h @@ -104,6 +104,12 @@ inline void PlanUtil::GetInfo(const planner::AbstractPlan *plan, << std::endl; os << StringUtil::Indent(num_indent + peloton::ARROW_INDENT) << "Info: " << plan->GetInfo() << std::endl; + if (plan->GetPlanNodeType() == PlanNodeType::SEQSCAN) { + auto tmp = const_cast(plan); + auto scan = dynamic_cast(tmp); + os << StringUtil::Indent(num_indent + peloton::ARROW_INDENT) + << "Table: " << scan->GetTable()->GetName() << std::endl; + } auto &children = plan->GetChildren(); os << StringUtil::Indent(num_indent + peloton::ARROW_INDENT) diff --git a/src/include/settings/settings.h b/src/include/settings/settings.h index e90cb78b7da..030be2ede30 100644 --- a/src/include/settings/settings.h +++ b/src/include/settings/settings.h @@ -18,209 +18,168 @@ // CONNECTIONS //===----------------------------------------------------------------------===// // Peloton port -SETTING_int(port, - "Peloton port (default: 15721)", - 15721, - 1024, 65535, - false, false) - -// Maximum number of connections -SETTING_int(max_connections, - "Maximum number of connections (default: 64)", - 64, - 1, 512, - true, true) - -SETTING_int(rpc_port, - "Peloton rpc port (default: 15445)", - 15445, - 1024, 65535, - false, false) - -// TODO(tianyu): Remove when we change to a different rpc framework -// This is here only because capnp cannot exit gracefully and thus causes -// test failure. This is an issue with the capnp implementation and has -// been such way for a while, so it's unlikely it gets fixed. -// See: https://groups.google.com/forum/#!topic/capnproto/bgxCdqGD6oE -SETTING_bool(rpc_enabled, - "Enable rpc, this should be turned off when testing", - false, false, false) - -// Socket family -SETTING_string(socket_family, - "Socket family (default: AF_INET)", - "AF_INET", - false, false) - -// Added for SSL only begins - -// Enables SSL connection. The default value is false -SETTING_bool(ssl, "Enable SSL connection (default: true)", true, false, false) - -// Peloton private key file -// Currently use hardcoded private key path, may need to change -// to generate file dynamically at runtime -// The same applies to certificate file -SETTING_string(private_key_file, - "path to private key file", - "peloton_insecure_server.key", - false, false) - -// Peloton certificate file -SETTING_string(certificate_file, - "path to certificate file", - "peloton_insecure_server.crt", - false, false) - -// Peloton root certificate file -SETTING_string(root_cert_file, - "path to root certificate file", - "root.crt", - false, false) - -//===----------------------------------------------------------------------===// -// RESOURCE USAGE -//===----------------------------------------------------------------------===// - -SETTING_double(bnlj_buffer_size, - "The default buffer size to use for blockwise nested loop joins (default: 1 MB)", - 1.0 * 1024.0 * 1024.0, - 1.0 * 1024, - 1.0 * 1024.0 * 1024.0 * 1024, - true, true) - -// Size of the MonoQueue task queue -SETTING_int(monoqueue_task_queue_size, - "MonoQueue Task Queue Size (default: 32)", - 32, - 8, 128, - false, false) - -// Size of the MonoQueue worker pool -SETTING_int(monoqueue_worker_pool_size, - "MonoQueue Worker Pool Size (default: 4)", - 4, - 1, 32, - false, false) - -// Number of connection threads used by peloton -SETTING_int(connection_thread_count, - "Number of connection threads (default: std::hardware_concurrency())", - std::thread::hardware_concurrency(), - 1, 64, - false, false) - -SETTING_int(gc_num_threads, - "The number of Garbage collection threads to run", - 1, - 1, 128, - true, true) - -//===----------------------------------------------------------------------===// -// WRITE AHEAD LOG -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ERROR REPORTING AND LOGGING -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// SETTINGURATION -//===----------------------------------------------------------------------===// - -// Display configuration -SETTING_bool(display_settings, - "Display settings (default: false)", - false, - true, true) - -//===----------------------------------------------------------------------===// -// STATISTICS -//===----------------------------------------------------------------------===// - -// Enable or disable statistics collection -SETTING_int(stats_mode, - "Enable statistics collection (default: 0)", - static_cast(peloton::StatsType::INVALID), - 0, 16, - true, true) - -//===----------------------------------------------------------------------===// -// AI -//===----------------------------------------------------------------------===// - -// Enable or disable index tuner -SETTING_bool(index_tuner, - "Enable index tuner (default: false)", - false, - true, true) - -// Enable or disable layout tuner -SETTING_bool(layout_tuner, - "Enable layout tuner (default: false)", - false, - true, true) - -//===----------------------------------------------------------------------===// -// BRAIN -//===----------------------------------------------------------------------===// - -// Enable or disable brain -SETTING_bool(brain, - "Enable brain (default: false)", - false, - true, true) - -SETTING_string(peloton_address, - "ip and port of the peloton rpc service, address:port", - "127.0.0.1:15445", - false, false) - -// Size of the brain task queue -SETTING_int(brain_task_queue_size, - "Brain Task Queue Size (default: 32)", - 32, - 1, 128, - false, false) - -// Size of the brain worker pool -SETTING_int(brain_worker_pool_size, - "Brain Worker Pool Size (default: 1)", - 1, - 1, 16, - false, false) - -//===----------------------------------------------------------------------===// -// CODEGEN -//===----------------------------------------------------------------------===// - -SETTING_bool(codegen, - "Enable code-generation for query execution (default: true)", - true, - true, true) - - -//===----------------------------------------------------------------------===// -// Optimizer -//===----------------------------------------------------------------------===// -SETTING_bool(predicate_push_down, - "Enable predicate push-down optimization (default: true)", - true, - true, true) - -SETTING_bool(hash_join_bloom_filter, - "Enable bloom filter for hash join in codegen (default: true)", - true, - true, true) - -SETTING_int(task_execution_timeout, - "Maximum allowed length of time (in ms) for task " - "execution step of optimizer, " - "assuming one plan has been found (default 5000)", - 5000, - 1000, 60000, - true, true) - -//===----------------------------------------------------------------------===// -// GENERAL -//===----------------------------------------------------------------------===// +SETTING_int(port, "Peloton port (default: 15721)", 15721, 1024, 65535, false, + false) + + // Maximum number of connections + SETTING_int(max_connections, "Maximum number of connections (default: 64)", + 64, 1, 512, true, true) + + SETTING_int(rpc_port, "Peloton rpc port (default: 15445)", 15445, 1024, + 65535, false, false) + + // TODO(tianyu): Remove when we change to a different rpc framework + // This is here only because capnp cannot exit gracefully and thus causes + // test failure. This is an issue with the capnp implementation and has + // been such way for a while, so it's unlikely it gets fixed. + // See: https://groups.google.com/forum/#!topic/capnproto/bgxCdqGD6oE + SETTING_bool(rpc_enabled, + "Enable rpc, this should be turned off when testing", false, + false, false) + + // Socket family + SETTING_string(socket_family, "Socket family (default: AF_INET)", "AF_INET", + false, false) + + // Added for SSL only begins + + // Enables SSL connection. The default value is false + SETTING_bool(ssl, "Enable SSL connection (default: true)", true, false, + false) + + // Peloton private key file + // Currently use hardcoded private key path, may need to change + // to generate file dynamically at runtime + // The same applies to certificate file + SETTING_string(private_key_file, "path to private key file", + "peloton_insecure_server.key", false, false) + + // Peloton certificate file + SETTING_string(certificate_file, "path to certificate file", + "peloton_insecure_server.crt", false, false) + + // Peloton root certificate file + SETTING_string(root_cert_file, "path to root certificate file", "root.crt", + false, false) + + //===----------------------------------------------------------------------===// + // RESOURCE USAGE + //===----------------------------------------------------------------------===// + + SETTING_double(bnlj_buffer_size, + "The default buffer size to use for blockwise nested loop " + "joins (default: 1 MB)", + 1.0 * 1024.0 * 1024.0, 1.0 * 1024, + 1.0 * 1024.0 * 1024.0 * 1024, true, true) + + // Size of the MonoQueue task queue + SETTING_int(monoqueue_task_queue_size, + "MonoQueue Task Queue Size (default: 32)", 32, 8, 128, false, + false) + + // Size of the MonoQueue worker pool + SETTING_int(monoqueue_worker_pool_size, + "MonoQueue Worker Pool Size (default: 4)", 4, 1, 32, false, + false) + + // Number of connection threads used by peloton + SETTING_int( + connection_thread_count, + "Number of connection threads (default: std::hardware_concurrency())", + std::thread::hardware_concurrency(), 1, 64, false, false) + + SETTING_int(gc_num_threads, + "The number of Garbage collection threads to run", 1, 1, + 128, true, true) + + //===----------------------------------------------------------------------===// + // WRITE AHEAD LOG + //===----------------------------------------------------------------------===// + + //===----------------------------------------------------------------------===// + // ERROR REPORTING AND LOGGING + //===----------------------------------------------------------------------===// + + //===----------------------------------------------------------------------===// + // SETTINGURATION + //===----------------------------------------------------------------------===// + + // Display configuration + SETTING_bool(display_settings, "Display settings (default: false)", false, + true, true) + + //===----------------------------------------------------------------------===// + // STATISTICS + //===----------------------------------------------------------------------===// + + // Enable or disable statistics collection + SETTING_int(stats_mode, "Enable statistics collection (default: 0)", + static_cast(peloton::StatsType::INVALID), 0, 16, true, + true) + + //===----------------------------------------------------------------------===// + // AI + //===----------------------------------------------------------------------===// + + // Enable or disable index tuner + SETTING_bool(index_tuner, "Enable index tuner (default: false)", false, + true, true) + + // Enable or disable layout tuner + SETTING_bool(layout_tuner, "Enable layout tuner (default: false)", false, + true, true) + + //===----------------------------------------------------------------------===// + // BRAIN + //===----------------------------------------------------------------------===// + + // Enable or disable brain + SETTING_bool(brain, "Enable brain (default: false)", false, true, true) + + SETTING_string(peloton_address, + "ip and port of the peloton rpc service, address:port", + "127.0.0.1:15445", false, false) + + // Size of the brain task queue + SETTING_int(brain_task_queue_size, "Brain Task Queue Size (default: 32)", + 32, 1, 128, false, false) + + // Size of the brain worker pool + SETTING_int(brain_worker_pool_size, "Brain Worker Pool Size (default: 1)", + 1, 1, 16, false, false) + + //===----------------------------------------------------------------------===// + // CODEGEN + //===----------------------------------------------------------------------===// + + SETTING_bool(codegen, + "Enable code-generation for query execution (default: true)", + true, true, true) + + //===----------------------------------------------------------------------===// + // Optimizer + //===----------------------------------------------------------------------===// + SETTING_bool(predicate_push_down, + "Enable predicate push-down optimization (default: true)", + true, true, true) + + SETTING_bool( + hash_join_bloom_filter, + "Enable bloom filter for hash join in codegen (default: true)", + true, true, true) + + SETTING_int(task_execution_timeout, + "Maximum allowed length of time (in ms) for task " + "execution step of optimizer, " + "assuming one plan has been found (default 5000)", + 5000, 1000, 60000, true, true) + + SETTING_string(cost_calculator, + "The cost calculator (cost model) used by the " + "optimizer. Options currently are " + "(CostCalculator, PostgresCostCalculator)", + "CostCalculator", false, false) + + //===----------------------------------------------------------------------===// + // GENERAL + //===----------------------------------------------------------------------===// diff --git a/src/optimizer/child_stats_deriver.cpp b/src/optimizer/child_stats_deriver.cpp index 0833d55a0f0..1c9fda0a2fd 100644 --- a/src/optimizer/child_stats_deriver.cpp +++ b/src/optimizer/child_stats_deriver.cpp @@ -67,7 +67,7 @@ void ChildStatsDeriver::PassDownColumn(expression::AbstractExpression *col) { auto child_group = memo_->GetGroupByID(gexpr_->GetChildGroupId(idx)); if (child_group->GetTableAliases().count(tv_expr->GetTableName()) && // If we have not derived the column stats yet - child_group->HasColumnStats(tv_expr->GetColFullName())) { + !child_group->HasColumnStats(tv_expr->GetColFullName())) { output_[idx].insert(col); break; } diff --git a/src/optimizer/cost_calculator.cpp b/src/optimizer/cost_calculator.cpp index 5dda9e67c8a..656aed83082 100644 --- a/src/optimizer/cost_calculator.cpp +++ b/src/optimizer/cost_calculator.cpp @@ -12,14 +12,10 @@ #include "optimizer/cost_calculator.h" -#include - #include "catalog/table_catalog.h" #include "optimizer/memo.h" -#include "optimizer/operators.h" #include "optimizer/stats/cost.h" #include "optimizer/stats/stats_storage.h" -#include "optimizer/stats/table_stats.h" namespace peloton { namespace optimizer { @@ -30,6 +26,7 @@ double CostCalculator::CalculateCost(GroupExpression *gexpr, Memo *memo, memo_ = memo; txn_ = txn; gexpr_->Op().Accept(this); + return output_cost_; } @@ -88,7 +85,9 @@ void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) { memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); auto right_child_rows = memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); - // TODO(boweic): Build (left) table should have different cost to probe table + + auto right_group = memo_->GetGroupByID(gexpr_->GetChildGroupId(1)); + LOG_INFO("%f", right_group->GetCostLB()); output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST; } void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {} @@ -142,5 +141,6 @@ double CostCalculator::GroupByCost() { // O(tuple) return child_num_rows * DEFAULT_TUPLE_COST; } + } // namespace optimizer } // namespace peloton diff --git a/src/optimizer/cost_calculator_factory.cpp b/src/optimizer/cost_calculator_factory.cpp new file mode 100644 index 00000000000..d37d29057f1 --- /dev/null +++ b/src/optimizer/cost_calculator_factory.cpp @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// cost_model_factory.cpp +// +// Identification: src/optimizer/cost_model_factory.cpp +// +// Copyright (c) 2015-18, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "optimizer/cost_calculator_factory.h" +#include "optimizer/cost_calculator.h" +#include "optimizer/postgres_cost_calculator.h" + +#include "common/exception.h" + +namespace peloton { +namespace optimizer { + +std::unique_ptr +CostCalculatorFactory::CreateCostCalculator( + const std::string &cost_model_name) { + if (cost_model_name == "CostCalculator") { + return std::unique_ptr(new CostCalculator); + } else if (cost_model_name == "PostgresCostCalculator") { + return std::unique_ptr(new PostgresCostCalculator); + } else { + throw OptimizerException("Could not create cost calculator: `" + + cost_model_name + "`"); + } +} + +} // namesapce peloton +} // namesapce optimizer diff --git a/src/optimizer/group_expression.cpp b/src/optimizer/group_expression.cpp index 4d874bd27ef..b6891254b69 100644 --- a/src/optimizer/group_expression.cpp +++ b/src/optimizer/group_expression.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/group_expression.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -87,15 +87,21 @@ hash_t GroupExpression::Hash() const { bool GroupExpression::operator==(const GroupExpression &r) { bool eq = (op == r.Op()); - for (size_t i = 0; i < child_groups.size(); ++i) { - eq = eq && (child_groups[i] == r.child_groups[i]); + auto left_groups = child_groups; + auto right_groups = r.child_groups; + + std::sort(left_groups.begin(), left_groups.end()); + std::sort(right_groups.begin(), right_groups.end()); + + for (size_t i = 0; i < left_groups.size(); ++i) { + eq = eq && (left_groups[i] == right_groups[i]); } return eq; } void GroupExpression::SetRuleExplored(Rule *rule) { - rule_mask_.set(rule->GetRuleIdx()) = true; + rule_mask_.set(rule->GetRuleIdx(), true); } bool GroupExpression::HasRuleExplored(Rule *rule) { diff --git a/src/optimizer/memo.cpp b/src/optimizer/memo.cpp index db67c5424a9..ce7ed7af876 100644 --- a/src/optimizer/memo.cpp +++ b/src/optimizer/memo.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/memo.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -43,8 +43,6 @@ GroupExpression *Memo::InsertExpression(std::shared_ptr gexpr, auto it = group_expressions_.find(gexpr.get()); if (it != group_expressions_.end()) { - PELOTON_ASSERT(target_group == UNDEFINED_GROUP || - target_group == (*it)->GetGroupID()); gexpr->SetGroupID((*it)->GetGroupID()); return *it; } else { diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index a68815359cd..209dfecece2 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -20,6 +20,7 @@ #include "common/exception.h" +#include "optimizer/cost_calculator.h" #include "optimizer/binding.h" #include "optimizer/operator_visitor.h" #include "optimizer/properties.h" @@ -60,7 +61,10 @@ namespace optimizer { //===--------------------------------------------------------------------===// // Optimizer //===--------------------------------------------------------------------===// -Optimizer::Optimizer() {} + +Optimizer::Optimizer(std::unique_ptr cost_calculator) { + metadata_ = OptimizerMetadata(std::move(cost_calculator)); +} void Optimizer::OptimizeLoop(int root_group_id, std::shared_ptr required_props) { @@ -141,7 +145,9 @@ shared_ptr Optimizer::BuildPelotonPlanTree( } } -void Optimizer::Reset() { metadata_ = OptimizerMetadata(); } +void Optimizer::Reset() { + metadata_ = OptimizerMetadata(std::move(metadata_.cost_calculator)); +} unique_ptr Optimizer::HandleDDLStatement( parser::SQLStatement *tree, bool &is_ddl_stmt, diff --git a/src/optimizer/optimizer_task.cpp b/src/optimizer/optimizer_task.cpp index f0a489906ae..a50a07db2a1 100644 --- a/src/optimizer/optimizer_task.cpp +++ b/src/optimizer/optimizer_task.cpp @@ -16,7 +16,6 @@ #include "optimizer/optimizer_metadata.h" #include "optimizer/binding.h" #include "optimizer/child_property_deriver.h" -#include "optimizer/cost_calculator.h" #include "optimizer/stats_calculator.h" #include "optimizer/child_stats_deriver.h" @@ -218,7 +217,8 @@ void DeriveStats::execute() { bool derive_children = false; // If we haven't got enough stats to compute the current stats, derive them // from the child first - PELOTON_ASSERT(children_required_stats.size() == gexpr_->GetChildrenGroupsSize()); + PELOTON_ASSERT(children_required_stats.size() == + gexpr_->GetChildrenGroupsSize()); for (size_t idx = 0; idx < children_required_stats.size(); ++idx) { auto &child_required_stats = children_required_stats[idx]; auto child_group_id = gexpr_->GetChildGroupId(idx); @@ -291,8 +291,7 @@ void OptimizeInputs::execute() { // Compute the cost of the root operator // 1. Collect stats needed and cache them in the group // 2. Calculate cost based on children's stats - CostCalculator cost_calculator; - cur_total_cost_ += cost_calculator.CalculateCost( + cur_total_cost_ += context_->metadata->cost_calculator->CalculateCost( group_expr_, &context_->metadata->memo, context_->metadata->txn); } @@ -305,21 +304,24 @@ void OptimizeInputs::execute() { // Check whether the child group is already optimized for the prop auto child_best_expr = child_group->GetBestExpression(i_prop); if (child_best_expr != nullptr) { // Directly get back the best expr if - // the child group is optimized + // the child group is optimized cur_total_cost_ += child_best_expr->GetCost(i_prop); // Pruning if (cur_total_cost_ > context_->cost_upper_bound) break; } else if (pre_child_idx_ != cur_child_idx_) { // First time to optimize child group - pre_child_idx_ = cur_child_idx_; + auto new_upper_bound = context_->cost_upper_bound - cur_total_cost_; + // Reset child idx and total cost + pre_child_idx_ = -1; + cur_child_idx_ = 0; + cur_total_cost_ = 0; PushTask(new OptimizeInputs(this)); PushTask(new OptimizeGroup( child_group, std::make_shared( - context_->metadata, i_prop, - context_->cost_upper_bound - cur_total_cost_))); + context_->metadata, i_prop, new_upper_bound))); return; } else { // If we return from OptimizeGroup, then there is no expr for - // the context + // the context break; } } @@ -366,8 +368,7 @@ void OptimizeInputs::execute() { // Cost the enforced expression auto extended_prop_set = std::make_shared(extended_output_properties); - CostCalculator cost_calculator; - cur_total_cost_ += cost_calculator.CalculateCost( + cur_total_cost_ += context_->metadata->cost_calculator->CalculateCost( memo_enforced_expr, &context_->metadata->memo, context_->metadata->txn); diff --git a/src/optimizer/postgres_cost_calculator.cpp b/src/optimizer/postgres_cost_calculator.cpp new file mode 100644 index 00000000000..cdc37421722 --- /dev/null +++ b/src/optimizer/postgres_cost_calculator.cpp @@ -0,0 +1,234 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// postgres_cost_calculator.cpp +// +// Identification: src/optimizer/cost_calculator.cpp +// +// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "optimizer/postgres_cost_calculator.h" + +#include "expression/tuple_value_expression.h" +#include "catalog/table_catalog.h" +#include "optimizer/memo.h" +#include "optimizer/operators.h" +#include "optimizer/stats/cost.h" +#include "optimizer/stats/stats_storage.h" +#include "optimizer/stats/table_stats.h" + +#include + +namespace peloton { +namespace optimizer { + +double PostgresCostCalculator::CalculateCost( + GroupExpression *gexpr, Memo *memo, concurrency::TransactionContext *txn) { + gexpr_ = gexpr; + memo_ = memo; + txn_ = txn; + gexpr_->Op().Accept(this); + + return output_cost_; +} + +void PostgresCostCalculator::Visit(UNUSED_ATTRIBUTE const DummyScan *op) { + output_cost_ = 0.f; +} +void PostgresCostCalculator::Visit(const PhysicalSeqScan *op) { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0) { + output_cost_ = 1.f; + return; + } + output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST; +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalIndexScan *op) { + auto table_stats = std::dynamic_pointer_cast( + StatsStorage::GetInstance()->GetTableStats( + op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); + if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) { + output_cost_ = 0.f; + return; + } + // Index search cost + scan cost + output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST + + memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * + DEFAULT_TUPLE_COST; +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const QueryDerivedScan *op) { + output_cost_ = 0.f; +} + +void PostgresCostCalculator::Visit(const PhysicalOrderBy *) { SortCost(); } + +void PostgresCostCalculator::Visit(const PhysicalLimit *op) { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + output_cost_ = + std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST; +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) { + auto left_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + auto right_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); + + output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST; +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {} + +/* The main idea of this cost estimate is that the comparisons done is the outer + * table (probe side) times tuples + * bucket. Thus, we estimate attempt to estimate the bucket size in a similar + * manner to postgres. + */ +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) { + auto bucket_size_frac = 1.0; + + // Assuming you build table on right relation + if (IsBaseTable(op->right_keys)) { + auto right_group = memo_->GetGroupByID(gexpr_->GetChildGroupId(1)); + + // Iterate over all keys, take the largest fraction (smallest bucket sizes) + // TODO: Add more estimate adjustments from postgres + for (auto &expr : op->right_keys) { + auto tv_expr = + reinterpret_cast(expr.get()); + auto stats = right_group->GetStats(tv_expr->GetColFullName()); + + if (stats == nullptr) continue; + + // TODO: A new hash join PR uses 256 as default so we're using this for + // now and hardcoding it here + auto num_buckets = 256.0; + + double frac_est; + + if (stats->cardinality > num_buckets) { + frac_est = 1.0 / num_buckets; + } else { + frac_est = 1.0 / stats->cardinality; + } + /* Average frequency of values, taken from Postgres */ + auto avgfreq = (1.0 - stats->frac_null) / stats->cardinality; + + // Adjust for skew (Highest freq / avg freq) + if (avgfreq > 0.0 && !stats->most_common_vals.empty() && + !stats->most_common_freqs.empty() && + (stats->most_common_freqs[0] / stats->num_rows) > avgfreq) { + frac_est *= (stats->most_common_freqs[0] / stats->num_rows) / avgfreq; + } + + // Clamp the bucket frac estimate (taken from postgres) + if (frac_est < 1.0e-6) { + frac_est = 1.0e-6; + } else if (frac_est > 1.0) { + frac_est = 1.0; + } + bucket_size_frac = std::min(bucket_size_frac, frac_est); + } + } + LOG_TRACE("Bucket_size_frac: %f", bucket_size_frac); + + auto left_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + auto right_child_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); + output_cost_ = (left_child_rows * (right_child_rows * bucket_size_frac)) * + DEFAULT_TUPLE_COST; +} + +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {} +void PostgresCostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {} +void PostgresCostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {} +void PostgresCostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) { + // TODO(boweic): Integrate hash in groupby may cause us to miss the + // opportunity to further optimize some query where the child output is + // already hashed by the GroupBy key, we'll do a hash anyway + output_cost_ = HashCost() + GroupByCost(); +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) { + // Sort group by does not sort the tuples, it requires input columns to be + // sorted + output_cost_ = GroupByCost(); +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalDistinct *op) { + output_cost_ = HashCost(); +} +void PostgresCostCalculator::Visit( + UNUSED_ATTRIBUTE const PhysicalAggregate *op) { + // TODO(boweic): Ditto, separate groupby operator and implementation(e.g. + // hash, sort) may enable opportunity for further optimization + output_cost_ = HashCost() + GroupByCost(); +} + +double PostgresCostCalculator::HashCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; +} + +double PostgresCostCalculator::SortCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + + if (child_num_rows == 0) { + return 1.0f; + } + // O(tuple * log(tuple)) + return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST; +} + +double PostgresCostCalculator::GroupByCost() { + auto child_num_rows = + memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); + // O(tuple) + return child_num_rows * DEFAULT_TUPLE_COST; +} + +// Might be better to implement this using groups as opposed to table names +bool PostgresCostCalculator::IsBaseTable( + const std::vector> &keys) { + std::unordered_set seen_set; + + for (auto &expr : keys) { + if (expr->GetExpressionType() != ExpressionType::VALUE_TUPLE) continue; + + auto tv_expr = + reinterpret_cast(expr.get()); + seen_set.insert(tv_expr->GetTableName()); + } + + return seen_set.size() == 1; +} + +} // namespace optimizer +} // namespace peloton diff --git a/test/optimizer/optimizer_rule_test.cpp b/test/optimizer/optimizer_rule_test.cpp index fbcac54f9b4..a83275eaf13 100644 --- a/test/optimizer/optimizer_rule_test.cpp +++ b/test/optimizer/optimizer_rule_test.cpp @@ -260,5 +260,34 @@ TEST_F(OptimizerRuleTests, SimpleAssociativeRuleTest2) { delete root_context; } +TEST_F(OptimizerRuleTests, RuleBitmapTest) { + Optimizer optimizer; + auto &memo = optimizer.GetMetadata().memo; + + auto dummy_operator = std::make_shared(LogicalGet::make()); + auto dummy_group = memo.InsertExpression(optimizer.GetMetadata().MakeGroupExpression(dummy_operator), false); + + auto rule1 = new InnerJoinCommutativity(); + auto rule2 = new GetToSeqScan(); + + EXPECT_FALSE(dummy_group->HasRuleExplored(rule1)); + EXPECT_FALSE(dummy_group->HasRuleExplored(rule2)); + + dummy_group->SetRuleExplored(rule1); + + EXPECT_TRUE(dummy_group->HasRuleExplored(rule1)); + EXPECT_FALSE(dummy_group->HasRuleExplored(rule2)); + + dummy_group->SetRuleExplored(rule2); + + EXPECT_TRUE(dummy_group->HasRuleExplored(rule1)); + EXPECT_TRUE(dummy_group->HasRuleExplored(rule2)); + + delete rule1; + delete rule2; +} + + + } // namespace test } // namespace peloton diff --git a/test/optimizer/plan_selection_test.cpp b/test/optimizer/plan_selection_test.cpp new file mode 100644 index 00000000000..b8e70fb078a --- /dev/null +++ b/test/optimizer/plan_selection_test.cpp @@ -0,0 +1,668 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// plan_selection_test.cpp +// +// Identification: test/optimizer/plan_selection_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "binder/bind_node_visitor.h" +#include "common/harness.h" +#include "common/statement.h" +#include "common/timer.h" +#include "concurrency/transaction_manager_factory.h" +#include "executor/testing_executor_util.h" +#include "expression/operator_expression.h" +#include "optimizer/cost_calculator.h" +#include "optimizer/memo.h" +#include "optimizer/operator_expression.h" +#include "optimizer/operators.h" +#include "optimizer/optimizer.h" +#include "optimizer/rule_impls.h" +#include "planner/delete_plan.h" +#include "planner/insert_plan.h" +#include "planner/update_plan.h" +#include "sql/testing_sql_util.h" + +namespace peloton { + +namespace optimizer { + +class BadCostModel : public CostCalculator { + double CalculateCost(GroupExpression *gexpr, Memo *memo, + concurrency::TransactionContext *txn) override { + return -1 * CostCalculator::CalculateCost(gexpr, memo, txn); + } +}; + +} // namespace optimizer + +namespace test { + +constexpr auto table_1_name = "test1"; +constexpr auto table_2_name = "test2"; +constexpr auto table_3_name = "test3"; +constexpr auto column_1_name = "a"; +constexpr auto column_2_name = "b"; +constexpr auto column_3_name = "c"; + +using AbstractCostCalculatorUniqPtr = + std::unique_ptr; + +class PlanSelectionTest : public PelotonTest { + protected: + void SetUp() override { + TestingExecutorUtil::InitializeDatabase(DEFAULT_DB_NAME); + + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + CreateTestTable(table_1_name); + CreateTestTable(table_2_name); + CreateTestTable(table_3_name); + + txn_manager.CommitTransaction(txn); + + bad_cost_calculator_ = + AbstractCostCalculatorUniqPtr(new peloton::optimizer::BadCostModel()); + } + + void TearDown() override { + TestingExecutorUtil::DeleteDatabase(DEFAULT_DB_NAME); + PelotonTest::TearDown(); + } + + std::shared_ptr PerformTransactionAndGetPlan( + const std::string &query) { + auto cost_calculator = + AbstractCostCalculatorUniqPtr(new peloton::optimizer::CostCalculator); + return PerformTransactionAndGetPlan(query, std::move(cost_calculator)); + } + + std::shared_ptr PerformTransactionAndGetPlan( + const std::string &query, AbstractCostCalculatorUniqPtr cost_calculator) { + /* + * Generates the optimizer plan for the given query and runs the transaction + */ + + // Generate plan + auto &peloton_parser = parser::PostgresParser::GetInstance(); + auto raw_stmt = peloton_parser.BuildParseTree(query); + + std::unique_ptr &stmt(raw_stmt); + + optimizer::Optimizer optimizer{std::move(cost_calculator)}; + + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + auto plan = optimizer.BuildPelotonPlanTree(stmt, DEFAULT_DB_NAME, txn); + timer_.Start(); + txn_manager.CommitTransaction(txn); + timer_.Stop(); + LOG_INFO("Query Execution Duration: %f", timer_.GetDuration()); + PrintPlan(plan); + + return plan; + } + + std::string CreateTwoWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &column_1, + const std::string &column_2) { + return CreateTwoWayJoinQuery(table_1, table_2, column_1, column_2, "", ""); + } + + std::string CreateTwoWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &column_1, + const std::string &column_2, + const std::string &order_by_table, + const std::string &order_by_column) { + std::stringstream ss; + ss << "SELECT * FROM " << table_1 << ", " << table_2 << " WHERE " << table_1 + << "." << column_1 << " = " << table_2 << "." << column_2; + if (!order_by_column.empty() and !order_by_table.empty()) { + ss << " ORDER BY " << order_by_table << "." << order_by_column; + } + ss << ";"; + return ss.str(); + } + + std::string CreateThreeWayJoinQuery(const std::string &table_1, + const std::string &table_2, + const std::string &table_3, + const std::string &column_1, + const std::string &column_2, + const std::string &column_3) { + return CreateThreeWayJoinQuery(table_1, table_2, table_3, column_1, + column_2, column_3, "", ""); + } + + std::string CreateThreeWayJoinQuery( + const std::string &table_1, const std::string &table_2, + const std::string &table_3, const std::string &column_1, + const std::string &column_2, const std::string &column_3, + const std::string &order_by_table, const std::string &order_by_column) { + std::stringstream ss; + ss << "SELECT * FROM " << table_1 << ", " << table_2 << "," << table_3 + << " WHERE " << table_1 << "." << column_1 << " = " << table_2 << "." + << column_2 << " AND " << table_2 << "." << column_2 << " = " << table_3 + << "." << column_3; + if (!order_by_column.empty() and !order_by_table.empty()) { + ss << " ORDER BY " << order_by_table << "." << order_by_column; + } + ss << ";"; + return ss.str(); + } + + void InsertDataHelper(const std::string &table_name, int tuple_count) { + int batch_size = 1000; + std::stringstream ss; + auto count = 0; + if (tuple_count > batch_size) { + for (int i = 0; i < tuple_count; i += batch_size) { + ss.str(std::string()); + ss << "INSERT INTO " << table_name << " VALUES "; + for (int j = 1; j <= batch_size; j++) { + count++; + ss << "(" << count << ", 1.1, 'abcd')"; + if (j < batch_size) { + ss << ","; + } + } + ss << ";"; + TestingSQLUtil::ExecuteSQLQuery(ss.str()); + } + } else { + ss << "INSERT INTO " << table_name << " VALUES "; + for (int i = 1; i <= tuple_count; i++) { + ss << "(" << i << ", 1.1, 'abcd')"; + if (i < tuple_count) { + ss << ","; + } + count++; + } + ss << ";"; + TestingSQLUtil::ExecuteSQLQuery(ss.str()); + } + LOG_INFO("COUNT: %d", count); + } + + void AnalyzeTable(const std::string &table_name) { + std::stringstream ss; + ss << "ANALYZE " << table_name << ";"; + TestingSQLUtil::ExecuteSQLQuery(ss.str()); + } + + void PrintPlan(std::shared_ptr plan, int level = 0) { + PrintPlan(plan.get(), level); + } + + void PrintPlan(planner::AbstractPlan *plan, int level = 0) { + auto spacing = std::string(level, '\t'); + + if (plan->GetPlanNodeType() == PlanNodeType::SEQSCAN) { + auto scan = dynamic_cast(plan); + LOG_DEBUG("%s%s(%s)", spacing.c_str(), scan->GetInfo().c_str(), + scan->GetTable()->GetName().c_str()); + } else { + LOG_DEBUG("%s%s", spacing.c_str(), plan->GetInfo().c_str()); + } + + for (size_t i = 0; i < plan->GetChildren().size(); i++) { + PrintPlan(plan->GetChildren()[i].get(), level + 1); + } + + return; + } + + Timer timer_; + AbstractCostCalculatorUniqPtr bad_cost_calculator_; + + private: + void CreateTestTable(const std::string &table_name) { + std::stringstream ss; + ss << "CREATE TABLE " << table_name << "(" << column_1_name + << " INT PRIMARY KEY, " << column_2_name << " DECIMAL, " << column_3_name + << " VARCHAR);"; + TestingSQLUtil::ExecuteSQLQuery(ss.str()); + } +}; + +TEST_F(PlanSelectionTest, SimpleJoinOrderTestSmall1) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 1; + int test2_table_size = 100; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_1_name, table_2_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::NESTLOOP); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(0, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1].get()); + + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_1_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_2_name); +} + +TEST_F(PlanSelectionTest, ThreeWayJoinTest) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 2; + int test2_table_size = 10; + int test3_table_size = 100; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + InsertDataHelper(table_3_name, test3_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + AnalyzeTable(table_3_name); + + auto plan = PerformTransactionAndGetPlan( + CreateThreeWayJoinQuery(table_1_name, table_2_name, table_3_name, + column_1_name, column_1_name, column_1_name)); + + // TODO: Add checks +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderTestWorstCaseSmall1) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 1; + int test2_table_size = 100; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan( + CreateTwoWayJoinQuery(table_1_name, table_2_name, column_1_name, + column_1_name), + std::move(bad_cost_calculator_)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::HASHJOIN); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = dynamic_cast( + plan->GetChildren()[1]->GetChildren()[0].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_1_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_2_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderSmallTest2) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 100; + int test2_table_size = 1; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_1_name, table_2_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::NESTLOOP); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(0, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_1_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_2_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderSmallTest3) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 1; + int test2_table_size = 100; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_2_name, table_1_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::NESTLOOP); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(0, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1].get()); + + // TODO: Join order seems to follow order of join in query. This should really + // be swapped. Set + // to this way so the tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_2_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_1_name); +} + +/* The similarity between this and the SimpleJoinOrderSmallTest2 is to ensure + * that the optimizer does not pick + * joins simply based on the order the tables are written in the SQL query. + */ +TEST_F(PlanSelectionTest, SimpleJoinOrderSmallTest4) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 100; + int test2_table_size = 1; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_2_name, table_1_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::NESTLOOP); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(0, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = + dynamic_cast(plan->GetChildren()[1].get()); + + // TODO: Join order seems to follow order of join in query. This should really + // be swapped. Set + // to this way so the tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_2_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_1_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderLargeTest1) { + // TODO: move this to another testing framework (currently takes too long) + // Same applies to other "Long" tests + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 10; + int test2_table_size = 1000; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto query = "SELECT COUNT(*) from test2;"; + std::vector results; + TestingSQLUtil::ExecuteSQLQuery(query, results); + for (auto result : results) { + LOG_INFO("RESULT: %s", result.c_str()); + } + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_1_name, table_2_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::HASHJOIN); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = dynamic_cast( + plan->GetChildren()[1]->GetChildren()[0].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_1_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_2_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderLargeTest2) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 1000; + int test2_table_size = 10; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_1_name, table_2_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::HASHJOIN); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = dynamic_cast( + plan->GetChildren()[1]->GetChildren()[0].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_2_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_1_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderLargeTest3) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 10; + int test2_table_size = 1000; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_2_name, table_1_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::HASHJOIN); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = dynamic_cast( + plan->GetChildren()[1]->GetChildren()[0].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_1_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_2_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderLargeTest4) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + int test1_table_size = 1000; + int test2_table_size = 10; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan(CreateTwoWayJoinQuery( + table_2_name, table_1_name, column_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::HASHJOIN); + + EXPECT_EQ(2, plan->GetChildren().size()); + EXPECT_EQ(PlanNodeType::SEQSCAN, plan->GetChildren()[0]->GetPlanNodeType()); + EXPECT_EQ(PlanNodeType::HASH, plan->GetChildren()[1]->GetPlanNodeType()); + + EXPECT_EQ(0, plan->GetChildren()[0]->GetChildren().size()); + EXPECT_EQ(1, plan->GetChildren()[1]->GetChildren().size()); + + auto left_scan = + dynamic_cast(plan->GetChildren()[0].get()); + auto right_scan = dynamic_cast( + plan->GetChildren()[1]->GetChildren()[0].get()); + + // TODO: This should actually be reversed, setting it to this now so that the + // tests pass + ASSERT_STREQ(left_scan->GetTable()->GetName().c_str(), table_2_name); + ASSERT_STREQ(right_scan->GetTable()->GetName().c_str(), table_1_name); +} + +TEST_F(PlanSelectionTest, SimpleJoinOrderSortedTest) { + // Create and populate tables + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Populate Tables table + const unsigned int test1_table_size = 1; + const unsigned int test2_table_size = 100; + + InsertDataHelper(table_1_name, test1_table_size); + InsertDataHelper(table_2_name, test2_table_size); + + txn_manager.CommitTransaction(txn); + + AnalyzeTable(table_1_name); + AnalyzeTable(table_2_name); + + auto plan = PerformTransactionAndGetPlan( + CreateTwoWayJoinQuery(table_1_name, table_2_name, column_1_name, + column_1_name, table_1_name, column_1_name)); + + EXPECT_TRUE(plan->GetPlanNodeType() == PlanNodeType::PROJECTION); + + EXPECT_EQ(2, plan->GetChildren().size()); + + // TODO: figure out other checks +} +} +}