diff --git a/src/tests/all_tables_config.cpp b/src/tests/all_tables_config.cpp new file mode 100644 index 0000000000..183697ed8e --- /dev/null +++ b/src/tests/all_tables_config.cpp @@ -0,0 +1,36 @@ +#include "all_tables_config.h" + +namespace tests { +extern TableConfig const kWDC_astronomical{"WDC_astronomical.csv", ',', true}; +extern TableConfig const kWDC_symbols{"WDC_symbols.csv", ',', true}; +extern TableConfig const kWDC_science{"WDC_science.csv", ',', true}; +extern TableConfig const kWDC_satellites{"WDC_satellites.csv", ',', true}; +extern TableConfig const kWDC_appearances{"WDC_appearances.csv", ',', true}; +extern TableConfig const kWDC_astrology{"WDC_astrology.csv", ',', true}; +extern TableConfig const kWDC_game{"WDC_game.csv", ',', true}; +extern TableConfig const kWDC_kepler{"WDC_kepler.csv", ',', true}; +extern TableConfig const kWDC_planetz{"WDC_planetz.csv", ',', true}; +extern TableConfig const kWDC_age{"WDC_age.csv", ',', true}; +extern TableConfig const kTestWide{"TestWide.csv", ',', true}; +extern TableConfig const kabalone{"abalone.csv", ',', false}; +extern TableConfig const kiris{"iris.csv", ',', false}; +extern TableConfig const kadult{"adult.csv", ';', false}; +extern TableConfig const kbreast_cancer{"breast_cancer.csv", ',', true}; +extern TableConfig const kCIPublicHighway10k{"CIPublicHighway10k.csv", ',', true}; +extern TableConfig const kneighbors10k{"neighbors10k.csv", ',', true}; +extern TableConfig const kneighbors50k{"neighbors50k.csv", ',', true}; +extern TableConfig const kneighbors100k{"neighbors100k.csv", ',', true}; +extern TableConfig const kCIPublicHighway700{"CIPublicHighway700.csv", ',', true}; +extern TableConfig const kEpicVitals{"EpicVitals.csv", '|', true}; +extern TableConfig const kEpicMeds{"EpicMeds.csv", '|', true}; +extern TableConfig const kiowa1kk{"iowa1kk.csv", ',', true}; +extern TableConfig const kfd_reduced_30{"fd-reduced-30.csv", ',', true}; +extern TableConfig const kflight_1k{"flight_1k.csv", ';', true}; +extern TableConfig const kplista_1k{"plista_1k.csv", ';', false}; +extern TableConfig const kletter{"letter.csv", ',', false}; +extern TableConfig const kCIPublicHighway{"CIPublicHighway.csv", ',', true}; +extern TableConfig const kLegacyPayors{"LegacyPayors.csv", '|', true}; +extern TableConfig const kTestEmpty{"TestEmpty.csv", ',', true}; +extern TableConfig const kTestSingleColumn{"TestSingleColumn.csv", ',', true}; +extern TableConfig const kTestLong{"TestLong.csv", ',', true}; +} // namespace tests diff --git a/src/tests/all_tables_config.h b/src/tests/all_tables_config.h new file mode 100644 index 0000000000..b2155f2c8a --- /dev/null +++ b/src/tests/all_tables_config.h @@ -0,0 +1,38 @@ +#pragma once + +#include "table_config.h" + +namespace tests { +extern TableConfig const kWDC_astronomical; +extern TableConfig const kWDC_symbols; +extern TableConfig const kWDC_science; +extern TableConfig const kWDC_satellites; +extern TableConfig const kWDC_appearances; +extern TableConfig const kWDC_astrology; +extern TableConfig const kWDC_game; +extern TableConfig const kWDC_kepler; +extern TableConfig const kWDC_planetz; +extern TableConfig const kWDC_age; +extern TableConfig const kTestWide; +extern TableConfig const kabalone; +extern TableConfig const kiris; +extern TableConfig const kadult; +extern TableConfig const kbreast_cancer; +extern TableConfig const kCIPublicHighway10k; +extern TableConfig const kneighbors10k; +extern TableConfig const kneighbors50k; +extern TableConfig const kneighbors100k; +extern TableConfig const kCIPublicHighway700; +extern TableConfig const kEpicVitals; +extern TableConfig const kEpicMeds; +extern TableConfig const kiowa1kk; +extern TableConfig const kfd_reduced_30; +extern TableConfig const kflight_1k; +extern TableConfig const kplista_1k; +extern TableConfig const kletter; +extern TableConfig const kCIPublicHighway; +extern TableConfig const kLegacyPayors; +extern TableConfig const kTestEmpty; +extern TableConfig const kTestSingleColumn; +extern TableConfig const kTestLong; +} // namespace tests diff --git a/src/tests/datasets.h b/src/tests/datasets.h deleted file mode 100644 index a6317c0602..0000000000 --- a/src/tests/datasets.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -#include -#include -#include - -static const auto test_data_dir = std::filesystem::current_path() / "input_data"; - -struct Dataset { - std::string name; - size_t hash; - char separator; - bool has_header; -}; - -class LightDatasets { -public: - static inline const std::array datasets_ = { - {{"CIPublicHighway10k.csv", 33398, ',', true}, - {"neighbors10k.csv", 43368, ',', true}, - {"WDC_astronomical.csv", 22281, ',', true}, - {"WDC_age.csv", 19620, ',', true}, - {"WDC_appearances.csv", 25827, ',', true}, - {"WDC_astrology.csv", 40815, ',', true}, - {"WDC_game.csv", 6418, ',', true}, - {"WDC_science.csv", 19620, ',', true}, - {"WDC_symbols.csv", 28289, ',', true}, - {"breast_cancer.csv", 15121, ',', true}, - {"WDC_kepler.csv", 63730, ',', true}}}; - - // DEPRECATED -- just use - // for (auto dataset : LightDatasets::datasets) { ... } - static size_t DatasetQuantity() { - return datasets_.size(); - } - static std::string DatasetName(size_t i) { - return datasets_[i].name; - } - static char Separator(size_t i) { - return datasets_[i].separator; - } - static bool HasHeader(size_t i) { - return datasets_[i].has_header; - } - static unsigned int Hash(size_t i) { - return datasets_[i].hash; - } -}; - -class HeavyDatasets { -public: - static inline const std::array datasets_ = { - {{"adult.csv", 23075, ';', false}, - {"CIPublicHighway.csv", 13035, ',', true}, - {"EpicMeds.csv", 50218, '|', true}, - {"EpicVitals.csv", 2083, '|', true}, - {"iowa1kk.csv", 28573, ',', true}, - {"LegacyPayors.csv", 43612, '|', true}}}; - - // DEPRECATED -- just use - // for (auto dataset : HeavyDatasets::datasets) { ... } - static size_t DatasetQuantity() { - return datasets_.size(); - } - static std::string DatasetName(size_t i) { - return datasets_[i].name; - } - static char Separator(size_t i) { - return datasets_[i].separator; - } - static bool HasHeader(size_t i) { - return datasets_[i].has_header; - } - static unsigned int Hash(size_t i) { - return datasets_[i].hash; - } -}; diff --git a/src/tests/table_config.h b/src/tests/table_config.h new file mode 100644 index 0000000000..56252caa3c --- /dev/null +++ b/src/tests/table_config.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include + +#include "config/tabular_data/input_table_type.h" +#include "parser/csv_parser/csv_parser.h" + +namespace tests { + +static auto const test_data_dir = std::filesystem::current_path() / "input_data"; + +/// csv table configuration info to create an input table +struct TableConfig { + std::string_view name; + char separator; + bool has_header; + + std::filesystem::path GetPath() const { + return test_data_dir / name; + } + + config::InputTable MakeInputTable() const { + return std::make_shared(GetPath(), separator, has_header); + } +}; + +/// a struct consisting of a table config and the expected hash +struct TableConfigHash { + TableConfig config; + size_t hash; +}; + +/// a struct consisting of a tables config and the expected hash +struct TablesConfigHash { + std::vector configs; + size_t hash; +}; + +} // namespace tests diff --git a/src/tests/test_ac_algorithm.cpp b/src/tests/test_ac_algorithm.cpp index 929ed50f88..62965020e5 100644 --- a/src/tests/test_ac_algorithm.cpp +++ b/src/tests/test_ac_algorithm.cpp @@ -7,7 +7,7 @@ #include "algorithms/algebraic_constraints/bin_operation_enum.h" #include "algorithms/algo_factory.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" #include "types.h" namespace { diff --git a/src/tests/test_algo_interfaces.cpp b/src/tests/test_algo_interfaces.cpp index 973ea0a973..1e542b2f4d 100644 --- a/src/tests/test_algo_interfaces.cpp +++ b/src/tests/test_algo_interfaces.cpp @@ -9,7 +9,7 @@ #include "algorithms/fd/pyro/pyro.h" #include "config/error/type.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" namespace tests { diff --git a/src/tests/test_apriori.cpp b/src/tests/test_apriori.cpp index 145a16b93e..1b8b92f960 100644 --- a/src/tests/test_apriori.cpp +++ b/src/tests/test_apriori.cpp @@ -5,7 +5,7 @@ #include "algorithms/algo_factory.h" #include "algorithms/association_rules/apriori.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" namespace fs = std::filesystem; diff --git a/src/tests/test_cfd_algos.cpp b/src/tests/test_cfd_algos.cpp index 056d8c81c8..2824feb738 100644 --- a/src/tests/test_cfd_algos.cpp +++ b/src/tests/test_cfd_algos.cpp @@ -9,7 +9,7 @@ #include "algorithms/cfd/enums.h" #include "algorithms/cfd/fd_first_algorithm.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" namespace tests { namespace fs = std::filesystem; diff --git a/src/tests/test_cfd_relation_data.cpp b/src/tests/test_cfd_relation_data.cpp index a84a8d20fe..b048adf0e8 100644 --- a/src/tests/test_cfd_relation_data.cpp +++ b/src/tests/test_cfd_relation_data.cpp @@ -4,8 +4,8 @@ #include #include "algorithms/cfd/model/cfd_relation_data.h" -#include "datasets.h" #include "parser/csv_parser/csv_parser.h" +#include "table_config.h" namespace tests { diff --git a/src/tests/test_data_stats.cpp b/src/tests/test_data_stats.cpp index 5f92d06173..e25da43dfc 100644 --- a/src/tests/test_data_stats.cpp +++ b/src/tests/test_data_stats.cpp @@ -3,7 +3,7 @@ #include "algorithms/algo_factory.h" #include "algorithms/statistics/data_stats.h" -#include "datasets.h" +#include "table_config.h" namespace tests { namespace mo = model; diff --git a/src/tests/test_algorithm.cpp b/src/tests/test_fd_algorithm.cpp similarity index 70% rename from src/tests/test_algorithm.cpp rename to src/tests/test_fd_algorithm.cpp index 9429e511b1..deeb64a031 100644 --- a/src/tests/test_algorithm.cpp +++ b/src/tests/test_fd_algorithm.cpp @@ -14,15 +14,17 @@ #include "algorithms/fd/hyfd/hyfd.h" #include "algorithms/fd/pyro/pyro.h" #include "algorithms/fd/tane/tane.h" -#include "datasets.h" #include "model/table/relational_schema.h" -#include "testing_utils.h" +#include "table_config.h" +#include "test_fd_util.h" using std::string, std::vector; using ::testing::ContainerEq, ::testing::Eq; namespace fs = std::filesystem; +namespace tests { + /* This is a test suite for algorithm verification. It should be possible to run these tests for any * algorithm that: * 1. extends FDAlgorithm @@ -76,12 +78,12 @@ std::set, unsigned int>> FDsToSet(std::list< TYPED_TEST_SUITE_P(AlgorithmTest); TYPED_TEST_P(AlgorithmTest, ThrowsOnEmpty) { - auto algorithm = TestFixture::CreateAndConfToLoad(test_data_dir / "TestEmpty.csv", ',', true); + auto algorithm = TestFixture::CreateAndConfToLoad(kTestEmpty); ASSERT_THROW(algorithm->LoadData(), std::runtime_error); } TYPED_TEST_P(AlgorithmTest, ReturnsEmptyOnSingleNonKey) { - auto algorithm = TestFixture::CreateAlgorithmInstance("TestSingleColumn.csv", ',', true); + auto algorithm = TestFixture::CreateAlgorithmInstance(kTestSingleColumn); algorithm->Execute(); ASSERT_TRUE(algorithm->FdList().empty()); } @@ -89,7 +91,7 @@ TYPED_TEST_P(AlgorithmTest, ReturnsEmptyOnSingleNonKey) { TYPED_TEST_P(AlgorithmTest, WorksOnLongDataset) { std::set, unsigned int>> true_fd_collection{{{2}, 1}}; - auto algorithm = TestFixture::CreateAlgorithmInstance("TestLong.csv", ',', true); + auto algorithm = TestFixture::CreateAlgorithmInstance(kTestLong); algorithm->Execute(); ASSERT_TRUE(CheckFdListEquality(true_fd_collection, algorithm->FdList())); } @@ -98,51 +100,25 @@ TYPED_TEST_P(AlgorithmTest, WorksOnWideDataset) { std::set, unsigned int>> true_fd_collection{ {{0}, 2}, {{0}, 4}, {{2}, 0}, {{2}, 4}, {{4}, 0}, {{4}, 2}, {{}, 1}, {{}, 3}}; - auto algorithm = TestFixture::CreateAlgorithmInstance("TestWide.csv", ',', true); + auto algorithm = TestFixture::CreateAlgorithmInstance(kTestWide); algorithm->Execute(); ASSERT_TRUE(CheckFdListEquality(true_fd_collection, algorithm->FdList())); } TYPED_TEST_P(AlgorithmTest, LightDatasetsConsistentHash) { - try { - for (auto const& dataset : LightDatasets::datasets_) { - auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator, - dataset.has_header); - algorithm->Execute(); - std::cout << dataset.name << std::endl; - EXPECT_EQ(algorithm->Fletcher16(), dataset.hash) - << "FD collection hash changed for " << dataset.name; - } - } catch (std::runtime_error& e) { - std::cout << "Exception raised in test: " << e.what() << std::endl; - FAIL(); - } - SUCCEED(); + TestFixture::PerformConsistentHashTestOn(TestFixture::light_datasets_); } TYPED_TEST_P(AlgorithmTest, HeavyDatasetsConsistentHash) { - try { - for (auto const& dataset : HeavyDatasets::datasets_) { - auto algorithm = TestFixture::CreateAlgorithmInstance(dataset.name, dataset.separator, - dataset.has_header); - algorithm->Execute(); - EXPECT_EQ(algorithm->Fletcher16(), dataset.hash) - << "The new algorithm and Pyro yield different results at " << dataset.name; - } - } catch (std::runtime_error& e) { - std::cout << "Exception raised in test: " << e.what() << std::endl; - FAIL(); - } - SUCCEED(); + TestFixture::PerformConsistentHashTestOn(TestFixture::heavy_datasets_); } TYPED_TEST_P(AlgorithmTest, ConsistentRepeatedExecution) { - auto const path = test_data_dir / "WDC_astronomical.csv"; - auto algorithm = TestFixture::CreateAlgorithmInstance(path, ',', true); + auto algorithm = TestFixture::CreateAlgorithmInstance(kWDC_astronomical); algorithm->Execute(); auto first_res = FDsToSet(algorithm->FdList()); for (int i = 0; i < 3; ++i) { - algos::ConfigureFromMap(*algorithm, TestFixture::GetParamMap(path, ',', true)); + algos::ConfigureFromMap(*algorithm, TestFixture::GetParamMap(kWDC_astronomical)); algorithm->Execute(); ASSERT_TRUE(CheckFdListEquality(first_res, algorithm->FdList())); } @@ -155,3 +131,5 @@ REGISTER_TYPED_TEST_SUITE_P(AlgorithmTest, ThrowsOnEmpty, ReturnsEmptyOnSingleNo using Algorithms = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(AlgorithmTest, AlgorithmTest, Algorithms); + +} // namespace tests diff --git a/src/tests/test_fd_mine.cpp b/src/tests/test_fd_mine.cpp index cc2fa3cc30..7174aa5be1 100644 --- a/src/tests/test_fd_mine.cpp +++ b/src/tests/test_fd_mine.cpp @@ -12,8 +12,9 @@ #include "algorithms/fd/tane/tane.h" #include "config/error/type.h" #include "config/names.h" -#include "datasets.h" #include "model/table/relational_schema.h" +#include "table_config.h" +#include "test_fd_util.h" using ::testing::ContainerEq, ::testing::Eq; @@ -23,26 +24,21 @@ using std::string, std::vector; namespace onam = config::names; -StdParamsMap FD_MineGetParamMap(const std::filesystem::path& path, char separator = ',', - bool has_header = true) { - InputTable parser = std::make_unique(path, separator, has_header); - return {{config::names::kTable, parser}}; +StdParamsMap FD_MineGetParamMap(tests::TableConfig const& info) { + return {{config::names::kTable, info.MakeInputTable()}}; } -std::unique_ptr ConfToLoadFD_Mine(std::string const& path, char separator = ',', - bool has_header = true) { +std::unique_ptr ConfToLoadFD_Mine(tests::TableConfig const& info) { std::unique_ptr algorithm = std::make_unique(); - algos::ConfigureFromMap(*algorithm, FD_MineGetParamMap(path, separator, has_header)); + algos::ConfigureFromMap(*algorithm, FD_MineGetParamMap(info)); return algorithm; } -std::unique_ptr CreateFD_MineAlgorithmInstance(std::string const& path, - char separator = ',', - bool has_header = true) { - return algos::CreateAndLoadAlgorithm(FD_MineGetParamMap(path, separator, has_header)); +std::unique_ptr CreateFD_MineAlgorithmInstance(tests::TableConfig const& info) { + return algos::CreateAndLoadAlgorithm(FD_MineGetParamMap(info)); } -class AlgorithmTest : public LightDatasets, public HeavyDatasets, public ::testing::Test {}; +using FDMineAlgorithmTest = tests::AlgorithmTest; std::vector FD_MineBitsetToIndexVector(boost::dynamic_bitset<> const& bitset) { std::vector res; @@ -85,24 +81,20 @@ std::set, unsigned int>> FD_MineFDsToSet( } TEST(AlgorithmSyntheticTest, FD_Mine_ThrowsOnEmpty) { - auto path = test_data_dir / "TestEmpty.csv"; - auto algorithm = ConfToLoadFD_Mine(test_data_dir / "TestEmpty.csv", ',', true); + auto algorithm = ConfToLoadFD_Mine(tests::kTestEmpty); ASSERT_THROW(algorithm->LoadData(), std::runtime_error); } TEST(AlgorithmSyntheticTest, FD_Mine_ReturnsEmptyOnSingleNonKey) { - auto path = test_data_dir / "TestSingleColumn.csv"; - auto algorithm = CreateFD_MineAlgorithmInstance(path, ',', true); + auto algorithm = CreateFD_MineAlgorithmInstance(tests::kTestSingleColumn); algorithm->Execute(); ASSERT_TRUE(algorithm->FdList().empty()); } TEST(AlgorithmSyntheticTest, FD_Mine_WorksOnLongDataset) { - auto path = test_data_dir / "TestLong.csv"; - std::set, unsigned int>> true_fd_collection{{{2}, 1}}; - auto algorithm = CreateFD_MineAlgorithmInstance(path, ',', true); + auto algorithm = CreateFD_MineAlgorithmInstance(tests::kTestLong); algorithm->Execute(); ASSERT_TRUE(FD_Mine_CheckFDListEquality(true_fd_collection, algorithm->FdList())); } @@ -148,22 +140,20 @@ void MinimizeFDs(std::list& fd_collection) { } } -TEST_F(AlgorithmTest, FD_Mine_ReturnsSameAsPyro) { +TEST_F(FDMineAlgorithmTest, FD_Mine_ReturnsSameAsPyro) { namespace onam = config::names; try { - for (Dataset const& dataset : LightDatasets::datasets_) { + for (auto const& [config, hash] : FDMineAlgorithmTest::light_datasets_) { // TODO: change this hotfix - if (dataset.name == "breast_cancer.csv") { + if (config.name == tests::kbreast_cancer.name) { continue; } - auto path = test_data_dir / dataset.name; - auto algorithm = - CreateFD_MineAlgorithmInstance(path, dataset.separator, dataset.has_header); + auto algorithm = CreateFD_MineAlgorithmInstance(config); - StdParamsMap params_map{{onam::kCsvPath, path}, - {onam::kSeparator, dataset.separator}, - {onam::kHasHeader, dataset.has_header}, + StdParamsMap params_map{{onam::kCsvPath, config.GetPath()}, + {onam::kSeparator, config.separator}, + {onam::kHasHeader, config.has_header}, {onam::kSeed, decltype(pyro::Parameters::seed){0}}, {onam::kError, config::ErrorType{0.0}}}; auto pyro_ptr = algos::CreateAndLoadAlgorithm(params_map); @@ -192,7 +182,7 @@ TEST_F(AlgorithmTest, FD_Mine_ReturnsSameAsPyro) { std::string results_pyro = pyro.FDAlgorithm::GetJsonFDs(); EXPECT_EQ(results_pyro, algorithm_results) - << "The new algorithm and Pyro yield different results at " << dataset.name; + << "The new algorithm and Pyro yield different results at " << config.name; } } catch (std::runtime_error& e) { std::cout << "Exception raised in test: " << e.what() << std::endl; diff --git a/src/tests/test_fd_util.h b/src/tests/test_fd_util.h new file mode 100644 index 0000000000..ce3d12c52b --- /dev/null +++ b/src/tests/test_fd_util.h @@ -0,0 +1,78 @@ +#pragma once + +#include + +#include + +#include "algorithms/algo_factory.h" +#include "algorithms/fd/fd_algorithm.h" +#include "all_tables_config.h" +#include "config/error/type.h" +#include "config/names.h" +#include "table_config.h" + +namespace tests { +template +class AlgorithmTest : public ::testing::Test { +protected: + static std::unique_ptr CreateAndConfToLoad( + tests::TableConfig const& config) { + using config::InputTable, algos::ConfigureFromMap, algos::StdParamsMap; + std::unique_ptr algorithm = std::make_unique(); + ConfigureFromMap(*algorithm, + StdParamsMap{{config::names::kTable, config.MakeInputTable()}}); + return algorithm; + } + + static algos::StdParamsMap GetParamMap(tests::TableConfig const& config) { + using namespace config::names; + return { + {kTable, config.MakeInputTable()}, + {kError, config::ErrorType{0.0}}, + {kSeed, decltype(pyro::Parameters::seed){0}}, + }; + } + + static void PerformConsistentHashTestOn(std::vector const& datasets) { + try { + for (auto const& [config, hash] : datasets) { + auto algorithm = CreateAlgorithmInstance(config); + algorithm->Execute(); + EXPECT_EQ(algorithm->Fletcher16(), hash) + << "FD collection hash changed for " << config.name; + } + } catch (std::runtime_error& e) { + std::cout << "Exception raised in test: " << e.what() << std::endl; + FAIL(); + } + SUCCEED(); + } + +public: + static std::unique_ptr CreateAlgorithmInstance( + tests::TableConfig const& config) { + return algos::CreateAndLoadAlgorithm(GetParamMap(config)); + } + + inline static std::vector const light_datasets_ = { + {{tests::kCIPublicHighway10k, 33398}, + {tests::kneighbors10k, 43368}, + {tests::kWDC_astronomical, 22281}, + {tests::kWDC_age, 19620}, + {tests::kWDC_appearances, 25827}, + {tests::kWDC_astrology, 40815}, + {tests::kWDC_game, 6418}, + {tests::kWDC_science, 19620}, + {tests::kWDC_symbols, 28289}, + {tests::kbreast_cancer, 15121}, + {tests::kWDC_kepler, 63730}}}; + + inline static std::vector const heavy_datasets_ = { + {{tests::kadult, 23075}, + {tests::kCIPublicHighway, 13035}, + {tests::kEpicMeds, 50218}, + {tests::kEpicVitals, 2083}, + {tests::kiowa1kk, 28573}, + {tests::kLegacyPayors, 43612}}}; +}; +} // namespace tests diff --git a/src/tests/test_fd_verifier.cpp b/src/tests/test_fd_verifier.cpp index e265a1ff5d..2976e2aaa7 100644 --- a/src/tests/test_fd_verifier.cpp +++ b/src/tests/test_fd_verifier.cpp @@ -7,9 +7,9 @@ #include "algo_factory.h" #include "builtin.h" #include "config/indices/type.h" -#include "datasets.h" #include "fd/fd_verifier/fd_verifier.h" #include "fd/fd_verifier/stats_calculator.h" +#include "table_config.h" namespace { using namespace algos::fd_verifier; diff --git a/src/tests/test_metric_verifier.cpp b/src/tests/test_metric_verifier.cpp index 17cc74fdda..510142c8e2 100644 --- a/src/tests/test_metric_verifier.cpp +++ b/src/tests/test_metric_verifier.cpp @@ -11,7 +11,7 @@ #include "algorithms/metric/enums.h" #include "algorithms/metric/metric_verifier.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" namespace tests { namespace onam = config::names; diff --git a/src/tests/test_typed_column_data.cpp b/src/tests/test_typed_column_data.cpp index 808320d459..6dd307dda6 100644 --- a/src/tests/test_typed_column_data.cpp +++ b/src/tests/test_typed_column_data.cpp @@ -4,9 +4,9 @@ #include #include "algorithms/fd/fd_algorithm.h" -#include "datasets.h" #include "model/table/column_layout_typed_relation_data.h" #include "parser/csv_parser/csv_parser.h" +#include "table_config.h" namespace tests { diff --git a/src/tests/test_typo_miner.cpp b/src/tests/test_typo_miner.cpp index 27632bbc75..26788d3a86 100644 --- a/src/tests/test_typo_miner.cpp +++ b/src/tests/test_typo_miner.cpp @@ -7,7 +7,7 @@ #include "algorithms/algo_factory.h" #include "algorithms/pipelines/typo_miner/typo_miner.h" #include "config/names.h" -#include "datasets.h" +#include "table_config.h" namespace tests { namespace onam = config::names; diff --git a/src/tests/test_ucc_algorithms.cpp b/src/tests/test_ucc_algorithms.cpp index 63ee2c24cb..974169c638 100644 --- a/src/tests/test_ucc_algorithms.cpp +++ b/src/tests/test_ucc_algorithms.cpp @@ -10,8 +10,9 @@ #include "algorithms/ucc/hyucc/hyucc.h" #include "algorithms/ucc/ucc.h" #include "algorithms/ucc/ucc_algorithm.h" +#include "all_tables_config.h" #include "config/thread_number/type.h" -#include "datasets.h" +#include "table_config.h" std::ostream& operator<<(std::ostream& os, Vertical const& v) { os << v.ToString(); @@ -22,6 +23,29 @@ namespace tests { namespace { +// Implement custom hash functions since implementation of `std::hash` or `boost::hash` may change +// depending on the library version/architecture/os/whatever leading to tests failing. +// Taken from +// https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector/72073933#72073933 +std::size_t Hash(std::vector const& vec) { + std::size_t seed = vec.size(); + for (auto x : vec) { + x = ((x >> 16) ^ x) * 0x45d9f3b; + x = ((x >> 16) ^ x) * 0x45d9f3b; + x = (x >> 16) ^ x; + seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; +} + +std::size_t Hash(std::vector> const& vec) { + size_t hash = 1; + for (auto const& v : vec) { + hash = 31 * hash + Hash(v); + } + return hash; +} + // TODO(polyntsov): think how we should organize test code, maybe implement some classes with // basic functionality common for all primitive mining algorithms. Without it every test class // for specific algorithm is similar to any other (compare this class to AlgorithmTest and @@ -35,60 +59,77 @@ class UCCAlgorithmTest : public ::testing::Test { assert(threads > 0); threads_ = threads; } + void PerformConsistentHashTestOn(std::vector const& datasets) { + for (auto const& [config, hash] : datasets) { + try { + auto ucc_algo = CreateAlgorithmInstance(config); + ucc_algo->Execute(); + + std::list const& actual_list = ucc_algo->UCCList(); + std::vector> actual; + actual.reserve(actual_list.size()); + std::transform(actual_list.begin(), actual_list.end(), std::back_inserter(actual), + [](Vertical const& v) { return v.GetColumnIndicesAsVector(); }); + std::sort(actual.begin(), actual.end()); + EXPECT_EQ(Hash(actual), hash) << "Wrong hash on dataset " << config.name; + } catch (std::exception const& e) { + std::cout << "An exception with message: " << e.what() + << "\n\tis thrown on dataset " << config.name << '\n'; + FAIL(); + } + } + } public: - static algos::StdParamsMap GetParamMap(std::filesystem::path const& path, char separator = ',', - bool has_header = true) { + static algos::StdParamsMap GetParamMap(tests::TableConfig const& config) { using namespace config::names; - return {{kCsvPath, path}, - {kSeparator, separator}, - {kHasHeader, has_header}, + return {{kCsvPath, config.GetPath()}, + {kSeparator, config.separator}, + {kHasHeader, config.has_header}, {kThreads, threads_}}; } - static std::unique_ptr CreateAlgorithmInstance(std::string const& filename, - char separator = ',', - bool has_header = true) { - return algos::CreateAndLoadAlgorithm( - GetParamMap(test_data_dir / filename, separator, has_header)); + static std::unique_ptr CreateAlgorithmInstance( + tests::TableConfig const& info) { + return algos::CreateAndLoadAlgorithm(GetParamMap(info)); } - static inline const std::vector light_datasets_ = { - {"WDC_astronomical.csv", 2089541732445U, ',', true}, - {"WDC_symbols.csv", 1, ',', true}, - {"WDC_science.csv", 2658842082150U, ',', true}, - {"WDC_satellites.csv", 5208443370856032U, ',', true}, - {"WDC_appearances.csv", 82369238361U, ',', true}, - {"WDC_astrology.csv", 79554241843163108U, ',', true}, - {"WDC_game.csv", 2555214540772530U, ',', true}, - {"WDC_kepler.csv", 82426217315737U, ',', true}, - {"WDC_planetz.csv", 2555214540772530U, ',', true}, - {"WDC_age.csv", 2658842082150U, ',', true}, - {"TestWide.csv", 2555250373874U, ',', true}, - {"abalone.csv", 16581571148699134255U, ',', true}, - {"iris.csv", 1, ',', false}, - {"adult.csv", 1, ';', false}, - {"breast_cancer.csv", 16854900230774656828U, ',', true}, + inline static std::vector const light_datasets_ = { + {kWDC_astronomical, 2089541732445U}, + {kWDC_symbols, 1}, + {kWDC_science, 2658842082150U}, + {kWDC_satellites, 5208443370856032U}, + {kWDC_appearances, 82369238361U}, + {kWDC_astrology, 79554241843163108U}, + {kWDC_game, 2555214540772530U}, + {kWDC_kepler, 82426217315737U}, + {kWDC_planetz, 2555214540772530U}, + {kWDC_age, 2658842082150U}, + {kTestWide, 2555250373874U}, + {kabalone, 16581571148699134255U}, + {kiris, 1}, + {kadult, 1}, + {kbreast_cancer, 16854900230774656828U}, // Possibly heavy datasets, if another less efficient algorithm than HyUCC is not // able to process these move them to heavy_datasets_ - {"neighbors10k.csv", 170971924188219U, ',', true}, + {kneighbors10k, 170971924188219U}, #if 0 - {"neighbors50k.csv", 1, ',', true}, + {kneighbors50k, 1}, #endif - {"neighbors100k.csv", 170971924188219U, ',', true}, - {"CIPublicHighway10k.csv", 82369238361U, ',', true}, - {"CIPublicHighway700.csv", 82369238361U, ',', true}, + {kneighbors100k, 170971924188219U}, + {kCIPublicHighway10k, 82369238361U}, + {kCIPublicHighway700, 82369238361U}, }; - static inline const std::vector heavy_datasets_ = { - {"EpicVitals.csv", 1, '|', true}, - {"EpicMeds.csv", 59037771758954037U, '|', true}, - {"iowa1kk.csv", 2654435863U, ',', true}, + inline static std::vector const heavy_datasets_ = { + {kEpicVitals, 1}, + {kEpicMeds, 59037771758954037U}, + {kiowa1kk, 2654435863U}, #if 0 - {"fd-reduced-30.csv", 275990379954778425U, ',', true}, - {"flight_1k.csv", 2512091017708538662U, ';', true}, - {"plista_1k.csv", 1, ';', false}, - {"letter.csv", 1, ',', false}, + {kfd_reduced_30, 275990379954778425U}, + {kflight_1k, 2512091017708538662U}, + {kplista_1k, 1}, + {kletter, 1}, #endif }; }; @@ -96,74 +137,28 @@ class UCCAlgorithmTest : public ::testing::Test { template config::ThreadNumType UCCAlgorithmTest::threads_ = 1; -// Implement custom hash functions since implementation of `std::hash` or `boost::hash` may change -// depending on the library version/architecture/os/whatever leading to tests failing. -// Taken from -// https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector/72073933#72073933 -std::size_t Hash(std::vector const& vec) { - std::size_t seed = vec.size(); - for (auto x : vec) { - x = ((x >> 16) ^ x) * 0x45d9f3b; - x = ((x >> 16) ^ x) * 0x45d9f3b; - x = (x >> 16) ^ x; - seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; -} - -std::size_t Hash(std::vector> const& vec) { - size_t hash = 1; - for (auto const& v : vec) { - hash = 31 * hash + Hash(v); - } - return hash; -} - -template -void PerformConsistentHashTestOn(std::vector const& datasets) { - for (Dataset const& dataset : datasets) { - try { - auto ucc_algo = - T::CreateAlgorithmInstance(dataset.name, dataset.separator, dataset.has_header); - ucc_algo->Execute(); - - std::list const& actual_list = ucc_algo->UCCList(); - std::vector> actual; - actual.reserve(actual_list.size()); - std::transform(actual_list.begin(), actual_list.end(), std::back_inserter(actual), - [](Vertical const& v) { return v.GetColumnIndicesAsVector(); }); - std::sort(actual.begin(), actual.end()); - EXPECT_EQ(Hash(actual), dataset.hash) << "Wrong hash on dataset " << dataset.name; - } catch (std::exception const& e) { - std::cout << "An exception with message: " << e.what() << "\n\tis thrown on dataset " - << dataset.name << '\n'; - FAIL(); - } - } -} - } // namespace TYPED_TEST_SUITE_P(UCCAlgorithmTest); TYPED_TEST_P(UCCAlgorithmTest, ConsistentHashOnLightDatasets) { TestFixture::SetThreadsParam(1); - PerformConsistentHashTestOn(TestFixture::light_datasets_); + TestFixture::PerformConsistentHashTestOn(TestFixture::light_datasets_); } TYPED_TEST_P(UCCAlgorithmTest, ConsistentHashOnHeavyDatasets) { TestFixture::SetThreadsParam(1); - PerformConsistentHashTestOn(TestFixture::heavy_datasets_); + TestFixture::PerformConsistentHashTestOn(TestFixture::heavy_datasets_); } TYPED_TEST_P(UCCAlgorithmTest, ConsistentHashOnLightDatasetsParallel) { TestFixture::SetThreadsParam(4); - PerformConsistentHashTestOn(TestFixture::light_datasets_); + TestFixture::PerformConsistentHashTestOn(TestFixture::light_datasets_); } TYPED_TEST_P(UCCAlgorithmTest, ConsistentHashOnHeavyDatasetsParallel) { TestFixture::SetThreadsParam(4); - PerformConsistentHashTestOn(TestFixture::heavy_datasets_); + TestFixture::PerformConsistentHashTestOn(TestFixture::heavy_datasets_); } REGISTER_TYPED_TEST_SUITE_P(UCCAlgorithmTest, ConsistentHashOnLightDatasets, diff --git a/src/tests/test_util.cpp b/src/tests/test_util.cpp index 1b56ef09d4..282adeb495 100644 --- a/src/tests/test_util.cpp +++ b/src/tests/test_util.cpp @@ -4,12 +4,12 @@ #include #include -#include "datasets.h" #include "fd/pyro/model/list_agree_set_sample.h" #include "levenshtein_distance.h" #include "model/table/agree_set_factory.h" #include "model/table/column_layout_relation_data.h" #include "model/table/identifier_set.h" +#include "table_config.h" namespace tests { diff --git a/src/tests/testing_utils.h b/src/tests/testing_utils.h deleted file mode 100644 index af7c3d9a88..0000000000 --- a/src/tests/testing_utils.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include - -#include - -#include "algorithms/algo_factory.h" -#include "algorithms/fd/fd_algorithm.h" -#include "config/error/type.h" -#include "config/names.h" -#include "datasets.h" - -template -class AlgorithmTest : public ::testing::Test { - static config::InputTable MakeCsvParser(std::string const& path, char separator, - bool has_header) { - return std::make_shared(path, separator, has_header); - } - -protected: - static std::unique_ptr CreateAndConfToLoad(std::string const& path, - char separator = ',', - bool has_header = true) { - using config::InputTable, algos::ConfigureFromMap, algos::StdParamsMap; - std::unique_ptr algorithm = std::make_unique(); - auto parser = MakeCsvParser(path, separator, has_header); - ConfigureFromMap(*algorithm, StdParamsMap{{config::names::kTable, parser}}); - return algorithm; - } - - static algos::StdParamsMap GetParamMap(const std::filesystem::path& path, char separator = ',', - bool has_header = true) { - using namespace config::names; - return { - {kTable, MakeCsvParser(path, separator, has_header)}, - {kError, config::ErrorType{0.0}}, - {kSeed, decltype(pyro::Parameters::seed){0}}, - }; - } - - static std::unique_ptr CreateAlgorithmInstance(const std::string& filename, - char separator = ',', - bool has_header = true) { - return algos::CreateAndLoadAlgorithm( - GetParamMap(test_data_dir / filename, separator, has_header)); - } -};