From 7ca4b053712d943c345799e4a1d1fdd151900db9 Mon Sep 17 00:00:00 2001 From: Michael Sinelnikov Date: Fri, 17 Nov 2023 21:55:58 +0300 Subject: [PATCH] Add separator validation --- src/core/algorithms/algo_factory.cpp | 5 +- src/core/parser/csv_parser/csv_parser.cpp | 90 +++++++++++++++++++++++ src/core/parser/csv_parser/csv_parser.h | 5 ++ src/core/util/separator_validator.cpp | 12 +++ src/core/util/separator_validator.h | 12 +++ src/python_bindings/py_algorithm.cpp | 5 +- 6 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 src/core/util/separator_validator.cpp create mode 100644 src/core/util/separator_validator.h diff --git a/src/core/algorithms/algo_factory.cpp b/src/core/algorithms/algo_factory.cpp index ac4439a087..9b990181f3 100644 --- a/src/core/algorithms/algo_factory.cpp +++ b/src/core/algorithms/algo_factory.cpp @@ -9,6 +9,7 @@ #include "algorithms/pipelines/typo_miner/typo_miner.h" #include "config/names.h" #include "tabular_data/input_tables_type.h" +#include "util/separator_validator.h" namespace algos { @@ -47,10 +48,12 @@ void LoadAlgorithm(Algorithm& algorithm, StdParamsMap const& options) { using namespace config::names; namespace fs = std::filesystem; if (option_name == kTable && options.find(std::string{kTable}) == options.end()) { - config::InputTable parser = + auto csv_parser = std::make_shared(GetOptionValue(options, kCsvPath), GetOptionValue(options, kSeparator), GetOptionValue(options, kHasHeader)); + csv_parser->ValidateSeparator(); + config::InputTable parser = csv_parser; return boost::any{parser}; } else if (option_name == kTables && options.find(std::string{kTables}) == options.end()) { auto paths = GetOptionValue>(options, kCsvPaths); diff --git a/src/core/parser/csv_parser/csv_parser.cpp b/src/core/parser/csv_parser/csv_parser.cpp index 6ee949e0bf..360a53803b 100644 --- a/src/core/parser/csv_parser/csv_parser.cpp +++ b/src/core/parser/csv_parser/csv_parser.cpp @@ -1,14 +1,17 @@ #include "csv_parser.h" +#include #include #include #include #include +#include #include #include #include #include +#include inline std::string& CSVParser::rtrim(std::string& s) { boost::trim_right(s); @@ -137,3 +140,90 @@ std::vector CSVParser::GetNextRow() { return result; } + +std::optional CSVParser::DeduceSeparator() { + /* Calculate statistics including the header row */ + bool has_header_copy = has_header_; + has_header_ = false; + Reset(); + has_header_ = has_header_copy; + + std::unordered_map letter_count; + if (has_next_) { + for (char c : next_line_) { + letter_count[c]++; + } + } + + std::unordered_map next_letter_count; + while (has_next_) { + GetNextIfHas(); + next_letter_count.clear(); + for (char c : next_line_) { + next_letter_count[c]++; + } + for (auto letter : letter_count) { + if (letter.second != next_letter_count[letter.first]) { + letter_count[letter.first] = 0; + } + } + } + + for (auto letter : letter_count) { + if (letter.second != 0) { + Reset(); + return letter.first; + } + } + Reset(); + + return std::nullopt; +} + +bool CSVParser::CheckSeparator() { + /* Calculate statistics including the header row */ + bool has_header_copy = has_header_; + has_header_ = false; + Reset(); + has_header_ = has_header_copy; + + unsigned sep_count = 0; + if (has_next_) { + sep_count = std::count(next_line_.begin(), next_line_.end(), separator_); + } + + while (has_next_) { + GetNextIfHas(); + if (sep_count != std::count(next_line_.begin(), next_line_.end(), separator_)) { + Reset(); + return false; + } + } + Reset(); + + return true; +} + +std::optional CSVParser::ValidateSeparator() { + std::optional possible_separator = DeduceSeparator(); + + if (CheckSeparator()) { + if (possible_separator == std::nullopt || separator_ == possible_separator || + GetNumberOfColumns() != 1) { + return separator_; + } + LOG(WARNING) << "Inserted separator seems to be wrong"; + LOG(WARNING) << "Possible separator for the table is: \'" << possible_separator.value() + << "\'"; + return possible_separator; + } + + LOG(WARNING) << "Inserted separator seems to be wrong"; + if (possible_separator != std::nullopt) { + LOG(WARNING) << "Possible separator for the table is: \'" << possible_separator.value() + << "\'"; + return possible_separator; + } + + return std::nullopt; +} \ No newline at end of file diff --git a/src/core/parser/csv_parser/csv_parser.h b/src/core/parser/csv_parser/csv_parser.h index 2339629e3b..96ffaab5f8 100644 --- a/src/core/parser/csv_parser/csv_parser.h +++ b/src/core/parser/csv_parser/csv_parser.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -30,6 +31,8 @@ class CSVParser : public model::IDatasetStream { std::vector ParseString(std::string const& s) const; void GetNextIfHas(); void SkipLine(); + std::optional DeduceSeparator(); + bool CheckSeparator(); inline static std::string& rtrim(std::string& s); @@ -42,6 +45,8 @@ class CSVParser : public model::IDatasetStream { std::string GetUnparsedLine(unsigned long long const line_index); std::vector ParseLine(unsigned long long const line_index); + std::optional ValidateSeparator(); + bool HasNextRow() const override { return has_next_; } diff --git a/src/core/util/separator_validator.cpp b/src/core/util/separator_validator.cpp new file mode 100644 index 0000000000..d7e628d204 --- /dev/null +++ b/src/core/util/separator_validator.cpp @@ -0,0 +1,12 @@ +#include "separator_validator.h" + +#include + +namespace util { + +std::optional ValidateSeparator(std::filesystem::path const& path, char separator) { + auto parser = std::make_unique(path, separator, false); + return parser->ValidateSeparator(); +} + +} // namespace util \ No newline at end of file diff --git a/src/core/util/separator_validator.h b/src/core/util/separator_validator.h new file mode 100644 index 0000000000..2196e750ca --- /dev/null +++ b/src/core/util/separator_validator.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include + +#include "parser/csv_parser/csv_parser.h" + +namespace util { + +std::optional ValidateSeparator(std::filesystem::path const& path, char separator); + +} // namespace util \ No newline at end of file diff --git a/src/python_bindings/py_algorithm.cpp b/src/python_bindings/py_algorithm.cpp index 575eeff39a..2f9099f504 100644 --- a/src/python_bindings/py_algorithm.cpp +++ b/src/python_bindings/py_algorithm.cpp @@ -14,6 +14,7 @@ #include "get_py_type.h" #include "parser/csv_parser/csv_parser.h" #include "py_to_any.h" +#include "util/separator_validator.h" namespace python_bindings { @@ -80,7 +81,9 @@ void PyAlgorithmBase::LoadProvidedData(pybind11::kwargs const& kwargs, InputTabl void PyAlgorithmBase::LoadData(std::string_view path, char separator, bool has_header, py::kwargs const& kwargs) { - LoadProvidedData(kwargs, std::make_shared(path, separator, has_header)); + auto parser = std::make_shared(path, separator, has_header); + parser->ValidateSeparator(); + LoadProvidedData(kwargs, parser); } void PyAlgorithmBase::LoadData(py::handle dataframe, std::string name, py::kwargs const& kwargs) {