Skip to content

Commit

Permalink
Add separator validation
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelS239 committed Dec 22, 2023
1 parent f187179 commit 7ca4b05
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 2 deletions.
5 changes: 4 additions & 1 deletion src/core/algorithms/algo_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "algorithms/pipelines/typo_miner/typo_miner.h"
#include "config/names.h"
#include "tabular_data/input_tables_type.h"
#include "util/separator_validator.h"

namespace algos {

Expand Down Expand Up @@ -47,10 +48,12 @@ void LoadAlgorithm(Algorithm& algorithm, StdParamsMap const& options) {
using namespace config::names;
namespace fs = std::filesystem;
if (option_name == kTable && options.find(std::string{kTable}) == options.end()) {
config::InputTable parser =
auto csv_parser =
std::make_shared<CSVParser>(GetOptionValue<fs::path>(options, kCsvPath),
GetOptionValue<char>(options, kSeparator),
GetOptionValue<bool>(options, kHasHeader));
csv_parser->ValidateSeparator();
config::InputTable parser = csv_parser;
return boost::any{parser};
} else if (option_name == kTables && options.find(std::string{kTables}) == options.end()) {
auto paths = GetOptionValue<std::vector<fs::path>>(options, kCsvPaths);
Expand Down
90 changes: 90 additions & 0 deletions src/core/parser/csv_parser/csv_parser.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
#include "csv_parser.h"

#include <algorithm>
#include <cassert>
#include <filesystem>
#include <fstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>
#include <easylogging++.h>

inline std::string& CSVParser::rtrim(std::string& s) {
boost::trim_right(s);
Expand Down Expand Up @@ -137,3 +140,90 @@ std::vector<std::string> CSVParser::GetNextRow() {

return result;
}

std::optional<char> CSVParser::DeduceSeparator() {
/* Calculate statistics including the header row */
bool has_header_copy = has_header_;
has_header_ = false;
Reset();
has_header_ = has_header_copy;

std::unordered_map<char, unsigned> letter_count;
if (has_next_) {
for (char c : next_line_) {
letter_count[c]++;
}
}

std::unordered_map<char, unsigned> next_letter_count;
while (has_next_) {
GetNextIfHas();
next_letter_count.clear();
for (char c : next_line_) {
next_letter_count[c]++;
}
for (auto letter : letter_count) {
if (letter.second != next_letter_count[letter.first]) {
letter_count[letter.first] = 0;
}
}
}

for (auto letter : letter_count) {
if (letter.second != 0) {
Reset();
return letter.first;
}
}
Reset();

return std::nullopt;
}

bool CSVParser::CheckSeparator() {
/* Calculate statistics including the header row */
bool has_header_copy = has_header_;
has_header_ = false;
Reset();
has_header_ = has_header_copy;

unsigned sep_count = 0;
if (has_next_) {
sep_count = std::count(next_line_.begin(), next_line_.end(), separator_);
}

while (has_next_) {
GetNextIfHas();
if (sep_count != std::count(next_line_.begin(), next_line_.end(), separator_)) {
Reset();
return false;
}
}
Reset();

return true;
}

/// Validates the configured separator against the contents of the file.
///
/// The separator is accepted when CheckSeparator() succeeds, unless a different
/// separator was deduced and the table has exactly one column. When it is not
/// accepted, a warning is logged.
///
/// @return the configured separator if it is accepted; otherwise the deduced
///         separator if there is one, or std::nullopt if none could be deduced.
std::optional<char> CSVParser::ValidateSeparator() {
    // Deduction runs before the consistency check, as both rewind the parser.
    std::optional<char> const deduced = DeduceSeparator();
    bool const consistent = CheckSeparator();

    if (consistent) {
        // Suspicious case: the separator is consistent, but a different character was
        // deduced and the table has a single column.
        bool const suspicious =
                deduced.has_value() && !(separator_ == deduced) && GetNumberOfColumns() == 1;
        if (!suspicious) {
            return separator_;
        }
    }

    LOG(WARNING) << "Inserted separator seems to be wrong";
    if (!deduced.has_value()) {
        return std::nullopt;
    }

    LOG(WARNING) << "Possible separator for the table is: \'" << deduced.value() << "\'";
    return deduced;
}
5 changes: 5 additions & 0 deletions src/core/parser/csv_parser/csv_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <filesystem>
#include <fstream>
#include <optional>
#include <string>
#include <vector>

Expand All @@ -30,6 +31,8 @@ class CSVParser : public model::IDatasetStream {
std::vector<std::string> ParseString(std::string const& s) const;
void GetNextIfHas();
void SkipLine();
std::optional<char> DeduceSeparator();
bool CheckSeparator();

inline static std::string& rtrim(std::string& s);

Expand All @@ -42,6 +45,8 @@ class CSVParser : public model::IDatasetStream {
std::string GetUnparsedLine(unsigned long long const line_index);
std::vector<std::string> ParseLine(unsigned long long const line_index);

std::optional<char> ValidateSeparator();

bool HasNextRow() const override {
return has_next_;
}
Expand Down
12 changes: 12 additions & 0 deletions src/core/util/separator_validator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include "separator_validator.h"

#include <easylogging++.h>

namespace util {

std::optional<char> ValidateSeparator(std::filesystem::path const& path, char separator) {
auto parser = std::make_unique<CSVParser>(path, separator, false);
return parser->ValidateSeparator();
}

} // namespace util
12 changes: 12 additions & 0 deletions src/core/util/separator_validator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <filesystem>
#include <optional>

#include "parser/csv_parser/csv_parser.h"

namespace util {

/// Checks whether `separator` is a plausible column separator for the CSV file at `path`.
///
/// The file is opened with a CSVParser configured without a header row and the parser's
/// own ValidateSeparator() is run on it; warnings are logged when the separator looks wrong.
///
/// @param path      path to the CSV file to inspect
/// @param separator separator character supplied by the caller
/// @return `separator` if it is accepted; otherwise a separator deduced from the file
///         contents, or std::nullopt if none could be deduced
std::optional<char> ValidateSeparator(std::filesystem::path const& path, char separator);

} // namespace util
5 changes: 4 additions & 1 deletion src/python_bindings/py_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "get_py_type.h"
#include "parser/csv_parser/csv_parser.h"
#include "py_to_any.h"
#include "util/separator_validator.h"

namespace python_bindings {

Expand Down Expand Up @@ -80,7 +81,9 @@ void PyAlgorithmBase::LoadProvidedData(pybind11::kwargs const& kwargs, InputTabl

void PyAlgorithmBase::LoadData(std::string_view path, char separator, bool has_header,
py::kwargs const& kwargs) {
LoadProvidedData(kwargs, std::make_shared<CSVParser>(path, separator, has_header));
auto parser = std::make_shared<CSVParser>(path, separator, has_header);
parser->ValidateSeparator();
LoadProvidedData(kwargs, parser);
}

void PyAlgorithmBase::LoadData(py::handle dataframe, std::string name, py::kwargs const& kwargs) {
Expand Down

0 comments on commit 7ca4b05

Please sign in to comment.