From 7e6164e65d089d6f160f3a71db4fe103788e5190 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Fri, 4 Oct 2024 17:42:04 +0000 Subject: [PATCH] Have DataFrame maintain the column order when reading and writing --- cxx/pclean/csv.cc | 33 ++++++++++++++++----------------- cxx/pclean/csv.hh | 4 ++-- cxx/pclean/csv_test.cc | 12 ++---------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/cxx/pclean/csv.cc b/cxx/pclean/csv.cc index 14dcc24..e9f527f 100644 --- a/cxx/pclean/csv.cc +++ b/cxx/pclean/csv.cc @@ -23,10 +23,9 @@ DataFrame DataFrame::from_csv( DataFrame DataFrame::from_csv( std::istream& is, const std::vector& column_names) { DataFrame df; - std::vector col_names; if (!column_names.empty()) { - col_names = column_names; + df.columns = column_names; for (const auto& c : column_names) { df.data[c] = {}; } @@ -50,21 +49,21 @@ DataFrame DataFrame::from_csv( part.pop_back(); } if (first_line && column_names.empty()) { - col_names.push_back(part); + df.columns.push_back(part); df.data[part] = {}; continue; } - df.data[col_names[i++]].push_back(part); + df.data[df.columns[i++]].push_back(part); } if (!first_line) { if (line.back() == ',') { // std::getline is broken and won't let the last field be empty. - df.data[col_names[i++]].push_back(""); + df.data[df.columns[i++]].push_back(""); } - if (i != col_names.size()) { + if (i != df.columns.size()) { printf("Only found %ld out of %ld expected columns in line\n%s\n", - i, col_names.size(), line.c_str()); + i, df.columns.size(), line.c_str()); assert(false); } } @@ -86,24 +85,24 @@ bool DataFrame::to_csv(const std::string& filename) { bool DataFrame::to_csv(std::ostream& os) { // TODO(thomaswc): Quote column names or data items that contain commas. - auto it = data.begin(); - if (it == data.end()) { + if (columns.size() == 0) { return true; } - os << it->first; - size_t num_rows = it->second.size(); - ++it; - for (; it != data.end(); ++it) { - os << "," << it->first; + for (size_t j = 0; j < columns.size(); ++j) { + if (j != 0) { + os << ","; + } + os << columns[j]; } os << "\n"; + size_t num_rows = data[columns[0]].size(); for (size_t i = 0; i < num_rows; ++i) { - for (auto it = data.begin(); it != data.end(); ++it) { - if (it != data.begin()) { + for (size_t j = 0; j < columns.size(); ++j) { + if (j != 0) { os << ","; } - os << it->second[i]; + os << data[columns[j]][i]; } os << "\n"; } diff --git a/cxx/pclean/csv.hh b/cxx/pclean/csv.hh index c6d33dd..0d5fe80 100644 --- a/cxx/pclean/csv.hh +++ b/cxx/pclean/csv.hh @@ -27,8 +27,8 @@ class DataFrame { bool to_csv(const std::string& filename); bool to_csv(std::ostream& os); + std::vector columns; + // data['column_name'] holds the data for that column. - // TODO(thomaswc): Also hold the column names in a vector so we can preserve - // their order when reading & writing. std::map> data; }; diff --git a/cxx/pclean/csv_test.cc b/cxx/pclean/csv_test.cc index fe5f28e..aea2922 100644 --- a/cxx/pclean/csv_test.cc +++ b/cxx/pclean/csv_test.cc @@ -48,8 +48,7 @@ BOOST_AUTO_TEST_CASE(test_carriage_ret_at_end_of_line) { } BOOST_AUTO_TEST_CASE(test_round_trip) { - std::string s = R"""( -Name,Specialty,Degree,School,Address,City,State,Zip + std::string s = R"""(Name,Specialty,Degree,School,Address,City,State,Zip K. Ryan,Family Medicine,DO,PCOM,6317 York Rd,Baltimore,MD,21212 K. Ryan,Family Medicine,DO,PCOM,100 Walter Ward Blvd,Abingdon,MD,21009 S. Evans,Internal Medicine,MD,UMD,100 Walter Ward Blvd,Abingdon,MD,21009 @@ -59,12 +58,5 @@ M. Grady,Physical Therapy,PT,Other,3491 Merchants Blvd,Abingdon,MD,21009 DataFrame df = DataFrame::from_csv(ss); std::stringstream oss; BOOST_TEST(df.to_csv(oss)); - // Order isn't preserved because map sorts keys alphabetically. - std::string s2 = R"""(Address,City,Degree,Name,School,Specialty,State,Zip -6317 York Rd,Baltimore,DO,K. Ryan,PCOM,Family Medicine,MD,21212 -100 Walter Ward Blvd,Abingdon,DO,K. Ryan,PCOM,Family Medicine,MD,21009 -100 Walter Ward Blvd,Abingdon,MD,S. Evans,UMD,Internal Medicine,MD,21009 -3491 Merchants Blvd,Abingdon,PT,M. Grady,Other,Physical Therapy,MD,21009 -)"""; - BOOST_TEST(s2 == oss.str(), tt::per_element()); + BOOST_TEST(s == oss.str(), tt::per_element()); }