Skip to content

Commit

Permalink
Merge pull request #230 from probcomp/041024-thomaswc-csv_columns
Browse files (browse the repository at this point in the history)
Have DataFrame maintain the column order when reading and writing
  • Loading branch information
ThomasColthurst authored Oct 4, 2024
2 parents 42d8dc4 + 7e6164e commit b7a4b6b
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 29 deletions.
33 changes: 16 additions & 17 deletions cxx/pclean/csv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ DataFrame DataFrame::from_csv(
DataFrame DataFrame::from_csv(
std::istream& is, const std::vector<std::string>& column_names) {
DataFrame df;
std::vector<std::string> col_names;

if (!column_names.empty()) {
col_names = column_names;
df.columns = column_names;
for (const auto& c : column_names) {
df.data[c] = {};
}
Expand All @@ -50,21 +49,21 @@ DataFrame DataFrame::from_csv(
part.pop_back();
}
if (first_line && column_names.empty()) {
col_names.push_back(part);
df.columns.push_back(part);
df.data[part] = {};
continue;
}

df.data[col_names[i++]].push_back(part);
df.data[df.columns[i++]].push_back(part);
}
if (!first_line) {
if (line.back() == ',') {
// std::getline with a ',' delimiter yields no final field when the line ends
// in a comma, so add the trailing empty field explicitly.
df.data[col_names[i++]].push_back("");
df.data[df.columns[i++]].push_back("");
}
if (i != col_names.size()) {
if (i != df.columns.size()) {
printf("Only found %ld out of %ld expected columns in line\n%s\n",
i, col_names.size(), line.c_str());
i, df.columns.size(), line.c_str());
assert(false);
}
}
Expand All @@ -86,24 +85,24 @@ bool DataFrame::to_csv(const std::string& filename) {

bool DataFrame::to_csv(std::ostream& os) {
// TODO(thomaswc): Quote column names or data items that contain commas.
auto it = data.begin();
if (it == data.end()) {
if (columns.size() == 0) {
return true;
}
os << it->first;
size_t num_rows = it->second.size();
++it;
for (; it != data.end(); ++it) {
os << "," << it->first;
for (size_t j = 0; j < columns.size(); ++j) {
if (j != 0) {
os << ",";
}
os << columns[j];
}
os << "\n";

size_t num_rows = data[columns[0]].size();
for (size_t i = 0; i < num_rows; ++i) {
for (auto it = data.begin(); it != data.end(); ++it) {
if (it != data.begin()) {
for (size_t j = 0; j < columns.size(); ++j) {
if (j != 0) {
os << ",";
}
os << it->second[i];
os << data[columns[j]][i];
}
os << "\n";
}
Expand Down
4 changes: 2 additions & 2 deletions cxx/pclean/csv.hh
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ class DataFrame {
bool to_csv(const std::string& filename);
bool to_csv(std::ostream& os);

std::vector<std::string> columns;

// data['column_name'] holds the data for that column.
// TODO(thomaswc): Also hold the column names in a vector so we can preserve
// their order when reading & writing.
std::map<std::string, std::vector<std::string>> data;
};
12 changes: 2 additions & 10 deletions cxx/pclean/csv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ BOOST_AUTO_TEST_CASE(test_carriage_ret_at_end_of_line) {
}

BOOST_AUTO_TEST_CASE(test_round_trip) {
std::string s = R"""(
Name,Specialty,Degree,School,Address,City,State,Zip
std::string s = R"""(Name,Specialty,Degree,School,Address,City,State,Zip
K. Ryan,Family Medicine,DO,PCOM,6317 York Rd,Baltimore,MD,21212
K. Ryan,Family Medicine,DO,PCOM,100 Walter Ward Blvd,Abingdon,MD,21009
S. Evans,Internal Medicine,MD,UMD,100 Walter Ward Blvd,Abingdon,MD,21009
Expand All @@ -59,12 +58,5 @@ M. Grady,Physical Therapy,PT,Other,3491 Merchants Blvd,Abingdon,MD,21009
DataFrame df = DataFrame::from_csv(ss);
std::stringstream oss;
BOOST_TEST(df.to_csv(oss));
// Order isn't preserved because map sorts keys alphabetically.
std::string s2 = R"""(Address,City,Degree,Name,School,Specialty,State,Zip
6317 York Rd,Baltimore,DO,K. Ryan,PCOM,Family Medicine,MD,21212
100 Walter Ward Blvd,Abingdon,DO,K. Ryan,PCOM,Family Medicine,MD,21009
100 Walter Ward Blvd,Abingdon,MD,S. Evans,UMD,Internal Medicine,MD,21009
3491 Merchants Blvd,Abingdon,PT,M. Grady,Other,Physical Therapy,MD,21009
)""";
BOOST_TEST(s2 == oss.str(), tt::per_element());
BOOST_TEST(s == oss.str(), tt::per_element());
}

0 comments on commit b7a4b6b

Please sign in to comment.