Skip to content

Commit

Permalink
refactor RListExporter
Browse files Browse the repository at this point in the history
  • Loading branch information
pachadotdev committed Nov 13, 2024
1 parent cef6b69 commit 4f7d3cd
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 19 deletions.
90 changes: 88 additions & 2 deletions dev/04-test-equivalence.r
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
library(dplyr)
library(data.table)
library(ipumsr
library(janitor))
library(ipumsr)
library(janitor)

# IPUMS data ----

Expand Down Expand Up @@ -100,6 +100,84 @@ generate_tables <- function(d_control, file_name, country_code, census_year, var
return(list(d_control_sex = d_control_sex, d_control_age = d_control_age))
}

generate_tables_r <- function(d_control, country_code, census_year, var_sex, var_age) {
# just Chile, it is identical to the other validation function
file_name <- "downloads/CP2017CHL/BaseOrg16/CPV2017-16.dic"
# Process d_control for sex
d_control_sex <- d_control %>%
filter(country == country_code, year == census_year) %>%
group_by(sex) %>%
summarise(n_ipums = sum(perwt)) %>%
ungroup() %>%
mutate(pct_ipums = n_ipums / sum(n_ipums) * 100)

# Process d_control for age
d_control_age <- d_control %>%
filter(country == country_code, year == census_year) %>%
group_by(age2) %>%
summarise(n_ipums = sum(perwt)) %>%
ungroup() %>%
mutate(pct_ipums = n_ipums / sum(n_ipums) * 100)

# Read and clean the external data
d_external <- redatam::read_redatam(file_name)

names(d_external)

d_sex <- d_external$persona %>%
group_by(sex = !!sym(var_sex)) %>%
count(name = "n_rdtm") %>%
ungroup() %>%
mutate(pct_rdtm = n_rdtm / sum(n_rdtm) * 100)

# Bind columns and calculate differences for sex
d_control_sex <- d_control_sex %>%
left_join(d_sex) %>%
mutate(
n_diff = n_ipums - n_rdtm,
pct_diff = pct_ipums - pct_rdtm
)

# Bind columns and calculate differences for age
d_control_age <- d_control_age %>%
left_join(
d_external$persona %>%
mutate(
age2 = case_when(
!!sym(var_age) >= 0 & !!sym(var_age) <= 4 ~ 1,
!!sym(var_age) >= 5 & !!sym(var_age) <= 9 ~ 2,
!!sym(var_age) >= 10 & !!sym(var_age) <= 14 ~ 3,
!!sym(var_age) >= 15 & !!sym(var_age) <= 19 ~ 4,
!!sym(var_age) >= 20 & !!sym(var_age) <= 24 ~ 12,
!!sym(var_age) >= 25 & !!sym(var_age) <= 29 ~ 13,
!!sym(var_age) >= 30 & !!sym(var_age) <= 34 ~ 14,
!!sym(var_age) >= 35 & !!sym(var_age) <= 39 ~ 15,
!!sym(var_age) >= 40 & !!sym(var_age) <= 44 ~ 16,
!!sym(var_age) >= 45 & !!sym(var_age) <= 49 ~ 17,
!!sym(var_age) >= 50 & !!sym(var_age) <= 54 ~ 18,
!!sym(var_age) >= 55 & !!sym(var_age) <= 59 ~ 19,
!!sym(var_age) >= 60 & !!sym(var_age) <= 64 ~ 20,
!!sym(var_age) >= 65 & !!sym(var_age) <= 69 ~ 21,
!!sym(var_age) >= 70 & !!sym(var_age) <= 74 ~ 22,
!!sym(var_age) >= 75 & !!sym(var_age) <= 79 ~ 23,
!!sym(var_age) >= 80 & !!sym(var_age) <= 84 ~ 24,
!!sym(var_age) >= 85 ~ 25
)
) %>%
group_by(age2) %>%
count(name = "n_rdtm") %>%
ungroup() %>%
mutate(pct_rdtm = n_rdtm / sum(n_rdtm) * 100)
) %>%
mutate(
n_diff = n_ipums - n_rdtm,
pct_diff = pct_ipums - pct_rdtm
)

# Return the tables
return(list(d_control_sex = d_control_sex, d_control_age = d_control_age))
}

# Chile 2017 ----

# https://redatam-ine.ine.cl/manuales/Manual-Usuario.pdf
Expand All @@ -115,6 +193,14 @@ result <- generate_tables(d_control,
"p09"
)

result <- generate_tables_r(
d_control,
152L,
2017L,
"p08",
"p09"
)

# Bolivia 2001 ----

result <- generate_tables(
Expand Down
2 changes: 1 addition & 1 deletion rpkg/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: redatam
Type: Package
Title: Import 'REDATAM' Files
Version: 2.0.3
Version: 2.0.4
Authors@R: c(
person(
given = "Mauricio",
Expand Down
7 changes: 7 additions & 0 deletions rpkg/NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# redatam 2.0.4

* Refactored C++ to R list casting to avoid growing a list in a loop. It now
creates a list with a lenght equal to the number of entities and variables
with descriptions and then fills it with the data. This is 3 to 5 times
faster.

# redatam 2.0.3

* Fixes memory management issues suggested by Ivan Krylov regarding the C++ to R
Expand Down
36 changes: 20 additions & 16 deletions rpkg/src/redatamlib/exporters/RListExporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,22 @@ ListExporter::ListExporter(const std::string &outputDirectory)

cpp11::list ListExporter::ExportAllR(
const std::vector<Entity> &entities) const {
cpp11::writable::list result;
cpp11::writable::strings resultNames;
size_t numEntities = entities.size();
cpp11::writable::list result(numEntities);
cpp11::writable::strings resultNames(numEntities);

for (const Entity &entity : entities) {
for (size_t entityIndex = 0; entityIndex < numEntities; ++entityIndex) {
const Entity &entity = entities[entityIndex];
std::string entityName = entity.GetName();
std::transform(entityName.begin(), entityName.end(), entityName.begin(),
::tolower);

std::string exportingEntityMsg = "Exporting " + entityName + "...";
cpp11::message(exportingEntityMsg.c_str());

cpp11::writable::list entityList;
cpp11::writable::strings variableNames;
size_t numVariables = entity.GetVariables()->size();
cpp11::writable::list entityList(numVariables + 2); // +2 for REF_ID and PARENT_REF_ID
cpp11::writable::strings variableNames(numVariables + 2);

// Add REF_ID and PARENT_REF_ID columns
size_t numRows = entity.GetRowsCount();
Expand All @@ -50,16 +53,17 @@ cpp11::list ListExporter::ExportAllR(
}
}

entityList.push_back(ref_id_vec);
variableNames.push_back(ref_id_name);
entityList[0] = ref_id_vec;
variableNames[0] = ref_id_name;

if (!entity.GetParentName().empty()) {
entityList.push_back(parent_ref_id_vec);
variableNames.push_back(parent_ref_id_name);
entityList[1] = parent_ref_id_vec;
variableNames[1] = parent_ref_id_name;
}

// Add vectors for each variable
for (const Variable &v : *(entity.GetVariables().get())) {
for (size_t varIndex = 0; varIndex < numVariables; ++varIndex) {
const Variable &v = entity.GetVariables()->at(varIndex);
try {
switch (v.GetType()) {
case BIN:
Expand All @@ -72,7 +76,7 @@ cpp11::list ListExporter::ExportAllR(
for (size_t i = 0; i < numRows; i++) {
rvalues[i] = values->at(i);
}
entityList.push_back(rvalues);
entityList[varIndex + 2] = rvalues;
break;
}
case CHR: {
Expand All @@ -85,7 +89,7 @@ cpp11::list ListExporter::ExportAllR(
std::replace(clean_string.begin(), clean_string.end(), '\0', ' ');
rvalues[i] = clean_string;
}
entityList.push_back(rvalues);
entityList[varIndex + 2] = rvalues;
break;
}
case DBL: {
Expand All @@ -95,7 +99,7 @@ cpp11::list ListExporter::ExportAllR(
for (size_t i = 0; i < numRows; i++) {
rvalues[i] = values->at(i);
}
entityList.push_back(rvalues);
entityList[varIndex + 2] = rvalues;
break;
}
default:
Expand All @@ -110,7 +114,7 @@ cpp11::list ListExporter::ExportAllR(
cpp11::message(errorExportingVariableMsg.c_str());
}

variableNames.push_back(v.GetName());
variableNames[varIndex + 2] = v.GetName();

// Add variable labels to the main list
AddVariableLabels(v, result, resultNames, entity.GetName());
Expand All @@ -119,8 +123,8 @@ cpp11::list ListExporter::ExportAllR(
if (variableNames.size() > 0) {
entityList.names() = variableNames;
}
result.push_back(entityList);
resultNames.push_back(entity.GetName());
result[entityIndex] = entityList;
resultNames[entityIndex] = entity.GetName();
}
result.names() = resultNames;
return result;
Expand Down

0 comments on commit 4f7d3cd

Please sign in to comment.