Commit

Merge pull request #335 from ONSdigital/RDRP-1004_imp_breakdowns

RDRP-1004 imputation breakdowns

AnneONS authored Sep 18, 2024
2 parents 0150aba + 416a8ad commit 2053b0d
Showing 11 changed files with 609 additions and 118 deletions.
68 changes: 68 additions & 0 deletions helpers/regression_imputation.py
@@ -0,0 +1,68 @@
"""
Regression test to compare two versions of outputs
Reads two csv files, old and new
Selects the columns of interest
Joins old and new on key columns, outer
Checks which records are in old only (left), new only (right) or both
Compares if the old and new values are the same within tolerance
Saves the outputs
"""

#%% Configuration settings
import pandas as pd

# Input folder and file names
root_path = "R:/BERD Results System Development 2023/DAP_emulation/2023_surveys/BERD/06_imputation/imputation_qa/"
in_file_old = "2023_full_responses_imputed_24-09-10_v764.csv"
in_file_new = "tmp_qa_output2.csv"

# Output folder and file
out_fol = root_path
out_file = "imputation_breakdown_check.csv"

# Columns to select
key_cols = ["reference", "instance"]
value_col = "211"
other_cols = [
    "200",
    "201",
    "formtype",
    "imp_class",
    "imp_marker",
    "status",
]
tolerance = 0.001
#%% Read files
cols_read = key_cols + [value_col] + other_cols
df_old = pd.read_csv(root_path + in_file_old, usecols=cols_read)
df_new = pd.read_csv(root_path + in_file_new, usecols=cols_read)

#%% Preliminary check: inner join of old and new on the key columns
df_merge = df_old.merge(df_new, on=key_cols, how="inner", suffixes=("_old", "_new"))

#%% Save the preliminary comparison (overwritten by the outer join output below)
df_merge.to_csv(root_path + out_file, index=False)


#%% Keep only records with good imputation markers
imp_markers_to_keep = ["TMI", "CF", "MoR", "constructed"]
df_old_good = df_old[df_old["imp_marker"].isin(imp_markers_to_keep)]
df_new_good = df_new[df_new["imp_marker"].isin(imp_markers_to_keep)]

#%% sizes
print(f"Old size: {df_old_good.shape}")
print(f"New size: {df_new_good.shape}")

#%% Join old and new with an outer merge, flagging the source of each record
df_merge = df_old_good.merge(
    df_new_good, on=key_cols, how="outer", suffixes=("_old", "_new"), indicator=True
)
#%% Compare the values; rows present on one side only have NaN and flag as False
df_merge["value_different"] = (
    df_merge[value_col + "_old"] - df_merge[value_col + "_new"]
) ** 2 > tolerance**2

# %% Save output
df_merge.to_csv(out_fol + out_file, index=False)

# %%
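A quick way to sanity-check the saved comparison is to tally the merge indicator and the difference flag. This follow-on cell is illustrative only (not part of the commit) and assumes the df_merge built above with indicator=True:

#%% Summarise the comparison (illustrative follow-on cell, not in the commit)
# _merge is "left_only" (old only), "right_only" (new only) or "both"
print(df_merge["_merge"].value_counts())
print(f"{int(df_merge['value_different'].sum())} records differ beyond tolerance")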
2 changes: 1 addition & 1 deletion src/_version.py
@@ -1 +1 @@
-__version__ = "1.1.7"
+__version__ = "2.0.0"
3 changes: 2 additions & 1 deletion src/construction/all_data_construction.py
@@ -24,6 +24,7 @@ def all_data_construction(
    construction_df: pd.DataFrame,
    snapshot_df: pd.DataFrame,
    construction_logger: logging.Logger,
+    config: dict,
    is_northern_ireland: bool = False,
) -> pd.DataFrame:
"""Run all data construction on the GB or NI data.
@@ -122,7 +123,7 @@
    # Check breakdowns
    if not is_northern_ireland:
        updated_snapshot_df = run_breakdown_validation(
-            updated_snapshot_df, check="constructed"
+            updated_snapshot_df, config, check="constructed"
        )

    construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.")
5 changes: 2 additions & 3 deletions src/construction/construction_main.py
@@ -3,7 +3,6 @@
from typing import Callable

import pandas as pd
-import numpy as np

from src.construction.construction_read_validate import (
read_validate_all_construction_files,
@@ -63,7 +62,7 @@ def run_construction( # noqa: C901
            is_northern_ireland=True,
        )
        updated_snapshot_df = all_data_construction(
-            df, snapshot_df, construction_logger, is_northern_ireland=True
+            df, snapshot_df, construction_logger, config, is_northern_ireland=True
        )

    elif is_run_all_data_construction:
@@ -73,7 +72,7 @@
            config, check_file_exists, read_csv, construction_logger
        )
        updated_snapshot_df = all_data_construction(
-            df, snapshot_df, construction_logger
+            df, snapshot_df, construction_logger, config
        )

    elif is_run_postcode_construction:
24 changes: 24 additions & 0 deletions src/dev_config.yaml
@@ -199,6 +199,30 @@ breakdowns:
- "headcount_tec_f"
- "headcount_oth_m"
- "headcount_oth_f"
consistency_checks:
2xx_totals:
purchases_split: ["222", "223", "203"]
sal_oth_expend: ["202", "203", "204"]
research_expend: ["205", "206", "207", "204"]
capex: ["219", "220", "209", "210"]
intram: ["204", "210", "211"]
funding: ['212', '214', '216', '242', '250', '243', '244', '245', '246', '247', '248', '249', '218']
ownership: ['225', '226', '227', '228', '229', '237', '218']
equality: ['211', '218']
3xx_totals:
purchases: ['302', '303', '304', '305']
4xx_totals:
emp_civil: ['405', '407', '409', '411']
emp_defence: ['406', '408', '410', '412']
5xx_totals:
hc_res_m: ['501', '503', '505', '507']
hc_res_f: ['502', '504', '506', '508']
apportioned_totals:
employment: ["emp_researcher", "emp_technician", "emp_other", "emp_total"]
hc_male: ["headcount_res_m", "headcount_tec_m", "headcount_oth_m", "headcount_tot_m"]
hc_female: ["headcount_res_f", "headcount_tec_f", "headcount_oth_f", "headcount_tot_f"]
hc_tot: ["headcount_tot_m", "headcount_tot_f", "headcount_total"]

s3:
  ssl_file: "/etc/pki/tls/certs/ca-bundle.crt"
  s3_bucket: "onscdp-dev-data01-5320d6ca"
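The new consistency_checks groups drive the breakdown validation (run_breakdown_validation in src/utils/breakdown_validation.py): per the comment added in imputation_helpers.py below, the last column in each list is the total and the preceding columns should sum to it. A minimal sketch of one such check, with made-up data and an assumed tolerance:

import pandas as pd

# Illustrative rows for the "purchases_split" group: "222" + "223" should equal "203"
df = pd.DataFrame({"222": [1.0, 2.0], "223": [3.0, 4.0], "203": [4.0, 7.0]})

checks = {"purchases_split": ["222", "223", "203"]}
tolerance = 0.001  # assumed, not taken from the config

for name, cols in checks.items():
    *components, total = cols  # last entry is the total column
    failed = (df[components].sum(axis=1) - df[total]).abs() > tolerance
    print(f"{name}: {int(failed.sum())} row(s) fail")  # the second row fails: 2 + 4 != 7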
21 changes: 17 additions & 4 deletions src/imputation/imputation_helpers.py
@@ -279,11 +279,11 @@ def split_df_on_imp_class(df: pd.DataFrame, exclusion_list: List = ["817", "nan"
    (Product Group)- these will generally be filtered out from the imputation classes.
    Where short forms are under consideration, "817" imputation classes will be excluded

    Args:
        df (pd.DataFrame): The dataframe to split
        exclusion_list (List, optional): A list of imputation classes to exclude.

    Returns:
        pd.DataFrame: The filtered dataframe with the invalid imp classes removed
        pd.DataFrame: The excluded dataframe
@@ -363,9 +363,22 @@ def calculate_totals(df):
    return df


def breakdown_checks_after_imputation(df: pd.DataFrame) -> None:
    """After imputation, check that the required columns still sum correctly.

    Args:
        df (pd.DataFrame): The dataframe with imputed values.

    Returns:
        None
    """
    # create dictionary of checks: the last col in the list is the total col
    # the sum of the other cols should equal the total

def tidy_imputation_dataframe(df: pd.DataFrame, to_impute_cols: List) -> pd.DataFrame:
    """Update cols with imputed values and remove rows and columns no longer needed.

    Args:
        df (pd.DataFrame): The dataframe with imputed values.
        to_impute_cols (List): The columns that were imputed.
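The breakdown_checks_after_imputation stub above so far carries its intent only in comments. One possible body, sketched under the assumption that it walks the consistency_checks groups from the config and raises on failure; the config argument, tolerance and error handling are illustrative, not the committed implementation:

import pandas as pd

def breakdown_checks_after_imputation(df: pd.DataFrame, config: dict) -> None:
    """Sketch: check breakdown columns still sum to their totals after imputation."""
    tolerance = 0.001  # assumed
    for group, checks in config["consistency_checks"].items():
        for name, cols in checks.items():
            *components, total = cols  # the last col in the list is the total col
            diff = (df[components].sum(axis=1) - df[total]).abs()
            if (diff > tolerance).any():
                raise ValueError(f"Breakdown check '{group}/{name}' failed after imputation.")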
10 changes: 7 additions & 3 deletions src/imputation/imputation_main.py
@@ -13,9 +13,8 @@
from src.imputation.sf_expansion import run_sf_expansion
from src.imputation import manual_imputation as mimp
from src.imputation.MoR import run_mor
-from src.construction.construction_main import run_construction
from src.mapping.itl_mapping import join_itl_regions
-from src.outputs.outputs_helpers import create_output_df
+from src.utils.breakdown_validation import run_breakdown_validation


ImputationMainLogger = logging.getLogger(__name__)
@@ -144,7 +143,9 @@ def run_imputation(
f"{survey_year}_full_responses_imputed_{tdate}_v{run_id}.csv"
)
wrong_604_filename = f"{survey_year}_wrong_604_error_qa_{tdate}_v{run_id}.csv"
trimmed_counts_filename = f"{survey_year}_tmi_trim_count_qa_{tdate}_v{run_id}.csv"
trimmed_counts_filename = (
f"{survey_year}_tmi_trim_count_qa_{tdate}_v{run_id}.csv"
)

# create trimming qa dataframe with required columns from schema
schema_path = config["schema_paths"]["manual_trimming_schema"]
@@ -160,6 +161,9 @@
    # remove rows and columns no longer needed from the imputed dataframe
    imputed_df = hlp.tidy_imputation_dataframe(imputed_df, to_impute_cols)

+    # Check the imputed values are consistent with breakdown cols summing to totals.
+    run_breakdown_validation(imputed_df, config, check="imputed")
+
    # optionally output backdata for imputation
    if config["global"]["output_backdata"]:
        ImputationMainLogger.info("Outputting backdata for imputation.")
13 changes: 9 additions & 4 deletions src/staging/staging_main.py
@@ -10,6 +10,8 @@
import src.staging.staging_helpers as helpers
from src.staging import validation as val

+# from src.utils.breakdown_validation import run_breakdown_validation

StagingMainLogger = logging.getLogger(__name__)


@@ -160,6 +162,12 @@ def run_staging( # noqa: C901
    rd_file_exists(postcode_mapper, raise_error=True)
    postcode_mapper = rd_read_csv(postcode_mapper)

+    # Staging of the main snapshot data is now complete
+    StagingMainLogger.info("Staging of main snapshot data complete.")
+    # run validation on the breakdowns
+    # run_breakdown_validation(full_responses, config, "staged")
+
    # Staging of the additional data
    if config["global"]["load_manual_outliers"]:
        # Stage the manual outliers file
        StagingMainLogger.info("Loading Manual Outlier File")
@@ -180,10 +188,7 @@
    # Get the latest manual trim file
    manual_trim_path = staging_dict["manual_imp_trim_path"]

-    if (
-        config["global"]["load_manual_imputation"] and
-        rd_file_exists(manual_trim_path)
-    ):
+    if config["global"]["load_manual_imputation"] and rd_file_exists(manual_trim_path):
        StagingMainLogger.info("Loading Imputation Manual Trimming File")
        wanted_cols = ["reference", "instance", "manual_trim"]
        manual_trim_df = rd_read_csv(manual_trim_path, wanted_cols)
2 changes: 1 addition & 1 deletion src/user_config.yaml
@@ -29,7 +29,7 @@ global:
  output_imputation_qa: False
  output_auto_outliers: False
  output_outlier_qa : False
-  output_estimation_qa: True
+  output_estimation_qa: False
  output_apportionment_qa: False
  # Final output settings
  output_long_form: False