Commit

Merge pull request #335 from ONSdigital/RDRP-1004_imp_breakdowns

RDRP-1004 imputation breakdowns

AnneONS authored Sep 18, 2024
2 parents 0150aba + 416a8ad commit 2053b0d
Showing 11 changed files with 609 additions and 118 deletions.
68 changes: 68 additions & 0 deletions helpers/regression_imputation.py
@@ -0,0 +1,68 @@
"""
Regression test to compare two versions of outputs
Reads two csv files, old and new
Selects the columns of interest
Joins old and new on key columns, outer
Checks which records are in old only (left), new only (right) or both
Compares if the old and new values are the same within tolerance
Saves the outputs
"""

#%% Configuration settings
import pandas as pd

# Input folder and file names
root_path = "R:/BERD Results System Development 2023/DAP_emulation/2023_surveys/BERD/06_imputation/imputation_qa/"
in_file_old = "2023_full_responses_imputed_24-09-10_v764.csv"
in_file_new = "tmp_qa_output2.csv"

# Output folder and file
out_fol = root_path
out_file = "imputation_breakdown_check.csv"

# Columns to select
key_cols = ["reference", "instance"]
value_col = "211"
other_cols = [
    "200",
    "201",
    "formtype",
    "imp_class",
    "imp_marker",
    "status",
]
tolerance = 0.001
#%% Read files
cols_read = key_cols + [value_col] + other_cols
df_old = pd.read_csv(root_path + in_file_old, usecols=cols_read)
df_new = pd.read_csv(root_path + in_file_new, usecols=cols_read)

#%% Preliminary check: inner join of old and new on the key columns
df_merge = df_old.merge(df_new, on=key_cols, how="inner", suffixes=("_old", "_new"))

#%% Save the preliminary comparison (overwritten by the outer join output below)
df_merge.to_csv(root_path + out_file, index=False)


#%% Keep only records with good imputation markers
imp_markers_to_keep = ["TMI", "CF", "MoR", "constructed"]
df_old_good = df_old[df_old["imp_marker"].isin(imp_markers_to_keep)]
df_new_good = df_new[df_new["imp_marker"].isin(imp_markers_to_keep)]

#%% sizes
print(f"Old size: {df_old_good.shape}")
print(f"New size: {df_new_good.shape}")

#%% Join old and new with an outer merge, flagging the source of each record
df_merge = df_old_good.merge(
    df_new_good, on=key_cols, how="outer", suffixes=("_old", "_new"), indicator=True
)
#%% Compare the values; rows present on one side only have NaN and flag as False
df_merge["value_different"] = (
    df_merge[value_col + "_old"] - df_merge[value_col + "_new"]
) ** 2 > tolerance**2

# %% Save output
df_merge.to_csv(out_fol + out_file, index=False)

# %%
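A quick way to sanity-check the saved comparison is to tally the merge indicator and the difference flag. This follow-on cell is illustrative only (not part of the commit) and assumes the df_merge built above with indicator=True:

#%% Summarise the comparison (illustrative follow-on cell, not in the commit)
# _merge is "left_only" (old only), "right_only" (new only) or "both"
print(df_merge["_merge"].value_counts())
print(f"{int(df_merge['value_different'].sum())} records differ beyond tolerance")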
2 changes: 1 addition & 1 deletion src/_version.py
@@ -1 +1 @@
-__version__ = "1.1.7"
+__version__ = "2.0.0"
3 changes: 2 additions & 1 deletion src/construction/all_data_construction.py
@@ -24,6 +24,7 @@ def all_data_construction(
    construction_df: pd.DataFrame,
    snapshot_df: pd.DataFrame,
    construction_logger: logging.Logger,
+    config: dict,
    is_northern_ireland: bool = False,
) -> pd.DataFrame:
"""Run all data construction on the GB or NI data.
@@ -122,7 +123,7 @@
    # Check breakdowns
    if not is_northern_ireland:
        updated_snapshot_df = run_breakdown_validation(
-            updated_snapshot_df, check="constructed"
+            updated_snapshot_df, config, check="constructed"
        )

    construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.")
5 changes: 2 additions & 3 deletions src/construction/construction_main.py
@@ -3,7 +3,6 @@
from typing import Callable

import pandas as pd
-import numpy as np

from src.construction.construction_read_validate import (
read_validate_all_construction_files,
@@ -63,7 +62,7 @@ def run_construction( # noqa: C901
            is_northern_ireland=True,
        )
        updated_snapshot_df = all_data_construction(
-            df, snapshot_df, construction_logger, is_northern_ireland=True
+            df, snapshot_df, construction_logger, config, is_northern_ireland=True
        )

    elif is_run_all_data_construction:
@@ -73,7 +72,7 @@
            config, check_file_exists, read_csv, construction_logger
        )
        updated_snapshot_df = all_data_construction(
-            df, snapshot_df, construction_logger
+            df, snapshot_df, construction_logger, config
        )

    elif is_run_postcode_construction:
24 changes: 24 additions & 0 deletions src/dev_config.yaml
@@ -199,6 +199,30 @@ breakdowns:
- "headcount_tec_f"
- "headcount_oth_m"
- "headcount_oth_f"
consistency_checks:
2xx_totals:
purchases_split: ["222", "223", "203"]
sal_oth_expend: ["202", "203", "204"]
research_expend: ["205", "206", "207", "204"]
capex: ["219", "220", "209", "210"]
intram: ["204", "210", "211"]
funding: ['212', '214', '216', '242', '250', '243', '244', '245', '246', '247', '248', '249', '218']
ownership: ['225', '226', '227', '228', '229', '237', '218']
equality: ['211', '218']
3xx_totals:
purchases: ['302', '303', '304', '305']
4xx_totals:
emp_civil: ['405', '407', '409', '411']
emp_defence: ['406', '408', '410', '412']
5xx_totals:
hc_res_m: ['501', '503', '505', '507']
hc_res_f: ['502', '504', '506', '508']
apportioned_totals:
employment: ["emp_researcher", "emp_technician", "emp_other", "emp_total"]
hc_male: ["headcount_res_m", "headcount_tec_m", "headcount_oth_m", "headcount_tot_m"]
hc_female: ["headcount_res_f", "headcount_tec_f", "headcount_oth_f", "headcount_tot_f"]
hc_tot: ["headcount_tot_m", "headcount_tot_f", "headcount_total"]

s3:
  ssl_file: "/etc/pki/tls/certs/ca-bundle.crt"
  s3_bucket: "onscdp-dev-data01-5320d6ca"
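The new consistency_checks groups drive the breakdown validation (run_breakdown_validation in src/utils/breakdown_validation.py): per the comment added in imputation_helpers.py below, the last column in each list is the total and the preceding columns should sum to it. A minimal sketch of one such check, with made-up data and an assumed tolerance:

import pandas as pd

# Illustrative rows for the "purchases_split" group: "222" + "223" should equal "203"
df = pd.DataFrame({"222": [1.0, 2.0], "223": [3.0, 4.0], "203": [4.0, 7.0]})

checks = {"purchases_split": ["222", "223", "203"]}
tolerance = 0.001  # assumed, not taken from the config

for name, cols in checks.items():
    *components, total = cols  # last entry is the total column
    failed = (df[components].sum(axis=1) - df[total]).abs() > tolerance
    print(f"{name}: {int(failed.sum())} row(s) fail")  # the second row fails: 2 + 4 != 7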
21 changes: 17 additions & 4 deletions src/imputation/imputation_helpers.py
@@ -279,11 +279,11 @@ def split_df_on_imp_class(df: pd.DataFrame, exclusion_list: List = ["817", "nan"
    (Product Group)- these will generally be filtered out from the imputation classes.
    Where short forms are under consideration, "817" imputation classes will be excluded

    Args:
        df (pd.DataFrame): The dataframe to split
        exclusion_list (List, optional): A list of imputation classes to exclude.

    Returns:
        pd.DataFrame: The filtered dataframe with the invalid imp classes removed
        pd.DataFrame: The excluded dataframe
@@ -363,9 +363,22 @@ def calculate_totals(df):
    return df


def breakdown_checks_after_imputation(df: pd.DataFrame) -> None:
    """After imputation, check that the required columns still sum correctly.

    Args:
        df (pd.DataFrame): The dataframe with imputed values.

    Returns:
        None
    """
    # create dictionary of checks: the last col in the list is the total col
    # the sum of the other cols should equal the total

def tidy_imputation_dataframe(df: pd.DataFrame, to_impute_cols: List) -> pd.DataFrame:
    """Update cols with imputed values and remove rows and columns no longer needed.

    Args:
        df (pd.DataFrame): The dataframe with imputed values.
        to_impute_cols (List): The columns that were imputed.
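The breakdown_checks_after_imputation stub above so far carries its intent only in comments. One possible body, sketched under the assumption that it walks the consistency_checks groups from the config and raises on failure; the config argument, tolerance and error handling are illustrative, not the committed implementation:

import pandas as pd

def breakdown_checks_after_imputation(df: pd.DataFrame, config: dict) -> None:
    """Sketch: check breakdown columns still sum to their totals after imputation."""
    tolerance = 0.001  # assumed
    for group, checks in config["consistency_checks"].items():
        for name, cols in checks.items():
            *components, total = cols  # the last col in the list is the total col
            diff = (df[components].sum(axis=1) - df[total]).abs()
            if (diff > tolerance).any():
                raise ValueError(f"Breakdown check '{group}/{name}' failed after imputation.")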
10 changes: 7 additions & 3 deletions src/imputation/imputation_main.py
@@ -13,9 +13,8 @@
from src.imputation.sf_expansion import run_sf_expansion
from src.imputation import manual_imputation as mimp
from src.imputation.MoR import run_mor
-from src.construction.construction_main import run_construction
from src.mapping.itl_mapping import join_itl_regions
-from src.outputs.outputs_helpers import create_output_df
+from src.utils.breakdown_validation import run_breakdown_validation


ImputationMainLogger = logging.getLogger(__name__)
@@ -144,7 +143,9 @@ def run_imputation(
f"{survey_year}_full_responses_imputed_{tdate}_v{run_id}.csv"
)
wrong_604_filename = f"{survey_year}_wrong_604_error_qa_{tdate}_v{run_id}.csv"
trimmed_counts_filename = f"{survey_year}_tmi_trim_count_qa_{tdate}_v{run_id}.csv"
trimmed_counts_filename = (
f"{survey_year}_tmi_trim_count_qa_{tdate}_v{run_id}.csv"
)

# create trimming qa dataframe with required columns from schema
schema_path = config["schema_paths"]["manual_trimming_schema"]
@@ -160,6 +161,9 @@
    # remove rows and columns no longer needed from the imputed dataframe
    imputed_df = hlp.tidy_imputation_dataframe(imputed_df, to_impute_cols)

+    # Check the imputed values are consistent with breakdown cols summing to totals.
+    run_breakdown_validation(imputed_df, config, check="imputed")
+
    # optionally output backdata for imputation
    if config["global"]["output_backdata"]:
        ImputationMainLogger.info("Outputting backdata for imputation.")
13 changes: 9 additions & 4 deletions src/staging/staging_main.py
@@ -10,6 +10,8 @@
import src.staging.staging_helpers as helpers
from src.staging import validation as val

+# from src.utils.breakdown_validation import run_breakdown_validation

StagingMainLogger = logging.getLogger(__name__)


@@ -160,6 +162,12 @@ def run_staging( # noqa: C901
    rd_file_exists(postcode_mapper, raise_error=True)
    postcode_mapper = rd_read_csv(postcode_mapper)

+    # Staging of the main snapshot data is now complete
+    StagingMainLogger.info("Staging of main snapshot data complete.")
+    # run validation on the breakdowns
+    # run_breakdown_validation(full_responses, config, "staged")
+
    # Staging of the additional data
    if config["global"]["load_manual_outliers"]:
        # Stage the manual outliers file
        StagingMainLogger.info("Loading Manual Outlier File")
@@ -180,10 +188,7 @@
    # Get the latest manual trim file
    manual_trim_path = staging_dict["manual_imp_trim_path"]

-    if (
-        config["global"]["load_manual_imputation"] and
-        rd_file_exists(manual_trim_path)
-    ):
+    if config["global"]["load_manual_imputation"] and rd_file_exists(manual_trim_path):
        StagingMainLogger.info("Loading Imputation Manual Trimming File")
        wanted_cols = ["reference", "instance", "manual_trim"]
        manual_trim_df = rd_read_csv(manual_trim_path, wanted_cols)
2 changes: 1 addition & 1 deletion src/user_config.yaml
@@ -29,7 +29,7 @@ global:
  output_imputation_qa: False
  output_auto_outliers: False
  output_outlier_qa : False
-  output_estimation_qa: True
+  output_estimation_qa: False
  output_apportionment_qa: False
  # Final output settings
  output_long_form: False