Skip to content

Commit

Permalink
Merge pull request #334 from ONSdigital/RDRP-1001_split_construction
Browse files Browse the repository at this point in the history
RDRP-1001: Form sent out/short form instance fix
  • Loading branch information
AnneONS authored Sep 17, 2024
2 parents f4c2420 + 00569d9 commit 0150aba
Show file tree
Hide file tree
Showing 11 changed files with 429 additions and 314 deletions.
130 changes: 130 additions & 0 deletions src/construction/all_data_construction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import logging
from typing import Callable

import pandas as pd
import numpy as np

from src.utils.breakdown_validation import run_breakdown_validation

from src.construction.construction_helpers import (
prepare_forms_gb,
clean_construction_type,
add_constructed_nonresponders,
remove_short_to_long_0,
finalise_forms_gb,
replace_values_in_construction,
)
from src.construction.construction_validation import (
validate_short_to_long,
validate_construction_references,
)


def all_data_construction(
    construction_df: pd.DataFrame,
    snapshot_df: pd.DataFrame,
    construction_logger: logging.Logger,
    is_northern_ireland: bool = False,
) -> pd.DataFrame:
    """Run all data construction on the GB or NI data.

    This process is different from the postcode only construction that happens
    after imputation.

    Note that a legacy "short_to_long" column is renamed to "construction_type"
    and the "construction_type" column is cleaned in place, i.e. on the
    construction dataframe object passed in by the caller. The snapshot
    dataframe is copied before any changes are made to it.

    Args:
        construction_df (pd.DataFrame): The construction data
        snapshot_df (pd.DataFrame): The snapshot data
        construction_logger (logging.Logger): The logger for the construction
        is_northern_ireland (bool, optional): Whether the data is for Northern
            Ireland. Defaults to False.

    Raises:
        ValueError: If the cleaned "construction_type" column contains a value
            other than "short_to_long", "new" or a missing value.

    Returns:
        pd.DataFrame: The snapshot data with the constructed values
    """
    # to ensure compatibility, change short_to_long to construction_type
    # short_to_long used for 2022
    if "short_to_long" in construction_df.columns:
        construction_df.rename(
            columns={"short_to_long": "construction_type"}, inplace=True
        )
        # Element-wise comparison against True is intended here (not `is True`)
        construction_df.loc[
            construction_df["construction_type"] == True,  # noqa: E712
            "construction_type",
        ] = "short_to_long"

    # clean construction type column
    if "construction_type" in construction_df.columns:
        construction_df.construction_type = construction_df.construction_type.apply(
            clean_construction_type
        )
        # validate that 'construction_type' is valid
        # np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed
        valid_types = ["short_to_long", "new", np.nan]
        if not construction_df.construction_type.isin(valid_types).all():
            raise ValueError(
                f"Invalid value for construction_type. Expected one of {valid_types}"
            )

    if not is_northern_ireland:
        validate_short_to_long(construction_df, construction_logger)

    # validate the references passed in construction
    validate_construction_references(
        construction_df=construction_df,
        snapshot_df=snapshot_df,
        logger=construction_logger,
    )

    # Drop columns without constructed values
    construction_df = construction_df.dropna(axis="columns", how="all")

    # Make a copy of the snapshot
    updated_snapshot_df = snapshot_df.copy()

    # Add flags to indicate whether a row was constructed or should be imputed
    updated_snapshot_df["is_constructed"] = False
    updated_snapshot_df["force_imputation"] = False
    construction_df["is_constructed"] = True

    # Run GB specific actions
    if not is_northern_ireland:
        updated_snapshot_df, construction_df = prepare_forms_gb(
            updated_snapshot_df, construction_df
        )

    # NI data has no instance but needs an instance of 1
    if is_northern_ireland:
        construction_df["instance"] = 1

    # Add constructed non-responders (i.e. new rows) to df.
    # The column can be absent here: it is optional in the input and is dropped
    # above when it holds no values at all.
    if (
        "construction_type" in construction_df.columns
        and "new" in construction_df["construction_type"].values
    ):
        updated_snapshot_df, construction_df = add_constructed_nonresponders(
            updated_snapshot_df, construction_df
        )

    updated_snapshot_df, construction_df = replace_values_in_construction(
        updated_snapshot_df, construction_df
    )

    if (
        "construction_type" in construction_df.columns
        and "short_to_long" in construction_df["construction_type"].values
    ):
        # Restore the index levels as ordinary columns before passing the
        # construction data on to remove_short_to_long_0
        construction_df.reset_index(inplace=True)
        updated_snapshot_df = remove_short_to_long_0(
            updated_snapshot_df, construction_df
        )

    # Run GB specific actions
    if not is_northern_ireland:
        updated_snapshot_df = finalise_forms_gb(updated_snapshot_df)

    updated_snapshot_df = updated_snapshot_df.sort_values(
        ["reference", "instance"], ascending=[True, True]
    ).reset_index(drop=True)

    # Check breakdowns
    if not is_northern_ireland:
        updated_snapshot_df = run_breakdown_validation(
            updated_snapshot_df, check="constructed"
        )

    construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.")

    return updated_snapshot_df
71 changes: 57 additions & 14 deletions src/construction/construction_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def prepare_forms_gb(
def prepare_short_to_long(updated_snapshot_df, construction_df):
"""Create addional instances for short to long construction"""

construction_df.loc[construction_df["construction_type"] == "short_to_long", "604"] = "Yes"
construction_df.loc[
construction_df["construction_type"] == "short_to_long", "604"
] = "Yes"

# Check which references are going to be converted to long forms
# and how many instances they have
Expand Down Expand Up @@ -233,9 +235,8 @@ def remove_short_to_long_0(


def prep_new_rows(
rows_to_add: pd.DataFrame,
updated_snapshot_df: pd.DataFrame
) -> pd.DataFrame:
rows_to_add: pd.DataFrame, updated_snapshot_df: pd.DataFrame
) -> pd.DataFrame:
"""Prepare new rows from construction to be added to the snapshot.
Args:
Expand All @@ -246,26 +247,68 @@ def prep_new_rows(
ValueError: Raised if there are rows with missing formtype/cellnumber.
Returns:
pd.DataFrame: The new rows (from construction) containing formtype and
pd.DataFrame: The new rows (from construction) containing formtype and
cellnumber.
"""
# iterate through new rows and add formtype/cellnumber from snapshot
for index, row in rows_to_add.iterrows():
if pd.isna(row['formtype']) or pd.isna(row['cellnumber']):
reference = row['reference']
snapshot_row = updated_snapshot_df[updated_snapshot_df['reference'] == reference].iloc[0]
if pd.isna(row['formtype']):
rows_to_add.at[index, 'formtype'] = snapshot_row['formtype']
if pd.isna(row['cellnumber']):
rows_to_add.at[index, 'cellnumber'] = snapshot_row['cellnumber']
if pd.isna(row["formtype"]) or pd.isna(row["cellnumber"]):
reference = row["reference"]
snapshot_row = updated_snapshot_df[
updated_snapshot_df["reference"] == reference
].iloc[0]
if pd.isna(row["formtype"]):
rows_to_add.at[index, "formtype"] = snapshot_row["formtype"]
if pd.isna(row["cellnumber"]):
rows_to_add.at[index, "cellnumber"] = snapshot_row["cellnumber"]
# obtain references with missing formtype/cellnumber
missing_references = rows_to_add[
rows_to_add['formtype'].isna() | rows_to_add['cellnumber'].isna()
]['reference']
rows_to_add["formtype"].isna() | rows_to_add["cellnumber"].isna()
]["reference"]
if not missing_references.empty:
raise ValueError(
"Missing formtype and/or cellnumber for new reference in construction: "
f"ref {missing_references.tolist()}"
)

return rows_to_add


def replace_values_in_construction(
    updated_snapshot_df: pd.DataFrame, construction_df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Overwrite snapshot values with the matching constructed values.

    Rows are matched on reference, instance and period_year. Both dataframes
    are modified in place: the construction dataframe is left indexed on those
    three keys, while the snapshot dataframe has them restored as columns.

    Args:
        updated_snapshot_df (pd.DataFrame): The updated snapshot dataframe.
        construction_df (pd.DataFrame): The construction dataframe.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The snapshot dataframe holding the
            constructed values (key columns cast to "Int64"), and the
            construction dataframe indexed on the key columns.
    """
    key_columns = ["reference", "instance", "period_year"]

    # Index both frames on the same keys so that update() aligns rows on them
    for frame in (construction_df, updated_snapshot_df):
        frame.set_index(key_columns, inplace=True)

    updated_snapshot_df.update(construction_df)
    updated_snapshot_df.reset_index(inplace=True)

    # Cast the key columns to the nullable integer dtype
    key_dtypes = {column: "Int64" for column in key_columns}
    return updated_snapshot_df.astype(key_dtypes), construction_df
Loading

0 comments on commit 0150aba

Please sign in to comment.