Skip to content

Commit

Permalink
921 merge in develop
Browse files Browse the repository at this point in the history
  • Loading branch information
AnneONS committed Jul 24, 2024
2 parents 423dd9b + e90ee48 commit ad32dd8
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 2 deletions.
19 changes: 19 additions & 0 deletions src/staging/staging_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,22 @@ def stage_validate_harmonise_postcodes(
StagingHelperLogger.info("Finished PostCode Validation")

return full_responses, postcode_mapper


def filter_pnp_data(full_responses):
    """Drop PNP records from the responses data.

    PNP records are those whose 'legalstatus' value is the string "7".

    Args:
        full_responses (pandas.DataFrame): The full responses data. It must
            contain a 'legalstatus' column.

    Returns:
        pandas.DataFrame: The rows of full_responses whose 'legalstatus' is
            not "7", keeping their original index labels.
    """
    # True for every record to keep, i.e. every non-PNP record
    is_not_pnp = full_responses["legalstatus"] != "7"

    return full_responses.loc[is_not_pnp]
4 changes: 2 additions & 2 deletions src/staging/staging_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ def run_staging( # noqa: C901
# Load data from first feather file found
StagingMainLogger.info("Skipping data validation. Loading from feather")
full_responses = helpers.load_snapshot_feather(feather_file, read_feather)
# filter out PNP data legalstatus=7
full_responses = full_responses.loc[(full_responses["legalstatus"] != "7")]
# filter out PNP data equivalent to legalstatus=7
full_responses = helpers.filter_pnp_data(full_responses)
if load_updated_snapshot:
secondary_full_responses = helpers.load_snapshot_feather(
secondary_feather_file, read_feather
Expand Down
68 changes: 68 additions & 0 deletions tests/test_staging/test_staging_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Third Party Imports
import pandas as pd
import numpy as np
from pandas import DataFrame as pandasDF
import pyarrow.feather as feather

# Local Imports
Expand All @@ -22,6 +23,7 @@
load_validate_secondary_snapshot,
df_to_feather,
stage_validate_harmonise_postcodes,
filter_pnp_data,
)
from src.utils.local_file_mods import (
rd_file_exists as check_file_exists,
Expand Down Expand Up @@ -393,3 +395,69 @@ def test_stage_validate_harmonise_postcodes(
assert (
filename in files
), "stage_validate_harmonise_postcodes failed to save out invalid PCs"


class Testfilter_pnp_data:
    """Tests for the filter_pnp_data function."""

    def create_input_df(self):
        """Create the input dataframe for the test.

        Two of the nine rows (references 49900000510 and 49900184433) have
        a legalstatus of "7" and are expected to be filtered out.

        Returns:
            pandas.DataFrame: The input data, including PNP records.
        """
        input_columns = [
            "reference",
            "instance",
            "legalstatus",
            "statusencoded",
            "postcodes_harmonised",
        ]

        # np.nan is used rather than np.NaN: the capitalised alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        data = [
            [49900000404, 0, "1", "210", "AB15 3GU"],
            [49900000406, np.nan, "2", "210", "BA1 5DA"],
            [49900000409, 1, "1", "100", "CB1 3NF"],
            [49900000510, 2, "7", "201", "BA1 5DA"],
            [49912758922, 3, "1", "303", "DE72 3AU"],
            [49900187320, 4, "2", "304", "NP30 7ZZ"],
            [49900184433, 1, "7", "210", "CF10 BZZ"],
            [49911791786, 1, "4", "201", "CF10 BZZ"],
            [49901183959, 4, "1", "309", "SA50 5BE"],
        ]

        # The earlier unassigned `.astype("category")` calls were dropped:
        # astype returns a new Series, so they never changed the frame.
        return pandasDF(data=data, columns=input_columns)

    def create_exp_output_df(self):
        """Create the expected output dataframe for the test.

        This is the input data with the two legalstatus "7" rows removed.

        Returns:
            pandas.DataFrame: The expected data once PNP records are removed.
        """
        exp_output_columns = [
            "reference",
            "instance",
            "legalstatus",
            "statusencoded",
            "postcodes_harmonised",
        ]

        data = [
            [49900000404, 0, "1", "210", "AB15 3GU"],
            [49900000406, np.nan, "2", "210", "BA1 5DA"],
            [49900000409, 1, "1", "100", "CB1 3NF"],
            [49912758922, 3, "1", "303", "DE72 3AU"],
            [49900187320, 4, "2", "304", "NP30 7ZZ"],
            [49911791786, 1, "4", "201", "CF10 BZZ"],
            [49901183959, 4, "1", "309", "SA50 5BE"],
        ]

        return pandasDF(data=data, columns=exp_output_columns)

    def test_filter_pnp_data(self):
        """Test for the filter_pnp_data function."""
        input_df = self.create_input_df()
        exp_df = self.create_exp_output_df()

        result_df = filter_pnp_data(input_df)

        # Filtering keeps the original index labels, so both frames are
        # re-indexed before comparison.
        pd.testing.assert_frame_equal(
            result_df.reset_index(drop=True), exp_df.reset_index(drop=True)
        )

0 comments on commit ad32dd8

Please sign in to comment.