diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0a8a41daf..4f4d4f37e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,14 +33,14 @@ repos: - id: black name: black - consistent Python code formatting (auto-fixes) language_version: python # Should be a command that runs python3.6+ - exclude: .*/tests/.*|^\.cruft\.json$|.*\tests\.*|helpers/.* + exclude: .*/tests/.*|^\.cruft\.json$|.*\tests\.*|helpers/.*|^tests args: ["--experimental-string-processing", "--line-length=88"] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 name: flake8 - Python linting - exclude: .*/tests*|^\.cruft\.json|helpers/.*|.*\tests*$ + exclude: .*/tests*|^\.cruft\.json|helpers/.*|.*\tests*$|^tests - repo: https://github.com/nbQA-dev/nbQA rev: 0.12.0 hooks: diff --git a/config/backdata_schema.toml b/config/backdata_schema.toml index e6d5719cc..c2cfe4a8e 100644 --- a/config/backdata_schema.toml +++ b/config/backdata_schema.toml @@ -1,1157 +1,367 @@ -# Network schema file matches working schema on DAP - [reference] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = false -Current_Data_Type = "str" -Length = ">=1" -Min_values = 11001603625 -Max_values = 19891309165 -Possible_Categorical_Values = ["nan"] - -[status] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = false -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] +old_name = "reference" +Deduced_Data_Type = "int64" [instance] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] -q_code = "" -old_ref = "nan" +old_name = "instance" +Deduced_Data_Type = "int64" [101] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan",] -q_code = "" -old_ref = "nan" - -[102] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan",] -q_code = "" -old_ref = "nan" +old_name = "101" +Deduced_Data_Type = "object" [103] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan",] -q_code = "" -old_ref = "nan" +old_name = "103" +Deduced_Data_Type = "float64" [104] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan",] -q_code = "" -old_ref = "nan" +old_name = "104" +Deduced_Data_Type = "float64" [200] -Description = "Business type: Civil or Defence" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_Categorical_Values = [] -q_code = "" -old_ref = "c_or_d" +old_name = "200" +Deduced_Data_Type = "object" [201] -Description = "Published Product Group" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_Categorical_Values = [] -q_code = "" -old_ref = "selgrpno" +old_name = "201" +Deduced_Data_Type = "object" [202] -Description = "Salaries & Wages" +old_name = "202" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_Categorical_Values = [] -q_code = "q202" -old_ref = "slries" [203] -Description = "Other current expenditure" +old_name = "203" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_Categorical_Values = [] -q_code = "q203" -old_ref = "curr_oth" [204] -Description = "Total Current Expenditure" +old_name = "204" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q204" -old_ref = "curr_tot" [205] -Description = "Basic Research" +old_name = "205" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q205" -old_ref = "res_bas" [206] -Description = "Applied Research" +old_name = "206" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q206" -old_ref = "res_app" [207] -Description = "Experimental Development" +old_name = "207" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q207" -old_ref = "experm" [209] -Description = "Equipment & Machinery CapEx" +old_name = "209" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q209" -old_ref = "cxplm" [210] -Description = "Total Capex." +old_name = "210" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q210" -old_ref = "None" [211] -Description = "Total Inhouse Expenditure" +old_name = "211" Deduced_Data_Type = "float64" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q211" -old_ref = "intram" [212] -Description = "Own Funds" +old_name = "212" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q212" -old_ref = "own_f" [214] -Description = "Funding - UK government" +old_name = "214" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q214" -old_ref = "ex_uk_gv" [216] -Description = "Funding - Other UK Private Bus/Public Orgs" +old_name = "216" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q216" -old_ref = "ex_uk" [218] -Description = "Total Funding" +old_name = "218" Deduced_Data_Type = "float64" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q218" -old_ref = "f_tot" [219] -Description = "Land Acquired for R&D (Split of Land & Build CapEx)" +old_name = "219" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q219" -old_ref = "None" [220] -Description = "Buildings acquired/constructed for R&D (Split of Land & Build CapEx)" +old_name = "220" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q220" -old_ref = "None" [221] -Description = "Expenditure on computer software only (of which from Equipment & Machinery CapEx)" +old_name = "221" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q221" -old_ref = "None" [222] -Description = "Purchase of Materials (Split of Other current)" +old_name = "222" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q222" -old_ref = "None" [223] -Description = "Purchase of Services (Split of Other current)" +old_name = "223" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q223" -old_ref = "None" [225] -Description = "Ownership - Own Business" +old_name = "225" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q225" -old_ref = "None" [226] -Description = "Ownership - UK Government" +old_name = "226" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q226" -old_ref = "None" [227] -Description = "Ownership - Other UK Priv Bus" +old_name = "227" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q227" -old_ref = "None" [228] -Description = "Ownership - Other UK Orgs" +old_name = "228" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q228" -old_ref = "None" [229] -Description = "Ownership - Bus Enterprises in Group Outside UK" +old_name = "229" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q229" -old_ref = "None" [237] -Description = "Ownership - not owned freely available" +old_name = "237" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q237" -old_ref = "None" [242] -Description = "Funding - Any other UK organisations" +old_name = "242" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q242" -old_ref = "None" [243] -Description = "Funding - Business Enterprises in group outside UK" +old_name = "243" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q243" -old_ref = "None" [244] -Description = "Funding - Other Business Enterprises outside UK" +old_name = "244" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q244" -old_ref = "None" [245] -Description = "Funding - Other Governments outside UK" +old_name = "245" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q245" -old_ref = "None" [246] -Description = "Funding - Higher Education Est Outside UK" +old_name = "246" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q246" -old_ref = "None" [247] -Description = "Funding - Non-profit Orgs outside UK" +old_name = "247" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q247" -old_ref = "None" [248] -Description = "Funding - International Orgs" +old_name = "248" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q248" -old_ref = "None" [249] -Description = "Funding - Any other orgs outside UK" +old_name = "249" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q249" -old_ref = "None" [250] -Description = "Funding - UK Higher Education Establishments" +old_name = "250" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q250" -old_ref = "None" [251] -Description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q251" -old_ref = "None" +old_name = "251" +Deduced_Data_Type = "object" [300] -Description = "nan" +old_name = "300" Deduced_Data_Type = "object" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [301] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" +old_name = "301" +Deduced_Data_Type = "float64" [302] -Description = "Purchased/funded R&D in the UK (Yes or No)" +old_name = "302" Deduced_Data_Type = "float64" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q302" -old_ref = "None" [303] -Description = "Purchased Outside UK (Govt Funded)" +old_name = "303" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q303" -old_ref = "None" [304] -Description = "Purchased Outside UK (Other)" +old_name = "304" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q304" -old_ref = "None" [305] -Description = "Total Purchased" +old_name = "305" Deduced_Data_Type = "float64" -Nullable = "Yes" -Length = "# an integer of the length, or N/A" -Min_values = 0 -Max_values = 1000000 -Possible_categorical_Values = [] -q_code = "q305" -old_ref = "None" [307] -Description = "Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q307" -old_ref = "None" +old_name = "307" +Deduced_Data_Type = "object" [308] -Description = "Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q308" -old_ref = "None" +old_name = "308" +Deduced_Data_Type = "object" [309] -Description = "Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q309" -old_ref = "None" +old_name = "309" +Deduced_Data_Type = "object" [405] -Description ="emp_researcher (Defence)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "405" +Deduced_Data_Type = "float64" [406] -Description ="emp_researcher (Civil)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "406" +Deduced_Data_Type = "float64" [407] -Description ="emp_technician (Civil)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "407" +Deduced_Data_Type = "float64" [408] -Description ="emp_technician (Defence)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "408" +Deduced_Data_Type = "float64" [409] -Description ="emp_other (Civil)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "409" +Deduced_Data_Type = "float64" [410] -Description ="emp_other (Defence)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "410" +Deduced_Data_Type = "float64" [411] -Description ="total fte(Civil)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "411" +Deduced_Data_Type = "float64" [412] -Description ="total fte(Defence)" -Deduced_Data_Type ="float64" -Nullable ="Yes" -Length ="# an integer of the length, or N/A" -Min_values = 0 -Max_values =1000000 -Possible_categorical_Values =[ ] -q_code ="" +old_name = "412" +Deduced_Data_Type = "float64" [501] -Description = "nan" +old_name = "501" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [502] -Description = "nan" +old_name = "502" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [503] -Description = "nan" +old_name = "503" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [504] -Description = "nan" +old_name = "504" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [505] -Description = "nan" +old_name = "505" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [506] -Description = "nan" +old_name = "506" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [507] -Description = "nan" +old_name = "507" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [508] -Description = "nan" +old_name = "508" Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [601] -Description = "nan" +old_name = "601" Deduced_Data_Type = "object" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" [602] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[603] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" +old_name = "602" +Deduced_Data_Type = "float64" [604] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[605] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" - -[701] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[702] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "Int64" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[703] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[704] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[705] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[706] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[707] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" +old_name = "604" +Deduced_Data_Type = "float64" [708] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[709] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[710] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[711] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[712] -Description = "nan" +old_name = "708" Deduced_Data_Type = "object" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[713] -Description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q713" -old_ref = "None" - -[714] -Description = "Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM" -Deduced_Data_Type = "str" -Nullable = "No" -Length = "# an integer of the length, or N/A" -Min_values = "None" -Max_values = "None" -Possible_categorical_Values = [] -q_code = "q714" -old_ref = "None" - -[9000] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9001] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9002] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9003] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9004] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9005] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9006] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9007] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9008] -Description = "nan" -Deduced_Data_Type = "float64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9009] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9010] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9011] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9012] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" - -[9013] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = "nan" -Current_Data_Type = "object" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [ "nan",] -q_code = "" -old_ref = "nan" + +[period] +old_name = "period" +Deduced_Data_Type = "int64" + +[survey] +old_name = "survey" +Deduced_Data_Type = "int64" + +[status] +old_name = "status" +Deduced_Data_Type = "object" + +[formid] +old_name = "formid" +Deduced_Data_Type = "int64" + +[formtype] +old_name = "formtype" +Deduced_Data_Type = "int64" [cellnumber] -Description = "nan" -Deduced_Data_Type = "Int64" -Nullable = false -Current_Data_Type = "Int64" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [0] +old_name = "cellnumber" +Deduced_Data_Type = "int64" + +[pg_numeric] +old_name = "pg_numeric" +Deduced_Data_Type = "float64" + +[emp_researcher] +old_name = "emp_researcher" +Deduced_Data_Type = "float64" + +[emp_technician] +old_name = "emp_technician" +Deduced_Data_Type = "float64" + +[emp_other] +old_name = "emp_other" +Deduced_Data_Type = "float64" + +[emp_total] +old_name = "emp_total" +Deduced_Data_Type = "float64" + +[headcount_res_m] +old_name = "headcount_res_m" +Deduced_Data_Type = "float64" + +[headcount_res_f] +old_name = "headcount_res_f" +Deduced_Data_Type = "float64" + +[headcount_tec_m] +old_name = "headcount_tec_m" +Deduced_Data_Type = "float64" + +[headcount_tec_f] +old_name = "headcount_tec_f" +Deduced_Data_Type = "float64" + +[headcount_oth_m] +old_name = "headcount_oth_m" +Deduced_Data_Type = "float64" + +[headcount_oth_f] +old_name = "headcount_oth_f" +Deduced_Data_Type = "float64" + +[headcount_tot_m] +old_name = "headcount_tot_m" +Deduced_Data_Type = "float64" + +[headcount_tot_f] +old_name = "headcount_tot_f" +Deduced_Data_Type = "float64" + +[headcount_total] +old_name = "headcount_total" +Deduced_Data_Type = "float64" + +[imp_class] +old_name = "imp_class" +Deduced_Data_Type = "object" + +[imp_marker] +old_name = "imp_marker" +Deduced_Data_Type = "object" diff --git a/src/_version.py b/src/_version.py index 382021f30..9e604c040 100644 --- a/src/_version.py +++ b/src/_version.py @@ -1 +1 @@ -__version__ = "1.0.6" +__version__ = "1.0.7" diff --git a/src/dev_config.yaml b/src/dev_config.yaml index 15de1e669..8537d0810 100644 --- a/src/dev_config.yaml +++ b/src/dev_config.yaml @@ -42,6 +42,7 @@ imputation_paths: folder: "06_imputation" qa_path: "imputation_qa" manual_trimming_path: "manual_trimming" + backdata_out_path: "backdata_output" outliers_paths: folder: "07_outliers" qa_path: "outliers_qa" @@ -61,11 +62,12 @@ export_paths: network_paths: root: "R:/BERD Results System Development 2023/DAP_emulation/" logs_foldername: "logs/run_logs" # dev config - snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/survey_return_data/snapshot-202212-002-83b5bacd-7c99-45cf-b989-d43d762dd054.json" + snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/spp_snapshots/2023_snapshots/snapshot-202312-002-b9b6048a-51c9-4669-919a-e92fc6e9c433.json" # dev config secondary_snapshot_path: "R:/BERD Results System Development 2023/DAP_emulation/berd_survey/anonymised/v1/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" # TODO Check if this works ni_full_responses_path: "03_northern_ireland/2021/TEST_ni.csv" # TESTER FILE # Imputation and outliers input paths - backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv" + # backdata_path: "R:/BERD Results System Development 2023/2021_data/2021_backdata_expanded.csv" + backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_24-07-19_v604.csv" manual_imp_trim_path: "06_imputation/manual_trimming/trimming_qa_2023-11-27_v359.csv" manual_outliers_path: "07_outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv" # Construction paths @@ -77,6 +79,7 @@ network_paths: pcode_val_path: "01_staging/staging_qa/postcode_validation" # schema paths schema_paths: + manual_trimming_schema: "config/output_schemas/manual_trimming_qa_schema.toml" short_form_schema: "config/output_schemas/short_form_schema.toml" long_form_schema: "config/output_schemas/long_form_schema.toml" tau_schema: "config/output_schemas/tau_schema.toml" @@ -109,7 +112,7 @@ run_log_sql: log_db: "test_runlog" log_mode: "append" estimation: - num_expected_cellnos: 588 # the number of cell_no items expected in the coverage mapper + num_expected_cellnos: 589 # the number of cell_no items expected in the coverage mapper numeric_cols: ["701", "702", "703", "704", "705", "706", "707", "709", "710", "711"] imputation: lf_target_vars: diff --git a/src/estimation/estimation_main.py b/src/estimation/estimation_main.py index 4a0f1dd31..96ff62e5a 100644 --- a/src/estimation/estimation_main.py +++ b/src/estimation/estimation_main.py @@ -31,8 +31,6 @@ def run_estimation( """ EstMainLogger.info("Starting estimation weights calculation...") - est_qa_path = config["estimation_paths"]["qa_path"] - # # clean and create a dictionary from the cellno mapper # cell_unit_dict = cmap.cellno_unit_dict(cellno_df) @@ -50,6 +48,7 @@ def run_estimation( EstMainLogger.info("Outputting estimation QA file.") tdate = datetime.now().strftime("%y-%m-%d") survey_year = config["years"]["survey_year"] + est_qa_path = config["estimation_paths"]["qa_path"] cell_qa_filename = f"{survey_year}_estimation_weights_qa_{tdate}_v{run_id}.csv" full_qa_filename = f"{survey_year}_full_estimation_qa_{tdate}_v{run_id}.csv" write_csv(f"{est_qa_path}/{cell_qa_filename}", qa_df) diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py index 20f680ab2..e8a1e12b8 100644 --- a/src/imputation/MoR.py +++ b/src/imputation/MoR.py @@ -1,18 +1,15 @@ """Functions for the Mean of Ratios (MoR) methods.""" import itertools +import re import pandas as pd import numpy as np -import re -from src.staging import postcode_validation as pcval -from src.imputation.apportionment import run_apportionment from src.imputation.tmi_imputation import ( create_imp_class_col, trim_bounds, calculate_totals, ) - good_statuses = ["Clear", "Clear - overridden"] bad_statuses = ["Form sent out", "Check needed"] @@ -62,27 +59,17 @@ def mor_preprocessing(df, backdata): df (pd.DataFrame): full responses for the current year backdata (pd.Dataframe): backdata file read in during staging. """ - # Convert backdata column names from qXXX to XXX - # Note that this is only applicable when using the backdata on the network - p = re.compile(r"q\d{3}") - cols = [col for col in list(backdata.columns) if p.match(col)] - to_rename = {col: col[1:] for col in cols} - backdata = backdata.rename(columns=to_rename) - # Add a QA column for the group size df["cf_group_size"] = np.nan # TODO move this to imputation main # Select only values to be imputed df = create_imp_class_col(df, "200", "201") - backdata = create_imp_class_col(backdata, "200", "201") imputation_cond = (df["formtype"] == "0001") & (df["status"].isin(bad_statuses)) to_impute_df = df.copy().loc[imputation_cond, :] remainder_df = df.copy().loc[~imputation_cond, :] - backdata = run_apportionment(backdata) - clear_status_cond = backdata["status"].isin(good_statuses) # Only pick up clear statuses from backdata @@ -133,11 +120,6 @@ def carry_forwards(df, backdata, impute_vars): # Copy values from relevant columns where references match match_cond = df["_merge"] == "both" - # Apply the postcode formatting to clean the postcodes in col 601 of the back data - df.loc[match_cond, "601_prev"] = df.loc[match_cond, "601_prev"].apply( - pcval.format_postcodes - ) - # Replace the values of certain columns with the values from the back data replace_vars = ["instance", "200", "201", "601", "602", "604"] for var in replace_vars: diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index d2bc9362e..e032d7050 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -4,6 +4,8 @@ from typing import List, Dict, Tuple, Callable from itertools import chain +from src.staging.validation import load_schema + ImputationHelpersLogger = logging.getLogger(__name__) @@ -335,3 +337,26 @@ def tidy_imputation_dataframe( df = df.drop(columns=to_drop) return df + + +def create_new_backdata(backdata: pd.DataFrame, config) -> pd.DataFrame: + """Create a new backdata dataframe with the required columns from schema. + + Use the backdata toml schema to select the required columns from the backdata. + filter for the clear and imputed statuses. + + Args: + backdata (pd.DataFrame): The backdata dataframe. + + Returns: + pd.DataFrame: The filtered backdata with only the required columns. + """ + # filter for the clear and imputed statuses + imp_markers_to_keep: list = ["R", "TMI", "CF", "MoR", "constructed"] + backdata = backdata.loc[backdata["imp_marker"].isin(imp_markers_to_keep)] + + # get the wanted columns from the backdata schema + schema = load_schema("./config/backdata_schema.toml") + wanted_cols = list(schema.keys()) + + return backdata[wanted_cols] diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index 58b61f941..3a45a1fb4 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -1,5 +1,6 @@ """The main file for the Imputation module.""" import logging +import os import pandas as pd from typing import Callable, Dict, Any from datetime import datetime @@ -13,6 +14,7 @@ # from src.imputation.MoR import run_mor from src.imputation.sf_expansion import run_sf_expansion from src.imputation import manual_imputation as mimp +from src.imputation.MoR import run_mor from src.outputs.outputs_helpers import create_output_df @@ -91,10 +93,10 @@ def run_imputation( trimmed_df, df = hlp.split_df_on_trim(df, "manual_trim") # Run MoR - # if backdata is not None: - # MoR will be re-written with new backdata - # lf_target_vars = config["imputation"]["lf_target_vars"] - # df, links_df = run_mor(df, backdata, to_impute_cols, lf_target_vars, config) + if backdata is not None: + # MoR will be re-written with new backdata + lf_target_vars = config["imputation"]["lf_target_vars"] + df, links_df = run_mor(df, backdata, to_impute_cols, lf_target_vars, config) # Run TMI for long forms and short forms imputed_df, qa_df = tmi.run_tmi(df, config) @@ -124,14 +126,12 @@ def run_imputation( ).reset_index(drop=True) # Output QA files + tdate = datetime.now().strftime("%y-%m-%d") + survey_year = config["years"]["survey_year"] if config["global"]["output_imputation_qa"]: - ImputationMainLogger.info("Outputting Imputation files.") - tdate = datetime.now().strftime("%y-%m-%d") - survey_year = config["years"]["survey_year"] + ImputationMainLogger.info("Outputting Imputation QA files.") trim_qa_filename = f"{survey_year}_trimming_qa_{tdate}_v{run_id}.csv" - # if config["global"]["load_backdata"]: - # links_filename = f"{survey_year}_links_qa_{tdate}_v{run_id}.csv" full_imp_filename = ( f"{survey_year}_full_responses_imputed_{tdate}_v{run_id}.csv" ) @@ -142,11 +142,13 @@ def run_imputation( schema_dict = load_schema(schema_path) trimming_qa_output = create_output_df(qa_df, schema_dict) - write_csv(f"{qa_path}/{trim_qa_filename}", trimming_qa_output) - write_csv(f"{qa_path}/{full_imp_filename}", imputed_df) - write_csv(f"{qa_path}/{wrong_604_filename}", wrong_604_qa_df) - # if config["global"]["load_backdata"]: - # write_csv(f"{qa_path}{links_filename}", links_df) + write_csv(os.path.join(qa_path, trim_qa_filename), trimming_qa_output) + write_csv(os.path.join(qa_path, full_imp_filename), imputed_df) + write_csv(os.path.join(qa_path, wrong_604_filename), wrong_604_qa_df) + if config["global"]["load_backdata"]: + links_filename = f"{survey_year}_links_qa_{tdate}_v{run_id}.csv" + write_csv(os.path.join(qa_path, links_filename), links_df) + ImputationMainLogger.info("Finished Imputation calculation.") # remove rows and columns no longer needed from the imputed dataframe @@ -159,4 +161,12 @@ def run_imputation( run_id, ) + # optionally output backdata for imputation + if config["global"]["output_backdata"]: + ImputationMainLogger.info("Outputting backdata for imputation.") + backdata_path = config["imputation_paths"]["backdata_out_path"] + backdata_filename = f"{survey_year}_backdata_{tdate}_v{run_id}.csv" + new_backdata = hlp.create_new_backdata(imputed_df, config) + write_csv(os.path.join(backdata_path, backdata_filename), new_backdata) + return imputed_df diff --git a/src/mapping/conv_2021_backdata.py b/src/mapping/conv_2021_backdata.py new file mode 100644 index 000000000..848bcab09 --- /dev/null +++ b/src/mapping/conv_2021_backdata.py @@ -0,0 +1,116 @@ +"""NOTE: This is a temporary script to convert the 2021 backdata to the format required +for MoR imputation. When the mapping module is complete, we can produce a one-off update +of the 2021 data and remove this script.""" + +import os +import re +import logging +import pandas as pd + +from src.utils.local_file_mods import rd_read_csv, rd_write_csv +from src.staging import staging_helpers as stage_hlp +from src.staging import postcode_validation as pcval +from src.mapping.pg_conversion import pg_to_pg_mapper +from src.imputation.tmi_imputation import create_imp_class_col +from src.imputation.apportionment import run_apportionment + +MappingMainLogger = logging.getLogger(__name__) + + +def do_pg_conv(backdata, config) -> pd.DataFrame: + + # Load and validate the PG mappers + pg_num_alpha = stage_hlp.load_validate_mapper( + "pg_num_alpha_mapper_path", + config, + MappingMainLogger, + ) + + backdata = pg_to_pg_mapper( + backdata, + pg_num_alpha, + ) + return backdata + + +def prep_2021_backdata(backdata) -> pd.DataFrame: + """Prepare the backdata for MoR imputation. + + Args: + backdata (pd.DataFrame): Backdata for the current year. + + Returns: + pd.DataFrame: Prepped backdata. + """ + # Convert backdata column names from qXXX to XXX + # Note that this is only applicable when using the backdata on the network + p = re.compile(r"q\d{3}") + cols = [col for col in list(backdata.columns) if p.match(col)] + to_rename = {col: col[1:] for col in cols} + backdata = backdata.rename(columns=to_rename) + + # Apply the postcode formatting to clean the postcodes in col 601 of the back data + backdata["601"] = backdata["601"].apply(pcval.format_postcodes) + + return backdata + + +def get_backdate_wanted_cols(backdata: pd.DataFrame, config: dict) -> pd.DataFrame: + """Get the columns required for the backdata. + + Args: + backdata (pd.DataFrame): The backdata. + config (dict): The configuration settings. + + Returns: + pd.DataFrame: The backdata with only the required columns. + """ + # Load the columns to keep + backdata_cols = stage_hlp.load_required_columns( + "backdata_required_cols_path", + config, + MappingMainLogger, + ) + + # Get the columns that are in the backdata + cols = list(backdata.columns) + wanted_cols = [col for col in backdata_cols if col in cols] + + return backdata[wanted_cols] + + +def create_imp_marker_col(df: pd.DataFrame) -> pd.DataFrame: + """Create the imp_marker column for the backdata. + + Args: + df (pd.DataFrame): The backdata. + + Returns: + pd.DataFrame: The backdata with the imp_marker column. + """ + clear_responders_mask = df.status.isin(["Clear", "Clear - overridden"]) + df.loc[clear_responders_mask, "imp_marker"] = "R" + df.loc[~clear_responders_mask, "imp_marker"] = "no_imputation" + + return df + + +def create_backdata(backdata: pd.DataFrame, config: dict) -> pd.DataFrame: + staging_dict = config["staging_paths"] + backdata_path = staging_dict["backdata_path"] + + backdata = rd_read_csv(backdata_path) + + backdata = prep_2021_backdata(backdata) + + backdata = do_pg_conv(backdata, config) + + backdata = run_apportionment(backdata) + + backdata = create_imp_class_col(backdata, "200", "201") + + backdata = create_imp_marker_col(backdata) + + backdata_out_path = config["imputation_paths"]["backdata_out_path"] + + rd_write_csv(os.path.join(backdata_out_path, "2021_backdata.csv"), backdata) diff --git a/src/mapping/mapping_main.py b/src/mapping/mapping_main.py index 7bc87290b..580e34bb8 100644 --- a/src/mapping/mapping_main.py +++ b/src/mapping/mapping_main.py @@ -111,4 +111,4 @@ def run_mapping( MappingMainLogger.info("Finished Mapping NI QA calculation.") # return mapped_df - return (full_responses, ni_full_responses, itl_mapper, cellno_df) + return (full_responses, ni_full_responses, itl_mapper) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index 3123ee46a..265fee09d 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -59,7 +59,6 @@ def run_outputs( # noqa: C901 sic_division_detailed (pd.DataFrame): Detailed descriptons of SIC divisions """ - # Remove instance 0 from weighted df, so that it does not go to Tau outputs weighted_df = weighted_df.copy().loc[weighted_df.instance != 0] (ni_full_responses, outputs_df, tau_outputs_df) = form_output_prep( diff --git a/src/pipeline.py b/src/pipeline.py index 6dc28dd25..4ba23829c 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -17,6 +17,7 @@ from src.site_apportionment.site_apportionment_main import run_site_apportionment from src.outputs.outputs_main import run_outputs + MainLogger = logging.getLogger(__name__) @@ -113,7 +114,7 @@ def run_pipeline(user_config_path, dev_config_path): # Mapping module MainLogger.info("Starting Mapping...") - (mapped_df, ni_full_responses, itl_mapper, cellno_df,) = run_mapping( + (mapped_df, ni_full_responses, itl_mapper) = run_mapping( full_responses, ni_df, config, @@ -155,7 +156,6 @@ def run_pipeline(user_config_path, dev_config_path): mods.rd_write_csv, run_id, "estimated", - output_file=True, ) weighted_responses_df = run_site_apportionment( weighted_responses_df, @@ -163,19 +163,9 @@ def run_pipeline(user_config_path, dev_config_path): mods.rd_write_csv, run_id, "weighted", - output_file=True, ) MainLogger.info("Finished Site Apportionment module.") - # Data processing: Regional Apportionment - - # Data processing: Aggregation - - # Data display: Visualisations - - # Data output: Disclosure Control - - # Data output: File Outputs MainLogger.info("Starting Outputs...") run_outputs( diff --git a/src/site_apportionment/site_apportionment_main.py b/src/site_apportionment/site_apportionment_main.py index c596ac38d..a24bc033d 100644 --- a/src/site_apportionment/site_apportionment_main.py +++ b/src/site_apportionment/site_apportionment_main.py @@ -15,8 +15,7 @@ def run_site_apportionment( config: Dict[str, Any], write_csv: Callable, run_id: int, - file_suffix, - output_file=False, + output_type: str, ) -> pd.DataFrame: """Run the apportionment to sites module. @@ -25,14 +24,17 @@ def run_site_apportionment( instance 0 to all other instances. Same percentages are used for each product group. - When running on the local network, - Args: config (dict): The pipeline configuration df (pd.DataFrame): Main dataset before the outputs + write_csv (Callable): Function to write to a csv file. + This will be the hdfs or network version depending on settings. + run_id (int): The current run id + output_type (str): The type of output being processed, either "estimated_df" + or "weighted_df". Needed for the QA file naming. Returns: df_out (pd.DataFrame): Percentages filled in for short forms and applied - to apportion for long forms + to apportion for long forms """ # Create variable for output of QA apportionment file qa_path = config["apportionment_paths"]["qa_path"] @@ -43,26 +45,16 @@ def run_site_apportionment( if config["global"]["output_status_filtered"]: osf.output_status_filtered(df, imp_markers_to_keep, config, write_csv, run_id) - # Check if this module needs to be applied - if config["global"]["apportion_sites"]: - SitesMainLogger.info("Starting apportionment to sites...") - df_out = sap.run_apportion_sites(df, imp_markers_to_keep, config) - - # Output QA files - if config["global"]["output_apportionment_qa"] & output_file: - SitesMainLogger.info("Outputting Apportionment files.") - tdate = datetime.now().strftime("%y-%m-%d") - survey_year = config["years"]["survey_year"] - filename = ( - f"{survey_year}_{file_suffix}_df_apportioned_{tdate}_v{run_id}.csv" - ) - write_csv(f"{qa_path}/{filename}", df_out) + SitesMainLogger.info("Starting apportionment to sites...") + df_out = sap.run_apportion_sites(df, imp_markers_to_keep, config) - SitesMainLogger.info("Finished apportionment to sites.") - return df_out + # Output QA files + if config["global"]["output_apportionment_qa"]: + SitesMainLogger.info("Outputting Apportionment files.") + tdate = datetime.now().strftime("%y-%m-%d") + survey_year = config["years"]["survey_year"] + filename = f"{survey_year}_{output_type}_apportioned_{tdate}_v{run_id}.csv" + write_csv(f"{qa_path}/{filename}", df_out) - else: - SitesMainLogger.info("Apportionment to sites disabled, skipped") - # Remove records that are neither clear nor imputed, based on imputation marker. - filtered_df = sap.keep_good_markers(df) - return filtered_df + SitesMainLogger.info("Finished apportionment to sites.") + return df_out diff --git a/src/user_config.yaml b/src/user_config.yaml index 434cd70a3..8e70c3350 100644 --- a/src/user_config.yaml +++ b/src/user_config.yaml @@ -2,12 +2,12 @@ config_validation: validate: True path: src/user_config_schema.yaml years: - survey_year: 2022 + survey_year: 2023 global: # Staging and validation settings postcode_csv_check: True load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions - load_ni_data: True + load_ni_data: False # Construction Settings run_construction: False run_all_data_construction: False @@ -15,11 +15,13 @@ global: run_ni_construction: False load_manual_outliers: False load_manual_imputation: False - load_backdata: False # whether to load previous year data for MoR + load_backdata: True # whether to load previous year data for MoR + # Backdata output settings + output_backdata: True # QA output settings output_full_responses: False output_ni_full_responses: False - output_imputation_qa: False + output_imputation_qa: True output_mapping_qa: False output_mapping_ni_qa: False output_auto_outliers: False @@ -48,7 +50,6 @@ hdfs_paths: secondary_snapshot_path: "/ons/rdbe_dev/berd_survey/anonymised/v1/NEW_SNAPSHOT_TODO.json" ni_full_responses_path: "03_northern_ireland/2021/TEST_ni.csv" feather_path: "staging/feather" - history_path: "BERD_V7_Anonymised" # Imputation and outliers input paths backdata_path: "" manual_imp_trim_path: "06_imputation/manual_trimming/trimming_qa_2023-11-27_v359.csv" @@ -80,7 +81,7 @@ hdfs_paths: postcodes_mapper: "postcodes_2023.csv" itl_mapper_path: "itl_2023.csv" ultfoc_mapper_path: "BERD_2023_ultfoc_anon.csv" - cellno_2023_path: 'berd_2023_cellno_coverage.csv' + cellno_path: 'berd_2023_cellno_coverage_TEMP.csv' pg_num_alpha_mapper_path: 'pg_num_alpha_2023.csv' sic_pg_alpha_mapper_path: 'sic_pg_alpha_2023.csv' sic_pg_num_mapper_path: 'sic_pg_num_2023.csv' diff --git a/src/utils/path_helpers.py b/src/utils/path_helpers.py index 4eed36ee5..b001bccfb 100644 --- a/src/utils/path_helpers.py +++ b/src/utils/path_helpers.py @@ -59,6 +59,7 @@ def create_staging_config(config: dict) -> dict: staging_dict["snapshot_path"] = paths["snapshot_path"] staging_dict["secondary_snapshot_path"] = paths["secondary_snapshot_path"] staging_dict["postcode_masterlist"] = paths["postcode_masterlist"] + staging_dict["backdata_path"] = paths["backdata_path"] staging_dict["manual_outliers_path"] = f"{berd_path}{paths['manual_outliers_path']}" staging_dict["manual_imp_trim_path"] = f"{berd_path}{paths['manual_imp_trim_path']}" diff --git a/tests/test_estimation/test_apply_weights.py b/tests/test_estimation/test_apply_weights.py index bbd23c8c8..85faec1b7 100644 --- a/tests/test_estimation/test_apply_weights.py +++ b/tests/test_estimation/test_apply_weights.py @@ -64,288 +64,17 @@ def create_input_df(self): "headcount_oth_f", "a_weight", ] - data = [ - [ - "1", - np.nan, - 1.5968726069999999, - 0.254832457, - 1501.531337, - 500.8443931, - 499.9360421, - 500.75090159999996, - 29.94531777, - 9.602584921, - 9.653985014, - 10.68874783, - 6.486029315, - 2.533542905, - 2.308242877, - 1.644243533, - 35.5, - 17.6, - 17.9, - 5.2, - 5.0, - 5.9, - 6.1, - 6.4, - 6.8, - 2.5, - ], - [ - "2", - 0.0, - 0.484444627, - 0.660795649, - 1501.150221, - 499.7596024, - 501.6974991, - 499.6931194, - 31.82855842, - 10.97246129, - 10.18047899, - 10.67561814, - 9.256449084, - 3.6957475310000003, - 3.053765916, - 2.5069356369999998, - 31.3, - 15.6, - 15.7, - 5.1, - 5.0, - 5.2, - 5.3, - 5.3, - 5.4, - 6.0, - ], - [ - "3", - 0.245754563, - 0.053153144000000006, - 0.24898895399999998, - 1499.305683, - 499.0019342, - 499.56546389999994, - 500.7382846, - 31.09016923, - 10.200109300000001, - 11.24314965, - 9.646910282999999, - 11.18471882, - 5.38607754, - 2.750773441, - 3.047867843, - 31.9, - 16.2, - 15.7, - 5.9, - 5.8, - 5.4, - 5.2, - 5.0, - 4.7, - 7.0, - ], - [ - "4", - 0.091581569, - 1.29306045, - 2.222550393, - 1497.3095039999998, - 500.2853801, - 498.26926339999994, - 498.7548608, - 31.49017978, - 10.80040763, - 9.527649379, - 11.16212277, - 10.22765186, - 3.03791456, - 3.8605661610000004, - 3.3291711439999996, - 19.5, - 8.5, - 11.0, - 2.8, - 4.8, - 2.6, - 3.2, - 3.1, - 3.0, - 14.0, - ], - [ - "5", - 0.48547690299999996, - 0.614760438, - 1.620874173, - 1498.722371, - 500.4694436000001, - 500.44068089999996, - 497.81224689999993, - 30.44836156, - 10.06357482, - 10.79756546, - 9.587221282, - 8.654173735, - 5.294000296, - 2.062538336, - 1.297635102, - 38.7, - 16.0, - 22.7, - 2.3, - 5.6, - 5.1, - 7.2, - 8.6, - 10.0, - 2.0, - ], - [ - "6", - 0.216682565, - 0.420119477, - 0.077609457, - 1497.974779, - 499.146944, - 498.292069, - 500.5357663, - 29.76051868, - 9.465821929, - 9.133324308999999, - 11.16137244, - 11.35927018, - 5.351916919, - 2.9535873489999998, - 3.053765916, - 29.9, - 14.9, - 15.0, - 4.8, - 4.7, - 5.0, - 5.0, - 5.2, - 5.3, - 1.0, - ], - [ - "7", - 0.256570729, - 1.635266926, - 0.877078037, - 1497.8444539999998, - 499.91778619999997, - 499.7594455, - 498.16722269999997, - 28.24588485, - 9.099873094, - 9.312124146, - 9.833887614, - 11.20396942, - 4.079138201, - 5.134802344, - 1.990028871, - 27.8, - 15.0, - 12.8, - 5.4, - 4.0, - 5.1, - 4.6, - 4.4, - 4.3, - 6.3, - ], - [ - "8", - 0.733170964, - 0.111266445, - 0.506586275, - 1500.596397, - 501.64230189999995, - 501.381718, - 497.5723767, - 28.169746500000002, - 9.444435161, - 9.593212578, - 9.132098765, - 10.80790153, - 3.316276783, - 4.322564458, - 3.169060286, - 34.8, - 15.9, - 18.9, - 5.2, - 7.6, - 5.0, - 5.8, - 5.7, - 5.6, - 8.1, - ], - [ - "9", - 2.134056558, - 0.31692025, - 1.3120006770000001, - 1498.986525, - 501.01687419999996, - 499.2999597, - 498.66969119999993, - 31.76279059, - 10.04786784, - 10.48995616, - 11.22496658, - 8.855932629, - 4.065679953, - 2.443454044, - 2.346798631, - 33.1, - 16.4, - 16.8, - 5.3, - 5.5, - 5.4, - 5.6, - 5.6, - 5.7, - 1.0, - ], - [ - "10", - 0.678504648, - 0.35838638700000003, - 0.569093572, - 1498.674708, - 500.2971126, - 500.64281950000003, - 497.7347761, - 30.178638399999997, - 9.173285232000001, - 11.11531108, - 9.890042091, - 10.35101118, - 3.143504622, - 3.1301464180000003, - 4.077360139, - 31.8, - 15.6, - 16.2, - 4.3, - 4.1, - 5.3, - 5.5, - 6.0, - 6.5, - 9.0, - ], + ["1", np.nan, 1.5968726069999999, 0.254832457, 1501.531337, 500.8443931, 499.9360421, 500.75090159999996, 29.94531777, 9.602584921, 9.653985014, 10.68874783, 6.486029315, 2.533542905, 2.308242877, 1.644243533, 35.5, 17.6, 17.9, 5.2, 5.0, 5.9, 6.1, 6.4, 6.8, 2.5], + ["2", 0.0, 0.484444627, 0.660795649, 1501.150221, 499.7596024, 501.6974991, 499.6931194, 31.82855842, 10.97246129, 10.18047899, 10.67561814, 9.256449084, 3.6957475310000003, 3.053765916, 2.5069356369999998, 31.3, 15.6, 15.7, 5.1, 5.0, 5.2, 5.3, 5.3, 5.4, 6.0], + ["3", 0.245754563, 0.053153144000000006, 0.24898895399999998, 1499.305683, 499.0019342, 499.56546389999994, 500.7382846, 31.09016923, 10.200109300000001, 11.24314965, 9.646910282999999, 11.18471882, 5.38607754, 2.750773441, 3.047867843, 31.9, 16.2, 15.7, 5.9, 5.8, 5.4, 5.2, 5.0, 4.7, 7.0], + ["4", 0.091581569, 1.29306045, 2.222550393, 1497.3095039999998, 500.2853801, 498.26926339999994, 498.7548608, 31.49017978, 10.80040763, 9.527649379, 11.16212277, 10.22765186, 3.03791456, 3.8605661610000004, 3.3291711439999996, 19.5, 8.5, 11.0, 2.8, 4.8, 2.6, 3.2, 3.1, 3.0, 14.0], + ["5", 0.48547690299999996, 0.614760438, 1.620874173, 1498.722371, 500.4694436000001, 500.44068089999996, 497.81224689999993, 30.44836156, 10.06357482, 10.79756546, 9.587221282, 8.654173735, 5.294000296, 2.062538336, 1.297635102, 38.7, 16.0, 22.7, 2.3, 5.6, 5.1, 7.2, 8.6, 10.0, 2.0], + ["6", 0.216682565, 0.420119477, 0.077609457, 1497.974779, 499.146944, 498.292069, 500.5357663, 29.76051868, 9.465821929, 9.133324308999999, 11.16137244, 11.35927018, 5.351916919, 2.9535873489999998, 3.053765916, 29.9, 14.9, 15.0, 4.8, 4.7, 5.0, 5.0, 5.2, 5.3, 1.0], + ["7", 0.256570729, 1.635266926, 0.877078037, 1497.8444539999998, 499.91778619999997, 499.7594455, 498.16722269999997, 28.24588485, 9.099873094, 9.312124146, 9.833887614, 11.20396942, 4.079138201, 5.134802344, 1.990028871, 27.8, 15.0, 12.8, 5.4, 4.0, 5.1, 4.6, 4.4, 4.3, 6.3], + ["8", 0.733170964, 0.111266445, 0.506586275, 1500.596397, 501.64230189999995, 501.381718, 497.5723767, 28.169746500000002, 9.444435161, 9.593212578, 9.132098765, 10.80790153, 3.316276783, 4.322564458, 3.169060286, 34.8, 15.9, 18.9, 5.2, 7.6, 5.0, 5.8, 5.7, 5.6, 8.1], + ["9", 2.134056558, 0.31692025, 1.3120006770000001, 1498.986525, 501.01687419999996, 499.2999597, 498.66969119999993, 31.76279059, 10.04786784, 10.48995616, 11.22496658, 8.855932629, 4.065679953, 2.443454044, 2.346798631, 33.1, 16.4, 16.8, 5.3, 5.5, 5.4, 5.6, 5.6, 5.7, 1.0], + ["10", 0.678504648, 0.35838638700000003, 0.569093572, 1498.674708, 500.2971126, 500.64281950000003, 497.7347761, 30.178638399999997, 9.173285232000001, 11.11531108, 9.890042091, 10.35101118, 3.143504622, 3.1301464180000003, 4.077360139, 31.8, 15.6, 16.2, 4.3, 4.1, 5.3, 5.5, 6.0, 6.5, 9.0], ] input_df = pd.DataFrame(data=data, columns=input_columns) @@ -408,526 +137,16 @@ def create_expected_output(self): ] data = [ - [ - "1", - np.nan, - 1.5968726069999999, - 0.254832457, - 1501.531337, - 500.8443931, - 499.9360421, - 500.75090159999996, - 29.94531777, - 9.602584921, - 9.653985014, - 10.68874783, - 6.486029315, - 2.533542905, - 2.308242877, - 1.644243533, - 35.5, - 17.6, - 17.9, - 5.2, - 5.0, - 5.9, - 6.1, - 6.4, - 6.8, - 2.5, - np.nan, - 3.9922, - 0.6371, - 3753.8283, - 1252.111, - 1249.8401, - 1251.8773, - 74.8633, - 24.0065, - 24.135, - 26.7219, - 16.2151, - 6.3339, - 5.7706, - 4.1106, - 88.78, - 44.04, - 44.74, - 13.11, - 12.56, - 14.84, - 15.23, - 16.09, - 16.96, - ], - [ - "2", - 0.0, - 0.484444627, - 0.660795649, - 1501.150221, - 499.7596024, - 501.6974991, - 499.6931194, - 31.82855842, - 10.97246129, - 10.18047899, - 10.67561814, - 9.256449084, - 3.6957475310000003, - 3.053765916, - 2.5069356369999998, - 31.3, - 15.6, - 15.7, - 5.1, - 5.0, - 5.2, - 5.3, - 5.3, - 5.4, - 6.0, - 0.0, - 2.9067, - 3.9648, - 9006.9013, - 2998.5576, - 3010.185, - 2998.1587, - 190.9714, - 65.8348, - 61.0829, - 64.0537, - 55.5387, - 22.1745, - 18.3226, - 15.0416, - 188.0, - 93.77, - 94.23, - 30.44, - 30.23, - 31.33, - 31.55, - 32.0, - 32.44, - ], - [ - "3", - 0.245754563, - 0.053153144000000006, - 0.24898895399999998, - 1499.305683, - 499.0019342, - 499.56546389999994, - 500.7382846, - 31.09016923, - 10.200109300000001, - 11.24314965, - 9.646910282999999, - 11.18471882, - 5.38607754, - 2.750773441, - 3.047867843, - 31.9, - 16.2, - 15.7, - 5.9, - 5.8, - 5.4, - 5.2, - 5.0, - 4.7, - 7.0, - 1.7203, - 0.3721, - 1.7429, - 10495.1398, - 3493.0135, - 3496.9582, - 3505.1679999999997, - 217.6312, - 71.4008, - 78.702, - 67.5284, - 78.293, - 37.7025, - 19.2554, - 21.3351, - 223.37, - 113.33, - 110.03, - 40.98, - 40.59, - 37.63, - 36.39, - 34.72, - 33.05, - ], - [ - "4", - 0.091581569, - 1.29306045, - 2.222550393, - 1497.3095039999998, - 500.2853801, - 498.26926339999994, - 498.7548608, - 31.49017978, - 10.80040763, - 9.527649379, - 11.16212277, - 10.22765186, - 3.03791456, - 3.8605661610000004, - 3.3291711439999996, - 19.5, - 8.5, - 11.0, - 2.8, - 4.8, - 2.6, - 3.2, - 3.1, - 3.0, - 14.0, - 1.2821, - 18.1028, - 31.1157, - 20962.3331, - 7003.9953, - 6975.7697, - 6982.5681, - 440.8625, - 151.2057, - 133.3871, - 156.2697, - 143.1871, - 42.5308, - 54.0479, - 46.6084, - 272.46, - 118.55, - 153.91, - 39.34, - 67.83, - 36.17, - 44.62, - 43.04, - 41.46, - ], - [ - "5", - 0.48547690299999996, - 0.614760438, - 1.620874173, - 1498.722371, - 500.4694436000001, - 500.44068089999996, - 497.81224689999993, - 30.44836156, - 10.06357482, - 10.79756546, - 9.587221282, - 8.654173735, - 5.294000296, - 2.062538336, - 1.297635102, - 38.7, - 16.0, - 22.7, - 2.3, - 5.6, - 5.1, - 7.2, - 8.6, - 10.0, - 2.0, - 0.971, - 1.2295, - 3.2417, - 2997.4447, - 1000.9389, - 1000.8814, - 995.6245, - 60.8967, - 20.1271, - 21.5951, - 19.1744, - 17.3083, - 10.588, - 4.1251, - 2.5953, - 77.47, - 32.06, - 45.41, - 4.64, - 11.13, - 10.28, - 14.32, - 17.14, - 19.96, - ], - [ - "6", - 0.216682565, - 0.420119477, - 0.077609457, - 1497.974779, - 499.146944, - 498.292069, - 500.5357663, - 29.76051868, - 9.465821929, - 9.133324308999999, - 11.16137244, - 11.35927018, - 5.351916919, - 2.9535873489999998, - 3.053765916, - 29.9, - 14.9, - 15.0, - 4.8, - 4.7, - 5.0, - 5.0, - 5.2, - 5.3, - 1.0, - 0.2167, - 0.4201, - 0.0776, - 1497.9748, - 499.1469, - 498.2921, - 500.5358, - 29.7605, - 9.4658, - 9.1333, - 11.1614, - 11.3593, - 5.3519, - 2.9536, - 3.0538, - 29.89, - 14.89, - 15.01, - 4.75, - 4.7, - 4.98, - 5.04, - 5.15, - 5.27, - ], - [ - "7", - 0.256570729, - 1.635266926, - 0.877078037, - 1497.8444539999998, - 499.91778619999997, - 499.7594455, - 498.16722269999997, - 28.24588485, - 9.099873094, - 9.312124146, - 9.833887614, - 11.20396942, - 4.079138201, - 5.134802344, - 1.990028871, - 27.8, - 15.0, - 12.8, - 5.4, - 4.0, - 5.1, - 4.6, - 4.4, - 4.3, - 6.3, - 1.6164, - 10.3022, - 5.5256, - 9436.4201, - 3149.4821, - 3148.4845, - 3138.4535, - 177.9491, - 57.3292, - 58.6664, - 61.9535, - 70.585, - 25.6986, - 32.3493, - 12.5372, - 175.25, - 94.52, - 80.73, - 34.26, - 25.06, - 32.43, - 28.75, - 27.84, - 26.92, - ], - [ - "8", - 0.733170964, - 0.111266445, - 0.506586275, - 1500.596397, - 501.64230189999995, - 501.381718, - 497.5723767, - 28.169746500000002, - 9.444435161, - 9.593212578, - 9.132098765, - 10.80790153, - 3.316276783, - 4.322564458, - 3.169060286, - 34.8, - 15.9, - 18.9, - 5.2, - 7.6, - 5.0, - 5.8, - 5.7, - 5.6, - 8.1, - 5.9387, - 0.9013, - 4.1033, - 12154.8308, - 4063.3026, - 4061.1919, - 4030.3363, - 228.1749, - 76.4999, - 77.705, - 73.97, - 87.544, - 26.8618, - 35.0128, - 25.6694, - 282.04, - 128.61, - 153.43, - 41.83, - 61.18, - 40.65, - 46.71, - 46.12, - 45.54, - ], - [ - "9", - 2.134056558, - 0.31692025, - 1.3120006770000001, - 1498.986525, - 501.01687419999996, - 499.2999597, - 498.66969119999993, - 31.76279059, - 10.04786784, - 10.48995616, - 11.22496658, - 8.855932629, - 4.065679953, - 2.443454044, - 2.346798631, - 33.1, - 16.4, - 16.8, - 5.3, - 5.5, - 5.4, - 5.6, - 5.6, - 5.7, - 1.0, - 2.1341, - 0.3169, - 1.3119999999999998, - 1498.9865, - 501.0169, - 499.3, - 498.6697, - 31.7628, - 10.0479, - 10.49, - 11.225, - 8.8559, - 4.0657, - 2.4435, - 2.3468, - 33.13, - 16.35, - 16.78, - 5.29, - 5.53, - 5.43, - 5.56, - 5.63, - 5.7, - ], - [ - "10", - 0.678504648, - 0.35838638700000003, - 0.569093572, - 1498.674708, - 500.2971126, - 500.64281950000003, - 497.7347761, - 30.178638399999997, - 9.173285232000001, - 11.11531108, - 9.890042091, - 10.35101118, - 3.143504622, - 3.1301464180000003, - 4.077360139, - 31.8, - 15.6, - 16.2, - 4.3, - 4.1, - 5.3, - 5.5, - 6.0, - 6.5, - 9.0, - 6.1065, - 3.2255, - 5.1218, - 13488.0724, - 4502.674, - 4505.7854, - 4479.613, - 271.6077, - 82.5596, - 100.0378, - 89.0104, - 93.1591, - 28.2915, - 28.1713, - 36.6962, - 286.0, - 140.3, - 145.7, - 38.61, - 37.17, - 47.42, - 49.87, - 54.27, - 58.67, - ], + ["1", np.nan, 1.5968726069999999, 0.254832457, 1501.531337, 500.8443931, 499.9360421, 500.75090159999996, 29.94531777, 9.602584921, 9.653985014, 10.68874783, 6.486029315, 2.533542905, 2.308242877, 1.644243533, 35.5, 17.6, 17.9, 5.2, 5.0, 5.9, 6.1, 6.4, 6.8, 2.5, np.nan, 3.9922, 0.6371, 3753.8283, 1252.111, 1249.8401, 1251.8773, 74.8633, 24.0065, 24.135, 26.7219, 16.2151, 6.3339, 5.7706, 4.1106, 88.78, 44.04, 44.74, 13.11, 12.56, 14.84, 15.23, 16.09, 16.96], + ["2", 0.0, 0.484444627, 0.660795649, 1501.150221, 499.7596024, 501.6974991, 499.6931194, 31.82855842, 10.97246129, 10.18047899, 10.67561814, 9.256449084, 3.6957475310000003, 3.053765916, 2.5069356369999998, 31.3, 15.6, 15.7, 5.1, 5.0, 5.2, 5.3, 5.3, 5.4, 6.0, 0.0, 2.9067, 3.9648, 9006.9013, 2998.5576, 3010.185, 2998.1587, 190.9714, 65.8348, 61.0829, 64.0537, 55.5387, 22.1745, 18.3226, 15.0416, 188.0, 93.77, 94.23, 30.44, 30.23, 31.33, 31.55, 32.0, 32.44], + ["3", 0.245754563, 0.053153144000000006, 0.24898895399999998, 1499.305683, 499.0019342, 499.56546389999994, 500.7382846, 31.09016923, 10.200109300000001, 11.24314965, 9.646910282999999, 11.18471882, 5.38607754, 2.750773441, 3.047867843, 31.9, 16.2, 15.7, 5.9, 5.8, 5.4, 5.2, 5.0, 4.7, 7.0, 1.7203, 0.3721, 1.7429, 10495.1398, 3493.0135, 3496.9582, 3505.1679999999997, 217.6312, 71.4008, 78.702, 67.5284, 78.293, 37.7025, 19.2554, 21.3351, 223.37, 113.33, 110.03, 40.98, 40.59, 37.63, 36.39, 34.72, 33.05], + ["4", 0.091581569, 1.29306045, 2.222550393, 1497.3095039999998, 500.2853801, 498.26926339999994, 498.7548608, 31.49017978, 10.80040763, 9.527649379, 11.16212277, 10.22765186, 3.03791456, 3.8605661610000004, 3.3291711439999996, 19.5, 8.5, 11.0, 2.8, 4.8, 2.6, 3.2, 3.1, 3.0, 14.0, 1.2821, 18.1028, 31.1157, 20962.3331, 7003.9953, 6975.7697, 6982.5681, 440.8625, 151.2057, 133.3871, 156.2697, 143.1871, 42.5308, 54.0479, 46.6084, 272.46, 118.55, 153.91, 39.34, 67.83, 36.17, 44.62, 43.04, 41.46], + ["5", 0.48547690299999996, 0.614760438, 1.620874173, 1498.722371, 500.4694436000001, 500.44068089999996, 497.81224689999993, 30.44836156, 10.06357482, 10.79756546, 9.587221282, 8.654173735, 5.294000296, 2.062538336, 1.297635102, 38.7, 16.0, 22.7, 2.3, 5.6, 5.1, 7.2, 8.6, 10.0, 2.0, 0.971, 1.2295, 3.2417, 2997.4447, 1000.9389, 1000.8814, 995.6245, 60.8967, 20.1271, 21.5951, 19.1744, 17.3083, 10.588, 4.1251, 2.5953, 77.47, 32.06, 45.41, 4.64, 11.13, 10.28, 14.32, 17.14, 19.96], + ["6", 0.216682565, 0.420119477, 0.077609457, 1497.974779, 499.146944, 498.292069, 500.5357663, 29.76051868, 9.465821929, 9.133324308999999, 11.16137244, 11.35927018, 5.351916919, 2.9535873489999998, 3.053765916, 29.9, 14.9, 15.0, 4.8, 4.7, 5.0, 5.0, 5.2, 5.3, 1.0, 0.2167, 0.4201, 0.0776, 1497.9748, 499.1469, 498.2921, 500.5358, 29.7605, 9.4658, 9.1333, 11.1614, 11.3593, 5.3519, 2.9536, 3.0538, 29.89, 14.89, 15.01, 4.75, 4.7, 4.98, 5.04, 5.15, 5.27], + ["7", 0.256570729, 1.635266926, 0.877078037, 1497.8444539999998, 499.91778619999997, 499.7594455, 498.16722269999997, 28.24588485, 9.099873094, 9.312124146, 9.833887614, 11.20396942, 4.079138201, 5.134802344, 1.990028871, 27.8, 15.0, 12.8, 5.4, 4.0, 5.1, 4.6, 4.4, 4.3, 6.3, 1.6164, 10.3022, 5.5256, 9436.4201, 3149.4821, 3148.4845, 3138.4535, 177.9491, 57.3292, 58.6664, 61.9535, 70.585, 25.6986, 32.3493, 12.5372, 175.25, 94.52, 80.73, 34.26, 25.06, 32.43, 28.75, 27.84, 26.92], + ["8", 0.733170964, 0.111266445, 0.506586275, 1500.596397, 501.64230189999995, 501.381718, 497.5723767, 28.169746500000002, 9.444435161, 9.593212578, 9.132098765, 10.80790153, 3.316276783, 4.322564458, 3.169060286, 34.8, 15.9, 18.9, 5.2, 7.6, 5.0, 5.8, 5.7, 5.6, 8.1, 5.9387, 0.9013, 4.1033, 12154.8308, 4063.3026, 4061.1919, 4030.3363, 228.1749, 76.4999, 77.705, 73.97, 87.544, 26.8618, 35.0128, 25.6694, 282.04, 128.61, 153.43, 41.83, 61.18, 40.65, 46.71, 46.12, 45.54], + ["9", 2.134056558, 0.31692025, 1.3120006770000001, 1498.986525, 501.01687419999996, 499.2999597, 498.66969119999993, 31.76279059, 10.04786784, 10.48995616, 11.22496658, 8.855932629, 4.065679953, 2.443454044, 2.346798631, 33.1, 16.4, 16.8, 5.3, 5.5, 5.4, 5.6, 5.6, 5.7, 1.0, 2.1341, 0.3169, 1.3119999999999998, 1498.9865, 501.0169, 499.3, 498.6697, 31.7628, 10.0479, 10.49, 11.225, 8.8559, 4.0657, 2.4435, 2.3468, 33.13, 16.35, 16.78, 5.29, 5.53, 5.43, 5.56, 5.63, 5.7], + ["10", 0.678504648, 0.35838638700000003, 0.569093572, 1498.674708, 500.2971126, 500.64281950000003, 497.7347761, 30.178638399999997, 9.173285232000001, 11.11531108, 9.890042091, 10.35101118, 3.143504622, 3.1301464180000003, 4.077360139, 31.8, 15.6, 16.2, 4.3, 4.1, 5.3, 5.5, 6.0, 6.5, 9.0, 6.1065, 3.2255, 5.1218, 13488.0724, 4502.674, 4505.7854, 4479.613, 271.6077, 82.5596, 100.0378, 89.0104, 93.1591, 28.2915, 28.1713, 36.6962, 286.0, 140.3, 145.7, 38.61, 37.17, 47.42, 49.87, 54.27, 58.67], ] exp_output_df = pd.DataFrame(data=data, columns=exp_output_columns) diff --git a/tests/test_utils/test_path_helpers.py b/tests/test_utils/test_path_helpers.py index 36a0d7fb2..348bf4330 100644 --- a/tests/test_utils/test_path_helpers.py +++ b/tests/test_utils/test_path_helpers.py @@ -25,6 +25,7 @@ def config(): "ni_full_responses_path": "03_northern_ireland/2021/TEST_ni.csv", "manual_imp_trim_path": "06_imputation/man_trim/trim_qa.csv", "manual_outliers_path": "07_outliers/man_out/man_out.csv", + "backdata_path": "2021_data/backdata.csv", "all_data_construction_file_path": ( "04_construction/man_con/construction_file.csv" ), @@ -80,6 +81,7 @@ def test_get_paths(config): "ni_full_responses_path": "03_northern_ireland/2021/TEST_ni.csv", "manual_outliers_path": "07_outliers/man_out/man_out.csv", "manual_imp_trim_path": "06_imputation/man_trim/trim_qa.csv", + "backdata_path": "2021_data/backdata.csv", "all_data_construction_file_path": ( "04_construction/man_con/construction_file.csv" ), @@ -108,6 +110,7 @@ def expected_staging_dict(): "manual_imp_trim_path": ( "R:/DAP_emulation/2022_surveys/BERD/06_imputation/man_trim/trim_qa.csv" ), + "backdata_path": "2021_data/backdata.csv", } return expected_staging_dict