From 989c80e9307c41151a2cad9fa219a12c4a03357a Mon Sep 17 00:00:00 2001 From: James Westwood <67740306+jwestw@users.noreply.github.com> Date: Mon, 15 May 2023 16:18:37 +0100 Subject: [PATCH 001/411] Update README.md --- README.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/README.md b/README.md index 85d932614..dea5a4c7c 100644 --- a/README.md +++ b/README.md @@ -5,25 +5,6 @@ Calculating national and regional research and development expenditure as part o Additional information about the aims and objectives of the project will go here when it is available. The project is currently in pre-discovery. - - - - - -## Required secrets and credentials - -To run this project, [you need a `.secrets` file with secrets/credentials as -environmental variables][docs-loading-environment-variables-secrets]. The -secrets/credentials should have the following environment variable name(s): - -| Secret/credential | Environment variable name | Description | -|-------------------|---------------------------|--------------------------------------------| -| Secret 1 | `SECRET_VARIABLE_1` | Plain English description of Secret 1. | -| Credential 1 | `CREDENTIAL_VARIABLE_1` | Plain English description of Credential 1. | - -Once you've added, [load these environment variables using -`.env`][docs-loading-environment-variables]. - ## Licence Unless stated otherwise, the codebase is released under the MIT License. This covers @@ -36,8 +17,6 @@ Crown copyright and available under the terms of the Open Government 3.0 licence [This project structure is based on the `govcookiecutter` template project][govcookiecutter]. Guidance on using the govcookiecutter can be found on [this youtube video](https://www.youtube.com/watch?v=N7_d3k3uQ_M) and in the [documentation here](https://dataingovernment.blog.gov.uk/2021/07/20/govcookiecutter-a-template-for-data-science-projects/). -The text in the "For Developers" section was adapted from the README of the [Transport Efficiency Project](https://github.com/jwestw/Public_Transport_Efficiency) which was mostly written by Chloe Murrell. - Some of the text, especially that covering git configuration and security considerations was adapted from work by David Foster and Rowan Hemsi at ONS. [contributing]: ./docs/contributor_guide/CONTRIBUTING.md From 56550e462e653de56ec0962c96ca5fa4e9c2de88 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 17:11:58 +0100 Subject: [PATCH 002/411] Created a data_schema.py file. 
--- src/data_schema.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 src/data_schema.py diff --git a/src/data_schema.py b/src/data_schema.py new file mode 100644 index 000000000..d8caf59ea --- /dev/null +++ b/src/data_schema.py @@ -0,0 +1,39 @@ +import pandas as pd + + +def read_xlsx(excel_file) -> pd.DataFrame: + """_summary_ + + Arguments: + excel_file -- _description_ + + Returns: + _description_ + """ + xl_dataframe = pd.read_excel(excel_file, "Sheet1") + return xl_dataframe + + +# def convert_dataFrame(pdFrame)->dict: +# """_summary_ +# +# Arguments: +# pdFrame -- _description_ +# +# Returns: +# _description_ +# """ +# return +# +# +# def create_toml(pdDict)->str: +# """_summary_ +# +# Arguments: +# pdDict -- _description_ +# +# Returns: +# _description_ +# """ +# return +# From cbd9d3edf139bd7e2597cd09d78d198e335161e4 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 26 Apr 2023 19:46:05 +0100 Subject: [PATCH 003/411] Created several functions to read a local copy of the excel file, convert it to a dictionary, reformat said dictionary suitable to be converted into a toml file. --- src/data_validation/data_schema.py | 142 +++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 src/data_validation/data_schema.py diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py new file mode 100644 index 000000000..1429a723c --- /dev/null +++ b/src/data_validation/data_schema.py @@ -0,0 +1,142 @@ +import math +import toml +import pandas as pd + +from typing import IO + + +def read_xlsx(excel_file) -> pd.DataFrame: + """Read an excel file and convert it into a + pandas dataframe, dropping any 'Unnamed:' columns. + + + Arguments: + excel_file -- the excel file to be converted + + Returns: + A pd.DataFrame: a pandas dataframe object. + """ + xl_dataframe = pd.read_excel(excel_file, nrows=93, engine="openpyxl") + xl_dataframe = xl_dataframe[ + xl_dataframe.columns.drop(list(xl_dataframe.filter(regex="Unnamed:"))) + ] + + return xl_dataframe + + +def convert_dataFrame(pdFrame: pd.DataFrame) -> dict: + """Convert a pandas dataframe into a dictionary oriented by + index. This makes the keys in the dictionary the row index + of the dataframe, and the values are a dictionary containing + the other key-value information. + + Arguments: + pdFrame -- the pandas dataframe to be converted + + Returns: + A dict: dict object oriented by index + """ + pd_dict = pdFrame.to_dict(orient="index") + return pd_dict + + +def is_nan(value) -> bool: + """Takes in a value and returns a boolean indicating + whether it is a 'not a number' or not. + + Arguments: + value -- Any value + + Returns: + A bool: boolean indicating whether the value is + 'not a number' or not, as determined by the 'math' + module. + """ + return math.isnan(float(value)) + + +def reformat_tomlDict(pdDict: dict) -> dict: + """Creates a dictionary suitable to be converted + into a toml file. Takes an index oriented + dictionary as input and creates a new dictionary + at as + + Arguments: + pdDict -- a dictionary + + Returns: + A dict: dictionary ready to be used to create + a toml file. 
+ """ + newDict = {} + tomlDict = {} + + # Loop over input dictionary to create a sub dictionary + for key in pdDict: + newDict[str(key)] = pdDict[key] + + subDict1 = newDict[str(key)] + var = subDict1.pop("Field Name (as it appears in dataset)") + + tomlDict[var] = subDict1 + + # Loop over each key in sub-dictionary and reformat values for usability + for key in tomlDict: + + subDict2 = tomlDict[key] + + subDict2["description"] = subDict2.pop("Description") + subDict2["data_type"] = subDict2.pop( + """Data Type (Numeric integer/Numeric float (or decimal) + /Text/Categorical/Boolean (True or False, 1 or 0))""" + ) + subDict2["nullable"] = subDict2.pop( + "Nullable (is it acceptable to have a null value? Acceptable = Yes)" + ) + + acceptable_values_str = str(subDict2["Acceptable Values (>0 or 0 – 1,000,000)"]) + acceptable_values_list = acceptable_values_str.split() + + subDict2["min_acceptable_value"] = acceptable_values_list[0] + subDict2["max_acceptable_value"] = acceptable_values_list[-1].replace(",", "") + + if is_nan(subDict2["min_acceptable_value"]): + subDict2["min_acceptable_value"] = acceptable_values_list[0] + elif is_nan(subDict2["max_acceptable_value"]): + subDict2["max_acceptable_value"] = acceptable_values_list[-1] + else: + subDict2["min_acceptable_value"] = int(acceptable_values_list[0]) + subDict2["max_acceptable_value"] = int( + acceptable_values_list[-1].replace(",", "") + ) + + subDict2.pop("Acceptable Values (>0 or 0 – 1,000,000)") + + tomlDict[key] = subDict2 + + return tomlDict + + +def create_toml(pdDict: dict) -> IO[str]: + """Write a toml file from a dictionary. + + Arguments: + pdDict -- A dictionary containing a dictionary as + its values. + + Returns: + A toml file - IO[str] type indicates a text based file + (.toml) will be returned. + """ + + output_file_name = "./config/DataSchema.toml" + with open(output_file_name, "w") as toml_file: + toml.dump(pdDict, toml_file) + + return pdDict + + +test = read_xlsx("C:\\Users\\macrar\\Downloads\\Data Dictionary - BERD.xlsx") +test2 = convert_dataFrame(test) +test3 = reformat_tomlDict(test2) +test4 = create_toml(test3) From 56d661ea38c3888cf44903e201b815b871bb650f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 26 Apr 2023 19:47:49 +0100 Subject: [PATCH 004/411] Result of data_schema.py. Toml file indicating the different characteristics of each variable. 
--- config/DataSchema.toml | 650 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 650 insertions(+) create mode 100644 config/DataSchema.toml diff --git a/config/DataSchema.toml b/config/DataSchema.toml new file mode 100644 index 000000000..aa15a8a78 --- /dev/null +++ b/config/DataSchema.toml @@ -0,0 +1,650 @@ +[cell_id] +description = "Cell ID" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[civ_or_def] +description = "Business type: Civil or Defence" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[current_sic] +description = "Sic - Standard Industry Classification" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[data_source] +description = "Constructed" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[emp_other] +description = "emp_other (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_researcher] +description = "emp_researcher (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_technician] +description = "emp_technician (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_total] +description = "emp_total (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[employee_count] +description = "Employee Count (IDBR)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[foreign_owner] +description = "Foreign Owner" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[form_status] +description = "Status" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[form_type] +description = "Form Type" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[freeze_id] +description = "Freeze ID - bespoke to openroad" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[headcount_oth_f] +description = "Other Female (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_oth_m] +description = "Other Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_res_f] +description = "Researchers Females (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_res_m] +description = "Researchers Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_tec_f] +description = "Technicians Female (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_tec_m] +description = "Technicians Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_total] 
+description = "Total Headcount" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[period] +description = "Openroad Specific" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[period_contributor_id] +description = "Openroad Specific" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[period_year] +description = "Period" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[product_group] +description = "Published Product Group" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[ru_ref] +description = "Reference" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[sizeband] +description = "SizeBand" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[wowentref] +description = "Wowentref" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q202] +description = "Salaries & Wages" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q203] +description = "Other current expenditure" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q204] +description = "Total Current Expenditure" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q205] +description = "Basic Research" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q206] +description = "Applied Research" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q207] +description = "Experimental Development" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q208] +description = "Land & Build CapEx " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q209] +description = "Equipment & Machinery CapEx" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q210] +description = "Total Capex." 
+data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q211] +description = "Total Inhouse Expenditure " +data_type = "Numeric Integer" +nullable = "No" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q212] +description = "Own Funds" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q213] +description = "Funding - Commission of the EU" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q214] +description = "Funding - UK government" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q215] +description = "Funding - Organisations outside the Uk " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q216] +description = "Funding - Other UK Private Bus/Public Orgs " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q217] +description = "Funding - Any Other UK " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q218] +description = "Total Funding " +data_type = "Numeric Integer" +nullable = "No" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q219] +description = "Land Acquired for R&D (Split of Land & Build CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q220] +description = "Buildings acquired/constructed for R&D (Split of Land & Build CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q221] +description = "Expenditure on computer software only (of which from Equipment & Machinery CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q222] +description = "Purchase of Materials (Split of Other current)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q223] +description = "Purchase of Services (Split of Other current)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q224] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q225] +description = "Ownership - Own Business" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q226] +description = "Ownership - UK Government" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q227] +description = "Ownership - Other UK Priv Bus" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q228] +description = "Ownership - Other UK Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q229] +description = "Ownership - Bus Enterprises in Group Outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q230] +description = "Ownership - Other Bus Enterprises outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q231] +description = "Ownership - Other Governments outside UK" 
+data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q232] +description = "Ownership - Higher Education Establishments outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q233] +description = "Ownership - Non-profit Orgs outside the UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q234] +description = "Ownership - Commission of EU" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q235] +description = "Ownership - International Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q236] +description = "Ownership - Any other Orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q237] +description = "Ownership - not owned freely available" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q238] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q239] +description = "Life Length - Basic Research" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q240] +description = "Life Length - Applied Research" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q241] +description = "Life Length - Experimental Res" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q242] +description = "Funding - Any other UK organisations" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q243] +description = "Funding - Business Enterprises in group outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q244] +description = "Funding - Other Business Enterprises outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q245] +description = "Funding - Other Governments outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q246] +description = "Funding - Higher Education Est Outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q247] +description = "Funding - Non-profit Orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q248] +description = "Funding - International Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q249] +description = "Funding - Any other orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q250] +description = "Funding - UK Higher Education Establishments" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q251] +description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" 
+max_acceptable_value = "nan" + +[q252] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q253] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q254] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q255] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q256] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q257] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q258] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q302] +description = "Purchased/funded R&D in the UK (Yes or No)" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q303] +description = "Purchased Outside UK (Govt Funded) " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q304] +description = "Purchased Outside UK (Other) " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q305] +description = "Total Purchased" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q307] +description = "Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q308] +description = "Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q309] +description = "Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q713] +description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q714] +description = "Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" From c04905ca11fd7a8b811a6cdd93a3adf60e4cc7b8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 13:13:56 +0100 Subject: [PATCH 005/411] Updated the environment.yml to include xlrd as needed that at one point for parsing the xlsx file. 
--- environment.yml | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/environment.yml b/environment.yml index bef14f979..e0f37351d 100644 --- a/environment.yml +++ b/environment.yml @@ -1,21 +1,22 @@ name: resdev36 dependencies: -- python=3 -- coverage -- pyyaml -- requests -- sphinx -- pip -- pip: - - arrow - - cookiecutter - - detect-secrets - - myst-parser - - pre-commit==2.17.0 - - python-dotenv - - table_logger - - pandas==1.1.5 - - numpy - - pydoop - - setuptools - - pytest + - python=3 + - coverage + - pytest + - pytest-cov + - pyyaml + - pandas + - numpy + - requests + - sphinx + - pip + - pip: + - arrow + - cookiecutter + - detect-secrets + - myst-parser + - pre-commit==2.17.0 + - python-dotenv + - table_logger + - toml + - xlrd From c9997f3004dd54af3ddd893afa60cab13d410689 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 13:33:00 +0100 Subject: [PATCH 006/411] Had to comment out several lines in main as functions missing, including Config_settings. Branch cloned from develop. --- src/main.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/main.py b/src/main.py index 21c8866b8..358f5d69a 100644 --- a/src/main.py +++ b/src/main.py @@ -1,11 +1,13 @@ """The main pipeline""" -from src.utils import runlog -from src._version import __version__ as version -from src.utils.helpers import Config_settings -from src.utils.wrappers import logger_creator +# from src.utils import runlog +# from src._version import __version__ as version + +# from src.utils.helpers import Config_settings +# from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data -import time + +# import time import logging @@ -21,25 +23,24 @@ def run_pipeline(start): generated from the time module using time.time() """ - conf_obj = Config_settings() - config = conf_obj.config_dict - global_config = config["global"] + # conf_obj = Config_settings() + # config = conf_obj.config_dict + # global_config = config["global"] - runlog_obj = runlog.RunLog(config, version) + # runlog_obj = runlog.RunLog(config, version) - logger = logger_creator(global_config) + # logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - logger.info("Collecting logging parameters ..........") + # logger.info("Collecting logging parameters ..........") Manipulate_data() MainLogger.info("Finishing Pipeline .......................") - runlog_obj.retrieve_pipeline_logs() + # runlog_obj.retrieve_pipeline_logs() - run_time = round(time.time() - start, 5) - runlog_obj._record_time_taken(run_time) + # run_time = round(time.time() - start, 5) + # runlog_obj._record_time_taken(run_time) - runlog_obj.retrieve_configs() - runlog_obj._create_runlog_dicts() - runlog_obj._create_runlog_dfs() - runlog_obj.create_runlog_files() - runlog_obj._write_runlog() + # runlog_obj._create_runlog_dicts() + # runlog_obj._create_runlog_dfs() + # runlog_obj.create_runlog_files() + # runlog_obj._write_runlog() From 4997c8e973eca3ef2320f46601052ffdf14fb9f4 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 16:39:59 +0100 Subject: [PATCH 007/411] Removed legacy data_schema.py file from top level directory. 
--- src/data_schema.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 src/data_schema.py diff --git a/src/data_schema.py b/src/data_schema.py deleted file mode 100644 index d8caf59ea..000000000 --- a/src/data_schema.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd - - -def read_xlsx(excel_file) -> pd.DataFrame: - """_summary_ - - Arguments: - excel_file -- _description_ - - Returns: - _description_ - """ - xl_dataframe = pd.read_excel(excel_file, "Sheet1") - return xl_dataframe - - -# def convert_dataFrame(pdFrame)->dict: -# """_summary_ -# -# Arguments: -# pdFrame -- _description_ -# -# Returns: -# _description_ -# """ -# return -# -# -# def create_toml(pdDict)->str: -# """_summary_ -# -# Arguments: -# pdDict -- _description_ -# -# Returns: -# _description_ -# """ -# return -# From f008c171233794c3472156976fa8dcfe1d8187e8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 16:41:46 +0100 Subject: [PATCH 008/411] Removed trailing spaces/lines at end of file. --- src/data_validation/data_schema.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 1429a723c..44feda745 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -9,7 +9,6 @@ def read_xlsx(excel_file) -> pd.DataFrame: """Read an excel file and convert it into a pandas dataframe, dropping any 'Unnamed:' columns. - Arguments: excel_file -- the excel file to be converted @@ -134,9 +133,3 @@ def create_toml(pdDict: dict) -> IO[str]: toml.dump(pdDict, toml_file) return pdDict - - -test = read_xlsx("C:\\Users\\macrar\\Downloads\\Data Dictionary - BERD.xlsx") -test2 = convert_dataFrame(test) -test3 = reformat_tomlDict(test2) -test4 = create_toml(test3) From 77ead7d22b63c895121f3d1acdcdbba338936120 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 17:23:36 +0100 Subject: [PATCH 009/411] Removed trailing spaces in DataSchema.toml file. 
--- config/DataSchema.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/config/DataSchema.toml b/config/DataSchema.toml index aa15a8a78..7854a9848 100644 --- a/config/DataSchema.toml +++ b/config/DataSchema.toml @@ -230,7 +230,7 @@ min_acceptable_value = 0 max_acceptable_value = 1000000 [q208] -description = "Land & Build CapEx " +description = "Land & Build CapEx" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 @@ -251,7 +251,7 @@ min_acceptable_value = 0 max_acceptable_value = 1000000 [q211] -description = "Total Inhouse Expenditure " +description = "Total Inhouse Expenditure" data_type = "Numeric Integer" nullable = "No" min_acceptable_value = 0 @@ -279,28 +279,28 @@ min_acceptable_value = 0 max_acceptable_value = 1000000 [q215] -description = "Funding - Organisations outside the Uk " +description = "Funding - Organisations outside the Uk" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 max_acceptable_value = 1000000 [q216] -description = "Funding - Other UK Private Bus/Public Orgs " +description = "Funding - Other UK Private Bus/Public Orgs" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 max_acceptable_value = 1000000 [q217] -description = "Funding - Any Other UK " +description = "Funding - Any Other UK" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 max_acceptable_value = 1000000 [q218] -description = "Total Funding " +description = "Total Funding" data_type = "Numeric Integer" nullable = "No" min_acceptable_value = 0 @@ -594,14 +594,14 @@ min_acceptable_value = "nan" max_acceptable_value = "nan" [q303] -description = "Purchased Outside UK (Govt Funded) " +description = "Purchased Outside UK (Govt Funded)" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 max_acceptable_value = 1000000 [q304] -description = "Purchased Outside UK (Other) " +description = "Purchased Outside UK (Other)" data_type = "Numeric Integer" nullable = "Yes" min_acceptable_value = 0 From 78190f9037ad7a6fa8d4635b763a535a7a76f688 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 10:36:15 +0100 Subject: [PATCH 010/411] Added a function to read csv files from DAP and changed return value of create_toml() to return the toml file itself. --- src/data_validation/data_schema.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 44feda745..4f614d6df 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -1,10 +1,27 @@ import math import toml import pandas as pd +import pydoop.hdfs as hdfs from typing import IO +def read_DAP_csv(excel_file) -> pd.DataFrame: + """Read an excel file from DAP and convert it into a + pandas dataframe, dropping any 'Unnamed:' columns. + Arguments: + excel_file -- the excel file to be converted + Returns: + A pd.DataFrame: a pandas dataframe object. + """ + with hdfs.open(excel_file, "r") as file: + + # Import csv file and convert to Dataframe + sheet = pd.read_csv(file) + + return sheet + + def read_xlsx(excel_file) -> pd.DataFrame: """Read an excel file and convert it into a pandas dataframe, dropping any 'Unnamed:' columns. 
@@ -132,4 +149,4 @@ def create_toml(pdDict: dict) -> IO[str]: with open(output_file_name, "w") as toml_file: toml.dump(pdDict, toml_file) - return pdDict + return toml_file From 1b80c5b2302ec7db50357dbc9029aa95e5b69434 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 10:37:34 +0100 Subject: [PATCH 011/411] Added pydoop to environment.yml as needed if using the read_DAP_csv() function in data_schema.py --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index e0f37351d..ef7f567c6 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - detect-secrets - myst-parser - pre-commit==2.17.0 + - pydoop - python-dotenv - table_logger - toml From 69ea4c91c6dbf0e3b3bb492e2fd01eaaca8f871a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 12:34:04 +0100 Subject: [PATCH 012/411] Removed redundant function read_xlsx(). --- src/data_validation/data_schema.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 4f614d6df..e28060816 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -22,24 +22,6 @@ def read_DAP_csv(excel_file) -> pd.DataFrame: return sheet -def read_xlsx(excel_file) -> pd.DataFrame: - """Read an excel file and convert it into a - pandas dataframe, dropping any 'Unnamed:' columns. - - Arguments: - excel_file -- the excel file to be converted - - Returns: - A pd.DataFrame: a pandas dataframe object. - """ - xl_dataframe = pd.read_excel(excel_file, nrows=93, engine="openpyxl") - xl_dataframe = xl_dataframe[ - xl_dataframe.columns.drop(list(xl_dataframe.filter(regex="Unnamed:"))) - ] - - return xl_dataframe - - def convert_dataFrame(pdFrame: pd.DataFrame) -> dict: """Convert a pandas dataframe into a dictionary oriented by index. This makes the keys in the dictionary the row index From ff7960a5ac984bfc627e6206411f2b69546e6716 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 17:37:37 +0100 Subject: [PATCH 013/411] Added lines at end of data_schema.py to call the functions and produce the toml file. --- src/data_validation/data_schema.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index e28060816..e49ff8059 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -132,3 +132,9 @@ def create_toml(pdDict: dict) -> IO[str]: toml.dump(pdDict, toml_file) return toml_file + + +test = read_DAP_csv("Data Dictionary - BERD.csv") +test2 = convert_dataFrame(test) +test3 = reformat_tomlDict(test2) +test4 = create_toml(test3) From 2b04cdf31c3e738337cb68324878aa15020c1acf Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 17:57:35 +0100 Subject: [PATCH 014/411] Added lines to deal with string that is too long in reformat_tomlDict(), and attempted to remove trailing whitespace automatically in tomlDict values. 
--- src/data_validation/data_schema.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index e49ff8059..44da4581a 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -78,16 +78,15 @@ def reformat_tomlDict(pdDict: dict) -> dict: tomlDict[var] = subDict1 + data_type_substr1 = "Data Type (Numeric integer/Numeric float (or decimal)" + data_type_substr2 = "/Text/Categorical/Boolean (True or False, 1 or 0))" # Loop over each key in sub-dictionary and reformat values for usability for key in tomlDict: subDict2 = tomlDict[key] subDict2["description"] = subDict2.pop("Description") - subDict2["data_type"] = subDict2.pop( - """Data Type (Numeric integer/Numeric float (or decimal) - /Text/Categorical/Boolean (True or False, 1 or 0))""" - ) + subDict2["data_type"] = subDict2.pop(f"{data_type_substr1}{data_type_substr2}") subDict2["nullable"] = subDict2.pop( "Nullable (is it acceptable to have a null value? Acceptable = Yes)" ) @@ -110,7 +109,10 @@ def reformat_tomlDict(pdDict: dict) -> dict: subDict2.pop("Acceptable Values (>0 or 0 – 1,000,000)") - tomlDict[key] = subDict2 + if isinstance(type(tomlDict[key]), str) and key == "description": + tomlDict[key] = tomlDict[key].strip() + else: + tomlDict[key] = subDict2 return tomlDict @@ -134,7 +136,7 @@ def create_toml(pdDict: dict) -> IO[str]: return toml_file -test = read_DAP_csv("Data Dictionary - BERD.csv") +test = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") test2 = convert_dataFrame(test) test3 = reformat_tomlDict(test2) test4 = create_toml(test3) From 693e1c459d52209b37ac09042cf484ce2ba584f8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 18:01:48 +0100 Subject: [PATCH 015/411] Corrected stripping white space in tomlDict creation. --- src/data_validation/data_schema.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 44da4581a..f6b958c4c 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -85,7 +85,10 @@ def reformat_tomlDict(pdDict: dict) -> dict: subDict2 = tomlDict[key] - subDict2["description"] = subDict2.pop("Description") + if isinstance(subDict2["Description"], str): + subDict2["description"] = (subDict2.pop("Description")).strip() + else: + subDict2["description"] = subDict2.pop("Description") subDict2["data_type"] = subDict2.pop(f"{data_type_substr1}{data_type_substr2}") subDict2["nullable"] = subDict2.pop( "Nullable (is it acceptable to have a null value? Acceptable = Yes)" @@ -109,10 +112,7 @@ def reformat_tomlDict(pdDict: dict) -> dict: subDict2.pop("Acceptable Values (>0 or 0 – 1,000,000)") - if isinstance(type(tomlDict[key]), str) and key == "description": - tomlDict[key] = tomlDict[key].strip() - else: - tomlDict[key] = subDict2 + tomlDict[key] = subDict2 return tomlDict From 573ca55fbba1fe1132e0201880dc2a18320ad330 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 9 May 2023 17:11:54 +0100 Subject: [PATCH 016/411] Gave more meaningful names to test variables on lines 140 - 143. 
--- src/data_validation/data_schema.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index f6b958c4c..ba9d632a0 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -57,7 +57,6 @@ def reformat_tomlDict(pdDict: dict) -> dict: """Creates a dictionary suitable to be converted into a toml file. Takes an index oriented dictionary as input and creates a new dictionary - at as Arguments: pdDict -- a dictionary @@ -80,6 +79,7 @@ def reformat_tomlDict(pdDict: dict) -> dict: data_type_substr1 = "Data Type (Numeric integer/Numeric float (or decimal)" data_type_substr2 = "/Text/Categorical/Boolean (True or False, 1 or 0))" + # Loop over each key in sub-dictionary and reformat values for usability for key in tomlDict: @@ -89,6 +89,7 @@ def reformat_tomlDict(pdDict: dict) -> dict: subDict2["description"] = (subDict2.pop("Description")).strip() else: subDict2["description"] = subDict2.pop("Description") + subDict2["data_type"] = subDict2.pop(f"{data_type_substr1}{data_type_substr2}") subDict2["nullable"] = subDict2.pop( "Nullable (is it acceptable to have a null value? Acceptable = Yes)" @@ -136,7 +137,7 @@ def create_toml(pdDict: dict) -> IO[str]: return toml_file -test = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") -test2 = convert_dataFrame(test) -test3 = reformat_tomlDict(test2) -test4 = create_toml(test3) +csv_dataframe = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") +csv_dataDict = convert_dataFrame(csv_dataframe) +reformated_Dict = reformat_tomlDict(csv_dataDict) +tomlFile = create_toml(reformated_Dict) From 865ca8121c5f03c08224178dfdbefcb46011cbd0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 11 May 2023 14:21:36 +0100 Subject: [PATCH 017/411] Tried to be less generic with naming on lines 140 - 143, and match Python naming conventions. --- src/data_validation/data_schema.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index ba9d632a0..69e1582b0 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -137,7 +137,7 @@ def create_toml(pdDict: dict) -> IO[str]: return toml_file -csv_dataframe = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") -csv_dataDict = convert_dataFrame(csv_dataframe) -reformated_Dict = reformat_tomlDict(csv_dataDict) -tomlFile = create_toml(reformated_Dict) +berd_schema_df = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") +berd_schema_dict = convert_dataFrame(berd_schema_df) +reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) +tomlfile = create_toml(reshaped_schema_dict) From 57367ba9ce8bbd7bd843acf15c852c091bdde5b6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 10:38:00 +0100 Subject: [PATCH 018/411] Better naming convention for create_toml() function arguments. Moved output toml file path from inside function to function argument with a default argument. 
--- src/data_validation/data_schema.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 69e1582b0..d177def13 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -118,21 +118,23 @@ def reformat_tomlDict(pdDict: dict) -> dict: return tomlDict -def create_toml(pdDict: dict) -> IO[str]: +def create_toml( + pd_dict: dict, output_toml_file: str = "./config/DataSchema.toml" +) -> IO[str]: """Write a toml file from a dictionary. Arguments: - pdDict -- A dictionary containing a dictionary as + pd_dict -- A dictionary containing a dictionary as its values. - + output_toml_file -- Path to the output toml file. + (default: {"./config/DataSchema.toml"}) Returns: A toml file - IO[str] type indicates a text based file (.toml) will be returned. """ - output_file_name = "./config/DataSchema.toml" - with open(output_file_name, "w") as toml_file: - toml.dump(pdDict, toml_file) + with open(output_toml_file, "w") as toml_file: + toml.dump(pd_dict, toml_file) return toml_file From cf4f9f2607d314c6dc19544c43297bfe4276e1e1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 10:45:31 +0100 Subject: [PATCH 019/411] Reverted lines at the end of data_schema.py after some testing. --- src/data_validation/data_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index d177def13..339a54c73 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -142,4 +142,4 @@ def create_toml( berd_schema_df = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") berd_schema_dict = convert_dataFrame(berd_schema_df) reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) -tomlfile = create_toml(reshaped_schema_dict) +tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema_New.toml") From 007c05a4432f65c047c65ff7e9abef4c5e89f947 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 10:52:46 +0100 Subject: [PATCH 020/411] Re-added function to read excel file (.xlsx) to data_schema.py. --- src/data_validation/data_schema.py | 32 ++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 339a54c73..4f481b81f 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -6,6 +6,27 @@ from typing import IO +def read_xlsx(excel_file) -> pd.DataFrame: + """Read an excel file and convert it into a + pandas dataframe, dropping any 'Unnamed:' columns. + + + Arguments: + excel_file -- the excel file to be converted + + Returns: + A pd.DataFrame: a pandas dataframe object. + """ + xl_dataframe = pd.read_excel( + excel_file, sheet_name="contributors", engine="openpyxl" + ) + # xl_dataframe = xl_dataframe[ + # xl_dataframe.columns.drop(list(xl_dataframe.filter(regex="Unnamed:"))) + # ] + + return xl_dataframe + + def read_DAP_csv(excel_file) -> pd.DataFrame: """Read an excel file from DAP and convert it into a pandas dataframe, dropping any 'Unnamed:' columns. 
@@ -139,7 +160,10 @@ def create_toml( return toml_file -berd_schema_df = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") -berd_schema_dict = convert_dataFrame(berd_schema_df) -reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) -tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema_New.toml") +# berd_schema_df = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") +berd_schema_df = read_xlsx("C:\\Users\\macrar\\Downloads\\SPP Snapshot Schema.xlsx") +print(berd_schema_df) + +# berd_schema_dict = convert_dataFrame(berd_schema_df) +# reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) +# tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema_New.toml") From f7bf14e368a7e5eacef2d6749e9d07005bffa96b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 17:58:09 +0100 Subject: [PATCH 021/411] Completed new data schema based on SPP consumer data snapshot. Saved to Data_Schema.toml. --- config/Data_Schema.toml | 469 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 469 insertions(+) create mode 100644 config/Data_Schema.toml diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml new file mode 100644 index 000000000..a7a021b23 --- /dev/null +++ b/config/Data_Schema.toml @@ -0,0 +1,469 @@ +[snapshot_id] +Description = nannan +Deduced Data Type = nan +Nullable = nan +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[reference] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 11001603625.0 +Max values = 19891309165.0 +Possible Categorical Values = nan + +[period] +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = str +Length = 6 +Min values = nan +Max values = nan +Possible Categorical Values = 202012 + +[survey] +Description = All values are 002 +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = 002 + +[formid] +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int +Length = 2 +Min values = nan +Max values = nan +Possible Categorical Values = 20, 21 + +[status] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = Clear, Clear - overridden, Form saved, Clear - overridden SE, Form sent out, Check needed, Combined child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9) + +[statusencoded] +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int +Length = 3 +Min values = nan +Max values = nan +Possible Categorical Values = 100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309 + +[receiptdate] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[lockedby] +Description = All empty strings +Deduced Data Type = ? +Nullable = 1.0 +Current Data Type = None/str +Length = 0 +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[lockeddate] +Description = All None type +Deduced Data Type = ? 
+Nullable = 1.0 +Current Data Type = None/str +Length = 0 +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[formtype] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = 0001, 0006 + +[checkletter] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = 1 +Min values = nan +Max values = nan +Possible Categorical Values = T, H, F, J, D, A, K, C, B, L, S + +[frozensicoutdated] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1120.0 +Max values = 93059.0 +Possible Categorical Values = nan + +[rusicoutdated] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1120.0 +Max values = 93059.0 +Possible Categorical Values = nan + +[frozensic] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1300.0 +Max values = 96090.0 +Possible Categorical Values = nan + +[rusic] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1300.0 +Max values = 96090.0 +Possible Categorical Values = nan + +[frozenemployees] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 272527.0 +Possible Categorical Values = nan + +[employees] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 272528.0 +Possible Categorical Values = nan + +[frozenemployment] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 272527.0 +Possible Categorical Values = nan + +[employment] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 272528.0 +Possible Categorical Values = nan + +[frozenfteemployment] +Description = nan +Deduced Data Type = float +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 177699.0 +Possible Categorical Values = nan + +[fteemployment] +Description = nan +Deduced Data Type = float +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 177699.5 +Possible Categorical Values = nan + +[frozenturnover] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 55277352.0 +Possible Categorical Values = nan + +[turnover] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 55277352.0 +Possible Categorical Values = nan + +[enterprisereference] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 1001603625.0 +Max values = 9891309165.0 +Possible Categorical Values = nan + +[wowenterprisereference] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 1001603625.0 +Max values = 9891309165.0 +Possible Categorical Values = nan + +[cellnumber] +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = 0 + +[currency] +Description = nan +Deduced Data 
Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = S, E + +[vatreference] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = VATREF + +[payereference] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = PAYEREF + +[companyregistrationnumber] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = CRN + +[numberlivelocalunits] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 6063.0 +Possible Categorical Values = nan + +[numberlivevat] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 255.0 +Possible Categorical Values = nan + +[numberlivepaye] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 24.0 +Possible Categorical Values = nan + +[legalstatus] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 4.0 +Possible Categorical Values = 1, 2, 3, 4 + +[reportingunitmarker] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = L, E + +[region] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA + +[birthdate] +Description = Datetime format = format=%d/%m/%Y +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[referencename] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[referencepostcode] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[tradingstyle] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[selectiontype] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = L + +[inclusionexclusion] +Description = All values are +Deduced Data Type = ? 
+Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[createdby] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = ingestion + +[createddate] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = nan + +[lastupdatedby] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = data_migration, Cheri, Adela, David + +[lastupdateddate] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan From 9f3a3b7b23a12fed11ded54445e842b117401a3b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:22:55 +0100 Subject: [PATCH 022/411] Changed function to read SPP snapshot name to better suit what it does. Added variable to specify sheet name. --- src/data_validation/data_schema.py | 77 +++++++----------------------- 1 file changed, 18 insertions(+), 59 deletions(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index 4f481b81f..ccc1c1bd0 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -6,23 +6,22 @@ from typing import IO -def read_xlsx(excel_file) -> pd.DataFrame: - """Read an excel file and convert it into a - pandas dataframe, dropping any 'Unnamed:' columns. - +def read_SPP_snapshot(excel_file, excel_sheet) -> pd.DataFrame: + """Read the updated SPP Snapshot Schema, specifying the name + of the sheet to read. Convert it into a pandas dataframe, + dropping any rows which include NaN values in the 'Field Name' + column. Arguments: excel_file -- the excel file to be converted - + excel_sheet -- the name of the excel sheet to be converted Returns: A pd.DataFrame: a pandas dataframe object. 
""" - xl_dataframe = pd.read_excel( - excel_file, sheet_name="contributors", engine="openpyxl" - ) - # xl_dataframe = xl_dataframe[ - # xl_dataframe.columns.drop(list(xl_dataframe.filter(regex="Unnamed:"))) - # ] + xl_dataframe = pd.read_excel(excel_file, sheet_name=excel_sheet, engine="openpyxl") + + # Drop rows with NaN values in the 'Field Name' column + xl_dataframe = xl_dataframe.dropna(subset=["Field Name"]) return xl_dataframe @@ -94,53 +93,16 @@ def reformat_tomlDict(pdDict: dict) -> dict: newDict[str(key)] = pdDict[key] subDict1 = newDict[str(key)] - var = subDict1.pop("Field Name (as it appears in dataset)") + var = subDict1.pop("Field Name") + var = var.replace('"', "") tomlDict[var] = subDict1 - data_type_substr1 = "Data Type (Numeric integer/Numeric float (or decimal)" - data_type_substr2 = "/Text/Categorical/Boolean (True or False, 1 or 0))" - - # Loop over each key in sub-dictionary and reformat values for usability - for key in tomlDict: - - subDict2 = tomlDict[key] - - if isinstance(subDict2["Description"], str): - subDict2["description"] = (subDict2.pop("Description")).strip() - else: - subDict2["description"] = subDict2.pop("Description") - - subDict2["data_type"] = subDict2.pop(f"{data_type_substr1}{data_type_substr2}") - subDict2["nullable"] = subDict2.pop( - "Nullable (is it acceptable to have a null value? Acceptable = Yes)" - ) - - acceptable_values_str = str(subDict2["Acceptable Values (>0 or 0 – 1,000,000)"]) - acceptable_values_list = acceptable_values_str.split() - - subDict2["min_acceptable_value"] = acceptable_values_list[0] - subDict2["max_acceptable_value"] = acceptable_values_list[-1].replace(",", "") - - if is_nan(subDict2["min_acceptable_value"]): - subDict2["min_acceptable_value"] = acceptable_values_list[0] - elif is_nan(subDict2["max_acceptable_value"]): - subDict2["max_acceptable_value"] = acceptable_values_list[-1] - else: - subDict2["min_acceptable_value"] = int(acceptable_values_list[0]) - subDict2["max_acceptable_value"] = int( - acceptable_values_list[-1].replace(",", "") - ) - - subDict2.pop("Acceptable Values (>0 or 0 – 1,000,000)") - - tomlDict[key] = subDict2 - - return tomlDict + return tomlDict def create_toml( - pd_dict: dict, output_toml_file: str = "./config/DataSchema.toml" + pd_dict: dict, output_toml_file: str = "./config/Data_Schema.toml" ) -> IO[str]: """Write a toml file from a dictionary. @@ -160,10 +122,7 @@ def create_toml( return toml_file -# berd_schema_df = read_DAP_csv("/ons/rdbe_dev/data_dictionary_berd.csv") -berd_schema_df = read_xlsx("C:\\Users\\macrar\\Downloads\\SPP Snapshot Schema.xlsx") -print(berd_schema_df) - -# berd_schema_dict = convert_dataFrame(berd_schema_df) -# reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) -# tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema_New.toml") +berd_schema_df = read_SPP_snapshot("./config/SPP Snapshot Schema.xlsx", "contributors") +berd_schema_dict = convert_dataFrame(berd_schema_df) +reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) +tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema.toml") From 5ad7efbc26390ad56b6877afbc2d7a9ed1b5ae18 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:27:54 +0100 Subject: [PATCH 023/411] Reverted src/main.py to have no commented lines. 
--- src/main.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/main.py b/src/main.py index 358f5d69a..1153c9e62 100644 --- a/src/main.py +++ b/src/main.py @@ -1,13 +1,13 @@ """The main pipeline""" -# from src.utils import runlog -# from src._version import __version__ as version +from src.utils import runlog +from src._version import __version__ as version -# from src.utils.helpers import Config_settings -# from src.utils.wrappers import logger_creator +from src.utils.helpers import Config_settings +from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data -# import time +import time import logging @@ -23,24 +23,24 @@ def run_pipeline(start): generated from the time module using time.time() """ - # conf_obj = Config_settings() - # config = conf_obj.config_dict - # global_config = config["global"] + conf_obj = Config_settings() + config = conf_obj.config_dict + global_config = config["global"] - # runlog_obj = runlog.RunLog(config, version) + runlog_obj = runlog.RunLog(config, version) - # logger = logger_creator(global_config) + logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - # logger.info("Collecting logging parameters ..........") + logger.info("Collecting logging parameters ..........") Manipulate_data() MainLogger.info("Finishing Pipeline .......................") - # runlog_obj.retrieve_pipeline_logs() + runlog_obj.retrieve_pipeline_logs() - # run_time = round(time.time() - start, 5) - # runlog_obj._record_time_taken(run_time) + run_time = round(time.time() - start, 5) + runlog_obj._record_time_taken(run_time) - # runlog_obj._create_runlog_dicts() - # runlog_obj._create_runlog_dfs() - # runlog_obj.create_runlog_files() - # runlog_obj._write_runlog() + runlog_obj._create_runlog_dicts() + runlog_obj._create_runlog_dfs() + runlog_obj.create_runlog_files() + runlog_obj._write_runlog() From 9314fb078dec8c763fc08bf0c7d58465f8837c84 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:33:10 +0100 Subject: [PATCH 024/411] Reverted src/main.py to how to was before (removed newlines in imports). --- src/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main.py b/src/main.py index 1153c9e62..ff334c7f1 100644 --- a/src/main.py +++ b/src/main.py @@ -2,11 +2,9 @@ from src.utils import runlog from src._version import __version__ as version - from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data - import time import logging From cc2ca6a3e5d43d9679785796dd6a5b7165d11776 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:34:53 +0100 Subject: [PATCH 025/411] Removed an extra 'nan' in the Description of snapshot_id. --- config/Data_Schema.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index a7a021b23..89379fd4d 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -1,5 +1,5 @@ [snapshot_id] -Description = nannan +Description = nan Deduced Data Type = nan Nullable = nan Current Data Type = str @@ -423,7 +423,7 @@ Description = All values are Deduced Data Type = ? 
Nullable = 0.0 Current Data Type = str -Length = nan +Length = nanread_ Min values = nan Max values = nan Possible Categorical Values = nan From df03ebc9deacd3d12de8edfabbd585ed377b0c19 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:37:02 +0100 Subject: [PATCH 026/411] Removed old data schema toml file. --- config/DataSchema.toml | 650 ----------------------------------------- 1 file changed, 650 deletions(-) delete mode 100644 config/DataSchema.toml diff --git a/config/DataSchema.toml b/config/DataSchema.toml deleted file mode 100644 index 7854a9848..000000000 --- a/config/DataSchema.toml +++ /dev/null @@ -1,650 +0,0 @@ -[cell_id] -description = "Cell ID" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[civ_or_def] -description = "Business type: Civil or Defence" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[current_sic] -description = "Sic - Standard Industry Classification" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[data_source] -description = "Constructed" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[emp_other] -description = "emp_other (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_researcher] -description = "emp_researcher (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_technician] -description = "emp_technician (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_total] -description = "emp_total (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[employee_count] -description = "Employee Count (IDBR)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[foreign_owner] -description = "Foreign Owner" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[form_status] -description = "Status" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[form_type] -description = "Form Type" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[freeze_id] -description = "Freeze ID - bespoke to openroad" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[headcount_oth_f] -description = "Other Female (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_oth_m] -description = "Other Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_res_f] -description = "Researchers Females (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_res_m] -description = "Researchers Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_tec_f] -description = "Technicians 
Female (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_tec_m] -description = "Technicians Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_total] -description = "Total Headcount" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[period] -description = "Openroad Specific" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[period_contributor_id] -description = "Openroad Specific" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[period_year] -description = "Period" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[product_group] -description = "Published Product Group" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[ru_ref] -description = "Reference" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[sizeband] -description = "SizeBand" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[wowentref] -description = "Wowentref" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q202] -description = "Salaries & Wages" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q203] -description = "Other current expenditure" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q204] -description = "Total Current Expenditure" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q205] -description = "Basic Research" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q206] -description = "Applied Research" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q207] -description = "Experimental Development" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q208] -description = "Land & Build CapEx" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q209] -description = "Equipment & Machinery CapEx" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q210] -description = "Total Capex." 
-data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q211] -description = "Total Inhouse Expenditure" -data_type = "Numeric Integer" -nullable = "No" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q212] -description = "Own Funds" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q213] -description = "Funding - Commission of the EU" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q214] -description = "Funding - UK government" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q215] -description = "Funding - Organisations outside the Uk" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q216] -description = "Funding - Other UK Private Bus/Public Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q217] -description = "Funding - Any Other UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q218] -description = "Total Funding" -data_type = "Numeric Integer" -nullable = "No" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q219] -description = "Land Acquired for R&D (Split of Land & Build CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q220] -description = "Buildings acquired/constructed for R&D (Split of Land & Build CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q221] -description = "Expenditure on computer software only (of which from Equipment & Machinery CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q222] -description = "Purchase of Materials (Split of Other current)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q223] -description = "Purchase of Services (Split of Other current)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q224] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q225] -description = "Ownership - Own Business" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q226] -description = "Ownership - UK Government" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q227] -description = "Ownership - Other UK Priv Bus" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q228] -description = "Ownership - Other UK Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q229] -description = "Ownership - Bus Enterprises in Group Outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q230] -description = "Ownership - Other Bus Enterprises outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q231] -description = "Ownership - Other Governments outside UK" 
-data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q232] -description = "Ownership - Higher Education Establishments outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q233] -description = "Ownership - Non-profit Orgs outside the UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q234] -description = "Ownership - Commission of EU" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q235] -description = "Ownership - International Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q236] -description = "Ownership - Any other Orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q237] -description = "Ownership - not owned freely available" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q238] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q239] -description = "Life Length - Basic Research" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q240] -description = "Life Length - Applied Research" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q241] -description = "Life Length - Experimental Res" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q242] -description = "Funding - Any other UK organisations" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q243] -description = "Funding - Business Enterprises in group outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q244] -description = "Funding - Other Business Enterprises outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q245] -description = "Funding - Other Governments outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q246] -description = "Funding - Higher Education Est Outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q247] -description = "Funding - Non-profit Orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q248] -description = "Funding - International Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q249] -description = "Funding - Any other orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q250] -description = "Funding - UK Higher Education Establishments" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q251] -description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" 
-max_acceptable_value = "nan" - -[q252] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q253] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q254] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q255] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q256] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q257] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q258] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q302] -description = "Purchased/funded R&D in the UK (Yes or No)" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q303] -description = "Purchased Outside UK (Govt Funded)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q304] -description = "Purchased Outside UK (Other)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q305] -description = "Total Purchased" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q307] -description = "Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q308] -description = "Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q309] -description = "Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q713] -description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q714] -description = "Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" From 729fc5d75d4956b9fa0e5853c1d8872f3328c5e2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:46:50 +0100 Subject: [PATCH 027/411] Updated environment.yml. Ordering was wrong. 
--- environment.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index ef7f567c6..20a9cecdb 100644 --- a/environment.yml +++ b/environment.yml @@ -2,11 +2,7 @@ name: resdev36 dependencies: - python=3 - coverage - - pytest - - pytest-cov - pyyaml - - pandas - - numpy - requests - sphinx - pip @@ -15,8 +11,12 @@ dependencies: - cookiecutter - detect-secrets - myst-parser + - numpy + - pandas - pre-commit==2.17.0 - pydoop + - pytest + - pytest-cov - python-dotenv - table_logger - toml From c21536f9c2cd54a919af58b3ae61c757e2c60156 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:48:55 +0100 Subject: [PATCH 028/411] Updated environment.yml. Ordering was wrong. --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 20a9cecdb..6ed2d1261 100644 --- a/environment.yml +++ b/environment.yml @@ -12,12 +12,13 @@ dependencies: - detect-secrets - myst-parser - numpy - - pandas + - pandas==1.1.5 - pre-commit==2.17.0 - pydoop - pytest - pytest-cov - python-dotenv + - setuptools - table_logger - toml - xlrd From 03ee14af8610d17fe506b7c13cb1de8e678d977c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:55:27 +0100 Subject: [PATCH 029/411] Removed line 97 which removed quotation marks in key names as no keys have quotation marks. --- src/data_validation/data_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/data_validation/data_schema.py b/src/data_validation/data_schema.py index ccc1c1bd0..344d0944e 100644 --- a/src/data_validation/data_schema.py +++ b/src/data_validation/data_schema.py @@ -94,7 +94,6 @@ def reformat_tomlDict(pdDict: dict) -> dict: subDict1 = newDict[str(key)] var = subDict1.pop("Field Name") - var = var.replace('"', "") tomlDict[var] = subDict1 From 4489ca0ee57a7d0db4670f8c93bad8e977c7d359 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 18 May 2023 15:16:28 +0100 Subject: [PATCH 030/411] Renamed data_schema.py to schema_parser.py. Moved it from src/data_validation/ tto src/utils/ . Also renamed read_SPP_snapshot function to read_SPP_schema. --- .../data_schema.py => utils/schema_parser.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename src/{data_validation/data_schema.py => utils/schema_parser.py} (95%) diff --git a/src/data_validation/data_schema.py b/src/utils/schema_parser.py similarity index 95% rename from src/data_validation/data_schema.py rename to src/utils/schema_parser.py index 344d0944e..b3cea38a2 100644 --- a/src/data_validation/data_schema.py +++ b/src/utils/schema_parser.py @@ -6,7 +6,7 @@ from typing import IO -def read_SPP_snapshot(excel_file, excel_sheet) -> pd.DataFrame: +def read_SPP_schema(excel_file, excel_sheet) -> pd.DataFrame: """Read the updated SPP Snapshot Schema, specifying the name of the sheet to read. 
Convert it into a pandas dataframe, dropping any rows which include NaN values in the 'Field Name' @@ -121,7 +121,7 @@ def create_toml( return toml_file -berd_schema_df = read_SPP_snapshot("./config/SPP Snapshot Schema.xlsx", "contributors") +berd_schema_df = read_SPP_schema("./config/SPP Snapshot Schema.xlsx", "contributors") berd_schema_dict = convert_dataFrame(berd_schema_df) reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema.toml") From a41f1909d970c770ca3f199a01b478bb3deb2dc1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 18:02:48 +0100 Subject: [PATCH 031/411] Redid some changes to data_schema.toml file that seemed to be undone during a rebase. --- config/Data_Schema.toml | 636 ++++++++++++++++++++-------------------- 1 file changed, 318 insertions(+), 318 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index 89379fd4d..1f80ef344 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -1,469 +1,469 @@ [snapshot_id] -Description = nan -Deduced Data Type = nan -Nullable = nan -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Description = "nan" +Deduced_Data_Type = "nan" +Nullable = "nan" +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [reference] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 11001603625.0 -Max values = 19891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = 11001603625.0 +Max_values = 19891309165.0 +Possible_Categorical_Values = ["nan"] [period] -Description = nan -Deduced Data Type = Category(int) +Description = "nan" +Deduced_Data_Type = Category("int") Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = 6 -Min values = nan -Max values = nan -Possible Categorical Values = 202012 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [202012] [survey] Description = All values are 002 -Deduced Data Type = Category(str) +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = 002 +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [002] [formid] -Description = nan -Deduced Data Type = Category(int) +Description = "nan" +Deduced_Data_Type = Category("int") Nullable = 0.0 -Current Data Type = int +Current_Data_Type = "int" Length = 2 -Min values = nan -Max values = nan -Possible Categorical Values = 20, 21 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [20, 21] [status] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = Clear, Clear - overridden, Form saved, Clear - overridden SE, Form sent out, Check needed, Combined child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9) +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [Clear, Clear - overridden, Form saved, Clear - overridden SE, Form sent out, Check needed, Combined 
child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9)] [statusencoded] -Description = nan -Deduced Data Type = Category(int) +Description = "nan" +Deduced_Data_Type = Category("int") Nullable = 0.0 -Current Data Type = int +Current_Data_Type = "int" Length = 3 -Min values = nan -Max values = nan -Possible Categorical Values = 100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] [receiptdate] Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Deduced_Data_Type = Datetime Nullable = 0.0 -Current Data Type = None/str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = None/"str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [lockedby] -Description = All empty strings -Deduced Data Type = ? +Description = All empty "str"ings +Deduced_Data_Type = ? Nullable = 1.0 -Current Data Type = None/str +Current_Data_Type = None/"str" Length = 0 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [lockeddate] Description = All None type -Deduced Data Type = ? +Deduced_Data_Type = ? Nullable = 1.0 -Current Data Type = None/str +Current_Data_Type = None/"str" Length = 0 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [formtype] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = 0001, 0006 +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [0001, 0006] [checkletter] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = 1 -Min values = nan -Max values = nan -Possible Categorical Values = T, H, F, J, D, A, K, C, B, L, S +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [T, H, F, J, D, A, K, C, B, L, S] [frozensicoutdated] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1120.0 -Max values = 93059.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1120.0 +Max_values = 93059.0 +Possible_Categorical_Values = ["nan"] [rusicoutdated] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1120.0 -Max values = 93059.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1120.0 +Max_values = 93059.0 +Possible_Categorical_Values = ["nan"] [frozensic] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1300.0 -Max values = 96090.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1300.0 +Max_values = 96090.0 +Possible_Categorical_Values = ["nan"] [rusic] 
-Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1300.0 -Max values = 96090.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1300.0 +Max_values = 96090.0 +Possible_Categorical_Values = ["nan"] [frozenemployees] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 272527.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 272527.0 +Possible_Categorical_Values = ["nan"] [employees] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 272528.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 272528.0 +Possible_Categorical_Values = ["nan"] [frozenemployment] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1.0 -Max values = 272527.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1.0 +Max_values = 272527.0 +Possible_Categorical_Values = ["nan"] [employment] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1.0 -Max values = 272528.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 1.0 +Max_values = 272528.0 +Possible_Categorical_Values = ["nan"] [frozenfteemployment] -Description = nan -Deduced Data Type = float +Description = "nan" +Deduced_Data_Type = float Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 177699.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 177699.0 +Possible_Categorical_Values = ["nan"] [fteemployment] -Description = nan -Deduced Data Type = float +Description = "nan" +Deduced_Data_Type = float Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 177699.5 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 177699.5 +Possible_Categorical_Values = ["nan"] [frozenturnover] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 55277352.0 -Possible Categorical Values = nan +Current_Data_Type = ""str"" +Length = "nan" +Min_values = 0.0 +Max_values = 55277352.0 +Possible_Categorical_Values = ["nan"] [turnover] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 55277352.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 55277352.0 +Possible_Categorical_Values = ["nan"] [enterprisereference] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 1001603625.0 -Max values = 9891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" 
+Min_values = 1001603625.0 +Max_values = 9891309165.0 +Possible_categorical_Values = ["nan"] [wowenterprisereference] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 1001603625.0 -Max values = 9891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = 1001603625.0 +Max_values = 9891309165.0 +Possible_categorical_Values = ["nan"] [cellnumber] -Description = nan -Deduced Data Type = Category(int) +Description = "nan" +Deduced_Data_Type = Category("int") Nullable = 0.0 -Current Data Type = int -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = 0 +Current_Data_Type = "int" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = 0 [currency] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = S, E +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = S, E [vatreference] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = VATREF +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = VATREF [payereference] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = PAYEREF +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = PAYEREF -[companyregistrationnumber] -Description = nan -Deduced Data Type = Category(str) +[companyregi"str"ationnumber] +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = CRN +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = CRN [numberlivelocalunits] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 6063.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 6063.0 +Possible_categorical_Values = ["nan"] [numberlivevat] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 255.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 255.0 +Possible_categorical_Values = ["nan"] [numberlivepaye] -Description = nan -Deduced Data Type = int +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 0.0 -Max values = 24.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 24.0 +Possible_categorical_Values = ["nan"] [legalstatus] -Description = nan -Deduced Data Type = int +Description = "nan" 
+Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = 1.0 -Max values = 4.0 -Possible Categorical Values = 1, 2, 3, 4 +Current_Data_Type = "str" +Length = "nan" +Min_values = 1.0 +Max_values = 4.0 +Possible_Categorical_Values = 1, 2, 3, 4 [reportingunitmarker] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = L, E +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = L, E [region] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA [birthdate] Description = Datetime format = format=%d/%m/%Y -Deduced Data Type = Datetime +Deduced_Data_Type = Datetime Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [referencename] -Description = nan -Deduced Data Type = str +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [referencepostcode] -Description = nan -Deduced Data Type = str +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [tradingstyle] -Description = nan -Deduced Data Type = str +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [selectiontype] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = L +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = L [inclusionexclusion] Description = All values are -Deduced Data Type = ? +Deduced_Data_Type = ? 
Nullable = 0.0 -Current Data Type = str -Length = nanread_ -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nan"read_ +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [createdby] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = ingestion +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ingestion [createddate] Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Deduced_Data_Type = Datetime Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [lastupdatedby] -Description = nan -Deduced Data Type = Category(str) +Description = "nan" +Deduced_Data_Type = Category("str") Nullable = 0.0 -Current Data Type = None/str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = data_migration, Cheri, Adela, David +Current_Data_Type = None/"str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = data_migration, Cheri, Adela, David [lastupdateddate] Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Deduced_Data_Type = Datetime Nullable = 0.0 -Current Data Type = None/str -Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = None/"str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] From eccb2d9bb82564be3ddf56599479ee053159fa04 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 18:30:06 +0100 Subject: [PATCH 032/411] Finished amending Data_Schema.toml file to match correct syntax and formatting. 
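
The target format is plain TOML, so once the last stray quotation marks are dealt
with the schema should read back with the toml package already listed in
environment.yml. A rough sketch of that intended usage (field names and attribute
keys come from the file itself; everything else here is assumption):

    # illustrative only - not part of this commit
    import toml

    schema = toml.load("./config/Data_Schema.toml")

    # each table describes one snapshot column, e.g. "reference"
    ref = schema["reference"]
    print(ref["Deduced_Data_Type"], ref["Nullable"], ref["Min_values"])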
--- config/Data_Schema.toml | 106 ++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index 1f80ef344..b11125c69 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -20,7 +20,7 @@ Possible_Categorical_Values = ["nan"] [period] Description = "nan" -Deduced_Data_Type = Category("int") +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" Length = 6 @@ -29,18 +29,18 @@ Max_values = "nan" Possible_Categorical_Values = [202012] [survey] -Description = All values are 002 -Deduced_Data_Type = Category("str") +Description = "All values are 002" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = [002] +Possible_Categorical_Values = ["002"] [formid] Description = "nan" -Deduced_Data_Type = Category("int") +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" Length = 2 @@ -50,17 +50,17 @@ Possible_Categorical_Values = [20, 21] [status] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = [Clear, Clear - overridden, Form saved, Clear - overridden SE, Form sent out, Check needed, Combined child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9)] +Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] [statusencoded] Description = "nan" -Deduced_Data_Type = Category("int") +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" Length = 3 @@ -69,30 +69,30 @@ Max_values = "nan" Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] [receiptdate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = Datetime Nullable = 0.0 -Current_Data_Type = None/"str" +Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" Max_values = "nan" Possible_Categorical_Values = ["nan"] [lockedby] -Description = All empty "str"ings +Description = "All empty strings" Deduced_Data_Type = ? Nullable = 1.0 -Current_Data_Type = None/"str" +Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" Max_values = "nan" Possible_Categorical_Values = ["nan"] [lockeddate] -Description = All None type +Description = "All None type" Deduced_Data_Type = ? 
Nullable = 1.0 -Current_Data_Type = None/"str" +Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" Max_values = "nan" @@ -100,23 +100,23 @@ Possible_Categorical_Values = ["nan"] [formtype] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = [0001, 0006] +Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = 1 Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = [T, H, F, J, D, A, K, C, B, L, S] +Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] [frozensicoutdated] Description = "nan" @@ -200,7 +200,7 @@ Possible_Categorical_Values = ["nan"] [frozenfteemployment] Description = "nan" -Deduced_Data_Type = float +Deduced_Data_Type = "float" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" @@ -210,7 +210,7 @@ Possible_Categorical_Values = ["nan"] [fteemployment] Description = "nan" -Deduced_Data_Type = float +Deduced_Data_Type = "float" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" @@ -222,7 +222,7 @@ Possible_Categorical_Values = ["nan"] Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 -Current_Data_Type = ""str"" +Current_Data_Type = "str" Length = "nan" Min_values = 0.0 Max_values = 55277352.0 @@ -260,53 +260,53 @@ Possible_categorical_Values = ["nan"] [cellnumber] Description = "nan" -Deduced_Data_Type = Category("int") +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = 0 +Possible_Categorical_Values = [0] [currency] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = S, E +Possible_Categorical_Values = ["S", "E"] [vatreference] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = VATREF +Possible_Categorical_Values = ["VATREF"] [payereference] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = PAYEREF +Possible_Categorical_Values = ["PAYEREF"] [companyregi"str"ationnumber] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = CRN +Possible_Categorical_Values = ["CRN"] [numberlivelocalunits] Description = "nan" @@ -346,31 +346,31 @@ Current_Data_Type = "str" Length = "nan" Min_values = 1.0 Max_values = 4.0 -Possible_Categorical_Values = 1, 2, 3, 4 +Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = L, E +Possible_Categorical_Values = ["L", "E"] [region] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = 
"str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA +Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] [birthdate] -Description = Datetime format = format=%d/%m/%Y -Deduced_Data_Type = Datetime +Description = "Datetime format = format=%d/%m/%Y" +Deduced_Data_Type = "Datetime" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" @@ -410,37 +410,37 @@ Possible_categorical_Values = ["nan"] [selectiontype] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = L +Possible_Categorical_Values = [L] [inclusionexclusion] -Description = All values are -Deduced_Data_Type = ? +Description = 'All values are " "' +Deduced_Data_Type = "?" Nullable = 0.0 Current_Data_Type = "str" -Length = "nan"read_ +Length = "nan" Min_values = "nan" Max_values = "nan" Possible_categorical_Values = ["nan"] [createdby] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = ingestion +Possible_Categorical_Values = ["ingestion"] [createddate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced_Data_Type = Datetime +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" @@ -450,19 +450,19 @@ Possible_categorical_Values = ["nan"] [lastupdatedby] Description = "nan" -Deduced_Data_Type = Category("str") +Deduced_Data_Type = "str" Nullable = 0.0 -Current_Data_Type = None/"str" +Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = data_migration, Cheri, Adela, David +Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced_Data_Type = Datetime +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current_Data_Type = None/"str" +Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" Max_values = "nan" From f8c4e49eea8a07629d34be033ba7ec4c67cfa6fb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 19:22:13 +0100 Subject: [PATCH 033/411] Corrected a few missed quotation marks in Data_Schema.toml. --- config/Data_Schema.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index b11125c69..ad243f119 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -70,7 +70,7 @@ Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, [receiptdate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = Datetime +Deduced_Data_Type = "Datetime" Nullable = 0.0 Current_Data_Type = ["None","str"] Length = "nan" @@ -80,7 +80,7 @@ Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" -Deduced_Data_Type = ? +Deduced_Data_Type = "?" Nullable = 1.0 Current_Data_Type = ["None","str"] Length = 0 @@ -90,7 +90,7 @@ Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" -Deduced_Data_Type = ? +Deduced_Data_Type = "?" 
Nullable = 1.0 Current_Data_Type = ["None","str"] Length = 0 @@ -416,7 +416,7 @@ Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" -Possible_Categorical_Values = [L] +Possible_Categorical_Values = ["L"] [inclusionexclusion] Description = 'All values are " "' From 0c86a129799fd11d7da1ee8b1eeaf033ef14e20b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 19:25:29 +0100 Subject: [PATCH 034/411] Corrected error in [companyregistrationnumber] caused by search and replace. --- config/Data_Schema.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index ad243f119..9f00c6f5c 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -298,7 +298,7 @@ Min_values = "nan" Max_values = "nan" Possible_Categorical_Values = ["PAYEREF"] -[companyregi"str"ationnumber] +[companyregistrationnumber] Description = "nan" Deduced_Data_Type = "str" Nullable = 0.0 From 7759b02684937dbfa1f24d79856aab3575f78f56 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 25 May 2023 16:07:08 +0100 Subject: [PATCH 035/411] Deleted schema_parser.py as it was a one-off use case that is no longer needed. --- src/utils/schema_parser.py | 127 ------------------------------------- 1 file changed, 127 deletions(-) delete mode 100644 src/utils/schema_parser.py diff --git a/src/utils/schema_parser.py b/src/utils/schema_parser.py deleted file mode 100644 index b3cea38a2..000000000 --- a/src/utils/schema_parser.py +++ /dev/null @@ -1,127 +0,0 @@ -import math -import toml -import pandas as pd -import pydoop.hdfs as hdfs - -from typing import IO - - -def read_SPP_schema(excel_file, excel_sheet) -> pd.DataFrame: - """Read the updated SPP Snapshot Schema, specifying the name - of the sheet to read. Convert it into a pandas dataframe, - dropping any rows which include NaN values in the 'Field Name' - column. - - Arguments: - excel_file -- the excel file to be converted - excel_sheet -- the name of the excel sheet to be converted - Returns: - A pd.DataFrame: a pandas dataframe object. - """ - xl_dataframe = pd.read_excel(excel_file, sheet_name=excel_sheet, engine="openpyxl") - - # Drop rows with NaN values in the 'Field Name' column - xl_dataframe = xl_dataframe.dropna(subset=["Field Name"]) - - return xl_dataframe - - -def read_DAP_csv(excel_file) -> pd.DataFrame: - """Read an excel file from DAP and convert it into a - pandas dataframe, dropping any 'Unnamed:' columns. - Arguments: - excel_file -- the excel file to be converted - Returns: - A pd.DataFrame: a pandas dataframe object. - """ - with hdfs.open(excel_file, "r") as file: - - # Import csv file and convert to Dataframe - sheet = pd.read_csv(file) - - return sheet - - -def convert_dataFrame(pdFrame: pd.DataFrame) -> dict: - """Convert a pandas dataframe into a dictionary oriented by - index. This makes the keys in the dictionary the row index - of the dataframe, and the values are a dictionary containing - the other key-value information. - - Arguments: - pdFrame -- the pandas dataframe to be converted - - Returns: - A dict: dict object oriented by index - """ - pd_dict = pdFrame.to_dict(orient="index") - return pd_dict - - -def is_nan(value) -> bool: - """Takes in a value and returns a boolean indicating - whether it is a 'not a number' or not. - - Arguments: - value -- Any value - - Returns: - A bool: boolean indicating whether the value is - 'not a number' or not, as determined by the 'math' - module. 
- """ - return math.isnan(float(value)) - - -def reformat_tomlDict(pdDict: dict) -> dict: - """Creates a dictionary suitable to be converted - into a toml file. Takes an index oriented - dictionary as input and creates a new dictionary - - Arguments: - pdDict -- a dictionary - - Returns: - A dict: dictionary ready to be used to create - a toml file. - """ - newDict = {} - tomlDict = {} - - # Loop over input dictionary to create a sub dictionary - for key in pdDict: - newDict[str(key)] = pdDict[key] - - subDict1 = newDict[str(key)] - var = subDict1.pop("Field Name") - - tomlDict[var] = subDict1 - - return tomlDict - - -def create_toml( - pd_dict: dict, output_toml_file: str = "./config/Data_Schema.toml" -) -> IO[str]: - """Write a toml file from a dictionary. - - Arguments: - pd_dict -- A dictionary containing a dictionary as - its values. - output_toml_file -- Path to the output toml file. - (default: {"./config/DataSchema.toml"}) - Returns: - A toml file - IO[str] type indicates a text based file - (.toml) will be returned. - """ - - with open(output_toml_file, "w") as toml_file: - toml.dump(pd_dict, toml_file) - - return toml_file - - -berd_schema_df = read_SPP_schema("./config/SPP Snapshot Schema.xlsx", "contributors") -berd_schema_dict = convert_dataFrame(berd_schema_df) -reshaped_schema_dict = reformat_tomlDict(berd_schema_dict) -tomlfile = create_toml(reshaped_schema_dict, "./config/Data_Schema.toml") From 46d2a73476fa2f58f27e8b9cfa779d7efd797a0e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 25 May 2023 16:10:59 +0100 Subject: [PATCH 036/411] Replaced all Nullable values with True or False, rather than 1.0 and 0.0. Also fixed min/max values to be ints rather than floats. --- config/Data_Schema.toml | 166 ++++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index 9f00c6f5c..df6cf8aba 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -11,17 +11,17 @@ Possible_Categorical_Values = ["nan"] [reference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 11001603625.0 -Max_values = 19891309165.0 +Min_values = 11001603625 +Max_values = 19891309165 Possible_Categorical_Values = ["nan"] [period] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = 6 Min_values = "nan" @@ -31,7 +31,7 @@ Possible_Categorical_Values = [202012] [survey] Description = "All values are 002" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -41,7 +41,7 @@ Possible_Categorical_Values = ["002"] [formid] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "int" Length = 2 Min_values = "nan" @@ -51,7 +51,7 @@ Possible_Categorical_Values = [20, 21] [status] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -61,7 +61,7 @@ Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Cle [statusencoded] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "int" Length = 3 Min_values = "nan" @@ -71,7 +71,7 @@ Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, [receiptdate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = 
"Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -81,7 +81,7 @@ Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" Deduced_Data_Type = "?" -Nullable = 1.0 +Nullable = True Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -91,7 +91,7 @@ Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" Deduced_Data_Type = "?" -Nullable = 1.0 +Nullable = True Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -101,7 +101,7 @@ Possible_Categorical_Values = ["nan"] [formtype] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -111,7 +111,7 @@ Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = 1 Min_values = "nan" @@ -121,147 +121,147 @@ Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", [frozensicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1120.0 -Max_values = 93059.0 +Min_values = 1120 +Max_values = 93059 Possible_Categorical_Values = ["nan"] [rusicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1120.0 -Max_values = 93059.0 +Min_values = 1120 +Max_values = 93059 Possible_Categorical_Values = ["nan"] [frozensic] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1300.0 -Max_values = 96090.0 +Min_values = 1300 +Max_values = 96090 Possible_Categorical_Values = ["nan"] [rusic] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1300.0 -Max_values = 96090.0 +Min_values = 1300 +Max_values = 96090 Possible_Categorical_Values = ["nan"] [frozenemployees] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 272527.0 +Min_values = 0 +Max_values = 272527 Possible_Categorical_Values = ["nan"] [employees] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 272528.0 +Min_values = 0 +Max_values = 272528 Possible_Categorical_Values = ["nan"] [frozenemployment] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 272527.0 +Min_values = 1 +Max_values = 272527 Possible_Categorical_Values = ["nan"] [employment] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 272528.0 +Min_values = 1 +Max_values = 272528 Possible_Categorical_Values = ["nan"] [frozenfteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 177699.0 +Min_values = 0 +Max_values = 177699 Possible_Categorical_Values = ["nan"] [fteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 +Min_values = 0 
Max_values = 177699.5 Possible_Categorical_Values = ["nan"] [frozenturnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 55277352.0 +Min_values = 0 +Max_values = 55277352 Possible_Categorical_Values = ["nan"] [turnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 55277352.0 +Min_values = 0 +Max_values = 55277352 Possible_Categorical_Values = ["nan"] [enterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 1001603625.0 -Max_values = 9891309165.0 +Min_values = 1001603625 +Max_values = 9891309165 Possible_categorical_Values = ["nan"] [wowenterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 1001603625.0 -Max_values = 9891309165.0 +Min_values = 1001603625 +Max_values = 9891309165 Possible_categorical_Values = ["nan"] [cellnumber] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "int" Length = "nan" Min_values = "nan" @@ -271,7 +271,7 @@ Possible_Categorical_Values = [0] [currency] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -281,7 +281,7 @@ Possible_Categorical_Values = ["S", "E"] [vatreference] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -291,7 +291,7 @@ Possible_Categorical_Values = ["VATREF"] [payereference] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -301,7 +301,7 @@ Possible_Categorical_Values = ["PAYEREF"] [companyregistrationnumber] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -311,47 +311,47 @@ Possible_Categorical_Values = ["CRN"] [numberlivelocalunits] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 6063.0 +Min_values = 0 +Max_values = 6063 Possible_categorical_Values = ["nan"] [numberlivevat] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 255.0 +Min_values = 0 +Max_values = 255 Possible_categorical_Values = ["nan"] [numberlivepaye] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 24.0 +Min_values = 0 +Max_values = 24 Possible_categorical_Values = ["nan"] [legalstatus] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 4.0 +Min_values = 1 +Max_values = 4 Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -361,7 +361,7 @@ Possible_Categorical_Values = ["L", "E"] [region] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = 
"str" Length = "nan" Min_values = "nan" @@ -371,7 +371,7 @@ Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", " [birthdate] Description = "Datetime format = format=%d/%m/%Y" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -381,7 +381,7 @@ Possible_categorical_Values = ["nan"] [referencename] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -391,7 +391,7 @@ Possible_categorical_Values = ["nan"] [referencepostcode] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -401,7 +401,7 @@ Possible_categorical_Values = ["nan"] [tradingstyle] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -411,7 +411,7 @@ Possible_categorical_Values = ["nan"] [selectiontype] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -421,7 +421,7 @@ Possible_Categorical_Values = ["L"] [inclusionexclusion] Description = 'All values are " "' Deduced_Data_Type = "?" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -431,7 +431,7 @@ Possible_categorical_Values = ["nan"] [createdby] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -441,7 +441,7 @@ Possible_Categorical_Values = ["ingestion"] [createddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -451,7 +451,7 @@ Possible_categorical_Values = ["nan"] [lastupdatedby] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -461,7 +461,7 @@ Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" From b6fc4574a238e02487abaff1076e4a183fd7f940 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 25 May 2023 16:47:29 +0100 Subject: [PATCH 037/411] Corrected int and str Deduced_Data_Type to category dtype from pandas. 
--- config/Data_Schema.toml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index df6cf8aba..fb92d1fe0 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -40,7 +40,7 @@ Possible_Categorical_Values = ["002"] [formid] Description = "nan" -Deduced_Data_Type = "int" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "int" Length = 2 @@ -50,7 +50,7 @@ Possible_Categorical_Values = [20, 21] [status] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = ">=1" @@ -60,7 +60,7 @@ Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Cle [statusencoded] Description = "nan" -Deduced_Data_Type = "int" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "int" Length = 3 @@ -80,7 +80,7 @@ Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" -Deduced_Data_Type = "?" +Deduced_Data_Type = "pandas.NA" Nullable = True Current_Data_Type = ["None","str"] Length = 0 @@ -90,7 +90,7 @@ Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" -Deduced_Data_Type = "?" +Deduced_Data_Type = "pandas.NA" Nullable = True Current_Data_Type = ["None","str"] Length = 0 @@ -100,7 +100,7 @@ Possible_Categorical_Values = ["nan"] [formtype] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -110,7 +110,7 @@ Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = 1 @@ -204,8 +204,8 @@ Deduced_Data_Type = "float" Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0 -Max_values = 177699 +Min_values = 0.0 +Max_values = 177699.0 Possible_Categorical_Values = ["nan"] [fteemployment] @@ -214,7 +214,7 @@ Deduced_Data_Type = "float" Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0 +Min_values = 0.0 Max_values = 177699.5 Possible_Categorical_Values = ["nan"] @@ -270,7 +270,7 @@ Possible_Categorical_Values = [0] [currency] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -340,7 +340,7 @@ Possible_categorical_Values = ["nan"] [legalstatus] Description = "nan" -Deduced_Data_Type = "int" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -350,7 +350,7 @@ Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -360,7 +360,7 @@ Possible_Categorical_Values = ["L", "E"] [region] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -420,7 +420,7 @@ Possible_Categorical_Values = ["L"] [inclusionexclusion] Description = 'All values are " "' -Deduced_Data_Type = "?" 
+Deduced_Data_Type = "pandas.NA" Nullable = False Current_Data_Type = "str" Length = "nan" @@ -450,7 +450,7 @@ Possible_categorical_Values = ["nan"] [lastupdatedby] Description = "nan" -Deduced_Data_Type = "str" +Deduced_Data_Type = "category" Nullable = False Current_Data_Type = ["None","str"] Length = "nan" From 222736198b2c3dc620371e75b102c8a2366a26bb Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Mon, 5 Jun 2023 10:53:53 +0100 Subject: [PATCH 038/411] added retrieve config to main --- src/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.py b/src/main.py index ff334c7f1..21c8866b8 100644 --- a/src/main.py +++ b/src/main.py @@ -38,6 +38,7 @@ def run_pipeline(start): run_time = round(time.time() - start, 5) runlog_obj._record_time_taken(run_time) + runlog_obj.retrieve_configs() runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() From 749cd5d1f2b09d971b99e5c05435c076f7ac3b22 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 30 Mar 2023 11:00:40 +0100 Subject: [PATCH 039/411] Add coverage folder --- coverage/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 coverage/.gitkeep diff --git a/coverage/.gitkeep b/coverage/.gitkeep new file mode 100644 index 000000000..e69de29bb From cf3cf3dd9a13375a5df83cccb55e56bed56ab633 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 30 Mar 2023 17:20:46 +0100 Subject: [PATCH 040/411] Removed unneeded dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 6ed2d1261..f77f24dff 100644 --- a/environment.yml +++ b/environment.yml @@ -22,3 +22,4 @@ dependencies: - table_logger - toml - xlrd + From de557b7b8d1bb2285af00c918d1f094c27e4e971 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 30 Mar 2023 17:24:00 +0100 Subject: [PATCH 041/411] The coverage badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index dea5a4c7c..836dfc16a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Research and Development Project +![Code Coverage](https://img.shields.io/badge/Coverage-33%25-red.svg) + Calculating national and regional research and development expenditure as part of [national accounts](https://www.ons.gov.uk/economy/nationalaccounts). Additional information about the aims and objectives of the project will go here when it is available. The project is currently in pre-discovery. From bb9e9bf58a9c4c4343897c64ff7710234aa204df Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 30 Mar 2023 17:34:54 +0100 Subject: [PATCH 042/411] Adding pre-commit hook --- .pre-commit-config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2949a3114..d033445c9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,3 +67,9 @@ repos: entry: These file extensions are restricted. 
Data should be removed from the commit language: fail files: .*\.(csv|feather|xlsx|zip|hdf5|h5|txt|json|xml|hd|parquet) + - repo: local + hooks: + - id: coverage-badge + name: Update the coverage badge in the readme + entry: bash -c 'lines=$(readme-cov)' + language: system From 930db4c214aaac3383ce407cb310b3d051789eec Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 18 May 2023 14:05:23 +0100 Subject: [PATCH 043/411] Adding coverage-badge --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index f77f24dff..6ed2d1261 100644 --- a/environment.yml +++ b/environment.yml @@ -22,4 +22,3 @@ dependencies: - table_logger - toml - xlrd - From 29a61408da02802b62de0a54aec6d23a94a301a8 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 18:11:25 +0100 Subject: [PATCH 044/411] including readme-coverage-badger --- .pre-commit-config.yaml | 4 ++-- README.md | 2 +- {coverage => cov_reports}/.gitkeep | 0 cov_reports/update_readme.py | 5 +++++ environment.yml | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) rename {coverage => cov_reports}/.gitkeep (100%) create mode 100644 cov_reports/update_readme.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d033445c9..1de130749 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -71,5 +71,5 @@ repos: hooks: - id: coverage-badge name: Update the coverage badge in the readme - entry: bash -c 'lines=$(readme-cov)' - language: system + entry: python /home/cdsw/research-and-development/cov_reports/update_readme.py + language: python diff --git a/README.md b/README.md index 836dfc16a..629a87859 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Research and Development Project -![Code Coverage](https://img.shields.io/badge/Coverage-33%25-red.svg) +![Code Coverage]() Calculating national and regional research and development expenditure as part of [national accounts](https://www.ons.gov.uk/economy/nationalaccounts). diff --git a/coverage/.gitkeep b/cov_reports/.gitkeep similarity index 100% rename from coverage/.gitkeep rename to cov_reports/.gitkeep diff --git a/cov_reports/update_readme.py b/cov_reports/update_readme.py new file mode 100644 index 000000000..5ac38dd4e --- /dev/null +++ b/cov_reports/update_readme.py @@ -0,0 +1,5 @@ +import subprocess + + +subprocess.run(["python", "-m", "coverage", "run", "-m", "pytest"]) +subprocess.run(["python", "-m", "readme-cov"]) diff --git a/environment.yml b/environment.yml index 6ed2d1261..a96025b78 100644 --- a/environment.yml +++ b/environment.yml @@ -18,6 +18,7 @@ dependencies: - pytest - pytest-cov - python-dotenv + - readme-coverage-badger - setuptools - table_logger - toml From fd4fc83bbf865367c932e128411a9135bb295b2a Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 18:11:55 +0100 Subject: [PATCH 045/411] Updated readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 629a87859..63d64b3b7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Research and Development Project -![Code Coverage]() +![Code Coverage](https://img.shields.io/badge/Coverage-23%25-red.svg) Calculating national and regional research and development expenditure as part of [national accounts](https://www.ons.gov.uk/economy/nationalaccounts). 
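For context on the badge that patches 044 and 045 keep regenerating: the README line is just a shields.io URL embedding the rounded coverage total and a colour. A minimal sketch of the kind of update a tool such as readme-coverage-badger automates is given below; it is illustrative only, not code from this repository, and the colour cut-offs, the reliance on an existing .coverage file and the parsing of the `coverage report` output are all assumptions.

# Illustrative sketch only -- not part of the patch series.
import re
import subprocess


def update_coverage_badge(readme_path="README.md"):
    # Assumes a .coverage file already exists from an earlier coverage/pytest run.
    report = subprocess.run(
        ["python", "-m", "coverage", "report"],
        stdout=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )
    # The final line of `coverage report` ends with the total, e.g. "TOTAL ... 23%".
    total = int(float(report.stdout.strip().splitlines()[-1].split()[-1].rstrip("%")))
    colour = "red" if total < 50 else "yellow" if total < 80 else "green"  # assumed cut-offs
    badge = "https://img.shields.io/badge/Coverage-{}%25-{}.svg".format(total, colour)
    with open(readme_path) as readme:
        text = readme.read()
    text = re.sub(
        r"!\[Code Coverage\]\([^)]*\)", "![Code Coverage]({})".format(badge), text
    )
    with open(readme_path, "w") as readme:
        readme.write(text)

In the patches themselves this job is delegated to the readme-cov entry point rather than hand-rolled.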
From 49846db1bdd7527d4a9032542ba0511f7bdf4dc2 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Mon, 5 Jun 2023 18:13:38 +0100 Subject: [PATCH 046/411] testing commits in 116 --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 21c8866b8..f3c47002e 100644 --- a/src/main.py +++ b/src/main.py @@ -31,7 +31,7 @@ def run_pipeline(start): MainLogger.info("Launching Pipeline .......................") logger.info("Collecting logging parameters ..........") Manipulate_data() - MainLogger.info("Finishing Pipeline .......................") + MainLogger.info("Finishing Pipeline .........................") runlog_obj.retrieve_pipeline_logs() From 8d3e4100eb8ea58fb7e814e1a9fdcc9d4fada7c5 Mon Sep 17 00:00:00 2001 From: allmag Date: Thu, 11 May 2023 18:41:39 +0100 Subject: [PATCH 047/411] Added a function into loading.py and added new script to reformat spp_snapshot dataframe --- src/data_ingest/loading.py | 28 +++++--- src/data_processing/spp_snapshot_processing | 77 +++++++++++++++++++++ 2 files changed, 95 insertions(+), 10 deletions(-) create mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 03fd147fa..5dfd97e05 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -7,16 +7,24 @@ config = conf_obj.config_dict snapshot_path = config["paths"]["snapshot_path"] # Taken from config file -snapdata = hdfs_load_json(snapshot_path) -contributerdict = snapdata["contributors"] -responsesdict = snapdata["responses"] +def load_snapshot_data(snapshot_path, data_type): -contributers = pd.DataFrame(contributerdict) -responses = pd.DataFrame(responsesdict) + """Load data from SPP Snapshot file in HUE and return two DataFrames containing + contributor and response data respectively. -print(contributers.head()) -print("\n") -print(responses.head()) -print("\n") -print([responses["questioncode"].unique()]) + Arguments: + snapshot_path -- Filepath + data_type -- String with value either "contributors" or "responses". + Determines which part of the snapshot file should be loaded. + + Returns: + data -- DataFrame containing either contributor or response data for BERD + from SPP Snapshot file + """ + + snapshot_data = hdfs_load_json(snapshot_path) + + data = pd.DataFrame(snapshot_data[data_type]) + + return data \ No newline at end of file diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing new file mode 100644 index 000000000..6479d71f8 --- /dev/null +++ b/src/data_processing/spp_snapshot_processing @@ -0,0 +1,77 @@ +import pandas as pd + +from src.utils.helpers import Config_settings +from src.utils.hdfs_mods import hdfs_load_json + +conf_obj = Config_settings() +config = conf_obj.config_dict +snapshot_path = config["snapshot_path"] # Taken from config file + +from src.data_ingest.loading import load_snapshot_data + + +def full_responses(contributors, responses): + + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through + having each questioncode as its own column. 
+ + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + full_responses -- DataFrame containing both response and contributor data + """ + + drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] + + unique_id_cols = ["reference", "period", "survey"] + + contributors_dropped = contributors.drop(drop_cols, axis=1) + responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) + + merged_df = contributors_dropped.merge(responses_dropped, + on = unique_id_cols) + + contextual_df = merged_df.drop(["questioncode", "response"], + axis=1).drop_duplicates() + + response_df = merged_df.pivot_table(index = unique_id_cols, + columns='questioncode', + values='response', + aggfunc=','.join).reset_index() + + full_responses = response_df.merge(contextual_df, on = unique_id_cols) + + return full_responses + + +def response_rate(contributors, responses): + + """Generates a response rate based on the contributor and response data + from the SPP Snapshot file. + + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + response_rate -- Float representing proportion of contributors who responded + """ + + no_responses = len(responses["reference"].unique()) + no_contributors = len(contributors["reference"].unique()) + + response_rate = no_responses / no_contributors + + return response_rate + +contributors = load_snapshot_data(snapshot_path, data_type = "contributors") +responses = load_snapshot_data(snapshot_path, data_type = "responses") + +full_responses = full_responses(contributors, responses) + +print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) From a2b5cde13ab53b06760ceb830fc4fc902005eb64 Mon Sep 17 00:00:00 2001 From: allmag Date: Tue, 23 May 2023 12:05:23 +0100 Subject: [PATCH 048/411] Added to main.py and fixed test# --- src/data_ingest/loading.py | 32 ++++++------ ..._processing => spp_snapshot_processing.py} | 10 +--- src/main.py | 36 ++++++++++++- .../test_spp_snapshot_processing.py | 50 +++++++++++++++++++ 4 files changed, 102 insertions(+), 26 deletions(-) rename src/data_processing/{spp_snapshot_processing => spp_snapshot_processing.py} (84%) create mode 100644 tests/test_data_processing/test_spp_snapshot_processing.py diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 5dfd97e05..3a6a102a8 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -1,30 +1,32 @@ import pandas as pd +from typing import Tuple from src.utils.helpers import Config_settings from src.utils.hdfs_mods import hdfs_load_json + conf_obj = Config_settings() config = conf_obj.config_dict snapshot_path = config["paths"]["snapshot_path"] # Taken from config file +def parse_snap_data(snapdata: dict = snapshot_path) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Loads the data from the survey via the SPP snapshot. The data is supplied as dict + and is parsed into dataframes, one for survey contributers (company details) + and another one for their responses. -def load_snapshot_data(snapshot_path, data_type): - - """Load data from SPP Snapshot file in HUE and return two DataFrames containing - contributor and response data respectively. 
- - Arguments: - snapshot_path -- Filepath - data_type -- String with value either "contributors" or "responses". - Determines which part of the snapshot file should be loaded. + Args: + snapdata (dict, optional): The data from the SPP snapshot. Defaults to snapdata. Returns: - data -- DataFrame containing either contributor or response data for BERD - from SPP Snapshot file + Tuple[pd.DataFrame, pd.DataFrame]: The contributers and responders dataframes """ + # Load the dicts + snapdata = hdfs_load_json(snapshot_path) + contributordict = snapdata["contributors"] + responsesdict = snapdata["responses"] - snapshot_data = hdfs_load_json(snapshot_path) - - data = pd.DataFrame(snapshot_data[data_type]) + # Make dataframes + contributors_df = pd.DataFrame(contributordict) + responses_df = pd.DataFrame(responsesdict) - return data \ No newline at end of file + return contributors_df, responses_df \ No newline at end of file diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing.py similarity index 84% rename from src/data_processing/spp_snapshot_processing rename to src/data_processing/spp_snapshot_processing.py index 6479d71f8..f8e854d12 100644 --- a/src/data_processing/spp_snapshot_processing +++ b/src/data_processing/spp_snapshot_processing.py @@ -7,8 +7,6 @@ config = conf_obj.config_dict snapshot_path = config["snapshot_path"] # Taken from config file -from src.data_ingest.loading import load_snapshot_data - def full_responses(contributors, responses): @@ -41,7 +39,7 @@ def full_responses(contributors, responses): response_df = merged_df.pivot_table(index = unique_id_cols, columns='questioncode', values='response', - aggfunc=','.join).reset_index() + aggfunc='first').reset_index() full_responses = response_df.merge(contextual_df, on = unique_id_cols) @@ -69,9 +67,3 @@ def response_rate(contributors, responses): return response_rate -contributors = load_snapshot_data(snapshot_path, data_type = "contributors") -responses = load_snapshot_data(snapshot_path, data_type = "responses") - -full_responses = full_responses(contributors, responses) - -print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) diff --git a/src/main.py b/src/main.py index f3c47002e..7300b00e0 100644 --- a/src/main.py +++ b/src/main.py @@ -5,6 +5,8 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data +from src.data_ingest import loading +from src.data_processing import spp_snapshot_processing as processing import time import logging @@ -21,17 +23,47 @@ def run_pipeline(start): generated from the time module using time.time() """ + # Get the config seetings conf_obj = Config_settings() config = conf_obj.config_dict global_config = config["global"] + # Set up the run logger runlog_obj = runlog.RunLog(config, version) logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") logger.info("Collecting logging parameters ..........") Manipulate_data() - MainLogger.info("Finishing Pipeline .........................") + + # Data Ingest + contributors_df, responses_df = loading.parse_snap_data() + full_responses = processing.full_responses(contributors_df, responses_df) + + logger.info("The response rate is %.3%", processing.response_rate(contributors_df, responses_df)) + + + # Data validation + + # Outlier detection + + # Data cleaning + + # Data processing: Imputation + + # Data processing: Estimation + + # Data 
processing: Regional Apportionment + + # Data processing: Aggregation + + # Data display: Visualisations + + # Data output: Disclosure Control + + # Data output: File Outputs + + MainLogger.info("Finishing Pipeline .......................") runlog_obj.retrieve_pipeline_logs() @@ -42,4 +74,4 @@ def run_pipeline(start): runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() + runlog_obj._write_runlog() \ No newline at end of file diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py new file mode 100644 index 000000000..8095c3da1 --- /dev/null +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -0,0 +1,50 @@ +"""Unit testing module.""" +# Import testing packages +import pandas as pd + +# Import modules to test +from src.data_processing.spp_snapshot_processing import full_responses + +class TestFullResponses: + """Tests for full_responses function.""" + + def test_full_responses(self): + + # Import modules to test + from src.data_processing.spp_snapshot_processing import full_responses + + contributor_data = pd.DataFrame({ + "reference": [101, 102], + "period": [202012, 202012], + "survey": [1, 1], + "createdby": ["A", "A"], + "createddate": [2020, 2020], + "lastupdatedby": ["A", "A"], + "lastupdateddate": [2020, 2020] + }) + + responses_data = pd.DataFrame({ + "reference": [101, 101, 101, 102, 102, 102], + "period": [202012, 202012, 202012, 202012, 202012, 202012], + "survey": [1, 1, 1, 1, 1, 1], + "createdby": ["A", "A", "A", "A", "A", "A"], + "createddate": [2020, 2020, 2020, 2020, 2020, 2020], + "lastupdatedby": ["A", "A", "A", "A", "A", "A"], + "lastupdateddate": [2020, 2020, 2020, 2020, 2020, 2020], + "questioncode": [200, 201, 202, 200, 201, 202], + "response": [0, 50, 100, 75, 25, 65], + "adjustedresponse": ["","","","","",""] + }) + + expected_output = pd.DataFrame({ + "reference": [101, 102], + "period": [202012, 202012], + "survey": [1, 1], + 200: [0, 75], + 201: [50, 25], + 202: [100, 65] + }) + + df_result = full_responses(contributor_data, responses_data) + + pd.testing.assert_frame_equal(df_result, expected_output) From 32207fa2fd340d1807967cfdb1b4e3c77008b92c Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 14:19:12 +0100 Subject: [PATCH 049/411] Adding ignore for tests involving pydoop --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index e46679180..77ce1c5af 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -29,7 +29,7 @@ jobs: # Specify shell to run the command in shell: bash -l {0} run: | - coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 + coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 # 4) Get the coverage report in to the pull request comments - name: Pytest coverage comment uses: MishaKav/pytest-coverage-comment@main From 74dac9a1acd6c2cb37fcd2c055f5f14117a771b4 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 14:33:40 +0100 Subject: [PATCH 050/411] Faking import of pydoop --- src/conftest.py | 5 +++++ 1 file changed, 5 
insertions(+) create mode 100644 src/conftest.py diff --git a/src/conftest.py b/src/conftest.py new file mode 100644 index 000000000..4b986834f --- /dev/null +++ b/src/conftest.py @@ -0,0 +1,5 @@ +"""Mocking the import of Pydoop""" +import sys + +module = type(sys)("pydoop") +sys.modules["pydoop"] = module From c8b4566dd2aa5fabae65c88dde8ff89f92895184 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 14:38:45 +0100 Subject: [PATCH 051/411] Try creating sub module --- src/conftest.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/conftest.py b/src/conftest.py index 4b986834f..e0eed9f64 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -1,5 +1,11 @@ """Mocking the import of Pydoop""" import sys -module = type(sys)("pydoop") -sys.modules["pydoop"] = module + +def hdfs_fake(): + pass + + +pydoop_fake = type(sys)("pydoop") +pydoop_fake.hdfs = hdfs_fake +sys.modules["pydoop"] = pydoop_fake From 44fbf62cb5490800e0fd10104f0eaf974374caac Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 14:56:44 +0100 Subject: [PATCH 052/411] Add requi file. Remove txt from pre-commit check --- .pre-commit-config.yaml | 8 +------- requirements.txt | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 requirements.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1de130749..e86459372 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,10 +66,4 @@ repos: name: Check commits for restricted file extensions entry: These file extensions are restricted. Data should be removed from the commit language: fail - files: .*\.(csv|feather|xlsx|zip|hdf5|h5|txt|json|xml|hd|parquet) - - repo: local - hooks: - - id: coverage-badge - name: Update the coverage badge in the readme - entry: python /home/cdsw/research-and-development/cov_reports/update_readme.py - language: python + files: .*\.(csv|feather|xlsx|zip|hdf5|h5|json|xml|hd|parquet) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..19bbd5b53 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +- python==3.6 +- arrow +- cookiecutter +- detect-secrets +- myst-parser +- pre-commit==2.17.0 +- python-dotenv +- table_logger +- pandas==1.1.5 +- numpy +- pydoop +- setuptools +- pytest +- coverage +- pyyaml +- requests +- sphinx +- pip From 16978d0c81164c3c962bafbce6697035ff39a230 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 15:00:28 +0100 Subject: [PATCH 053/411] Updating action to include pydoop install --- .github/workflows/pytest-action.yaml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 77ce1c5af..3e6f67159 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -13,17 +13,20 @@ jobs: # 1) Checkout the code - uses: actions/checkout@v3 -# 2) Removing PyDoop from the environment yaml - - name: Remove pydoop dependency - shell: bash -l {0} - run: | - awk '!/pydoop.*/' environment.yml > temp && mv temp environment.yml -# 3) Use Setup Miniconda github action to setup environment - - uses: conda-incubator/setup-miniconda@v2 - with: - python-version: 3.6 - environment-file: environment.yml - activate-environment: resdev36 +# # 2) Removing PyDoop from the environment yaml +# - name: Remove pydoop dependency +# shell: bash -l {0} +# run: | +# awk '!/pydoop.*/' environment.yml > temp && mv temp environment.yml +# # 3) Use 
Setup Miniconda github action to setup environment +# - uses: conda-incubator/setup-miniconda@v2 +# with: +# python-version: 3.6 +# environment-file: environment.yml +# activate-environment: resdev36 +# install dependencies from requirements.txt + - name: Install dependencies + run: pip install requirements.txt # 3) Run pytest to run all tests in the tests folder - name: Use coverage to run pytest # Specify shell to run the command in From ddeea0471e81089a560253782870288cf50b9e67 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 15:03:09 +0100 Subject: [PATCH 054/411] forgot -r --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 3e6f67159..0403ef1ef 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -26,7 +26,7 @@ jobs: # activate-environment: resdev36 # install dependencies from requirements.txt - name: Install dependencies - run: pip install requirements.txt + run: pip install -r requirements.txt # 3) Run pytest to run all tests in the tests folder - name: Use coverage to run pytest # Specify shell to run the command in From 5fbc76ea3d147f7446704a1a7dc23c7eef1022ae Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 15:07:07 +0100 Subject: [PATCH 055/411] correcting format of requir file --- requirements.txt | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 19bbd5b53..fb51bb6cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ -- python==3.6 -- arrow -- cookiecutter -- detect-secrets -- myst-parser -- pre-commit==2.17.0 -- python-dotenv -- table_logger -- pandas==1.1.5 -- numpy -- pydoop -- setuptools -- pytest -- coverage -- pyyaml -- requests -- sphinx -- pip +python==3.6 +arrow +cookiecutter +detect-secrets +myst-parser +pre-commit==2.17.0 +python-dotenv +table_logger +pandas==1.1.5 +numpy +pydoop +setuptools +pytest +coverage +pyyaml +requests +sphinx +pip From c9a97fbc75db4f756a96247deeee9b848b6e6df4 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 15:16:02 +0100 Subject: [PATCH 056/411] removing python as dependcy --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fb51bb6cd..0b68aaafb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -python==3.6 +# python==3.6 arrow cookiecutter detect-secrets @@ -15,4 +15,3 @@ coverage pyyaml requests sphinx -pip From 591f4b2ce1b0a0e752bcc4924d1ea77bb7f8fae4 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 15:21:35 +0100 Subject: [PATCH 057/411] apt-get installing hdfs reqs --- .github/workflows/pytest-action.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 0403ef1ef..f64aeac83 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -24,6 +24,12 @@ jobs: # python-version: 3.6 # environment-file: environment.yml # activate-environment: resdev36 +# install dependencies for pydoop + - name: Install pydoop dependencies + run: | + sudo apt-get update + sudo apt-get install -y libhdfs3 libhdfspp-dev + # install dependencies from requirements.txt - name: Install dependencies run: pip install -r requirements.txt From ba6ea89699b15e4cc83b5fc716126b7ac05f1001 Mon Sep 17 00:00:00 2001 
From: westwj1 Date: Wed, 31 May 2023 16:29:19 +0100 Subject: [PATCH 058/411] Testing if jdk installs --- .github/workflows/pytest-action.yaml | 40 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index f64aeac83..7ef22d2cb 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -28,24 +28,24 @@ jobs: - name: Install pydoop dependencies run: | sudo apt-get update - sudo apt-get install -y libhdfs3 libhdfspp-dev + sudo apt-get install -y openjdk-8-jdk -# install dependencies from requirements.txt - - name: Install dependencies - run: pip install -r requirements.txt -# 3) Run pytest to run all tests in the tests folder - - name: Use coverage to run pytest - # Specify shell to run the command in - shell: bash -l {0} - run: | - coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 -# 4) Get the coverage report in to the pull request comments - - name: Pytest coverage comment - uses: MishaKav/pytest-coverage-comment@main - with: - title: Detailed Coverage Report - badge-title: Percentage Coverage for this PR - pytest-xml-coverage-path: ./python_coverage.xml - coverage-path-prefix: src/ - junitxml-title: Summary of tests - junitxml-path: ./junit_result.xml +# # install dependencies from requirements.txt +# - name: Install dependencies +# run: pip install -r requirements.txt +# # 3) Run pytest to run all tests in the tests folder +# - name: Use coverage to run pytest +# # Specify shell to run the command in +# shell: bash -l {0} +# run: | +# coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 +# # 4) Get the coverage report in to the pull request comments +# - name: Pytest coverage comment +# uses: MishaKav/pytest-coverage-comment@main +# with: +# title: Detailed Coverage Report +# badge-title: Percentage Coverage for this PR +# pytest-xml-coverage-path: ./python_coverage.xml +# coverage-path-prefix: src/ +# junitxml-title: Summary of tests +# junitxml-path: ./junit_result.xml From 52c8907a4d8eb79954f78c9a48580390129f8635 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 16:36:44 +0100 Subject: [PATCH 059/411] Trying jre dependencies --- .github/workflows/pytest-action.yaml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 7ef22d2cb..42004cc3e 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -29,10 +29,18 @@ jobs: run: | sudo apt-get update sudo apt-get install -y openjdk-8-jdk + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 + export PATH=$JAVA_HOME/bin:$PATH + wget https://archive.apache.org/dist/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz + tar -xzf hadoop-3.2.1.tar.gz + export HADOOP_HOME=$PWD/hadoop-3.2.1 + export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH + export CLASSPATH=$(hadoop classpath --glob):$CLASSPATH -# # install dependencies from requirements.txt -# - name: Install dependencies -# run: pip install -r requirements.txt + +# install dependencies from requirements.txt + - name: Install dependencies + run: pip install -r 
requirements.txt # # 3) Run pytest to run all tests in the tests folder # - name: Use coverage to run pytest # # Specify shell to run the command in From 3313bbc95dfa4c58ecfb95b3b348a16d47fa7fe0 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:09:56 +0100 Subject: [PATCH 060/411] Importing conftest --- .../test_spp_snapshot_processing.py | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index 8095c3da1..5dc0afeab 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -2,49 +2,55 @@ # Import testing packages import pandas as pd -# Import modules to test -from src.data_processing.spp_snapshot_processing import full_responses +import conftest # noqa + class TestFullResponses: """Tests for full_responses function.""" def test_full_responses(self): - + # Import modules to test from src.data_processing.spp_snapshot_processing import full_responses - contributor_data = pd.DataFrame({ - "reference": [101, 102], - "period": [202012, 202012], - "survey": [1, 1], - "createdby": ["A", "A"], - "createddate": [2020, 2020], - "lastupdatedby": ["A", "A"], - "lastupdateddate": [2020, 2020] - }) - - responses_data = pd.DataFrame({ - "reference": [101, 101, 101, 102, 102, 102], - "period": [202012, 202012, 202012, 202012, 202012, 202012], - "survey": [1, 1, 1, 1, 1, 1], - "createdby": ["A", "A", "A", "A", "A", "A"], - "createddate": [2020, 2020, 2020, 2020, 2020, 2020], - "lastupdatedby": ["A", "A", "A", "A", "A", "A"], - "lastupdateddate": [2020, 2020, 2020, 2020, 2020, 2020], - "questioncode": [200, 201, 202, 200, 201, 202], - "response": [0, 50, 100, 75, 25, 65], - "adjustedresponse": ["","","","","",""] - }) - - expected_output = pd.DataFrame({ - "reference": [101, 102], - "period": [202012, 202012], - "survey": [1, 1], - 200: [0, 75], - 201: [50, 25], - 202: [100, 65] - }) - + contributor_data = pd.DataFrame( + { + "reference": [101, 102], + "period": [202012, 202012], + "survey": [1, 1], + "createdby": ["A", "A"], + "createddate": [2020, 2020], + "lastupdatedby": ["A", "A"], + "lastupdateddate": [2020, 2020], + } + ) + + responses_data = pd.DataFrame( + { + "reference": [101, 101, 101, 102, 102, 102], + "period": [202012, 202012, 202012, 202012, 202012, 202012], + "survey": [1, 1, 1, 1, 1, 1], + "createdby": ["A", "A", "A", "A", "A", "A"], + "createddate": [2020, 2020, 2020, 2020, 2020, 2020], + "lastupdatedby": ["A", "A", "A", "A", "A", "A"], + "lastupdateddate": [2020, 2020, 2020, 2020, 2020, 2020], + "questioncode": [200, 201, 202, 200, 201, 202], + "response": [0, 50, 100, 75, 25, 65], + "adjustedresponse": ["", "", "", "", "", ""], + } + ) + + expected_output = pd.DataFrame( + { + "reference": [101, 102], + "period": [202012, 202012], + "survey": [1, 1], + 200: [0, 75], + 201: [50, 25], + 202: [100, 65], + } + ) + df_result = full_responses(contributor_data, responses_data) pd.testing.assert_frame_equal(df_result, expected_output) From c8edd250f19928e5597865f440c80525513dda5f Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:12:45 +0100 Subject: [PATCH 061/411] Stop full installation of pydoop JRE etc --- .github/workflows/pytest-action.yaml | 32 ++++++++-------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 
42004cc3e..a4391f76b 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -13,30 +13,16 @@ jobs: # 1) Checkout the code - uses: actions/checkout@v3 -# # 2) Removing PyDoop from the environment yaml -# - name: Remove pydoop dependency -# shell: bash -l {0} -# run: | -# awk '!/pydoop.*/' environment.yml > temp && mv temp environment.yml -# # 3) Use Setup Miniconda github action to setup environment -# - uses: conda-incubator/setup-miniconda@v2 -# with: -# python-version: 3.6 -# environment-file: environment.yml -# activate-environment: resdev36 -# install dependencies for pydoop - - name: Install pydoop dependencies +# 2) Removing PyDoop from the environment yaml + - name: Remove pydoop requirements.txt + shell: bash -l {0} run: | - sudo apt-get update - sudo apt-get install -y openjdk-8-jdk - export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 - export PATH=$JAVA_HOME/bin:$PATH - wget https://archive.apache.org/dist/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz - tar -xzf hadoop-3.2.1.tar.gz - export HADOOP_HOME=$PWD/hadoop-3.2.1 - export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH - export CLASSPATH=$(hadoop classpath --glob):$CLASSPATH - + awk '!/pydoop.*/' requirements.txt> temp && mv temp requirements.txt +# 3) Set up Python + - name: Set up Python 3.6 + uses: actions/setup-python@v4 + with: + python-version: '3.6' # install dependencies from requirements.txt - name: Install dependencies From 8aa143b0ceba34c2a6a4ca8b8636ca817561e761 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:17:12 +0100 Subject: [PATCH 062/411] Running coverage and pytest --- .github/workflows/pytest-action.yaml | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index a4391f76b..94fd0f087 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -27,19 +27,19 @@ jobs: # install dependencies from requirements.txt - name: Install dependencies run: pip install -r requirements.txt -# # 3) Run pytest to run all tests in the tests folder -# - name: Use coverage to run pytest -# # Specify shell to run the command in -# shell: bash -l {0} -# run: | -# coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 -# # 4) Get the coverage report in to the pull request comments -# - name: Pytest coverage comment -# uses: MishaKav/pytest-coverage-comment@main -# with: -# title: Detailed Coverage Report -# badge-title: Percentage Coverage for this PR -# pytest-xml-coverage-path: ./python_coverage.xml -# coverage-path-prefix: src/ -# junitxml-title: Summary of tests -# junitxml-path: ./junit_result.xml +# 3) Run pytest to run all tests in the tests folder + - name: Use coverage to run pytest + # Specify shell to run the command in + shell: bash -l {0} + run: | + coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 +# 4) Get the coverage report in to the pull request comments + - name: Pytest coverage comment + uses: MishaKav/pytest-coverage-comment@main + with: + title: Detailed Coverage Report + badge-title: Percentage Coverage for this PR + pytest-xml-coverage-path: ./python_coverage.xml + 
coverage-path-prefix: src/ + junitxml-title: Summary of tests + junitxml-path: ./junit_result.xml From ca65853fe0dc1aad4c4a1153ce8bfd07d82fab7b Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:22:16 +0100 Subject: [PATCH 063/411] Moved conftest to tests folder --- {src => tests}/conftest.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {src => tests}/conftest.py (100%) diff --git a/src/conftest.py b/tests/conftest.py similarity index 100% rename from src/conftest.py rename to tests/conftest.py From 0a411b94ce68a75fa8b72c885b1bae9b6068b1be Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:25:36 +0100 Subject: [PATCH 064/411] Moving conftest to root --- conftest.py | 11 +++++++++++ tests/conftest.py | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 tests/conftest.py diff --git a/conftest.py b/conftest.py index e69de29bb..e0eed9f64 100644 --- a/conftest.py +++ b/conftest.py @@ -0,0 +1,11 @@ +"""Mocking the import of Pydoop""" +import sys + + +def hdfs_fake(): + pass + + +pydoop_fake = type(sys)("pydoop") +pydoop_fake.hdfs = hdfs_fake +sys.modules["pydoop"] = pydoop_fake diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index e0eed9f64..000000000 --- a/tests/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Mocking the import of Pydoop""" -import sys - - -def hdfs_fake(): - pass - - -pydoop_fake = type(sys)("pydoop") -pydoop_fake.hdfs = hdfs_fake -sys.modules["pydoop"] = pydoop_fake From 000e2ae0371ee3a595548213c2c47b8cb8235a52 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:34:23 +0100 Subject: [PATCH 065/411] Module not used --- .../spp_snapshot_processing.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index f8e854d12..f6cea4dc8 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -1,7 +1,8 @@ -import pandas as pd +# import pandas as pd from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json + +# from src.utils.hdfs_mods import hdfs_load_json conf_obj = Config_settings() config = conf_obj.config_dict @@ -10,8 +11,8 @@ def full_responses(contributors, responses): - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through having each questioncode as its own column. 
Arguments: @@ -30,25 +31,24 @@ def full_responses(contributors, responses): contributors_dropped = contributors.drop(drop_cols, axis=1) responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - merged_df = contributors_dropped.merge(responses_dropped, - on = unique_id_cols) + merged_df = contributors_dropped.merge(responses_dropped, on=unique_id_cols) - contextual_df = merged_df.drop(["questioncode", "response"], - axis=1).drop_duplicates() + contextual_df = merged_df.drop( + ["questioncode", "response"], axis=1 + ).drop_duplicates() - response_df = merged_df.pivot_table(index = unique_id_cols, - columns='questioncode', - values='response', - aggfunc='first').reset_index() + response_df = merged_df.pivot_table( + index=unique_id_cols, columns="questioncode", values="response", aggfunc="first" + ).reset_index() - full_responses = response_df.merge(contextual_df, on = unique_id_cols) + full_responses = response_df.merge(contextual_df, on=unique_id_cols) return full_responses def response_rate(contributors, responses): - """Generates a response rate based on the contributor and response data + """Generates a response rate based on the contributor and response data from the SPP Snapshot file. Arguments: @@ -66,4 +66,3 @@ def response_rate(contributors, responses): response_rate = no_responses / no_contributors return response_rate - From 07241922ab3182d49567023046e37f9b800016a4 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:36:18 +0100 Subject: [PATCH 066/411] Updating the keys --- src/data_processing/spp_snapshot_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index f6cea4dc8..66fea1e61 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -6,7 +6,7 @@ conf_obj = Config_settings() config = conf_obj.config_dict -snapshot_path = config["snapshot_path"] # Taken from config file +snapshot_path = config["paths"]["snapshot_path"] # Taken from config file def full_responses(contributors, responses): From 9625f9c829d697ce16089f3195ace47f6399f2e8 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 17:37:41 +0100 Subject: [PATCH 067/411] Tidying --- src/data_processing/spp_snapshot_processing.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index 66fea1e61..f92b33781 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -1,9 +1,5 @@ -# import pandas as pd - from src.utils.helpers import Config_settings -# from src.utils.hdfs_mods import hdfs_load_json - conf_obj = Config_settings() config = conf_obj.config_dict snapshot_path = config["paths"]["snapshot_path"] # Taken from config file From 2365a7f0fba8757ef72613043a53b53426454654 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:18:25 +0100 Subject: [PATCH 068/411] Fixing tests in vscode --- conftest.py => otherconftest.py | 0 tests/conftest.py | 6 ++++++ .../test_spp_snapshot_processing.py | 2 -- tests/test_utils/test_hdfs_mods.py | 20 +++++++++---------- 4 files changed, 16 insertions(+), 12 deletions(-) rename conftest.py => otherconftest.py (100%) create mode 100644 tests/conftest.py diff --git a/conftest.py b/otherconftest.py similarity index 100% rename from conftest.py rename to otherconftest.py diff --git a/tests/conftest.py 
b/tests/conftest.py new file mode 100644 index 000000000..cb47bf3a3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,6 @@ +import pytest +import os + +hdfs_skip = pytest.mark.skipif( + os.environ.get("USER") == "cdsw", reason="HDFS cannot be accessed from Jenkins" +) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index 5dc0afeab..b144260f7 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -2,8 +2,6 @@ # Import testing packages import pandas as pd -import conftest # noqa - class TestFullResponses: """Tests for full_responses function.""" diff --git a/tests/test_utils/test_hdfs_mods.py b/tests/test_utils/test_hdfs_mods.py index 998b8bad4..84cbc10ed 100644 --- a/tests/test_utils/test_hdfs_mods.py +++ b/tests/test_utils/test_hdfs_mods.py @@ -4,7 +4,10 @@ import pandas as pd # Import modules to test -from src.utils.hdfs_mods import read_hdfs_csv, write_hdfs_csv, hdfs_load_json +import sys + +sys.modules["mock_f"] = mock.Mock() +from src.utils.hdfs_mods import read_hdfs_csv, write_hdfs_csv, hdfs_load_json # noqa class TestReadCsv: @@ -37,14 +40,13 @@ def expout_data(self): def test_read_hdfs_csv(self, mock_hdfs, mock_pd_csv): """Test the expected functionality of read_hdfs_csv.""" - mock_f = mock.Mock() - mock_hdfs.open.return_value.__enter__.return_value = mock_f + mock_hdfs.open.return_value.__enter__.return_value = sys.modules["mock_f"] mock_pd_csv.read_csv.return_value = self.input_data() df_result = read_hdfs_csv("file/path/filename.csv") - mock_pd_csv.read_csv.assert_called_with(mock_f) + mock_pd_csv.read_csv.assert_called_with(sys.modules["mock_f"]) df_expout = self.expout_data() pd.testing.assert_frame_equal(df_result, df_expout) @@ -55,14 +57,13 @@ class TestWriteCsv: def test_write_hdfs_csv(self, mock_hdfs): """Test the expected functionality of write_hdfs_csv.""" - mock_f = mock.Mock() - mock_hdfs.open.return_value.__enter__.return_value = mock_f + mock_hdfs.open.return_value.__enter__.return_value = sys.modules["mock_f"] test_df = pd.DataFrame({"col": ["data"]}) with mock.patch.object(test_df, "to_csv") as to_csv_mock: write_hdfs_csv("file/path/filename.csv", test_df) - to_csv_mock.assert_called_with(mock_f, index=False) + to_csv_mock.assert_called_with(sys.modules["mock_f"], index=False) class TestLoadJson: @@ -95,14 +96,13 @@ def expout_data(self): def test_hdfs_load_json(self, mock_hdfs, mock_json): """Test the expected functionality of hdfs_load_json.""" - mock_f = mock.Mock() - mock_hdfs.open.return_value.__enter__.return_value = mock_f + mock_hdfs.open.return_value.__enter__.return_value = sys.modules["mock_f"] mock_json.load.return_value = self.input_data() json_result = hdfs_load_json("file/path/filename.json") - mock_json.load.assert_called_with(mock_f) + mock_json.load.assert_called_with(sys.modules["mock_f"]) json_expout = self.expout_data() From 6a9b91e5235a02341e0febb146e2b606801b6a60 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:22:13 +0100 Subject: [PATCH 069/411] test actions without toml line --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b521de66f..d8d811a19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,7 @@ profile = "black" [tool.pytest.ini_options] addopts = [ "-vv", - "--doctest-modules", - "--ignore=tests/test_utils/test_hdfs_mods.py" + "--doctest-modules" ] 
doctest_optionflags = "NORMALIZE_WHITESPACE" testpaths = [ From 2dcb9c1d4a8151cd3a4fae4bcb4a98e2a00e3fe5 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:28:09 +0100 Subject: [PATCH 070/411] omit hdfs_mods from coverage --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d8d811a19..b1cb7afb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,8 @@ # `coverage` configurations [tool.coverage.run] +omit = + # omit this single file + src/utils/hdfs_mods.py source = [ "./src" ] From 928ecffef9c3f5c819172eb899e2a478c4d84727 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:30:15 +0100 Subject: [PATCH 071/411] fixing toml error --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1cb7afb6..dc189440a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,6 @@ # `coverage` configurations [tool.coverage.run] -omit = - # omit this single file - src/utils/hdfs_mods.py +omit = src/utils/hdfs_mods.py source = [ "./src" ] From fb718b9738b5a58301f2fcf26733ee508a618473 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:32:47 +0100 Subject: [PATCH 072/411] added line to workflow --- .github/workflows/pytest-action.yaml | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 94fd0f087..23f0007a9 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -32,7 +32,7 @@ jobs: # Specify shell to run the command in shell: bash -l {0} run: | - coverage run --branch --source=./src -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 + coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 # 4) Get the coverage report in to the pull request comments - name: Pytest coverage comment uses: MishaKav/pytest-coverage-comment@main diff --git a/pyproject.toml b/pyproject.toml index dc189440a..28be6f52c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,5 @@ # `coverage` configurations [tool.coverage.run] -omit = src/utils/hdfs_mods.py source = [ "./src" ] @@ -18,7 +17,8 @@ profile = "black" [tool.pytest.ini_options] addopts = [ "-vv", - "--doctest-modules" + "--doctest-modules", + ] doctest_optionflags = "NORMALIZE_WHITESPACE" testpaths = [ From 3be6e34a54256341de1ec61a6f2bc5145fc4a69c Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:37:12 +0100 Subject: [PATCH 073/411] omitted runlog and wrappers --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 23f0007a9..d46661a22 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -32,7 +32,7 @@ jobs: # Specify shell to run the command in shell: bash -l {0} run: | - coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 + coverage run --branch --source=./src 
--omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 # 4) Get the coverage report in to the pull request comments - name: Pytest coverage comment uses: MishaKav/pytest-coverage-comment@main From 9dcb773217637cc01d81ffc3025c2a67ccf38ba9 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Fri, 2 Jun 2023 11:42:27 +0100 Subject: [PATCH 074/411] added main and version to omits --- .github/workflows/pytest-action.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index d46661a22..28e16f2e3 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -32,7 +32,9 @@ jobs: # Specify shell to run the command in shell: bash -l {0} run: | - coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml -o python_coverage.xml && coverage report -m --fail-under=10 + coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/main.py \ + -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ + -o python_coverage.xml && coverage report -m --fail-under=10 # 4) Get the coverage report in to the pull request comments - name: Pytest coverage comment uses: MishaKav/pytest-coverage-comment@main From 830c513a30fe581b1c5bc18413675f620cbb4598 Mon Sep 17 00:00:00 2001 From: jwestw Date: Fri, 2 Jun 2023 13:20:34 +0100 Subject: [PATCH 075/411] Abstracting dummy data. Functionalising code. 
Testing response rate func --- .../test_spp_snapshot_processing.py | 114 ++++++++++-------- 1 file changed, 67 insertions(+), 47 deletions(-) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index b144260f7..61c9ef834 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -1,54 +1,74 @@ """Unit testing module.""" # Import testing packages import pandas as pd +import pytest -class TestFullResponses: +@pytest.fixture +def dummy_data(): + # Set up the dummy data + contributor_data = pd.DataFrame( + { + "reference": [101, 102, 103], + "period": [202012, 202012, 202012], + "survey": [1, 1, 1], + "createdby": ["James", "Ilyas", "Roddy"], + "createddate": [2020, 2020, 2020], + "lastupdatedby": ["Vondy", "Charl", "Gareth"], + "lastupdateddate": [2020, 2020, 2020], + } + ) + + responses_data = pd.DataFrame( + { + "reference": [101, 101, 101, 102, 102, 102], + "period": [202012, 202012, 202012, 202012, 202012, 202012], + "survey": [1, 1, 1, 1, 1, 1], + "createdby": ["A", "A", "A", "A", "A", "A"], + "createddate": [2020, 2020, 2020, 2020, 2020, 2020], + "lastupdatedby": ["A", "A", "A", "A", "A", "A"], + "lastupdateddate": [2020, 2020, 2020, 2020, 2020, 2020], + "questioncode": [200, 201, 202, 200, 201, 202], + "response": [0, 50, 100, 75, 25, 65], + "adjustedresponse": ["", "", "", "", "", ""], + } + ) + return contributor_data, responses_data + +@pytest.fixture +def expected_output(): + expected_output = pd.DataFrame( + { + "reference": [101, 102], + "period": [202012, 202012], + "survey": [1, 1], + 200: [0, 75], + 201: [50, 25], + 202: [100, 65], + } + ) + + return expected_output + +def test_full_responses(dummy_data): """Tests for full_responses function.""" + # Import modules to test + from src.data_processing.spp_snapshot_processing import full_responses + + contributor_data, responses_data = dummy_data + expected_output_data = expected_output + + df_result = full_responses(contributor_data, responses_data) + + pd.testing.assert_frame_equal(df_result, expected_output_data) + +def test_response_rate(dummy_data): + # Import the module to test + from src.data_processing.spp_snapshot_processing import response_rate + + contributor_data, responses_data, _ = dummy_data + + response_rate_value = response_rate(contributor_data, responses_data) - def test_full_responses(self): - - # Import modules to test - from src.data_processing.spp_snapshot_processing import full_responses - - contributor_data = pd.DataFrame( - { - "reference": [101, 102], - "period": [202012, 202012], - "survey": [1, 1], - "createdby": ["A", "A"], - "createddate": [2020, 2020], - "lastupdatedby": ["A", "A"], - "lastupdateddate": [2020, 2020], - } - ) - - responses_data = pd.DataFrame( - { - "reference": [101, 101, 101, 102, 102, 102], - "period": [202012, 202012, 202012, 202012, 202012, 202012], - "survey": [1, 1, 1, 1, 1, 1], - "createdby": ["A", "A", "A", "A", "A", "A"], - "createddate": [2020, 2020, 2020, 2020, 2020, 2020], - "lastupdatedby": ["A", "A", "A", "A", "A", "A"], - "lastupdateddate": [2020, 2020, 2020, 2020, 2020, 2020], - "questioncode": [200, 201, 202, 200, 201, 202], - "response": [0, 50, 100, 75, 25, 65], - "adjustedresponse": ["", "", "", "", "", ""], - } - ) - - expected_output = pd.DataFrame( - { - "reference": [101, 102], - "period": [202012, 202012], - "survey": [1, 1], - 200: [0, 75], - 201: [50, 25], - 202: [100, 65], - } - ) - - df_result = 
full_responses(contributor_data, responses_data) - - pd.testing.assert_frame_equal(df_result, expected_output) + expected_response_rate = 2/3 # 2 respondents out of 3 contributors + assert expected_response_rate == response_rate_value \ No newline at end of file From fca102a7f15b0be4985150862301d8262a490f58 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Fri, 2 Jun 2023 13:48:47 +0100 Subject: [PATCH 076/411] All tests working. Corrections made --- .../test_spp_snapshot_processing.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index 61c9ef834..c9a8454e0 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -35,6 +35,7 @@ def dummy_data(): ) return contributor_data, responses_data + @pytest.fixture def expected_output(): expected_output = pd.DataFrame( @@ -47,28 +48,30 @@ def expected_output(): 202: [100, 65], } ) - + return expected_output -def test_full_responses(dummy_data): + +def test_full_responses(dummy_data, expected_output): """Tests for full_responses function.""" # Import modules to test from src.data_processing.spp_snapshot_processing import full_responses contributor_data, responses_data = dummy_data expected_output_data = expected_output - + df_result = full_responses(contributor_data, responses_data) pd.testing.assert_frame_equal(df_result, expected_output_data) + def test_response_rate(dummy_data): # Import the module to test from src.data_processing.spp_snapshot_processing import response_rate - contributor_data, responses_data, _ = dummy_data + contributor_data, responses_data = dummy_data response_rate_value = response_rate(contributor_data, responses_data) - expected_response_rate = 2/3 # 2 respondents out of 3 contributors - assert expected_response_rate == response_rate_value \ No newline at end of file + expected_response_rate = 2 / 3 # 2 respondents out of 3 contributors + assert expected_response_rate == response_rate_value From 42fa49f3dda1844558f4036cee1f15f5673beece Mon Sep 17 00:00:00 2001 From: jwestw Date: Mon, 5 Jun 2023 09:43:05 +0100 Subject: [PATCH 077/411] improve typing in fixture --- tests/test_data_processing/test_spp_snapshot_processing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index c9a8454e0..da01d4fe3 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -2,10 +2,11 @@ # Import testing packages import pandas as pd import pytest +from typing import Tuple @pytest.fixture -def dummy_data(): +def dummy_data() -> Tuple[pd.DataFrame, pd.DataFrame]: # Set up the dummy data contributor_data = pd.DataFrame( { From e9c45ed36325db5bccb91379519a42e9f928ee69 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 18:53:55 +0100 Subject: [PATCH 078/411] Tests for loading.py module --- src/data_ingest/loading.py | 7 +++-- tests/test_data_ingest/test_loading.py | 43 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 tests/test_data_ingest/test_loading.py diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 3a6a102a8..a25c0cca1 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -8,8 +8,10 @@ conf_obj 
= Config_settings() config = conf_obj.config_dict snapshot_path = config["paths"]["snapshot_path"] # Taken from config file +snapdata = hdfs_load_json(snapshot_path) -def parse_snap_data(snapdata: dict = snapshot_path) -> Tuple[pd.DataFrame, pd.DataFrame]: + +def parse_snap_data(snapdata: dict = snapdata) -> Tuple[pd.DataFrame, pd.DataFrame]: """Loads the data from the survey via the SPP snapshot. The data is supplied as dict and is parsed into dataframes, one for survey contributers (company details) and another one for their responses. @@ -21,7 +23,6 @@ def parse_snap_data(snapdata: dict = snapshot_path) -> Tuple[pd.DataFrame, pd.Da Tuple[pd.DataFrame, pd.DataFrame]: The contributers and responders dataframes """ # Load the dicts - snapdata = hdfs_load_json(snapshot_path) contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] @@ -29,4 +30,4 @@ def parse_snap_data(snapdata: dict = snapshot_path) -> Tuple[pd.DataFrame, pd.Da contributors_df = pd.DataFrame(contributordict) responses_df = pd.DataFrame(responsesdict) - return contributors_df, responses_df \ No newline at end of file + return contributors_df, responses_df diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py new file mode 100644 index 000000000..195727e1f --- /dev/null +++ b/tests/test_data_ingest/test_loading.py @@ -0,0 +1,43 @@ +import pandas as pd +from typing import Tuple + +# Import modules to test +from src.data_ingest.loading import parse_snap_data + + +class TestParseSPP: + """Test for Parse Snap data function""" + + def input_data(self) -> dict: + dummy_snapdata = { + "snapshot_id": "", + "contributors": [ + {"ref": "123", "con": "789"}, + {"ref": "456", "con": "910"}, + ], + "responses": [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}], + } + + return dummy_snapdata + + def exp_output(self) -> Tuple[pd.DataFrame, pd.DataFrame]: + contributor_df = pd.DataFrame( + [{"ref": "123", "con": "789"}, {"ref": "456", "con": "910"}] + ) + + responses_df = pd.DataFrame( + [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}] + ) + + return contributor_df, responses_df + + def test_parse_snap_data(self): + """Tests for full_responses function.""" + + inputdata = self.input_data() + df_result1, df_result2 = parse_snap_data(inputdata) + + expected_output_data1, expected_output_data2 = self.exp_output() + + pd.testing.assert_frame_equal(df_result1, expected_output_data1) + pd.testing.assert_frame_equal(df_result2, expected_output_data2) From 75364af4283f98eb37ae58e5fceffd53846fb659 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 18:59:51 +0100 Subject: [PATCH 079/411] moving import to inside func --- tests/test_data_ingest/test_loading.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py index 195727e1f..b8860c45f 100644 --- a/tests/test_data_ingest/test_loading.py +++ b/tests/test_data_ingest/test_loading.py @@ -1,9 +1,6 @@ import pandas as pd from typing import Tuple -# Import modules to test -from src.data_ingest.loading import parse_snap_data - class TestParseSPP: """Test for Parse Snap data function""" @@ -34,6 +31,9 @@ def exp_output(self) -> Tuple[pd.DataFrame, pd.DataFrame]: def test_parse_snap_data(self): """Tests for full_responses function.""" + # Import modules to test + from src.data_ingest.loading import parse_snap_data + inputdata = self.input_data() df_result1, df_result2 = parse_snap_data(inputdata) From 
36f3241b953fa9d6b41fb80a11260acdb7ec86f7 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 19:23:09 +0100 Subject: [PATCH 080/411] Moved data loading to main.py to avoid reloading data repeatedly --- src/data_ingest/loading.py | 11 +---------- src/main.py | 17 ++++++++++++----- tests/test_data_ingest/test_loading.py | 6 +++--- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index a25c0cca1..74cf1fb29 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -1,17 +1,8 @@ import pandas as pd from typing import Tuple -from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json - -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["paths"]["snapshot_path"] # Taken from config file -snapdata = hdfs_load_json(snapshot_path) - - -def parse_snap_data(snapdata: dict = snapdata) -> Tuple[pd.DataFrame, pd.DataFrame]: +def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: """Loads the data from the survey via the SPP snapshot. The data is supplied as dict and is parsed into dataframes, one for survey contributers (company details) and another one for their responses. diff --git a/src/main.py b/src/main.py index 7300b00e0..5d00ff0f7 100644 --- a/src/main.py +++ b/src/main.py @@ -7,6 +7,7 @@ from src.utils.testfunctions import Manipulate_data from src.data_ingest import loading from src.data_processing import spp_snapshot_processing as processing +from src.utils.hdfs_mods import hdfs_load_json import time import logging @@ -37,11 +38,17 @@ def run_pipeline(start): Manipulate_data() # Data Ingest - contributors_df, responses_df = loading.parse_snap_data() + # Load SPP data from DAP + snapshot_path = config["paths"]["snapshot_path"] + snapdata = hdfs_load_json(snapshot_path) + contributors_df, responses_df = loading.parse_snap_data(snapdata) + # Data Transmutation full_responses = processing.full_responses(contributors_df, responses_df) - - logger.info("The response rate is %.3%", processing.response_rate(contributors_df, responses_df)) - + print(full_responses.sample(5)) + logger.info( + "The response rate is %.3%", + processing.response_rate(contributors_df, responses_df), + ) # Data validation @@ -74,4 +81,4 @@ def run_pipeline(start): runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() \ No newline at end of file + runlog_obj._write_runlog() diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py index b8860c45f..195727e1f 100644 --- a/tests/test_data_ingest/test_loading.py +++ b/tests/test_data_ingest/test_loading.py @@ -1,6 +1,9 @@ import pandas as pd from typing import Tuple +# Import modules to test +from src.data_ingest.loading import parse_snap_data + class TestParseSPP: """Test for Parse Snap data function""" @@ -31,9 +34,6 @@ def exp_output(self) -> Tuple[pd.DataFrame, pd.DataFrame]: def test_parse_snap_data(self): """Tests for full_responses function.""" - # Import modules to test - from src.data_ingest.loading import parse_snap_data - inputdata = self.input_data() df_result1, df_result2 = parse_snap_data(inputdata) From ec96e3d48c177a0555072f16e3d459487bd00c83 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 19:28:07 +0100 Subject: [PATCH 081/411] rename loading to spp_parser to avoid confusion --- src/data_ingest/{loading.py => spp_parser.py} | 0 src/main.py | 4 ++-- 
tests/test_data_ingest/test_loading.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename src/data_ingest/{loading.py => spp_parser.py} (100%) diff --git a/src/data_ingest/loading.py b/src/data_ingest/spp_parser.py similarity index 100% rename from src/data_ingest/loading.py rename to src/data_ingest/spp_parser.py diff --git a/src/main.py b/src/main.py index 5d00ff0f7..8d33396b0 100644 --- a/src/main.py +++ b/src/main.py @@ -5,7 +5,7 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data -from src.data_ingest import loading +from data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json import time @@ -41,7 +41,7 @@ def run_pipeline(start): # Load SPP data from DAP snapshot_path = config["paths"]["snapshot_path"] snapdata = hdfs_load_json(snapshot_path) - contributors_df, responses_df = loading.parse_snap_data(snapdata) + contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) # Data Transmutation full_responses = processing.full_responses(contributors_df, responses_df) print(full_responses.sample(5)) diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py index 195727e1f..53e1a9852 100644 --- a/tests/test_data_ingest/test_loading.py +++ b/tests/test_data_ingest/test_loading.py @@ -2,7 +2,7 @@ from typing import Tuple # Import modules to test -from src.data_ingest.loading import parse_snap_data +from data_ingest.spp_parser import parse_snap_data class TestParseSPP: From 11a9244430de3bd74e017637098f1b6fb458cd1a Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 19:29:43 +0100 Subject: [PATCH 082/411] added missing src --- src/main.py | 2 +- tests/test_data_ingest/test_loading.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 8d33396b0..c32b02cba 100644 --- a/src/main.py +++ b/src/main.py @@ -5,7 +5,7 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data -from data_ingest import spp_parser +from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json import time diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py index 53e1a9852..da0c865a0 100644 --- a/tests/test_data_ingest/test_loading.py +++ b/tests/test_data_ingest/test_loading.py @@ -2,7 +2,7 @@ from typing import Tuple # Import modules to test -from data_ingest.spp_parser import parse_snap_data +from src.data_ingest.spp_parser import parse_snap_data class TestParseSPP: From d4d7c13dd3e58a1669cbccaa2f70f82712a9217d Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 19:31:32 +0100 Subject: [PATCH 083/411] rename test_spp_parser to fit testing standard --- tests/test_data_ingest/{test_loading.py => test_spp_parser.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/test_data_ingest/{test_loading.py => test_spp_parser.py} (100%) diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_spp_parser.py similarity index 100% rename from tests/test_data_ingest/test_loading.py rename to tests/test_data_ingest/test_spp_parser.py From 706b1d0f1ca72653f9f3cc4cbd45f8dc066aa50e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 20:19:29 +0100 Subject: [PATCH 
084/411] Adding postcodes_uk dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index a96025b78..0e9377f23 100644 --- a/environment.yml +++ b/environment.yml @@ -13,6 +13,7 @@ dependencies: - myst-parser - numpy - pandas==1.1.5 + - postcodes_uk - pre-commit==2.17.0 - pydoop - pytest From ab6b7659f607f03020e628fd8582884f5a579355 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 20:31:35 +0100 Subject: [PATCH 085/411] Creating postcode validation function --- src/data_validation/validation.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/data_validation/validation.py diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py new file mode 100644 index 000000000..542f8161d --- /dev/null +++ b/src/data_validation/validation.py @@ -0,0 +1,14 @@ +import postcodes_uk + + +def validate_postcode(pcode: str) -> bool: + """A function to validate UK postcodes which uses the + + Args: + pcode (str): _description_ + + Returns: + bool: _description_ + """ + validation = postcodes_uk.validate(pcode) + return validation From c57759d31903f80eeb5577305f0bd71a1a0999db Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 20:32:13 +0100 Subject: [PATCH 086/411] Added the stages for the pipeline as comments --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index c32b02cba..b87d96ef6 100644 --- a/src/main.py +++ b/src/main.py @@ -70,7 +70,7 @@ def run_pipeline(start): # Data output: File Outputs - MainLogger.info("Finishing Pipeline .......................") + MainLogger.info("Finshing Pipeline .......................") runlog_obj.retrieve_pipeline_logs() From cb9839506988977dd9fbb2dfb87921f1eec66c69 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 20:36:43 +0100 Subject: [PATCH 087/411] Updating and finishing the function --- src/data_validation/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 542f8161d..d43bd6e46 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -5,10 +5,12 @@ def validate_postcode(pcode: str) -> bool: """A function to validate UK postcodes which uses the Args: - pcode (str): _description_ + pcode (str): The postcode to validate Returns: - bool: _description_ + bool: True or False depending on if it is valid or not """ - validation = postcodes_uk.validate(pcode) - return validation + # Validation step + valid_bool = postcodes_uk.validate(pcode) + + return valid_bool From 22014a4574edb3f1cf844083efbd8172d2127439 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 20:48:58 +0100 Subject: [PATCH 088/411] just adding _df suffix --- src/data_ingest/spp_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index 74cf1fb29..fb69255c7 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -17,7 +17,7 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] - # Make dataframes + # Make dataframes! 
contributors_df = pd.DataFrame(contributordict) responses_df = pd.DataFrame(responsesdict) From 6bf6196263f86db357ff0f622050a8c2337056d3 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:07:43 +0100 Subject: [PATCH 089/411] Importing loading and validation. Loading dfs --- src/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.py b/src/main.py index b87d96ef6..e5653b04f 100644 --- a/src/main.py +++ b/src/main.py @@ -8,6 +8,7 @@ from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json +from src.data_validation import validation import time import logging @@ -51,6 +52,7 @@ def run_pipeline(start): ) # Data validation + validation.validate_postcode # Outlier detection From 029dd6cf03ac8f8a54482f96061e5a21d0001bbf Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:19:21 +0100 Subject: [PATCH 090/411] Created function and removed print statements --- src/data_ingest/spp_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index fb69255c7..4298fb7ea 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -13,7 +13,7 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: Returns: Tuple[pd.DataFrame, pd.DataFrame]: The contributers and responders dataframes """ - # Load the dicts + # Load the dicts! contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] From 0c0afff7b7f02c807c4e4b74e99e89949cc0a6ce Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:20:04 +0100 Subject: [PATCH 091/411] Want to use pandera but can't install it --- src/data_validation/validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index d43bd6e46..49a6cddfd 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,5 +1,7 @@ import postcodes_uk +# import pandera + def validate_postcode(pcode: str) -> bool: """A function to validate UK postcodes which uses the From eade13cf5d11c504a29a3e399ab55bfe05c6fab9 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:24:18 +0100 Subject: [PATCH 092/411] Creating func to validate whole col --- src/data_validation/validation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 49a6cddfd..b487c38cd 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,4 +1,5 @@ import postcodes_uk +import pandas as pd # import pandera @@ -16,3 +17,19 @@ def validate_postcode(pcode: str) -> bool: valid_bool = postcodes_uk.validate(pcode) return valid_bool + + +def validate_post_col(df: pd.Dataframe) -> bool: + """_summary_ + + Args: + df (pd.Dataframe): _description_ + + Returns: + bool: _description_ + """ + bool_series = df.referencepostcode.apply(validate_postcode) + + whole_col_valid = bool_series.all() + + return whole_col_valid From 0f53c2f24b798ebbc866c3b064c486fb8d5f92db Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:24:36 +0100 Subject: [PATCH 093/411] validating whole col --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index e5653b04f..88f01447a 100644 --- a/src/main.py +++ b/src/main.py @@ -52,7 +52,7 @@ def run_pipeline(start): ) # Data validation - 
validation.validate_postcode + validation.validate_post_col(contributers_df) # Outlier detection From 18df55157b0eb674c5a0f47223e1b2b1f062a548 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 16 May 2023 21:31:02 +0100 Subject: [PATCH 094/411] Make the parse func more integrated --- src/data_ingest/spp_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index 4298fb7ea..cfe0d68c7 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -17,7 +17,7 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] - # Make dataframes! + # Make dataframes contributors_df = pd.DataFrame(contributordict) responses_df = pd.DataFrame(responsesdict) From 832f667e09514b556c5bc379e63a691e6df0e23d Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 16 May 2023 21:45:39 +0100 Subject: [PATCH 095/411] Including error of invalid postcodes on failure --- src/data_validation/validation.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index b487c38cd..61cf13f23 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -20,16 +20,28 @@ def validate_postcode(pcode: str) -> bool: def validate_post_col(df: pd.Dataframe) -> bool: - """_summary_ + """This function checks if all postcodes in the specified DataFrame column + are valid UK postcodes. It uses the `validate_postcode` function to + perform the validation. Args: - df (pd.Dataframe): _description_ + df (pd.DataFrame): The DataFrame containing the postcodes. Returns: - bool: _description_ - """ - bool_series = df.referencepostcode.apply(validate_postcode) + bool: True if all postcodes are valid, False otherwise. + + Raises: + ValueError: If any invalid postcodes are found, a ValueError is raised. + The error message includes the list of invalid postcodes. 
- whole_col_valid = bool_series.all() + Example: + >>> df = pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) + >>> validate_post_col(df) + ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] + """ + invalid_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode), "referencepostcode"] - return whole_col_valid + if not invalid_postcodes.empty: + raise ValueError(f"Invalid postcodes found: {invalid_postcodes.to_list()}") + + return True From 4eefd5d17e4e065fa213dfd9cac8d1d698cfe7f1 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 16 May 2023 21:55:05 +0100 Subject: [PATCH 096/411] Test for validate_postcode --- tests/test_validation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 000000000..8f6d55009 --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,23 @@ +import pytest +from src.data_validation import validate_postcode + +def test_validate_postcode(): + # Valid postcodes + assert validate_postcode("AB12 3CD") == True + assert validate_postcode("DE34 5FG") == True + assert validate_postcode("HI67 8JK") == True + + # Invalid postcodes + assert validate_postcode("EFG 456") == False + assert validate_postcode("HIJ 789") == False + assert validate_postcode("KL1M 2NO") == False + assert validate_postcode("B27 OAG") == False # Zero is actually an "O" + + # Edge cases + assert validate_postcode(None) == False # None value should fail + assert validate_postcode("") == False # Empty string + assert validate_postcode(" ") == False # Whitespace + assert validate_postcode("AB123CD") == False # Missing space - othewise valid + assert validate_postcode("ABC XYZ") == False # All letters but right length + assert validate_postcode("123 456") == False # All numbers but right length + From d84f925000266a7f8ef4fe815e65ea1423056ef6 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 16 May 2023 22:01:52 +0100 Subject: [PATCH 097/411] test suit for the validation module --- tests/test_validation.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index 8f6d55009..2e61bd492 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,6 +1,50 @@ import pytest from src.data_validation import validate_postcode + +import pytest +import pandas as pd +from your_module import validate_post_col + +def test_validate_post_col(): + # Valid postcodes + df_valid = pd.DataFrame({"referencepostcode": ["AB12 3CD", "DE34 5FG", "HI67 8JK"]}) + assert validate_post_col(df_valid) == True + + # Invalid postcodes + df_invalid = pd.DataFrame({"referencepostcode": ["EFG 456", "HIJ 789"]}) + with pytest.raises(ValueError) as error: + validate_post_col(df_invalid) + assert str(error.value) == "Invalid postcodes found: ['EFG 456', 'HIJ 789']" + + # Mixed valid and invalid postcodes + df_mixed_valid_invalid = pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]}) + with pytest.raises(ValueError) as error: + validate_post_col(df_mixed_valid_invalid) + assert str(error.value) == "Invalid postcodes found: ['EFG 456']" # Mixed valid and invalid postcodes + + # Edge cases: invalid column names + df_invalid_column_name = pd.DataFrame({"postcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]}) + with pytest.raises(KeyError) as error: + validate_post_col(df_invalid_column_name) + assert str(error.value) == 
"'referencepostcode'" # Invalid column name + + # Edge cases: missing column + df_missing_column = pd.DataFrame({"other_column": ["value1", "value2", "value3"]}) + with pytest.raises(KeyError) as error: + validate_post_col(df_missing_column) + assert str(error.value) == "'referencepostcode'" # Missing column + + # Edge cases: missing DataFrame + df_missing_dataframe = None + with pytest.raises(AttributeError): + validate_post_col(df_missing_dataframe) # Missing DataFrame + + # Edge cases: empty reference postcode column + df_no_postcodes = pd.DataFrame({"referencepostcode": [""]}) + assert validate_post_col(df_no_postcodes) == False # Empty postcode + + def test_validate_postcode(): # Valid postcodes assert validate_postcode("AB12 3CD") == True From 0ddc0011c0e15b1a76e450aad909323fdb3d1892 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 17 May 2023 14:26:03 +0100 Subject: [PATCH 098/411] correcting readme --- tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index eac420716..68e4c6442 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,3 +1,3 @@ # `tests` folder overview -All tests for the functions defined in the `src` folder should be stored here. +All tests for the functions defined in the `src` folder should be stored here. Each test module should be stored in its correct folder. From e3c1de6b5f358b3fe261ebdd0b54e753a816f7b9 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 17 May 2023 14:27:41 +0100 Subject: [PATCH 099/411] Creating test folder structure --- tests/data_ingest/__init__.py | 0 tests/data_processing/__init__.py | 0 tests/data_processing/estimation/__init__.py | 0 tests/data_validation/__init__.py | 0 .../{ => data_validation}/test_validation.py | 50 ++++++++++--------- tests/data_visualisations/__init__.py | 0 tests/outlier_detection/__init__.py | 0 tests/utils/__init__.py | 0 8 files changed, 26 insertions(+), 24 deletions(-) create mode 100644 tests/data_ingest/__init__.py create mode 100644 tests/data_processing/__init__.py create mode 100644 tests/data_processing/estimation/__init__.py create mode 100644 tests/data_validation/__init__.py rename tests/{ => data_validation}/test_validation.py (55%) create mode 100644 tests/data_visualisations/__init__.py create mode 100644 tests/outlier_detection/__init__.py create mode 100644 tests/utils/__init__.py diff --git a/tests/data_ingest/__init__.py b/tests/data_ingest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data_processing/__init__.py b/tests/data_processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data_processing/estimation/__init__.py b/tests/data_processing/estimation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data_validation/__init__.py b/tests/data_validation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_validation.py b/tests/data_validation/test_validation.py similarity index 55% rename from tests/test_validation.py rename to tests/data_validation/test_validation.py index 2e61bd492..3aaace3bd 100644 --- a/tests/test_validation.py +++ b/tests/data_validation/test_validation.py @@ -1,15 +1,12 @@ +import pandas as pd import pytest -from src.data_validation import validate_postcode +from src.data_validation import validate_postcode, validate_post_col -import pytest -import pandas as pd -from your_module import validate_post_col - def test_validate_post_col(): # Valid postcodes df_valid = 
pd.DataFrame({"referencepostcode": ["AB12 3CD", "DE34 5FG", "HI67 8JK"]}) - assert validate_post_col(df_valid) == True + assert validate_post_col(df_valid) # Invalid postcodes df_invalid = pd.DataFrame({"referencepostcode": ["EFG 456", "HIJ 789"]}) @@ -18,13 +15,19 @@ def test_validate_post_col(): assert str(error.value) == "Invalid postcodes found: ['EFG 456', 'HIJ 789']" # Mixed valid and invalid postcodes - df_mixed_valid_invalid = pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]}) + df_mixed_valid_invalid = pd.DataFrame( + {"referencepostcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]} + ) with pytest.raises(ValueError) as error: validate_post_col(df_mixed_valid_invalid) - assert str(error.value) == "Invalid postcodes found: ['EFG 456']" # Mixed valid and invalid postcodes + assert ( + str(error.value) == "Invalid postcodes found: ['EFG 456']" + ) # Mixed valid and invalid postcodes # Edge cases: invalid column names - df_invalid_column_name = pd.DataFrame({"postcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]}) + df_invalid_column_name = pd.DataFrame( + {"postcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]} + ) with pytest.raises(KeyError) as error: validate_post_col(df_invalid_column_name) assert str(error.value) == "'referencepostcode'" # Invalid column name @@ -42,26 +45,25 @@ def test_validate_post_col(): # Edge cases: empty reference postcode column df_no_postcodes = pd.DataFrame({"referencepostcode": [""]}) - assert validate_post_col(df_no_postcodes) == False # Empty postcode + assert validate_post_col(df_no_postcodes) is False # Empty postcode def test_validate_postcode(): # Valid postcodes - assert validate_postcode("AB12 3CD") == True - assert validate_postcode("DE34 5FG") == True - assert validate_postcode("HI67 8JK") == True + assert validate_postcode("AB12 3CD") is True + assert validate_postcode("DE34 5FG") is False + assert validate_postcode("HI67 8JK") is True # Invalid postcodes - assert validate_postcode("EFG 456") == False - assert validate_postcode("HIJ 789") == False - assert validate_postcode("KL1M 2NO") == False - assert validate_postcode("B27 OAG") == False # Zero is actually an "O" + assert validate_postcode("EFG 456") is False + assert validate_postcode("HIJ 789") is False + assert validate_postcode("KL1M 2NO") is False + assert validate_postcode("B27 OAG") is False # Zero is actually an "O" # Edge cases - assert validate_postcode(None) == False # None value should fail - assert validate_postcode("") == False # Empty string - assert validate_postcode(" ") == False # Whitespace - assert validate_postcode("AB123CD") == False # Missing space - othewise valid - assert validate_postcode("ABC XYZ") == False # All letters but right length - assert validate_postcode("123 456") == False # All numbers but right length - + assert validate_postcode(None) is False # None value should fail + assert validate_postcode("") is False # Empty string + assert validate_postcode(" ") is False # Whitespace + assert validate_postcode("AB123CD") is False # Missing space - othewise valid + assert validate_postcode("ABC XYZ") is False # All letters but right length + assert validate_postcode("123 456") is False # All numbers but right length diff --git a/tests/data_visualisations/__init__.py b/tests/data_visualisations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/outlier_detection/__init__.py b/tests/outlier_detection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file 
mode 100644 index 000000000..e69de29bb From 83f8ed6d8adcfc0eaf83e57acd336fde818b6415 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 17 May 2023 14:33:14 +0100 Subject: [PATCH 100/411] Adding test_ to folder names --- tests/{data_ingest => test_data_ingest}/__init__.py | 0 tests/{data_processing => test_data_processing}/__init__.py | 0 .../estimation/__init__.py | 0 tests/{data_validation => test_data_validation}/__init__.py | 0 .../{data_validation => test_data_validation}/test_validation.py | 0 .../{data_visualisations => test_data_visualisations}/__init__.py | 0 tests/{outlier_detection => test_outlier_detection}/__init__.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename tests/{data_ingest => test_data_ingest}/__init__.py (100%) rename tests/{data_processing => test_data_processing}/__init__.py (100%) rename tests/{data_processing => test_data_processing}/estimation/__init__.py (100%) rename tests/{data_validation => test_data_validation}/__init__.py (100%) rename tests/{data_validation => test_data_validation}/test_validation.py (100%) rename tests/{data_visualisations => test_data_visualisations}/__init__.py (100%) rename tests/{outlier_detection => test_outlier_detection}/__init__.py (100%) diff --git a/tests/data_ingest/__init__.py b/tests/test_data_ingest/__init__.py similarity index 100% rename from tests/data_ingest/__init__.py rename to tests/test_data_ingest/__init__.py diff --git a/tests/data_processing/__init__.py b/tests/test_data_processing/__init__.py similarity index 100% rename from tests/data_processing/__init__.py rename to tests/test_data_processing/__init__.py diff --git a/tests/data_processing/estimation/__init__.py b/tests/test_data_processing/estimation/__init__.py similarity index 100% rename from tests/data_processing/estimation/__init__.py rename to tests/test_data_processing/estimation/__init__.py diff --git a/tests/data_validation/__init__.py b/tests/test_data_validation/__init__.py similarity index 100% rename from tests/data_validation/__init__.py rename to tests/test_data_validation/__init__.py diff --git a/tests/data_validation/test_validation.py b/tests/test_data_validation/test_validation.py similarity index 100% rename from tests/data_validation/test_validation.py rename to tests/test_data_validation/test_validation.py diff --git a/tests/data_visualisations/__init__.py b/tests/test_data_visualisations/__init__.py similarity index 100% rename from tests/data_visualisations/__init__.py rename to tests/test_data_visualisations/__init__.py diff --git a/tests/outlier_detection/__init__.py b/tests/test_outlier_detection/__init__.py similarity index 100% rename from tests/outlier_detection/__init__.py rename to tests/test_outlier_detection/__init__.py From 595cbc2c3c014e4c1ada49bc1b8e9732e0354d86 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 17 May 2023 14:35:57 +0100 Subject: [PATCH 101/411] Tidying --- tests/utils/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/utils/__init__.py diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 From 30fb42af0e0ec4535cc50c05792cfb961f30c042 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 18 May 2023 09:24:13 +0100 Subject: [PATCH 102/411] Improving tests with Ilyas's suggestions --- tests/test_data_validation/test_validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 
3aaace3bd..322c4381a 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from src.data_validation import validate_postcode, validate_post_col +from src.data_validation.validation import validate_postcode, validate_post_col def test_validate_post_col(): @@ -45,7 +45,8 @@ def test_validate_post_col(): # Edge cases: empty reference postcode column df_no_postcodes = pd.DataFrame({"referencepostcode": [""]}) - assert validate_post_col(df_no_postcodes) is False # Empty postcode + with pytest.raises(ValueError): + validate_post_col(df_no_postcodes) # Empty postcode column def test_validate_postcode(): From 6535c8e13ed6c6d3f6c39570a14f2736296aaa2e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 18 May 2023 11:07:33 +0100 Subject: [PATCH 103/411] Updating data type. Name func as pattern validator --- src/data_validation/validation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 61cf13f23..7822595ad 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -4,7 +4,7 @@ # import pandera -def validate_postcode(pcode: str) -> bool: +def validate_postcode_pattern(pcode: str) -> bool: """A function to validate UK postcodes which uses the Args: @@ -18,8 +18,7 @@ def validate_postcode(pcode: str) -> bool: return valid_bool - -def validate_post_col(df: pd.Dataframe) -> bool: +def validate_post_col(df: pd.DataFrame) -> bool: """This function checks if all postcodes in the specified DataFrame column are valid UK postcodes. It uses the `validate_postcode` function to perform the validation. @@ -39,7 +38,7 @@ def validate_post_col(df: pd.Dataframe) -> bool: >>> validate_post_col(df) ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] """ - invalid_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode), "referencepostcode"] + invalid_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode"] if not invalid_postcodes.empty: raise ValueError(f"Invalid postcodes found: {invalid_postcodes.to_list()}") From 49cf59a6fd5094c36caa62e807a3afb13bfd44f8 Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 18 May 2023 17:40:30 +0100 Subject: [PATCH 104/411] New reality checker --- src/data_validation/validation.py | 53 ++++++++++++++++++++++++++++--- src/main.py | 11 ++++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 7822595ad..492f547d1 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,7 +1,18 @@ import postcodes_uk import pandas as pd -# import pandera +from src.utils.wrappers import logger_creator +from src.utils.helpers import Config_settings + + +# Get the config +conf_obj = Config_settings() +config = conf_obj.config_dict +global_config = config["global"] + +# Set up logging +logger = logger_creator(global_config) + def validate_postcode_pattern(pcode: str) -> bool: @@ -18,7 +29,22 @@ def validate_postcode_pattern(pcode: str) -> bool: return valid_bool -def validate_post_col(df: pd.DataFrame) -> bool: +def get_masterlist(masterlist_path) -> pd.Series: + """This function loads the masterlist of postcodes from a csv file + + Returns: + pd.Series: The dataframe of postcodes + """ + + masterlist = (pd.Series + (pd.read_csv + (masterlist_path, + usecols=["pcd"]))) + + return masterlist 
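# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this patch series): a self-contained example
# of loading a small postcode masterlist CSV and ending up with a
# one-dimensional pandas Series of postcodes, which is what get_masterlist is
# aiming for. The file contents and the temporary path below are invented
# purely for illustration.
# ---------------------------------------------------------------------------
import tempfile

import pandas as pd

# Write a tiny stand-in for the ONS postcode file; only the "pcd" column matters
with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as tmp:
    tmp.write("pcd,laua\nAB12 3CD,X1\nNP10 8XG,X2\n")
    example_path = tmp.name

# Selecting the "pcd" column after read_csv gives a 1-d Series of postcodes
example_masterlist = pd.read_csv(example_path, usecols=["pcd"])["pcd"]
print(example_masterlist.to_list())  # ['AB12 3CD', 'NP10 8XG']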
+ + +def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: """This function checks if all postcodes in the specified DataFrame column are valid UK postcodes. It uses the `validate_postcode` function to perform the validation. @@ -38,9 +64,26 @@ def validate_post_col(df: pd.DataFrame) -> bool: >>> validate_post_col(df) ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] """ - invalid_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode"] + master_series = get_masterlist(masterlist_path) + + # Check if postcode are real + unreal_postcodes = df.loc[~df["referencepostcode"].isin(master_series), "referencepostcode"] + + # Log the unreal postcodes + if not unreal_postcodes.empty: + logger.warning(f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}") + + # Check if postcodes match pattern + invalid_pattern_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode"] + + # Log the invalid postcodes + if not invalid_pattern_postcodes.empty: + logger.warning(f"Invalid postcodes found: {invalid_pattern_postcodes.to_list()}") + + # Combine the two lists + combined_invalid_postcodes = pd.concat([unreal_postcodes, invalid_pattern_postcodes]) - if not invalid_postcodes.empty: - raise ValueError(f"Invalid postcodes found: {invalid_postcodes.to_list()}") + if not combined_invalid_postcodes.empty: + raise ValueError(f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}") return True diff --git a/src/main.py b/src/main.py index 88f01447a..490263b77 100644 --- a/src/main.py +++ b/src/main.py @@ -17,6 +17,12 @@ MainLogger.setLevel(logging.INFO) +# load config +conf_obj = Config_settings() +config = conf_obj.config_dict +masterlist_path = config["masterlist_path"] + + def run_pipeline(start): """The main pipeline. 
@@ -52,7 +58,10 @@ def run_pipeline(start): ) # Data validation - validation.validate_post_col(contributers_df) + + + # Check the postcode column + validation.validate_post_col(contributers_df , postcode_master_list) # Outlier detection From ae4b08363c5e1bcf3bbe8a27c1b2134160c69b3d Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 11:31:37 +0100 Subject: [PATCH 105/411] Hoping this helps w installation of postcodes_uk --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 0e9377f23..2f2878bef 100644 --- a/environment.yml +++ b/environment.yml @@ -11,6 +11,7 @@ dependencies: - cookiecutter - detect-secrets - myst-parser + - more-itertools==5.0.0 - numpy - pandas==1.1.5 - postcodes_uk From 8ac56a03b1bf38837b61438a16c6a7f28a6ff63a Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 11:37:22 +0100 Subject: [PATCH 106/411] Small changes to the validation examples --- src/data_validation/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 492f547d1..93842f18b 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -61,7 +61,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: Example: >>> df = pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) - >>> validate_post_col(df) + >>> validate_post_col(df, "example-path/to/masterlist.csv"") ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] """ master_series = get_masterlist(masterlist_path) From 9d3ffaf7bdd46ab5b9e83b25ac488cf844fe3738 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 11:38:07 +0100 Subject: [PATCH 107/411] Mock loading masterlist. 
Testing logs & errors --- tests/test_data_validation/test_validation.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 322c4381a..d6de7b177 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -3,7 +3,30 @@ from src.data_validation.validation import validate_postcode, validate_post_col -def test_validate_post_col(): +# Define test data +@pytest.fixture +def test_data(): + return pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) + +# Mock the get_masterlist function +def mock_get_masterlist(masterlist_path): + # Return a mock masterlist series + return pd.Series(["AB12 3CD", "KL1M 2NO"]) + + +# Test case for validate_post_col +def test_validate_post_col(test_data, monkeypatch, caplog): + # Monkeypatch the get_masterlist function to use the mock implementation + monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) + + # Call the function under test + with pytest.raises(ValueError) as exc_info: + validate_post_col(test_data, "path/to/missing_masterlist.csv") + + # Using caplog to check the logged warning messages + assert "These postcodes are not found in the ONS postcode list: ['EFG 456', 'HIJ 789']" in caplog.text + assert "Invalid postcodes found: ['EFG 456', 'HIJ 789']" in caplog.text + # Valid postcodes df_valid = pd.DataFrame({"referencepostcode": ["AB12 3CD", "DE34 5FG", "HI67 8JK"]}) assert validate_post_col(df_valid) @@ -20,9 +43,7 @@ def test_validate_post_col(): ) with pytest.raises(ValueError) as error: validate_post_col(df_mixed_valid_invalid) - assert ( - str(error.value) == "Invalid postcodes found: ['EFG 456']" - ) # Mixed valid and invalid postcodes + assert (str(error.value) == "Invalid postcodes found: ['EFG 456']") # Edge cases: invalid column names df_invalid_column_name = pd.DataFrame( From 398dc65366087762b24324f60e57be2fabc9e5bc Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 11:52:49 +0100 Subject: [PATCH 108/411] Correcting name of import --- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index d6de7b177..c2416e812 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from src.data_validation.validation import validate_postcode, validate_post_col +from src.data_validation.validation import validate_postcode_pattern, validate_post_col # Define test data From bffd5c0698a5d74a002c2342625eda366511f54a Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 11:54:29 +0100 Subject: [PATCH 109/411] Trying to silence warning --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 28be6f52c..bb8a5511b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,6 @@ profile = "black" addopts = [ "-vv", "--doctest-modules", - ] doctest_optionflags = "NORMALIZE_WHITESPACE" testpaths = [ From 6d983fc1625a09a776abe689bd02cdd75a4a8360 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 23 May 2023 12:25:26 +0100 Subject: [PATCH 110/411] add pytest 7 --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 
2f2878bef..386e08905 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - postcodes_uk - pre-commit==2.17.0 - pydoop - - pytest + - pytest #using version 5 which is compatible with postcodes_uk - pytest-cov - python-dotenv - readme-coverage-badger From 77d904b0560d2f0eefce7542a183b0a403a6b077 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 23 May 2023 12:50:19 +0100 Subject: [PATCH 111/411] downgrade pytest for postcodes --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 386e08905..3628a50a1 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - postcodes_uk - pre-commit==2.17.0 - pydoop - - pytest #using version 5 which is compatible with postcodes_uk + - pytest #default version 5 which is compatible with postcodes_uk - pytest-cov - python-dotenv - readme-coverage-badger From fdfe571ddc7bfc55cbcabb7c4d4ac42998191a7e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 23 May 2023 15:49:36 +0100 Subject: [PATCH 112/411] The tests are passing - WIP --- tests/test_data_validation/test_validation.py | 110 ++++++++++-------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index c2416e812..7ace8d5a9 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,91 +1,99 @@ import pandas as pd import pytest -from src.data_validation.validation import validate_postcode_pattern, validate_post_col +from src.data_validation.validation import validate_post_col +# import validate_postcode_pattern, # noqa -# Define test data -@pytest.fixture + +@pytest.fixture # noqa def test_data(): - return pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) + return pd.DataFrame( + {"referencepostcode": ["NP10 8XG", "SW1P 4DF", "HIJ 789", "KL1M 2NO"]} + ) + # Mock the get_masterlist function def mock_get_masterlist(masterlist_path): - # Return a mock masterlist series - return pd.Series(["AB12 3CD", "KL1M 2NO"]) + # Return a mock masterlist series - actual postcodes of ONS offices + return pd.Series(["NP10 8XG", "SW1P 4DF", "PO15 5RR"]) # Test case for validate_post_col def test_validate_post_col(test_data, monkeypatch, caplog): # Monkeypatch the get_masterlist function to use the mock implementation - monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) + monkeypatch.setattr( + "src.data_validation.validation.get_masterlist", mock_get_masterlist + ) + + # Make a fake path to the masterlist + fake_path = "path/to/missing_masterlist.csv" # Call the function under test - with pytest.raises(ValueError) as exc_info: - validate_post_col(test_data, "path/to/missing_masterlist.csv") + with pytest.raises(ValueError): + validate_post_col(test_data, fake_path) # Using caplog to check the logged warning messages - assert "These postcodes are not found in the ONS postcode list: ['EFG 456', 'HIJ 789']" in caplog.text - assert "Invalid postcodes found: ['EFG 456', 'HIJ 789']" in caplog.text + assert ( + "These postcodes are not found in the ONS postcode list: ['HIJ 789', 'KL1M 2NO']" # noqa + in caplog.text + ) + assert "Invalid pattern postcodes found: ['HIJ 789']" in caplog.text - # Valid postcodes - df_valid = pd.DataFrame({"referencepostcode": ["AB12 3CD", "DE34 5FG", "HI67 8JK"]}) - assert validate_post_col(df_valid) + # Valid AND real postcodes + df_valid = 
pd.DataFrame({"referencepostcode": ["NP10 8XG", "PO15 5RR", "SW1P 4DF"]}) + assert validate_post_col(df_valid, fake_path) # Invalid postcodes df_invalid = pd.DataFrame({"referencepostcode": ["EFG 456", "HIJ 789"]}) with pytest.raises(ValueError) as error: - validate_post_col(df_invalid) + validate_post_col(df_invalid, fake_path) assert str(error.value) == "Invalid postcodes found: ['EFG 456', 'HIJ 789']" - # Mixed valid and invalid postcodes - df_mixed_valid_invalid = pd.DataFrame( - {"referencepostcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]} - ) + # Mixed valid and invalid postcodes - as is in the test_data with pytest.raises(ValueError) as error: - validate_post_col(df_mixed_valid_invalid) - assert (str(error.value) == "Invalid postcodes found: ['EFG 456']") + validate_post_col(test_data, fake_path) + assert str(error.value) == "Invalid postcodes found: ['HIJ 789', 'KL1M 2NO']" # Edge cases: invalid column names - df_invalid_column_name = pd.DataFrame( - {"postcode": ["AB12 3CD", "EFG 456", "HI67 8JK"]} - ) + df_invalid_column_name = test_data.rename(columns={"referencepostcode": "postcode"}) with pytest.raises(KeyError) as error: - validate_post_col(df_invalid_column_name) + validate_post_col(df_invalid_column_name, fake_path) assert str(error.value) == "'referencepostcode'" # Invalid column name # Edge cases: missing column - df_missing_column = pd.DataFrame({"other_column": ["value1", "value2", "value3"]}) + df_missing_column = test_data.drop("referencepostcode", axis=1) + df_missing_column["anothercolumn"] = ["val1", "val2", "val3", "val4"] with pytest.raises(KeyError) as error: - validate_post_col(df_missing_column) + validate_post_col(df_missing_column, fake_path) assert str(error.value) == "'referencepostcode'" # Missing column # Edge cases: missing DataFrame df_missing_dataframe = None - with pytest.raises(AttributeError): - validate_post_col(df_missing_dataframe) # Missing DataFrame + with pytest.raises(TypeError): + validate_post_col(df_missing_dataframe, fake_path) # Missing DataFrame # Edge cases: empty reference postcode column df_no_postcodes = pd.DataFrame({"referencepostcode": [""]}) with pytest.raises(ValueError): - validate_post_col(df_no_postcodes) # Empty postcode column - - -def test_validate_postcode(): - # Valid postcodes - assert validate_postcode("AB12 3CD") is True - assert validate_postcode("DE34 5FG") is False - assert validate_postcode("HI67 8JK") is True - - # Invalid postcodes - assert validate_postcode("EFG 456") is False - assert validate_postcode("HIJ 789") is False - assert validate_postcode("KL1M 2NO") is False - assert validate_postcode("B27 OAG") is False # Zero is actually an "O" - - # Edge cases - assert validate_postcode(None) is False # None value should fail - assert validate_postcode("") is False # Empty string - assert validate_postcode(" ") is False # Whitespace - assert validate_postcode("AB123CD") is False # Missing space - othewise valid - assert validate_postcode("ABC XYZ") is False # All letters but right length - assert validate_postcode("123 456") is False # All numbers but right length + validate_post_col(df_no_postcodes, fake_path) # Empty postcode column + + +# def test_validate_postcode(): +# # Valid postcodes +# assert validate_postcode("AB12 3CD") is True +# assert validate_postcode("DE34 5FG") is False +# assert validate_postcode("HI67 8JK") is True + +# # Invalid postcodes +# assert validate_postcode("EFG 456") is False +# assert validate_postcode("HIJ 789") is False +# assert validate_postcode("KL1M 2NO") is False +# assert 
validate_postcode("B27 OAG") is False # Zero is actually an "O" + +# # Edge cases +# assert validate_postcode(None) is False # None value should fail +# assert validate_postcode("") is False # Empty string +# assert validate_postcode(" ") is False # Whitespace +# assert validate_postcode("AB123CD") is False # Missing space - othewise valid +# assert validate_postcode("ABC XYZ") is False # All letters but right length +# assert validate_postcode("123 456") is False # All numbers but right length From e6dbfa276a93b00d806cd186bde19e39e7c6a5c2 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 22:46:11 +0100 Subject: [PATCH 113/411] Try specifiy work dir to pickup pyproject.toml --- .github/workflows/pytest-action.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 28e16f2e3..e8a309dbf 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -30,6 +30,7 @@ jobs: # 3) Run pytest to run all tests in the tests folder - name: Use coverage to run pytest # Specify shell to run the command in + working-directory: ${{ github.workspace }} shell: bash -l {0} run: | coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/main.py \ From 5e75d9490aa0460ffc081955e02716e9b6dbd811 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 22:47:46 +0100 Subject: [PATCH 114/411] Removing junit line --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bb8a5511b..5130b557c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,7 @@ profile = "black" [tool.pytest.ini_options] addopts = [ "-vv", - "--doctest-modules", -] + "--doctest-modules",] doctest_optionflags = "NORMALIZE_WHITESPACE" testpaths = [ "./tests" From 8e68c64b2f9ee47b6db4073ed5fb3cdea3f36425 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 23 May 2023 22:50:17 +0100 Subject: [PATCH 115/411] stop removing pydoop --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index e8a309dbf..e49a716eb 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -13,7 +13,7 @@ jobs: # 1) Checkout the code - uses: actions/checkout@v3 -# 2) Removing PyDoop from the environment yaml +# 2) Removing PyDoop from the requirements.txt - name: Remove pydoop requirements.txt shell: bash -l {0} run: | From cad4ad72104bc92b870bff62fba121f093a3fe81 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 24 May 2023 09:58:58 +0100 Subject: [PATCH 116/411] pytest action now with ignore flag --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index e49a716eb..4d468a0a9 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -27,7 +27,7 @@ jobs: # install dependencies from requirements.txt - name: Install dependencies run: pip install -r requirements.txt -# 3) Run pytest to run all tests in the tests folder +# 3) Run pytest to run all tests in the tests folder! 
- name: Use coverage to run pytest # Specify shell to run the command in working-directory: ${{ github.workspace }} From 051447d59d41b24641969837e24f3e7613ffda97 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 24 May 2023 10:00:50 +0100 Subject: [PATCH 117/411] Removing pydoop dependency again --- .github/workflows/pytest-action.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 4d468a0a9..c9ffb34f4 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -24,10 +24,10 @@ jobs: with: python-version: '3.6' -# install dependencies from requirements.txt +# 4) Install dependencies from requirements.txt - name: Install dependencies run: pip install -r requirements.txt -# 3) Run pytest to run all tests in the tests folder! +# 5) Run pytest to run all tests in the tests folder! - name: Use coverage to run pytest # Specify shell to run the command in working-directory: ${{ github.workspace }} @@ -36,7 +36,7 @@ jobs: coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/main.py \ -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ -o python_coverage.xml && coverage report -m --fail-under=10 -# 4) Get the coverage report in to the pull request comments +# 6) Get the coverage report in to the pull request comments - name: Pytest coverage comment uses: MishaKav/pytest-coverage-comment@main with: From 69c895affb3ada237fb78bde33ad2d678bbf5f09 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:16:22 +0100 Subject: [PATCH 118/411] Update log message --- src/data_validation/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 93842f18b..6682b4cbc 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -78,7 +78,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Log the invalid postcodes if not invalid_pattern_postcodes.empty: - logger.warning(f"Invalid postcodes found: {invalid_pattern_postcodes.to_list()}") + logger.warning(f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}") # Combine the two lists combined_invalid_postcodes = pd.concat([unreal_postcodes, invalid_pattern_postcodes]) From d63d1abb15bbb0cbff6e4b0546ae99f2cca34c68 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:16:47 +0100 Subject: [PATCH 119/411] Update docstring in test data --- tests/test_data_validation/test_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 7ace8d5a9..01033f359 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -7,6 +7,10 @@ @pytest.fixture # noqa def test_data(): + """'NP10 8XG', 'SW1P 4DF' are valid and real postcodes. 
'HIJ 789' is neither valid nor real + and 'KL1M 2NO' is a valid pattern but not real + + """ return pd.DataFrame( {"referencepostcode": ["NP10 8XG", "SW1P 4DF", "HIJ 789", "KL1M 2NO"]} ) From ca27a5e6eff9f92282674e764f9553bf537627b8 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:18:47 +0100 Subject: [PATCH 120/411] Removing duplicates --- src/data_validation/validation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 6682b4cbc..f50a5f57e 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -82,6 +82,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Combine the two lists combined_invalid_postcodes = pd.concat([unreal_postcodes, invalid_pattern_postcodes]) + combined_invalid_postcodes.drop_duplicates(inplace=True) if not combined_invalid_postcodes.empty: raise ValueError(f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}") From 12a9c3f4b7ed0c15b9cddcbb6152ef8df3a35c9c Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:23:27 +0100 Subject: [PATCH 121/411] Making a type error --- src/data_validation/validation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index f50a5f57e..3bb521dc5 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -64,6 +64,9 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: >>> validate_post_col(df, "example-path/to/masterlist.csv"") ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] """ + if not isinstance(df, pd.DataFrame): + raise TypeError(f"The dataframe you are attempting to validate is {type(df)}") + master_series = get_masterlist(masterlist_path) # Check if postcode are real From 0d7c3c2db753430253ed18feb364c1045ab7b707 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:27:57 +0100 Subject: [PATCH 122/411] Validate postcode pattern imported and tested --- tests/test_data_validation/test_validation.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 01033f359..cec3cc326 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,8 +1,6 @@ import pandas as pd import pytest -from src.data_validation.validation import validate_post_col - -# import validate_postcode_pattern, # noqa +from src.data_validation.validation import validate_post_col, validate_postcode_pattern # noqa @pytest.fixture # noqa @@ -82,22 +80,22 @@ def test_validate_post_col(test_data, monkeypatch, caplog): validate_post_col(df_no_postcodes, fake_path) # Empty postcode column -# def test_validate_postcode(): -# # Valid postcodes -# assert validate_postcode("AB12 3CD") is True -# assert validate_postcode("DE34 5FG") is False -# assert validate_postcode("HI67 8JK") is True - -# # Invalid postcodes -# assert validate_postcode("EFG 456") is False -# assert validate_postcode("HIJ 789") is False -# assert validate_postcode("KL1M 2NO") is False -# assert validate_postcode("B27 OAG") is False # Zero is actually an "O" - -# # Edge cases -# assert validate_postcode(None) is False # None value should fail -# assert validate_postcode("") is False # Empty string -# assert validate_postcode(" ") is False # Whitespace -# assert validate_postcode("AB123CD") is False # Missing 
space - othewise valid -# assert validate_postcode("ABC XYZ") is False # All letters but right length -# assert validate_postcode("123 456") is False # All numbers but right length +def test_validate_postcode(): + # Valid postcodes + assert validate_postcode_pattern("AB12 3CD") is True + assert validate_postcode_pattern("DE34 5FG") is False + assert validate_postcode_pattern("HI67 8JK") is True + + # Invalid postcodes + assert validate_postcode_pattern("EFG 456") is False + assert validate_postcode_pattern("HIJ 789") is False + assert validate_postcode_pattern("KL1M 2NO") is False + assert validate_postcode_pattern("B27 OAG") is False # Zero is actually an "O" + + # Edge cases + assert validate_postcode_pattern(None) is False # None value should fail + assert validate_postcode_pattern("") is False # Empty string + assert validate_postcode_pattern(" ") is False # Whitespace + assert validate_postcode_pattern("AB123CD") is False # Missing space - othewise valid + assert validate_postcode_pattern("ABC XYZ") is False # All letters but right length + assert validate_postcode_pattern("123 456") is False # All numbers but right length From 378a52986056b2a5b85835d55fe48efe0184bd3d Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:39:42 +0100 Subject: [PATCH 123/411] Changing failing boolean --- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index cec3cc326..f01b7533e 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -83,7 +83,7 @@ def test_validate_post_col(test_data, monkeypatch, caplog): def test_validate_postcode(): # Valid postcodes assert validate_postcode_pattern("AB12 3CD") is True - assert validate_postcode_pattern("DE34 5FG") is False + assert validate_postcode_pattern("DE34 5FG") is True assert validate_postcode_pattern("HI67 8JK") is True # Invalid postcodes From abd5a5d66bc205c40902ca69c1ccd7a0cadd4fc4 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:42:44 +0100 Subject: [PATCH 124/411] Remove one valid (but fake) pattern --- tests/test_data_validation/test_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index f01b7533e..5768818a8 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -89,7 +89,6 @@ def test_validate_postcode(): # Invalid postcodes assert validate_postcode_pattern("EFG 456") is False assert validate_postcode_pattern("HIJ 789") is False - assert validate_postcode_pattern("KL1M 2NO") is False assert validate_postcode_pattern("B27 OAG") is False # Zero is actually an "O" # Edge cases From 61c6fafb2a5daec760646a724f30599c3a4357e3 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:46:47 +0100 Subject: [PATCH 125/411] add none check --- src/data_validation/validation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 3bb521dc5..3fe98bc6c 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -24,6 +24,9 @@ def validate_postcode_pattern(pcode: str) -> bool: Returns: bool: True or False depending on if it is valid or not """ + if pcode is None: + return False + # Validation step valid_bool = postcodes_uk.validate(pcode) From 
57d93670bfe0bd85d7f863d440ff0a4a1726e360 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 24 May 2023 10:49:54 +0100 Subject: [PATCH 126/411] Moved missing space example to True/valid section --- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 5768818a8..eeb6c3046 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -83,6 +83,7 @@ def test_validate_post_col(test_data, monkeypatch, caplog): def test_validate_postcode(): # Valid postcodes assert validate_postcode_pattern("AB12 3CD") is True + assert validate_postcode_pattern("AB123CD") is True # Missing space - othewise valid assert validate_postcode_pattern("DE34 5FG") is True assert validate_postcode_pattern("HI67 8JK") is True @@ -95,6 +96,5 @@ def test_validate_postcode(): assert validate_postcode_pattern(None) is False # None value should fail assert validate_postcode_pattern("") is False # Empty string assert validate_postcode_pattern(" ") is False # Whitespace - assert validate_postcode_pattern("AB123CD") is False # Missing space - othewise valid assert validate_postcode_pattern("ABC XYZ") is False # All letters but right length assert validate_postcode_pattern("123 456") is False # All numbers but right length From 0e18bbc4b8c00d754216781d32f3c829b7a7fffd Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 25 May 2023 17:34:49 +0100 Subject: [PATCH 127/411] added path and switch to config --- src/developer_config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 620277255..622774e34 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -2,6 +2,7 @@ global: log_to_file: True # Write logs to .log file logging_level: "DEBUG" table_config: "SingleLine" + postcode_csv_check: False runlog_writer: write_csv: True # Write the runlog to a CSV file write_hdf5: False # Write the runlog to an HDF5 file @@ -11,6 +12,7 @@ runlog_writer: paths: logs_foldername: "testing_pydoop" snapshot_path: "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" + masterlist_path: "data/external/ONSPD_NOV_2022_UK.csv" csv_filenames: main: "main_runlog.csv" configs: "configs_runlog.csv" From 849381eb70a5f94b2f32600d6b47bbca72b9f82c Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 25 May 2023 17:35:16 +0100 Subject: [PATCH 128/411] fixed path issues in main --- src/main.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main.py b/src/main.py index 490263b77..e744ae09b 100644 --- a/src/main.py +++ b/src/main.py @@ -20,7 +20,7 @@ # load config conf_obj = Config_settings() config = conf_obj.config_dict -masterlist_path = config["masterlist_path"] +masterlist_path = config["paths"]["masterlist_path"] def run_pipeline(start): @@ -58,10 +58,9 @@ def run_pipeline(start): ) # Data validation - - + # Check the postcode column - validation.validate_post_col(contributers_df , postcode_master_list) + validation.validate_post_col(contributors_df, masterlist_path) # Outlier detection From e16fa81251ccdb3a2cdc2c682b032a6d291cce6c Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 25 May 2023 17:36:57 +0100 Subject: [PATCH 129/411] add switch to masterlist and fixed pd.series error --- src/data_validation/validation.py | 63 ++++++++++++++++++------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git 
a/src/data_validation/validation.py b/src/data_validation/validation.py index 3fe98bc6c..240546bd4 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -5,16 +5,15 @@ from src.utils.helpers import Config_settings -# Get the config +# Get the config conf_obj = Config_settings() config = conf_obj.config_dict global_config = config["global"] -# Set up logging +# Set up logging logger = logger_creator(global_config) - def validate_postcode_pattern(pcode: str) -> bool: """A function to validate UK postcodes which uses the @@ -26,30 +25,26 @@ def validate_postcode_pattern(pcode: str) -> bool: """ if pcode is None: return False - + # Validation step valid_bool = postcodes_uk.validate(pcode) return valid_bool + def get_masterlist(masterlist_path) -> pd.Series: """This function loads the masterlist of postcodes from a csv file Returns: pd.Series: The dataframe of postcodes """ - - masterlist = (pd.Series - (pd.read_csv - (masterlist_path, - usecols=["pcd"]))) - + masterlist = pd.read_csv(masterlist_path, usecols=["pcd"]).squeeze() return masterlist def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: - """This function checks if all postcodes in the specified DataFrame column - are valid UK postcodes. It uses the `validate_postcode` function to + """This function checks if all postcodes in the specified DataFrame column + are valid UK postcodes. It uses the `validate_postcode` function to perform the validation. Args: @@ -63,34 +58,50 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: The error message includes the list of invalid postcodes. Example: - >>> df = pd.DataFrame({"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) + >>> df = pd.DataFrame( + {"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) >>> validate_post_col(df, "example-path/to/masterlist.csv"") ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] """ if not isinstance(df, pd.DataFrame): raise TypeError(f"The dataframe you are attempting to validate is {type(df)}") - - master_series = get_masterlist(masterlist_path) - - # Check if postcode are real - unreal_postcodes = df.loc[~df["referencepostcode"].isin(master_series), "referencepostcode"] - + + if config["global"]["postcode_csv_check"]: + master_series = get_masterlist(masterlist_path) + + # Check if postcode are real + unreal_postcodes = df.loc[ + ~df["referencepostcode"].isin(master_series), "referencepostcode" + ] + else: + unreal_postcodes = pd.DataFrame([]) + # Log the unreal postcodes if not unreal_postcodes.empty: - logger.warning(f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}") - + logger.warning( + f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}" # noqa + ) + # Check if postcodes match pattern - invalid_pattern_postcodes = df.loc[~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode"] + invalid_pattern_postcodes = df.loc[ + ~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode" + ] # Log the invalid postcodes if not invalid_pattern_postcodes.empty: - logger.warning(f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}") + logger.warning( + f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}" + ) # Combine the two lists - combined_invalid_postcodes = pd.concat([unreal_postcodes, invalid_pattern_postcodes]) + combined_invalid_postcodes = pd.concat( + [unreal_postcodes, 
invalid_pattern_postcodes] + ) combined_invalid_postcodes.drop_duplicates(inplace=True) if not combined_invalid_postcodes.empty: - raise ValueError(f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}") - + raise ValueError( + f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}" + ) + return True From ae667d9f4b5314bcc99c0390c99b86d3b14b023d Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 25 May 2023 17:58:32 +0100 Subject: [PATCH 130/411] Fixed tests issue (WIP) --- src/data_validation/validation.py | 5 ++++- src/developer_config.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 240546bd4..fc59aee36 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -74,7 +74,10 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: ~df["referencepostcode"].isin(master_series), "referencepostcode" ] else: - unreal_postcodes = pd.DataFrame([]) + emptydf = pd.DataFrame(columns=["referencepostcode"]) + unreal_postcodes = emptydf.loc[ + ~emptydf["referencepostcode"], "referencepostcode" + ] # Log the unreal postcodes if not unreal_postcodes.empty: diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 622774e34..71c07c508 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -2,7 +2,7 @@ global: log_to_file: True # Write logs to .log file logging_level: "DEBUG" table_config: "SingleLine" - postcode_csv_check: False + postcode_csv_check: True runlog_writer: write_csv: True # Write the runlog to a CSV file write_hdf5: False # Write the runlog to an HDF5 file From 24b92130e1010d3a55ae521b45e0640301364705 Mon Sep 17 00:00:00 2001 From: jwestw Date: Mon, 5 Jun 2023 18:45:32 +0100 Subject: [PATCH 131/411] Split out the function to check postcodes are real --- src/data_validation/validation.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index fc59aee36..78dfd5c64 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -66,18 +66,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: if not isinstance(df, pd.DataFrame): raise TypeError(f"The dataframe you are attempting to validate is {type(df)}") - if config["global"]["postcode_csv_check"]: - master_series = get_masterlist(masterlist_path) - - # Check if postcode are real - unreal_postcodes = df.loc[ - ~df["referencepostcode"].isin(master_series), "referencepostcode" - ] - else: - emptydf = pd.DataFrame(columns=["referencepostcode"]) - unreal_postcodes = emptydf.loc[ - ~emptydf["referencepostcode"], "referencepostcode" - ] + unreal_postcodes = check_pcs_real(df, masterlist_path) # Log the unreal postcodes if not unreal_postcodes.empty: @@ -108,3 +97,21 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: ) return True + +def check_pcs_real(df, masterlist_path): + """Checks if the postcodes are real against a masterlist of actual postcodes + """ + if config["global"]["postcode_csv_check"]: + master_series = get_masterlist(masterlist_path) + + # Check if postcode are real + unreal_postcodes = df.loc[ + ~df["referencepostcode"].isin(master_series), "referencepostcode" + ] + else: + emptydf = pd.DataFrame(columns=["referencepostcode"]) + unreal_postcodes = emptydf.loc[ + ~emptydf["referencepostcode"], "referencepostcode" + ] + + return 
unreal_postcodes From 67b0430a97e3a73ad38f1f51059e9654864068cc Mon Sep 17 00:00:00 2001 From: jwestw Date: Mon, 5 Jun 2023 19:00:51 +0100 Subject: [PATCH 132/411] Types --- src/data_validation/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 78dfd5c64..c3d614281 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -98,7 +98,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: return True -def check_pcs_real(df, masterlist_path): +def check_pcs_real(df: pd.DataFrame, masterlist_path: str): """Checks if the postcodes are real against a masterlist of actual postcodes """ if config["global"]["postcode_csv_check"]: From 0e53b06d56cf7de145baedb4511284ec4355451a Mon Sep 17 00:00:00 2001 From: jwestw Date: Mon, 5 Jun 2023 19:01:04 +0100 Subject: [PATCH 133/411] Couple of tests for new function --- tests/test_data_validation/test_validation.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index eeb6c3046..93becbc5f 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from src.data_validation.validation import validate_post_col, validate_postcode_pattern # noqa +from src.data_validation.validation import validate_post_col, validate_postcode_pattern, check_pcs_real# noqa @pytest.fixture # noqa @@ -98,3 +98,16 @@ def test_validate_postcode(): assert validate_postcode_pattern(" ") is False # Whitespace assert validate_postcode_pattern("ABC XYZ") is False # All letters but right length assert validate_postcode_pattern("123 456") is False # All numbers but right length + + +def test_check_pcs_real_with_invalid_postcodes(test_data): + masterlist_path = "path/to/masterlist.csv" + unreal_postcodes = check_pcs_real(test_data, masterlist_path) + expected_unreal_postcodes = pd.DataFrame({"referencepostcode": ["HIJ 789", "KL1M 2NO"]}) + pd.testing.assert_frame_equal(unreal_postcodes, expected_unreal_postcodes) # Assert that the unreal postcodes match the expected ones + + +def test_check_pcs_real_with_valid_postcodes(test_data): + masterlist_path = "path/to/masterlist.csv" + unreal_postcodes = check_pcs_real(test_data, masterlist_path) + assert unreal_postcodes.str.contains(["NP10 8XG", "SW1P 4DF"]).any() is False # Assert that the real postcodes are not in the unreal postcodes \ No newline at end of file From 708024f0a04c477360aace0aca081106a30ce633 Mon Sep 17 00:00:00 2001 From: jwestw Date: Tue, 6 Jun 2023 18:15:19 +0100 Subject: [PATCH 134/411] Monkey patching the get masterlist function --- tests/test_data_validation/test_validation.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 93becbc5f..74d86f659 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -100,14 +100,28 @@ def test_validate_postcode(): assert validate_postcode_pattern("123 456") is False # All numbers but right length -def test_check_pcs_real_with_invalid_postcodes(test_data): - masterlist_path = "path/to/masterlist.csv" +def test_check_pcs_real_with_invalid_postcodes(test_data, monkeypatch): + # Monkeypatch the get_masterlist function to use 
the mock implementation + monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) + + # Use the fake path + masterlist_path = "path/to/mock_masterlist.csv" + + # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) + expected_unreal_postcodes = pd.DataFrame({"referencepostcode": ["HIJ 789", "KL1M 2NO"]}) + pd.testing.assert_frame_equal(unreal_postcodes, expected_unreal_postcodes) # Assert that the unreal postcodes match the expected ones -def test_check_pcs_real_with_valid_postcodes(test_data): +def test_check_pcs_real_with_valid_postcodes(test_data, monkeypatch): + # Monkeypatch the get_masterlist function to use the mock implementation + monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) + + # Use the fake path masterlist_path = "path/to/masterlist.csv" + + # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) assert unreal_postcodes.str.contains(["NP10 8XG", "SW1P 4DF"]).any() is False # Assert that the real postcodes are not in the unreal postcodes \ No newline at end of file From 0bdc348aeeeb257d2d4844361ca6c25f49011fcd Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 6 Jun 2023 19:35:43 +0100 Subject: [PATCH 135/411] Comment to explain real postcode test --- tests/test_data_validation/test_validation.py | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 74d86f659..62c9afff8 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,14 +1,17 @@ import pandas as pd import pytest -from src.data_validation.validation import validate_post_col, validate_postcode_pattern, check_pcs_real# noqa +from src.data_validation.validation import ( + validate_post_col, + validate_postcode_pattern, + check_pcs_real, +) # noqa @pytest.fixture # noqa def test_data(): - """'NP10 8XG', 'SW1P 4DF' are valid and real postcodes. 'HIJ 789' is neither valid nor real - and 'KL1M 2NO' is a valid pattern but not real - - """ + """'NP10 8XG', 'SW1P 4DF' are valid and real postcodes. 
+ 'HIJ 789' is neither valid nor real + and 'KL1M 2NO' is a valid pattern but not real""" return pd.DataFrame( {"referencepostcode": ["NP10 8XG", "SW1P 4DF", "HIJ 789", "KL1M 2NO"]} ) @@ -83,7 +86,9 @@ def test_validate_post_col(test_data, monkeypatch, caplog): def test_validate_postcode(): # Valid postcodes assert validate_postcode_pattern("AB12 3CD") is True - assert validate_postcode_pattern("AB123CD") is True # Missing space - othewise valid + assert ( + validate_postcode_pattern("AB123CD") is True + ) # Missing space - othewise valid assert validate_postcode_pattern("DE34 5FG") is True assert validate_postcode_pattern("HI67 8JK") is True @@ -102,26 +107,37 @@ def test_validate_postcode(): def test_check_pcs_real_with_invalid_postcodes(test_data, monkeypatch): # Monkeypatch the get_masterlist function to use the mock implementation - monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) + monkeypatch.setattr( + "src.data_validation.validation.get_masterlist", mock_get_masterlist + ) # Use the fake path masterlist_path = "path/to/mock_masterlist.csv" - + # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) - expected_unreal_postcodes = pd.DataFrame({"referencepostcode": ["HIJ 789", "KL1M 2NO"]}) - - pd.testing.assert_frame_equal(unreal_postcodes, expected_unreal_postcodes) # Assert that the unreal postcodes match the expected ones + expected_unreal_postcodes = pd.DataFrame( + {"referencepostcode": ["HIJ 789", "KL1M 2NO"]} + ) + + pd.testing.assert_frame_equal( + unreal_postcodes, expected_unreal_postcodes + ) # Assert that the unreal postcodes match the expected ones def test_check_pcs_real_with_valid_postcodes(test_data, monkeypatch): # Monkeypatch the get_masterlist function to use the mock implementation - monkeypatch.setattr("src.data_validation.validation.get_masterlist", mock_get_masterlist) - + monkeypatch.setattr( + "src.data_validation.validation.get_masterlist", mock_get_masterlist + ) + # Use the fake path masterlist_path = "path/to/masterlist.csv" - + # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) - assert unreal_postcodes.str.contains(["NP10 8XG", "SW1P 4DF"]).any() is False # Assert that the real postcodes are not in the unreal postcodes \ No newline at end of file + # NP10 8XG and SW1P 4DF are real. 
Should not be presentin unreal_postcode + assert ( + unreal_postcodes.str.contains("NP10 8XG|SW1P 4DF").any() is False + ) # Assert that the real postcodes are not in the unreal postcodes From d3e979b4a328fbf59d46c5de087a048db13adb60 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 19:52:39 +0100 Subject: [PATCH 136/411] Sorting series issues in tests --- tests/test_data_validation/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 62c9afff8..9b3dc94c9 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -116,12 +116,12 @@ def test_check_pcs_real_with_invalid_postcodes(test_data, monkeypatch): # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) - - expected_unreal_postcodes = pd.DataFrame( - {"referencepostcode": ["HIJ 789", "KL1M 2NO"]} + unreal_postcodes = unreal_postcodes.reset_index(drop=True) + expected_unreal_postcodes = pd.Series( + ["HIJ 789", "KL1M 2NO"], name="referencepostcode" ) - pd.testing.assert_frame_equal( + pd.testing.assert_series_equal( unreal_postcodes, expected_unreal_postcodes ) # Assert that the unreal postcodes match the expected ones From 28fa2dfb4d678bcd17f6f95df396e7b280a46433 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 20:00:14 +0100 Subject: [PATCH 137/411] fixing bool issues --- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 9b3dc94c9..93461921d 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -139,5 +139,5 @@ def test_check_pcs_real_with_valid_postcodes(test_data, monkeypatch): unreal_postcodes = check_pcs_real(test_data, masterlist_path) # NP10 8XG and SW1P 4DF are real. Should not be presentin unreal_postcode assert ( - unreal_postcodes.str.contains("NP10 8XG|SW1P 4DF").any() is False + bool(unreal_postcodes.isin(["NP10 8XG", "SW1P 4DF"]).any()) is False ) # Assert that the real postcodes are not in the unreal postcodes From 94138dfb8da3c8a26d4656e09f2f5e6ea362836b Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 6 Jun 2023 20:27:29 +0100 Subject: [PATCH 138/411] Adding postcodes lib to reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 0b68aaafb..b79d95fd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ coverage pyyaml requests sphinx +postcodes_uk From 10d82431f20d9c63c4c6b5967c27a0410dae422e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 6 Jun 2023 20:30:08 +0100 Subject: [PATCH 139/411] Simplfying pyproj toml. 
Removing pytest conf --- pyproject.toml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5130b557c..e14526905 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,13 +12,3 @@ exclude_lines = [ # `isort` configurations [tool.isort] profile = "black" - -# `pytest` configurations -[tool.pytest.ini_options] -addopts = [ - "-vv", - "--doctest-modules",] -doctest_optionflags = "NORMALIZE_WHITESPACE" -testpaths = [ - "./tests" -] From d386c5d221cfa11eac8c3e44cb660f0f9552eacf Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 6 Jun 2023 20:32:35 +0100 Subject: [PATCH 140/411] Deleting pyproj --- pyproject.toml | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index e14526905..000000000 --- a/pyproject.toml +++ /dev/null @@ -1,14 +0,0 @@ -# `coverage` configurations -[tool.coverage.run] -source = [ - "./src" -] - -[tool.coverage.report] -exclude_lines = [ - "if __name__ == .__main__.:" -] - -# `isort` configurations -[tool.isort] -profile = "black" From d98d14498fafb2f08d896c88466ace33944cab3b Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:07:43 +0100 Subject: [PATCH 141/411] Importing loading and validation. Loading dfs --- src/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.py b/src/main.py index e744ae09b..a76c1c396 100644 --- a/src/main.py +++ b/src/main.py @@ -58,6 +58,7 @@ def run_pipeline(start): ) # Data validation + validation.validate_postcode # Check the postcode column validation.validate_post_col(contributors_df, masterlist_path) From e2a3f6d381bef472d99a71f247074d0bd8984aa2 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Tue, 16 May 2023 21:24:36 +0100 Subject: [PATCH 142/411] validating whole col --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index a76c1c396..dd93540a4 100644 --- a/src/main.py +++ b/src/main.py @@ -58,7 +58,7 @@ def run_pipeline(start): ) # Data validation - validation.validate_postcode + validation.validate_post_col(contributers_df) # Check the postcode column validation.validate_post_col(contributors_df, masterlist_path) From 77eee40c4f98df46d7fec64c1f0f284353f031f3 Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 18 May 2023 17:40:30 +0100 Subject: [PATCH 143/411] New reality checker --- src/data_validation/validation.py | 1 - src/main.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c3d614281..304f61dd5 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -31,7 +31,6 @@ def validate_postcode_pattern(pcode: str) -> bool: return valid_bool - def get_masterlist(masterlist_path) -> pd.Series: """This function loads the masterlist of postcodes from a csv file diff --git a/src/main.py b/src/main.py index dd93540a4..e744ae09b 100644 --- a/src/main.py +++ b/src/main.py @@ -58,7 +58,6 @@ def run_pipeline(start): ) # Data validation - validation.validate_post_col(contributers_df) # Check the postcode column validation.validate_post_col(contributors_df, masterlist_path) From 7ccd9c15f0f165f9c005515eff9b843bb83bb459 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 16 May 2023 12:16:34 +0100 Subject: [PATCH 144/411] Formatted log outputs work in HDFS --- src/utils/runlog.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/utils/runlog.py b/src/utils/runlog.py index 176cbb27f..58346c390 100644 --- 
a/src/utils/runlog.py +++ b/src/utils/runlog.py @@ -75,10 +75,6 @@ def retrieve_pipeline_logs(self): return self def retrieve_configs(self): - """Retrieve the config settings for each run - whilst ignoring the top level keys. This can then be saved - in a column readable format. - """ with open("src/developer_config.yaml", "r") as file: self.configdata = yaml.load(file, Loader=yaml.FullLoader) # Convert the YAML data to a Pandas DataFrame From bd72240d3fd98d503e91b04dca11b33a57809eb4 Mon Sep 17 00:00:00 2001 From: allmag Date: Thu, 18 May 2023 12:32:41 +0100 Subject: [PATCH 145/411] Fixed misspell --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index e744ae09b..5feb333d6 100644 --- a/src/main.py +++ b/src/main.py @@ -80,7 +80,7 @@ def run_pipeline(start): # Data output: File Outputs - MainLogger.info("Finshing Pipeline .......................") + MainLogger.info("Finishing Pipeline .......................") runlog_obj.retrieve_pipeline_logs() From 2816b53c729b72255d05b4e92cd019ea0f852012 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 18 May 2023 18:43:36 +0100 Subject: [PATCH 146/411] Adding custom wrappers and loggers to pipeline --- src/data_ingest/spp_parser.py | 13 +++++++++++-- src/data_validation/validation.py | 20 ++++++++++++++------ src/main.py | 8 ++------ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index cfe0d68c7..cff9c793a 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -1,7 +1,14 @@ import pandas as pd from typing import Tuple +from src.utils.wrappers import exception_wrap, time_logger_wrap +import logging +LoadingLogger = logging.getLogger(__name__) +LoadingLogger.setLevel(logging.INFO) + +@exception_wrap +@time_logger_wrap def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: """Loads the data from the survey via the SPP snapshot. The data is supplied as dict and is parsed into dataframes, one for survey contributers (company details) @@ -16,9 +23,11 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: # Load the dicts! 
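# The snapshot dict is assumed to hold two lists of records, roughly
# {"contributors": [{...}, ...], "responses": [{...}, ...]}, so the
# pd.DataFrame(...) calls below build one row per contributor / response.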
contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] - + # Make dataframes contributors_df = pd.DataFrame(contributordict) responses_df = pd.DataFrame(responsesdict) - + + LoadingLogger.info("SPP Snapshot data successfully loaded...") + return contributors_df, responses_df diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 304f61dd5..c7623b164 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,19 +1,21 @@ import postcodes_uk import pandas as pd -from src.utils.wrappers import logger_creator +from src.utils.wrappers import time_logger_wrap, exception_wrap +import logging + from src.utils.helpers import Config_settings # Get the config conf_obj = Config_settings() config = conf_obj.config_dict -global_config = config["global"] -# Set up logging -logger = logger_creator(global_config) +ValidationLogger = logging.getLogger(__name__) +ValidationLogger.setLevel(logging.INFO) +@time_logger_wrap def validate_postcode_pattern(pcode: str) -> bool: """A function to validate UK postcodes which uses the @@ -31,6 +33,8 @@ def validate_postcode_pattern(pcode: str) -> bool: return valid_bool + +@exception_wrap def get_masterlist(masterlist_path) -> pd.Series: """This function loads the masterlist of postcodes from a csv file @@ -41,6 +45,8 @@ def get_masterlist(masterlist_path) -> pd.Series: return masterlist +@time_logger_wrap +@exception_wrap def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: """This function checks if all postcodes in the specified DataFrame column are valid UK postcodes. It uses the `validate_postcode` function to @@ -69,7 +75,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Log the unreal postcodes if not unreal_postcodes.empty: - logger.warning( + ValidationLogger.warning( f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}" # noqa ) @@ -80,7 +86,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Log the invalid postcodes if not invalid_pattern_postcodes.empty: - logger.warning( + ValidationLogger.warning( f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}" ) @@ -95,6 +101,8 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}" ) + ValidationLogger.info("All postcodes validated....") + return True def check_pcs_real(df: pd.DataFrame, masterlist_path: str): diff --git a/src/main.py b/src/main.py index 5feb333d6..a5738fc1a 100644 --- a/src/main.py +++ b/src/main.py @@ -30,13 +30,8 @@ def run_pipeline(start): start (float): The time when the pipeline is launched generated from the time module using time.time() """ - - # Get the config seetings - conf_obj = Config_settings() - config = conf_obj.config_dict - global_config = config["global"] - # Set up the run logger + global_config = config["global"] runlog_obj = runlog.RunLog(config, version) logger = logger_creator(global_config) @@ -58,6 +53,7 @@ def run_pipeline(start): ) # Data validation + MainLogger.info("Starting Data Validation...") # Check the postcode column validation.validate_post_col(contributors_df, masterlist_path) From 067429cab77417e88bbeac8d2db01baf6e2a9760 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Mon, 22 May 2023 10:51:52 +0100 Subject: [PATCH 147/411] simplifying setup in modules --- src/data_ingest/spp_parser.py | 1 - src/data_validation/validation.py | 1 - src/main.py | 1 - 3 files 
changed, 3 deletions(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index cff9c793a..825974486 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -5,7 +5,6 @@ import logging LoadingLogger = logging.getLogger(__name__) -LoadingLogger.setLevel(logging.INFO) @exception_wrap @time_logger_wrap diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c7623b164..89c7b908a 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -12,7 +12,6 @@ config = conf_obj.config_dict ValidationLogger = logging.getLogger(__name__) -ValidationLogger.setLevel(logging.INFO) @time_logger_wrap diff --git a/src/main.py b/src/main.py index a5738fc1a..801e9bd35 100644 --- a/src/main.py +++ b/src/main.py @@ -14,7 +14,6 @@ MainLogger = logging.getLogger(__name__) -MainLogger.setLevel(logging.INFO) # load config From bf0ab6f04d72fd7add45ed59bd42855aeb7333f8 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 24 May 2023 14:57:42 +0100 Subject: [PATCH 148/411] fixing wrapper tuple issue --- src/data_validation/validation.py | 2 -- src/utils/testfunctions.py | 4 ++-- src/utils/wrappers.py | 7 +++---- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 89c7b908a..55dbbf3ea 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,6 +1,5 @@ import postcodes_uk import pandas as pd - from src.utils.wrappers import time_logger_wrap, exception_wrap import logging @@ -14,7 +13,6 @@ ValidationLogger = logging.getLogger(__name__) -@time_logger_wrap def validate_postcode_pattern(pcode: str) -> bool: """A function to validate UK postcodes which uses the diff --git a/src/utils/testfunctions.py b/src/utils/testfunctions.py index d223f4b47..b932e2092 100644 --- a/src/utils/testfunctions.py +++ b/src/utils/testfunctions.py @@ -72,9 +72,9 @@ def addition(a: int, b: int): class Manipulate_data: def __init__(self): - self.vf_df = self.create_dummy_df()[0] + self.vf_df = self.create_dummy_df() self.table_config = "SingleLine" - self.df = self.manipulate_df()[0] + self.df = self.manipulate_df() @time_logger_wrap @exception_wrap diff --git a/src/utils/wrappers.py b/src/utils/wrappers.py index eafa2520e..ee8616983 100644 --- a/src/utils/wrappers.py +++ b/src/utils/wrappers.py @@ -2,7 +2,6 @@ from functools import wraps from time import perf_counter import traceback - from table_logger import TableLogger import logging.config @@ -40,8 +39,8 @@ def decorator(*args, **kwargs): """Define the decorator itself.""" enter_time = starting_time() result = func(*args, **kwargs) - time_taken = finishing_time(func, enter_time) - return result, time_taken + finishing_time(func, enter_time) + return result return decorator @@ -160,7 +159,7 @@ def call(*args, **kwargs): def df_measure_change(df, rows_before, cols_before, table_config): """Log the change in a dataframe caused by a function.""" - shape = df[0].shape + shape = df.shape rows_after, cols_after = shape[0], shape[1] def _change_direction(before, after): From d82b20fc57cb15a03fac014d3ecfedd6c12c2ca3 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 25 May 2023 16:49:30 +0100 Subject: [PATCH 149/411] Modifying logger to add user --- src/utils/runlog.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/utils/runlog.py b/src/utils/runlog.py index 58346c390..38b400754 100644 --- a/src/utils/runlog.py +++ 
b/src/utils/runlog.py @@ -21,12 +21,19 @@ class RunLog: """Creates a runlog instance for the pipeline.""" def __init__(self, config, version): + self.user = self._generate_username() self.config = config self.run_id = self._create_run_id() self.version = version self.logs = [] self.timestamp = self._generate_time() + def _generate_username(self): + """Record the username of the user running the pipeline""" + self.context = os.getenv("HADOOP_USER_NAME") + + return self.context + def _create_run_id(self): """Create a unique run_id from the previous iteration""" # Import name of main log file @@ -65,13 +72,17 @@ def retrieve_pipeline_logs(self): f = open("logs/main.log", "r") lines = f.read().splitlines() self.runids = {"run_id": self.run_id} + self.users = {"user": self.user} for line in lines: self.logs.append(line.split(" - ")) self.runids.update({"run_id": self.run_id}) + self.users.update({"user": self.user}) self.saved_logs = pd.DataFrame( self.logs, columns=["timestamp", "module", "function", "message"] ) self.saved_logs.insert(0, "run_id", self.runids["run_id"]) + self.saved_logs.insert(1, "user", self.users["user"]) + return self def retrieve_configs(self): @@ -85,6 +96,8 @@ def retrieve_configs(self): self.ndct.update(nrow) self.configdf = pd.DataFrame(self.ndct) self.configdf.insert(0, "run_id", self.runids["run_id"]) + self.configdf.insert(1, "user", self.users["user"]) + return self def _generate_time(self): @@ -99,6 +112,7 @@ def _create_runlog_dicts(self): self.runlog_main_dict = { "run_id": self.run_id, + "user": self.user, "timestamp": self.timestamp, "version": self.version, "time_taken": self.time_taken, @@ -110,7 +124,7 @@ def _create_runlog_dfs(self): """Convert dictionaries to pandas dataframes.""" self.runlog_main_df = pd.DataFrame( [self.runlog_main_dict], - columns=["run_id", "timestamp", "version", "time_taken"], + columns=["run_id", "user", "timestamp", "version", "time_taken"], ) self.runlog_configs_df = self.configdf @@ -153,7 +167,7 @@ def create_runlog_files(self): if they don't already exist. 
""" - main_columns = ["run_id", "timestamp", "version", "time_taken"] + main_columns = ["run_id", "user", "timestamp", "version", "time_taken"] file_name = csv_filenames["main"] file_path = f"{main_path}/{file_name}" self.hdfs_csv_creator(file_path, main_columns) @@ -163,7 +177,7 @@ def create_runlog_files(self): file_path = f"{main_path}/{file_name}" self.hdfs_csv_creator(file_path, config_columns) - log_columns = ["run_id", "timestamp", "module", "function", "message"] + log_columns = ["run_id", "user", "timestamp", "module", "function", "message"] file_name = csv_filenames["logs"] file_path = f"{main_path}/{file_name}" self.hdfs_csv_creator(file_path, log_columns) From 8d39d76d18c006da9fb86bc75866d1c236379ced Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 10:45:16 +0100 Subject: [PATCH 150/411] Set pcode check to false to stop validating fake pcodes --- src/developer_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 71c07c508..622774e34 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -2,7 +2,7 @@ global: log_to_file: True # Write logs to .log file logging_level: "DEBUG" table_config: "SingleLine" - postcode_csv_check: True + postcode_csv_check: False runlog_writer: write_csv: True # Write the runlog to a CSV file write_hdf5: False # Write the runlog to an HDF5 file From 7b38d57e1cf0751e4fa313bc2fe0b2f556bc2c4d Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 10:45:50 +0100 Subject: [PATCH 151/411] Cleaning up pipeline paths --- src/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main.py b/src/main.py index 801e9bd35..c824701b2 100644 --- a/src/main.py +++ b/src/main.py @@ -19,7 +19,6 @@ # load config conf_obj = Config_settings() config = conf_obj.config_dict -masterlist_path = config["paths"]["masterlist_path"] def run_pipeline(start): @@ -39,22 +38,24 @@ def run_pipeline(start): Manipulate_data() # Data Ingest + MainLogger.info("Starting Data Ingest...") # Load SPP data from DAP snapshot_path = config["paths"]["snapshot_path"] snapdata = hdfs_load_json(snapshot_path) contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) + MainLogger.info("Finished Data Ingest...") + # Data Transmutation + MainLogger.info("Starting Data Transmutation...") full_responses = processing.full_responses(contributors_df, responses_df) print(full_responses.sample(5)) - logger.info( - "The response rate is %.3%", - processing.response_rate(contributors_df, responses_df), - ) + processing.response_rate(contributors_df, responses_df) + MainLogger.info("Finished Data Transmutation...") # Data validation MainLogger.info("Starting Data Validation...") - # Check the postcode column + masterlist_path = config["paths"]["masterlist_path"] validation.validate_post_col(contributors_df, masterlist_path) # Outlier detection From a230c024b121a0f5b31bf7ff55b5856b090fa01f Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 10:46:14 +0100 Subject: [PATCH 152/411] apply logger and wrappers to spp parser --- src/data_ingest/spp_parser.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index 825974486..6e1340e17 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -4,7 +4,8 @@ from src.utils.wrappers import exception_wrap, time_logger_wrap import logging -LoadingLogger = logging.getLogger(__name__) 
+spp_parser_logger = logging.getLogger(__name__) + @exception_wrap @time_logger_wrap @@ -22,11 +23,11 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: # Load the dicts! contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] - + # Make dataframes contributors_df = pd.DataFrame(contributordict) responses_df = pd.DataFrame(responsesdict) - - LoadingLogger.info("SPP Snapshot data successfully loaded...") - + + spp_parser_logger.info("SPP Snapshot data successfully loaded...") + return contributors_df, responses_df From 60ed57be31075fe6e7b998929df42f81a07fce08 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 10:46:54 +0100 Subject: [PATCH 153/411] clean up processing mod and add logger --- src/data_processing/spp_snapshot_processing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index f92b33781..1e6deda7c 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -1,8 +1,6 @@ -from src.utils.helpers import Config_settings +import logging -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["paths"]["snapshot_path"] # Taken from config file +spp_processing_logger = logging.getLogger(__name__) def full_responses(contributors, responses): @@ -61,4 +59,6 @@ def response_rate(contributors, responses): response_rate = no_responses / no_contributors + spp_processing_logger.info(f"The SPP response rate is {round(response_rate,2)}%") + return response_rate From 9dfc2b2b5fbe6ecb9aff05baca098b7788c5ebd7 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 10:47:39 +0100 Subject: [PATCH 154/411] Fixed tests failing when csv check is set to false --- tests/test_data_validation/test_validation.py | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 93461921d..2151300f4 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -5,6 +5,11 @@ validate_postcode_pattern, check_pcs_real, ) # noqa +from src.utils.helpers import Config_settings + +# Get the config +conf_obj = Config_settings() +config = conf_obj.config_dict @pytest.fixture # noqa @@ -38,11 +43,15 @@ def test_validate_post_col(test_data, monkeypatch, caplog): validate_post_col(test_data, fake_path) # Using caplog to check the logged warning messages - assert ( - "These postcodes are not found in the ONS postcode list: ['HIJ 789', 'KL1M 2NO']" # noqa - in caplog.text - ) - assert "Invalid pattern postcodes found: ['HIJ 789']" in caplog.text + if config["global"]["postcode_csv_check"]: + + assert ( + "These postcodes are not found in the ONS postcode list: ['HIJ 789', 'KL1M 2NO']" # noqa + in caplog.text + ) + + else: + assert "Invalid pattern postcodes found: ['HIJ 789']" in caplog.text # Valid AND real postcodes df_valid = pd.DataFrame({"referencepostcode": ["NP10 8XG", "PO15 5RR", "SW1P 4DF"]}) @@ -57,7 +66,10 @@ def test_validate_post_col(test_data, monkeypatch, caplog): # Mixed valid and invalid postcodes - as is in the test_data with pytest.raises(ValueError) as error: validate_post_col(test_data, fake_path) - assert str(error.value) == "Invalid postcodes found: ['HIJ 789', 'KL1M 2NO']" + if config["global"]["postcode_csv_check"]: + assert str(error.value) == "Invalid 
postcodes found: ['HIJ 789', 'KL1M 2NO']" + else: + assert str(error.value) == "Invalid postcodes found: ['HIJ 789']" # Edge cases: invalid column names df_invalid_column_name = test_data.rename(columns={"referencepostcode": "postcode"}) @@ -106,6 +118,7 @@ def test_validate_postcode(): def test_check_pcs_real_with_invalid_postcodes(test_data, monkeypatch): + # Monkeypatch the get_masterlist function to use the mock implementation monkeypatch.setattr( "src.data_validation.validation.get_masterlist", mock_get_masterlist @@ -117,9 +130,15 @@ def test_check_pcs_real_with_invalid_postcodes(test_data, monkeypatch): # Call the function under test unreal_postcodes = check_pcs_real(test_data, masterlist_path) unreal_postcodes = unreal_postcodes.reset_index(drop=True) - expected_unreal_postcodes = pd.Series( - ["HIJ 789", "KL1M 2NO"], name="referencepostcode" - ) + if config["global"]["postcode_csv_check"]: + + expected_unreal_postcodes = pd.Series( + ["HIJ 789", "KL1M 2NO"], name="referencepostcode" + ) + else: + expected_unreal_postcodes = pd.Series( + [], name="referencepostcode", dtype=object + ) pd.testing.assert_series_equal( unreal_postcodes, expected_unreal_postcodes From 1329d8afc588df3af94f155c3e62f11cff07f6ef Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Wed, 7 Jun 2023 12:58:27 +0100 Subject: [PATCH 155/411] adding explanatory comments --- src/utils/runlog.py | 18 +++++++++++++----- src/utils/wrappers.py | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/utils/runlog.py b/src/utils/runlog.py index 38b400754..76bb03fca 100644 --- a/src/utils/runlog.py +++ b/src/utils/runlog.py @@ -13,8 +13,10 @@ context = os.getenv("HADOOP_USER_NAME") # Put your context name here project = config["paths"]["logs_foldername"] # Taken from config file -main_path = f"/user/{context}/{project}" -hdfs.mkdir(main_path) +main_path = ( + f"/user/{context}/{project}" # stored in the personal space of the user for now +) +hdfs.mkdir(main_path) # creates the folder if it doesn't exist class RunLog: @@ -29,7 +31,9 @@ def __init__(self, config, version): self.timestamp = self._generate_time() def _generate_username(self): - """Record the username of the user running the pipeline""" + """Record the username of the user running the pipeline + using os package""" + # Use the Hadoop Username to record user self.context = os.getenv("HADOOP_USER_NAME") return self.context @@ -59,7 +63,6 @@ def _record_time_taken(self, time_taken): This is for the total pipeline run time, not the time taken for each step. """ - self.time_taken = time_taken return self.time_taken @@ -70,9 +73,11 @@ def retrieve_pipeline_logs(self): and append them to self.saved_logs df. 
""" f = open("logs/main.log", "r") + # Split logs by line lines = f.read().splitlines() self.runids = {"run_id": self.run_id} self.users = {"user": self.user} + # Add run_id and user to logs for line in lines: self.logs.append(line.split(" - ")) self.runids.update({"run_id": self.run_id}) @@ -86,15 +91,18 @@ def retrieve_pipeline_logs(self): return self def retrieve_configs(self): + """Gets the configs settings for each run of the pipeline""" with open("src/developer_config.yaml", "r") as file: self.configdata = yaml.load(file, Loader=yaml.FullLoader) # Convert the YAML data to a Pandas DataFrame dct = {k: [v] for k, v in self.configdata.items()} self.ndct = {} + # Use all the 2nd level yaml keys as headers for i in dct.keys(): nrow = {k: [v] for k, v in dct[i][0].items()} self.ndct.update(nrow) self.configdf = pd.DataFrame(self.ndct) + # Add run_id and user to configs self.configdf.insert(0, "run_id", self.runids["run_id"]) self.configdf.insert(1, "user", self.users["user"]) @@ -126,7 +134,7 @@ def _create_runlog_dfs(self): [self.runlog_main_dict], columns=["run_id", "user", "timestamp", "version", "time_taken"], ) - + # These dfs were created earlier. Renaming for continuity self.runlog_configs_df = self.configdf self.runlog_logs_df = self.saved_logs diff --git a/src/utils/wrappers.py b/src/utils/wrappers.py index ee8616983..63f789a61 100644 --- a/src/utils/wrappers.py +++ b/src/utils/wrappers.py @@ -118,6 +118,7 @@ def exception_wrap(func): def wrapper(*args, **kwargs): """Define the decorator itself.""" try: + # run the function as is. result = func(*args, **kwargs) return result From bf225274520f4920dfd1982893da66ab6a63bc6e Mon Sep 17 00:00:00 2001 From: allmag Date: Thu, 11 May 2023 18:41:39 +0100 Subject: [PATCH 156/411] Added a function into loading.py and added new script to reformat spp_snapshot dataframe --- src/data_processing/spp_snapshot_processing | 77 +++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing new file mode 100644 index 000000000..6479d71f8 --- /dev/null +++ b/src/data_processing/spp_snapshot_processing @@ -0,0 +1,77 @@ +import pandas as pd + +from src.utils.helpers import Config_settings +from src.utils.hdfs_mods import hdfs_load_json + +conf_obj = Config_settings() +config = conf_obj.config_dict +snapshot_path = config["snapshot_path"] # Taken from config file + +from src.data_ingest.loading import load_snapshot_data + + +def full_responses(contributors, responses): + + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through + having each questioncode as its own column. 
+ + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + full_responses -- DataFrame containing both response and contributor data + """ + + drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] + + unique_id_cols = ["reference", "period", "survey"] + + contributors_dropped = contributors.drop(drop_cols, axis=1) + responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) + + merged_df = contributors_dropped.merge(responses_dropped, + on = unique_id_cols) + + contextual_df = merged_df.drop(["questioncode", "response"], + axis=1).drop_duplicates() + + response_df = merged_df.pivot_table(index = unique_id_cols, + columns='questioncode', + values='response', + aggfunc=','.join).reset_index() + + full_responses = response_df.merge(contextual_df, on = unique_id_cols) + + return full_responses + + +def response_rate(contributors, responses): + + """Generates a response rate based on the contributor and response data + from the SPP Snapshot file. + + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + response_rate -- Float representing proportion of contributors who responded + """ + + no_responses = len(responses["reference"].unique()) + no_contributors = len(contributors["reference"].unique()) + + response_rate = no_responses / no_contributors + + return response_rate + +contributors = load_snapshot_data(snapshot_path, data_type = "contributors") +responses = load_snapshot_data(snapshot_path, data_type = "responses") + +full_responses = full_responses(contributors, responses) + +print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) From 290ad901f588d06a0ca76798fe83c0e69bb4de8d Mon Sep 17 00:00:00 2001 From: allmag Date: Tue, 23 May 2023 12:05:23 +0100 Subject: [PATCH 157/411] Added to main.py and fixed test# --- src/data_processing/spp_snapshot_processing | 77 ------------------- .../spp_snapshot_processing.py | 24 +++--- src/main.py | 2 +- 3 files changed, 14 insertions(+), 89 deletions(-) delete mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing deleted file mode 100644 index 6479d71f8..000000000 --- a/src/data_processing/spp_snapshot_processing +++ /dev/null @@ -1,77 +0,0 @@ -import pandas as pd - -from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json - -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["snapshot_path"] # Taken from config file - -from src.data_ingest.loading import load_snapshot_data - - -def full_responses(contributors, responses): - - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through - having each questioncode as its own column. 
- - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - full_responses -- DataFrame containing both response and contributor data - """ - - drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] - - unique_id_cols = ["reference", "period", "survey"] - - contributors_dropped = contributors.drop(drop_cols, axis=1) - responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - - merged_df = contributors_dropped.merge(responses_dropped, - on = unique_id_cols) - - contextual_df = merged_df.drop(["questioncode", "response"], - axis=1).drop_duplicates() - - response_df = merged_df.pivot_table(index = unique_id_cols, - columns='questioncode', - values='response', - aggfunc=','.join).reset_index() - - full_responses = response_df.merge(contextual_df, on = unique_id_cols) - - return full_responses - - -def response_rate(contributors, responses): - - """Generates a response rate based on the contributor and response data - from the SPP Snapshot file. - - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - response_rate -- Float representing proportion of contributors who responded - """ - - no_responses = len(responses["reference"].unique()) - no_contributors = len(contributors["reference"].unique()) - - response_rate = no_responses / no_contributors - - return response_rate - -contributors = load_snapshot_data(snapshot_path, data_type = "contributors") -responses = load_snapshot_data(snapshot_path, data_type = "responses") - -full_responses = full_responses(contributors, responses) - -print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index 1e6deda7c..7fa9a9894 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -5,8 +5,8 @@ def full_responses(contributors, responses): - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through having each questioncode as its own column. 
Arguments: @@ -25,24 +25,25 @@ def full_responses(contributors, responses): contributors_dropped = contributors.drop(drop_cols, axis=1) responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - merged_df = contributors_dropped.merge(responses_dropped, on=unique_id_cols) + merged_df = contributors_dropped.merge(responses_dropped, + on = unique_id_cols) - contextual_df = merged_df.drop( - ["questioncode", "response"], axis=1 - ).drop_duplicates() + contextual_df = merged_df.drop(["questioncode", "response"], + axis=1).drop_duplicates() - response_df = merged_df.pivot_table( - index=unique_id_cols, columns="questioncode", values="response", aggfunc="first" - ).reset_index() + response_df = merged_df.pivot_table(index = unique_id_cols, + columns='questioncode', + values='response', + aggfunc='first').reset_index() - full_responses = response_df.merge(contextual_df, on=unique_id_cols) + full_responses = response_df.merge(contextual_df, on = unique_id_cols) return full_responses def response_rate(contributors, responses): - """Generates a response rate based on the contributor and response data + """Generates a response rate based on the contributor and response data from the SPP Snapshot file. Arguments: @@ -62,3 +63,4 @@ def response_rate(contributors, responses): spp_processing_logger.info(f"The SPP response rate is {round(response_rate,2)}%") return response_rate + diff --git a/src/main.py b/src/main.py index c824701b2..9933e48ae 100644 --- a/src/main.py +++ b/src/main.py @@ -87,4 +87,4 @@ def run_pipeline(start): runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() + runlog_obj._write_runlog() \ No newline at end of file From f50835c327ab3b1bfe85dd3d2139b3a21347c803 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 13:33:00 +0100 Subject: [PATCH 158/411] Had to comment out several lines in main as functions missing, including Config_settings. Branch cloned from develop. 
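The `full_responses` helper worked on in the patches above reshapes the long contributor/response snapshot into one wide row per reference, with each `questioncode` becoming its own column. A minimal, self-contained sketch of that merge-and-pivot step, assuming toy data — the column names mirror the diff, but the values and the two question codes are invented purely for illustration:

```python
import pandas as pd

# Toy stand-ins for the SPP snapshot frames; the real column sets are wider.
contributors = pd.DataFrame(
    {
        "reference": [101, 102],
        "period": [202012, 202012],
        "survey": [1, 1],
        "createdby": ["a", "b"],
        "createddate": [2020, 2020],
        "lastupdatedby": ["c", "d"],
        "lastupdateddate": [2020, 2020],
    }
)
responses = pd.DataFrame(
    {
        "reference": [101, 101, 102, 102],
        "period": [202012] * 4,
        "survey": [1] * 4,
        "questioncode": [200, 201, 200, 201],
        "response": ["10", "20", "30", "40"],
        "createdby": ["a"] * 4,
        "createddate": [2020] * 4,
        "lastupdatedby": ["c"] * 4,
        "lastupdateddate": [2020] * 4,
        "adjustedresponse": [""] * 4,
    }
)

drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"]
unique_id_cols = ["reference", "period", "survey"]

# Drop audit columns and join responses onto contributors.
merged = contributors.drop(drop_cols, axis=1).merge(
    responses.drop(drop_cols + ["adjustedresponse"], axis=1), on=unique_id_cols
)

# Long-to-wide: one row per (reference, period, survey),
# with question codes 200 and 201 appearing as their own columns.
wide = merged.pivot_table(
    index=unique_id_cols, columns="questioncode", values="response", aggfunc="first"
).reset_index()

print(wide)
```

As in the later patch, `aggfunc="first"` is used rather than joining duplicate responses into a single string, so repeated answers to the same question are collapsed to the first one seen.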
--- src/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main.py b/src/main.py index 9933e48ae..ffb2fa834 100644 --- a/src/main.py +++ b/src/main.py @@ -2,6 +2,7 @@ from src.utils import runlog from src._version import __version__ as version + from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data @@ -32,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - logger = logger_creator(global_config) + # logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - logger.info("Collecting logging parameters ..........") + # logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -78,13 +79,13 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - runlog_obj.retrieve_pipeline_logs() + # runlog_obj.retrieve_pipeline_logs() - run_time = round(time.time() - start, 5) - runlog_obj._record_time_taken(run_time) + # run_time = round(time.time() - start, 5) + # runlog_obj._record_time_taken(run_time) runlog_obj.retrieve_configs() runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() \ No newline at end of file + runlog_obj._write_runlog() From dd47d10d55ef027c7fc3dd9a645a74561b6e11b0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:27:54 +0100 Subject: [PATCH 159/411] Reverted src/main.py to have no commented lines. --- src/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.py b/src/main.py index ffb2fa834..2b7e8e38c 100644 --- a/src/main.py +++ b/src/main.py @@ -33,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - # logger = logger_creator(global_config) + logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - # logger.info("Collecting logging parameters ..........") + logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -79,10 +79,10 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - # runlog_obj.retrieve_pipeline_logs() + runlog_obj.retrieve_pipeline_logs() - # run_time = round(time.time() - start, 5) - # runlog_obj._record_time_taken(run_time) + run_time = round(time.time() - start, 5) + runlog_obj._record_time_taken(run_time) runlog_obj.retrieve_configs() runlog_obj._create_runlog_dicts() From 7256ac4a84ca235eace15cede6b8a4488f1a4f8a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:33:10 +0100 Subject: [PATCH 160/411] Reverted src/main.py to how to was before (removed newlines in imports). 
--- src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.py b/src/main.py index 2b7e8e38c..c824701b2 100644 --- a/src/main.py +++ b/src/main.py @@ -2,7 +2,6 @@ from src.utils import runlog from src._version import __version__ as version - from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data From ecf70bfba63fcd2e9bb8d4bc6668dbd529fe6584 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 14:55:40 +0100 Subject: [PATCH 161/411] Created datadict for more responses --- config/Data_Schema.toml | 350 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index fb92d1fe0..d20254120 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -467,3 +467,353 @@ Length = "nan" Min_values = "nan" Max_values = "nan" Possible_categorical_Values = ["nan"] + +[cell_id] +Description = Cell ID +Deduced_Data_Type = Categorical +Nullable = nan +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[civ_or_def] +Description = Business type: Civil or Defence +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[current_sic] +Description = Sic - Standard Industry Classification +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[data_source] +Description = Constructed +Deduced_Data_Type = Categorical +Nullable = nan +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[emp_other] +Description = emp_other (Full Time Equivalent) +Deduced_Data_Type = Numeric float (or decimal) +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[emp_researcher] +Description = emp_researcher (Full Time Equivalent) +Deduced_Data_Type = Numeric float (or decimal) +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[emp_technician] +Description = emp_technician (Full Time Equivalent) +Deduced_Data_Type = Numeric float (or decimal) +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[emp_total] +Description = emp_total (Full Time Equivalent) +Deduced_Data_Type = Numeric float (or decimal) +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[employee_count] +Description = Employee Count (IDBR) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[foreign_owner] +Description = Foreign Owner +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = 
+ def = + + +[form_status] +Description = Status +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[form_type] +Description = Form Type +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[freeze_id] +Description = Freeze ID - bespoke to openroad +Deduced_Data_Type = Categorical +Nullable = nan +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_oth_f] +Description = Other Female (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_oth_m] +Description = Other Male (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_res_f] +Description = Researchers Females (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_res_m] +Description = Researchers Male (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_tec_f] +Description = Technicians Female (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_tec_m] +Description = Technicians Male (Headcount) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[headcount_total] +Description = Total Headcount +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[period] +Description = Openroad Specific +Deduced_Data_Type = Categorical +Nullable = nan +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[period_contributor_id] +Description = Openroad Specific +Deduced_Data_Type = Categorical +Nullable = nan +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[period_year] +Description = Period +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[product_group] +Description = Published Product Group +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + 
+[ru_ref] +Description = Reference +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[sizeband] +Description = SizeBand +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = + + +[wowentref] +Description = Wowentref +Deduced_Data_Type = Categorical +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] + [q_code] + civ = + def = From f40d3fafe6c74c8519b5abc0132f0612bb55ede6 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 15:01:09 +0100 Subject: [PATCH 162/411] Corrected q codes --- config/Data_Schema.toml | 134 ++++++++-------------------------------- 1 file changed, 27 insertions(+), 107 deletions(-) diff --git a/config/Data_Schema.toml b/config/Data_Schema.toml index d20254120..eab7f35d5 100644 --- a/config/Data_Schema.toml +++ b/config/Data_Schema.toml @@ -476,10 +476,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [civ_or_def] Description = Business type: Civil or Defence @@ -489,10 +486,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [current_sic] Description = Sic - Standard Industry Classification @@ -502,10 +496,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [data_source] Description = Constructed @@ -515,10 +506,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [emp_other] Description = emp_other (Full Time Equivalent) @@ -528,10 +516,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [emp_researcher] Description = emp_researcher (Full Time Equivalent) @@ -541,10 +526,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [emp_technician] Description = emp_technician (Full Time Equivalent) @@ -554,10 +536,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [emp_total] Description = emp_total (Full Time Equivalent) @@ -567,10 +546,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [employee_count] Description = Employee Count (IDBR) @@ -580,10 +556,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [foreign_owner] Description = Foreign Owner @@ -593,10 +566,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [form_status] Description = Status @@ -606,10 +576,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values 
= [] - [q_code] - civ = - def = - +q_code = [form_type] Description = Form Type @@ -619,10 +586,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [freeze_id] Description = Freeze ID - bespoke to openroad @@ -632,10 +596,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_oth_f] Description = Other Female (Headcount) @@ -645,10 +606,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_oth_m] Description = Other Male (Headcount) @@ -658,10 +616,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_res_f] Description = Researchers Females (Headcount) @@ -671,10 +626,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_res_m] Description = Researchers Male (Headcount) @@ -684,10 +636,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_tec_f] Description = Technicians Female (Headcount) @@ -697,10 +646,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_tec_m] Description = Technicians Male (Headcount) @@ -710,10 +656,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [headcount_total] Description = Total Headcount @@ -723,10 +666,7 @@ Length = # an integer of the length, or N/A Min_values = 0 Max_values = 1000000 Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [period] Description = Openroad Specific @@ -736,10 +676,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [period_contributor_id] Description = Openroad Specific @@ -749,10 +686,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [period_year] Description = Period @@ -762,10 +696,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [product_group] Description = Published Product Group @@ -775,10 +706,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [ru_ref] Description = Reference @@ -788,10 +716,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [sizeband] Description = SizeBand @@ -801,10 +726,7 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None Possible_categorical_Values = [] - [q_code] - civ = - def = - +q_code = [wowentref] Description = Wowentref @@ -814,6 +736,4 @@ Length = # an integer of the length, or N/A Min_values = None Max_values = None 
Possible_categorical_Values = [] - [q_code] - civ = - def = +q_code = From 476f02e29b532ebf70422708b75cf1b8f9a4256a Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:18:14 +0100 Subject: [PATCH 163/411] Col titles for reference --- config/descriptions.txt | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 config/descriptions.txt diff --git a/config/descriptions.txt b/config/descriptions.txt new file mode 100644 index 000000000..89cf50f11 --- /dev/null +++ b/config/descriptions.txt @@ -0,0 +1,57 @@ +Salaries & Wages +Other current expenditure +Total Current Expenditure +Basic Research +Applied Research +Experimental Development +Land & Build CapEx +Equipment & Machinery CapEx +Total Capex. +Total Inhouse Expenditure +Own Funds +Funding - Commission of the EU +Funding - UK government +Funding - Organisations outside the Uk +Funding - Other UK Private Bus/Public Orgs +Funding - Any Other UK +Total Funding +Land Acquired for R&D (Split of Land & Build CapEx) +Buildings acquired/constructed for R&D (Split of Land & Build CapEx) +Expenditure on computer software only (of which from Equipment & Machinery CapEx) +Purchase of Materials (Split of Other current) +Purchase of Services (Split of Other current) +Ownership - Own Business +Ownership - UK Government +Ownership - Other UK Priv Bus +Ownership - Other UK Orgs +Ownership - Bus Enterprises in Group Outside UK +Ownership - Other Bus Enterprises outside UK +Ownership - Other Governments outside UK +Ownership - Higher Education Establishments outside UK +Ownership - Non-profit Orgs outside the UK +Ownership - Commission of EU +Ownership - International Orgs +Ownership - Any other Orgs outside UK +Ownership - not owned freely available +Life Length - Basic Research +Life Length - Applied Research +Life Length - Experimental Res +Funding - Any other UK organisations +Funding - Business Enterprises in group outside UK +Funding - Other Business Enterprises outside UK +Funding - Other Governments outside UK +Funding - Higher Education Est Outside UK +Funding - Non-profit Orgs outside UK +Funding - International Orgs +Funding - Any other orgs outside UK +Funding - UK Higher Education Establishments +Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM +Purchased/funded R&D in the UK (Yes or No) +Purchased Outside UK (Govt Funded) +Purchased Outside UK (Other) +Total Purchased +Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM +Tax credit claim submitted/intended for purchased work outside Ukorig. 
Funded by UK gov (Y/N) LONG FORM +Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM +Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM +Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM From cf8b14121b3eea0daafee4b7fdffc5a98d120176 Mon Sep 17 00:00:00 2001 From: jwestw Date: Mon, 5 Jun 2023 16:31:24 +0100 Subject: [PATCH 164/411] Adding db friendly titles --- config/descriptions.txt | 114 ++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/config/descriptions.txt b/config/descriptions.txt index 89cf50f11..bab1c48ec 100644 --- a/config/descriptions.txt +++ b/config/descriptions.txt @@ -1,57 +1,57 @@ -Salaries & Wages -Other current expenditure -Total Current Expenditure -Basic Research -Applied Research -Experimental Development -Land & Build CapEx -Equipment & Machinery CapEx -Total Capex. -Total Inhouse Expenditure -Own Funds -Funding - Commission of the EU -Funding - UK government -Funding - Organisations outside the Uk -Funding - Other UK Private Bus/Public Orgs -Funding - Any Other UK -Total Funding -Land Acquired for R&D (Split of Land & Build CapEx) -Buildings acquired/constructed for R&D (Split of Land & Build CapEx) -Expenditure on computer software only (of which from Equipment & Machinery CapEx) -Purchase of Materials (Split of Other current) -Purchase of Services (Split of Other current) -Ownership - Own Business -Ownership - UK Government -Ownership - Other UK Priv Bus -Ownership - Other UK Orgs -Ownership - Bus Enterprises in Group Outside UK -Ownership - Other Bus Enterprises outside UK -Ownership - Other Governments outside UK -Ownership - Higher Education Establishments outside UK -Ownership - Non-profit Orgs outside the UK -Ownership - Commission of EU -Ownership - International Orgs -Ownership - Any other Orgs outside UK -Ownership - not owned freely available -Life Length - Basic Research -Life Length - Applied Research -Life Length - Experimental Res -Funding - Any other UK organisations -Funding - Business Enterprises in group outside UK -Funding - Other Business Enterprises outside UK -Funding - Other Governments outside UK -Funding - Higher Education Est Outside UK -Funding - Non-profit Orgs outside UK -Funding - International Orgs -Funding - Any other orgs outside UK -Funding - UK Higher Education Establishments -Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM -Purchased/funded R&D in the UK (Yes or No) -Purchased Outside UK (Govt Funded) -Purchased Outside UK (Other) -Total Purchased -Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM -Tax credit claim submitted/intended for purchased work outside Ukorig. 
Funded by UK gov (Y/N) LONG FORM -Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM -Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM -Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM +Salaries & Wages: sal_wages +Other current expenditure: othr_expend +Total Current Expenditure: total_expend +Basic Research: basic_res +Applied Research: applied_res +Experimental Development: exp_dev +Land & Build CapEx: land_build_capex +Equipment & Machinery CapEx: equip_mach_capex +Total Capex: total_capex +Total Inhouse Expenditure: total_inhouse_expend +Own Funds: own_funds +Funding - Commission of the EU: fund_eu_commission +Funding - UK government: fund_uk_govt +Funding - Organisations outside the UK: fund_orgs_outside_uk +Funding - Other UK Private Bus/Public Orgs: fund_oth_uk_private_bus_orgs +Funding - Any Other UK: fund_oth_uk +Total Funding: total_funding +Land Acquired for R&D (Split of Land & Build CapEx): land_acq_rnd +Buildings acquired/constructed for R&D (Split of Land & Build CapEx): bldgs_acq_rnd +Expenditure on computer software only (of which from Equipment & Machinery CapEx): exp_computer_software +Purchase of Materials (Split of Other current): purchase_materials +Purchase of Services (Split of Other current): purchase_services +Ownership - Own Business: own_business +Ownership - UK Government: own_uk_govt +Ownership - Other UK Priv Bus: own_oth_uk_priv_bus +Ownership - Other UK Orgs: own_oth_uk_orgs +Ownership - Bus Enterprises in Group Outside UK: own_bus_enterp_outside_uk +Ownership - Other Bus Enterprises outside UK: own_oth_bus_enterp_outside_uk +Ownership - Other Governments outside UK: own_oth_govts_outside_uk +Ownership - Higher Education Establishments outside UK: own_high_edu_est_outside_uk +Ownership - Non-profit Orgs outside the UK: own_non_profit_orgs_outside_uk +Ownership - Commission of EU: own_eu_commission +Ownership - International Orgs: own_intl_orgs +Ownership - Any other Orgs outside UK: own_oth_orgs_outside_uk +Ownership - not owned freely available: own_not_owned_free_avail +Life Length - Basic Research: life_len_basic_res +Life Length - Applied Research: life_len_applied_res +Life Length - Experimental Res: life_len_exp_res +Funding - Any other UK organisations: fund_oth_uk_orgs +Funding - Business Enterprises in group outside UK: fund_bus_enterp_group_outside_uk +Funding - Other Business Enterprises outside UK: fund_oth_bus_enterp_outside_uk +Funding - Other Governments outside UK: fund_oth_govts_outside_uk +Funding - Higher Education Est Outside UK: fund_high_edu_est_outside_uk +Funding - Non-profit Orgs outside UK: fund_non_profit_orgs_outside_uk +Funding - International Orgs: fund_intl_orgs +Funding - Any other orgs outside UK: fund_oth_orgs_outside_uk +Funding - UK Higher Education Establishments: fund_uk_high_edu_est +Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM: tax_cred_inhouse_expend_long +Purchased/funded R&D in the UK (Yes or No): purchd_rnd_uk_yesno +Purchased Outside UK (Govt Funded): purchd_outside_uk_govt_fund +Purchased Outside UK (Other): purchd_outside_uk_oth +Total Purchased: total_purchased +Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM: tax_cred_purchd_work_uk_long +Tax credit claim submitted/intended for purchased work outside Ukorig. 
Funded by UK gov (Y/N) LONG FORM: tax_cred_purchd_work_outside_uk_gov_fund_long +Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM: tax_cred_purchd_work_outside_uk_oth_long +Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM: tax_cred_inhouse_expend_short +Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM: tax_cred_purchd_rnd_short \ No newline at end of file From 97c54fd70236e4a2468a7e76aba28933eebf6d73 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 7 Jun 2023 10:27:29 +0100 Subject: [PATCH 165/411] Splitting out functions --- .../spp_snapshot_processing.py | 59 ++++++++++++++----- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index 7fa9a9894..c1887bd1e 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -1,8 +1,42 @@ +from src.utils.wrappers import validate_dataframe_not_empty + import logging spp_processing_logger = logging.getLogger(__name__) +def create_response_dataframe(df, unique_id_cols): + """Create a response dataframe using pivot_table to reshape the data. + + Arguments: + df -- DataFrame to create the response dataframe from + unique_id_cols -- List of column names that uniquely identify the data + Returns: + response_df -- Response DataFrame + """ + response_df = df.pivot_table( + index=unique_id_cols, columns="questioncode", values="response", aggfunc="first" + ).reset_index() + return response_df + + +def create_contextual_dataframe(df, unique_id_cols): + """Create a contextual dataframe by dropping 'questioncode' and 'response' columns + and removing duplicates. + + Arguments: + df -- DataFrame to create the contextual dataframe from + unique_id_cols -- List of column names that uniquely identify the data + + Returns: + contextual_df -- Contextual DataFrame + """ + cols_to_drop = ["questioncode", "response"] + contextual_df = df.drop(cols_to_drop, axis=1).drop_duplicates() + return contextual_df + + +@validate_dataframe_not_empty def full_responses(contributors, responses): """Merges contributor and response data together into a dataframe that is in a @@ -25,22 +59,18 @@ def full_responses(contributors, responses): contributors_dropped = contributors.drop(drop_cols, axis=1) responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - merged_df = contributors_dropped.merge(responses_dropped, - on = unique_id_cols) - - contextual_df = merged_df.drop(["questioncode", "response"], - axis=1).drop_duplicates() + merged_df = contributors_dropped.merge(responses_dropped, on=unique_id_cols) + # Create a contextual df by dropping "questioncode" and "response" cols. 
Remove dupes + contextual_df = create_contextual_dataframe(merged_df, unique_id_cols) - response_df = merged_df.pivot_table(index = unique_id_cols, - columns='questioncode', - values='response', - aggfunc='first').reset_index() + # Create a response dataframe using pivot_table to reshape the data + response_df = create_response_dataframe(merged_df, unique_id_cols) full_responses = response_df.merge(contextual_df, on = unique_id_cols) return full_responses - +@validate_dataframe_not_empty def response_rate(contributors, responses): """Generates a response rate based on the contributor and response data @@ -54,11 +84,12 @@ def response_rate(contributors, responses): Returns: response_rate -- Float representing proportion of contributors who responded """ + # Determine num of responses + response_count = len(responses["reference"].unique()) + # Determine the number of contributors + contributor_count = len(contributors["reference"].unique()) - no_responses = len(responses["reference"].unique()) - no_contributors = len(contributors["reference"].unique()) - - response_rate = no_responses / no_contributors + response_rate = response_count / contributor_count spp_processing_logger.info(f"The SPP response rate is {round(response_rate,2)}%") From 26256ce12dfdfa832c704a5eb6a4ebd5d20f5dcb Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 7 Jun 2023 10:27:47 +0100 Subject: [PATCH 166/411] Adding tests for new functions --- .../test_spp_snapshot_processing.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index da01d4fe3..687de5e67 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -76,3 +76,50 @@ def test_response_rate(dummy_data): expected_response_rate = 2 / 3 # 2 respondents out of 3 contributors assert expected_response_rate == response_rate_value + + +def test_create_response_dataframe(dummy_data): + + from src.data_processing.spp_snapshot_processing import create_response_dataframe + + contributor_data, responses_data = dummy_data + unique_id_cols = ["reference", "period", "survey"] + expected_columns = ["reference", "period", "survey", 200, 201, 202] + expected_data = [ + [101, 202012, 1, 0, 50, 100], + [102, 202012, 1, 75, 25, 65], + ] + + response_df = create_response_dataframe(responses_data, unique_id_cols) + + # Assert the columns + assert response_df.columns.tolist() == expected_columns + + # Assert the data + assert response_df.values.tolist() == expected_data + +def test_create_contextual_dataframe(dummy_data): + contributor_data, responses_data = dummy_data + unique_id_cols = ["reference", "period", "survey"] + expected_columns = [ + "reference", + "period", + "survey", + "createdby", + "createddate", + "lastupdatedby", + "lastupdateddate", + ] + expected_data = [ + [101, 202012, 1, "James", 2020, "Vondy", 2020], + [102, 202012, 1, "Ilyas", 2020, "Charl", 2020], + [103, 202012, 1, "Roddy", 2020, "Gareth", 2020], + ] + + contextual_df = create_contextual_dataframe(contributor_data, unique_id_cols) + + # Assert the columns + assert contextual_df.columns.tolist() == expected_columns + + # Assert the data + assert contextual_df.values.tolist() == expected_data \ No newline at end of file From d0807d88f99178c8940a6caf6967913d02e78349 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 7 Jun 2023 10:28:06 +0100 Subject: [PATCH 167/411] new wrapper to check df not 
empty --- src/utils/wrappers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/utils/wrappers.py b/src/utils/wrappers.py index 63f789a61..37999ac89 100644 --- a/src/utils/wrappers.py +++ b/src/utils/wrappers.py @@ -196,3 +196,10 @@ def _change_direction(before, after): """Trouble at mill!!! Mistake in config. Either 'Table' or 'SingleLine' must be specified.""" ) + +def validate_dataframe_not_empty(func): + def wrapper(df, *args, **kwargs): + if df.empty: + raise ValueError("Input dataframe is empty.") + return func(df, *args, **kwargs) + return wrapper \ No newline at end of file From 994c73c06cd1c2ae85dbd50e88ac38adfa1ee3b7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 13:33:00 +0100 Subject: [PATCH 168/411] Had to comment out several lines in main as functions missing, including Config_settings. Branch cloned from develop. --- src/main.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/main.py b/src/main.py index c824701b2..a44281aee 100644 --- a/src/main.py +++ b/src/main.py @@ -1,9 +1,10 @@ """The main pipeline""" -from src.utils import runlog -from src._version import __version__ as version -from src.utils.helpers import Config_settings -from src.utils.wrappers import logger_creator +# from src.utils import runlog +# from src._version import __version__ as version + +# from src.utils.helpers import Config_settings +# from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing @@ -32,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - logger = logger_creator(global_config) + # logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - logger.info("Collecting logging parameters ..........") + # logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -78,13 +79,12 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - runlog_obj.retrieve_pipeline_logs() + # runlog_obj.retrieve_pipeline_logs() - run_time = round(time.time() - start, 5) - runlog_obj._record_time_taken(run_time) + # run_time = round(time.time() - start, 5) + # runlog_obj._record_time_taken(run_time) - runlog_obj.retrieve_configs() - runlog_obj._create_runlog_dicts() - runlog_obj._create_runlog_dfs() - runlog_obj.create_runlog_files() - runlog_obj._write_runlog() + # runlog_obj._create_runlog_dicts() + # runlog_obj._create_runlog_dfs() + # runlog_obj.create_runlog_files() + # runlog_obj._write_runlog() From 8ec73844f042490f976f023cb1ee4a8b794bd4ec Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:27:54 +0100 Subject: [PATCH 169/411] Reverted src/main.py to have no commented lines. 
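Note: the reshaping in create_response_dataframe (patch 165 above) hinges on pandas' pivot_table — each questioncode value becomes its own column, and aggfunc="first" keeps the first response per (reference, period, survey) key. A minimal sketch on made-up data, not real survey responses:

import pandas as pd

# Toy long-format responses: one row per (reference, questioncode) pair.
long_df = pd.DataFrame(
    {
        "reference": [101, 101, 102, 102],
        "period": [202012] * 4,
        "survey": [1] * 4,
        "questioncode": [200, 201, 200, 201],
        "response": [0, 50, 75, 25],
    }
)

unique_id_cols = ["reference", "period", "survey"]

# Wide format: one row per unique id, one column per question code.
wide_df = long_df.pivot_table(
    index=unique_id_cols, columns="questioncode", values="response", aggfunc="first"
).reset_index()

# wide_df now has one row per (reference, period, survey),
# with question codes 200 and 201 as columns holding the responses.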
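Note: the validate_dataframe_not_empty wrapper added to src/utils/wrappers.py in patch 167 makes a stage fail fast on an empty frame. A standalone copy of the decorator, with a hypothetical count_rows function to show the behaviour:

import pandas as pd


def validate_dataframe_not_empty(func):
    # Raise before calling the wrapped function if the first argument
    # (assumed to be a DataFrame) has no rows.
    def wrapper(df, *args, **kwargs):
        if df.empty:
            raise ValueError("Input dataframe is empty.")
        return func(df, *args, **kwargs)

    return wrapper


@validate_dataframe_not_empty
def count_rows(df):
    return len(df)


print(count_rows(pd.DataFrame({"reference": [101, 102]})))  # 2

try:
    count_rows(pd.DataFrame())
except ValueError as err:
    print(err)  # Input dataframe is empty.

Only the first positional argument is inspected, so on full_responses(contributors, responses) the decorator guards the contributors frame but not the responses frame.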
--- src/main.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main.py b/src/main.py index a44281aee..252c954d3 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,10 @@ """The main pipeline""" -# from src.utils import runlog -# from src._version import __version__ as version +from src.utils import runlog +from src._version import __version__ as version -# from src.utils.helpers import Config_settings -# from src.utils.wrappers import logger_creator +from src.utils.helpers import Config_settings +from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing @@ -33,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - # logger = logger_creator(global_config) + logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - # logger.info("Collecting logging parameters ..........") + logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -79,12 +79,12 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - # runlog_obj.retrieve_pipeline_logs() + runlog_obj.retrieve_pipeline_logs() - # run_time = round(time.time() - start, 5) - # runlog_obj._record_time_taken(run_time) + run_time = round(time.time() - start, 5) + runlog_obj._record_time_taken(run_time) - # runlog_obj._create_runlog_dicts() - # runlog_obj._create_runlog_dfs() - # runlog_obj.create_runlog_files() - # runlog_obj._write_runlog() + runlog_obj._create_runlog_dicts() + runlog_obj._create_runlog_dfs() + runlog_obj.create_runlog_files() + runlog_obj._write_runlog() From 9439a11eb9be4986d8fcace0772f5fed13aede15 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:33:10 +0100 Subject: [PATCH 170/411] Reverted src/main.py to how to was before (removed newlines in imports). --- src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.py b/src/main.py index 252c954d3..91501cf3d 100644 --- a/src/main.py +++ b/src/main.py @@ -2,7 +2,6 @@ from src.utils import runlog from src._version import __version__ as version - from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data From 9da0558303f0a5a2f2f0af25a498230dcfa91e9e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 30 Mar 2023 17:34:54 +0100 Subject: [PATCH 171/411] Adding pre-commit hook --- .pre-commit-config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e86459372..39690211d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,3 +67,9 @@ repos: entry: These file extensions are restricted. 
Data should be removed from the commit language: fail files: .*\.(csv|feather|xlsx|zip|hdf5|h5|json|xml|hd|parquet) + - repo: local + hooks: + - id: coverage-badge + name: Update the coverage badge in the readme + entry: bash -c 'lines=$(readme-cov)' + language: system From 7ba89118fb66c2a99e4ffb941c22fe8ef4731765 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Mon, 5 Jun 2023 18:13:38 +0100 Subject: [PATCH 172/411] testing commits in 116 --- add_ssh_key.bat | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 add_ssh_key.bat diff --git a/add_ssh_key.bat b/add_ssh_key.bat new file mode 100755 index 000000000..745561adc --- /dev/null +++ b/add_ssh_key.bat @@ -0,0 +1,7 @@ +#!/bin/bash + +# Start the SSH agent and evaluate its output +eval "$(ssh-agent -s)" + +# Add the SSH key to the agent +ssh-add ~/.ssh/githubwork/work_key \ No newline at end of file From 336632d74ae8e8a53d30d8cec677611fd1dd2e4f Mon Sep 17 00:00:00 2001 From: allmag Date: Thu, 11 May 2023 18:41:39 +0100 Subject: [PATCH 173/411] Added a function into loading.py and added new script to reformat spp_snapshot dataframe --- src/data_processing/spp_snapshot_processing | 77 +++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing new file mode 100644 index 000000000..6479d71f8 --- /dev/null +++ b/src/data_processing/spp_snapshot_processing @@ -0,0 +1,77 @@ +import pandas as pd + +from src.utils.helpers import Config_settings +from src.utils.hdfs_mods import hdfs_load_json + +conf_obj = Config_settings() +config = conf_obj.config_dict +snapshot_path = config["snapshot_path"] # Taken from config file + +from src.data_ingest.loading import load_snapshot_data + + +def full_responses(contributors, responses): + + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through + having each questioncode as its own column. + + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + full_responses -- DataFrame containing both response and contributor data + """ + + drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] + + unique_id_cols = ["reference", "period", "survey"] + + contributors_dropped = contributors.drop(drop_cols, axis=1) + responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) + + merged_df = contributors_dropped.merge(responses_dropped, + on = unique_id_cols) + + contextual_df = merged_df.drop(["questioncode", "response"], + axis=1).drop_duplicates() + + response_df = merged_df.pivot_table(index = unique_id_cols, + columns='questioncode', + values='response', + aggfunc=','.join).reset_index() + + full_responses = response_df.merge(contextual_df, on = unique_id_cols) + + return full_responses + + +def response_rate(contributors, responses): + + """Generates a response rate based on the contributor and response data + from the SPP Snapshot file. 
+ + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + response_rate -- Float representing proportion of contributors who responded + """ + + no_responses = len(responses["reference"].unique()) + no_contributors = len(contributors["reference"].unique()) + + response_rate = no_responses / no_contributors + + return response_rate + +contributors = load_snapshot_data(snapshot_path, data_type = "contributors") +responses = load_snapshot_data(snapshot_path, data_type = "responses") + +full_responses = full_responses(contributors, responses) + +print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) From 8756167078af3b03261f80dbe2221be1ae6a8837 Mon Sep 17 00:00:00 2001 From: allmag Date: Tue, 23 May 2023 12:05:23 +0100 Subject: [PATCH 174/411] Added to main.py and fixed test# --- src/data_processing/spp_snapshot_processing | 77 ------------------- src/main.py | 2 +- .../test_spp_snapshot_processing.py | 2 +- 3 files changed, 2 insertions(+), 79 deletions(-) delete mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing deleted file mode 100644 index 6479d71f8..000000000 --- a/src/data_processing/spp_snapshot_processing +++ /dev/null @@ -1,77 +0,0 @@ -import pandas as pd - -from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json - -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["snapshot_path"] # Taken from config file - -from src.data_ingest.loading import load_snapshot_data - - -def full_responses(contributors, responses): - - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through - having each questioncode as its own column. - - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - full_responses -- DataFrame containing both response and contributor data - """ - - drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] - - unique_id_cols = ["reference", "period", "survey"] - - contributors_dropped = contributors.drop(drop_cols, axis=1) - responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - - merged_df = contributors_dropped.merge(responses_dropped, - on = unique_id_cols) - - contextual_df = merged_df.drop(["questioncode", "response"], - axis=1).drop_duplicates() - - response_df = merged_df.pivot_table(index = unique_id_cols, - columns='questioncode', - values='response', - aggfunc=','.join).reset_index() - - full_responses = response_df.merge(contextual_df, on = unique_id_cols) - - return full_responses - - -def response_rate(contributors, responses): - - """Generates a response rate based on the contributor and response data - from the SPP Snapshot file. 
- - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - response_rate -- Float representing proportion of contributors who responded - """ - - no_responses = len(responses["reference"].unique()) - no_contributors = len(contributors["reference"].unique()) - - response_rate = no_responses / no_contributors - - return response_rate - -contributors = load_snapshot_data(snapshot_path, data_type = "contributors") -responses = load_snapshot_data(snapshot_path, data_type = "responses") - -full_responses = full_responses(contributors, responses) - -print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) diff --git a/src/main.py b/src/main.py index 91501cf3d..9f69ddd4c 100644 --- a/src/main.py +++ b/src/main.py @@ -86,4 +86,4 @@ def run_pipeline(start): runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() + runlog_obj._write_runlog() \ No newline at end of file diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index 687de5e67..22b1eb0d3 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -122,4 +122,4 @@ def test_create_contextual_dataframe(dummy_data): assert contextual_df.columns.tolist() == expected_columns # Assert the data - assert contextual_df.values.tolist() == expected_data \ No newline at end of file + assert contextual_df.values.tolist() == expected_data From c282b952c3ee2bc7861e2acf3ca96c51d8b4dc3e Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Tue, 6 Jun 2023 18:53:55 +0100 Subject: [PATCH 175/411] Tests for loading.py module --- src/data_ingest/spp_parser.py | 2 +- tests/test_data_ingest/test_loading.py | 43 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tests/test_data_ingest/test_loading.py diff --git a/src/data_ingest/spp_parser.py b/src/data_ingest/spp_parser.py index 6e1340e17..0fd4ce948 100644 --- a/src/data_ingest/spp_parser.py +++ b/src/data_ingest/spp_parser.py @@ -20,7 +20,7 @@ def parse_snap_data(snapdata: dict) -> Tuple[pd.DataFrame, pd.DataFrame]: Returns: Tuple[pd.DataFrame, pd.DataFrame]: The contributers and responders dataframes """ - # Load the dicts! 
+ # Load the dicts contributordict = snapdata["contributors"] responsesdict = snapdata["responses"] diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py new file mode 100644 index 000000000..195727e1f --- /dev/null +++ b/tests/test_data_ingest/test_loading.py @@ -0,0 +1,43 @@ +import pandas as pd +from typing import Tuple + +# Import modules to test +from src.data_ingest.loading import parse_snap_data + + +class TestParseSPP: + """Test for Parse Snap data function""" + + def input_data(self) -> dict: + dummy_snapdata = { + "snapshot_id": "", + "contributors": [ + {"ref": "123", "con": "789"}, + {"ref": "456", "con": "910"}, + ], + "responses": [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}], + } + + return dummy_snapdata + + def exp_output(self) -> Tuple[pd.DataFrame, pd.DataFrame]: + contributor_df = pd.DataFrame( + [{"ref": "123", "con": "789"}, {"ref": "456", "con": "910"}] + ) + + responses_df = pd.DataFrame( + [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}] + ) + + return contributor_df, responses_df + + def test_parse_snap_data(self): + """Tests for full_responses function.""" + + inputdata = self.input_data() + df_result1, df_result2 = parse_snap_data(inputdata) + + expected_output_data1, expected_output_data2 = self.exp_output() + + pd.testing.assert_frame_equal(df_result1, expected_output_data1) + pd.testing.assert_frame_equal(df_result2, expected_output_data2) From 4fc01b28b78c7d712de680e644766672bdbb1c7e Mon Sep 17 00:00:00 2001 From: allmag Date: Thu, 11 May 2023 18:41:39 +0100 Subject: [PATCH 176/411] Added a function into loading.py and added new script to reformat spp_snapshot dataframe --- src/data_processing/spp_snapshot_processing | 77 +++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing new file mode 100644 index 000000000..6479d71f8 --- /dev/null +++ b/src/data_processing/spp_snapshot_processing @@ -0,0 +1,77 @@ +import pandas as pd + +from src.utils.helpers import Config_settings +from src.utils.hdfs_mods import hdfs_load_json + +conf_obj = Config_settings() +config = conf_obj.config_dict +snapshot_path = config["snapshot_path"] # Taken from config file + +from src.data_ingest.loading import load_snapshot_data + + +def full_responses(contributors, responses): + + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through + having each questioncode as its own column. 
+ + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + full_responses -- DataFrame containing both response and contributor data + """ + + drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] + + unique_id_cols = ["reference", "period", "survey"] + + contributors_dropped = contributors.drop(drop_cols, axis=1) + responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) + + merged_df = contributors_dropped.merge(responses_dropped, + on = unique_id_cols) + + contextual_df = merged_df.drop(["questioncode", "response"], + axis=1).drop_duplicates() + + response_df = merged_df.pivot_table(index = unique_id_cols, + columns='questioncode', + values='response', + aggfunc=','.join).reset_index() + + full_responses = response_df.merge(contextual_df, on = unique_id_cols) + + return full_responses + + +def response_rate(contributors, responses): + + """Generates a response rate based on the contributor and response data + from the SPP Snapshot file. + + Arguments: + contributors -- DataFrame containing contributor data for BERD + from SPP Snapshot file + responses -- DataFrame containing response data for BERD from SPP Snapshot file + + Returns: + response_rate -- Float representing proportion of contributors who responded + """ + + no_responses = len(responses["reference"].unique()) + no_contributors = len(contributors["reference"].unique()) + + response_rate = no_responses / no_contributors + + return response_rate + +contributors = load_snapshot_data(snapshot_path, data_type = "contributors") +responses = load_snapshot_data(snapshot_path, data_type = "responses") + +full_responses = full_responses(contributors, responses) + +print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) From ce4d97edcd63f22acaa0357bd8e97a1f42af83da Mon Sep 17 00:00:00 2001 From: allmag Date: Tue, 23 May 2023 12:05:23 +0100 Subject: [PATCH 177/411] Added to main.py and fixed test# --- src/data_processing/spp_snapshot_processing | 77 --------------------- 1 file changed, 77 deletions(-) delete mode 100644 src/data_processing/spp_snapshot_processing diff --git a/src/data_processing/spp_snapshot_processing b/src/data_processing/spp_snapshot_processing deleted file mode 100644 index 6479d71f8..000000000 --- a/src/data_processing/spp_snapshot_processing +++ /dev/null @@ -1,77 +0,0 @@ -import pandas as pd - -from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json - -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["snapshot_path"] # Taken from config file - -from src.data_ingest.loading import load_snapshot_data - - -def full_responses(contributors, responses): - - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through - having each questioncode as its own column. 
- - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - full_responses -- DataFrame containing both response and contributor data - """ - - drop_cols = ["createdby", "createddate", "lastupdatedby", "lastupdateddate"] - - unique_id_cols = ["reference", "period", "survey"] - - contributors_dropped = contributors.drop(drop_cols, axis=1) - responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) - - merged_df = contributors_dropped.merge(responses_dropped, - on = unique_id_cols) - - contextual_df = merged_df.drop(["questioncode", "response"], - axis=1).drop_duplicates() - - response_df = merged_df.pivot_table(index = unique_id_cols, - columns='questioncode', - values='response', - aggfunc=','.join).reset_index() - - full_responses = response_df.merge(contextual_df, on = unique_id_cols) - - return full_responses - - -def response_rate(contributors, responses): - - """Generates a response rate based on the contributor and response data - from the SPP Snapshot file. - - Arguments: - contributors -- DataFrame containing contributor data for BERD - from SPP Snapshot file - responses -- DataFrame containing response data for BERD from SPP Snapshot file - - Returns: - response_rate -- Float representing proportion of contributors who responded - """ - - no_responses = len(responses["reference"].unique()) - no_contributors = len(contributors["reference"].unique()) - - response_rate = no_responses / no_contributors - - return response_rate - -contributors = load_snapshot_data(snapshot_path, data_type = "contributors") -responses = load_snapshot_data(snapshot_path, data_type = "responses") - -full_responses = full_responses(contributors, responses) - -print("\nThe response rate is", "{0:.1%}".format(response_rate(contributors, responses))) From 62470ed89c34fc19c4ed5e2cedeb36da9b4e9e6e Mon Sep 17 00:00:00 2001 From: westwj1 Date: Wed, 31 May 2023 14:56:44 +0100 Subject: [PATCH 178/411] Add requi file. Remove txt from pre-commit check --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b79d95fd2..ec7a438fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ coverage pyyaml requests sphinx -postcodes_uk +postcodes_uk # remove this later - use regex From a777807f4c26f4d491143b8c8b79ab9264ff3b57 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 13:33:00 +0100 Subject: [PATCH 179/411] Had to comment out several lines in main as functions missing, including Config_settings. Branch cloned from develop. 
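Note: the snapshot parsing and response-rate logic exercised by the tests earlier in this series reduce to a few lines — parse_snap_data lifts the "contributors" and "responses" lists out of the snapshot dict into DataFrames, and response_rate divides the number of unique responding references by the number of unique contributor references. A toy run with made-up references, mirroring the two-out-of-three dummy data used in the tests:

import pandas as pd

# Hypothetical miniature snapshot, in the same top-level shape as the SPP snapshot.
snapdata = {
    "snapshot_id": "",
    "contributors": [{"reference": 101}, {"reference": 102}, {"reference": 103}],
    "responses": [{"reference": 101, "response": 50}, {"reference": 102, "response": 75}],
}

# parse_snap_data: one DataFrame per top-level list.
contributors = pd.DataFrame(snapdata["contributors"])
responses = pd.DataFrame(snapdata["responses"])

# response_rate: unique responding references over unique contributor references.
rate = len(responses["reference"].unique()) / len(contributors["reference"].unique())
print(f"{rate:.2f}")  # 0.67 -> two of the three contributors responded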
--- src/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main.py b/src/main.py index 9f69ddd4c..7efba1974 100644 --- a/src/main.py +++ b/src/main.py @@ -2,6 +2,7 @@ from src.utils import runlog from src._version import __version__ as version + from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data @@ -32,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - logger = logger_creator(global_config) + # logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - logger.info("Collecting logging parameters ..........") + # logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -78,12 +79,12 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - runlog_obj.retrieve_pipeline_logs() + # runlog_obj.retrieve_pipeline_logs() - run_time = round(time.time() - start, 5) - runlog_obj._record_time_taken(run_time) + # run_time = round(time.time() - start, 5) + # runlog_obj._record_time_taken(run_time) runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() - runlog_obj._write_runlog() \ No newline at end of file + runlog_obj._write_runlog() From ace57ef84ba60e805f18a04f09c11690d370163d Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 18:27:54 +0100 Subject: [PATCH 180/411] Reverted src/main.py to have no commented lines. --- src/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.py b/src/main.py index 7efba1974..252c954d3 100644 --- a/src/main.py +++ b/src/main.py @@ -33,9 +33,9 @@ def run_pipeline(start): global_config = config["global"] runlog_obj = runlog.RunLog(config, version) - # logger = logger_creator(global_config) + logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") - # logger.info("Collecting logging parameters ..........") + logger.info("Collecting logging parameters ..........") Manipulate_data() # Data Ingest @@ -79,10 +79,10 @@ def run_pipeline(start): MainLogger.info("Finishing Pipeline .......................") - # runlog_obj.retrieve_pipeline_logs() + runlog_obj.retrieve_pipeline_logs() - # run_time = round(time.time() - start, 5) - # runlog_obj._record_time_taken(run_time) + run_time = round(time.time() - start, 5) + runlog_obj._record_time_taken(run_time) runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() From 21db591059f9a794d5e232a61f8c38e270de16a0 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:52:41 +0100 Subject: [PATCH 181/411] Created short db names in data schema --- config/{Data_Schema.toml => data_schema.toml} | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) rename config/{Data_Schema.toml => data_schema.toml} (52%) diff --git a/config/Data_Schema.toml b/config/data_schema.toml similarity index 52% rename from config/Data_Schema.toml rename to config/data_schema.toml index eab7f35d5..564da1deb 100644 --- a/config/Data_Schema.toml +++ b/config/data_schema.toml @@ -737,3 +737,573 @@ Min_values = None Max_values = None Possible_categorical_Values = [] q_code = + +[sal_wages] +Description = Salaries & Wages +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = 
q202 + +[othr_expend] +Description = Other current expenditure +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q203 + +[total_expend] +Description = Total Current Expenditure +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q204 + +[basic_res] +Description = Basic Research +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q205 + +[applied_res] +Description = Applied Research +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q206 + +[exp_dev] +Description = Experimental Development +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q207 + +[land_build_capex] +Description = Land & Build CapEx +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q208 + +[equip_mach_capex] +Description = Equipment & Machinery CapEx +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q209 + +[total_capex] +Description = Total Capex. +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q210 + +[total_inhouse_expend] +Description = Total Inhouse Expenditure +Deduced_Data_Type = Numeric Integer +Nullable = No +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q211 + +[own_funds] +Description = Own Funds +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q212 + +[fund_eu_commission] +Description = Funding - Commission of the EU +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q213 + +[fund_uk_govt] +Description = Funding - UK government +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q214 + +[fund_orgs_outside_uk] +Description = Funding - Organisations outside the UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q215 + +[fund_oth_uk_private_bus_orgs] +Description = Funding - Other UK Private Bus/Public Orgs +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q216 + +[fund_oth_uk] +Description = Funding - Any Other UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an 
integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q217 + +[total_funding] +Description = Total Funding +Deduced_Data_Type = Numeric Integer +Nullable = No +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q218 + +[land_acq_rnd] +Description = Land Acquired for R&D (Split of Land & Build CapEx) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q219 + +[bldgs_acq_rnd] +Description = Buildings acquired/constructed for R&D (Split of Land & Build CapEx) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q220 + +[exp_computer_software] +Description = Expenditure on computer software only (of which from Equipment & Machinery CapEx) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q221 + +[purchase_materials] +Description = Purchase of Materials (Split of Other current) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q222 + +[purchase_services] +Description = Purchase of Services (Split of Other current) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q223 + +[own_business] +Description = Ownership - Own Business +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q225 + +[own_uk_govt] +Description = Ownership - UK Government +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q226 + +[own_oth_uk_priv_bus] +Description = Ownership - Other UK Priv Bus +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q227 + +[own_oth_uk_orgs] +Description = Ownership - Other UK Orgs +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q228 + +[own_bus_enterp_outside_uk] +Description = Ownership - Bus Enterprises in Group Outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q229 + +[own_oth_bus_enterp_outside_uk] +Description = Ownership - Other Bus Enterprises outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q230 + +[own_oth_govts_outside_uk] +Description = Ownership - Other Governments outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values 
= [] +q_code = q231 + +[own_high_edu_est_outside_uk] +Description = Ownership - Higher Education Establishments outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q232 + +[own_non_profit_orgs_outside_uk] +Description = Ownership - Non-profit Orgs outside the UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q233 + +[own_eu_commission] +Description = Ownership - Commission of EU +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q234 + +[own_intl_orgs] +Description = Ownership - International Orgs +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q235 + +[own_oth_orgs_outside_uk] +Description = Ownership - Any other Orgs outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q236 + +[own_not_owned_free_avail] +Description = Ownership - not owned freely available +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q237 + +[life_len_basic_res] +Description = Life Length - Basic Research +Deduced_Data_Type = Numeric Integer +Nullable = Not Asked +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q239 + +[life_len_applied_res] +Description = Life Length - Applied Research +Deduced_Data_Type = Numeric Integer +Nullable = Not Asked +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q240 + +[life_len_exp_res] +Description = Life Length - Experimental Res +Deduced_Data_Type = Numeric Integer +Nullable = Not Asked +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q241 + +[fund_oth_uk_orgs] +Description = Funding - Any other UK organisations +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q242 + +[fund_bus_enterp_group_outside_uk] +Description = Funding - Business Enterprises in group outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q243 + +[fund_oth_bus_enterp_outside_uk] +Description = Funding - Other Business Enterprises outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q244 + +[fund_oth_govts_outside_uk] +Description = Funding - Other Governments outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q245 + +[fund_high_edu_est_outside_uk] +Description 
= Funding - Higher Education Est Outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q246 + +[fund_non_profit_orgs_outside_uk] +Description = Funding - Non-profit Orgs outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q247 + +[fund_intl_orgs] +Description = Funding - International Orgs +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q248 + +[fund_oth_orgs_outside_uk] +Description = Funding - Any other orgs outside UK +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q249 + +[fund_uk_high_edu_est] +Description = Funding - UK Higher Education Establishments +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q250 + +[tax_cred_inhouse_expend_long] +Description = Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q251 + +[purchd_rnd_uk_yesno] +Description = Purchased/funded R&D in the UK (Yes or No) +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q302 + +[purchd_outside_uk_govt_fund] +Description = Purchased Outside UK (Govt Funded) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q303 + +[purchd_outside_uk_oth] +Description = Purchased Outside UK (Other) +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q304 + +[total_purchased] +Description = Total Purchased +Deduced_Data_Type = Numeric Integer +Nullable = Yes +Length = # an integer of the length, or N/A +Min_values = 0 +Max_values = 1000000 +Possible_categorical_Values = [] +q_code = q305 + +[tax_cred_purchd_work_uk_long] +Description = Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q307 + +[tax_cred_purchd_work_outside_uk_gov_fund_long] +Description = Tax credit claim submitted/intended for purchased work outside Ukorig. 
Funded by UK gov (Y/N) LONG FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q308 + +[tax_cred_purchd_work_outside_uk_oth_long] +Description = Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q309 + +[tax_cred_inhouse_expend_short] +Description = Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q713 + +[tax_cred_purchd_rnd_short] +Description = Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM +Deduced_Data_Type = Boolean (True or False, 0 or 1) +Nullable = No +Length = # an integer of the length, or N/A +Min_values = None +Max_values = None +Possible_categorical_Values = [] +q_code = q714 From ba721dc30b83198d1698a95f0b465cb91328f7cd Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:56:44 +0100 Subject: [PATCH 182/411] Deleting descriptions --- config/descriptions.txt | 57 ----------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 config/descriptions.txt diff --git a/config/descriptions.txt b/config/descriptions.txt deleted file mode 100644 index bab1c48ec..000000000 --- a/config/descriptions.txt +++ /dev/null @@ -1,57 +0,0 @@ -Salaries & Wages: sal_wages -Other current expenditure: othr_expend -Total Current Expenditure: total_expend -Basic Research: basic_res -Applied Research: applied_res -Experimental Development: exp_dev -Land & Build CapEx: land_build_capex -Equipment & Machinery CapEx: equip_mach_capex -Total Capex: total_capex -Total Inhouse Expenditure: total_inhouse_expend -Own Funds: own_funds -Funding - Commission of the EU: fund_eu_commission -Funding - UK government: fund_uk_govt -Funding - Organisations outside the UK: fund_orgs_outside_uk -Funding - Other UK Private Bus/Public Orgs: fund_oth_uk_private_bus_orgs -Funding - Any Other UK: fund_oth_uk -Total Funding: total_funding -Land Acquired for R&D (Split of Land & Build CapEx): land_acq_rnd -Buildings acquired/constructed for R&D (Split of Land & Build CapEx): bldgs_acq_rnd -Expenditure on computer software only (of which from Equipment & Machinery CapEx): exp_computer_software -Purchase of Materials (Split of Other current): purchase_materials -Purchase of Services (Split of Other current): purchase_services -Ownership - Own Business: own_business -Ownership - UK Government: own_uk_govt -Ownership - Other UK Priv Bus: own_oth_uk_priv_bus -Ownership - Other UK Orgs: own_oth_uk_orgs -Ownership - Bus Enterprises in Group Outside UK: own_bus_enterp_outside_uk -Ownership - Other Bus Enterprises outside UK: own_oth_bus_enterp_outside_uk -Ownership - Other Governments outside UK: own_oth_govts_outside_uk -Ownership - Higher Education Establishments outside UK: own_high_edu_est_outside_uk -Ownership - Non-profit Orgs outside the UK: own_non_profit_orgs_outside_uk -Ownership - Commission of EU: own_eu_commission -Ownership - International Orgs: own_intl_orgs -Ownership - Any other Orgs outside UK: own_oth_orgs_outside_uk -Ownership - not owned 
freely available: own_not_owned_free_avail -Life Length - Basic Research: life_len_basic_res -Life Length - Applied Research: life_len_applied_res -Life Length - Experimental Res: life_len_exp_res -Funding - Any other UK organisations: fund_oth_uk_orgs -Funding - Business Enterprises in group outside UK: fund_bus_enterp_group_outside_uk -Funding - Other Business Enterprises outside UK: fund_oth_bus_enterp_outside_uk -Funding - Other Governments outside UK: fund_oth_govts_outside_uk -Funding - Higher Education Est Outside UK: fund_high_edu_est_outside_uk -Funding - Non-profit Orgs outside UK: fund_non_profit_orgs_outside_uk -Funding - International Orgs: fund_intl_orgs -Funding - Any other orgs outside UK: fund_oth_orgs_outside_uk -Funding - UK Higher Education Establishments: fund_uk_high_edu_est -Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM: tax_cred_inhouse_expend_long -Purchased/funded R&D in the UK (Yes or No): purchd_rnd_uk_yesno -Purchased Outside UK (Govt Funded): purchd_outside_uk_govt_fund -Purchased Outside UK (Other): purchd_outside_uk_oth -Total Purchased: total_purchased -Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM: tax_cred_purchd_work_uk_long -Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM: tax_cred_purchd_work_outside_uk_gov_fund_long -Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM: tax_cred_purchd_work_outside_uk_oth_long -Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM: tax_cred_inhouse_expend_short -Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM: tax_cred_purchd_rnd_short \ No newline at end of file From 24f780723d587b26d804818404de0a6cf441dacb Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:57:18 +0100 Subject: [PATCH 183/411] Splitting data dicts --- config/contributors_schema.toml | 469 +++++++++++++++++ ...data_schema.toml => responses_schema.toml} | 470 ------------------ 2 files changed, 469 insertions(+), 470 deletions(-) create mode 100644 config/contributors_schema.toml rename config/{data_schema.toml => responses_schema.toml} (67%) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml new file mode 100644 index 000000000..fb92d1fe0 --- /dev/null +++ b/config/contributors_schema.toml @@ -0,0 +1,469 @@ +[snapshot_id] +Description = "nan" +Deduced_Data_Type = "nan" +Nullable = "nan" +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] + +[reference] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = 11001603625 +Max_values = 19891309165 +Possible_Categorical_Values = ["nan"] + +[period] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = 6 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [202012] + +[survey] +Description = "All values are 002" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["002"] + +[formid] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "int" +Length = 2 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [20, 21] + +[status] +Description = "nan" +Deduced_Data_Type = 
"category" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] + +[statusencoded] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "int" +Length = 3 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] + +[receiptdate] +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" +Nullable = False +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] + +[lockedby] +Description = "All empty strings" +Deduced_Data_Type = "pandas.NA" +Nullable = True +Current_Data_Type = ["None","str"] +Length = 0 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] + +[lockeddate] +Description = "All None type" +Deduced_Data_Type = "pandas.NA" +Nullable = True +Current_Data_Type = ["None","str"] +Length = 0 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] + +[formtype] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["0001", "0006"] + +[checkletter] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = 1 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] + +[frozensicoutdated] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1120 +Max_values = 93059 +Possible_Categorical_Values = ["nan"] + +[rusicoutdated] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1120 +Max_values = 93059 +Possible_Categorical_Values = ["nan"] + +[frozensic] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1300 +Max_values = 96090 +Possible_Categorical_Values = ["nan"] + +[rusic] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1300 +Max_values = 96090 +Possible_Categorical_Values = ["nan"] + +[frozenemployees] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 272527 +Possible_Categorical_Values = ["nan"] + +[employees] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 272528 +Possible_Categorical_Values = ["nan"] + +[frozenemployment] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1 +Max_values = 272527 +Possible_Categorical_Values = ["nan"] + +[employment] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1 +Max_values = 272528 +Possible_Categorical_Values = ["nan"] + +[frozenfteemployment] +Description = "nan" +Deduced_Data_Type = "float" +Nullable = False 
+Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 177699.0 +Possible_Categorical_Values = ["nan"] + +[fteemployment] +Description = "nan" +Deduced_Data_Type = "float" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0.0 +Max_values = 177699.5 +Possible_Categorical_Values = ["nan"] + +[frozenturnover] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 55277352 +Possible_Categorical_Values = ["nan"] + +[turnover] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 55277352 +Possible_Categorical_Values = ["nan"] + +[enterprisereference] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = 1001603625 +Max_values = 9891309165 +Possible_categorical_Values = ["nan"] + +[wowenterprisereference] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = 1001603625 +Max_values = 9891309165 +Possible_categorical_Values = ["nan"] + +[cellnumber] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "int" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [0] + +[currency] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["S", "E"] + +[vatreference] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["VATREF"] + +[payereference] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["PAYEREF"] + +[companyregistrationnumber] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["CRN"] + +[numberlivelocalunits] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 6063 +Possible_categorical_Values = ["nan"] + +[numberlivevat] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 255 +Possible_categorical_Values = ["nan"] + +[numberlivepaye] +Description = "nan" +Deduced_Data_Type = "int" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 0 +Max_values = 24 +Possible_categorical_Values = ["nan"] + +[legalstatus] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = 1 +Max_values = 4 +Possible_Categorical_Values = [1, 2, 3, 4] + +[reportingunitmarker] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["L", "E"] + +[region] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] 
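Note: schema entries like [legalstatus] and [region] above pair a deduced data type with either a numeric range or a list of allowed categories, which is enough to drive simple column checks. A minimal, self-contained sketch of that idea — the entry is mirrored inline as a plain dict and check_column is a hypothetical helper, not code from this pipeline:

import pandas as pd

# One schema entry, copied inline as a dict (illustrative only).
legalstatus_entry = {
    "Deduced_Data_Type": "category",
    "Nullable": False,
    "Min_values": 1,
    "Max_values": 4,
    "Possible_Categorical_Values": [1, 2, 3, 4],
}


def check_column(series: pd.Series, entry: dict) -> bool:
    """Hypothetical helper: does every value satisfy the schema entry?"""
    if not entry["Nullable"] and series.isna().any():
        return False
    allowed = entry["Possible_Categorical_Values"]
    if allowed != ["nan"]:
        # Categorical field: every value must be one of the allowed categories.
        return bool(series.isin(allowed).all())
    # Otherwise fall back on the numeric range.
    return bool(series.between(entry["Min_values"], entry["Max_values"]).all())


col = pd.Series([1, 2, 4], name="legalstatus")
print(check_column(col, legalstatus_entry))  # True: every value is one of [1, 2, 3, 4]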
+ +[birthdate] +Description = "Datetime format = format=%d/%m/%Y" +Deduced_Data_Type = "Datetime" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[referencename] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[referencepostcode] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[tradingstyle] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[selectiontype] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["L"] + +[inclusionexclusion] +Description = 'All values are " "' +Deduced_Data_Type = "pandas.NA" +Nullable = False +Current_Data_Type = "str" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[createdby] +Description = "nan" +Deduced_Data_Type = "str" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["ingestion"] + +[createddate] +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" +Nullable = False +Current_Data_Type = "str" +Length = ">=1" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] + +[lastupdatedby] +Description = "nan" +Deduced_Data_Type = "category" +Nullable = False +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] + +[lastupdateddate] +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" +Nullable = False +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] diff --git a/config/data_schema.toml b/config/responses_schema.toml similarity index 67% rename from config/data_schema.toml rename to config/responses_schema.toml index 564da1deb..60354cc2e 100644 --- a/config/data_schema.toml +++ b/config/responses_schema.toml @@ -1,473 +1,3 @@ -[snapshot_id] -Description = "nan" -Deduced_Data_Type = "nan" -Nullable = "nan" -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] - -[reference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 11001603625 -Max_values = 19891309165 -Possible_Categorical_Values = ["nan"] - -[period] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = 6 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [202012] - -[survey] -Description = "All values are 002" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["002"] - -[formid] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "int" -Length = 2 -Min_values = "nan" -Max_values = "nan" 
-Possible_Categorical_Values = [20, 21] - -[status] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] - -[statusencoded] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "int" -Length = 3 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] - -[receiptdate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] - -[lockedby] -Description = "All empty strings" -Deduced_Data_Type = "pandas.NA" -Nullable = True -Current_Data_Type = ["None","str"] -Length = 0 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] - -[lockeddate] -Description = "All None type" -Deduced_Data_Type = "pandas.NA" -Nullable = True -Current_Data_Type = ["None","str"] -Length = 0 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] - -[formtype] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["0001", "0006"] - -[checkletter] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = 1 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] - -[frozensicoutdated] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1120 -Max_values = 93059 -Possible_Categorical_Values = ["nan"] - -[rusicoutdated] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1120 -Max_values = 93059 -Possible_Categorical_Values = ["nan"] - -[frozensic] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1300 -Max_values = 96090 -Possible_Categorical_Values = ["nan"] - -[rusic] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1300 -Max_values = 96090 -Possible_Categorical_Values = ["nan"] - -[frozenemployees] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 272527 -Possible_Categorical_Values = ["nan"] - -[employees] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 272528 -Possible_Categorical_Values = ["nan"] - -[frozenemployment] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 272527 -Possible_Categorical_Values = ["nan"] - -[employment] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 272528 -Possible_Categorical_Values = ["nan"] - 
-[frozenfteemployment] -Description = "nan" -Deduced_Data_Type = "float" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0.0 -Max_values = 177699.0 -Possible_Categorical_Values = ["nan"] - -[fteemployment] -Description = "nan" -Deduced_Data_Type = "float" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0.0 -Max_values = 177699.5 -Possible_Categorical_Values = ["nan"] - -[frozenturnover] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 55277352 -Possible_Categorical_Values = ["nan"] - -[turnover] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 55277352 -Possible_Categorical_Values = ["nan"] - -[enterprisereference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 1001603625 -Max_values = 9891309165 -Possible_categorical_Values = ["nan"] - -[wowenterprisereference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 1001603625 -Max_values = 9891309165 -Possible_categorical_Values = ["nan"] - -[cellnumber] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "int" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [0] - -[currency] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["S", "E"] - -[vatreference] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["VATREF"] - -[payereference] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["PAYEREF"] - -[companyregistrationnumber] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["CRN"] - -[numberlivelocalunits] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 6063 -Possible_categorical_Values = ["nan"] - -[numberlivevat] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 255 -Possible_categorical_Values = ["nan"] - -[numberlivepaye] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 24 -Possible_categorical_Values = ["nan"] - -[legalstatus] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 4 -Possible_Categorical_Values = [1, 2, 3, 4] - -[reportingunitmarker] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["L", "E"] - -[region] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" 
-Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] - -[birthdate] -Description = "Datetime format = format=%d/%m/%Y" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[referencename] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[referencepostcode] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[tradingstyle] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[selectiontype] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["L"] - -[inclusionexclusion] -Description = 'All values are " "' -Deduced_Data_Type = "pandas.NA" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[createdby] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["ingestion"] - -[createddate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - -[lastupdatedby] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] - -[lastupdateddate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] - [cell_id] Description = Cell ID Deduced_Data_Type = Categorical From ece8daabef4d325f70450a9713a1f8449fd17bd6 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:59:21 +0100 Subject: [PATCH 184/411] Rename main to pipeline --- src/{main.py => pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{main.py => pipeline.py} (100%) diff --git a/src/main.py b/src/pipeline.py similarity index 100% rename from src/main.py rename to src/pipeline.py From c4429ce4095faf187fe03739f5a336d8b99ef462 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Mon, 5 Jun 2023 16:59:37 +0100 Subject: [PATCH 185/411] Remaning pipeline import --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 24e7b03a8..25eea835f 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,7 @@ from importlib import reload import time -import src.main as src +import src.pipeline as src # reload the pipeline module to implement any changes reload(src) From 0ed26721a5f4c23814514720d2113763385c5e77 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 7 Jun 2023 12:40:15 +0100 Subject: [PATCH 186/411] Adding type hints --- 
src/data_processing/spp_snapshot_processing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index c1887bd1e..ed4c86704 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -1,10 +1,11 @@ from src.utils.wrappers import validate_dataframe_not_empty - +from typing import List +import pandas as pd import logging spp_processing_logger = logging.getLogger(__name__) -def create_response_dataframe(df, unique_id_cols): +def create_response_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> pd.DataFrame: """Create a response dataframe using pivot_table to reshape the data. Arguments: @@ -20,7 +21,7 @@ def create_response_dataframe(df, unique_id_cols): return response_df -def create_contextual_dataframe(df, unique_id_cols): +def create_contextual_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> pd.DataFrame: """Create a contextual dataframe by dropping 'questioncode' and 'response' columns and removing duplicates. @@ -37,7 +38,7 @@ def create_contextual_dataframe(df, unique_id_cols): @validate_dataframe_not_empty -def full_responses(contributors, responses): +def full_responses(contributors: pd.DataFrame, responses: pd.DataFrame) -> pd.DataFrame: """Merges contributor and response data together into a dataframe that is in a format allowing for easier manipulation later in pipeline - notably through @@ -71,7 +72,7 @@ def full_responses(contributors, responses): return full_responses @validate_dataframe_not_empty -def response_rate(contributors, responses): +def response_rate(contributors: pd.DataFrame, responses: pd.DataFrame) -> float: """Generates a response rate based on the contributor and response data from the SPP Snapshot file. 
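A minimal sketch of the reshape that the newly type-hinted create_response_dataframe performs; the exact pivot_table arguments are not visible in this hunk, so the call and the toy values below are assumptions for illustration only, not the pipeline's own code.

import pandas as pd

# Toy long-format responses: one row per (reference, questioncode) pair.
# All values here are made up purely to show the reshape.
long_df = pd.DataFrame(
    {
        "reference": [101, 101, 102, 102],
        "period": [202012, 202012, 202012, 202012],
        "survey": [1, 1, 1, 1],
        "questioncode": [200, 201, 200, 201],
        "response": [10, 20, 30, 40],
    }
)

unique_id_cols = ["reference", "period", "survey"]

# pivot_table gives each questioncode its own column, producing the wide
# shape that full_responses then merges with the contextual columns.
response_df = long_df.pivot_table(
    index=unique_id_cols,
    columns="questioncode",
    values="response",
    aggfunc="first",
).reset_index()

# response_df now holds one row per reference, with columns 200 and 201.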
From db676d285fe475f98aa4399985814e3bff172cf5 Mon Sep 17 00:00:00 2001 From: jwestw Date: Wed, 7 Jun 2023 14:34:09 +0100 Subject: [PATCH 187/411] Stop tracking add ssh bat --- .gitignore | 3 +++ add_ssh_key.bat | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) delete mode 100755 add_ssh_key.bat diff --git a/.gitignore b/.gitignore index df27fa462..45e3c17b0 100644 --- a/.gitignore +++ b/.gitignore @@ -904,3 +904,6 @@ docs/_linkcheck/ logs/* !logs/.gitkeep !logs/logs.md + +# Utility bat file for ssh key +add_ssh_key.bat diff --git a/add_ssh_key.bat b/add_ssh_key.bat deleted file mode 100755 index 745561adc..000000000 --- a/add_ssh_key.bat +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# Start the SSH agent and evaluate its output -eval "$(ssh-agent -s)" - -# Add the SSH key to the agent -ssh-add ~/.ssh/githubwork/work_key \ No newline at end of file From f9480cad175575d07a955bbbf2950a7dc2810465 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 10:55:21 +0100 Subject: [PATCH 188/411] correcting pre-commit readme --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 39690211d..b8838c944 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -71,5 +71,5 @@ repos: hooks: - id: coverage-badge name: Update the coverage badge in the readme - entry: bash -c 'lines=$(readme-cov)' - language: system + entry: python /home/cdsw/research-and-development/cov_reports/update_readme.py + language: python From 5d9131125a062e11853f3151e77c015746cb7925 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:17:33 +0100 Subject: [PATCH 189/411] adding typing to reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index ec7a438fd..fc0e95966 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ pyyaml requests sphinx postcodes_uk # remove this later - use regex +typing From c84f1086efc23e60b557e5b8e44d4f8eb105d3d7 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:17:57 +0100 Subject: [PATCH 190/411] removing deleted test_loading --- tests/test_data_ingest/test_loading.py | 43 -------------------------- 1 file changed, 43 deletions(-) delete mode 100644 tests/test_data_ingest/test_loading.py diff --git a/tests/test_data_ingest/test_loading.py b/tests/test_data_ingest/test_loading.py deleted file mode 100644 index 195727e1f..000000000 --- a/tests/test_data_ingest/test_loading.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd -from typing import Tuple - -# Import modules to test -from src.data_ingest.loading import parse_snap_data - - -class TestParseSPP: - """Test for Parse Snap data function""" - - def input_data(self) -> dict: - dummy_snapdata = { - "snapshot_id": "", - "contributors": [ - {"ref": "123", "con": "789"}, - {"ref": "456", "con": "910"}, - ], - "responses": [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}], - } - - return dummy_snapdata - - def exp_output(self) -> Tuple[pd.DataFrame, pd.DataFrame]: - contributor_df = pd.DataFrame( - [{"ref": "123", "con": "789"}, {"ref": "456", "con": "910"}] - ) - - responses_df = pd.DataFrame( - [{"ref": "123", "res": "789"}, {"ref": "456", "res": "910"}] - ) - - return contributor_df, responses_df - - def test_parse_snap_data(self): - """Tests for full_responses function.""" - - inputdata = self.input_data() - df_result1, df_result2 = parse_snap_data(inputdata) - - expected_output_data1, expected_output_data2 = 
self.exp_output() - - pd.testing.assert_frame_equal(df_result1, expected_output_data1) - pd.testing.assert_frame_equal(df_result2, expected_output_data2) From 618b3a450bae8574ddae3cddc9067ab5f4f84cc9 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:18:50 +0100 Subject: [PATCH 191/411] fixing spp_processing and tests --- .../spp_snapshot_processing.py | 20 +++++++++------ .../test_spp_snapshot_processing.py | 25 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index ed4c86704..452c57793 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -5,7 +5,9 @@ spp_processing_logger = logging.getLogger(__name__) -def create_response_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> pd.DataFrame: +def create_response_dataframe( + df: pd.DataFrame, unique_id_cols: List[str] +) -> pd.DataFrame: """Create a response dataframe using pivot_table to reshape the data. Arguments: @@ -21,7 +23,9 @@ def create_response_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> pd return response_df -def create_contextual_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> pd.DataFrame: +def create_contextual_dataframe( + df: pd.DataFrame, unique_id_cols: List[str] +) -> pd.DataFrame: """Create a contextual dataframe by dropping 'questioncode' and 'response' columns and removing duplicates. @@ -40,8 +44,8 @@ def create_contextual_dataframe(df: pd.DataFrame, unique_id_cols: List[str]) -> @validate_dataframe_not_empty def full_responses(contributors: pd.DataFrame, responses: pd.DataFrame) -> pd.DataFrame: - """Merges contributor and response data together into a dataframe that is in a - format allowing for easier manipulation later in pipeline - notably through + """Merges contributor and response data together into a dataframe that is in a + format allowing for easier manipulation later in pipeline - notably through having each questioncode as its own column. Arguments: @@ -61,20 +65,21 @@ def full_responses(contributors: pd.DataFrame, responses: pd.DataFrame) -> pd.Da responses_dropped = responses.drop(drop_cols + ["adjustedresponse"], axis=1) merged_df = contributors_dropped.merge(responses_dropped, on=unique_id_cols) - # Create a contextual df by dropping "questioncode" and "response" cols. Remove dupes + # Create contextual df by dropping "questioncode" and "response" cols. Remove dupes contextual_df = create_contextual_dataframe(merged_df, unique_id_cols) # Create a response dataframe using pivot_table to reshape the data response_df = create_response_dataframe(merged_df, unique_id_cols) - full_responses = response_df.merge(contextual_df, on = unique_id_cols) + full_responses = response_df.merge(contextual_df, on=unique_id_cols) return full_responses + @validate_dataframe_not_empty def response_rate(contributors: pd.DataFrame, responses: pd.DataFrame) -> float: - """Generates a response rate based on the contributor and response data + """Generates a response rate based on the contributor and response data from the SPP Snapshot file. 
Arguments: @@ -95,4 +100,3 @@ def response_rate(contributors: pd.DataFrame, responses: pd.DataFrame) -> float: spp_processing_logger.info(f"The SPP response rate is {round(response_rate,2)}%") return response_rate - diff --git a/tests/test_data_processing/test_spp_snapshot_processing.py b/tests/test_data_processing/test_spp_snapshot_processing.py index 22b1eb0d3..dbd47c3b3 100644 --- a/tests/test_data_processing/test_spp_snapshot_processing.py +++ b/tests/test_data_processing/test_spp_snapshot_processing.py @@ -5,6 +5,14 @@ from typing import Tuple +from src.data_processing.spp_snapshot_processing import ( + create_response_dataframe, + full_responses, + response_rate, + create_contextual_dataframe, +) + + @pytest.fixture def dummy_data() -> Tuple[pd.DataFrame, pd.DataFrame]: # Set up the dummy data @@ -56,7 +64,6 @@ def expected_output(): def test_full_responses(dummy_data, expected_output): """Tests for full_responses function.""" # Import modules to test - from src.data_processing.spp_snapshot_processing import full_responses contributor_data, responses_data = dummy_data expected_output_data = expected_output @@ -68,7 +75,6 @@ def test_full_responses(dummy_data, expected_output): def test_response_rate(dummy_data): # Import the module to test - from src.data_processing.spp_snapshot_processing import response_rate contributor_data, responses_data = dummy_data @@ -79,9 +85,7 @@ def test_response_rate(dummy_data): def test_create_response_dataframe(dummy_data): - - from src.data_processing.spp_snapshot_processing import create_response_dataframe - + contributor_data, responses_data = dummy_data unique_id_cols = ["reference", "period", "survey"] expected_columns = ["reference", "period", "survey", 200, 201, 202] @@ -97,7 +101,8 @@ def test_create_response_dataframe(dummy_data): # Assert the data assert response_df.values.tolist() == expected_data - + + def test_create_contextual_dataframe(dummy_data): contributor_data, responses_data = dummy_data unique_id_cols = ["reference", "period", "survey"] @@ -109,14 +114,14 @@ def test_create_contextual_dataframe(dummy_data): "createddate", "lastupdatedby", "lastupdateddate", + "adjustedresponse", ] expected_data = [ - [101, 202012, 1, "James", 2020, "Vondy", 2020], - [102, 202012, 1, "Ilyas", 2020, "Charl", 2020], - [103, 202012, 1, "Roddy", 2020, "Gareth", 2020], + [101, 202012, 1, "A", 2020, "A", 2020, ""], + [102, 202012, 1, "A", 2020, "A", 2020, ""], ] - contextual_df = create_contextual_dataframe(contributor_data, unique_id_cols) + contextual_df = create_contextual_dataframe(responses_data, unique_id_cols) # Assert the columns assert contextual_df.columns.tolist() == expected_columns From 2e26c7b9b9149d48232af4a7763159e90bde7de5 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:37:44 +0100 Subject: [PATCH 192/411] sorted paths in pipeline --- src/pipeline.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pipeline.py b/src/pipeline.py index 252c954d3..20aae146c 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -5,7 +5,6 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator -from src.utils.testfunctions import Manipulate_data from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json @@ -36,8 +35,6 @@ def run_pipeline(start): logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") logger.info("Collecting logging 
parameters ..........") - Manipulate_data() - # Data Ingest MainLogger.info("Starting Data Ingest...") # Load SPP data from DAP @@ -84,6 +81,7 @@ def run_pipeline(start): run_time = round(time.time() - start, 5) runlog_obj._record_time_taken(run_time) + runlog_obj.retrieve_configs() runlog_obj._create_runlog_dicts() runlog_obj._create_runlog_dfs() runlog_obj.create_runlog_files() From 2b8dc85140ac1cf8d3e420e962eb9baff40737f1 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:38:55 +0100 Subject: [PATCH 193/411] added logs message and removed paths --- src/data_processing/spp_snapshot_processing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/data_processing/spp_snapshot_processing.py b/src/data_processing/spp_snapshot_processing.py index 452c57793..77715ad8d 100644 --- a/src/data_processing/spp_snapshot_processing.py +++ b/src/data_processing/spp_snapshot_processing.py @@ -97,6 +97,8 @@ def response_rate(contributors: pd.DataFrame, responses: pd.DataFrame) -> float: response_rate = response_count / contributor_count - spp_processing_logger.info(f"The SPP response rate is {round(response_rate,2)}%") + rounded_resp_rate = round(response_rate, 2) + + spp_processing_logger.info(f"The response rate is {rounded_resp_rate}%") return response_rate From 35ac8419eb08b1c99134dd24d1e88567d148145c Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:39:10 +0100 Subject: [PATCH 194/411] added log message to wrapper --- src/utils/wrappers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils/wrappers.py b/src/utils/wrappers.py index 37999ac89..8c44e07cd 100644 --- a/src/utils/wrappers.py +++ b/src/utils/wrappers.py @@ -197,9 +197,12 @@ def _change_direction(before, after): Either 'Table' or 'SingleLine' must be specified.""" ) + def validate_dataframe_not_empty(func): def wrapper(df, *args, **kwargs): if df.empty: + logger.warning("Input dataframe is empty.") raise ValueError("Input dataframe is empty.") return func(df, *args, **kwargs) - return wrapper \ No newline at end of file + + return wrapper From d5f27fe6296551719e53ae02292c246312b0cb30 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:46:24 +0100 Subject: [PATCH 195/411] changed main to pipeline in workflow --- .github/workflows/pytest-action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index c9ffb34f4..cfe923e8b 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -33,7 +33,7 @@ jobs: working-directory: ${{ github.workspace }} shell: bash -l {0} run: | - coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/main.py \ + coverage run --branch --source=./src --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ -o python_coverage.xml && coverage report -m --fail-under=10 # 6) Get the coverage report in to the pull request comments From aa946caa069d121d1f437ad03892639edb6b1592 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:47:50 +0100 Subject: [PATCH 196/411] added readme to req --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fc0e95966..3598ecc37 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -17,3 +17,4 @@ requests sphinx postcodes_uk # remove this later - use regex typing +readme-coverage-badger From 87ca40bd35400eec59a5bb0a8c70fd42d4d256eb Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 8 Jun 2023 11:49:43 +0100 Subject: [PATCH 197/411] adding readme-cov --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63d64b3b7..0b700ab82 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Research and Development Project -![Code Coverage](https://img.shields.io/badge/Coverage-23%25-red.svg) +![Code Coverage](https://img.shields.io/badge/Coverage-39%25-red.svg) Calculating national and regional research and development expenditure as part of [national accounts](https://www.ons.gov.uk/economy/nationalaccounts). From 86fea5b68ab9b79415086ca93b3ca1779f38efad Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 19:18:18 +0100 Subject: [PATCH 198/411] Created a DataSchema.toml file. Currently blank. --- config/DataSchema.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 config/DataSchema.toml diff --git a/config/DataSchema.toml b/config/DataSchema.toml new file mode 100644 index 000000000..e69de29bb From 8e1ab5cd8698e3406a936e70990770f13ef418ba Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 19:19:09 +0100 Subject: [PATCH 199/411] Created a check_data_shape function in validation.py. Takes file path and number of columns as arguments. Returns a bool. --- src/data_validation/validation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 55dbbf3ea..c9dfced42 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -119,3 +119,20 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): ] return unreal_postcodes +import os + + +def check_data_shape( + filePath: str = "./config/DataSchema.toml", numCols: int = 5 +) -> bool: + """_summary_ + + Keyword Arguments: + filePath -- Path to data dictionary file (default: {"./config/DataSchema.toml"}) + numCols -- Number of columns in data (default: {5}) + + Returns: + A bool: boolean, True is number of columns is as expected, otherwise False + """ + os.path.exists(filePath) + return numCols From 4c4f0e817dc5b75cb9389e0847a4a182dcc85a18 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 26 Apr 2023 19:47:49 +0100 Subject: [PATCH 200/411] Result of data_schema.py. Toml file indicating the different characteristics of each variable. 
--- config/DataSchema.toml | 650 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 650 insertions(+) diff --git a/config/DataSchema.toml b/config/DataSchema.toml index e69de29bb..aa15a8a78 100644 --- a/config/DataSchema.toml +++ b/config/DataSchema.toml @@ -0,0 +1,650 @@ +[cell_id] +description = "Cell ID" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[civ_or_def] +description = "Business type: Civil or Defence" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[current_sic] +description = "Sic - Standard Industry Classification" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[data_source] +description = "Constructed" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[emp_other] +description = "emp_other (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_researcher] +description = "emp_researcher (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_technician] +description = "emp_technician (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[emp_total] +description = "emp_total (Full Time Equivalent)" +data_type = "Numeric float (or decimal)" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[employee_count] +description = "Employee Count (IDBR)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[foreign_owner] +description = "Foreign Owner" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[form_status] +description = "Status" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[form_type] +description = "Form Type" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[freeze_id] +description = "Freeze ID - bespoke to openroad" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[headcount_oth_f] +description = "Other Female (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_oth_m] +description = "Other Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_res_f] +description = "Researchers Females (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_res_m] +description = "Researchers Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_tec_f] +description = "Technicians Female (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_tec_m] +description = "Technicians Male (Headcount)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[headcount_total] +description = "Total Headcount" +data_type = 
"Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[period] +description = "Openroad Specific" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[period_contributor_id] +description = "Openroad Specific" +data_type = "Categorical" +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[period_year] +description = "Period" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[product_group] +description = "Published Product Group" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[ru_ref] +description = "Reference" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[sizeband] +description = "SizeBand" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[wowentref] +description = "Wowentref" +data_type = "Categorical" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q202] +description = "Salaries & Wages" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q203] +description = "Other current expenditure" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q204] +description = "Total Current Expenditure" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q205] +description = "Basic Research" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q206] +description = "Applied Research" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q207] +description = "Experimental Development" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q208] +description = "Land & Build CapEx " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q209] +description = "Equipment & Machinery CapEx" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q210] +description = "Total Capex." 
+data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q211] +description = "Total Inhouse Expenditure " +data_type = "Numeric Integer" +nullable = "No" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q212] +description = "Own Funds" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q213] +description = "Funding - Commission of the EU" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q214] +description = "Funding - UK government" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q215] +description = "Funding - Organisations outside the Uk " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q216] +description = "Funding - Other UK Private Bus/Public Orgs " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q217] +description = "Funding - Any Other UK " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q218] +description = "Total Funding " +data_type = "Numeric Integer" +nullable = "No" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q219] +description = "Land Acquired for R&D (Split of Land & Build CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q220] +description = "Buildings acquired/constructed for R&D (Split of Land & Build CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q221] +description = "Expenditure on computer software only (of which from Equipment & Machinery CapEx)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q222] +description = "Purchase of Materials (Split of Other current)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q223] +description = "Purchase of Services (Split of Other current)" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q224] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q225] +description = "Ownership - Own Business" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q226] +description = "Ownership - UK Government" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q227] +description = "Ownership - Other UK Priv Bus" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q228] +description = "Ownership - Other UK Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q229] +description = "Ownership - Bus Enterprises in Group Outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q230] +description = "Ownership - Other Bus Enterprises outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q231] +description = "Ownership - Other Governments outside UK" 
+data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q232] +description = "Ownership - Higher Education Establishments outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q233] +description = "Ownership - Non-profit Orgs outside the UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q234] +description = "Ownership - Commission of EU" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q235] +description = "Ownership - International Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q236] +description = "Ownership - Any other Orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q237] +description = "Ownership - not owned freely available" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q238] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q239] +description = "Life Length - Basic Research" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q240] +description = "Life Length - Applied Research" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q241] +description = "Life Length - Experimental Res" +data_type = "Numeric Integer" +nullable = "Not Asked" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q242] +description = "Funding - Any other UK organisations" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q243] +description = "Funding - Business Enterprises in group outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q244] +description = "Funding - Other Business Enterprises outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q245] +description = "Funding - Other Governments outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q246] +description = "Funding - Higher Education Est Outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q247] +description = "Funding - Non-profit Orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q248] +description = "Funding - International Orgs" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q249] +description = "Funding - Any other orgs outside UK" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q250] +description = "Funding - UK Higher Education Establishments" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q251] +description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" 
+max_acceptable_value = "nan" + +[q252] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q253] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q254] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q255] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q256] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q257] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q258] +description = nan +data_type = nan +nullable = nan +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q302] +description = "Purchased/funded R&D in the UK (Yes or No)" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q303] +description = "Purchased Outside UK (Govt Funded) " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q304] +description = "Purchased Outside UK (Other) " +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q305] +description = "Total Purchased" +data_type = "Numeric Integer" +nullable = "Yes" +min_acceptable_value = 0 +max_acceptable_value = 1000000 + +[q307] +description = "Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q308] +description = "Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q309] +description = "Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q713] +description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" + +[q714] +description = "Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM" +data_type = "Boolean (True or False, 0 or 1)" +nullable = "No" +min_acceptable_value = "nan" +max_acceptable_value = "nan" From 11df0b37e725c3122ab5a2d248b27aafad794bf6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 16:59:10 +0100 Subject: [PATCH 201/411] Removed print statements from loading.py. 
--- src/data_ingest/loading.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/data_ingest/loading.py diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py new file mode 100644 index 000000000..937d61c44 --- /dev/null +++ b/src/data_ingest/loading.py @@ -0,0 +1,34 @@ +import pandas as pd + +from src.utils.helpers import Config_settings +from src.utils.hdfs_mods import hdfs_load_json + +conf_obj = Config_settings() +config = conf_obj.config_dict +snapshot_path = config["paths"]["snapshot_path"] # Taken from config file + +snapdata = hdfs_load_json(snapshot_path) + +contributerdict = snapdata["contributors"] +responsesdict = snapdata["responses"] + +contributers = pd.DataFrame(contributerdict) +responses = pd.DataFrame(responsesdict) + + # Open the file in read mode inside Hadoop context + with hdfs.open(filepath, "r") as file: + # Import csv file and convert to Dataframe + datadict = json.load(file) + contributerdict = datadict["contributors"][0] + responsesdict = datadict["responses"][0] + + datadf = pd.DataFrame.from_dict(datadict, orient="index") + + return datadf, contributerdict, responsesdict + + +snapdata, contributerdict, responsesdict = hdfs_load_json(file_path) + +# print(contributerdict) +# print("\n") +# print(responsesdict) From 116ffc58bccf0c5661b8a7de2eeebb81611da7a8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 17:07:59 +0100 Subject: [PATCH 202/411] Imported hdfs_load_json to compare the snapshot file to the data schema. Updated function arguments, defaults and code for comparison. --- src/data_validation/validation.py | 34 ++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c9dfced42..bb969eb20 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -120,10 +120,22 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os +import toml +from loading import hdfs_load_json +import sys + +sys.path.insert(0, "./src/data_ingest/") + + +snapshot_path = ( + "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" +) def check_data_shape( - filePath: str = "./config/DataSchema.toml", numCols: int = 5 + dataFile: str = snapshot_path, + filePath: str = "./config/DataSchema.toml", + numCols: int = 5, ) -> bool: """_summary_ @@ -134,5 +146,21 @@ def check_data_shape( Returns: A bool: boolean, True is number of columns is as expected, otherwise False """ - os.path.exists(filePath) - return numCols + # Check if DataSchema.toml exists + file_exists = os.path.exists(filePath) + snapdata, contributerdict, responsesdict = hdfs_load_json(snapshot_path) + + if not file_exists: + return file_exists + else: + toml_string = toml.load(filePath) + shared_items = { + k: toml_string[k] + for k in toml_string + if k in contributerdict and toml_string[k] == contributerdict[k] + } + + data_rows, data_columns = len(contributerdict), 1 + schema_rows, schamea_columns = len(toml_string), 1 + + return len(shared_items), data_rows, data_columns, schema_rows, schamea_columns From a5f515887eb73cded2713e93ba74f510bb9523bf Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 27 Apr 2023 23:59:10 +0100 Subject: [PATCH 203/411] File containing function to check data shape. Grabs snapshot file by default and compares to data schema. 
--- src/data_validation/validation.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index bb969eb20..41854c0ec 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -121,8 +121,8 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os import toml -from loading import hdfs_load_json import sys +from loading import hdfs_load_json sys.path.insert(0, "./src/data_ingest/") @@ -137,10 +137,14 @@ def check_data_shape( filePath: str = "./config/DataSchema.toml", numCols: int = 5, ) -> bool: - """_summary_ + """Compares the shape of the data and compares it to the shape of the toml + file based off the data schema. Returns true if there is a match and false + otherwise. Keyword Arguments: - filePath -- Path to data dictionary file (default: {"./config/DataSchema.toml"}) + dataFile -- Path to data file to compare (default: {snapshot_path}) + filePath -- Path to schema dictionary file + (default: {"./config/DataSchema.toml"}) numCols -- Number of columns in data (default: {5}) Returns: @@ -154,13 +158,21 @@ def check_data_shape( return file_exists else: toml_string = toml.load(filePath) + shared_items = { k: toml_string[k] for k in toml_string if k in contributerdict and toml_string[k] == contributerdict[k] } - data_rows, data_columns = len(contributerdict), 1 - schema_rows, schamea_columns = len(toml_string), 1 + # data_key1 = list(contributerdict.keys())[0] + # schema_key1 = list(toml_string.keys())[0] + + # data_rows, data_columns = len(contributerdict), contributerdict[data_key1] + # schema_rows, schema_columns = len(toml_string), len(toml_string[schema_key1]) + + return len(shared_items), shared_items + - return len(shared_items), data_rows, data_columns, schema_rows, schamea_columns +test = check_data_shape() +print(test) From f4a4a31a34ab50bbcf2c369cdf4a0c94349cc24c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 11:55:58 +0100 Subject: [PATCH 204/411] Added the Config_settings class that may have been missed from a git rebase. --- src/utils/helpers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/utils/helpers.py b/src/utils/helpers.py index e0d2216c0..2401af053 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -2,6 +2,7 @@ import toml import yaml +import os # Define paths user_config_path = "config/userconfig.toml" @@ -62,3 +63,18 @@ def period_select() -> tuple: period_dict = user_config_reader()["period"] return period_dict["start_period"], period_dict["end_period"] + + +class Config_settings: + """Get the config settings from the config file.""" + + def __init__(self): + self.config_file = "src/developer_config.yaml" + self.config_dict = self._get_config_settings() + + def _get_config_settings(self): + """Get the config settings from the config file.""" + with open(self.config_file, "r") as file: + config = yaml.safe_load(file) + + return config From 4856211f7e7bc72e56f0e56dac2ee56fafa869c9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 11:56:29 +0100 Subject: [PATCH 205/411] Cleaned up a few comments in loading.py. 
--- src/data_ingest/loading.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 937d61c44..3646f5347 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -28,7 +28,3 @@ snapdata, contributerdict, responsesdict = hdfs_load_json(file_path) - -# print(contributerdict) -# print("\n") -# print(responsesdict) From aebaddab96532a3eb61d35d74d49ab746f2a8303 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 11:58:08 +0100 Subject: [PATCH 206/411] Uncommented several lines in src/main.py after adding Config_settings class into src/utils/helpers.py --- src/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipeline.py b/src/pipeline.py index 20aae146c..7fa0b5edf 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -9,6 +9,7 @@ from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json from src.data_validation import validation + import time import logging From 9f5efdecf95b61195f7c74506b54add8da9ee293 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 14:03:27 +0100 Subject: [PATCH 207/411] Added validation.py import to src/main.py for testing. --- src/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipeline.py b/src/pipeline.py index 7fa0b5edf..885b233c3 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -9,6 +9,7 @@ from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json from src.data_validation import validation +from src.data_validation.validation import check_data_shape import time import logging From 28bc14619eadbcc0f85e7450979b49e8bc8c406f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 14:05:06 +0100 Subject: [PATCH 208/411] Changed import format to work when run by call from src/main.py. Also added lines to query if data dictionary value is of type dict/list. 
--- src/data_validation/validation.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 41854c0ec..40f99f9bb 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -121,11 +121,8 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os import toml -import sys -from loading import hdfs_load_json - -sys.path.insert(0, "./src/data_ingest/") +from src.data_ingest.loading import hdfs_load_json snapshot_path = ( "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" @@ -165,13 +162,23 @@ def check_data_shape( if k in contributerdict and toml_string[k] == contributerdict[k] } - # data_key1 = list(contributerdict.keys())[0] - # schema_key1 = list(toml_string.keys())[0] + data_key1 = list(contributerdict.keys())[0] + schema_key1 = list(toml_string.keys())[0] + + data_rows, data_columns = len(contributerdict), contributerdict[data_key1] + schema_rows, schema_columns = len(toml_string), len(toml_string[schema_key1]) + + # Check if data dictionary value is of a dict or list type + # If it isn't then set column number equal to 1, else length of value + if not type(data_columns) == dict or not type(data_columns) == list: + data_columns = 1 + else: + data_columns = len(data_columns) - # data_rows, data_columns = len(contributerdict), contributerdict[data_key1] - # schema_rows, schema_columns = len(toml_string), len(toml_string[schema_key1]) + outString = f"""Data has {data_rows} rows and {data_columns} columns. + It should have {schema_rows} rows and {schema_columns} columns.""" - return len(shared_items), shared_items + return len(shared_items), shared_items, outString test = check_data_shape() From 957e250b9cc12a18db95faafa78fda96e562761c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 28 Apr 2023 14:50:39 +0100 Subject: [PATCH 209/411] Added a boolean value that returns true if number of columns match in data compared to schema. --- src/data_validation/validation.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 40f99f9bb..46d41fbba 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -150,6 +150,7 @@ def check_data_shape( # Check if DataSchema.toml exists file_exists = os.path.exists(filePath) snapdata, contributerdict, responsesdict = hdfs_load_json(snapshot_path) + cols_match = False if not file_exists: return file_exists @@ -178,7 +179,12 @@ def check_data_shape( outString = f"""Data has {data_rows} rows and {data_columns} columns. 
It should have {schema_rows} rows and {schema_columns} columns.""" - return len(shared_items), shared_items, outString + if data_columns == schema_columns: + cols_match = True + else: + cols_match = False + + return cols_match, len(shared_items), shared_items, outString test = check_data_shape() From 91e91cba4464724a05e43ea3f4ea3d1d76a1cdd1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 14:26:55 +0100 Subject: [PATCH 210/411] Upgraded pandas to version 1.1.5 --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 3628a50a1..e247d571c 100644 --- a/environment.yml +++ b/environment.yml @@ -3,6 +3,8 @@ dependencies: - python=3 - coverage - pyyaml + - pandas==1.1.5 + - numpy - requests - sphinx - pip From cc29f33ef7d4e8e9ac1906cb2033df365a0db423 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 14:28:41 +0100 Subject: [PATCH 211/411] Data now loaded from DAP directly. Changed default no. of columns in check_data_shape from 5 to 93. --- src/data_validation/validation.py | 80 ++++++++++++++++--------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 46d41fbba..99107cb62 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -121,71 +121,73 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os import toml +import pandas as pd +import pydoop.hdfs as hdfs + +datafilepath = "/ons/rdbe_dev/Frozen_Group_Data2021_244_Headers.csv" + + +def read_data(excel_file) -> pd.DataFrame: + """Read an excel file and convert it into a + pandas dataframe, dropping any 'Unnamed:' columns. + + + Arguments: + excel_file -- the excel file to be converted + + Returns: + A pd.DataFrame: a pandas dataframe object. + """ + with hdfs.open(excel_file, "r") as file: -from src.data_ingest.loading import hdfs_load_json + # Import csv file and convert to Dataframe + sheet = pd.read_csv(file) -snapshot_path = ( - "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" -) + return sheet def check_data_shape( - dataFile: str = snapshot_path, + dataFile: str = datafilepath, filePath: str = "./config/DataSchema.toml", - numCols: int = 5, + numCols: int = 93, ) -> bool: """Compares the shape of the data and compares it to the shape of the toml file based off the data schema. Returns true if there is a match and false otherwise. 
Keyword Arguments: - dataFile -- Path to data file to compare (default: {snapshot_path}) + dataFile -- Path to data file to compare (default: {datafilepath}) filePath -- Path to schema dictionary file (default: {"./config/DataSchema.toml"}) - numCols -- Number of columns in data (default: {5}) + numCols -- Number of columns in data (default: {93}) Returns: - A bool: boolean, True is number of columns is as expected, otherwise False + A bool: boolean, True if number of columns is as expected, otherwise False """ # Check if DataSchema.toml exists file_exists = os.path.exists(filePath) - snapdata, contributerdict, responsesdict = hdfs_load_json(snapshot_path) + cols_match = False if not file_exists: return file_exists else: - toml_string = toml.load(filePath) - - shared_items = { - k: toml_string[k] - for k in toml_string - if k in contributerdict and toml_string[k] == contributerdict[k] - } - - data_key1 = list(contributerdict.keys())[0] - schema_key1 = list(toml_string.keys())[0] + # Read data file + data = read_data(dataFile) - data_rows, data_columns = len(contributerdict), contributerdict[data_key1] - schema_rows, schema_columns = len(toml_string), len(toml_string[schema_key1]) + # Convert it to dictionary + data_dict = data.to_dict() - # Check if data dictionary value is of a dict or list type - # If it isn't then set column number equal to 1, else length of value - if not type(data_columns) == dict or not type(data_columns) == list: - data_columns = 1 - else: - data_columns = len(data_columns) - - outString = f"""Data has {data_rows} rows and {data_columns} columns. - It should have {schema_rows} rows and {schema_columns} columns.""" - - if data_columns == schema_columns: - cols_match = True - else: - cols_match = False + # Load toml data schema into dictionary + toml_string = toml.load(filePath) - return cols_match, len(shared_items), shared_items, outString + # Create a 'shared key' dictionary + shared_items = {k: toml_string[k] for k in toml_string if k in data_dict} + # Compare number of 'columns' in data to data schema + if len(shared_items) == len(toml_string): + cols_match = True + else: + cols_match = False -test = check_data_shape() -print(test) + return cols_match From 405664aa794c4e13347108d100b01d9fe845659d Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 14:36:59 +0100 Subject: [PATCH 212/411] Created unit test for the validation.py check_data_shape function. --- tests/test_validation.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 000000000..aa976169a --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,22 @@ +"""Create a test suite for the validation module.""" + +import pytest + + +def test_add(): + """Test the add function.""" + # Arrange + from src.data_validation.validation import check_data_shape + + # Act: use pytest to assert the result + result_1 = check_data_shape() + + # Assert + assert isinstance(result_1, bool) + # Assert: Negative test. 
Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, bool) + # Assert: test that add fails when the arguments are wrong type + pytest.raises(TypeError, check_data_shape, 1, "2", 3) + pytest.raises(TypeError, check_data_shape, "1", 2, 3) + pytest.raises(TypeError, check_data_shape, "1", "2", "3") From 532a0a1062359d38d1bd44df7098d36ffdc8a88f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 14:54:25 +0100 Subject: [PATCH 213/411] Created standalone function to load data schema file into a dictionary. --- src/data_validation/validation.py | 42 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 99107cb62..a8d7e3127 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -146,6 +146,27 @@ def read_data(excel_file) -> pd.DataFrame: return sheet +def load_schema(filePath: str = "./config/DataSchema.toml"): + """Load the data schema from toml file into a dictionary + + Keyword Arguments: + filePath -- Path to data schema toml file + (default: {"./config/DataSchema.toml"}) + + Returns: + A dict: dictionary containing parsed schema toml file + """ + file_exists = os.path.exists(filePath) + + # Check if DataSchema.toml exists + if not file_exists: + return file_exists + else: + # Load toml data schema into dictionary + toml_string = toml.load(filePath) + return toml_string + + def check_data_shape( dataFile: str = datafilepath, filePath: str = "./config/DataSchema.toml", @@ -164,25 +185,20 @@ def check_data_shape( Returns: A bool: boolean, True if number of columns is as expected, otherwise False """ - # Check if DataSchema.toml exists - file_exists = os.path.exists(filePath) cols_match = False - if not file_exists: - return file_exists - else: - # Read data file - data = read_data(dataFile) + # Read data file + data = read_data(dataFile) - # Convert it to dictionary - data_dict = data.to_dict() + # Convert it to dictionary + data_dict = data.to_dict() - # Load toml data schema into dictionary - toml_string = toml.load(filePath) + # Load toml data schema into dictionary + toml_string = load_schema(filePath) - # Create a 'shared key' dictionary - shared_items = {k: toml_string[k] for k in toml_string if k in data_dict} + # Create a 'shared key' dictionary + shared_items = {k: toml_string[k] for k in toml_string if k in data_dict} # Compare number of 'columns' in data to data schema if len(shared_items) == len(toml_string): From b577c6f6198517be245068738d38c1cad66dfc22 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 3 May 2023 15:02:57 +0100 Subject: [PATCH 214/411] Added unit test for load_schema function in validation.py --- tests/test_validation.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index aa976169a..720035e6a 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -3,8 +3,8 @@ import pytest -def test_add(): - """Test the add function.""" +def test_check_data_shape(): + """Test the check_data_shape function.""" # Arrange from src.data_validation.validation import check_data_shape @@ -20,3 +20,21 @@ def test_add(): pytest.raises(TypeError, check_data_shape, 1, "2", 3) pytest.raises(TypeError, check_data_shape, "1", 2, 3) pytest.raises(TypeError, check_data_shape, "1", "2", "3") + + +def test_load_schema(): + """Test the load_schema function.""" + # 
Arrange + from src.data_validation.validation import load_schema + + # Act: use pytest to assert the result + result_1 = load_schema() + + # Assert + assert isinstance(result_1, dict) + # Assert: Negative test. Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, dict) + # Assert: test that add fails when the arguments are wrong type + pytest.raises(TypeError, load_schema, 2) + pytest.raises(TypeError, load_schema, True) From 7efbe6229fba71d933a53dee405714b3206a6676 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 14:20:31 +0100 Subject: [PATCH 215/411] Removed read_data() in validation.py. Using read_hdfs_csv() in hdfs_mods.py instead. Removed hdfs and pandas import as unused. --- src/data_validation/validation.py | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index a8d7e3127..f07015405 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -121,32 +121,13 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os import toml -import pandas as pd -import pydoop.hdfs as hdfs - -datafilepath = "/ons/rdbe_dev/Frozen_Group_Data2021_244_Headers.csv" - -def read_data(excel_file) -> pd.DataFrame: - """Read an excel file and convert it into a - pandas dataframe, dropping any 'Unnamed:' columns. +from src.utils.hdfs_mods import read_hdfs_csv as read_data - - Arguments: - excel_file -- the excel file to be converted - - Returns: - A pd.DataFrame: a pandas dataframe object. - """ - with hdfs.open(excel_file, "r") as file: - - # Import csv file and convert to Dataframe - sheet = pd.read_csv(file) - - return sheet +datafilepath = "/ons/rdbe_dev/Frozen_Group_Data2021_244_Headers.csv" -def load_schema(filePath: str = "./config/DataSchema.toml"): +def load_schema(filePath: str = "./config/DataSchema.toml") -> dict: """Load the data schema from toml file into a dictionary Keyword Arguments: @@ -207,3 +188,7 @@ def check_data_shape( cols_match = False return cols_match + + +test = check_data_shape() +print(test) From c6db5d17e5c4612893ce2d7309771de4f7122171 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 11 May 2023 15:36:24 +0100 Subject: [PATCH 216/411] Removed repeat of Config_settings class in helpers.py. --- src/utils/helpers.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 2401af053..e0d2216c0 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -2,7 +2,6 @@ import toml import yaml -import os # Define paths user_config_path = "config/userconfig.toml" @@ -63,18 +62,3 @@ def period_select() -> tuple: period_dict = user_config_reader()["period"] return period_dict["start_period"], period_dict["end_period"] - - -class Config_settings: - """Get the config settings from the config file.""" - - def __init__(self): - self.config_file = "src/developer_config.yaml" - self.config_dict = self._get_config_settings() - - def _get_config_settings(self): - """Get the config settings from the config file.""" - with open(self.config_file, "r") as file: - config = yaml.safe_load(file) - - return config From 3f55ed478b48a5eec9d0df92cb8b1ee20f2eab3f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 11 May 2023 15:39:29 +0100 Subject: [PATCH 217/411] Removed repeat of user_config_path in helpers.py. 
--- src/utils/helpers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/utils/helpers.py b/src/utils/helpers.py index e0d2216c0..5d543f12a 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -22,9 +22,6 @@ def _get_config_settings(self): return config -user_config_path = "config/userconfig.toml" - - def user_config_reader(configfile: str = user_config_path) -> dict: """Function to parse the userconfig.toml file From 16170b83852316003e7409fd85e5addece476895 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 11 May 2023 15:41:12 +0100 Subject: [PATCH 218/411] Automatic reformatting on developer_config.yaml changed indentation. --- src/developer_config.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 622774e34..dc6ee5939 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -4,19 +4,19 @@ global: table_config: "SingleLine" postcode_csv_check: False runlog_writer: - write_csv: True # Write the runlog to a CSV file - write_hdf5: False # Write the runlog to an HDF5 file - write_sql: False # Write the runlog to a SQL database - display: False # Display the runlog in the terminal - log_path: "data/log" + write_csv: True # Write the runlog to a CSV file + write_hdf5: False # Write the runlog to an HDF5 file + write_sql: False # Write the runlog to a SQL database + display: False # Display the runlog in the terminal + log_path: "data/log" paths: logs_foldername: "testing_pydoop" snapshot_path: "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" masterlist_path: "data/external/ONSPD_NOV_2022_UK.csv" csv_filenames: - main: "main_runlog.csv" - configs: "configs_runlog.csv" - logs: "logs_runlog.csv" + main: "main_runlog.csv" + configs: "configs_runlog.csv" + logs: "logs_runlog.csv" run_log_sql: - log_db: "test_runlog" - log_mode: "append" + log_db: "test_runlog" + log_mode: "append" From 7faa7e16a685bb4591fdb166eeef982c84fe51fb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 20:15:54 +0100 Subject: [PATCH 219/411] Got rid of Camel case in function arguments. Changed order of if statement in load_schema() for readability. 
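For reference, a minimal sketch of the intended call pattern after this rename (illustrative only: check_data_shape() reads the frozen group data from HDFS, so it needs access to that path, and the schema toml must exist at its default location):

    from src.data_validation.validation import check_data_shape, load_schema

    schema = load_schema()           # parsed schema dict, or False if the toml file is missing
    cols_match = check_data_shape()  # True when every column in the schema is present in the data
    print(cols_match)
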
--- src/data_validation/validation.py | 37 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index f07015405..c94b6d10e 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -127,41 +127,44 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): datafilepath = "/ons/rdbe_dev/Frozen_Group_Data2021_244_Headers.csv" -def load_schema(filePath: str = "./config/DataSchema.toml") -> dict: +def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: """Load the data schema from toml file into a dictionary Keyword Arguments: - filePath -- Path to data schema toml file - (default: {"./config/DataSchema.toml"}) + file_path -- Path to data schema toml file + (default: {"./config/Data_Schema.toml"}) Returns: A dict: dictionary containing parsed schema toml file """ - file_exists = os.path.exists(filePath) + # Create bool variable for checking if file exists + file_exists = os.path.exists(file_path) - # Check if DataSchema.toml exists - if not file_exists: - return file_exists + # Check if Data_Schema.toml exists + if file_exists: + # Load toml data schema into dictionary if toml file exists + toml_string = toml.load(file_path) else: - # Load toml data schema into dictionary - toml_string = toml.load(filePath) + # Return False if file does not exist + return file_exists + return toml_string def check_data_shape( - dataFile: str = datafilepath, - filePath: str = "./config/DataSchema.toml", - numCols: int = 93, + data_file: str = datafilepath, + schema_path: str = "./config/DataSchema.toml", + num_cols: int = 93, ) -> bool: """Compares the shape of the data and compares it to the shape of the toml file based off the data schema. Returns true if there is a match and false otherwise. Keyword Arguments: - dataFile -- Path to data file to compare (default: {datafilepath}) - filePath -- Path to schema dictionary file + data_file -- Path to data file to compare (default: {datafilepath}) + schema_path -- Path to schema dictionary file (default: {"./config/DataSchema.toml"}) - numCols -- Number of columns in data (default: {93}) + num_cols -- Number of columns in data (default: {93}) Returns: A bool: boolean, True if number of columns is as expected, otherwise False @@ -170,13 +173,13 @@ def check_data_shape( cols_match = False # Read data file - data = read_data(dataFile) + data = read_data(data_file) # Convert it to dictionary data_dict = data.to_dict() # Load toml data schema into dictionary - toml_string = load_schema(filePath) + toml_string = load_schema(schema_path) # Create a 'shared key' dictionary shared_items = {k: toml_string[k] for k in toml_string if k in data_dict} From 85c08020da548ea463a9783d87762f935b9066b9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 20:17:48 +0100 Subject: [PATCH 220/411] Modified lines to test functions, giving more meaningful variable names. 
--- src/data_validation/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c94b6d10e..50a16f632 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -193,5 +193,5 @@ def check_data_shape( return cols_match -test = check_data_shape() -print(test) +shape_matches = check_data_shape() +print(shape_matches) From e6e1ad991386b6a661a4fb38f13b85c0855431f7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 20:35:09 +0100 Subject: [PATCH 221/411] Added new data schema, Data_Schema.toml. --- config/contributors_schema.toml | 740 ++++++++++++++++---------------- 1 file changed, 370 insertions(+), 370 deletions(-) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml index fb92d1fe0..89379fd4d 100644 --- a/config/contributors_schema.toml +++ b/config/contributors_schema.toml @@ -1,469 +1,469 @@ [snapshot_id] -Description = "nan" -Deduced_Data_Type = "nan" -Nullable = "nan" -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = nan +Nullable = nan +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [reference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 11001603625 -Max_values = 19891309165 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 11001603625.0 +Max values = 19891309165.0 +Possible Categorical Values = nan [period] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = str Length = 6 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [202012] +Min values = nan +Max values = nan +Possible Categorical Values = 202012 [survey] -Description = "All values are 002" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["002"] +Description = All values are 002 +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = 002 [formid] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "int" +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int Length = 2 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [20, 21] +Min values = nan +Max values = nan +Possible Categorical Values = 20, 21 [status] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = Clear, Clear - overridden, Form saved, Clear - 
overridden SE, Form sent out, Check needed, Combined child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9) [statusencoded] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "int" +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int Length = 3 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] +Min values = nan +Max values = nan +Possible Categorical Values = 100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309 [receiptdate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [lockedby] -Description = "All empty strings" -Deduced_Data_Type = "pandas.NA" -Nullable = True -Current_Data_Type = ["None","str"] +Description = All empty strings +Deduced Data Type = ? +Nullable = 1.0 +Current Data Type = None/str Length = 0 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] +Min values = nan +Max values = nan +Possible Categorical Values = nan [lockeddate] -Description = "All None type" -Deduced_Data_Type = "pandas.NA" -Nullable = True -Current_Data_Type = ["None","str"] +Description = All None type +Deduced Data Type = ? +Nullable = 1.0 +Current Data Type = None/str Length = 0 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["nan"] +Min values = nan +Max values = nan +Possible Categorical Values = nan [formtype] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["0001", "0006"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = 0001, 0006 [checkletter] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str Length = 1 -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] +Min values = nan +Max values = nan +Possible Categorical Values = T, H, F, J, D, A, K, C, B, L, S [frozensicoutdated] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1120 -Max_values = 93059 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1120.0 +Max values = 93059.0 +Possible Categorical Values = nan [rusicoutdated] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1120 -Max_values = 93059 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1120.0 +Max values = 93059.0 +Possible Categorical Values = nan [frozensic] -Description = "nan" 
-Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1300 -Max_values = 96090 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1300.0 +Max values = 96090.0 +Possible Categorical Values = nan [rusic] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1300 -Max_values = 96090 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1300.0 +Max values = 96090.0 +Possible Categorical Values = nan [frozenemployees] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 272527 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 272527.0 +Possible Categorical Values = nan [employees] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 272528 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 272528.0 +Possible Categorical Values = nan [frozenemployment] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 272527 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 272527.0 +Possible Categorical Values = nan [employment] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 272528 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 272528.0 +Possible Categorical Values = nan [frozenfteemployment] -Description = "nan" -Deduced_Data_Type = "float" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0.0 -Max_values = 177699.0 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = float +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 177699.0 +Possible Categorical Values = nan [fteemployment] -Description = "nan" -Deduced_Data_Type = "float" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0.0 -Max_values = 177699.5 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = float +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 177699.5 +Possible Categorical Values = nan [frozenturnover] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 55277352 -Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 55277352.0 +Possible Categorical Values = nan [turnover] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 55277352 
-Possible_Categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 55277352.0 +Possible Categorical Values = nan [enterprisereference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 1001603625 -Max_values = 9891309165 -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 1001603625.0 +Max values = 9891309165.0 +Possible Categorical Values = nan [wowenterprisereference] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = 1001603625 -Max_values = 9891309165 -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = 1001603625.0 +Max values = 9891309165.0 +Possible Categorical Values = nan [cellnumber] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "int" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = [0] +Description = nan +Deduced Data Type = Category(int) +Nullable = 0.0 +Current Data Type = int +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = 0 [currency] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["S", "E"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = S, E [vatreference] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["VATREF"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = VATREF [payereference] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["PAYEREF"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = PAYEREF [companyregistrationnumber] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["CRN"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = CRN [numberlivelocalunits] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 6063 -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 6063.0 +Possible Categorical Values = nan [numberlivevat] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 255 -Possible_categorical_Values = ["nan"] 
+Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 255.0 +Possible Categorical Values = nan [numberlivepaye] -Description = "nan" -Deduced_Data_Type = "int" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 0 -Max_values = 24 -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 0.0 +Max values = 24.0 +Possible Categorical Values = nan [legalstatus] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = 1 -Max_values = 4 -Possible_Categorical_Values = [1, 2, 3, 4] +Description = nan +Deduced Data Type = int +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = 1.0 +Max values = 4.0 +Possible Categorical Values = 1, 2, 3, 4 [reportingunitmarker] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["L", "E"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = L, E [region] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA [birthdate] -Description = "Datetime format = format=%d/%m/%Y" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = Datetime format = format=%d/%m/%Y +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [referencename] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [referencepostcode] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [tradingstyle] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = nan +Deduced Data Type = str +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan [selectiontype] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" 
-Possible_Categorical_Values = ["L"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = L [inclusionexclusion] -Description = 'All values are " "' -Deduced_Data_Type = "pandas.NA" -Nullable = False -Current_Data_Type = "str" -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = All values are +Deduced Data Type = ? +Nullable = 0.0 +Current Data Type = str +Length = nanread_ +Min values = nan +Max values = nan +Possible Categorical Values = nan [createdby] -Description = "nan" -Deduced_Data_Type = "str" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["ingestion"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = ingestion [createddate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = "str" -Length = ">=1" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = str +Length >=1 +Min values = nan +Max values = nan +Possible Categorical Values = nan [lastupdatedby] -Description = "nan" -Deduced_Data_Type = "category" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] +Description = nan +Deduced Data Type = Category(str) +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = data_migration, Cheri, Adela, David [lastupdateddate] -Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" -Deduced_Data_Type = "Datetime" -Nullable = False -Current_Data_Type = ["None","str"] -Length = "nan" -Min_values = "nan" -Max_values = "nan" -Possible_categorical_Values = ["nan"] +Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 +Deduced Data Type = Datetime +Nullable = 0.0 +Current Data Type = None/str +Length = nan +Min values = nan +Max values = nan +Possible Categorical Values = nan From 9b38f9cf18846559b2b01198bb9360d0e8c7b4dc Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 21:46:50 +0100 Subject: [PATCH 222/411] Corrected TOML file keys and values to be compatable with TOML formatting. 
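In practice this means bare keys may not contain spaces and string values must be quoted, with multi-valued fields written as TOML arrays. For example, the corrected [currency] entry now reads:

    [currency]
    Deduced_Data_Type = "Category(str)"
    Possible_Categorical_Values = ["S", "E"]

Purely numeric values (e.g. Nullable = 0.0) stay unquoted.
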
--- config/contributors_schema.toml | 502 ++++++++++++++++---------------- 1 file changed, 251 insertions(+), 251 deletions(-) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml index 89379fd4d..e867fbf21 100644 --- a/config/contributors_schema.toml +++ b/config/contributors_schema.toml @@ -1,469 +1,469 @@ [snapshot_id] Description = nan -Deduced Data Type = nan +Deduced_Data_Type = nan Nullable = nan -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [reference] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 11001603625.0 -Max values = 19891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = 11001603625.0 +Max_values = 19891309165.0 +Possible_Categorical_Values = nan [period] Description = nan -Deduced Data Type = Category(int) +Deduced_Data_Type = "Category(int)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = 6 -Min values = nan -Max values = nan -Possible Categorical Values = 202012 +Min_values = nan +Max_values = nan +Possible_Categorical_Values = 202012 [survey] -Description = All values are 002 -Deduced Data Type = Category(str) +Description = "All values are 002" +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = 002 +Current_Data_Type = "str" +Length = ">=1" +Min_values = nan +Max_values = nan +Possible_Categorical_Values = "002" [formid] Description = nan -Deduced Data Type = Category(int) +Deduced_Data_Type = "Category(int)" Nullable = 0.0 -Current Data Type = int +Current_Data_Type = "int" Length = 2 -Min values = nan -Max values = nan -Possible Categorical Values = 20, 21 +Min_values = nan +Max_values = nan +Possible_Categorical_Values = [20, 21] [status] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = Clear, Clear - overridden, Form saved, Clear - overridden SE, Form sent out, Check needed, Combined child (NIL2), Out of scope (NIL3), Ceased trading (NIL4), Dormant (NIL5), Part year return (NIL8), No UK activity (NIL9) +Current_Data_Type = "str" +Length = ">=1" +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] [statusencoded] Description = nan -Deduced Data Type = Category(int) +Deduced_Data_Type = "Category(int)" Nullable = 0.0 -Current Data Type = int +Current_Data_Type = "int" Length = 3 -Min values = nan -Max values = nan -Possible Categorical Values = 100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309 +Min_values = nan +Max_values = nan +Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] [receiptdate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current Data Type = None/str +Current_Data_Type = "None/str" Length = 
nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [lockedby] -Description = All empty strings -Deduced Data Type = ? +Description = "All empty strings" +Deduced_Data_Type = "?" Nullable = 1.0 -Current Data Type = None/str +Current_Data_Type = "None/str" Length = 0 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [lockeddate] -Description = All None type -Deduced Data Type = ? +Description = "All None type" +Deduced_Data_Type = "?" Nullable = 1.0 -Current Data Type = None/str +Current_Data_Type = "None/str" Length = 0 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [formtype] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = 0001, 0006 +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = 1 -Min values = nan -Max values = nan -Possible Categorical Values = T, H, F, J, D, A, K, C, B, L, S +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] [frozensicoutdated] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1120.0 -Max values = 93059.0 -Possible Categorical Values = nan +Min_values = 1120.0 +Max_values = 93059.0 +Possible_Categorical_Values = nan [rusicoutdated] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1120.0 -Max values = 93059.0 -Possible Categorical Values = nan +Min_values = 1120.0 +Max_values = 93059.0 +Possible_Categorical_Values = nan [frozensic] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1300.0 -Max values = 96090.0 -Possible Categorical Values = nan +Min_values = 1300.0 +Max_values = 96090.0 +Possible_Categorical_Values = nan [rusic] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1300.0 -Max values = 96090.0 -Possible Categorical Values = nan +Min_values = 1300.0 +Max_values = 96090.0 +Possible_Categorical_Values = nan [frozenemployees] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 272527.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 272527.0 +Possible_Categorical_Values = nan [employees] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 272528.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 272528.0 +Possible_Categorical_Values = nan [frozenemployment] Description = nan 
-Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1.0 -Max values = 272527.0 -Possible Categorical Values = nan +Min_values = 1.0 +Max_values = 272527.0 +Possible_Categorical_Values = nan [employment] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1.0 -Max values = 272528.0 -Possible Categorical Values = nan +Min_values = 1.0 +Max_values = 272528.0 +Possible_Categorical_Values = nan [frozenfteemployment] Description = nan -Deduced Data Type = float +Deduced_Data_Type = "float" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 177699.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 177699.0 +Possible_Categorical_Values = nan [fteemployment] Description = nan -Deduced Data Type = float +Deduced_Data_Type = "float" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 177699.5 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 177699.5 +Possible_Categorical_Values = nan [frozenturnover] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 55277352.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 55277352.0 +Possible_Categorical_Values = nan [turnover] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 55277352.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 55277352.0 +Possible_Categorical_Values = nan [enterprisereference] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 1001603625.0 -Max values = 9891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = 1001603625.0 +Max_values = 9891309165.0 +Possible_Categorical_Values = nan [wowenterprisereference] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = 1001603625.0 -Max values = 9891309165.0 -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = 1001603625.0 +Max_values = 9891309165.0 +Possible_Categorical_Values = nan [cellnumber] Description = nan -Deduced Data Type = Category(int) +Deduced_Data_Type = "Category(int)" Nullable = 0.0 -Current Data Type = int +Current_Data_Type = "int" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = 0 +Min_values = nan +Max_values = nan +Possible_Categorical_Values = 0 [currency] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = S, E +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["S", "E"] [vatreference] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = VATREF +Min_values = nan 
+Max_values = nan +Possible_Categorical_Values = "VATREF" [payereference] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = PAYEREF +Min_values = nan +Max_values = nan +Possible_Categorical_Values = "PAYEREF" [companyregistrationnumber] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = CRN +Min_values = nan +Max_values = nan +Possible_Categorical_Values = "CRN" [numberlivelocalunits] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 6063.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 6063.0 +Possible_Categorical_Values = nan [numberlivevat] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 255.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 255.0 +Possible_Categorical_Values = nan [numberlivepaye] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 0.0 -Max values = 24.0 -Possible Categorical Values = nan +Min_values = 0.0 +Max_values = 24.0 +Possible_Categorical_Values = nan [legalstatus] Description = nan -Deduced Data Type = int +Deduced_Data_Type = "int" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = 1.0 -Max values = 4.0 -Possible Categorical Values = 1, 2, 3, 4 +Min_values = 1.0 +Max_values = 4.0 +Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = L, E +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["L", "E"] [region] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = WW, BB, FE, GG, JG, HH, ED, KJ, XX, AA, DC, GF, BA +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] [birthdate] -Description = Datetime format = format=%d/%m/%Y -Deduced Data Type = Datetime +Description = "Datetime_format = format=%d/%m/%Y" +Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [referencename] Description = nan -Deduced Data Type = str +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [referencepostcode] Description = nan -Deduced Data Type = 
str +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [tradingstyle] Description = nan -Deduced Data Type = str +Deduced_Data_Type = "str" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [selectiontype] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str +Current_Data_Type = "str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = L +Min_values = nan +Max_values = nan +Possible_Categorical_Values = "L" [inclusionexclusion] -Description = All values are -Deduced Data Type = ? +Description = "All values are \" \"" +Deduced_Data_Type = "?" Nullable = 0.0 -Current Data Type = str -Length = nanread_ -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = "nanread_" +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [createdby] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = ingestion +Current_Data_Type = "str" +Length = ">=1" +Min_values = nan +Max_values = nan +Possible_Categorical_Values = "ingestion" [createddate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current Data Type = str -Length >=1 -Min values = nan -Max values = nan -Possible Categorical Values = nan +Current_Data_Type = "str" +Length = ">=1" +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan [lastupdatedby] Description = nan -Deduced Data Type = Category(str) +Deduced_Data_Type = "Category(str)" Nullable = 0.0 -Current Data Type = None/str +Current_Data_Type = "None/str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = data_migration, Cheri, Adela, David +Min_values = nan +Max_values = nan +Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] -Description = Datetime format = %Y-%m-%d %H:%M:%S.%f+00 -Deduced Data Type = Datetime +Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current Data Type = None/str +Current_Data_Type = "None/str" Length = nan -Min values = nan -Max values = nan -Possible Categorical Values = nan +Min_values = nan +Max_values = nan +Possible_Categorical_Values = nan From 5bb48f717da31adb00a50cc36dff02508e8ac1e9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 21:49:51 +0100 Subject: [PATCH 223/411] Updated name of schema TOML file and added descriptive names for testing. 
--- src/data_validation/validation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 50a16f632..0a9c93ff0 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -153,7 +153,7 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: def check_data_shape( data_file: str = datafilepath, - schema_path: str = "./config/DataSchema.toml", + schema_path: str = "./config/Data_Schema.toml", num_cols: int = 93, ) -> bool: """Compares the shape of the data and compares it to the shape of the toml @@ -193,5 +193,7 @@ def check_data_shape( return cols_match -shape_matches = check_data_shape() -print(shape_matches) +schema_dict = load_schema() + +# Check if data and schema shapes match +shapes_match = check_data_shape() From 52448f018baa59c02fafb1ecf586059453e305f3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 18 May 2023 12:57:24 +0100 Subject: [PATCH 224/411] Removed num_cols in check_data_shape. Avoiding hard-coded solutions, and data has changed shape since starting the ticket. --- src/data_validation/validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 0a9c93ff0..50d82b0f1 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -154,7 +154,6 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: def check_data_shape( data_file: str = datafilepath, schema_path: str = "./config/Data_Schema.toml", - num_cols: int = 93, ) -> bool: """Compares the shape of the data and compares it to the shape of the toml file based off the data schema. Returns true if there is a match and false @@ -164,7 +163,6 @@ def check_data_shape( data_file -- Path to data file to compare (default: {datafilepath}) schema_path -- Path to schema dictionary file (default: {"./config/DataSchema.toml"}) - num_cols -- Number of columns in data (default: {93}) Returns: A bool: boolean, True if number of columns is as expected, otherwise False From 65fdc2ce84d92293f5d4215f0b9127d8044b8e74 Mon Sep 17 00:00:00 2001 From: Ilyas Ali Date: Thu, 18 May 2023 14:20:31 +0100 Subject: [PATCH 225/411] test --- src/data_ingest/loading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 3646f5347..357694652 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -28,3 +28,5 @@ snapdata, contributerdict, responsesdict = hdfs_load_json(file_path) + +# test commit From 2f2ee3530542f4a75c5c18b8ca7e3493f9d6ba44 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 19:23:52 +0100 Subject: [PATCH 226/411] Updated Data_Schema.toml to match that in the 118_data_schema branch. 
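For context, once this schema parses, load_schema() returns one table per expected column, and check_data_shape() compares those keys against the columns of the ingested data. Roughly (illustrative, and assuming the configured schema path points at this file):

    schema = load_schema()
    schema["reference"]["Deduced_Data_Type"]  # "int"
    len(schema)                               # number of columns the data is expected to contain
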
--- config/contributors_schema.toml | 396 ++++++++++++++++---------------- 1 file changed, 198 insertions(+), 198 deletions(-) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml index e867fbf21..ad243f119 100644 --- a/config/contributors_schema.toml +++ b/config/contributors_schema.toml @@ -1,469 +1,469 @@ [snapshot_id] -Description = nan -Deduced_Data_Type = nan -Nullable = nan +Description = "nan" +Deduced_Data_Type = "nan" +Nullable = "nan" Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [reference] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = 11001603625.0 Max_values = 19891309165.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [period] -Description = nan -Deduced_Data_Type = "Category(int)" +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" Length = 6 -Min_values = nan -Max_values = nan -Possible_Categorical_Values = 202012 +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [202012] [survey] Description = "All values are 002" -Deduced_Data_Type = "Category(str)" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "002" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["002"] [formid] -Description = nan -Deduced_Data_Type = "Category(int)" +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" Length = 2 -Min_values = nan -Max_values = nan +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = [20, 21] [status] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" -Min_values = nan -Max_values = nan +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Clear - overridden SE", "Form sent out", "Check needed", "Combined child (NIL2)", "Out of scope (NIL3)", "Ceased trading (NIL4)", "Dormant (NIL5)", "Part year return (NIL8)", "No UK activity (NIL9)"] [statusencoded] -Description = nan -Deduced_Data_Type = "Category(int)" +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" Length = 3 -Min_values = nan -Max_values = nan +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, 308, 309] [receiptdate] -Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current_Data_Type = "None/str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" Deduced_Data_Type = "?" Nullable = 1.0 -Current_Data_Type = "None/str" +Current_Data_Type = ["None","str"] Length = 0 -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" Deduced_Data_Type = "?" 
Nullable = 1.0 -Current_Data_Type = "None/str" +Current_Data_Type = ["None","str"] Length = 0 -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["nan"] [formtype] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["0001", "0006"] [checkletter] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = 1 -Min_values = nan -Max_values = nan +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", "S"] [frozensicoutdated] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1120.0 Max_values = 93059.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [rusicoutdated] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1120.0 Max_values = 93059.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [frozensic] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1300.0 Max_values = 96090.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [rusic] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1300.0 Max_values = 96090.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [frozenemployees] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 272527.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [employees] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 272528.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [frozenemployment] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1.0 Max_values = 272527.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [employment] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1.0 Max_values = 272528.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [frozenfteemployment] -Description = nan -Deduced_Data_Type = "float" +Description = "nan" +Deduced_Data_Type = "float" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 177699.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [fteemployment] -Description = nan -Deduced_Data_Type = "float" +Description = "nan" +Deduced_Data_Type = "float" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 177699.5 
-Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [frozenturnover] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 55277352.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [turnover] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 55277352.0 -Possible_Categorical_Values = nan +Possible_Categorical_Values = ["nan"] [enterprisereference] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = 1001603625.0 Max_values = 9891309165.0 -Possible_Categorical_Values = nan +Possible_categorical_Values = ["nan"] [wowenterprisereference] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" Min_values = 1001603625.0 Max_values = 9891309165.0 -Possible_Categorical_Values = nan +Possible_categorical_Values = ["nan"] [cellnumber] -Description = nan -Deduced_Data_Type = "Category(int)" +Description = "nan" +Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "int" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = 0 +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = [0] [currency] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["S", "E"] [vatreference] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "VATREF" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["VATREF"] [payereference] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "PAYEREF" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["PAYEREF"] -[companyregistrationnumber] -Description = nan -Deduced_Data_Type = "Category(str)" +[companyregi"str"ationnumber] +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "CRN" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["CRN"] [numberlivelocalunits] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 6063.0 -Possible_Categorical_Values = nan +Possible_categorical_Values = ["nan"] [numberlivevat] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 0.0 Max_values = 255.0 -Possible_Categorical_Values = nan +Possible_categorical_Values = ["nan"] [numberlivepaye] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = 
"nan" Min_values = 0.0 Max_values = 24.0 -Possible_Categorical_Values = nan +Possible_categorical_Values = ["nan"] [legalstatus] -Description = nan +Description = "nan" Deduced_Data_Type = "int" Nullable = 0.0 Current_Data_Type = "str" -Length = nan +Length = "nan" Min_values = 1.0 Max_values = 4.0 Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["L", "E"] [region] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", "XX", "AA", "DC", "GF", "BA"] [birthdate] -Description = "Datetime_format = format=%d/%m/%Y" +Description = "Datetime format = format=%d/%m/%Y" Deduced_Data_Type = "Datetime" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [referencename] -Description = nan +Description = "nan" Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [referencepostcode] -Description = nan +Description = "nan" Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [tradingstyle] -Description = nan +Description = "nan" Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [selectiontype] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "L" +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["L"] [inclusionexclusion] -Description = "All values are \" \"" +Description = 'All values are " "' Deduced_Data_Type = "?" 
Nullable = 0.0 Current_Data_Type = "str" -Length = "nanread_" -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [createdby] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" -Min_values = nan -Max_values = nan -Possible_Categorical_Values = "ingestion" +Min_values = "nan" +Max_values = "nan" +Possible_Categorical_Values = ["ingestion"] [createddate] -Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" Nullable = 0.0 Current_Data_Type = "str" Length = ">=1" -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] [lastupdatedby] -Description = nan -Deduced_Data_Type = "Category(str)" +Description = "nan" +Deduced_Data_Type = "str" Nullable = 0.0 -Current_Data_Type = "None/str" -Length = nan -Min_values = nan -Max_values = nan +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] -Description = "Datetime_format = %Y-%m-%d %H:%M:%S.%f+00" +Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" Nullable = 0.0 -Current_Data_Type = "None/str" -Length = nan -Min_values = nan -Max_values = nan -Possible_Categorical_Values = nan +Current_Data_Type = ["None","str"] +Length = "nan" +Min_values = "nan" +Max_values = "nan" +Possible_categorical_Values = ["nan"] From 3527c158ab6be2585a39295e595a3ff0ca37a4b9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 20:03:07 +0100 Subject: [PATCH 227/411] Edited loading.py to match develop branch. Previously it matched more closes with the hdfs_mods.py file, specifically the hdfs_load_json function part. --- src/data_ingest/loading.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py index 357694652..117c6c040 100644 --- a/src/data_ingest/loading.py +++ b/src/data_ingest/loading.py @@ -15,18 +15,14 @@ contributers = pd.DataFrame(contributerdict) responses = pd.DataFrame(responsesdict) - # Open the file in read mode inside Hadoop context - with hdfs.open(filepath, "r") as file: - # Import csv file and convert to Dataframe - datadict = json.load(file) - contributerdict = datadict["contributors"][0] - responsesdict = datadict["responses"][0] - - datadf = pd.DataFrame.from_dict(datadict, orient="index") - - return datadf, contributerdict, responsesdict - +contributerdict = snapdata["contributors"] +responsesdict = snapdata["responses"] -snapdata, contributerdict, responsesdict = hdfs_load_json(file_path) +contributers = pd.DataFrame(contributerdict) +responses = pd.DataFrame(responsesdict) -# test commit +print(contributers.head()) +print("\n") +print(responses.head()) +print("\n") +print([responses["questioncode"].unique()]) From e96d66aa4f24ce17fc50f31dc584760419b0d68f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 20:14:11 +0100 Subject: [PATCH 228/411] Edited datafilepath to point to correct snapshot data file. Edited check_data_shape() to compare new snapshot data from a json file. Removed unnecessary dictionary comparison and commented code for clarity. 
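The reshaped check reduces to a small comparison; a condensed sketch of the idea follows (variable names are illustrative, and hdfs_load_json is used as in this patch, returning the snapshot dictionary):

    import toml
    from src.utils.hdfs_mods import hdfs_load_json

    # Read the snapshot JSON once; its "contributors" key holds a list of
    # row dictionaries, one per contributor record.
    snapshot_path = (
        "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json"
    )
    snapdata = hdfs_load_json(snapshot_path)
    contributors = snapdata["contributors"]

    # The schema has one TOML table per expected column, so the shapes "match"
    # when a contributor record carries one key per schema entry.
    schema = toml.load("./config/Data_Schema.toml")
    cols_match = len(contributors[0]) == len(schema)
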
--- src/data_validation/validation.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 50d82b0f1..08c7e6d88 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -121,10 +121,11 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes import os import toml +from src.utils.hdfs_mods import hdfs_load_json as read_data -from src.utils.hdfs_mods import read_hdfs_csv as read_data - -datafilepath = "/ons/rdbe_dev/Frozen_Group_Data2021_244_Headers.csv" +datafilepath = ( + "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" +) def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: @@ -170,20 +171,18 @@ def check_data_shape( cols_match = False - # Read data file - data = read_data(data_file) + # Read data file from json file + snapdata = read_data(data_file) - # Convert it to dictionary - data_dict = data.to_dict() + # Specify which key in snapshot data dictionary to get correct data + # List, with each element containing a dictionary for each row of data + contributerdict = snapdata["contributors"] # Load toml data schema into dictionary toml_string = load_schema(schema_path) - # Create a 'shared key' dictionary - shared_items = {k: toml_string[k] for k in toml_string if k in data_dict} - - # Compare number of 'columns' in data to data schema - if len(shared_items) == len(toml_string): + # Compare length of data dictionary to the data schema + if len(contributerdict[0]) == len(toml_string): cols_match = True else: cols_match = False @@ -191,7 +190,5 @@ def check_data_shape( return cols_match -schema_dict = load_schema() - # Check if data and schema shapes match shapes_match = check_data_shape() From 9f59ecc5784391862505cfea5345ee340f7db7c8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 20:16:43 +0100 Subject: [PATCH 229/411] Edited test_validation.py to reflect the fact check_data_shape() no longer expects three arguments, but two. --- tests/test_validation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 720035e6a..d965486ba 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -17,9 +17,8 @@ def test_check_data_shape(): with pytest.raises(AssertionError): assert not isinstance(result_1, bool) # Assert: test that add fails when the arguments are wrong type - pytest.raises(TypeError, check_data_shape, 1, "2", 3) - pytest.raises(TypeError, check_data_shape, "1", 2, 3) - pytest.raises(TypeError, check_data_shape, "1", "2", "3") + pytest.raises(TypeError, check_data_shape, 1, "2") + pytest.raises(TypeError, check_data_shape, "1", 2) def test_load_schema(): From 228aefd68b6866844e5218a719f012a16de330ef Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 20:26:16 +0100 Subject: [PATCH 230/411] Removed test call to check_data_shape() function in src/data_validation/validation.py. 
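In other words (a sketch of the intended call site, not code from this patch): importing validation.py should have no side effects, and the shape check should only run when a pipeline step or a test calls it explicitly.

    # Hypothetical explicit invocation once the module-level call is removed.
    from src.data_validation import validation

    shapes_match = validation.check_data_shape()
    if not shapes_match:
        raise ValueError("snapshot data does not match the data schema")
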
--- src/data_validation/validation.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 08c7e6d88..3f935c649 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -188,7 +188,3 @@ def check_data_shape( cols_match = False return cols_match - - -# Check if data and schema shapes match -shapes_match = check_data_shape() From 8a9364c9f37e4a0027691332928a1157d66d36b6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 20:38:47 +0100 Subject: [PATCH 231/411] Edited validation.py to fetch snapshot data file path using the developer_config.yaml rather than hard coding it. --- src/data_validation/validation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 3f935c649..2536074b4 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -122,10 +122,13 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): import os import toml from src.utils.hdfs_mods import hdfs_load_json as read_data +from src.utils.helpers import Config_settings + -datafilepath = ( - "/ons/rdbe_dev/snapshot-202012-002-fba5c4ba-fb8c-4a62-87bb-66c725eea5fd.json" -) +conf_obj = Config_settings() +config = conf_obj.config_dict +config_paths = config["paths"] +snapshot_path = config_paths["snapshot_path"] # Taken from config file def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: @@ -153,7 +156,7 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: def check_data_shape( - data_file: str = datafilepath, + data_file: str = snapshot_path, schema_path: str = "./config/Data_Schema.toml", ) -> bool: """Compares the shape of the data and compares it to the shape of the toml From ddf91fb924b990db50058c74dbb561b348324fec Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 5 Jun 2023 17:27:41 +0100 Subject: [PATCH 232/411] Moved read_data() outside of check_data_shape() to correct pydoop error in pytest. --- src/data_validation/validation.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 2536074b4..f87ec5212 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -129,6 +129,7 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): config = conf_obj.config_dict config_paths = config["paths"] snapshot_path = config_paths["snapshot_path"] # Taken from config file +snapdata = read_data(snapshot_path) # Read data file from json file def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: @@ -156,7 +157,6 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: def check_data_shape( - data_file: str = snapshot_path, schema_path: str = "./config/Data_Schema.toml", ) -> bool: """Compares the shape of the data and compares it to the shape of the toml @@ -164,7 +164,6 @@ def check_data_shape( otherwise. 
Keyword Arguments: - data_file -- Path to data file to compare (default: {datafilepath}) schema_path -- Path to schema dictionary file (default: {"./config/DataSchema.toml"}) @@ -174,9 +173,6 @@ def check_data_shape( cols_match = False - # Read data file from json file - snapdata = read_data(data_file) - # Specify which key in snapshot data dictionary to get correct data # List, with each element containing a dictionary for each row of data contributerdict = snapdata["contributors"] From d6827cbe61a196306cbdc635d67ca8278989771e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 5 Jun 2023 17:41:51 +0100 Subject: [PATCH 233/411] Updated Data_Schema.toml to most recent 118 version. --- config/contributors_schema.toml | 188 ++++++++++++++++---------------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml index ad243f119..fb92d1fe0 100644 --- a/config/contributors_schema.toml +++ b/config/contributors_schema.toml @@ -11,17 +11,17 @@ Possible_Categorical_Values = ["nan"] [reference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 11001603625.0 -Max_values = 19891309165.0 +Min_values = 11001603625 +Max_values = 19891309165 Possible_Categorical_Values = ["nan"] [period] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = 6 Min_values = "nan" @@ -31,7 +31,7 @@ Possible_Categorical_Values = [202012] [survey] Description = "All values are 002" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -40,8 +40,8 @@ Possible_Categorical_Values = ["002"] [formid] Description = "nan" -Deduced_Data_Type = "int" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "int" Length = 2 Min_values = "nan" @@ -50,8 +50,8 @@ Possible_Categorical_Values = [20, 21] [status] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -60,8 +60,8 @@ Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Cle [statusencoded] Description = "nan" -Deduced_Data_Type = "int" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "int" Length = 3 Min_values = "nan" @@ -71,7 +71,7 @@ Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, [receiptdate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -80,8 +80,8 @@ Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" -Deduced_Data_Type = "?" -Nullable = 1.0 +Deduced_Data_Type = "pandas.NA" +Nullable = True Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -90,8 +90,8 @@ Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" -Deduced_Data_Type = "?" 
-Nullable = 1.0 +Deduced_Data_Type = "pandas.NA" +Nullable = True Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -100,8 +100,8 @@ Possible_Categorical_Values = ["nan"] [formtype] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -110,8 +110,8 @@ Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = 1 Min_values = "nan" @@ -121,87 +121,87 @@ Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", [frozensicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1120.0 -Max_values = 93059.0 +Min_values = 1120 +Max_values = 93059 Possible_Categorical_Values = ["nan"] [rusicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1120.0 -Max_values = 93059.0 +Min_values = 1120 +Max_values = 93059 Possible_Categorical_Values = ["nan"] [frozensic] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1300.0 -Max_values = 96090.0 +Min_values = 1300 +Max_values = 96090 Possible_Categorical_Values = ["nan"] [rusic] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1300.0 -Max_values = 96090.0 +Min_values = 1300 +Max_values = 96090 Possible_Categorical_Values = ["nan"] [frozenemployees] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 272527.0 +Min_values = 0 +Max_values = 272527 Possible_Categorical_Values = ["nan"] [employees] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 272528.0 +Min_values = 0 +Max_values = 272528 Possible_Categorical_Values = ["nan"] [frozenemployment] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 272527.0 +Min_values = 1 +Max_values = 272527 Possible_Categorical_Values = ["nan"] [employment] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 272528.0 +Min_values = 1 +Max_values = 272528 Possible_Categorical_Values = ["nan"] [frozenfteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = 0.0 @@ -211,7 +211,7 @@ Possible_Categorical_Values = ["nan"] [fteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = 0.0 @@ -221,47 +221,47 @@ Possible_Categorical_Values = ["nan"] [frozenturnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 55277352.0 +Min_values = 0 +Max_values = 55277352 Possible_Categorical_Values = ["nan"] [turnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False 
Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 55277352.0 +Min_values = 0 +Max_values = 55277352 Possible_Categorical_Values = ["nan"] [enterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 1001603625.0 -Max_values = 9891309165.0 +Min_values = 1001603625 +Max_values = 9891309165 Possible_categorical_Values = ["nan"] [wowenterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" -Min_values = 1001603625.0 -Max_values = 9891309165.0 +Min_values = 1001603625 +Max_values = 9891309165 Possible_categorical_Values = ["nan"] [cellnumber] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "int" Length = "nan" Min_values = "nan" @@ -270,8 +270,8 @@ Possible_Categorical_Values = [0] [currency] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -281,7 +281,7 @@ Possible_Categorical_Values = ["S", "E"] [vatreference] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -291,17 +291,17 @@ Possible_Categorical_Values = ["VATREF"] [payereference] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" Max_values = "nan" Possible_Categorical_Values = ["PAYEREF"] -[companyregi"str"ationnumber] +[companyregistrationnumber] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -311,47 +311,47 @@ Possible_Categorical_Values = ["CRN"] [numberlivelocalunits] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 6063.0 +Min_values = 0 +Max_values = 6063 Possible_categorical_Values = ["nan"] [numberlivevat] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 255.0 +Min_values = 0 +Max_values = 255 Possible_categorical_Values = ["nan"] [numberlivepaye] Description = "nan" Deduced_Data_Type = "int" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 0.0 -Max_values = 24.0 +Min_values = 0 +Max_values = 24 Possible_categorical_Values = ["nan"] [legalstatus] Description = "nan" -Deduced_Data_Type = "int" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = "nan" -Min_values = 1.0 -Max_values = 4.0 +Min_values = 1 +Max_values = 4 Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -360,8 +360,8 @@ Possible_Categorical_Values = ["L", "E"] [region] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -371,7 +371,7 @@ Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", " [birthdate] Description = "Datetime format = format=%d/%m/%Y" Deduced_Data_Type = "Datetime" -Nullable = 
0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -381,7 +381,7 @@ Possible_categorical_Values = ["nan"] [referencename] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -391,7 +391,7 @@ Possible_categorical_Values = ["nan"] [referencepostcode] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -401,7 +401,7 @@ Possible_categorical_Values = ["nan"] [tradingstyle] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -411,7 +411,7 @@ Possible_categorical_Values = ["nan"] [selectiontype] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -420,8 +420,8 @@ Possible_Categorical_Values = ["L"] [inclusionexclusion] Description = 'All values are " "' -Deduced_Data_Type = "?" -Nullable = 0.0 +Deduced_Data_Type = "pandas.NA" +Nullable = False Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -431,7 +431,7 @@ Possible_categorical_Values = ["nan"] [createdby] Description = "nan" Deduced_Data_Type = "str" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -441,7 +441,7 @@ Possible_Categorical_Values = ["ingestion"] [createddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -450,8 +450,8 @@ Possible_categorical_Values = ["nan"] [lastupdatedby] Description = "nan" -Deduced_Data_Type = "str" -Nullable = 0.0 +Deduced_Data_Type = "category" +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -461,7 +461,7 @@ Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = 0.0 +Nullable = False Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" From ad83eda10593a7d3ecb58a0e4c114834c60a4d75 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 5 Jun 2023 17:42:17 +0100 Subject: [PATCH 234/411] Deleted old DataSchema.toml. 
--- config/DataSchema.toml | 650 ----------------------------------------- 1 file changed, 650 deletions(-) delete mode 100644 config/DataSchema.toml diff --git a/config/DataSchema.toml b/config/DataSchema.toml deleted file mode 100644 index aa15a8a78..000000000 --- a/config/DataSchema.toml +++ /dev/null @@ -1,650 +0,0 @@ -[cell_id] -description = "Cell ID" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[civ_or_def] -description = "Business type: Civil or Defence" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[current_sic] -description = "Sic - Standard Industry Classification" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[data_source] -description = "Constructed" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[emp_other] -description = "emp_other (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_researcher] -description = "emp_researcher (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_technician] -description = "emp_technician (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[emp_total] -description = "emp_total (Full Time Equivalent)" -data_type = "Numeric float (or decimal)" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[employee_count] -description = "Employee Count (IDBR)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[foreign_owner] -description = "Foreign Owner" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[form_status] -description = "Status" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[form_type] -description = "Form Type" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[freeze_id] -description = "Freeze ID - bespoke to openroad" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[headcount_oth_f] -description = "Other Female (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_oth_m] -description = "Other Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_res_f] -description = "Researchers Females (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_res_m] -description = "Researchers Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_tec_f] -description = "Technicians Female (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_tec_m] -description = "Technicians Male (Headcount)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[headcount_total] 
-description = "Total Headcount" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[period] -description = "Openroad Specific" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[period_contributor_id] -description = "Openroad Specific" -data_type = "Categorical" -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[period_year] -description = "Period" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[product_group] -description = "Published Product Group" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[ru_ref] -description = "Reference" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[sizeband] -description = "SizeBand" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[wowentref] -description = "Wowentref" -data_type = "Categorical" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q202] -description = "Salaries & Wages" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q203] -description = "Other current expenditure" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q204] -description = "Total Current Expenditure" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q205] -description = "Basic Research" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q206] -description = "Applied Research" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q207] -description = "Experimental Development" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q208] -description = "Land & Build CapEx " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q209] -description = "Equipment & Machinery CapEx" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q210] -description = "Total Capex." 
-data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q211] -description = "Total Inhouse Expenditure " -data_type = "Numeric Integer" -nullable = "No" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q212] -description = "Own Funds" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q213] -description = "Funding - Commission of the EU" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q214] -description = "Funding - UK government" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q215] -description = "Funding - Organisations outside the Uk " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q216] -description = "Funding - Other UK Private Bus/Public Orgs " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q217] -description = "Funding - Any Other UK " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q218] -description = "Total Funding " -data_type = "Numeric Integer" -nullable = "No" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q219] -description = "Land Acquired for R&D (Split of Land & Build CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q220] -description = "Buildings acquired/constructed for R&D (Split of Land & Build CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q221] -description = "Expenditure on computer software only (of which from Equipment & Machinery CapEx)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q222] -description = "Purchase of Materials (Split of Other current)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q223] -description = "Purchase of Services (Split of Other current)" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q224] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q225] -description = "Ownership - Own Business" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q226] -description = "Ownership - UK Government" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q227] -description = "Ownership - Other UK Priv Bus" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q228] -description = "Ownership - Other UK Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q229] -description = "Ownership - Bus Enterprises in Group Outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q230] -description = "Ownership - Other Bus Enterprises outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q231] -description = "Ownership - Other Governments outside UK" 
-data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q232] -description = "Ownership - Higher Education Establishments outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q233] -description = "Ownership - Non-profit Orgs outside the UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q234] -description = "Ownership - Commission of EU" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q235] -description = "Ownership - International Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q236] -description = "Ownership - Any other Orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q237] -description = "Ownership - not owned freely available" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q238] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q239] -description = "Life Length - Basic Research" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q240] -description = "Life Length - Applied Research" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q241] -description = "Life Length - Experimental Res" -data_type = "Numeric Integer" -nullable = "Not Asked" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q242] -description = "Funding - Any other UK organisations" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q243] -description = "Funding - Business Enterprises in group outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q244] -description = "Funding - Other Business Enterprises outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q245] -description = "Funding - Other Governments outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q246] -description = "Funding - Higher Education Est Outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q247] -description = "Funding - Non-profit Orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q248] -description = "Funding - International Orgs" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q249] -description = "Funding - Any other orgs outside UK" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q250] -description = "Funding - UK Higher Education Establishments" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q251] -description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" 
-max_acceptable_value = "nan" - -[q252] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q253] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q254] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q255] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q256] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q257] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q258] -description = nan -data_type = nan -nullable = nan -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q302] -description = "Purchased/funded R&D in the UK (Yes or No)" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q303] -description = "Purchased Outside UK (Govt Funded) " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q304] -description = "Purchased Outside UK (Other) " -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q305] -description = "Total Purchased" -data_type = "Numeric Integer" -nullable = "Yes" -min_acceptable_value = 0 -max_acceptable_value = 1000000 - -[q307] -description = "Tax credit claim submitted or intended for purchased work commissioned in UK (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q308] -description = "Tax credit claim submitted/intended for purchased work outside Ukorig. Funded by UK gov (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q309] -description = "Tax credit claim submitted or intended for all other purchased work outside the UK (Y/N) LONG FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q713] -description = "Tax credit claim submitted or intended for In-house expenditure (Y/N) SHORT FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" - -[q714] -description = "Tax credit claim submitted or intended for purchased R&D (Y/N) SHORT FORM" -data_type = "Boolean (True or False, 0 or 1)" -nullable = "No" -min_acceptable_value = "nan" -max_acceptable_value = "nan" From a575db511d52183c13a9f5b11ee64c5eccab2c41 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 5 Jun 2023 18:07:38 +0100 Subject: [PATCH 235/411] Updated environment.yml to match develop. --- environment.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/environment.yml b/environment.yml index e247d571c..3628a50a1 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,6 @@ dependencies: - python=3 - coverage - pyyaml - - pandas==1.1.5 - - numpy - requests - sphinx - pip From 79f03ed1307e0491af8d81f2904bed6ba948d273 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 13:14:39 +0100 Subject: [PATCH 236/411] Removed unused import. 
--- src/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index 885b233c3..7fa0b5edf 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -9,7 +9,6 @@ from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json from src.data_validation import validation -from src.data_validation.validation import check_data_shape import time import logging From 7eec9f92abd69645104a92e426d2e743b8fb11a5 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 13:15:30 +0100 Subject: [PATCH 237/411] Rearranged imports and congif settings after rebase. --- src/data_validation/validation.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index f87ec5212..c230c2161 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,3 +1,5 @@ +import os +import toml import postcodes_uk import pandas as pd from src.utils.wrappers import time_logger_wrap, exception_wrap @@ -102,9 +104,9 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: return True + def check_pcs_real(df: pd.DataFrame, masterlist_path: str): - """Checks if the postcodes are real against a masterlist of actual postcodes - """ + """Checks if the postcodes are real against a masterlist of actual postcodes""" if config["global"]["postcode_csv_check"]: master_series = get_masterlist(masterlist_path) @@ -117,19 +119,8 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): unreal_postcodes = emptydf.loc[ ~emptydf["referencepostcode"], "referencepostcode" ] - - return unreal_postcodes -import os -import toml -from src.utils.hdfs_mods import hdfs_load_json as read_data -from src.utils.helpers import Config_settings - -conf_obj = Config_settings() -config = conf_obj.config_dict -config_paths = config["paths"] -snapshot_path = config_paths["snapshot_path"] # Taken from config file -snapdata = read_data(snapshot_path) # Read data file from json file + return unreal_postcodes def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: From c2f722f8229dd81a3bac65dd93b84f14057658bd Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 14:40:13 +0100 Subject: [PATCH 238/411] Resolved duplicates and ordering in requirements.txt. --- requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3598ecc37..29638e9ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,19 @@ -# python==3.6 arrow cookiecutter +coverage==4.5.4 detect-secrets myst-parser -pre-commit==2.17.0 -python-dotenv -table_logger -pandas==1.1.5 numpy +pandas==1.1.5 +pip +postcodes_uk +pre-commit==2.17.0 pydoop -setuptools pytest -coverage +python-dotenv pyyaml requests +setuptools sphinx postcodes_uk # remove this later - use regex typing From a39a74f04a10174c1e810f3c604d96f0d37eef12 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 20:53:20 +0100 Subject: [PATCH 239/411] Fixed all boolean values in TOML file. 
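For reference (an illustrative snippet, not part of the diff): TOML only accepts lowercase booleans, so entries such as "Nullable = False" do not parse, while the lowercase form loads as a Python bool.

    import toml

    # Lowercase TOML booleans parse cleanly and come back as Python bools;
    # the Python-style "False"/"True" spellings are invalid TOML and are what
    # this patch replaces throughout the schema.
    parsed = toml.loads('Nullable = false\nDeduced_Data_Type = "int"')
    assert parsed["Nullable"] is False
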
--- config/contributors_schema.toml | 92 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/config/contributors_schema.toml b/config/contributors_schema.toml index fb92d1fe0..4cb994645 100644 --- a/config/contributors_schema.toml +++ b/config/contributors_schema.toml @@ -11,7 +11,7 @@ Possible_Categorical_Values = ["nan"] [reference] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = 11001603625 @@ -21,7 +21,7 @@ Possible_Categorical_Values = ["nan"] [period] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = 6 Min_values = "nan" @@ -31,7 +31,7 @@ Possible_Categorical_Values = [202012] [survey] Description = "All values are 002" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -41,7 +41,7 @@ Possible_Categorical_Values = ["002"] [formid] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "int" Length = 2 Min_values = "nan" @@ -51,7 +51,7 @@ Possible_Categorical_Values = [20, 21] [status] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -61,7 +61,7 @@ Possible_Categorical_Values = ["Clear", "Clear - overridden", "Form saved", "Cle [statusencoded] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "int" Length = 3 Min_values = "nan" @@ -71,7 +71,7 @@ Possible_Categorical_Values = [100, 200, 201, 210, 211, 301, 302, 303, 304, 305, [receiptdate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = False +Nullable = false Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -81,7 +81,7 @@ Possible_Categorical_Values = ["nan"] [lockedby] Description = "All empty strings" Deduced_Data_Type = "pandas.NA" -Nullable = True +Nullable = true Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -91,7 +91,7 @@ Possible_Categorical_Values = ["nan"] [lockeddate] Description = "All None type" Deduced_Data_Type = "pandas.NA" -Nullable = True +Nullable = true Current_Data_Type = ["None","str"] Length = 0 Min_values = "nan" @@ -101,7 +101,7 @@ Possible_Categorical_Values = ["nan"] [formtype] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -111,7 +111,7 @@ Possible_Categorical_Values = ["0001", "0006"] [checkletter] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = 1 Min_values = "nan" @@ -121,7 +121,7 @@ Possible_Categorical_Values = ["T", "H", "F", "J", "D", "A", "K", "C", "B", "L", [frozensicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1120 @@ -131,7 +131,7 @@ Possible_Categorical_Values = ["nan"] [rusicoutdated] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1120 @@ -141,7 +141,7 @@ Possible_Categorical_Values = ["nan"] [frozensic] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1300 @@ -151,7 +151,7 @@ 
Possible_Categorical_Values = ["nan"] [rusic] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1300 @@ -161,7 +161,7 @@ Possible_Categorical_Values = ["nan"] [frozenemployees] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -171,7 +171,7 @@ Possible_Categorical_Values = ["nan"] [employees] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -181,7 +181,7 @@ Possible_Categorical_Values = ["nan"] [frozenemployment] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1 @@ -191,7 +191,7 @@ Possible_Categorical_Values = ["nan"] [employment] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1 @@ -201,7 +201,7 @@ Possible_Categorical_Values = ["nan"] [frozenfteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0.0 @@ -211,7 +211,7 @@ Possible_Categorical_Values = ["nan"] [fteemployment] Description = "nan" Deduced_Data_Type = "float" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0.0 @@ -221,7 +221,7 @@ Possible_Categorical_Values = ["nan"] [frozenturnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -231,7 +231,7 @@ Possible_Categorical_Values = ["nan"] [turnover] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -241,7 +241,7 @@ Possible_Categorical_Values = ["nan"] [enterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = 1001603625 @@ -251,7 +251,7 @@ Possible_categorical_Values = ["nan"] [wowenterprisereference] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = 1001603625 @@ -261,7 +261,7 @@ Possible_categorical_Values = ["nan"] [cellnumber] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "int" Length = "nan" Min_values = "nan" @@ -271,7 +271,7 @@ Possible_Categorical_Values = [0] [currency] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -281,7 +281,7 @@ Possible_Categorical_Values = ["S", "E"] [vatreference] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -291,7 +291,7 @@ Possible_Categorical_Values = ["VATREF"] [payereference] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -301,7 +301,7 @@ Possible_Categorical_Values = ["PAYEREF"] [companyregistrationnumber] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -311,7 +311,7 @@ Possible_Categorical_Values = ["CRN"] [numberlivelocalunits] Description = "nan" 
Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -321,7 +321,7 @@ Possible_categorical_Values = ["nan"] [numberlivevat] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -331,7 +331,7 @@ Possible_categorical_Values = ["nan"] [numberlivepaye] Description = "nan" Deduced_Data_Type = "int" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 0 @@ -341,7 +341,7 @@ Possible_categorical_Values = ["nan"] [legalstatus] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = 1 @@ -351,7 +351,7 @@ Possible_Categorical_Values = [1, 2, 3, 4] [reportingunitmarker] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -361,7 +361,7 @@ Possible_Categorical_Values = ["L", "E"] [region] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -371,7 +371,7 @@ Possible_Categorical_Values = ["WW", "BB", "FE", "GG", "JG", "HH", "ED", "KJ", " [birthdate] Description = "Datetime format = format=%d/%m/%Y" Deduced_Data_Type = "Datetime" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -381,7 +381,7 @@ Possible_categorical_Values = ["nan"] [referencename] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -391,7 +391,7 @@ Possible_categorical_Values = ["nan"] [referencepostcode] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -401,7 +401,7 @@ Possible_categorical_Values = ["nan"] [tradingstyle] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -411,7 +411,7 @@ Possible_categorical_Values = ["nan"] [selectiontype] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -421,7 +421,7 @@ Possible_Categorical_Values = ["L"] [inclusionexclusion] Description = 'All values are " "' Deduced_Data_Type = "pandas.NA" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = "nan" Min_values = "nan" @@ -431,7 +431,7 @@ Possible_categorical_Values = ["nan"] [createdby] Description = "nan" Deduced_Data_Type = "str" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -441,7 +441,7 @@ Possible_Categorical_Values = ["ingestion"] [createddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = False +Nullable = false Current_Data_Type = "str" Length = ">=1" Min_values = "nan" @@ -451,7 +451,7 @@ Possible_categorical_Values = ["nan"] [lastupdatedby] Description = "nan" Deduced_Data_Type = "category" -Nullable = False +Nullable = false Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" @@ -461,7 +461,7 @@ Possible_Categorical_Values = ["data_migration", "Cheri", "Adela", "David"] [lastupdateddate] Description = "Datetime format = %Y-%m-%d %H:%M:%S.%f+00" Deduced_Data_Type = "Datetime" -Nullable = False +Nullable = false 
Current_Data_Type = ["None","str"] Length = "nan" Min_values = "nan" From ff590e2a0f9d75208961206f17aa9c4e211f8163 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 20:54:38 +0100 Subject: [PATCH 240/411] Added check_data_shape() to src/main.py. --- src/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index 7fa0b5edf..6189c9035 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -52,7 +52,8 @@ def run_pipeline(start): MainLogger.info("Finished Data Transmutation...") # Data validation - MainLogger.info("Starting Data Validation...") + validation.check_data_shape(contributors_df) + # Check the postcode column masterlist_path = config["paths"]["masterlist_path"] validation.validate_post_col(contributors_df, masterlist_path) From d8e4cc840e6a7a8fb0b83d9775b9367aa2a308b3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 20:55:53 +0100 Subject: [PATCH 241/411] Added logging wrappers to check_data_shape() function. Modified rest of file to work when run from main.py. --- src/data_validation/validation.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c230c2161..194af6cd5 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,5 +1,6 @@ import os import toml +import logging import postcodes_uk import pandas as pd from src.utils.wrappers import time_logger_wrap, exception_wrap @@ -11,8 +12,12 @@ # Get the config conf_obj = Config_settings() config = conf_obj.config_dict +global_config = config["global"] +config_paths = config["paths"] +snapshot_path = config_paths["snapshot_path"] # Taken from config file -ValidationLogger = logging.getLogger(__name__) +# Set up logging +validationlogger = logging.getLogger(__name__) def validate_postcode_pattern(pcode: str) -> bool: @@ -74,7 +79,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Log the unreal postcodes if not unreal_postcodes.empty: - ValidationLogger.warning( + validationlogger.warning( f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}" # noqa ) @@ -85,7 +90,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: # Log the invalid postcodes if not invalid_pattern_postcodes.empty: - ValidationLogger.warning( + validationlogger.warning( f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}" ) @@ -147,7 +152,10 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: return toml_string +@time_logger_wrap +@exception_wrap def check_data_shape( + data_df: pd.DataFrame, schema_path: str = "./config/Data_Schema.toml", ) -> bool: """Compares the shape of the data and compares it to the shape of the toml @@ -164,17 +172,17 @@ def check_data_shape( cols_match = False - # Specify which key in snapshot data dictionary to get correct data - # List, with each element containing a dictionary for each row of data - contributerdict = snapdata["contributors"] + data_dict = data_df.to_dict() # Load toml data schema into dictionary toml_string = load_schema(schema_path) # Compare length of data dictionary to the data schema - if len(contributerdict[0]) == len(toml_string): + if len(data_dict) == len(toml_string): cols_match = True else: cols_match = False + validationlogger.info(f"Data columns match schema: {cols_match}.") + return cols_match From a140e30230570b3c8371d756f996c6ef84511e60 Mon Sep 
17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:12:38 +0100 Subject: [PATCH 242/411] Edited logger from info to warning if number of columns in data don't match the schema. --- src/data_validation/validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 194af6cd5..c0b8b62bd 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -183,6 +183,9 @@ def check_data_shape( else: cols_match = False - validationlogger.info(f"Data columns match schema: {cols_match}.") + validationlogger.warning(f"Data columns match schema: {cols_match}.") + validationlogger.info( + f"Length of data: {len(data_dict)}. Length of schema: {len(toml_string)}" + ) return cols_match From f5903ef4f8cbf512e7d122a53b23fdce62c2ab5e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:13:22 +0100 Subject: [PATCH 243/411] Edited check_data_shape line to check full_responses instead of just contributors dataframe. --- src/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index 6189c9035..f96056337 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -52,7 +52,7 @@ def run_pipeline(start): MainLogger.info("Finished Data Transmutation...") # Data validation - validation.check_data_shape(contributors_df) + validation.check_data_shape(full_responses) # Check the postcode column masterlist_path = config["paths"]["masterlist_path"] From 3b288907f132fd5272c3f3b226e58cd6ce656130 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:25:41 +0100 Subject: [PATCH 244/411] Added test cases for load_schema and check_data_shape from tests/test_validation.py to tests/data_validation/test_validation.py. --- tests/test_data_validation/test_validation.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 2151300f4..3a37c4dd8 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -160,3 +160,42 @@ def test_check_pcs_real_with_valid_postcodes(test_data, monkeypatch): assert ( bool(unreal_postcodes.isin(["NP10 8XG", "SW1P 4DF"]).any()) is False ) # Assert that the real postcodes are not in the unreal postcodes + + +def test_check_data_shape(): + """Test the check_data_shape function.""" + # Arrange + from src.data_validation.validation import check_data_shape + + # Dataframe for test function to use + dummy_df = {"col1": [1, 2], "col2": [3, 4]} + + # Act: use pytest to assert the result + result_1 = check_data_shape(dummy_df) + + # Assert + assert isinstance(result_1, bool) + # Assert: Negative test. Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, bool) + # Assert: test that add fails when the arguments are wrong type + pytest.raises(TypeError, check_data_shape, 1, "2") + pytest.raises(TypeError, check_data_shape, "1", 2) + + +def test_load_schema(): + """Test the load_schema function.""" + # Arrange + from src.data_validation.validation import load_schema + + # Act: use pytest to assert the result + result_1 = load_schema() + + # Assert + assert isinstance(result_1, dict) + # Assert: Negative test. 
Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, dict) + # Assert: test that add fails when the arguments are wrong type + pytest.raises(TypeError, load_schema, 2) + pytest.raises(TypeError, load_schema, True) From 30692003015d3fd3e6a1cf6abdbf368ef6ab39c9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:26:21 +0100 Subject: [PATCH 245/411] Deleted tests/test_validation.py. --- tests/test_validation.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py deleted file mode 100644 index d965486ba..000000000 --- a/tests/test_validation.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Create a test suite for the validation module.""" - -import pytest - - -def test_check_data_shape(): - """Test the check_data_shape function.""" - # Arrange - from src.data_validation.validation import check_data_shape - - # Act: use pytest to assert the result - result_1 = check_data_shape() - - # Assert - assert isinstance(result_1, bool) - # Assert: Negative test. Should fails when the answer is wrong - with pytest.raises(AssertionError): - assert not isinstance(result_1, bool) - # Assert: test that add fails when the arguments are wrong type - pytest.raises(TypeError, check_data_shape, 1, "2") - pytest.raises(TypeError, check_data_shape, "1", 2) - - -def test_load_schema(): - """Test the load_schema function.""" - # Arrange - from src.data_validation.validation import load_schema - - # Act: use pytest to assert the result - result_1 = load_schema() - - # Assert - assert isinstance(result_1, dict) - # Assert: Negative test. Should fails when the answer is wrong - with pytest.raises(AssertionError): - assert not isinstance(result_1, dict) - # Assert: test that add fails when the arguments are wrong type - pytest.raises(TypeError, load_schema, 2) - pytest.raises(TypeError, load_schema, True) From 23aa9d24664e4e7a4cb23ff730416ee88489cb76 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:29:46 +0100 Subject: [PATCH 246/411] Corrected dummy dataframe. Was set to a dictionary object without conversion before. --- tests/test_data_validation/test_validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 3a37c4dd8..c94b06c80 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -168,7 +168,8 @@ def test_check_data_shape(): from src.data_validation.validation import check_data_shape # Dataframe for test function to use - dummy_df = {"col1": [1, 2], "col2": [3, 4]} + dummy_dict = {"col1": [1, 2], "col2": [3, 4]} + dummy_df = pd.DataFrame(data=dummy_dict) # Act: use pytest to assert the result result_1 = check_data_shape(dummy_df) From 6f48070544b852e676d2f6a2c98d88fa2551036b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 21:59:03 +0100 Subject: [PATCH 247/411] Attempting to fix pytest error with check_data_shape(). 
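A note on why this attempt errors rather than passing: in Python, None is a value, not a type, so it cannot appear in the class tuple given to isinstance(); only type(None) (i.e. NoneType) can. That is why the next patch reverts to the plain bool check. A minimal sketch of the behaviour, independent of the project code:

    try:
        isinstance(True, (bool, None))
    except TypeError as err:
        # e.g. "isinstance() arg 2 must be a type or tuple of types" (wording varies by Python version)
        print(err)

    print(isinstance(True, (bool, type(None))))  # True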
--- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index c94b06c80..46a2c83bf 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -175,7 +175,7 @@ def test_check_data_shape(): result_1 = check_data_shape(dummy_df) # Assert - assert isinstance(result_1, bool) + assert isinstance(result_1, (bool, None)) # Assert: Negative test. Should fails when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, bool) From 89f8110f2ef1b1814f12efeae11d8478d4b2f0b1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 22:01:29 +0100 Subject: [PATCH 248/411] Attempting to fix pytest error with check_data_shape(). --- tests/test_data_validation/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 46a2c83bf..c94b06c80 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -175,7 +175,7 @@ def test_check_data_shape(): result_1 = check_data_shape(dummy_df) # Assert - assert isinstance(result_1, (bool, None)) + assert isinstance(result_1, bool) # Assert: Negative test. Should fails when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, bool) From c5d5efaadc87796da1a8116d7aab3001438c6ff0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 22:13:56 +0100 Subject: [PATCH 249/411] Removed time_logger_wrap from check_data_shape(). --- src/data_validation/validation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c0b8b62bd..55b0fe28a 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -3,9 +3,8 @@ import logging import postcodes_uk import pandas as pd -from src.utils.wrappers import time_logger_wrap, exception_wrap -import logging +from src.utils.wrappers import exception_wrap from src.utils.helpers import Config_settings @@ -152,7 +151,6 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: return toml_string -@time_logger_wrap @exception_wrap def check_data_shape( data_df: pd.DataFrame, From 2ade282e591a1c2a4e2a04145b1d373e4739e4ea Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 12:53:34 +0100 Subject: [PATCH 250/411] Added a raise ValueError check in check_data_shape. --- src/data_validation/validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 55b0fe28a..20208c5c7 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -167,6 +167,10 @@ def check_data_shape( Returns: A bool: boolean, True if number of columns is as expected, otherwise False """ + if not isinstance(data_df, pd.DataFrame): + raise ValueError( + f"data_df must be a pandas dataframe, is currently {type(data_df)}." + ) cols_match = False From b73678aba7172c2e033d524a4643b0e64ff17d99 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 12:54:18 +0100 Subject: [PATCH 251/411] Edited test_check_data_shape to test for ValueError check when not passing a pandas dataframe to the function. 
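The call form used in this test, pytest.raises(ValueError, check_data_shape, 1), invokes the function with the given arguments and passes only if the expected exception is raised; it is equivalent to the context-manager form. Raising ValueError for a wrong argument type is a deliberate choice here (TypeError would be the more conventional exception, which is what the earlier tests expected), so the test is updated to match the exception the new guard actually raises. A minimal sketch under those assumptions, using a reduced stand-in for check_data_shape rather than the project's real function:

    import pandas as pd
    import pytest

    def check_data_shape(data_df):
        # Stand-in for the type guard added in the previous patch.
        if not isinstance(data_df, pd.DataFrame):
            raise ValueError(f"data_df must be a pandas dataframe, is currently {type(data_df)}.")
        return True

    # Call form: runs check_data_shape(1) and succeeds only if ValueError is raised.
    pytest.raises(ValueError, check_data_shape, 1)

    # Equivalent context-manager form.
    with pytest.raises(ValueError):
        check_data_shape(1)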
--- tests/test_data_validation/test_validation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index c94b06c80..9a1c4d86c 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -180,8 +180,7 @@ def test_check_data_shape(): with pytest.raises(AssertionError): assert not isinstance(result_1, bool) # Assert: test that add fails when the arguments are wrong type - pytest.raises(TypeError, check_data_shape, 1, "2") - pytest.raises(TypeError, check_data_shape, "1", 2) + pytest.raises(ValueError, check_data_shape, 1) def test_load_schema(): From b76256e1712c2e91a506185c19c9261046d16444 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 15:30:53 +0100 Subject: [PATCH 252/411] Added time_logger_wrap back in to validation.py. --- src/data_validation/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 20208c5c7..30d461738 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -4,7 +4,7 @@ import postcodes_uk import pandas as pd -from src.utils.wrappers import exception_wrap +from src.utils.wrappers import exception_wrap, time_logger_wrap from src.utils.helpers import Config_settings @@ -104,7 +104,7 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}" ) - ValidationLogger.info("All postcodes validated....") + validationlogger.info("All postcodes validated....") return True @@ -154,7 +154,7 @@ def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: @exception_wrap def check_data_shape( data_df: pd.DataFrame, - schema_path: str = "./config/Data_Schema.toml", + schema_path: str = "./config/contributors_schema.toml", ) -> bool: """Compares the shape of the data and compares it to the shape of the toml file based off the data schema. Returns true if there is a match and false From 3ad16f0d0d022e7b710b50c3f71fb6b845898d75 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 16:38:18 +0100 Subject: [PATCH 253/411] Added exception wrap to load_schema file. --- src/data_validation/validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 30d461738..c67514633 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -127,12 +127,13 @@ def check_pcs_real(df: pd.DataFrame, masterlist_path: str): return unreal_postcodes -def load_schema(file_path: str = "./config/Data_Schema.toml") -> dict: +@exception_wrap +def load_schema(file_path: str = "./config/contributors_schema.toml") -> dict: """Load the data schema from toml file into a dictionary Keyword Arguments: file_path -- Path to data schema toml file - (default: {"./config/Data_Schema.toml"}) + (default: {"./config/contributors_schema.toml"}) Returns: A dict: dictionary containing parsed schema toml file From 238ada7ff05980f520688069b9f4cefbca3bef40 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 16:38:39 +0100 Subject: [PATCH 254/411] Updated requirements.txt. 
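Note that this change drops postcodes_uk from requirements.txt (following the inline "remove this later - use regex" plan) while src/data_validation/validation.py, as shown in the earlier patches, still contains import postcodes_uk, so an environment built from requirements.txt alone may fail on that import until the regex replacement lands or the package is pulled in transitively. A rough, illustrative sketch of what a regex-based check might look like; the pattern is a simplified approximation of UK postcode formats and is not the project's implementation:

    import re

    # Simplified UK postcode shape, e.g. "NP10 8XG" or "SW1P 4DF"; real postcodes
    # have additional constraints that this pattern does not enforce.
    UK_POSTCODE_RE = re.compile(r"^[A-Z]{1,2}[0-9][A-Z0-9]?\s?[0-9][A-Z]{2}$", re.IGNORECASE)

    def looks_like_uk_postcode(pcode: str) -> bool:
        """Return True if the string matches the simplified UK postcode pattern."""
        return isinstance(pcode, str) and bool(UK_POSTCODE_RE.match(pcode.strip()))

    print(looks_like_uk_postcode("NP10 8XG"))       # True
    print(looks_like_uk_postcode("not a postcode")) # False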
--- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 29638e9ea..78d75b34c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,9 +12,9 @@ pydoop pytest python-dotenv pyyaml +readme-coverage-badger requests setuptools sphinx -postcodes_uk # remove this later - use regex +table-logger typing -readme-coverage-badger From 4cbd1565370a4d9ac64ae567dcf45923ffa8984c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 16:43:38 +0100 Subject: [PATCH 255/411] Deleted loading.py as spp_parser.py replaces it. --- src/data_ingest/loading.py | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 src/data_ingest/loading.py diff --git a/src/data_ingest/loading.py b/src/data_ingest/loading.py deleted file mode 100644 index 117c6c040..000000000 --- a/src/data_ingest/loading.py +++ /dev/null @@ -1,28 +0,0 @@ -import pandas as pd - -from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_load_json - -conf_obj = Config_settings() -config = conf_obj.config_dict -snapshot_path = config["paths"]["snapshot_path"] # Taken from config file - -snapdata = hdfs_load_json(snapshot_path) - -contributerdict = snapdata["contributors"] -responsesdict = snapdata["responses"] - -contributers = pd.DataFrame(contributerdict) -responses = pd.DataFrame(responsesdict) - -contributerdict = snapdata["contributors"] -responsesdict = snapdata["responses"] - -contributers = pd.DataFrame(contributerdict) -responses = pd.DataFrame(responsesdict) - -print(contributers.head()) -print("\n") -print(responses.head()) -print("\n") -print([responses["questioncode"].unique()]) From 8fe8a2cfaffa3ef72a68bb94693098c4893c7d08 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 16:45:13 +0100 Subject: [PATCH 256/411] Added a blank Jenkinsfile. --- Jenkinsfile | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..e69de29bb From 176caff2674ac7545a551f62fea298fa93c4a2bd Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 16:46:58 +0100 Subject: [PATCH 257/411] Added a basic set of instructions to the Jenkinsfile (copied from branch 36). --- Jenkinsfile | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index e69de29bb..62fa2bed4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -0,0 +1,29 @@ +pipeline { + + agent any + + options { + + buildDiscarder logRotator(artifactDaysToKeepStr: '', artifactNumToKeepStr: '5', daysToKeepStr: '', numToKeepStr: '5') + + } + + stages { + + stage('Hello World') { + + steps { + + sh ''' + + java -version + + ''' + + } + + } + + } + +} From bf4399f15a244e136f68bf37221dc5833ade3f62 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 16:56:09 +0100 Subject: [PATCH 258/411] Added RAP example .jenkinsfile to Jenkinsfile. Won't work out of box, needs updated to match current project. 
--- Jenkinsfile | 166 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 12 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 62fa2bed4..ae8db8b86 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,29 +1,171 @@ +#!groovy + +// Global scope required for multi-stage persistence +def artifactoryStr = 'art-p-01' +artServer = Artifactory.server "${artifactoryStr}" +buildInfo = Artifactory.newBuildInfo() +def agentPython3Version = 'python_3.6.2' +def artifactVersion + +// Define a function to push packaged code to Artifactory +def pushToPyPiArtifactoryRepo_temp(String projectName, String version, String sourceDistLocation = 'python/dist/*', String artifactoryHost = 'art-p-01') { + withCredentials([usernamePassword(credentialsId: env.ARTIFACTORY_CREDS, usernameVariable: 'ARTIFACTORY_USER', passwordVariable: 'ARTIFACTORY_PASSWORD')]){ + sh "curl -u ${ARTIFACTORY_USER}:\${ARTIFACTORY_PASSWORD} -T ${sourceDistLocation} 'http://${artifactoryHost}/artifactory/${env.ARTIFACTORY_PYPI_REPO}/${projectName}/'" + } +} + +// Define a function to update the pipeline status on Gitlab +def updateGitlabStatus_temp(String stage, String state, String gitlabHost = 'https://gitlab-app-l-01.ons.statistics.gov.uk') { + withCredentials([string(credentialsId: env.GITLAB_CREDS, variable: 'GITLAB_TOKEN')]) { + println("Updating GitLab pipeline status") + shortCommit = sh(returnStdout: true, script: "cd ${PROJECT_NAME} && git log -n 1 --pretty=format:'%h'").trim() + sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_TOKEN}\" \"${gitlabHost}/api/v4/projects/${GITLAB_PROJECT_ID}/statuses/${shortCommit}?state=${state}&name=${stage}&target_url=${BUILD_URL}\"" + } +} + +// This section defines the Jenkins pipeline pipeline { + libraries { + lib('jenkins-pipeline-shared@feature/dap-ci-scripts') + } + + environment { + ARTIFACTORY_CREDS = 's_jenkins_epds' + ARTIFACTORY_PYPI_REPO = 'LR_EPDS_pypi' + PROJECT_NAME = 'projectname_placeholder' + BUILD_BRANCH = 'build_branch' // Any commits to this branch will create a build in artifactory + BUILD_TAG = 'v*' // Any commits tagged with this pattern will create a build in artifactory + MIN_COVERAGE_PC = '0' + GITLAB_CREDS = 'epds_gitlab_token' // Credentials used for notifying GitLab of build status + GITLAB_PROJECT_ID = 'gitlabid_placeholder' + } + + options { + skipDefaultCheckout true + } + + agent any + + stages { + stage('Checkout') { + agent { label 'download.jenkins.slave' } + steps { + onStage() + colourText('info', "Checking out code from source control.") - agent any + checkout scm - options { + updateGitlabStatus_temp('Jenkins', 'pending') - buildDiscarder logRotator(artifactDaysToKeepStr: '', artifactNumToKeepStr: '5', daysToKeepStr: '', numToKeepStr: '5') + script { + buildInfo.name = "${PROJECT_NAME}" + buildInfo.number = "${BUILD_NUMBER}" + buildInfo.env.collect() + } + colourText('info', "BuildInfo: ${buildInfo.name}-${buildInfo.number}") + stash name: 'Checkout', useDefaultExcludes: false + } + } - } + stage('Preparing virtual environment') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Create venv and install dependencies") + unstash name: 'Checkout' - stages { + sh ''' + PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH - stage('Hello World') { + python3 -m pip install -U pip + pip3 install virtualenv - steps { + if [ ! -d "venv" ]; then + virtualenv venv + fi + . 
venv/bin/activate - sh ''' + python -m pip install -U pip + pip3 install pypandoc==1.7.5 + pip3 install -r requirements-dev.txt + pip3 install pyspark==2.4.0 + pip3 install -e . + pip3 freeze + ''' + stash name: 'venv', useDefaultExcludes: false + } + } - java -version + stage('Unit Test and coverage') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Running unit tests and code coverage.") + unstash name: 'Checkout' + unstash name: 'venv' - ''' + // Compatibility for PyArrow with Spark 2.4-legacy IPC format. + sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' - } + // Running coverage first runs the tests + sh ''' + . venv/bin/activate + coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests + coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} + ''' + + cobertura autoUpdateHealth: false, + autoUpdateStability: false, + coberturaReportFile: 'python_coverage.xml', + conditionalCoverageTargets: '70, 0, 0', + failUnhealthy: false, + failUnstable: false, + lineCoverageTargets: '80, 0, 0', + maxNumberOfBuilds: 0, + methodCoverageTargets: '80, 0, 0', + onlyStable: false, + zoomCoverageChart: false + } + } + + stage('Build and publish Python Package') { + when { + anyOf{ + branch BUILD_BRANCH + tag BUILD_TAG + } + beforeAgent true + } + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Building Python package.") + unstash name: 'Checkout' + unstash name: 'venv' + + sh ''' + . venv/bin/activate + pip3 install wheel==0.29.0 + python3 setup.py build bdist_wheel + ''' + + script { + pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") + } + } + } } - } + post { + success { + unstash name: 'Checkout' + updateGitlabStatus_temp('Jenkins', 'success') + } + failure { + unstash name: 'Checkout' + updateGitlabStatus_temp('Jenkins', 'failed') + } + } } From c6469d797bd905efca1f802b479e9bf119f41528 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 17:11:20 +0100 Subject: [PATCH 259/411] Added resdev in place of . Removed stages after Checkout stage for testing. 
--- Jenkinsfile | 180 ++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ae8db8b86..1fceeb07a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -18,7 +18,7 @@ def pushToPyPiArtifactoryRepo_temp(String projectName, String version, String so def updateGitlabStatus_temp(String stage, String state, String gitlabHost = 'https://gitlab-app-l-01.ons.statistics.gov.uk') { withCredentials([string(credentialsId: env.GITLAB_CREDS, variable: 'GITLAB_TOKEN')]) { println("Updating GitLab pipeline status") - shortCommit = sh(returnStdout: true, script: "cd ${PROJECT_NAME} && git log -n 1 --pretty=format:'%h'").trim() + shortCommit = sh(returnStdout: true, script: "cd resdev && git log -n 1 --pretty=format:'%h'").trim() sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_TOKEN}\" \"${gitlabHost}/api/v4/projects/${GITLAB_PROJECT_ID}/statuses/${shortCommit}?state=${state}&name=${stage}&target_url=${BUILD_URL}\"" } } @@ -66,95 +66,95 @@ pipeline { stash name: 'Checkout', useDefaultExcludes: false } } - - stage('Preparing virtual environment') { - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Create venv and install dependencies") - unstash name: 'Checkout' - - sh ''' - PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH - - python3 -m pip install -U pip - pip3 install virtualenv - - if [ ! -d "venv" ]; then - virtualenv venv - fi - . venv/bin/activate - - python -m pip install -U pip - pip3 install pypandoc==1.7.5 - pip3 install -r requirements-dev.txt - pip3 install pyspark==2.4.0 - pip3 install -e . - pip3 freeze - ''' - stash name: 'venv', useDefaultExcludes: false - } - } - - stage('Unit Test and coverage') { - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Running unit tests and code coverage.") - unstash name: 'Checkout' - unstash name: 'venv' - - // Compatibility for PyArrow with Spark 2.4-legacy IPC format. - sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' - - // Running coverage first runs the tests - sh ''' - . venv/bin/activate - - coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests - coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - ''' - - cobertura autoUpdateHealth: false, - autoUpdateStability: false, - coberturaReportFile: 'python_coverage.xml', - conditionalCoverageTargets: '70, 0, 0', - failUnhealthy: false, - failUnstable: false, - lineCoverageTargets: '80, 0, 0', - maxNumberOfBuilds: 0, - methodCoverageTargets: '80, 0, 0', - onlyStable: false, - zoomCoverageChart: false - } - } - - stage('Build and publish Python Package') { - when { - anyOf{ - branch BUILD_BRANCH - tag BUILD_TAG - } - beforeAgent true - } - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Building Python package.") - unstash name: 'Checkout' - unstash name: 'venv' - - sh ''' - . venv/bin/activate - pip3 install wheel==0.29.0 - python3 setup.py build bdist_wheel - ''' - - script { - pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") - } - } - } +// + //stage('Preparing virtual environment') { + // agent { label "test.${agentPython3Version}" } + // steps { + // onStage() + // colourText('info', "Create venv and install dependencies") + // unstash name: 'Checkout' +// + // sh ''' + // PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH +// + // python3 -m pip install -U pip + // pip3 install virtualenv +// + // if [ ! 
-d "venv" ]; then + // virtualenv venv + // fi + // . venv/bin/activate +// + // python -m pip install -U pip + // pip3 install pypandoc==1.7.5 + // pip3 install -r requirements-dev.txt + // pip3 install pyspark==2.4.0 + // pip3 install -e . + // pip3 freeze + // ''' + // stash name: 'venv', useDefaultExcludes: false + // } + //} +// + //stage('Unit Test and coverage') { + // agent { label "test.${agentPython3Version}" } + // steps { + // onStage() + // colourText('info', "Running unit tests and code coverage.") + // unstash name: 'Checkout' + // unstash name: 'venv' +// + // // Compatibility for PyArrow with Spark 2.4-legacy IPC format. + // sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' +// + // // Running coverage first runs the tests + // sh ''' + // . venv/bin/activate +// + // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests + // coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} + // ''' +// + // cobertura autoUpdateHealth: false, + // autoUpdateStability: false, + // coberturaReportFile: 'python_coverage.xml', + // conditionalCoverageTargets: '70, 0, 0', + // failUnhealthy: false, + // failUnstable: false, + // lineCoverageTargets: '80, 0, 0', + // maxNumberOfBuilds: 0, + // methodCoverageTargets: '80, 0, 0', + // onlyStable: false, + // zoomCoverageChart: false + // } + //} +// + //stage('Build and publish Python Package') { + // when { + // anyOf{ + // branch BUILD_BRANCH + // tag BUILD_TAG + // } + // beforeAgent true + // } + // agent { label "test.${agentPython3Version}" } + // steps { + // onStage() + // colourText('info', "Building Python package.") + // unstash name: 'Checkout' + // unstash name: 'venv' +// + // sh ''' + // . venv/bin/activate + // pip3 install wheel==0.29.0 + // python3 setup.py build bdist_wheel + // ''' +// + // script { + // pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") + // } + // } + //} } From b8412ae60d63ebf04f3b23de065a73f2f0a75dbe Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 19:16:39 +0100 Subject: [PATCH 260/411] Removed updateGitLabStatus call from Checkout Stage for testing. --- Jenkinsfile | 188 ++++++++++++++++++++++++++-------------------------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1fceeb07a..b63ef398c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -55,10 +55,10 @@ pipeline { checkout scm - updateGitlabStatus_temp('Jenkins', 'pending') + //updateGitlabStatus_temp('Jenkins', 'pending') script { - buildInfo.name = "${PROJECT_NAME}" + buildInfo.name = "resdev" buildInfo.number = "${BUILD_NUMBER}" buildInfo.env.collect() } @@ -66,99 +66,99 @@ pipeline { stash name: 'Checkout', useDefaultExcludes: false } } -// - //stage('Preparing virtual environment') { - // agent { label "test.${agentPython3Version}" } - // steps { - // onStage() - // colourText('info', "Create venv and install dependencies") - // unstash name: 'Checkout' -// - // sh ''' - // PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH -// - // python3 -m pip install -U pip - // pip3 install virtualenv -// - // if [ ! -d "venv" ]; then - // virtualenv venv - // fi - // . venv/bin/activate -// - // python -m pip install -U pip - // pip3 install pypandoc==1.7.5 - // pip3 install -r requirements-dev.txt - // pip3 install pyspark==2.4.0 - // pip3 install -e . 
- // pip3 freeze - // ''' - // stash name: 'venv', useDefaultExcludes: false - // } - //} -// - //stage('Unit Test and coverage') { - // agent { label "test.${agentPython3Version}" } - // steps { - // onStage() - // colourText('info', "Running unit tests and code coverage.") - // unstash name: 'Checkout' - // unstash name: 'venv' -// - // // Compatibility for PyArrow with Spark 2.4-legacy IPC format. - // sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' -// - // // Running coverage first runs the tests - // sh ''' - // . venv/bin/activate -// - // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests - // coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - // ''' -// - // cobertura autoUpdateHealth: false, - // autoUpdateStability: false, - // coberturaReportFile: 'python_coverage.xml', - // conditionalCoverageTargets: '70, 0, 0', - // failUnhealthy: false, - // failUnstable: false, - // lineCoverageTargets: '80, 0, 0', - // maxNumberOfBuilds: 0, - // methodCoverageTargets: '80, 0, 0', - // onlyStable: false, - // zoomCoverageChart: false - // } - //} -// - //stage('Build and publish Python Package') { - // when { - // anyOf{ - // branch BUILD_BRANCH - // tag BUILD_TAG - // } - // beforeAgent true - // } - // agent { label "test.${agentPython3Version}" } - // steps { - // onStage() - // colourText('info', "Building Python package.") - // unstash name: 'Checkout' - // unstash name: 'venv' -// - // sh ''' - // . venv/bin/activate - // pip3 install wheel==0.29.0 - // python3 setup.py build bdist_wheel - // ''' -// - // script { - // pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") - // } - // } - //} - } + /*stage('Preparing virtual environment') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Create venv and install dependencies") + unstash name: 'Checkout' + + sh ''' + PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH + + python3 -m pip install -U pip + pip3 install virtualenv + + if [ ! -d "venv" ]; then + virtualenv venv + fi + . venv/bin/activate + + python -m pip install -U pip + pip3 install pypandoc==1.7.5 + pip3 install -r requirements-dev.txt + pip3 install pyspark==2.4.0 + pip3 install -e . + pip3 freeze + ''' + stash name: 'venv', useDefaultExcludes: false + } + } + + stage('Unit Test and coverage') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Running unit tests and code coverage.") + unstash name: 'Checkout' + unstash name: 'venv' + + // Compatibility for PyArrow with Spark 2.4-legacy IPC format. + sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' + + // Running coverage first runs the tests + sh ''' + . 
venv/bin/activate - post { + coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests + coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} + ''' + + cobertura autoUpdateHealth: false, + autoUpdateStability: false, + coberturaReportFile: 'python_coverage.xml', + conditionalCoverageTargets: '70, 0, 0', + failUnhealthy: false, + failUnstable: false, + lineCoverageTargets: '80, 0, 0', + maxNumberOfBuilds: 0, + methodCoverageTargets: '80, 0, 0', + onlyStable: false, + zoomCoverageChart: false + } + } + + stage('Build and publish Python Package') { + when { + anyOf{ + branch BUILD_BRANCH + tag BUILD_TAG + } + beforeAgent true + } + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Building Python package.") + unstash name: 'Checkout' + unstash name: 'venv' + + sh ''' + . venv/bin/activate + pip3 install wheel==0.29.0 + python3 setup.py build bdist_wheel + ''' + + script { + pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") + } + } + } + } + */ + + /*post { success { unstash name: 'Checkout' updateGitlabStatus_temp('Jenkins', 'success') @@ -167,5 +167,5 @@ pipeline { unstash name: 'Checkout' updateGitlabStatus_temp('Jenkins', 'failed') } - } + }*/ } From 7418540b9e9b9782d19e3f71cc3d58a9046d4086 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 27 Mar 2023 19:32:04 +0100 Subject: [PATCH 261/411] Re-added updateGitLabStatus call from Checkout Stage for testing. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b63ef398c..4e58bebea 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -55,7 +55,7 @@ pipeline { checkout scm - //updateGitlabStatus_temp('Jenkins', 'pending') + updateGitlabStatus_temp('Jenkins', 'pending') script { buildInfo.name = "resdev" From 3aa1c0c106692e6bb7372d95c82866cae36c299e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 28 Mar 2023 13:13:05 +0100 Subject: [PATCH 262/411] Corrected change in function updateGitlabStatus, and amended two environment variables. 
--- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4e58bebea..27b56b6e2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -18,7 +18,7 @@ def pushToPyPiArtifactoryRepo_temp(String projectName, String version, String so def updateGitlabStatus_temp(String stage, String state, String gitlabHost = 'https://gitlab-app-l-01.ons.statistics.gov.uk') { withCredentials([string(credentialsId: env.GITLAB_CREDS, variable: 'GITLAB_TOKEN')]) { println("Updating GitLab pipeline status") - shortCommit = sh(returnStdout: true, script: "cd resdev && git log -n 1 --pretty=format:'%h'").trim() + shortCommit = sh(returnStdout: true, script: "cd ${PROJECT_NAME} && git log -n 1 --pretty=format:'%h'").trim() sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_TOKEN}\" \"${gitlabHost}/api/v4/projects/${GITLAB_PROJECT_ID}/statuses/${shortCommit}?state=${state}&name=${stage}&target_url=${BUILD_URL}\"" } } @@ -32,8 +32,8 @@ pipeline { environment { ARTIFACTORY_CREDS = 's_jenkins_epds' ARTIFACTORY_PYPI_REPO = 'LR_EPDS_pypi' - PROJECT_NAME = 'projectname_placeholder' - BUILD_BRANCH = 'build_branch' // Any commits to this branch will create a build in artifactory + PROJECT_NAME = 'resdev' + BUILD_BRANCH = '142_jenkinsFile_RAP' // Any commits to this branch will create a build in artifactory BUILD_TAG = 'v*' // Any commits tagged with this pattern will create a build in artifactory MIN_COVERAGE_PC = '0' GITLAB_CREDS = 'epds_gitlab_token' // Credentials used for notifying GitLab of build status From fc35652ccf781499ddd3f934ae76fbb1a25cfce7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 28 Mar 2023 17:13:29 +0100 Subject: [PATCH 263/411] Deleted unused stages. --- Jenkinsfile | 100 ---------------------------------------------------- 1 file changed, 100 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 27b56b6e2..3b967f7b9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -67,105 +67,5 @@ pipeline { } } - /*stage('Preparing virtual environment') { - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Create venv and install dependencies") - unstash name: 'Checkout' - - sh ''' - PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH - - python3 -m pip install -U pip - pip3 install virtualenv - - if [ ! -d "venv" ]; then - virtualenv venv - fi - . venv/bin/activate - - python -m pip install -U pip - pip3 install pypandoc==1.7.5 - pip3 install -r requirements-dev.txt - pip3 install pyspark==2.4.0 - pip3 install -e . - pip3 freeze - ''' - stash name: 'venv', useDefaultExcludes: false - } - } - - stage('Unit Test and coverage') { - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Running unit tests and code coverage.") - unstash name: 'Checkout' - unstash name: 'venv' - - // Compatibility for PyArrow with Spark 2.4-legacy IPC format. - sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' - - // Running coverage first runs the tests - sh ''' - . 
venv/bin/activate - - coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests - coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - ''' - - cobertura autoUpdateHealth: false, - autoUpdateStability: false, - coberturaReportFile: 'python_coverage.xml', - conditionalCoverageTargets: '70, 0, 0', - failUnhealthy: false, - failUnstable: false, - lineCoverageTargets: '80, 0, 0', - maxNumberOfBuilds: 0, - methodCoverageTargets: '80, 0, 0', - onlyStable: false, - zoomCoverageChart: false - } - } - - stage('Build and publish Python Package') { - when { - anyOf{ - branch BUILD_BRANCH - tag BUILD_TAG - } - beforeAgent true - } - agent { label "test.${agentPython3Version}" } - steps { - onStage() - colourText('info', "Building Python package.") - unstash name: 'Checkout' - unstash name: 'venv' - - sh ''' - . venv/bin/activate - pip3 install wheel==0.29.0 - python3 setup.py build bdist_wheel - ''' - - script { - pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") - } - } - } } - */ - - /*post { - success { - unstash name: 'Checkout' - updateGitlabStatus_temp('Jenkins', 'success') - } - failure { - unstash name: 'Checkout' - updateGitlabStatus_temp('Jenkins', 'failed') - } - }*/ } From 557e89b7febea02e1275f596bc11a84fad663c42 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 28 Mar 2023 17:14:42 +0100 Subject: [PATCH 264/411] Deleted updateGitlabStatus function. --- Jenkinsfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3b967f7b9..97b409317 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,15 +14,6 @@ def pushToPyPiArtifactoryRepo_temp(String projectName, String version, String so } } -// Define a function to update the pipeline status on Gitlab -def updateGitlabStatus_temp(String stage, String state, String gitlabHost = 'https://gitlab-app-l-01.ons.statistics.gov.uk') { - withCredentials([string(credentialsId: env.GITLAB_CREDS, variable: 'GITLAB_TOKEN')]) { - println("Updating GitLab pipeline status") - shortCommit = sh(returnStdout: true, script: "cd ${PROJECT_NAME} && git log -n 1 --pretty=format:'%h'").trim() - sh "curl --request POST --header \"PRIVATE-TOKEN: ${GITLAB_TOKEN}\" \"${gitlabHost}/api/v4/projects/${GITLAB_PROJECT_ID}/statuses/${shortCommit}?state=${state}&name=${stage}&target_url=${BUILD_URL}\"" - } -} - // This section defines the Jenkins pipeline pipeline { libraries { From 92c1bca11a9a25d22c5417b268e6c61594457003 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 28 Mar 2023 17:15:26 +0100 Subject: [PATCH 265/411] Deleted GITLAB_PROJECT_ID as currently unused. --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 97b409317..13f5f24c4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,6 @@ pipeline { BUILD_TAG = 'v*' // Any commits tagged with this pattern will create a build in artifactory MIN_COVERAGE_PC = '0' GITLAB_CREDS = 'epds_gitlab_token' // Credentials used for notifying GitLab of build status - GITLAB_PROJECT_ID = 'gitlabid_placeholder' } options { From e79f0e9fb914a8d16138daeb448319788635f99f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 28 Mar 2023 17:20:41 +0100 Subject: [PATCH 266/411] Deleted updateGitlabStatus_temp in Checkout stage. 
--- Jenkinsfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 13f5f24c4..11e3ce9c1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,8 +45,6 @@ pipeline { checkout scm - updateGitlabStatus_temp('Jenkins', 'pending') - script { buildInfo.name = "resdev" buildInfo.number = "${BUILD_NUMBER}" From 1da52baa620dd645c6b2a75acc66660f62f1deab Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 13:19:32 +0100 Subject: [PATCH 267/411] Added virtual environment stage and amended for conda env testing. --- Jenkinsfile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 11e3ce9c1..0047c72b6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -55,5 +55,30 @@ pipeline { } } + stage('Preparing virtual environment') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Create venv and install dependencies") + unstash name: 'Checkout' + + sh ''' + PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH + + python3 -m pip install -U pip + pip3 install conda + + if [ ! -d "resdev36" ]; then + conda create -n resdev36 python=3.6.2 + fi + . venv/bin/activate + source activate resdev36 + + ''' + stash name: 'resdev36', useDefaultExcludes: false + } + } + } + } From 4cb01207ec9399d6a5384304fd8b4a6069b012d4 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 13:27:08 +0100 Subject: [PATCH 268/411] Amended Python version and deleted a legacy line in virtual env stage. --- Jenkinsfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0047c72b6..cf1da380e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,7 @@ def artifactoryStr = 'art-p-01' artServer = Artifactory.server "${artifactoryStr}" buildInfo = Artifactory.newBuildInfo() -def agentPython3Version = 'python_3.6.2' +def agentPython3Version = 'python_3.6.1' def artifactVersion // Define a function to push packaged code to Artifactory @@ -71,7 +71,6 @@ pipeline { if [ ! -d "resdev36" ]; then conda create -n resdev36 python=3.6.2 fi - . venv/bin/activate source activate resdev36 ''' From 77c434f6606aa260d4e5a7418a67dad2fe9f1625 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 13:58:02 +0100 Subject: [PATCH 269/411] Added line to test if conda gets installed on Jenkins. --- Jenkinsfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index cf1da380e..7d796eb1d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,11 +66,13 @@ pipeline { PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH python3 -m pip install -U pip - pip3 install conda + pip install conda + conda -V if [ ! -d "resdev36" ]; then conda create -n resdev36 python=3.6.2 fi + source activate resdev36 ''' From 5f87ea72203a66bcaac94b08ad27d95c533a992c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 14:10:11 +0100 Subject: [PATCH 270/411] Added line in virtual env stage to install wheel. --- Jenkinsfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7d796eb1d..90ff0adfc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,7 +66,9 @@ pipeline { PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH python3 -m pip install -U pip - pip install conda + pip3 install wheel + pip3 install conda + conda -V if [ ! 
-d "resdev36" ]; then From a155e6fb671f045735da10340e027e29a023b2e9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 17:12:51 +0100 Subject: [PATCH 271/411] Added lines to try wget on Jenkins. --- Jenkinsfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 90ff0adfc..b3e657c3c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -67,9 +67,10 @@ pipeline { python3 -m pip install -U pip pip3 install wheel - pip3 install conda - conda -V + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh + + bash miniconda.sh -b -p $WORKSPACE/miniconda if [ ! -d "resdev36" ]; then conda create -n resdev36 python=3.6.2 From 8f33f9a1b781c329724d6277050b298e011321fb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 18:15:21 +0100 Subject: [PATCH 272/411] Attempting to use JEN-BDA-10 agent in Jenkins to avoid current queuing issues. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b3e657c3c..465371a02 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,7 +34,7 @@ pipeline { skipDefaultCheckout true } - agent any + agent JEN-BDA-10 stages { stage('Checkout') { From 48784257787b92440c3b6905dfccdb55faa2407b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 18:44:10 +0100 Subject: [PATCH 273/411] Corrected Jenkins agent declaration, missing {} before. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 465371a02..65baee674 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,7 +34,7 @@ pipeline { skipDefaultCheckout true } - agent JEN-BDA-10 + agent {JEN-BDA-10} stages { stage('Checkout') { From 6f331713ceb2a2159fda89adc2750e78fa8ba205 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 18:48:20 +0100 Subject: [PATCH 274/411] Corrected Jenkins agent declaration, missing {} before. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 65baee674..0225a4a05 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,7 +34,7 @@ pipeline { skipDefaultCheckout true } - agent {JEN-BDA-10} + agent {"JEN-BDA-10"} stages { stage('Checkout') { From ad2ecfb0d938c045fed5462bdaaf85d03b3f11e6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 19:29:36 +0100 Subject: [PATCH 275/411] Reverted agent to 'any' on line 37. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0225a4a05..b3e657c3c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,7 +34,7 @@ pipeline { skipDefaultCheckout true } - agent {"JEN-BDA-10"} + agent any stages { stage('Checkout') { From e947bbf451627c99c3b51b090a347e8b471b950b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 19:34:29 +0100 Subject: [PATCH 276/411] wget not found in Jenkins, trying apt install wget. 
--- Jenkinsfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index b3e657c3c..24cf47dfd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,6 +68,8 @@ pipeline { python3 -m pip install -U pip pip3 install wheel + apt install wget + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh bash miniconda.sh -b -p $WORKSPACE/miniconda From a2c3f225ce04a97583090c05a25d1d830e889cb5 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 19:48:40 +0100 Subject: [PATCH 277/411] apt changed to apt-get --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 24cf47dfd..9fc93cb52 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,7 +68,7 @@ pipeline { python3 -m pip install -U pip pip3 install wheel - apt install wget + apt-get install wget wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh From 23db42126704221173e3a0c47b48ee8546c7e806 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 19:58:36 +0100 Subject: [PATCH 278/411] Cut out apt-get and checking which distribution is being used. --- Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9fc93cb52..7c60bd698 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,7 +68,8 @@ pipeline { python3 -m pip install -U pip pip3 install wheel - apt-get install wget + cat /etc/*-release + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh From 6d61ea69d577a564fbc854b83730c08fe61d627b Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 29 Mar 2023 20:11:47 +0100 Subject: [PATCH 279/411] Using yum instead of apt as linux distribution is CentOS. --- Jenkinsfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 7c60bd698..df86c7d6e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,6 +70,8 @@ pipeline { cat /etc/*-release + yum install wget + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh From 82379496ac1db149a82756f3f0de0d1a47745b97 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 10:15:57 +0100 Subject: [PATCH 280/411] Removed cat line regarding discovering which linux release jenkins uses, it is CentOS. --- Jenkinsfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index df86c7d6e..64b9a23cb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,8 +68,6 @@ pipeline { python3 -m pip install -U pip pip3 install wheel - cat /etc/*-release - yum install wget From e9514de4a02156c0764c942c8e7c8bf4b044f53a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 10:34:42 +0100 Subject: [PATCH 281/411] Removed wget lines to focus on trying to get yum working on jenkins. --- Jenkinsfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 64b9a23cb..a6a0f3ae4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -71,9 +71,6 @@ pipeline { yum install wget - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh - - bash miniconda.sh -b -p $WORKSPACE/miniconda if [ ! -d "resdev36" ]; then conda create -n resdev36 python=3.6.2 From 847a28506902d09bdfedfb2d2a3125a36bc84382 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 10:50:08 +0100 Subject: [PATCH 282/411] Trying curl instead of yum to install wget. 
--- Jenkinsfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index a6a0f3ae4..534f9364f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -68,9 +68,11 @@ pipeline { python3 -m pip install -U pip pip3 install wheel - yum install wget + curl install wget + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh + bash miniconda.sh -b -p $WORKSPACE/miniconda if [ ! -d "resdev36" ]; then conda create -n resdev36 python=3.6.2 From 32e776f7b987b516d32318827bbe538dd372d303 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 11:46:29 +0100 Subject: [PATCH 283/411] Testing using normal virtualenv and requirements.txt file. --- Jenkinsfile | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 534f9364f..b717d5d5f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,22 +66,19 @@ pipeline { PATH=$WORKSPACE/venv/bin:/usr/local/bin:$PATH python3 -m pip install -U pip - pip3 install wheel + pip3 install virtualenv - curl install wget - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -nv -O miniconda.sh - - bash miniconda.sh -b -p $WORKSPACE/miniconda - - if [ ! -d "resdev36" ]; then - conda create -n resdev36 python=3.6.2 + if [ ! -d "venv" ]; then + virtualenv venv fi + . venv/bin/activate - source activate resdev36 + python -m pip install -U pip + pip3 install -r requirements.txt + pip3 freeze ''' - stash name: 'resdev36', useDefaultExcludes: false + stash name: 'venv', useDefaultExcludes: false } } From 030b1fa2362bf4a0f033bcd12a81ba5ad98477be Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 13:54:25 +0100 Subject: [PATCH 284/411] Pre-commit reformatted. --- .pre-commit-config.yaml | 138 +++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 72 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8838c944..1827cabf5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,75 +1,69 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - - repo: https://github.com/kynan/nbstripout - rev: 0.4.0 - hooks: - - id: nbstripout - name: nbstripout - Strip outputs from notebooks (auto-fixes) - args: - - --extra-keys - - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 - hooks: - - id: check-added-large-files - name: Check for files larger than 5 MB - args: [ "--maxkb=5120" ] - - id: end-of-file-fixer - name: Check for a blank line at the end of scripts (auto-fixes) - exclude: '\.Rd' - - id: trailing-whitespace - name: Check for trailing whitespaces (auto-fixes) - - repo: https://github.com/pycqa/isort - rev: 5.8.0 - hooks: - - id: isort - name: isort - Sort Python imports (auto-fixes) - types: [ cython, pyi, python ] - args: [ "--profile", "black", "--filter-files" ] - - repo: https://github.com/psf/black - rev: 22.8.0 # Replace by any tag/version: https://github.com/psf/black/tags - hooks: - - id: black - name: black - consistent Python code formatting (auto-fixes) - language_version: python # Should be a command that runs python3.6+ - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - name: flake8 - Python linting - - repo: https://github.com/nbQA-dev/nbQA - rev: 0.12.0 - hooks: - - id: nbqa-isort - name: 
nbqa-isort - Sort Python imports (notebooks; auto-fixes) - args: [ --nbqa-mutate ] - additional_dependencies: [ isort==5.8.0 ] - - id: nbqa-black - name: nbqa-black - consistent Python code formatting (notebooks; auto-fixes) - args: [ --nbqa-mutate ] - additional_dependencies: [ black==21.5b2 ] - # TODO: Disabled for now until it's clear how to add noqa to specific cells of a Jupyter notebook - #- id: nbqa-flake8 - # name: nbqa-flake8 - Python linting (notebooks) - # additional_dependencies: [ flake8==3.9.2 ] - - repo: https://github.com/Yelp/detect-secrets - rev: v1.0.3 - hooks: - - id: detect-secrets - name: detect-secrets - Detect secrets in staged code - args: [ "--baseline", ".secrets.baseline" ] - exclude: .*/tests/.*|^\.cruft\.json$ - - repo: local - hooks: - - id: restricted-filenames - name: Check commits for restricted file extensions - entry: These file extensions are restricted. Data should be removed from the commit - language: fail - files: .*\.(csv|feather|xlsx|zip|hdf5|h5|json|xml|hd|parquet) - - repo: local - hooks: - - id: coverage-badge - name: Update the coverage badge in the readme - entry: python /home/cdsw/research-and-development/cov_reports/update_readme.py - language: python + - repo: https://github.com/kynan/nbstripout + rev: 0.4.0 + hooks: + - id: nbstripout + name: nbstripout - Strip outputs from notebooks (auto-fixes) + args: + - --extra-keys + - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-added-large-files + name: Check for files larger than 5 MB + args: ["--maxkb=5120"] + - id: end-of-file-fixer + name: Check for a blank line at the end of scripts (auto-fixes) + exclude: '\.Rd' + - id: trailing-whitespace + name: Check for trailing whitespaces (auto-fixes) + - repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort - Sort Python imports (auto-fixes) + types: [cython, pyi, python] + args: ["--profile", "black", "--filter-files"] + - repo: https://github.com/psf/black + rev: 22.8.0 # Replace by any tag/version: https://github.com/psf/black/tags + hooks: + - id: black + name: black - consistent Python code formatting (auto-fixes) + language_version: python # Should be a command that runs python3.6+ + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + name: flake8 - Python linting + - repo: https://github.com/nbQA-dev/nbQA + rev: 0.12.0 + hooks: + - id: nbqa-isort + name: nbqa-isort - Sort Python imports (notebooks; auto-fixes) + args: [--nbqa-mutate] + additional_dependencies: [isort==5.8.0] + - id: nbqa-black + name: nbqa-black - consistent Python code formatting (notebooks; auto-fixes) + args: [--nbqa-mutate] + additional_dependencies: [black==21.5b2] + # TODO: Disabled for now until it's clear how to add noqa to specific cells of a Jupyter notebook + #- id: nbqa-flake8 + # name: nbqa-flake8 - Python linting (notebooks) + # additional_dependencies: [ flake8==3.9.2 ] + - repo: https://github.com/Yelp/detect-secrets + rev: v1.0.3 + hooks: + - id: detect-secrets + name: detect-secrets - Detect secrets in staged code + args: ["--baseline", ".secrets.baseline"] + exclude: .*/tests/.*|^\.cruft\.json$ + - repo: local + hooks: + - id: restricted-filenames + name: Check commits for restricted file extensions + entry: These file extensions are restricted. 
Data should be removed from the commit + language: fail + files: .*\.(csv|feather|xlsx|zip|hdf5|h5|txt|json|xml|hd|parquet)|!requirements.txt From e9a74998ca0e789bddd4442ad7c97b3962ab57fa Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 13:54:41 +0100 Subject: [PATCH 285/411] Added line to NOT include requirements.txt in .gitignore file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 45e3c17b0..56ccf9a73 100644 --- a/.gitignore +++ b/.gitignore @@ -883,6 +883,7 @@ data/processed/* *.json *.parquet *.txt +!requirements.txt *.xlsx *.xml *.zip From 64feaf7bfe17c145885cc2665230d86574cabe55 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 13:55:55 +0100 Subject: [PATCH 286/411] Created script to convert the environment.yml to a requirements.txt format. --- yml_to_requirements_convert.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 yml_to_requirements_convert.py diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py new file mode 100644 index 000000000..edd5d14ef --- /dev/null +++ b/yml_to_requirements_convert.py @@ -0,0 +1,24 @@ +import yaml + +# yaml = ruamel.yaml.YAML() +data = yaml.safe_load(open("/home/cdsw/research-and-development/environment.yml")) + +requirements = [] +for dep in data["dependencies"]: + if isinstance(dep, str): + if "=" in dep: + package, package_version = dep.split("=") + if package == "python": + requirements.append(package + "==3.6.2") + else: + requirements.append(package + "==" + package_version) + else: + requirements.append(dep) + elif isinstance(dep, dict): + for preq in dep.get("pip", []): + requirements.append(preq) + print(requirements) + +with open("/home/cdsw/research-and-development/requirements.txt", "w") as fp: + for requirement in requirements: + print(requirement, file=fp) From cc1b2a2acacbc3c61382f654d20271647a558ae3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 13:57:55 +0100 Subject: [PATCH 287/411] Modified pre-commit-config.yaml to allow txt files, including new requirements.txt. --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1827cabf5..42d2ecb18 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,4 +66,4 @@ repos: name: Check commits for restricted file extensions entry: These file extensions are restricted. Data should be removed from the commit language: fail - files: .*\.(csv|feather|xlsx|zip|hdf5|h5|txt|json|xml|hd|parquet)|!requirements.txt + files: .*\.(csv|feather|xlsx|zip|hdf5|h5|json|xml|hd|parquet) From 6dbe660bddccac438dcb0a8f1e3738d077abdc32 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 13:58:07 +0100 Subject: [PATCH 288/411] Newly created requirements.txt file. --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3598ecc37..76b76a684 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ coverage pyyaml requests sphinx -postcodes_uk # remove this later - use regex typing readme-coverage-badger +postcodes_uk +toml From 873fe13467cbd81e0d2ff14889603b2396bdadbf Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 15:30:14 +0100 Subject: [PATCH 289/411] Added Unit testing stage to Jenkinsfile after virtualenv stage passed. 
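Note on the conversion script added in patch 286: it assumes conda-style single "=" pins in environment.yml (a conda entry already written with "==" would split into three parts and break the two-way unpack), pins python itself to 3.6.2, and copies pip entries through verbatim. For illustration only (not part of this change, versions made up), the same loop run on an in-memory dependencies list so it can be executed without an environment.yml file:

    # Mirrors the loop in yml_to_requirements_convert.py on a hand-written
    # dependencies list, to show what ends up in requirements.txt.
    dependencies = [
        "python=3.6.2",    # conda-style single "=" pin
        "pandas=1.1.5",
        "pre-commit",      # unpinned conda dependency
        {"pip": ["table_logger", "postcodes_uk"]},  # pip section parses as a dict
    ]

    requirements = []
    for dep in dependencies:
        if isinstance(dep, str):
            if "=" in dep:
                package, package_version = dep.split("=")
                if package == "python":
                    requirements.append(package + "==3.6.2")
                else:
                    requirements.append(package + "==" + package_version)
            else:
                requirements.append(dep)
        elif isinstance(dep, dict):
            for preq in dep.get("pip", []):
                requirements.append(preq)

    print(requirements)
    # ['python==3.6.2', 'pandas==1.1.5', 'pre-commit', 'table_logger', 'postcodes_uk']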
--- Jenkinsfile | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index b717d5d5f..24563a564 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -82,6 +82,41 @@ pipeline { } } + + stage('Unit Test and coverage') { + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Running unit tests and code coverage.") + unstash name: 'Checkout' + unstash name: 'venv' + + // Compatibility for PyArrow with Spark 2.4-legacy IPC format. + sh 'export ARROW_PRE_0_15_IPC_FORMAT=1' + + // Running coverage first runs the tests + sh ''' + . venv/bin/activate + + coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests + coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} + ''' + + cobertura autoUpdateHealth: false, + autoUpdateStability: false, + coberturaReportFile: 'python_coverage.xml', + conditionalCoverageTargets: '70, 0, 0', + failUnhealthy: false, + failUnstable: false, + lineCoverageTargets: '80, 0, 0', + maxNumberOfBuilds: 0, + methodCoverageTargets: '80, 0, 0', + onlyStable: false, + zoomCoverageChart: false + } + } + + } } From 4a538e0b66dee3d41ec94f8cbe34836753a30140 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 16:56:55 +0100 Subject: [PATCH 290/411] Added a few lines to the virtual environment stage of the Jenkinsfile, checking it resolves error in next stage. --- Jenkinsfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 24563a564..ca3abdc76 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -74,7 +74,10 @@ pipeline { . venv/bin/activate python -m pip install -U pip - pip3 install -r requirements.txt + pip3 install pypandoc==1.7.5 + pip3 install -r requirements-dev.txt + pip3 install pyspark==2.4.0 + pip3 freeze ''' From 4a7b5ad28b95f3bb34c56210b81068980f9ee9ea Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 17:07:44 +0100 Subject: [PATCH 291/411] Corrected requirements-dev.txt to requirements.txt --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ca3abdc76..3904d095c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -75,7 +75,7 @@ pipeline { python -m pip install -U pip pip3 install pypandoc==1.7.5 - pip3 install -r requirements-dev.txt + pip3 install -r requirements.txt pip3 install pyspark==2.4.0 pip3 freeze From a993ac9aeeb8c776bb66ae8e0cde54d1696ebf45 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 18:46:48 +0100 Subject: [PATCH 292/411] Added sqlite manually to the requirements.txt file. Testing for unit test stage on Jenkins. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 76b76a684..fb1f4d19e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ typing readme-coverage-badger postcodes_uk toml +sqlite From 78db57576a0ca983f8ffdba8858dc24950f6a1b2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 30 Mar 2023 18:52:31 +0100 Subject: [PATCH 293/411] Amended sqlite to pysqlite3 in requirements.txt. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fb1f4d19e..207c831ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ typing readme-coverage-badger postcodes_uk toml -sqlite +pysqlite3 From bc7d80ded4688b3eaa42542d315b7cc41fde08d4 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 10:46:03 +0100 Subject: [PATCH 294/411] Changed buildInfo.name back to in Checkout stage. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3904d095c..d2d0314b4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -46,7 +46,7 @@ pipeline { checkout scm script { - buildInfo.name = "resdev" + buildInfo.name = "${PROJECT_NAME}" buildInfo.number = "${BUILD_NUMBER}" buildInfo.env.collect() } From db1f707c81b9233bdd351e1b6b4951ea3c1d2f04 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 10:56:44 +0100 Subject: [PATCH 295/411] Removed pysqlite3 from requirements.txt. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 207c831ac..76b76a684 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,3 @@ typing readme-coverage-badger postcodes_uk toml -pysqlite3 From 7ae9380fb042c101f78aa0001785c94912f31269 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 11:59:53 +0100 Subject: [PATCH 296/411] Amended sqlite to sqlite-devel. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 76b76a684..3ff5ed07b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ typing readme-coverage-badger postcodes_uk toml +sqlite-devel From 9f8724d07135d6602a90b8176f6eb985beb1a678 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 12:29:39 +0100 Subject: [PATCH 297/411] Removed sqlite-devel in requirements.txt. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ff5ed07b..76b76a684 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,3 @@ typing readme-coverage-badger postcodes_uk toml -sqlite-devel From f0f76fa0030d8ed2cda50726a95ca77736663487 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 12:31:54 +0100 Subject: [PATCH 298/411] Added pwd and ls lines to investigate unit test stage errors. --- Jenkinsfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index d2d0314b4..227bc35fa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -101,6 +101,11 @@ pipeline { sh ''' . venv/bin/activate + pwd + ls -lhrta + echo "" + ls -lhrta ./tests + coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} ''' From 1aed0dce37696c3bc1af89258be3136509f89423 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 12:43:33 +0100 Subject: [PATCH 299/411] Grepping for sqlite in /tmp/ on Jenkins node itself. Can't seem to find _sqlite3 module. 
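Note: _sqlite3 is a C extension that is only built into the interpreter when CPython is compiled against the SQLite development headers, so it cannot be added afterwards with pip; it has to come from the interpreter itself (a conda-provided Python ships it by default). A quick way to confirm what the agent's interpreter was built with, as an illustrative check rather than part of this change:

    # Illustrative check: report whether this interpreter was built with
    # SQLite support (i.e. whether the _sqlite3 extension module exists).
    import importlib.util
    import sys

    spec = importlib.util.find_spec("_sqlite3")
    if spec is None:
        print(sys.executable + " was built without the _sqlite3 extension")
    else:
        print("_sqlite3 found at " + str(spec.origin))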
--- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 227bc35fa..8b7534787 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -104,7 +104,7 @@ pipeline { pwd ls -lhrta echo "" - ls -lhrta ./tests + grep -r sqlite /tmp/ coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} From 1e7020f05205c0b06b29e4842cc10d94dcf2a721 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 12:53:24 +0100 Subject: [PATCH 300/411] Grepping for '_sqlite3' in /tmp/ . --- Jenkinsfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8b7534787..3f4e61bd8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -101,10 +101,7 @@ pipeline { sh ''' . venv/bin/activate - pwd - ls -lhrta - echo "" - grep -r sqlite /tmp/ + grep -r "_sqlite3" /tmp/ coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} From 95d74ee5b61bd903b0353ee6f7599dc21b629886 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 13:09:11 +0100 Subject: [PATCH 301/411] Removed sqlite3 lines in .gitignore, testing if changes Jenkins error. --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 56ccf9a73..53fa13c86 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,8 @@ *.pyc __pycache__/ local_settings.py -db.sqlite3 -db.sqlite3-journal +#db.sqlite3 +#db.sqlite3-journal media # If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/ From 6b368e37f118485fdcbf1f667037c7fd0bc17fa3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 13:09:42 +0100 Subject: [PATCH 302/411] Removed grep line in Jenkinsfile. --- Jenkinsfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3f4e61bd8..d2d0314b4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -101,8 +101,6 @@ pipeline { sh ''' . venv/bin/activate - grep -r "_sqlite3" /tmp/ - coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} ''' From f0c4893938463260b479c873932628e9197adb2c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 13:31:50 +0100 Subject: [PATCH 303/411] Re-added sqlite3 lines in .gitignore. --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 53fa13c86..56ccf9a73 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,8 @@ *.pyc __pycache__/ local_settings.py -#db.sqlite3 -#db.sqlite3-journal +db.sqlite3 +db.sqlite3-journal media # If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/ From 65dc4d8cb15eff2ab2c2cfdd78a5b41cc67d26d8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 13:33:45 +0100 Subject: [PATCH 304/411] Set coverage version to the same as the pub_sec project to see if that resolves sqlite3 errors. 
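Note: a plausible reason this pin helps is that coverage 5.0 and later store the .coverage data file in a SQLite database (so they import sqlite3 at run time), while the 4.x series, including the 4.5.4 used by pub_sec, writes a flat data file and does not need _sqlite3. An illustrative check, not part of this change, that the virtualenv really resolved to a 4.x release:

    # Confirm which coverage release the virtualenv installed; only 5.x and
    # later depend on the interpreter's _sqlite3 module for data storage.
    import coverage

    print(coverage.__version__)
    assert coverage.__version__.startswith("4."), "coverage 5+ would need _sqlite3"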
--- requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/requirements.txt b/requirements.txt index 76b76a684..f77a86c7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,12 @@ # python==3.6 +coverage==4.5.4 +pytest +pyyaml +pandas +numpy +requests +sphinx +pip arrow cookiecutter detect-secrets From 9767ef072d2ed97ccf2e5d2f430acd539c39f1f8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 13:48:30 +0100 Subject: [PATCH 305/411] Removed coverage xml line from sh script and commented out cobertura lines. --- Jenkinsfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d2d0314b4..cf8fa370b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -102,10 +102,11 @@ pipeline { . venv/bin/activate coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests - coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} ''' + //coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - cobertura autoUpdateHealth: false, + + /*cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'python_coverage.xml', conditionalCoverageTargets: '70, 0, 0', @@ -115,7 +116,7 @@ pipeline { maxNumberOfBuilds: 0, methodCoverageTargets: '80, 0, 0', onlyStable: false, - zoomCoverageChart: false + zoomCoverageChart: false */ } } From c79233a1a80a62f996d7b9566ca4ac3ee9be7558 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 14:14:20 +0100 Subject: [PATCH 306/411] Added lines relating to 'cobertura' back in to test what they are. --- Jenkinsfile | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cf8fa370b..474021c23 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,7 +106,7 @@ pipeline { //coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - /*cobertura autoUpdateHealth: false, + cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'python_coverage.xml', conditionalCoverageTargets: '70, 0, 0', @@ -116,10 +116,38 @@ pipeline { maxNumberOfBuilds: 0, methodCoverageTargets: '80, 0, 0', onlyStable: false, - zoomCoverageChart: false */ + zoomCoverageChart: false } } + stage('Build and publish Python Package') { + when { + anyOf{ + branch BUILD_BRANCH + //tag BUILD_TAG + } + beforeAgent true + } + agent { label "test.${agentPython3Version}" } + steps { + onStage() + colourText('info', "Building Python package.") + unstash name: 'Checkout' + unstash name: 'venv' + + sh ''' + . venv/bin/activate + pip3 install wheel==0.29.0 + python3 setup.py build bdist_wheel + ''' + + script { + pushToPyPiArtifactoryRepo_temp("${buildInfo.name}", "", "dist/*") + } + } + } + + } From 05d3c56bf169a3dc72259f05bf3782b1027eb498 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 14:23:48 +0100 Subject: [PATCH 307/411] Removed cobertura. These relate to coverage report using xml. 
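Note: the commented-out lines did two separate jobs: "coverage xml" converts the collected data into a Cobertura-format python_coverage.xml, and the cobertura step publishes that file in Jenkins. The same report can also be produced through coverage.py's Python API, which is what the shell commands drive; a rough sketch, not part of this change, with the source package and paths purely illustrative:

    # Illustrative sketch of the coverage.py API behind the shell commands:
    # collect branch coverage for the src package, then write the XML report
    # that the cobertura step would publish.
    import coverage

    cov = coverage.Coverage(branch=True, source=["src"])
    cov.start()
    # ... run the test suite here (e.g. pytest.main(["-ra", "tests"])) ...
    cov.stop()
    cov.save()
    total = cov.report()              # prints a summary table, returns the overall %
    cov.xml_report(outfile="python_coverage.xml")
    print("total coverage: " + str(total))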
--- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 474021c23..79c0cf587 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,7 +106,7 @@ pipeline { //coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - cobertura autoUpdateHealth: false, + /*cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'python_coverage.xml', conditionalCoverageTargets: '70, 0, 0', @@ -116,7 +116,7 @@ pipeline { maxNumberOfBuilds: 0, methodCoverageTargets: '80, 0, 0', onlyStable: false, - zoomCoverageChart: false + zoomCoverageChart: false */ } } From 89b3fc69e88ecc1299892282fb6eb89095b6f750 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 14:54:54 +0100 Subject: [PATCH 308/411] Removed import src.main as pre-commit complains it is not used. Left comment in unit test stage of Jenkinsfile. --- Jenkinsfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 79c0cf587..5ae7af9c8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -103,10 +103,14 @@ pipeline { coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests ''' - //coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} + /* + // Lines below create a coverage report for on Jenkins. Currently commented out + // as it gives errors when no imports are used in unit tests. import src.main + // causes pre-commit to complain. Easier to leave out for now. + coverage xml -o python_coverage.xml && coverage report -m --fail-under=${MIN_COVERAGE_PC} - /*cobertura autoUpdateHealth: false, + cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'python_coverage.xml', conditionalCoverageTargets: '70, 0, 0', From fedc7aeca8185d8ff851ab725f5446c6720d95ab Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 14:56:08 +0100 Subject: [PATCH 309/411] Created setup.py. Needed for build stage in Jenkinsfile. --- setup.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..e69de29bb From f56f28387cc5f354ed9f081ba767f5bc7515ca74 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 14:57:41 +0100 Subject: [PATCH 310/411] Copied setup.py code from pub_sec project. --- setup.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/setup.py b/setup.py index e69de29bb..8e37d1659 100644 --- a/setup.py +++ b/setup.py @@ -0,0 +1,31 @@ +"""Setup script for creating package from code.""" +from setuptools import setup, find_packages +import re + +# Specify and open the version file +VERSION_FILE = "pub_sec/_version.py" +verstrline = open(VERSION_FILE, "rt").read() +print(verstrline) + +# Automatically detect the package version from VERSION_FILE +VERSION_REGEX = r"^__version__ = ['\"]([^'\"]*)['\"]" +mo = re.search(VERSION_REGEX, verstrline, re.M) +if mo: + version_string = mo.group(1) +else: + raise RuntimeError("Unable to find version string in %s." 
% (VERSION_FILE,)) + +with open("requirements.txt") as f: + requirements = f.read().splitlines() + +setup( + name="pub_sec", + version=version_string, + description="Public Sector local Python downloads and preprocessing package", + url="https://gitlab-app-l-01/EPDS/pub_sec", + packages=find_packages(), + package_data={"": ["*.toml", "*.r", "*.R", "*.pem"]}, + include_package_data=True, + zip_safe=False, + install_requires=requirements, +) From 6ca2f49eb41db88378b1cc9fe1895c3f17ad952e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 15:00:46 +0100 Subject: [PATCH 311/411] Changed path for _version.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e37d1659..f03e91ef6 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import re # Specify and open the version file -VERSION_FILE = "pub_sec/_version.py" +VERSION_FILE = "src/_version.py" verstrline = open(VERSION_FILE, "rt").read() print(verstrline) From d32bb4e8c00c37351f8f1ad4a2bba0c75a3cb88f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 15:05:15 +0100 Subject: [PATCH 312/411] Changed set up name and url to match this project. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f03e91ef6..9a386084d 100644 --- a/setup.py +++ b/setup.py @@ -19,10 +19,10 @@ requirements = f.read().splitlines() setup( - name="pub_sec", + name="research-and-development", version=version_string, description="Public Sector local Python downloads and preprocessing package", - url="https://gitlab-app-l-01/EPDS/pub_sec", + url="https://github.com/ONSdigital/research-and-development", packages=find_packages(), package_data={"": ["*.toml", "*.r", "*.R", "*.pem"]}, include_package_data=True, From 160ec19a297177d92c940b8239222fb371a05e2f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 31 Mar 2023 15:43:35 +0100 Subject: [PATCH 313/411] Changed artifactory repo to yr-python. 
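Note on the setup.py introduced in patches 309-312: the build stage will only work if src/_version.py exists and contains a dunder version assignment for the regex to find; open() raises FileNotFoundError otherwise. An illustrative check of that regex against the kind of line the file needs to contain (the version number here is made up):

    # The regex from setup.py applied to the sort of line src/_version.py
    # is expected to hold, e.g.  __version__ = "0.0.1"
    import re

    VERSION_REGEX = r"^__version__ = ['\"]([^'\"]*)['\"]"
    sample_version_file = '__version__ = "0.0.1"\n'   # illustrative contents

    match = re.search(VERSION_REGEX, sample_version_file, re.M)
    print(match.group(1))                             # -> 0.0.1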
--- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5ae7af9c8..146430780 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { environment { ARTIFACTORY_CREDS = 's_jenkins_epds' - ARTIFACTORY_PYPI_REPO = 'LR_EPDS_pypi' + ARTIFACTORY_PYPI_REPO = 'yr-python' PROJECT_NAME = 'resdev' BUILD_BRANCH = '142_jenkinsFile_RAP' // Any commits to this branch will create a build in artifactory BUILD_TAG = 'v*' // Any commits tagged with this pattern will create a build in artifactory From 5b6ae8f6caa9e89e0501c2ad25b3abc5c34728b5 Mon Sep 17 00:00:00 2001 From: westwj1 Date: Thu, 13 Apr 2023 11:42:14 +0100 Subject: [PATCH 314/411] Code to convert env to pip --- .gitignore | 2 +- notebooks/scripting_env_converter.ipynb | 107 ++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 notebooks/scripting_env_converter.ipynb diff --git a/.gitignore b/.gitignore index 56ccf9a73..e70baeb69 100644 --- a/.gitignore +++ b/.gitignore @@ -93,7 +93,7 @@ doc/_build/ target/ # Jupyter Notebook -notebooks/* +# notebooks/* !notebooks/.gitkeep !notebooks/.README.md .ipynb_checkpoints diff --git a/notebooks/scripting_env_converter.ipynb b/notebooks/scripting_env_converter.ipynb new file mode 100644 index 000000000..99acbdda6 --- /dev/null +++ b/notebooks/scripting_env_converter.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import yaml\n", + "\n", + "env_path = \"/home/cdsw/research-and-development/environment.yml\"\n", + "\n", + "\n", + "def dependency_reader(env_file=env_path):\n", + " env_contents = yaml.safe_load(open(env_file))\n", + " dependencies = env_contents[\"dependencies\"]\n", + " return dependencies\n", + "\n", + "\n", + "dependencies = dependency_reader(env_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_joiner(all_dependencies):\n", + " conda_dependencies = [dep for dep in all_dependencies if isinstance(dep, str)]\n", + " # The pip section is always the last element\n", + " pip_dependencies = all_dependencies[-1][\"pip\"]\n", + " deps_comb = pip_dependencies + conda_dependencies\n", + " return deps_comb\n", + "\n", + "\n", + "deps_combnd = dependency_joiner(dependencies)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", + "def sub_equals(dep):\n", + " patt = re.compile(r\"(?<=[a-z])=(?=\\d)\")\n", + " dep = re.sub(patt, \"==\", dep)\n", + " return dep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "deps_combnd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip_corrected = map(sub_equals, deps_combnd)\n", + "list(pip_corrected)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 
3c9535e50deb0ae3c8c325f8ec43c3c5a5259f27 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 19 Apr 2023 17:18:23 +0100 Subject: [PATCH 315/411] Added environment.yml relative path --- yml_to_requirements_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py index edd5d14ef..0d6923801 100644 --- a/yml_to_requirements_convert.py +++ b/yml_to_requirements_convert.py @@ -1,7 +1,7 @@ import yaml # yaml = ruamel.yaml.YAML() -data = yaml.safe_load(open("/home/cdsw/research-and-development/environment.yml")) +data = yaml.safe_load(open("./environment.yml")) requirements = [] for dep in data["dependencies"]: From e49f649d0f6f80cf4f248d5eb5e3cf3771125a5d Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 19 Apr 2023 17:18:53 +0100 Subject: [PATCH 316/411] Added requirements.txt relative path --- yml_to_requirements_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py index 0d6923801..49d28b6d3 100644 --- a/yml_to_requirements_convert.py +++ b/yml_to_requirements_convert.py @@ -19,6 +19,6 @@ requirements.append(preq) print(requirements) -with open("/home/cdsw/research-and-development/requirements.txt", "w") as fp: +with open("./requirements.txt", "w") as fp: for requirement in requirements: print(requirement, file=fp) From f0ff2d92cd0678ddf2c69aad5d500ec51236726d Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 19 Apr 2023 17:40:09 +0100 Subject: [PATCH 317/411] Added more descriptive variable name for environment.yml --- yml_to_requirements_convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py index 49d28b6d3..2d57768ef 100644 --- a/yml_to_requirements_convert.py +++ b/yml_to_requirements_convert.py @@ -1,10 +1,10 @@ import yaml # yaml = ruamel.yaml.YAML() -data = yaml.safe_load(open("./environment.yml")) +yml_env = yaml.safe_load(open("./environment.yml")) requirements = [] -for dep in data["dependencies"]: +for dep in yml_env["dependencies"]: if isinstance(dep, str): if "=" in dep: package, package_version = dep.split("=") From 95987e9fc27d81ab1958564263e090138240f663 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 19 Apr 2023 17:51:03 +0100 Subject: [PATCH 318/411] Put code into a function --- yml_to_requirements_convert.py | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py index 2d57768ef..01d54840b 100644 --- a/yml_to_requirements_convert.py +++ b/yml_to_requirements_convert.py @@ -1,24 +1,25 @@ import yaml -# yaml = ruamel.yaml.YAML() -yml_env = yaml.safe_load(open("./environment.yml")) -requirements = [] -for dep in yml_env["dependencies"]: - if isinstance(dep, str): - if "=" in dep: - package, package_version = dep.split("=") - if package == "python": - requirements.append(package + "==3.6.2") +def yml_convertor(): + + yml_env = yaml.safe_load(open("./environment.yml")) + + requirements = [] + for dep in yml_env["dependencies"]: + if isinstance(dep, str): + if "=" in dep: + package, package_version = dep.split("=") + if package == "python": + requirements.append(package + "==3.6.2") + else: + requirements.append(package + "==" + package_version) else: - requirements.append(package + "==" + package_version) - else: - requirements.append(dep) - elif isinstance(dep, dict): - for preq in 
dep.get("pip", []): - requirements.append(preq) - print(requirements) + requirements.append(dep) + elif isinstance(dep, dict): + for preq in dep.get("pip", []): + requirements.append(preq) -with open("./requirements.txt", "w") as fp: - for requirement in requirements: - print(requirement, file=fp) + with open("./requirements.txt", "w") as fp: + for requirement in requirements: + print(requirement, file=fp) From a9fff260422949dccaf422499b22e23f939f645c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:12:47 +0100 Subject: [PATCH 319/411] Created new yml_convetor.py file and added function to read dependencies --- yml_convertor.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 yml_convertor.py diff --git a/yml_convertor.py b/yml_convertor.py new file mode 100644 index 000000000..b59513511 --- /dev/null +++ b/yml_convertor.py @@ -0,0 +1,21 @@ +import yaml + + +yml_file = "./environment.yml" + + +def yml_dependencies(yml="./environment.yml") -> list: + """_summary_ + + Keyword Arguments: + yml -- _description_ (default: {"./environment.yml"}) + + Returns: + _description_ + """ + yml_env = yaml.safe_load(open(yml)) + yml_dep = yml_env["dependencies"] + return yml_dep + + +ydep = yml_dependencies() From db78d8bfdcb65ae92de8d6567a09617e843c3e3e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:18:10 +0100 Subject: [PATCH 320/411] Added function that returns a list of the conda dependencies from the yml file --- yml_convertor.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/yml_convertor.py b/yml_convertor.py index b59513511..6c15b4185 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -19,3 +19,20 @@ def yml_dependencies(yml="./environment.yml") -> list: ydep = yml_dependencies() + + +def yml_conda_dependencies(dep_list) -> list: + """_summary_ + + Arguments: + dep_list -- _description_ + + Returns: + _description_ + """ + yml_conda = dep_list[:-1] + return yml_conda + + +y_condadep = yml_conda_dependencies(ydep) +y_condadep.sort() From 8fde9efb81f2e3e0f63ffebf87f3f9674249af36 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:19:13 +0100 Subject: [PATCH 321/411] Added function that returns a list of the pip dependencies from the yml file --- yml_convertor.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/yml_convertor.py b/yml_convertor.py index 6c15b4185..e5b49aa15 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -36,3 +36,20 @@ def yml_conda_dependencies(dep_list) -> list: y_condadep = yml_conda_dependencies(ydep) y_condadep.sort() + + +def yml_pip_dependencies(dep_list) -> list: + """_summary_ + + Arguments: + dep_list -- _description_ + + Returns: + _description_ + """ + yml_pip = dep_list[-1]["pip"] + return yml_pip + + +y_pipdep = yml_pip_dependencies(ydep) +y_pipdep.sort() From aacdf076dfe80d68d8ea7d87a61b89230dae3a65 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:21:11 +0100 Subject: [PATCH 322/411] Added a function to check if the requirements.txt file exists. Returns a bool. 
--- yml_convertor.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/yml_convertor.py b/yml_convertor.py index e5b49aa15..1efb21b1c 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -1,5 +1,5 @@ import yaml - +import os yml_file = "./environment.yml" @@ -53,3 +53,22 @@ def yml_pip_dependencies(dep_list) -> list: y_pipdep = yml_pip_dependencies(ydep) y_pipdep.sort() + + +req_file = "./requirements.txt" + + +def req_check(req="./requirements.txt") -> bool: + """_summary_ + + Keyword Arguments: + req -- _description_ (default: {"./requirements.txt"}) + + Returns: + _description_ + """ + isFile = os.path.isfile(req) + return isFile + + +req_exist = req_check(req_file) From 6567d5c45369dccd82d8e270e346b7ce1a678f6e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:22:18 +0100 Subject: [PATCH 323/411] Added a function to create the requirements.txt file if it doesn't exist. --- yml_convertor.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/yml_convertor.py b/yml_convertor.py index 1efb21b1c..6b3aa10b6 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -72,3 +72,23 @@ def req_check(req="./requirements.txt") -> bool: req_exist = req_check(req_file) + + +def req_create(req) -> bool: + """_summary_ + + Arguments: + req -- _description_ + + Returns: + _description_ + """ + if not req: + f = open(req, "x") + f.close() + return True + else: + return False + + +check = req_create() From 662de6ea8160096e28b632059f95fd2907f7b5b8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 15:43:47 +0100 Subject: [PATCH 324/411] Added a function to concatenate the conda and pip dependencies into a sorted list --- yml_convertor.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/yml_convertor.py b/yml_convertor.py index 6b3aa10b6..6b1b0e252 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -55,6 +55,24 @@ def yml_pip_dependencies(dep_list) -> list: y_pipdep.sort() +def deps_combnd(conda_deps, pip_deps) -> list: + """_summary_ + + Arguments: + conda_deps -- _description_ + pip_deps -- _description_ + + Returns: + _description_ + """ + full_deps = conda_deps + pip_deps + full_deps.sort() + return full_deps + + +dependencies = deps_combnd(y_condadep, y_pipdep) +print(dependencies) + req_file = "./requirements.txt" @@ -89,6 +107,3 @@ def req_create(req) -> bool: return True else: return False - - -check = req_create() From 8a57984325cff96e8e1c2dc031211f8cd03362ae Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 16:58:52 +0100 Subject: [PATCH 325/411] Created a function to compare dependencies from environemt.yml and requirements.txt and add the unique deps to requirements.txt --- yml_convertor.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/yml_convertor.py b/yml_convertor.py index 6b1b0e252..af986360b 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -71,7 +71,6 @@ def deps_combnd(conda_deps, pip_deps) -> list: dependencies = deps_combnd(y_condadep, y_pipdep) -print(dependencies) req_file = "./requirements.txt" @@ -107,3 +106,37 @@ def req_create(req) -> bool: return True else: return False + + +check = req_create(req_file) + + +def req_compare(dep_file, dep_list) -> list: + """_summary_ + + Arguments: + dep_file -- _description_ + dep_list -- _description_ + + Returns: + _description_ + """ + f = open(dep_file, "w+") + req_existing = f.read() + req_list = req_existing.split("\n") + 
req_list.sort() + + unique_deps = list(set(dep_list) - set(req_list)) + unique_deps.sort() + + # full_deps = req_list + unique_deps + # full_deps.sort() + + for line in unique_deps: + f.write(f"{line}\n") + f.close() + + return unique_deps + + +test = req_compare(req_file, dependencies) From 37835f3da8ad9a4497d1912669d129eecd73614c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:18:34 +0100 Subject: [PATCH 326/411] Created function to compare environment.yml to requirements.txt and make edits. --- yml_convertor.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/yml_convertor.py b/yml_convertor.py index af986360b..2a71ba77f 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -31,11 +31,11 @@ def yml_conda_dependencies(dep_list) -> list: _description_ """ yml_conda = dep_list[:-1] + yml_conda.sort() return yml_conda y_condadep = yml_conda_dependencies(ydep) -y_condadep.sort() def yml_pip_dependencies(dep_list) -> list: @@ -48,11 +48,11 @@ def yml_pip_dependencies(dep_list) -> list: _description_ """ yml_pip = dep_list[-1]["pip"] + yml_pip.sort() return yml_pip y_pipdep = yml_pip_dependencies(ydep) -y_pipdep.sort() def deps_combnd(conda_deps, pip_deps) -> list: @@ -129,9 +129,6 @@ def req_compare(dep_file, dep_list) -> list: unique_deps = list(set(dep_list) - set(req_list)) unique_deps.sort() - # full_deps = req_list + unique_deps - # full_deps.sort() - for line in unique_deps: f.write(f"{line}\n") f.close() From efda359552b72110a6591e0b283185c685781a88 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:21:26 +0100 Subject: [PATCH 327/411] Moved all tester code to the bottom of yml_convertor.py --- yml_convertor.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/yml_convertor.py b/yml_convertor.py index 2a71ba77f..52921bfd0 100644 --- a/yml_convertor.py +++ b/yml_convertor.py @@ -1,8 +1,6 @@ import yaml import os -yml_file = "./environment.yml" - def yml_dependencies(yml="./environment.yml") -> list: """_summary_ @@ -18,9 +16,6 @@ def yml_dependencies(yml="./environment.yml") -> list: return yml_dep -ydep = yml_dependencies() - - def yml_conda_dependencies(dep_list) -> list: """_summary_ @@ -35,9 +30,6 @@ def yml_conda_dependencies(dep_list) -> list: return yml_conda -y_condadep = yml_conda_dependencies(ydep) - - def yml_pip_dependencies(dep_list) -> list: """_summary_ @@ -52,9 +44,6 @@ def yml_pip_dependencies(dep_list) -> list: return yml_pip -y_pipdep = yml_pip_dependencies(ydep) - - def deps_combnd(conda_deps, pip_deps) -> list: """_summary_ @@ -70,11 +59,6 @@ def deps_combnd(conda_deps, pip_deps) -> list: return full_deps -dependencies = deps_combnd(y_condadep, y_pipdep) - -req_file = "./requirements.txt" - - def req_check(req="./requirements.txt") -> bool: """_summary_ @@ -88,9 +72,6 @@ def req_check(req="./requirements.txt") -> bool: return isFile -req_exist = req_check(req_file) - - def req_create(req) -> bool: """_summary_ @@ -108,9 +89,6 @@ def req_create(req) -> bool: return False -check = req_create(req_file) - - def req_compare(dep_file, dep_list) -> list: """_summary_ @@ -136,4 +114,12 @@ def req_compare(dep_file, dep_list) -> list: return unique_deps +yml_file = "./environment.yml" +ydep = yml_dependencies() +y_condadep = yml_conda_dependencies(ydep) +y_pipdep = yml_pip_dependencies(ydep) +dependencies = deps_combnd(y_condadep, y_pipdep) +req_file = "./requirements.txt" +req_exist = req_check(req_file) +check = req_create(req_file) test = req_compare(req_file, 
dependencies) From 90444e12a41435f57e5924a08a8fc7bc45150594 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:22:22 +0100 Subject: [PATCH 328/411] Reformatted requirements.txt --- requirements.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index f77a86c7c..701da2aee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ -# python==3.6 -coverage==4.5.4 -pytest -pyyaml -pandas -numpy -requests -sphinx -pip arrow cookiecutter +coverage detect-secrets myst-parser +numpy +pandas +pip pre-commit==2.17.0 +pytest python-dotenv +python=3 +pyyaml +requests +sphinx table_logger pandas==1.1.5 numpy From f17099e85af18ce6a7cf8acb8cd40ea3e4ccf1f9 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:25:17 +0100 Subject: [PATCH 329/411] Some print statements in env converter notebook. --- notebooks/scripting_env_converter.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/scripting_env_converter.ipynb b/notebooks/scripting_env_converter.ipynb index 99acbdda6..3c5ecf3f2 100644 --- a/notebooks/scripting_env_converter.ipynb +++ b/notebooks/scripting_env_converter.ipynb @@ -99,7 +99,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.8" } }, "nbformat": 4, From c760e7530d259d33452b4d4edc53e0cf4a4c4a72 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:34:20 +0100 Subject: [PATCH 330/411] Added yml_convertor.py to src/utils/ as this is more sensible location. --- src/utils/yml_convertor.py | 125 +++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 src/utils/yml_convertor.py diff --git a/src/utils/yml_convertor.py b/src/utils/yml_convertor.py new file mode 100644 index 000000000..d75b05e52 --- /dev/null +++ b/src/utils/yml_convertor.py @@ -0,0 +1,125 @@ +import yaml +import os + + +def yml_dependencies(yml="../../environment.yml") -> list: + """_summary_ + + Keyword Arguments: + yml -- _description_ (default: {"../../environment.yml"}) + + Returns: + _description_ + """ + yml_env = yaml.safe_load(open(yml)) + yml_dep = yml_env["dependencies"] + return yml_dep + + +def yml_conda_dependencies(dep_list) -> list: + """_summary_ + + Arguments: + dep_list -- _description_ + + Returns: + _description_ + """ + yml_conda = dep_list[:-1] + yml_conda.sort() + return yml_conda + + +def yml_pip_dependencies(dep_list) -> list: + """_summary_ + + Arguments: + dep_list -- _description_ + + Returns: + _description_ + """ + yml_pip = dep_list[-1]["pip"] + yml_pip.sort() + return yml_pip + + +def deps_combnd(conda_deps, pip_deps) -> list: + """_summary_ + + Arguments: + conda_deps -- _description_ + pip_deps -- _description_ + + Returns: + _description_ + """ + full_deps = conda_deps + pip_deps + full_deps.sort() + return full_deps + + +def req_check(req="../../requirements.txt") -> bool: + """_summary_ + + Keyword Arguments: + req -- _description_ (default: {"../../requirements.txt"}) + + Returns: + _description_ + """ + isFile = os.path.isfile(req) + return isFile + + +def req_create(req) -> bool: + """_summary_ + + Arguments: + req -- _description_ + + Returns: + _description_ + """ + if not req: + f = open(req, "x") + f.close() + return True + else: + return False + + +def req_compare(dep_file, dep_list) -> list: + """_summary_ + + Arguments: + dep_file -- _description_ + dep_list -- _description_ + + Returns: + _description_ + """ + f = 
open(dep_file, "w+") + req_existing = f.read() + req_list = req_existing.split("\n") + req_list.sort() + + unique_deps = list(set(dep_list) - set(req_list)) + unique_deps.sort() + + for line in unique_deps: + f.write(f"{line}\n") + f.close() + + return unique_deps + + +yml_file = "../../environment.yml" +ydep = yml_dependencies() +y_condadep = yml_conda_dependencies(ydep) +y_pipdep = yml_pip_dependencies(ydep) +dependencies = deps_combnd(y_condadep, y_pipdep) +req_file = "../../requirements.txt" +req_exist = req_check(req_file) +check = req_create(req_file) +test = req_compare(req_file, dependencies) From 7c72a2f1b04e4909eabe219d225e7ea0d7d180ce Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:35:13 +0100 Subject: [PATCH 331/411] Moved yml_convertor.py from top level directory, moved to src/utils. --- yml_convertor.py | 125 ----------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 yml_convertor.py diff --git a/yml_convertor.py b/yml_convertor.py deleted file mode 100644 index 52921bfd0..000000000 --- a/yml_convertor.py +++ /dev/null @@ -1,125 +0,0 @@ -import yaml -import os - - -def yml_dependencies(yml="./environment.yml") -> list: - """_summary_ - - Keyword Arguments: - yml -- _description_ (default: {"./environment.yml"}) - - Returns: - _description_ - """ - yml_env = yaml.safe_load(open(yml)) - yml_dep = yml_env["dependencies"] - return yml_dep - - -def yml_conda_dependencies(dep_list) -> list: - """_summary_ - - Arguments: - dep_list -- _description_ - - Returns: - _description_ - """ - yml_conda = dep_list[:-1] - yml_conda.sort() - return yml_conda - - -def yml_pip_dependencies(dep_list) -> list: - """_summary_ - - Arguments: - dep_list -- _description_ - - Returns: - _description_ - """ - yml_pip = dep_list[-1]["pip"] - yml_pip.sort() - return yml_pip - - -def deps_combnd(conda_deps, pip_deps) -> list: - """_summary_ - - Arguments: - conda_deps -- _description_ - pip_deps -- _description_ - - Returns: - _description_ - """ - full_deps = conda_deps + pip_deps - full_deps.sort() - return full_deps - - -def req_check(req="./requirements.txt") -> bool: - """_summary_ - - Keyword Arguments: - req -- _description_ (default: {"./requirements.txt"}) - - Returns: - _description_ - """ - isFile = os.path.isfile(req) - return isFile - - -def req_create(req) -> bool: - """_summary_ - - Arguments: - req -- _description_ - - Returns: - _description_ - """ - if not req: - f = open(req, "x") - f.close() - return True - else: - return False - - -def req_compare(dep_file, dep_list) -> list: - """_summary_ - - Arguments: - dep_file -- _description_ - dep_list -- _description_ - - Returns: - _description_ - """ - f = open(dep_file, "w+") - req_existing = f.read() - req_list = req_existing.split("\n") - req_list.sort() - - unique_deps = list(set(dep_list) - set(req_list)) - unique_deps.sort() - - for line in unique_deps: - f.write(f"{line}\n") - f.close() - - return unique_deps - - -yml_file = "./environment.yml" -ydep = yml_dependencies() -y_condadep = yml_conda_dependencies(ydep) -y_pipdep = yml_pip_dependencies(ydep) -dependencies = deps_combnd(y_condadep, y_pipdep) -req_file = "./requirements.txt" -req_exist = req_check(req_file) -check = req_create(req_file) -test = req_compare(req_file, dependencies) From c0e46464badba08fbbbbc70239aa049fa26ae5fc Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:36:24 +0100 Subject: [PATCH 332/411] Renamed yml_convertor.py to yml_converter.py. 
--- src/utils/{yml_convertor.py => yml_converter.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/utils/{yml_convertor.py => yml_converter.py} (100%) diff --git a/src/utils/yml_convertor.py b/src/utils/yml_converter.py similarity index 100% rename from src/utils/yml_convertor.py rename to src/utils/yml_converter.py From 3886cd466ef3618b7a5ead935e603135c9f9f7ba Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:42:20 +0100 Subject: [PATCH 333/411] Wrote docstring for yml_dependencies() function. --- src/utils/yml_converter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index d75b05e52..48fa64bf9 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -3,13 +3,18 @@ def yml_dependencies(yml="../../environment.yml") -> list: - """_summary_ + """Loads an environment.yml file into a list. The values + of the 'dependencies' dictionary, an entry in the safe_load + list, are separated into their own list and returned by the + function. Keyword Arguments: - yml -- _description_ (default: {"../../environment.yml"}) + yml -- .yml environment file to be passed to function + (default: {"../../environment.yml"}) Returns: - _description_ + A list: list containing the values of the 'dependencies' + dictionary. """ yml_env = yaml.safe_load(open(yml)) yml_dep = yml_env["dependencies"] From 891a02f080b49010b57b5dec9966cea4e42927d7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:45:19 +0100 Subject: [PATCH 334/411] Wrote docstring for yml_conda_dependencies() function. --- src/utils/yml_converter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 48fa64bf9..3db164b6c 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -22,13 +22,16 @@ def yml_dependencies(yml="../../environment.yml") -> list: def yml_conda_dependencies(dep_list) -> list: - """_summary_ + """Takes the total list of dependencies from the environment.yml + file (returned by yml_dependecies()) and returns only those that + are conda specific. Arguments: - dep_list -- _description_ + dep_list -- return value of yml_dependencies(). Total list + of dependencies from the environment.yml file. Returns: - _description_ + A list: sorted list containing dependencies unique to conda. """ yml_conda = dep_list[:-1] yml_conda.sort() From a4dba3f6c4f879c8b7494ab2bb615435e415e421 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:46:27 +0100 Subject: [PATCH 335/411] Wrote docstring for yml_pip_dependencies() function. --- src/utils/yml_converter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 3db164b6c..e29837e73 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -39,13 +39,16 @@ def yml_conda_dependencies(dep_list) -> list: def yml_pip_dependencies(dep_list) -> list: - """_summary_ + """akes the total list of dependencies from the environment.yml + file (returned by yml_dependecies()) and returns only those that + are pip specific. Arguments: - dep_list -- _description_ + dep_list -- return value of yml_dependencies(). Total list + of dependencies from the environment.yml file. Returns: - _description_ + A list: sorted list containing dependencies unique to pip. 
""" yml_pip = dep_list[-1]["pip"] yml_pip.sort() From f2f2c6dc7fc73426994d0e330eed5e9055d408a3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:49:03 +0100 Subject: [PATCH 336/411] Wrote docstring for deps_combnd() function. --- src/utils/yml_converter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index e29837e73..f9ed646e5 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -56,14 +56,15 @@ def yml_pip_dependencies(dep_list) -> list: def deps_combnd(conda_deps, pip_deps) -> list: - """_summary_ + """Combines the conda and pip dependencies lists into a single sorted + list. Arguments: - conda_deps -- _description_ - pip_deps -- _description_ + conda_deps -- list containing dependencies unique to conda + pip_deps -- list containing dependencies unique to pip Returns: - _description_ + A list: sorted list containing all dependencies from environment.yml """ full_deps = conda_deps + pip_deps full_deps.sort() From 4b37edde622337e180ada6e00d1a806d16af64e6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:51:12 +0100 Subject: [PATCH 337/411] Wrote docstring for req_check() function. --- src/utils/yml_converter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index f9ed646e5..22d105585 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -72,13 +72,14 @@ def deps_combnd(conda_deps, pip_deps) -> list: def req_check(req="../../requirements.txt") -> bool: - """_summary_ + """Checks if the requirements.txt file already exists or not. Keyword Arguments: - req -- _description_ (default: {"../../requirements.txt"}) + req -- relative path to the requirements.txt file. + (default: {"../../requirements.txt"}) Returns: - _description_ + A bool: boolean value indicating if file exists or not. """ isFile = os.path.isfile(req) return isFile From 5185adb282a32f8ab8480e71e0dda30e44a22b37 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 17:54:39 +0100 Subject: [PATCH 338/411] Wrote docstring for req_create() function. --- src/utils/yml_converter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 22d105585..43c50ee43 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -85,14 +85,16 @@ def req_check(req="../../requirements.txt") -> bool: return isFile -def req_create(req) -> bool: - """_summary_ +def req_create(req="../../requirements.txt") -> bool: + """Create a requirements.txt file if one doesn't exist, otherwise + do nothing. Arguments: - req -- _description_ + req -- relative path to the requirements.txt file. + (default: {"../../requirements.txt"}) Returns: - _description_ + A bool: boolean value, if True then file has been created, else False. """ if not req: f = open(req, "x") From 96f776421480493b4c50e727986d41ee7d3d2011 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Fri, 21 Apr 2023 18:01:00 +0100 Subject: [PATCH 339/411] Wrote docstring for req_compare() function. 
--- src/utils/yml_converter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 43c50ee43..0861b0597 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -105,14 +105,18 @@ def req_create(req="../../requirements.txt") -> bool: def req_compare(dep_file, dep_list) -> list: - """_summary_ + """Function to compare dependencies from environment.yml and + existing requirements.txt files. If there are differences in the + environment.yml file then the requirements.txt is updated accordingly Arguments: - dep_file -- _description_ - dep_list -- _description_ + dep_file -- relative path to the requirements.txt file. + (default: {"../../requirements.txt"}) + dep_list -- full list of dependencies from the environment.yml file. Returns: - _description_ + A list: list of differences between the environment.yml dependencies + and those in requirements.txt. """ f = open(dep_file, "w+") req_existing = f.read() From ba5079e57276acd17bf5b4e9f33e655088901138 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 14:16:23 +0100 Subject: [PATCH 340/411] Added new unit tests for yml_converter.py under tests/ folder. --- tests/test_yml_converter.py | 89 +++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tests/test_yml_converter.py diff --git a/tests/test_yml_converter.py b/tests/test_yml_converter.py new file mode 100644 index 000000000..a31d7a6f6 --- /dev/null +++ b/tests/test_yml_converter.py @@ -0,0 +1,89 @@ +"""Create a test suite for the yml conversion module.""" + +import pytest + + +def test_yml_dependencies(): + + # Arrange + from src.utils.yml_converter import yml_dependencies + + # Act: use pytest to assert the result + test = yml_dependencies() + + # Assert + assert type(test) == list + + pytest.raises(TypeError, yml_dependencies, 1) + + +def test_yml_conda_dependencies(): + + # Arrange + from src.utils.yml_converter import yml_conda_dependencies + + # Act: use pytest to assert the result + test = yml_conda_dependencies() + + # Assert + assert type(test) == list + + +def test_yml_pip_dependencies(): + + # Arrange + from src.utils.yml_converter import yml_pip_dependencies + + # Act: use pytest to assert the result + test = yml_pip_dependencies() + + # Assert + assert type(test) == list + + +def test_deps_combnd(): + + # Arrange + from src.utils.yml_converter import deps_combnd + + # Act: use pytest to assert the result + test = deps_combnd() + + # Assert + assert type(test) == list + + +def test_req_check(): + + # Arrange + from src.utils.yml_converter import req_check + + # Act: use pytest to assert the result + test = req_check() + + # Assert + assert type(test) == bool + + +def test_req_create(): + + # Arrange + from src.utils.yml_converter import req_create + + # Act: use pytest to assert the result + test = req_create() + + # Assert + assert type(test) == bool + + +def test_req_compare(): + + # Arrange + from src.utils.yml_converter import req_compare + + # Act: use pytest to assert the result + test = req_compare() + + # Assert + assert type(test) == list From 9e148cbc9a57154925e97a0c41bb3f582e92766c Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 16:50:53 +0100 Subject: [PATCH 341/411] Added function to list differences in yml and requirements files. Changed write function to only write when differences are present and in alphabetical order. 
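The comparison described here reduces to two set differences over dependency names. A small, self-contained sketch of that idea, using in-memory lists as stand-ins for the parsed environment.yml dependencies and the lines of requirements.txt (the package names are illustrative only):

# Stand-ins for parsed environment.yml deps and requirements.txt lines.
env_deps = ["coverage", "pyyaml", "requests", "sphinx", "toml"]
req_lines = ["coverage", "numpy", "pyyaml", "requests"]

# In environment.yml but missing from requirements.txt
missing_from_req = sorted(set(env_deps) - set(req_lines))
# In requirements.txt but not in environment.yml
missing_from_env = sorted(set(req_lines) - set(env_deps))

print(missing_from_req)  # ['sphinx', 'toml']
print(missing_from_env)  # ['numpy']
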
--- src/utils/yml_converter.py | 71 +++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 0861b0597..6c7218dbc 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -104,35 +104,80 @@ def req_create(req="../../requirements.txt") -> bool: return False -def req_compare(dep_file, dep_list) -> list: +def req_compare(dep_list, dep_file="../../requirements.txt") -> tuple: + """Function to compare dependencies from environment.yml and + existing requirements.txt files. The differences in dependencies + between the two files is returned. + + Arguments: + dep_list -- full list of dependencies from the environment.yml file. + dep_file -- relative path to the requirements.txt file. + (default: {"../../requirements.txt"}) + + + Returns: + A tuple: tuple containing two lists. The first list contains the + differences between the environment.yml dependencies and those in + requirements.txt. List two contains the reverse. + """ + f = open(dep_file, "r") + req_existing = f.read() + req_list = req_existing.split("\n") + req_list.sort() + f.close() + + unique_deps_1 = list(set(dep_list) - set(req_list)) + unique_deps_1.sort() + unique_deps_2 = list(set(req_list) - set(dep_list)) + unique_deps_2.sort() + + return unique_deps_1, unique_deps_2 + + +def req_write(dep_list, dep_file="../../requirements.txt") -> list: """Function to compare dependencies from environment.yml and existing requirements.txt files. If there are differences in the environment.yml file then the requirements.txt is updated accordingly Arguments: + dep_list -- full list of dependencies from the environment.yml file. dep_file -- relative path to the requirements.txt file. (default: {"../../requirements.txt"}) - dep_list -- full list of dependencies from the environment.yml file. + Returns: A list: list of differences between the environment.yml dependencies and those in requirements.txt. """ - f = open(dep_file, "w+") - req_existing = f.read() - req_list = req_existing.split("\n") - req_list.sort() - unique_deps = list(set(dep_list) - set(req_list)) - unique_deps.sort() + diff = req_compare(dep_list, dep_file) - for line in unique_deps: - f.write(f"{line}\n") - f.close() + if not diff[0]: + msg = "No unique dependencies in environment.yml compared to requirements.txt." + return msg + else: + f = open(dep_file, "r+") + req_existing = f.read() + req_list = req_existing.split("\n") + sorted_req_list = sorted(req_list, key=str.casefold) + print(f"Req list = {sorted_req_list} \n") + + unique_deps = list(set(dep_list) - set(req_list)) + + total_deps = sorted_req_list[1:] + unique_deps + sorted_total_deps = sorted(total_deps, key=str.casefold) + + f.seek(0) + + for line in sorted_total_deps: + f.write(f"{line}\n") + f.close() - return unique_deps + msg = f"Difference in environment.yml and requirements.txt is {unique_deps}." + return msg +# Lines for testing the above code yml_file = "../../environment.yml" ydep = yml_dependencies() y_condadep = yml_conda_dependencies(ydep) @@ -141,4 +186,4 @@ def req_compare(dep_file, dep_list) -> list: req_file = "../../requirements.txt" req_exist = req_check(req_file) check = req_create(req_file) -test = req_compare(req_file, dependencies) +test = req_write(dependencies, req_file) From 5a4af2db829b4b24c39c59932c391c2fdacc4857 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 17:12:16 +0100 Subject: [PATCH 342/411] Added unit test for req_write() function. 
--- tests/test_yml_converter.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_yml_converter.py b/tests/test_yml_converter.py index a31d7a6f6..d79a06642 100644 --- a/tests/test_yml_converter.py +++ b/tests/test_yml_converter.py @@ -87,3 +87,15 @@ def test_req_compare(): # Assert assert type(test) == list + + +def test_req_write(): + + # Arrange + from src.utils.yml_converter import req_write + + # Act: use pytest to assert the result + test = req_write() + + # Assert + assert type(test) == str From 3a9da3cf30f235c4710fa46acb4812bea33dafef Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 17:25:24 +0100 Subject: [PATCH 343/411] Removed python=3 and replaced it with just 'python' in both environment.yml and requirements.txt. --- environment.yml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 3628a50a1..57e2cb585 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ name: resdev36 dependencies: - - python=3 + - python - coverage - pyyaml - requests diff --git a/requirements.txt b/requirements.txt index 701da2aee..f4f12d433 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,8 @@ pandas pip pre-commit==2.17.0 pytest +python python-dotenv -python=3 pyyaml requests sphinx From f23f751f02978c1eb276a4d5099ea4a84fffceb2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 17:26:11 +0100 Subject: [PATCH 344/411] Removed a print statement. --- src/utils/yml_converter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 6c7218dbc..784a5cd0d 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -160,7 +160,6 @@ def req_write(dep_list, dep_file="../../requirements.txt") -> list: req_existing = f.read() req_list = req_existing.split("\n") sorted_req_list = sorted(req_list, key=str.casefold) - print(f"Req list = {sorted_req_list} \n") unique_deps = list(set(dep_list) - set(req_list)) From 1401b964eaf09ef499756983d9c7823d04b8067f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 24 Apr 2023 17:46:56 +0100 Subject: [PATCH 345/411] Removed python requirement entirely as causing crashing in Jenkins. --- environment.yml | 1 - requirements.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/environment.yml b/environment.yml index 57e2cb585..5e4dc7a79 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,5 @@ name: resdev36 dependencies: - - python - coverage - pyyaml - requests diff --git a/requirements.txt b/requirements.txt index f4f12d433..25c46502d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,6 @@ pandas pip pre-commit==2.17.0 pytest -python python-dotenv pyyaml requests From 624931641b1174dcdd6d7ffec5e81c2bf9d16abb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 12:55:37 +0100 Subject: [PATCH 346/411] Amended relative paths to environment.yml and requirements.txt. Also removed 'testing lines' at end of yml_converter.py. --- src/utils/yml_converter.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index 784a5cd0d..a51f7f263 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -2,7 +2,7 @@ import os -def yml_dependencies(yml="../../environment.yml") -> list: +def yml_dependencies(yml="./environment.yml") -> list: """Loads an environment.yml file into a list. 
The values of the 'dependencies' dictionary, an entry in the safe_load list, are separated into their own list and returned by the @@ -71,12 +71,12 @@ def deps_combnd(conda_deps, pip_deps) -> list: return full_deps -def req_check(req="../../requirements.txt") -> bool: +def req_check(req="./requirements.txt") -> bool: """Checks if the requirements.txt file already exists or not. Keyword Arguments: req -- relative path to the requirements.txt file. - (default: {"../../requirements.txt"}) + (default: {"./requirements.txt"}) Returns: A bool: boolean value indicating if file exists or not. @@ -85,13 +85,13 @@ def req_check(req="../../requirements.txt") -> bool: return isFile -def req_create(req="../../requirements.txt") -> bool: +def req_create(req="./requirements.txt") -> bool: """Create a requirements.txt file if one doesn't exist, otherwise do nothing. Arguments: req -- relative path to the requirements.txt file. - (default: {"../../requirements.txt"}) + (default: {"./requirements.txt"}) Returns: A bool: boolean value, if True then file has been created, else False. @@ -104,7 +104,7 @@ def req_create(req="../../requirements.txt") -> bool: return False -def req_compare(dep_list, dep_file="../../requirements.txt") -> tuple: +def req_compare(dep_list, dep_file="./requirements.txt") -> tuple: """Function to compare dependencies from environment.yml and existing requirements.txt files. The differences in dependencies between the two files is returned. @@ -112,7 +112,7 @@ def req_compare(dep_list, dep_file="../../requirements.txt") -> tuple: Arguments: dep_list -- full list of dependencies from the environment.yml file. dep_file -- relative path to the requirements.txt file. - (default: {"../../requirements.txt"}) + (default: {"./requirements.txt"}) Returns: @@ -134,7 +134,7 @@ def req_compare(dep_list, dep_file="../../requirements.txt") -> tuple: return unique_deps_1, unique_deps_2 -def req_write(dep_list, dep_file="../../requirements.txt") -> list: +def req_write(dep_list, dep_file="./requirements.txt") -> list: """Function to compare dependencies from environment.yml and existing requirements.txt files. If there are differences in the environment.yml file then the requirements.txt is updated accordingly @@ -142,7 +142,7 @@ def req_write(dep_list, dep_file="../../requirements.txt") -> list: Arguments: dep_list -- full list of dependencies from the environment.yml file. dep_file -- relative path to the requirements.txt file. - (default: {"../../requirements.txt"}) + (default: {"./requirements.txt"}) Returns: @@ -174,15 +174,3 @@ def req_write(dep_list, dep_file="../../requirements.txt") -> list: msg = f"Difference in environment.yml and requirements.txt is {unique_deps}." return msg - - -# Lines for testing the above code -yml_file = "../../environment.yml" -ydep = yml_dependencies() -y_condadep = yml_conda_dependencies(ydep) -y_pipdep = yml_pip_dependencies(ydep) -dependencies = deps_combnd(y_condadep, y_pipdep) -req_file = "../../requirements.txt" -req_exist = req_check(req_file) -check = req_create(req_file) -test = req_write(dependencies, req_file) From ea24a9655d10d405270b565a03e2e3e68356705a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 13:09:10 +0100 Subject: [PATCH 347/411] Added default values to all functions and added expected types for function arguments where missing. 
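One behaviour worth keeping in mind with defaults of the form dep_list=yml_dependencies(): Python evaluates default-argument expressions once, when the function is defined (so here, at import time), not on every call. A small generic illustration of that rule, unrelated to this codebase's files:

import time

def stamp(created_at=time.time()):
    # The default expression ran once, when 'stamp' was defined.
    return created_at

first = stamp()
time.sleep(0.01)
second = stamp()
assert first == second   # identical: the default was not re-evaluated
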
--- src/utils/yml_converter.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py index a51f7f263..3b20cbbfb 100644 --- a/src/utils/yml_converter.py +++ b/src/utils/yml_converter.py @@ -2,7 +2,7 @@ import os -def yml_dependencies(yml="./environment.yml") -> list: +def yml_dependencies(yml: str = "./environment.yml") -> list: """Loads an environment.yml file into a list. The values of the 'dependencies' dictionary, an entry in the safe_load list, are separated into their own list and returned by the @@ -21,7 +21,7 @@ def yml_dependencies(yml="./environment.yml") -> list: return yml_dep -def yml_conda_dependencies(dep_list) -> list: +def yml_conda_dependencies(dep_list=yml_dependencies()) -> list: """Takes the total list of dependencies from the environment.yml file (returned by yml_dependecies()) and returns only those that are conda specific. @@ -38,7 +38,7 @@ def yml_conda_dependencies(dep_list) -> list: return yml_conda -def yml_pip_dependencies(dep_list) -> list: +def yml_pip_dependencies(dep_list=yml_dependencies()) -> list: """akes the total list of dependencies from the environment.yml file (returned by yml_dependecies()) and returns only those that are pip specific. @@ -55,7 +55,9 @@ def yml_pip_dependencies(dep_list) -> list: return yml_pip -def deps_combnd(conda_deps, pip_deps) -> list: +def deps_combnd( + conda_deps=yml_conda_dependencies(), pip_deps=yml_pip_dependencies() +) -> list: """Combines the conda and pip dependencies lists into a single sorted list. @@ -71,7 +73,7 @@ def deps_combnd(conda_deps, pip_deps) -> list: return full_deps -def req_check(req="./requirements.txt") -> bool: +def req_check(req: str = "./requirements.txt") -> bool: """Checks if the requirements.txt file already exists or not. Keyword Arguments: @@ -85,7 +87,7 @@ def req_check(req="./requirements.txt") -> bool: return isFile -def req_create(req="./requirements.txt") -> bool: +def req_create(req: str = "./requirements.txt") -> bool: """Create a requirements.txt file if one doesn't exist, otherwise do nothing. @@ -104,7 +106,9 @@ def req_create(req="./requirements.txt") -> bool: return False -def req_compare(dep_list, dep_file="./requirements.txt") -> tuple: +def req_compare( + dep_list: str = "./environment.yml", dep_file: str = "./requirements.txt" +) -> tuple: """Function to compare dependencies from environment.yml and existing requirements.txt files. The differences in dependencies between the two files is returned. @@ -134,7 +138,9 @@ def req_compare(dep_list, dep_file="./requirements.txt") -> tuple: return unique_deps_1, unique_deps_2 -def req_write(dep_list, dep_file="./requirements.txt") -> list: +def req_write( + dep_list: str = "./environment.yml", dep_file="./requirements.txt" +) -> list: """Function to compare dependencies from environment.yml and existing requirements.txt files. If there are differences in the environment.yml file then the requirements.txt is updated accordingly From fc0e80449a26007df700e1b9774e2f8fdea696cf Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 4 May 2023 13:34:52 +0100 Subject: [PATCH 348/411] Fixed test_req_compare() expected value in assert. 
--- tests/test_yml_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_yml_converter.py b/tests/test_yml_converter.py index d79a06642..6d4e5847f 100644 --- a/tests/test_yml_converter.py +++ b/tests/test_yml_converter.py @@ -86,7 +86,7 @@ def test_req_compare(): test = req_compare() # Assert - assert type(test) == list + assert type(test) == tuple def test_req_write(): From 723372220c5079365a3115f294f7afaec74bdc73 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 10 May 2023 14:35:17 +0100 Subject: [PATCH 349/411] Updated requirements.txt with coverage==4.5.4. Need this version to avoid _sqlite3 ModuleNotFoundError in Jenkins. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 25c46502d..5d3d52077 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ arrow cookiecutter -coverage +coverage==4.5.4 detect-secrets myst-parser numpy From 04882092b0d7cac522c01dae515ee614c5ad68d0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 15 May 2023 10:26:03 +0100 Subject: [PATCH 350/411] Deleted scripting_env_converter.ipynb notebook. --- notebooks/scripting_env_converter.ipynb | 107 ------------------------ 1 file changed, 107 deletions(-) delete mode 100644 notebooks/scripting_env_converter.ipynb diff --git a/notebooks/scripting_env_converter.ipynb b/notebooks/scripting_env_converter.ipynb deleted file mode 100644 index 3c5ecf3f2..000000000 --- a/notebooks/scripting_env_converter.ipynb +++ /dev/null @@ -1,107 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pwd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import yaml\n", - "\n", - "env_path = \"/home/cdsw/research-and-development/environment.yml\"\n", - "\n", - "\n", - "def dependency_reader(env_file=env_path):\n", - " env_contents = yaml.safe_load(open(env_file))\n", - " dependencies = env_contents[\"dependencies\"]\n", - " return dependencies\n", - "\n", - "\n", - "dependencies = dependency_reader(env_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def dependency_joiner(all_dependencies):\n", - " conda_dependencies = [dep for dep in all_dependencies if isinstance(dep, str)]\n", - " # The pip section is always the last element\n", - " pip_dependencies = all_dependencies[-1][\"pip\"]\n", - " deps_comb = pip_dependencies + conda_dependencies\n", - " return deps_comb\n", - "\n", - "\n", - "deps_combnd = dependency_joiner(dependencies)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "\n", - "def sub_equals(dep):\n", - " patt = re.compile(r\"(?<=[a-z])=(?=\\d)\")\n", - " dep = re.sub(patt, \"==\", dep)\n", - " return dep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "deps_combnd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip_corrected = map(sub_equals, deps_combnd)\n", - "list(pip_corrected)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From d5c331dadebfcea887ecbf84be40c131813c0af6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 11:31:12 +0100 Subject: [PATCH 351/411] Deleted environment.yml and previous-environment.yml. --- environment.yml | 26 --------------- previous-environment.yml | 69 ---------------------------------------- 2 files changed, 95 deletions(-) delete mode 100644 environment.yml delete mode 100644 previous-environment.yml diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 5e4dc7a79..000000000 --- a/environment.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: resdev36 -dependencies: - - coverage - - pyyaml - - requests - - sphinx - - pip - - pip: - - arrow - - cookiecutter - - detect-secrets - - myst-parser - - more-itertools==5.0.0 - - numpy - - pandas==1.1.5 - - postcodes_uk - - pre-commit==2.17.0 - - pydoop - - pytest #default version 5 which is compatible with postcodes_uk - - pytest-cov - - python-dotenv - - readme-coverage-badger - - setuptools - - table_logger - - toml - - xlrd diff --git a/previous-environment.yml b/previous-environment.yml deleted file mode 100644 index be07ca20b..000000000 --- a/previous-environment.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: resdev310 -dependencies: - - arrow=1.2.2=pyhd3eb1b0_0 - - binaryornot=0.4.4=pyhd3eb1b0_1 - - brotlipy=0.7.0=py310h2bbff1b_1002 - - bzip2=1.0.8=he774522_0 - - ca-certificates=2022.10.11=haa95532_0 - - certifi=2022.9.24=py310haa95532_0 - - cffi=1.15.1=py310h2bbff1b_0 - - cfgv=3.3.1=py310haa95532_0 - - chardet=4.0.0=py310haa95532_1003 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - click=8.0.4=py310haa95532_0 - - colorama=0.4.5=py310haa95532_0 - - cookiecutter=1.7.3=pyhd3eb1b0_0 - - cryptography=37.0.1=py310h21b164f_0 - - distlib=0.3.2=pyhd3eb1b0_0 - - filelock=3.6.0=pyhd3eb1b0_0 - - identify=2.5.5=py310haa95532_0 - - idna=3.4=py310haa95532_0 - - jinja2=3.0.3=pyhd3eb1b0_0 - - jinja2-time=0.2.0=pyhd3eb1b0_3 - - libffi=3.4.2=hd77b12b_4 - - markupsafe=2.1.1=py310h2bbff1b_0 - - nodeenv=1.7.0=py310haa95532_0 - - openssl=1.1.1q=h2bbff1b_0 - - pip=22.2.2=py310haa95532_0 - - platformdirs=2.5.2=py310haa95532_0 - - poyo=0.5.0=pyhd3eb1b0_0 - - pre-commit=2.20.0=py310haa95532_0 - - pre_commit=2.20.0=hd3eb1b0_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py310haa95532_0 - - python=3.10.6=hbb2ffb3_0 - - python-dateutil=2.8.2=pyhd3eb1b0_0 - - python-slugify=5.0.2=pyhd3eb1b0_0 - - pyyaml=6.0=py310h2bbff1b_0 - - requests=2.28.1=py310haa95532_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.39.3=h2bbff1b_0 - - text-unidecode=1.3=pyhd3eb1b0_0 - - tk=8.6.12=h2bbff1b_0 - - toml=0.10.2=pyhd3eb1b0_0 - - typing_extensions=4.3.0=py310haa95532_0 - - tzdata=2022e=h04d1e81_0 - - ukkonen=1.0.1=py310h59b6b97_0 - - unidecode=1.2.0=pyhd3eb1b0_0 - - urllib3=1.26.11=py310haa95532_0 - - vc=14.2=h21ff451_1 - - virtualenv=20.16.2=py310haa95532_0 - - vs2015_runtime=14.27.29016=h5e58377_2 - - wheel=0.37.1=pyhd3eb1b0_0 - - win_inet_pton=1.1.0=py310haa95532_0 - - wincertstore=0.2=py310haa95532_2 - - xz=5.2.6=h8cc25b3_0 - - yaml=0.2.5=he774522_0 - - zlib=1.2.12=h8cc25b3_3 - - pip: - - setuptools==65.5.0 - - coverage - - detect-secrets==1.0.3 - - myst-parser - - pre-commit - - pytest - - python-dotenv - - Sphinx - - toml -prefix: C:\Users\westwj1\Anaconda3\envs\resdev310 From 5b045eb8122ce92d114dc5e791e7850cc193ee8c Mon Sep 17 00:00:00 2001 From: 
Roddy Macrae Date: Thu, 1 Jun 2023 11:35:03 +0100 Subject: [PATCH 352/411] Updated requirements.txt to match environment.yml recently deleted. --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5d3d52077..0b66299f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,13 +4,15 @@ coverage==4.5.4 detect-secrets myst-parser numpy -pandas +pandas==1.1.5 pip pre-commit==2.17.0 +pydoop pytest python-dotenv pyyaml requests +setuptools sphinx table_logger pandas==1.1.5 From 525575fc3bd5dfab8753e12b75ed92c72cdd84e5 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 11:40:09 +0100 Subject: [PATCH 353/411] Deleted script that converts environment.yml into a requirements.txt. --- yml_to_requirements_convert.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 yml_to_requirements_convert.py diff --git a/yml_to_requirements_convert.py b/yml_to_requirements_convert.py deleted file mode 100644 index 01d54840b..000000000 --- a/yml_to_requirements_convert.py +++ /dev/null @@ -1,25 +0,0 @@ -import yaml - - -def yml_convertor(): - - yml_env = yaml.safe_load(open("./environment.yml")) - - requirements = [] - for dep in yml_env["dependencies"]: - if isinstance(dep, str): - if "=" in dep: - package, package_version = dep.split("=") - if package == "python": - requirements.append(package + "==3.6.2") - else: - requirements.append(package + "==" + package_version) - else: - requirements.append(dep) - elif isinstance(dep, dict): - for preq in dep.get("pip", []): - requirements.append(preq) - - with open("./requirements.txt", "w") as fp: - for requirement in requirements: - print(requirement, file=fp) From adb1bc68244ce05c29689d8fe24fcda3c3ee1463 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 11:43:26 +0100 Subject: [PATCH 354/411] Deleted yml_converter.py and associated test file as only using requirements.txt file moving forward. --- src/utils/yml_converter.py | 182 ------------------------------------ tests/test_yml_converter.py | 101 -------------------- 2 files changed, 283 deletions(-) delete mode 100644 src/utils/yml_converter.py delete mode 100644 tests/test_yml_converter.py diff --git a/src/utils/yml_converter.py b/src/utils/yml_converter.py deleted file mode 100644 index 3b20cbbfb..000000000 --- a/src/utils/yml_converter.py +++ /dev/null @@ -1,182 +0,0 @@ -import yaml -import os - - -def yml_dependencies(yml: str = "./environment.yml") -> list: - """Loads an environment.yml file into a list. The values - of the 'dependencies' dictionary, an entry in the safe_load - list, are separated into their own list and returned by the - function. - - Keyword Arguments: - yml -- .yml environment file to be passed to function - (default: {"../../environment.yml"}) - - Returns: - A list: list containing the values of the 'dependencies' - dictionary. - """ - yml_env = yaml.safe_load(open(yml)) - yml_dep = yml_env["dependencies"] - return yml_dep - - -def yml_conda_dependencies(dep_list=yml_dependencies()) -> list: - """Takes the total list of dependencies from the environment.yml - file (returned by yml_dependecies()) and returns only those that - are conda specific. - - Arguments: - dep_list -- return value of yml_dependencies(). Total list - of dependencies from the environment.yml file. - - Returns: - A list: sorted list containing dependencies unique to conda. 
- """ - yml_conda = dep_list[:-1] - yml_conda.sort() - return yml_conda - - -def yml_pip_dependencies(dep_list=yml_dependencies()) -> list: - """akes the total list of dependencies from the environment.yml - file (returned by yml_dependecies()) and returns only those that - are pip specific. - - Arguments: - dep_list -- return value of yml_dependencies(). Total list - of dependencies from the environment.yml file. - - Returns: - A list: sorted list containing dependencies unique to pip. - """ - yml_pip = dep_list[-1]["pip"] - yml_pip.sort() - return yml_pip - - -def deps_combnd( - conda_deps=yml_conda_dependencies(), pip_deps=yml_pip_dependencies() -) -> list: - """Combines the conda and pip dependencies lists into a single sorted - list. - - Arguments: - conda_deps -- list containing dependencies unique to conda - pip_deps -- list containing dependencies unique to pip - - Returns: - A list: sorted list containing all dependencies from environment.yml - """ - full_deps = conda_deps + pip_deps - full_deps.sort() - return full_deps - - -def req_check(req: str = "./requirements.txt") -> bool: - """Checks if the requirements.txt file already exists or not. - - Keyword Arguments: - req -- relative path to the requirements.txt file. - (default: {"./requirements.txt"}) - - Returns: - A bool: boolean value indicating if file exists or not. - """ - isFile = os.path.isfile(req) - return isFile - - -def req_create(req: str = "./requirements.txt") -> bool: - """Create a requirements.txt file if one doesn't exist, otherwise - do nothing. - - Arguments: - req -- relative path to the requirements.txt file. - (default: {"./requirements.txt"}) - - Returns: - A bool: boolean value, if True then file has been created, else False. - """ - if not req: - f = open(req, "x") - f.close() - return True - else: - return False - - -def req_compare( - dep_list: str = "./environment.yml", dep_file: str = "./requirements.txt" -) -> tuple: - """Function to compare dependencies from environment.yml and - existing requirements.txt files. The differences in dependencies - between the two files is returned. - - Arguments: - dep_list -- full list of dependencies from the environment.yml file. - dep_file -- relative path to the requirements.txt file. - (default: {"./requirements.txt"}) - - - Returns: - A tuple: tuple containing two lists. The first list contains the - differences between the environment.yml dependencies and those in - requirements.txt. List two contains the reverse. - """ - f = open(dep_file, "r") - req_existing = f.read() - req_list = req_existing.split("\n") - req_list.sort() - f.close() - - unique_deps_1 = list(set(dep_list) - set(req_list)) - unique_deps_1.sort() - unique_deps_2 = list(set(req_list) - set(dep_list)) - unique_deps_2.sort() - - return unique_deps_1, unique_deps_2 - - -def req_write( - dep_list: str = "./environment.yml", dep_file="./requirements.txt" -) -> list: - """Function to compare dependencies from environment.yml and - existing requirements.txt files. If there are differences in the - environment.yml file then the requirements.txt is updated accordingly - - Arguments: - dep_list -- full list of dependencies from the environment.yml file. - dep_file -- relative path to the requirements.txt file. - (default: {"./requirements.txt"}) - - - Returns: - A list: list of differences between the environment.yml dependencies - and those in requirements.txt. 
- """ - - diff = req_compare(dep_list, dep_file) - - if not diff[0]: - msg = "No unique dependencies in environment.yml compared to requirements.txt." - return msg - else: - f = open(dep_file, "r+") - req_existing = f.read() - req_list = req_existing.split("\n") - sorted_req_list = sorted(req_list, key=str.casefold) - - unique_deps = list(set(dep_list) - set(req_list)) - - total_deps = sorted_req_list[1:] + unique_deps - sorted_total_deps = sorted(total_deps, key=str.casefold) - - f.seek(0) - - for line in sorted_total_deps: - f.write(f"{line}\n") - f.close() - - msg = f"Difference in environment.yml and requirements.txt is {unique_deps}." - return msg diff --git a/tests/test_yml_converter.py b/tests/test_yml_converter.py deleted file mode 100644 index 6d4e5847f..000000000 --- a/tests/test_yml_converter.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Create a test suite for the yml conversion module.""" - -import pytest - - -def test_yml_dependencies(): - - # Arrange - from src.utils.yml_converter import yml_dependencies - - # Act: use pytest to assert the result - test = yml_dependencies() - - # Assert - assert type(test) == list - - pytest.raises(TypeError, yml_dependencies, 1) - - -def test_yml_conda_dependencies(): - - # Arrange - from src.utils.yml_converter import yml_conda_dependencies - - # Act: use pytest to assert the result - test = yml_conda_dependencies() - - # Assert - assert type(test) == list - - -def test_yml_pip_dependencies(): - - # Arrange - from src.utils.yml_converter import yml_pip_dependencies - - # Act: use pytest to assert the result - test = yml_pip_dependencies() - - # Assert - assert type(test) == list - - -def test_deps_combnd(): - - # Arrange - from src.utils.yml_converter import deps_combnd - - # Act: use pytest to assert the result - test = deps_combnd() - - # Assert - assert type(test) == list - - -def test_req_check(): - - # Arrange - from src.utils.yml_converter import req_check - - # Act: use pytest to assert the result - test = req_check() - - # Assert - assert type(test) == bool - - -def test_req_create(): - - # Arrange - from src.utils.yml_converter import req_create - - # Act: use pytest to assert the result - test = req_create() - - # Assert - assert type(test) == bool - - -def test_req_compare(): - - # Arrange - from src.utils.yml_converter import req_compare - - # Act: use pytest to assert the result - test = req_compare() - - # Assert - assert type(test) == tuple - - -def test_req_write(): - - # Arrange - from src.utils.yml_converter import req_write - - # Act: use pytest to assert the result - test = req_write() - - # Assert - assert type(test) == str From 976ad7c82abd780b9633baa228c7306d2c190b91 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 12:47:40 +0100 Subject: [PATCH 355/411] Updated pytest-action.yaml to match that from branch 147. 
--- .github/workflows/pytest-action.yaml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index cfe923e8b..68f7d6180 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -2,14 +2,26 @@ name: Run pytest on pull request to develop # runs on every pull request to develop on: - pull_request: - branches: - - develop + pull_request: + branches: + - develop # runs on version 20.04 of ubuntu jobs: - pytest-coverage-comment: - runs-on: ubuntu-20.04 - steps: + pytest-coverage-comment: + runs-on: ubuntu-20.04 + steps: + # 1) Checkout the code + - uses: actions/checkout@v3 + # 2) Removing PyDoop from the environment yaml + - name: Remove pydoop requirements.txt + shell: bash -l {0} + run: | + awk '!/pydoop.*/' requirements.txt> temp && mv temp requirements.txt + # 3) Set up Python + - name: Set up Python 3.6 + uses: actions/setup-python@v4 + with: + python-version: "3.6" # 1) Checkout the code - uses: actions/checkout@v3 From 236987b4b731bcc56c7417786bf4cc88084c38e2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 12:50:00 +0100 Subject: [PATCH 356/411] Updated pytest-action.yaml to match that from branch 147. --- .github/workflows/pytest-action.yaml | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pytest-action.yaml b/.github/workflows/pytest-action.yaml index 68f7d6180..cfe923e8b 100644 --- a/.github/workflows/pytest-action.yaml +++ b/.github/workflows/pytest-action.yaml @@ -2,26 +2,14 @@ name: Run pytest on pull request to develop # runs on every pull request to develop on: - pull_request: - branches: - - develop + pull_request: + branches: + - develop # runs on version 20.04 of ubuntu jobs: - pytest-coverage-comment: - runs-on: ubuntu-20.04 - steps: - # 1) Checkout the code - - uses: actions/checkout@v3 - # 2) Removing PyDoop from the environment yaml - - name: Remove pydoop requirements.txt - shell: bash -l {0} - run: | - awk '!/pydoop.*/' requirements.txt> temp && mv temp requirements.txt - # 3) Set up Python - - name: Set up Python 3.6 - uses: actions/setup-python@v4 - with: - python-version: "3.6" + pytest-coverage-comment: + runs-on: ubuntu-20.04 + steps: # 1) Checkout the code - uses: actions/checkout@v3 From 7ec9e2248bcad060bfe733e268c8da94f87596b6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 14:39:05 +0100 Subject: [PATCH 357/411] Resolved duplicates and ordering in requirements.txt. --- requirements.txt | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0b66299f9..078a8c2ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,25 +6,16 @@ myst-parser numpy pandas==1.1.5 pip +postcodes_uk pre-commit==2.17.0 pydoop pytest python-dotenv pyyaml +readme-coverage-badger requests setuptools sphinx table_logger -pandas==1.1.5 -numpy -pydoop -setuptools -pytest -coverage -pyyaml -requests -sphinx -typing -readme-coverage-badger -postcodes_uk toml +typing From 12589bcd67c5072b666eb0efc8c1914ff45a0b9e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 14:51:29 +0100 Subject: [PATCH 358/411] Added wheel install in virtualenv stage of Jenkins. 
--- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 146430780..9139baff1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -75,6 +75,7 @@ pipeline { python -m pip install -U pip pip3 install pypandoc==1.7.5 + pip3 install wheel pip3 install -r requirements.txt pip3 install pyspark==2.4.0 From 6e52539cc9114cd77920440b017657acd5ea3bc6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 15:20:46 +0100 Subject: [PATCH 359/411] Removed build wheel lin ein Jenkinsfile. --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9139baff1..146430780 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -75,7 +75,6 @@ pipeline { python -m pip install -U pip pip3 install pypandoc==1.7.5 - pip3 install wheel pip3 install -r requirements.txt pip3 install pyspark==2.4.0 From 6275cab7f33f90327ebb57727e5a89846f4de1d5 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 13:56:56 +0100 Subject: [PATCH 360/411] Added line to remove pydoop from requirements.txt before getting installed. --- Jenkinsfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 146430780..c80e1f41a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -75,6 +75,10 @@ pipeline { python -m pip install -U pip pip3 install pypandoc==1.7.5 + + # Remove pydoop from requirements before it's installed. + awk '!/pydoop.*/' requirements.txt > temp && mv temp requirements.txt + pip3 install -r requirements.txt pip3 install pyspark==2.4.0 From 9b8bdac1b769b7c2acbb770f5aa1c3f4bc0f674a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 8 Jun 2023 20:08:03 +0100 Subject: [PATCH 361/411] Updated check_data_shape logger output with conditional on whether cols_match is False or not. --- src/data_validation/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c67514633..ee347c695 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -186,9 +186,12 @@ def check_data_shape( else: cols_match = False - validationlogger.warning(f"Data columns match schema: {cols_match}.") + if cols_match is False: + validationlogger.warning(f"Data columns match schema: {cols_match}.") + else: + validationlogger.info(f"Data columns match schema: {cols_match}.") + validationlogger.info( f"Length of data: {len(data_dict)}. Length of schema: {len(toml_string)}" ) - return cols_match From e8b91f724f2c76bbde433804a94437702aa4645a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 14:40:28 +0100 Subject: [PATCH 362/411] Updated Jenkinsfile with coverage command from pytest-action.yaml. Should omit hdfs_mods tests causing pydoop error. --- Jenkinsfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index c80e1f41a..9e8f138f5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -105,8 +105,11 @@ pipeline { sh ''' . 
venv/bin/activate - coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests + coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ + -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ + -o python_coverage.xml && coverage report -m --fail-under=10 ''' + // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* // Lines below create a coverage report for on Jenkins. Currently commented out // as it gives errors when no imports are used in unit tests. import src.main From 40646b13280fef3aeeed32672fa415bcfd352f21 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 15:45:31 +0100 Subject: [PATCH 363/411] Removed junitxml line for Jenkins unit test stage. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9e8f138f5..0bd055658 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,7 +106,7 @@ pipeline { . venv/bin/activate coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ - -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ + -m pytest -ra ./tests --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ -o python_coverage.xml && coverage report -m --fail-under=10 ''' // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests From 5c3fcf0409ce354b4101473e0dff715d4bc95f86 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 16:22:00 +0100 Subject: [PATCH 364/411] Removed python_coverage.xml file coverage command in unit test stage of Jenkinsfile. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0bd055658..c18e99de7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -107,7 +107,7 @@ pipeline { coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ -m pytest -ra ./tests --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ - -o python_coverage.xml && coverage report -m --fail-under=10 + coverage report -m --fail-under=10 ''' // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* From dfd5870e068720e64b03e94d99c143a4f80516b7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 16:59:27 +0100 Subject: [PATCH 365/411] Removed --failed-under line as it appeared to be causing an issue in Jenkins. Re-added the xml parts of original line. --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c18e99de7..c6947e3d3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,8 +106,8 @@ pipeline { . 
venv/bin/activate coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ - -m pytest -ra ./tests --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ - coverage report -m --fail-under=10 + -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ + -o python_coverage.xml ''' // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* From e1747e7b6999098ac45e7eececffaad213bfa7c7 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 17:10:48 +0100 Subject: [PATCH 366/411] Removed python_coverage.xml, causing error with 'No Data to Report' in Jenkins. --- Jenkinsfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c6947e3d3..918b9aec4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,8 +106,7 @@ pipeline { . venv/bin/activate coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ - -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml \ - -o python_coverage.xml + -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml ''' // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* From 15e44f47c9826590d3fff7aaefd7da11f463bf37 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 17:17:53 +0100 Subject: [PATCH 367/411] Removed all xml lines in coverage command. Jenkins errors keep coming back to them. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 918b9aec4..82936f01b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -106,7 +106,7 @@ pipeline { . venv/bin/activate coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ - -m pytest -ra ./tests --junitxml=junit_result.xml --ignore=tests/test_utils/test_hdfs_mods.py && coverage xml + -m pytest -ra ./tests --ignore=tests/test_utils/test_hdfs_mods.py ''' // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* From 58b78b5b5f107acc57808f3329be4b5353efd0e0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 13 Jun 2023 17:24:58 +0100 Subject: [PATCH 368/411] Removed old coverage run command line that was commented out. --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 82936f01b..af20180d1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -108,7 +108,6 @@ pipeline { coverage run --branch --source=./${PROJECT_NAME} --omit=src/utils/hdfs_mods.py,src/utils/wrappers.py,src/utils/runlog.py,src/_version.py,src/pipeline.py \ -m pytest -ra ./tests --ignore=tests/test_utils/test_hdfs_mods.py ''' - // coverage run --branch --source=./${PROJECT_NAME} -m pytest -ra ./tests /* // Lines below create a coverage report for on Jenkins. Currently commented out // as it gives errors when no imports are used in unit tests. import src.main From 16fbdf0bf3b7b832bc7898450c39d808a4d75bb8 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 17:47:19 +0100 Subject: [PATCH 369/411] Created a data_validation file containing the function to test if a file exists and if it is non-empty. Returns boolean. 
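The check introduced below amounts to combining os.path.exists with os.path.getsize. A compressed sketch of the same idea (a stand-in function, not the project's exact code; the paths are only examples):

import os

def file_exists_and_non_empty(path: str) -> bool:
    """Roughly what the new check does: present on disk and size > 0."""
    return os.path.exists(path) and os.path.getsize(path) > 0

print(file_exists_and_non_empty("README.md"))         # True when run from the repo root
print(file_exists_and_non_empty("no_such_file.txt"))  # False
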
--- src/data_validation/validation.py | 128 ++++-------------------------- 1 file changed, 15 insertions(+), 113 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 55dbbf3ea..88e27ef1b 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,121 +1,23 @@ -import postcodes_uk -import pandas as pd -from src.utils.wrappers import time_logger_wrap, exception_wrap -import logging +import os -from src.utils.helpers import Config_settings +def check_file_exists(filePath="./src/data_validation/validation.py") -> bool: + """Checks if file exists and is non-empty -# Get the config -conf_obj = Config_settings() -config = conf_obj.config_dict - -ValidationLogger = logging.getLogger(__name__) - - -def validate_postcode_pattern(pcode: str) -> bool: - """A function to validate UK postcodes which uses the - - Args: - pcode (str): The postcode to validate - - Returns: - bool: True or False depending on if it is valid or not - """ - if pcode is None: - return False - - # Validation step - valid_bool = postcodes_uk.validate(pcode) - - return valid_bool - - -@exception_wrap -def get_masterlist(masterlist_path) -> pd.Series: - """This function loads the masterlist of postcodes from a csv file - - Returns: - pd.Series: The dataframe of postcodes - """ - masterlist = pd.read_csv(masterlist_path, usecols=["pcd"]).squeeze() - return masterlist - - -@time_logger_wrap -@exception_wrap -def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: - """This function checks if all postcodes in the specified DataFrame column - are valid UK postcodes. It uses the `validate_postcode` function to - perform the validation. - - Args: - df (pd.DataFrame): The DataFrame containing the postcodes. + Keyword Arguments: + filePath -- Relative path to file + (default: {"./src/data_validation/validation.py"}) Returns: - bool: True if all postcodes are valid, False otherwise. - - Raises: - ValueError: If any invalid postcodes are found, a ValueError is raised. - The error message includes the list of invalid postcodes. - - Example: - >>> df = pd.DataFrame( - {"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]}) - >>> validate_post_col(df, "example-path/to/masterlist.csv"") - ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789'] + A bool: boolean value is True if file exists and is non-empty, + False otherwise. 
""" - if not isinstance(df, pd.DataFrame): - raise TypeError(f"The dataframe you are attempting to validate is {type(df)}") - - unreal_postcodes = check_pcs_real(df, masterlist_path) - - # Log the unreal postcodes - if not unreal_postcodes.empty: - ValidationLogger.warning( - f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}" # noqa - ) + output = False - # Check if postcodes match pattern - invalid_pattern_postcodes = df.loc[ - ~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode" - ] - - # Log the invalid postcodes - if not invalid_pattern_postcodes.empty: - ValidationLogger.warning( - f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}" - ) - - # Combine the two lists - combined_invalid_postcodes = pd.concat( - [unreal_postcodes, invalid_pattern_postcodes] - ) - combined_invalid_postcodes.drop_duplicates(inplace=True) - - if not combined_invalid_postcodes.empty: - raise ValueError( - f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}" - ) - - ValidationLogger.info("All postcodes validated....") - - return True - -def check_pcs_real(df: pd.DataFrame, masterlist_path: str): - """Checks if the postcodes are real against a masterlist of actual postcodes - """ - if config["global"]["postcode_csv_check"]: - master_series = get_masterlist(masterlist_path) + fileExists = os.path.exists(filePath) + if fileExists: + fileSize = os.path.getsize(filePath) - # Check if postcode are real - unreal_postcodes = df.loc[ - ~df["referencepostcode"].isin(master_series), "referencepostcode" - ] - else: - emptydf = pd.DataFrame(columns=["referencepostcode"]) - unreal_postcodes = emptydf.loc[ - ~emptydf["referencepostcode"], "referencepostcode" - ] - - return unreal_postcodes + if fileExists and fileSize > 0: + output = True + return output From d66e56f25f31af4998b67dd6d4c13c88398291ce Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 17:55:28 +0100 Subject: [PATCH 370/411] Added unit test for check_file_exists() function. --- tests/test_main.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_main.py b/tests/test_main.py index 2b8454b49..2e75c7d79 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -49,3 +49,17 @@ def test_period_select(): # Assert: Negative test. Should fails when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, tuple) + + +def test_check_file_exists(): + """Test the check_file_exists function.""" + # Arrange + from src.data_validation import check_file_exists + + # Act: use pytest to assert the result + result_1 = check_file_exists() + # Assert + assert isinstance(result_1, bool) + # Assert: Negative test. Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, bool) From fcde0aa05798a62dbefc337670b686ffc3f04703 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:09:31 +0100 Subject: [PATCH 371/411] Imported check_file_exists function into src.main.py and tested it worked. 
--- src/main.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/main.py diff --git a/src/main.py b/src/main.py new file mode 100644 index 000000000..d7fff22d6 --- /dev/null +++ b/src/main.py @@ -0,0 +1,48 @@ +"""The main pipeline""" + +from src.utils import runlog +from src._version import __version__ as version +from src.utils.helpers import Config_settings +from src.utils.wrappers import logger_creator +from src.utils.testfunctions import Manipulate_data +from src.data_validation.validation import check_file_exists + +import time +import logging + + +MainLogger = logging.getLogger(__name__) +MainLogger.setLevel(logging.INFO) + + +def run_pipeline(start): + """The main pipeline. + + Args: + start (float): The time when the pipeline is launched + generated from the time module using time.time() + """ + + conf_obj = Config_settings() + config = conf_obj.config_dict + global_config = config["global"] + + runlog_obj = runlog.RunLog(config, version) + + logger = logger_creator(global_config) + MainLogger.info("Launching Pipeline .......................") + logger.info("Collecting logging parameters ..........") + check_file_exists() + Manipulate_data() + MainLogger.info("Finishing Pipeline .......................") + + runlog_obj.retrieve_pipeline_logs() + + run_time = round(time.time() - start, 5) + runlog_obj._record_time_taken(run_time) + + runlog_obj.retrieve_configs() + runlog_obj._create_runlog_dicts() + runlog_obj._create_runlog_dfs() + runlog_obj.create_runlog_files() + runlog_obj._write_runlog() From 72289686c40b548ec196370f8dcb4948397fc3d2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:10:22 +0100 Subject: [PATCH 372/411] Removed unit test for check_file_exists() function to add it to its own file. --- tests/test_main.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/test_main.py b/tests/test_main.py index 2e75c7d79..2b8454b49 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -49,17 +49,3 @@ def test_period_select(): # Assert: Negative test. Should fails when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, tuple) - - -def test_check_file_exists(): - """Test the check_file_exists function.""" - # Arrange - from src.data_validation import check_file_exists - - # Act: use pytest to assert the result - result_1 = check_file_exists() - # Assert - assert isinstance(result_1, bool) - # Assert: Negative test. Should fails when the answer is wrong - with pytest.raises(AssertionError): - assert not isinstance(result_1, bool) From ee3fb6ee22568ff2d859f5b0a38a0dfe8a777e06 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:10:56 +0100 Subject: [PATCH 373/411] Create test_validation.py file and added unit test for check_file_exists function. --- tests/test_validation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 000000000..f1b125e55 --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,17 @@ +"""Create a test suite for the validation module.""" + +import pytest + + +def test_check_file_exists(): + """Test the check_file_exists function.""" + # Arrange + from src.data_validation import check_file_exists + + # Act: use pytest to assert the result + result_1 = check_file_exists() + # Assert + assert isinstance(result_1, bool) + # Assert: Negative test. 
Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, bool) From 18c18d913d0a97da235c32ade29ece795cb1326a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:21:23 +0100 Subject: [PATCH 374/411] Added True and False assertion tests in test_validation.py. --- tests/test_validation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index f1b125e55..0d2973a2e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -9,9 +9,18 @@ def test_check_file_exists(): from src.data_validation import check_file_exists # Act: use pytest to assert the result + empty_file = open("emptyfile.py", "a").close() + result_1 = check_file_exists() + result_2 = check_file_exists("Non_existant_file.txt") + result_3 = check_file_exists(empty_file) + # Assert assert isinstance(result_1, bool) + assert result_1 + assert not result_2 + assert not result_3 + # Assert: Negative test. Should fails when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, bool) From dd751ad3182df02e93f296fa87743ce1b47d659a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:35:00 +0100 Subject: [PATCH 375/411] Added line in test_validation.py which should remove created empty file automatically after it has been used for testing. --- tests/test_validation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index 0d2973a2e..3bbf85af5 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,6 +1,7 @@ """Create a test suite for the validation module.""" import pytest +import os def test_check_file_exists(): @@ -15,6 +16,8 @@ def test_check_file_exists(): result_2 = check_file_exists("Non_existant_file.txt") result_3 = check_file_exists(empty_file) + os.remove(empty_file) + # Assert assert isinstance(result_1, bool) assert result_1 From 10187a7163832ce4bc1ae34cc23d12681bdf08ca Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:41:20 +0100 Subject: [PATCH 376/411] Corrected import path in test_validation.py for check_file_exists. --- tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 3bbf85af5..8702099c0 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -7,7 +7,7 @@ def test_check_file_exists(): """Test the check_file_exists function.""" # Arrange - from src.data_validation import check_file_exists + from src.data_validation.validation import check_file_exists # Act: use pytest to assert the result empty_file = open("emptyfile.py", "a").close() From e8315ec868f0f758e3b13863c129ee37e7e84ce3 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:45:26 +0100 Subject: [PATCH 377/411] pytest was failing as empty_file did not exist. Removed '.close()' after creating to fix. 
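One plausible reading of the failure: file.close() returns None, so an assignment like empty_file = open(path, "a").close() leaves empty_file holding None rather than a file object, and any later use of empty_file (such as empty_file.name) breaks. A small standalone illustration, using throwaway file names:

import os

f1 = open("demo_a.txt", "a").close()
print(f1)                      # None -- close() has no return value

f2 = open("demo_b.txt", "a")   # keep the file object instead
print(f2.name)                 # demo_b.txt
f2.close()

# tidy up the throwaway files
os.remove("demo_a.txt")
os.remove("demo_b.txt")
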
--- tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 8702099c0..63b7fc926 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -10,7 +10,7 @@ def test_check_file_exists(): from src.data_validation.validation import check_file_exists # Act: use pytest to assert the result - empty_file = open("emptyfile.py", "a").close() + empty_file = open("emptyfile.py", "a") result_1 = check_file_exists() result_2 = check_file_exists("Non_existant_file.txt") From 30ad15cb3590a9e336eb535ca5ec62653b82df87 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:50:28 +0100 Subject: [PATCH 378/411] Changed open mode when creating empty file to 'x' instead of 'a'. --- tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 63b7fc926..1916264c2 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -10,7 +10,7 @@ def test_check_file_exists(): from src.data_validation.validation import check_file_exists # Act: use pytest to assert the result - empty_file = open("emptyfile.py", "a") + empty_file = open("./emptyfile.py", "x") result_1 = check_file_exists() result_2 = check_file_exists("Non_existant_file.txt") From c07a9d8e6d83729fb80880572c059bfb068a75b1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 25 Apr 2023 18:58:23 +0100 Subject: [PATCH 379/411] Using .name to point to empty file name rather than _io.TextIOWrapper object. --- tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 1916264c2..ced2e0d88 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -14,9 +14,9 @@ def test_check_file_exists(): result_1 = check_file_exists() result_2 = check_file_exists("Non_existant_file.txt") - result_3 = check_file_exists(empty_file) + result_3 = check_file_exists(empty_file.name) - os.remove(empty_file) + os.remove(empty_file.name) # Assert assert isinstance(result_1, bool) From 80ccdadb0eab0a757b197c7d89f9d383fad3ff2f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 11 May 2023 14:47:47 +0100 Subject: [PATCH 380/411] Updated check_file_exists() function to take a filename and an optional filepath argument (defaults to ./data/raw/). 
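With the new signature described above, callers name the file and may override the directory separately. Two illustrative calls follow: the snapshot filename is hypothetical, while developer_config.yaml under ./src/ is the file the amended tests further down pass explicitly:

from src.data_validation.validation import check_file_exists

# hypothetical file expected under the default "./data/raw/" directory
check_file_exists("snapshot.json")

# override the directory for a file kept elsewhere in the repository
check_file_exists("developer_config.yaml", "./src/")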
--- src/data_validation/validation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 88e27ef1b..e61a69130 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,7 +1,7 @@ import os -def check_file_exists(filePath="./src/data_validation/validation.py") -> bool: +def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: """Checks if file exists and is non-empty Keyword Arguments: @@ -14,10 +14,11 @@ def check_file_exists(filePath="./src/data_validation/validation.py") -> bool: """ output = False - fileExists = os.path.exists(filePath) + fileExists = os.path.exists(filepath + filename) if fileExists: - fileSize = os.path.getsize(filePath) + fileSize = os.path.getsize(filepath + filename) if fileExists and fileSize > 0: output = True + return output From 1ff052f8100680defa26da4736661c742e4a129d Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 18:48:56 +0100 Subject: [PATCH 381/411] Amended test_validation.py to include required positional argument in check_file_exists function calls. Also added a few comments. --- tests/test_validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index ced2e0d88..4d9c52450 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -10,12 +10,15 @@ def test_check_file_exists(): from src.data_validation.validation import check_file_exists # Act: use pytest to assert the result + # Create emptyfile.py if it doesn't already exist empty_file = open("./emptyfile.py", "x") - result_1 = check_file_exists() + # developer_config.yaml should exist and be non-empty + result_1 = check_file_exists("./src/developer_config.yaml") result_2 = check_file_exists("Non_existant_file.txt") result_3 = check_file_exists(empty_file.name) + # Delete emptyfile.py after testing os.remove(empty_file.name) # Assert From b0625cf64f92bee0c45aabac1a1f06eec0d9277a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 22 May 2023 18:52:48 +0100 Subject: [PATCH 382/411] Added path argument to check_file_exists call in test_validation.py, otherwise it defaults to './data/raw/'. --- tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 4d9c52450..a4933c823 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -14,7 +14,7 @@ def test_check_file_exists(): empty_file = open("./emptyfile.py", "x") # developer_config.yaml should exist and be non-empty - result_1 = check_file_exists("./src/developer_config.yaml") + result_1 = check_file_exists("developer_config.yaml", "./src/") result_2 = check_file_exists("Non_existant_file.txt") result_3 = check_file_exists(empty_file.name) From 8c2ee0af0ec4c040da2dac5b16d91bed0e8c074e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 1 Jun 2023 11:49:50 +0100 Subject: [PATCH 383/411] Removed check_file_exists() in src/main.py. 
--- src/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main.py b/src/main.py index d7fff22d6..21c8866b8 100644 --- a/src/main.py +++ b/src/main.py @@ -5,8 +5,6 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data -from src.data_validation.validation import check_file_exists - import time import logging @@ -32,7 +30,6 @@ def run_pipeline(start): logger = logger_creator(global_config) MainLogger.info("Launching Pipeline .......................") logger.info("Collecting logging parameters ..........") - check_file_exists() Manipulate_data() MainLogger.info("Finishing Pipeline .......................") From 525d1c20fbd4b17e7af8559fa489d17913d9c3e2 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 7 Jun 2023 12:58:21 +0100 Subject: [PATCH 384/411] Corrected indentation issue. --- src/data_validation/validation.py | 103 ++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index e61a69130..95f5a9121 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -22,3 +22,106 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: output = True return output + + +def validate_postcode_pattern(pcode: str) -> bool: + """A function to validate UK postcodes which uses the + + Args: + pcode (str): The postcode to validate + + Returns: + bool: True or False depending on if it is valid or not + """ + if pcode is None: + return False + + # Validation step + valid_bool = postcodes_uk.validate(pcode) + + return valid_bool + + +def get_masterlist(masterlist_path) -> pd.Series: + """This function loads the masterlist of postcodes from a csv file + + Returns: + pd.Series: The dataframe of postcodes + """ + masterlist = pd.read_csv(masterlist_path, usecols=["pcd"]).squeeze() + return masterlist + + +def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: + """This function checks if all postcodes in the specified DataFrame column + are valid UK postcodes. It uses the `validate_postcode` function to + perform the validation. + + Args: + df (pd.DataFrame): The DataFrame containing the postcodes. + + Returns: + bool: True if all postcodes are valid, False otherwise. + + Raises: + ValueError: If any invalid postcodes are found, a ValueError is raised. + The error message includes the list of invalid postcodes. 
+
+    Example:
+        >>> df = pd.DataFrame(
+            {"referencepostcode": ["AB12 3CD", "EFG 456", "HIJ 789", "KL1M 2NO"]})
+        >>> validate_post_col(df, "example-path/to/masterlist.csv")
+        ValueError: Invalid postcodes found: ['EFG 456', 'HIJ 789']
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError(f"The dataframe you are attempting to validate is {type(df)}")
+
+    unreal_postcodes = check_pcs_real(df, masterlist_path)
+
+    # Log the unreal postcodes
+    if not unreal_postcodes.empty:
+        logger.warning(
+            f"These postcodes are not found in the ONS postcode list: {unreal_postcodes.to_list()}"  # noqa
+        )
+
+    # Check if postcodes match pattern
+    invalid_pattern_postcodes = df.loc[
+        ~df["referencepostcode"].apply(validate_postcode_pattern), "referencepostcode"
+    ]
+
+    # Log the invalid postcodes
+    if not invalid_pattern_postcodes.empty:
+        logger.warning(
+            f"Invalid pattern postcodes found: {invalid_pattern_postcodes.to_list()}"
+        )
+
+    # Combine the two lists
+    combined_invalid_postcodes = pd.concat(
+        [unreal_postcodes, invalid_pattern_postcodes]
+    )
+    combined_invalid_postcodes.drop_duplicates(inplace=True)
+
+    if not combined_invalid_postcodes.empty:
+        raise ValueError(
+            f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}"
+        )
+
+    return True
+
+
+def check_pcs_real(df: pd.DataFrame, masterlist_path: str):
+    """Checks if the postcodes are real against a masterlist of actual postcodes"""
+    if config["global"]["postcode_csv_check"]:
+        master_series = get_masterlist(masterlist_path)
+
+        # Check if postcode are real
+        unreal_postcodes = df.loc[
+            ~df["referencepostcode"].isin(master_series), "referencepostcode"
+        ]
+    else:
+        emptydf = pd.DataFrame(columns=["referencepostcode"])
+        unreal_postcodes = emptydf.loc[
+            ~emptydf["referencepostcode"], "referencepostcode"
+        ]
+
+    return unreal_postcodes

From 62aa60ab76db863f31e706054cd9d51cef6b269f Mon Sep 17 00:00:00 2001
From: Roddy Macrae
Date: Thu, 8 Jun 2023 20:12:11 +0100
Subject: [PATCH 385/411] Updated docstring to describe 'filename' argument.

---
 src/data_validation/validation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py
index 95f5a9121..973bbd879 100644
--- a/src/data_validation/validation.py
+++ b/src/data_validation/validation.py
@@ -5,6 +5,7 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool:
     """Checks if file exists and is non-empty

     Keyword Arguments:
+        filename -- Name of file to check
         filePath -- Relative path to file
         (default: {"./src/data_validation/validation.py"})

From d33cae182081781d8b40cdf0c6ed7f9ae339276d Mon Sep 17 00:00:00 2001
From: Roddy Macrae
Date: Wed, 14 Jun 2023 15:58:31 +0100
Subject: [PATCH 386/411] Re-added test for check_file_exists function.
--- tests/test_data_validation/test_validation.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 2151300f4..1750b87d1 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,5 +1,6 @@ import pandas as pd import pytest +import os from src.data_validation.validation import ( validate_post_col, validate_postcode_pattern, @@ -12,6 +13,33 @@ config = conf_obj.config_dict +def test_check_file_exists(): + """Test the check_file_exists function.""" + # Arrange + from src.data_validation.validation import check_file_exists + + # Act: use pytest to assert the result + # Create emptyfile.py if it doesn't already exist + empty_file = open("./emptyfile.py", "w") + + # developer_config.yaml should exist and be non-empty + result_1 = check_file_exists("developer_config.yaml", "./src/") + result_2 = check_file_exists("Non_existant_file.txt") + result_3 = check_file_exists(empty_file.name) + + # Delete emptyfile.py after testing + os.remove(empty_file.name) + + # Assert + assert isinstance(result_1, bool) + assert result_1 + assert not result_2 + assert not result_3 + # Assert: Negative test. Should fails when the answer is wrong + with pytest.raises(AssertionError): + assert not isinstance(result_1, bool) + + @pytest.fixture # noqa def test_data(): """'NP10 8XG', 'SW1P 4DF' are valid and real postcodes. From f3c5572fe5c99ddb25f71e983612b0fd614ae0ec Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 14 Jun 2023 16:03:43 +0100 Subject: [PATCH 387/411] Added in file_loc using os.path.join method to work across different os's. --- src/data_validation/validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 973bbd879..c316a8d61 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -15,9 +15,11 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: """ output = False - fileExists = os.path.exists(filepath + filename) + file_loc = os.path.join(filepath, filename) + fileExists = os.path.exists(file_loc) + if fileExists: - fileSize = os.path.getsize(filepath + filename) + fileSize = os.path.getsize(file_loc) if fileExists and fileSize > 0: output = True From 9eecb0964e7ce0c16c60ed518bc50935936611ec Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Wed, 14 Jun 2023 16:21:14 +0100 Subject: [PATCH 388/411] Removed legacy test_validation.py. 
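The re-added test above hard-codes the CDSW home directory (/home/cdsw/research-and-development/), so it only passes in that one environment. A portable alternative, sketched here as a suggestion rather than what the repository does, is to derive the repository root from the test file's own location:

import os

from src.data_validation.validation import check_file_exists

# tests/test_data_validation/test_validation.py -> three levels up is the repo root
REPO_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)

# developer_config.yaml lives under src/ in the repository
assert check_file_exists("developer_config.yaml", os.path.join(REPO_ROOT, "src"))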
--- tests/test_validation.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py deleted file mode 100644 index a4933c823..000000000 --- a/tests/test_validation.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Create a test suite for the validation module.""" - -import pytest -import os - - -def test_check_file_exists(): - """Test the check_file_exists function.""" - # Arrange - from src.data_validation.validation import check_file_exists - - # Act: use pytest to assert the result - # Create emptyfile.py if it doesn't already exist - empty_file = open("./emptyfile.py", "x") - - # developer_config.yaml should exist and be non-empty - result_1 = check_file_exists("developer_config.yaml", "./src/") - result_2 = check_file_exists("Non_existant_file.txt") - result_3 = check_file_exists(empty_file.name) - - # Delete emptyfile.py after testing - os.remove(empty_file.name) - - # Assert - assert isinstance(result_1, bool) - assert result_1 - assert not result_2 - assert not result_3 - - # Assert: Negative test. Should fails when the answer is wrong - with pytest.raises(AssertionError): - assert not isinstance(result_1, bool) From 4b3cfcc80a97d5bb9cdf12e294b55887445ba800 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 15:44:42 +0100 Subject: [PATCH 389/411] Added check_file_exists() to main pipeline. --- src/main.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 21c8866b8..448a09adb 100644 --- a/src/main.py +++ b/src/main.py @@ -31,7 +31,45 @@ def run_pipeline(start): MainLogger.info("Launching Pipeline .......................") logger.info("Collecting logging parameters ..........") Manipulate_data() - MainLogger.info("Finishing Pipeline .......................") + + # Data Ingest + # Load SPP data from DAP + snapshot_path = config["paths"]["snapshot_path"] + snapdata = hdfs_load_json(snapshot_path) + contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) + # Data Transmutation + full_responses = processing.full_responses(contributors_df, responses_df) + print(full_responses.sample(5)) + logger.info( + "The response rate is %.3%", + processing.response_rate(contributors_df, responses_df), + ) + + # Data validation + validation.check_file_exists(snapshot_path) + + # Check the postcode column + validation.validate_post_col(contributors_df, masterlist_path) + + # Outlier detection + + # Data cleaning + + # Data processing: Imputation + + # Data processing: Estimation + + # Data processing: Regional Apportionment + + # Data processing: Aggregation + + # Data display: Visualisations + + # Data output: Disclosure Control + + # Data output: File Outputs + + MainLogger.info("Finshing Pipeline .......................") runlog_obj.retrieve_pipeline_logs() From fa4d38fe078fe6b6a2c6d4bb08c7792860ca057a Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 15 Jun 2023 16:00:17 +0100 Subject: [PATCH 390/411] Adding error handling --- src/data_validation/validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index c316a8d61..518ff7fc1 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -23,6 +23,8 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: if fileExists and fileSize > 0: output = True + else: + raise FileNotFoundError(f"File 
{filename} does not exist or is empty") return output From 6d0734c585aa78c2ebbb206fff54fda0c5292d2f Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 15 Jun 2023 16:02:49 +0100 Subject: [PATCH 391/411] Calling recently developed function --- src/main.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 448a09adb..639283ac9 100644 --- a/src/main.py +++ b/src/main.py @@ -5,6 +5,10 @@ from src.utils.helpers import Config_settings from src.utils.wrappers import logger_creator from src.utils.testfunctions import Manipulate_data +from src.data_ingest import spp_parser +from src.data_processing import spp_snapshot_processing as processing +from src.utils.hdfs_mods import hdfs_load_json +from src.data_validation import validation as val import time import logging @@ -13,6 +17,13 @@ MainLogger.setLevel(logging.INFO) +# load config +conf_obj = Config_settings() +config = conf_obj.config_dict +masterlist_path = config["paths"]["masterlist_path"] +snapshot_path = config["paths"]["snapshot_path"] + + def run_pipeline(start): """The main pipeline. @@ -32,9 +43,12 @@ def run_pipeline(start): logger.info("Collecting logging parameters ..........") Manipulate_data() + # Check data files exist + val.check_file_exists(snapshot_path) + # Data Ingest # Load SPP data from DAP - snapshot_path = config["paths"]["snapshot_path"] + snapdata = hdfs_load_json(snapshot_path) contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) # Data Transmutation @@ -49,7 +63,7 @@ def run_pipeline(start): validation.check_file_exists(snapshot_path) # Check the postcode column - validation.validate_post_col(contributors_df, masterlist_path) + val.validate_post_col(contributors_df, masterlist_path) # Outlier detection From e979496ef875eeb35aaceeaac742c4f231fd01bf Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 15 Jun 2023 16:04:09 +0100 Subject: [PATCH 392/411] Adding a log message for succesful commit --- src/data_validation/validation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 518ff7fc1..31f02d6a2 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -23,6 +23,7 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: if fileExists and fileSize > 0: output = True + logger.info(f"File {filename} exists and is non-empty") else: raise FileNotFoundError(f"File {filename} does not exist or is empty") From c5470d2808a69a3a4d049c895a7a89ab227bfbf6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 16:26:51 +0100 Subject: [PATCH 393/411] Updated check_file_exists() function to work with hdfs so real data can be checked on HUE as well as more local files. 
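For readers who have not used pydoop before: its `hdfs.path` helpers behave like their `os.path` counterparts for these checks, which is why the diff below can swap `os.path.exists` for `hdfs.path.exists` almost one-for-one. A minimal sketch, with the HDFS location purely illustrative:

import pydoop.hdfs as hdfs

# illustrative HDFS location; the real path comes from the pipeline config
file_loc = "/dapsen/workspace_zone/example/snapshot.json"

# hdfs.path.exists and hdfs.path.getsize mirror os.path.exists / os.path.getsize
if hdfs.path.exists(file_loc) and hdfs.path.getsize(file_loc) > 0:
    print(f"{file_loc} exists on HDFS and is non-empty")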
--- src/data_validation/validation.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 31f02d6a2..8d351d8c7 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,4 +1,19 @@ import os +import postcodes_uk +import pandas as pd +import pydoop.hdfs as hdfs + +from src.utils.wrappers import logger_creator +from src.utils.helpers import Config_settings + + +# Get the config +conf_obj = Config_settings() +config = conf_obj.config_dict +global_config = config["global"] + +# Set up logging +logger = logger_creator(global_config) def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: @@ -16,7 +31,7 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: output = False file_loc = os.path.join(filepath, filename) - fileExists = os.path.exists(file_loc) + fileExists = hdfs.path.exists(file_loc) if fileExists: fileSize = os.path.getsize(file_loc) From 2de75124c7bbf14966b17c0a7e5e85bccb03a3a0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 16:29:00 +0100 Subject: [PATCH 394/411] Added call to check_file_exists in main pipeline. --- src/main.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main.py b/src/main.py index 639283ac9..8626c3526 100644 --- a/src/main.py +++ b/src/main.py @@ -54,10 +54,7 @@ def run_pipeline(start): # Data Transmutation full_responses = processing.full_responses(contributors_df, responses_df) print(full_responses.sample(5)) - logger.info( - "The response rate is %.3%", - processing.response_rate(contributors_df, responses_df), - ) + processing.response_rate(contributors_df, responses_df) # Data validation validation.check_file_exists(snapshot_path) From 58abab37138f4200f07fde1d24965e2ec70fed1f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 16:36:35 +0100 Subject: [PATCH 395/411] Modified getsize to use hdfs rather than os. --- src/data_validation/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 8d351d8c7..631fdf9f8 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -34,7 +34,7 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: fileExists = hdfs.path.exists(file_loc) if fileExists: - fileSize = os.path.getsize(file_loc) + fileSize = hdfs.path.getsize(file_loc) if fileExists and fileSize > 0: output = True From fc7aebe67954575eec0a04f02ef884a0ee2d1c9e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 17:42:43 +0100 Subject: [PATCH 396/411] Tidied up main pipeline. --- src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.py b/src/main.py index 8626c3526..9a3a72f2c 100644 --- a/src/main.py +++ b/src/main.py @@ -57,7 +57,6 @@ def run_pipeline(start): processing.response_rate(contributors_df, responses_df) # Data validation - validation.check_file_exists(snapshot_path) # Check the postcode column val.validate_post_col(contributors_df, masterlist_path) From 390892234d6e613d8f54ef72286adbcdd5c8eb69 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 17:47:45 +0100 Subject: [PATCH 397/411] Updated check_file_exists function to check if hdfs file exists, and check locally if not. Also checks for file size and raises log warnings/error. 
--- src/data_validation/validation.py | 33 +++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 631fdf9f8..e303ca464 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -31,14 +31,39 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: output = False file_loc = os.path.join(filepath, filename) - fileExists = hdfs.path.exists(file_loc) + hdfs_file_exists = hdfs.path.exists(file_loc) + local_file_exists = os.path.exists(file_loc) - if fileExists: - fileSize = hdfs.path.getsize(file_loc) + # If the hdfs file exists check the size of it. + # If it doesn't exists on hdfs check locally. + if hdfs_file_exists: + hdfs_file_size = hdfs.path.getsize(file_loc) - if fileExists and fileSize > 0: + # If the file exists locally, check the size of it. + if local_file_exists: + local_file_size = os.path.getsize(file_loc) + + # If hdfs file exists and is non-empty + if hdfs_file_exists and hdfs_file_size > 0: + output = True + logger.info(f"File {filename} exists on HDFS and is non-empty") + + # If hdfs file exists and is empty + elif hdfs_file_exists and hdfs_file_size == 0: + output = False + logger.warning(f"File {filename} exists on HDFS but is empty") + + # If file is not on hdfs but is local, and non-empty + elif local_file_exists and local_file_size > 0 and not hdfs_file_exists: output = True logger.info(f"File {filename} exists and is non-empty") + + # If file is empty, is not on hdfs but does exist locally + elif local_file_exists and local_file_size == 0 and not hdfs_file_exists: + output = False + logger.warning(f"File {filename} exists but is empty") + + # Raise error if file does not exist else: raise FileNotFoundError(f"File {filename} does not exist or is empty") From 466e02bb5d3196cd4af8dbd9ed89046a2d6370bb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 17:48:51 +0100 Subject: [PATCH 398/411] Modified unit tests for check_file_exists. --- tests/test_data_validation/test_validation.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 1750b87d1..f33301885 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -20,12 +20,15 @@ def test_check_file_exists(): # Act: use pytest to assert the result # Create emptyfile.py if it doesn't already exist - empty_file = open("./emptyfile.py", "w") + empty_file = open("emptyfile.py", "w") # developer_config.yaml should exist and be non-empty - result_1 = check_file_exists("developer_config.yaml", "./src/") - result_2 = check_file_exists("Non_existant_file.txt") - result_3 = check_file_exists(empty_file.name) + result_1 = check_file_exists( + "developer_config.yaml", "/home/cdsw/research-and-development/src/" + ) + result_3 = check_file_exists( + empty_file.name, "/home/cdsw/research-and-development/" + ) # Delete emptyfile.py after testing os.remove(empty_file.name) @@ -33,11 +36,13 @@ def test_check_file_exists(): # Assert assert isinstance(result_1, bool) assert result_1 - assert not result_2 assert not result_3 - # Assert: Negative test. Should fails when the answer is wrong + + # Assert: Negative test. 
Should fail when the answer is wrong with pytest.raises(AssertionError): assert not isinstance(result_1, bool) + with pytest.raises(FileNotFoundError): + assert not check_file_exists("Non_existant_file.txt") @pytest.fixture # noqa From 67088a298a8d4554a1d077f443eb75dae029f7cf Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:06:31 +0100 Subject: [PATCH 399/411] Removed src/main.py --- src/main.py | 93 ----------------------------------------------------- 1 file changed, 93 deletions(-) delete mode 100644 src/main.py diff --git a/src/main.py b/src/main.py deleted file mode 100644 index 9a3a72f2c..000000000 --- a/src/main.py +++ /dev/null @@ -1,93 +0,0 @@ -"""The main pipeline""" - -from src.utils import runlog -from src._version import __version__ as version -from src.utils.helpers import Config_settings -from src.utils.wrappers import logger_creator -from src.utils.testfunctions import Manipulate_data -from src.data_ingest import spp_parser -from src.data_processing import spp_snapshot_processing as processing -from src.utils.hdfs_mods import hdfs_load_json -from src.data_validation import validation as val -import time -import logging - - -MainLogger = logging.getLogger(__name__) -MainLogger.setLevel(logging.INFO) - - -# load config -conf_obj = Config_settings() -config = conf_obj.config_dict -masterlist_path = config["paths"]["masterlist_path"] -snapshot_path = config["paths"]["snapshot_path"] - - -def run_pipeline(start): - """The main pipeline. - - Args: - start (float): The time when the pipeline is launched - generated from the time module using time.time() - """ - - conf_obj = Config_settings() - config = conf_obj.config_dict - global_config = config["global"] - - runlog_obj = runlog.RunLog(config, version) - - logger = logger_creator(global_config) - MainLogger.info("Launching Pipeline .......................") - logger.info("Collecting logging parameters ..........") - Manipulate_data() - - # Check data files exist - val.check_file_exists(snapshot_path) - - # Data Ingest - # Load SPP data from DAP - - snapdata = hdfs_load_json(snapshot_path) - contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) - # Data Transmutation - full_responses = processing.full_responses(contributors_df, responses_df) - print(full_responses.sample(5)) - processing.response_rate(contributors_df, responses_df) - - # Data validation - - # Check the postcode column - val.validate_post_col(contributors_df, masterlist_path) - - # Outlier detection - - # Data cleaning - - # Data processing: Imputation - - # Data processing: Estimation - - # Data processing: Regional Apportionment - - # Data processing: Aggregation - - # Data display: Visualisations - - # Data output: Disclosure Control - - # Data output: File Outputs - - MainLogger.info("Finshing Pipeline .......................") - - runlog_obj.retrieve_pipeline_logs() - - run_time = round(time.time() - start, 5) - runlog_obj._record_time_taken(run_time) - - runlog_obj.retrieve_configs() - runlog_obj._create_runlog_dicts() - runlog_obj._create_runlog_dfs() - runlog_obj.create_runlog_files() - runlog_obj._write_runlog() From f677539306458fc825165936dd79a4d997015fa6 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:07:00 +0100 Subject: [PATCH 400/411] Modified pipeline.py to include check_file_exists call. 
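One point worth noting about the pipeline call added below: it passes the full snapshot_path from the config as the `filename` argument while `filepath` keeps its "./data/raw/" default. That still resolves correctly as long as the configured path is absolute, because `os.path.join` discards earlier components when a later one is absolute (POSIX behaviour):

import os

# hypothetical absolute snapshot location taken from the config
snapshot_path = "/dapsen/workspace_zone/example/snapshot.json"

# the "./data/raw/" default is dropped because the second component is absolute
assert os.path.join("./data/raw/", snapshot_path) == snapshot_path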
--- src/pipeline.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/pipeline.py b/src/pipeline.py index 20aae146c..db7aa5c1d 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -8,7 +8,7 @@ from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing from src.utils.hdfs_mods import hdfs_load_json -from src.data_validation import validation +from src.data_validation import validation as val import time import logging @@ -39,6 +39,10 @@ def run_pipeline(start): MainLogger.info("Starting Data Ingest...") # Load SPP data from DAP snapshot_path = config["paths"]["snapshot_path"] + + # Check data file exists + val.check_file_exists(snapshot_path) + snapdata = hdfs_load_json(snapshot_path) contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) MainLogger.info("Finished Data Ingest...") @@ -54,7 +58,7 @@ def run_pipeline(start): MainLogger.info("Starting Data Validation...") # Check the postcode column masterlist_path = config["paths"]["masterlist_path"] - validation.validate_post_col(contributors_df, masterlist_path) + val.validate_post_col(contributors_df, masterlist_path) # Outlier detection From 308ba87b93d3fcc54b657964def8e8d1c00417ec Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:32:54 +0100 Subject: [PATCH 401/411] Imports hdfs function from hdfs_mods to check hdfs file exists and is non-empty --- src/data_validation/validation.py | 42 ++++++++++++++++--------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index e303ca464..6c6bd4c2d 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -1,11 +1,10 @@ import os import postcodes_uk import pandas as pd -import pydoop.hdfs as hdfs from src.utils.wrappers import logger_creator from src.utils.helpers import Config_settings - +from src.utils.hdfs_mods import hdfs_file_exists, hdfs_file_size # Get the config conf_obj = Config_settings() @@ -31,38 +30,41 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: output = False file_loc = os.path.join(filepath, filename) - hdfs_file_exists = hdfs.path.exists(file_loc) - local_file_exists = os.path.exists(file_loc) - # If the hdfs file exists check the size of it. - # If it doesn't exists on hdfs check locally. - if hdfs_file_exists: - hdfs_file_size = hdfs.path.getsize(file_loc) + local_file = os.path.exists(file_loc) # If the file exists locally, check the size of it. 
- if local_file_exists: - local_file_size = os.path.getsize(file_loc) + if local_file: + file_size = os.path.getsize(file_loc) - # If hdfs file exists and is non-empty - if hdfs_file_exists and hdfs_file_size > 0: - output = True - logger.info(f"File {filename} exists on HDFS and is non-empty") + # If file does not exists locally, check hdfs + if not local_file: + hdfs_file = hdfs_file_exists(file_loc) - # If hdfs file exists and is empty - elif hdfs_file_exists and hdfs_file_size == 0: - output = False - logger.warning(f"File {filename} exists on HDFS but is empty") + # If hdfs file exists, check its size + if hdfs_file: + file_size = hdfs_file_size(file_loc) # If file is not on hdfs but is local, and non-empty - elif local_file_exists and local_file_size > 0 and not hdfs_file_exists: + if local_file and file_size > 0: output = True logger.info(f"File {filename} exists and is non-empty") # If file is empty, is not on hdfs but does exist locally - elif local_file_exists and local_file_size == 0 and not hdfs_file_exists: + elif local_file and file_size == 0: output = False logger.warning(f"File {filename} exists but is empty") + # If hdfs file exists and is non-empty + elif hdfs_file and file_size > 0: + output = True + logger.info(f"File {filename} exists on HDFS and is non-empty") + + # If hdfs file exists and is empty + elif hdfs_file and file_size == 0: + output = False + logger.warning(f"File {filename} exists on HDFS but is empty") + # Raise error if file does not exist else: raise FileNotFoundError(f"File {filename} does not exist or is empty") From e5c877efcc97d2c2a84cf45c543c57e9fd3ec282 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:35:57 +0100 Subject: [PATCH 402/411] Updated hdfs_mods.py with two functions to get file exists and file size, to avoid pydoop pytest errors. --- src/utils/hdfs_mods.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/utils/hdfs_mods.py b/src/utils/hdfs_mods.py index 1df831714..37824498f 100644 --- a/src/utils/hdfs_mods.py +++ b/src/utils/hdfs_mods.py @@ -49,3 +49,37 @@ def hdfs_load_json(filepath: str) -> dict: datadict = json.load(file) return datadict + + +def hdfs_file_exists(filepath: str) -> bool: + """Function to check file exists + + Args: + filepath (string) -- The filepath in Hue + + Returns: + Bool - A boolean value indicating whether a file + exists or not + """ + # Open the file in read mode inside Hadoop context + with hdfs.open(filepath, "r") as file: + file_exists = hdfs.path.exists(file) + + return file_exists + + +def hdfs_file_size(filepath: str) -> int: + """Function to check file exists + + Args: + filepath (string) -- The filepath in Hue + + Returns: + Int - an integer value indicating the size + of the file in bytes + """ + # Open the file in read mode inside Hadoop context + with hdfs.open(filepath, "r") as file: + file_size = hdfs.path.getsize(file) + + return file_size From 914d957bd6ed6ce53d35ec418a2e58682f6c7c5f Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:46:15 +0100 Subject: [PATCH 403/411] Updated docstring for check_file_exists function. 
--- src/data_validation/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 6c6bd4c2d..9383e730f 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -16,7 +16,8 @@ def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: - """Checks if file exists and is non-empty + """Checks if file exists on hdfs or locally and is non-empty. + Raises an FileNotFoundError if the file doesn't exist. Keyword Arguments: filename -- Name of file to check From 829f167e5a9e88c11ec372b7925c34e9f4842cdb Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:58:48 +0100 Subject: [PATCH 404/411] Corrected naming to match import --- src/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index 11281fff2..5ee2a68f3 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -56,7 +56,7 @@ def run_pipeline(start): MainLogger.info("Finished Data Transmutation...") # Data validation - validation.check_data_shape(full_responses) + val.check_data_shape(full_responses) # Check the postcode column masterlist_path = config["paths"]["masterlist_path"] From ba63f1812ddcc83c19a1589dd0973363a99a6f20 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Thu, 15 Jun 2023 18:59:22 +0100 Subject: [PATCH 405/411] Corrected hdfs_file_exists and hdfs_file_size functions. --- src/utils/hdfs_mods.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/utils/hdfs_mods.py b/src/utils/hdfs_mods.py index 37824498f..d3170c82e 100644 --- a/src/utils/hdfs_mods.py +++ b/src/utils/hdfs_mods.py @@ -61,9 +61,7 @@ def hdfs_file_exists(filepath: str) -> bool: Bool - A boolean value indicating whether a file exists or not """ - # Open the file in read mode inside Hadoop context - with hdfs.open(filepath, "r") as file: - file_exists = hdfs.path.exists(file) + file_exists = hdfs.path.exists(filepath) return file_exists @@ -78,8 +76,6 @@ def hdfs_file_size(filepath: str) -> int: Int - an integer value indicating the size of the file in bytes """ - # Open the file in read mode inside Hadoop context - with hdfs.open(filepath, "r") as file: - file_size = hdfs.path.getsize(file) + file_size = hdfs.path.getsize(filepath) return file_size From 60ec1e38438145b6b6667b7ff8bb87ac149f9207 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 19 Jun 2023 15:31:59 +0100 Subject: [PATCH 406/411] Edited pipeline.py to reflect change of file the check_file_exists function is stored in. 
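Wrapping the pydoop calls in `hdfs_file_exists` and `hdfs_file_size` (corrected above) has a testing benefit as well: the `hdfs` module can be patched out, so `check_file_exists` can be exercised on a machine with no HDFS access, which is what the mock-based tests a few patches further down do. A rough sketch, assuming the layout the series settles on (with `check_file_exists` in src/utils/hdfs_mods.py) and with the path and size invented for illustration:

from unittest import mock

from src.utils.hdfs_mods import check_file_exists


@mock.patch("src.utils.hdfs_mods.hdfs")
def test_check_file_exists_hdfs_branch(mock_hdfs):
    """Sketch: file absent locally but present and non-empty on HDFS."""
    mock_hdfs.path.exists.return_value = True
    mock_hdfs.path.getsize.return_value = 1234  # invented size in bytes

    assert check_file_exists("snapshot.json", "/dapsen/workspace_zone/example/")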
--- src/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipeline.py b/src/pipeline.py index 5ee2a68f3..a39049f20 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -7,7 +7,7 @@ from src.utils.wrappers import logger_creator from src.data_ingest import spp_parser from src.data_processing import spp_snapshot_processing as processing -from src.utils.hdfs_mods import hdfs_load_json +from src.utils.hdfs_mods import hdfs_load_json, check_file_exists from src.data_validation import validation as val import time @@ -42,7 +42,7 @@ def run_pipeline(start): snapshot_path = config["paths"]["snapshot_path"] # Check data file exists - val.check_file_exists(snapshot_path) + check_file_exists(snapshot_path) snapdata = hdfs_load_json(snapshot_path) contributors_df, responses_df = spp_parser.parse_snap_data(snapdata) From 70c76c487975f911577000b03dabc14673397ac0 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 19 Jun 2023 15:32:42 +0100 Subject: [PATCH 407/411] Removed check_file_exists function from validation.py --- src/data_validation/validation.py | 59 ------------------------------- 1 file changed, 59 deletions(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 4aa1c362c..08a9f60a1 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -5,7 +5,6 @@ from src.utils.wrappers import logger_creator, exception_wrap from src.utils.helpers import Config_settings -from src.utils.hdfs_mods import hdfs_file_exists, hdfs_file_size # Get the config conf_obj = Config_settings() @@ -16,64 +15,6 @@ validationlogger = logger_creator(global_config) -def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: - """Checks if file exists on hdfs or locally and is non-empty. - Raises an FileNotFoundError if the file doesn't exist. - - Keyword Arguments: - filename -- Name of file to check - filePath -- Relative path to file - (default: {"./src/data_validation/validation.py"}) - - Returns: - A bool: boolean value is True if file exists and is non-empty, - False otherwise. - """ - output = False - - file_loc = os.path.join(filepath, filename) - - local_file = os.path.exists(file_loc) - - # If the file exists locally, check the size of it. 
- if local_file: - file_size = os.path.getsize(file_loc) - - # If file does not exists locally, check hdfs - if not local_file: - hdfs_file = hdfs_file_exists(file_loc) - - # If hdfs file exists, check its size - if hdfs_file: - file_size = hdfs_file_size(file_loc) - - # If file is not on hdfs but is local, and non-empty - if local_file and file_size > 0: - output = True - validationlogger.info(f"File {filename} exists and is non-empty") - - # If file is empty, is not on hdfs but does exist locally - elif local_file and file_size == 0: - output = False - validationlogger.warning(f"File {filename} exists but is empty") - - # If hdfs file exists and is non-empty - elif hdfs_file and file_size > 0: - output = True - validationlogger.info(f"File {filename} exists on HDFS and is non-empty") - - # If hdfs file exists and is empty - elif hdfs_file and file_size == 0: - output = False - validationlogger.warning(f"File {filename} exists on HDFS but is empty") - - # Raise error if file does not exist - else: - raise FileNotFoundError(f"File {filename} does not exist or is empty") - - return output - - def validate_postcode_pattern(pcode: str) -> bool: """A function to validate UK postcodes which uses the From 532bfac735dee8819b90ab62f9fc6e5107340c9e Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 19 Jun 2023 15:33:15 +0100 Subject: [PATCH 408/411] Added check_file_exists() to hdfs_mods.py --- src/utils/hdfs_mods.py | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/utils/hdfs_mods.py b/src/utils/hdfs_mods.py index d3170c82e..16df0cde9 100644 --- a/src/utils/hdfs_mods.py +++ b/src/utils/hdfs_mods.py @@ -5,6 +5,9 @@ import pydoop.hdfs as hdfs import pandas as pd import json +import os + +from src.data_validation.validation import validationlogger def read_hdfs_csv(filepath: str) -> pd.DataFrame: @@ -79,3 +82,61 @@ def hdfs_file_size(filepath: str) -> int: file_size = hdfs.path.getsize(filepath) return file_size + + +def check_file_exists(filename: str, filepath: str = "./data/raw/") -> bool: + """Checks if file exists on hdfs or locally and is non-empty. + Raises an FileNotFoundError if the file doesn't exist. + + Keyword Arguments: + filename -- Name of file to check + filePath -- Relative path to file + (default: {"./src/data_validation/validation.py"}) + + Returns: + A bool: boolean value is True if file exists and is non-empty, + False otherwise. + """ + output = False + + file_loc = os.path.join(filepath, filename) + + local_file = os.path.exists(file_loc) + + # If the file exists locally, check the size of it. 
+ if local_file: + file_size = os.path.getsize(file_loc) + + # If file does not exists locally, check hdfs + if not local_file: + hdfs_file = hdfs_file_exists(file_loc) + + # If hdfs file exists, check its size + if hdfs_file: + file_size = hdfs_file_size(file_loc) + + # If file is not on hdfs but is local, and non-empty + if local_file and file_size > 0: + output = True + validationlogger.info(f"File {filename} exists and is non-empty") + + # If file is empty, is not on hdfs but does exist locally + elif local_file and file_size == 0: + output = False + validationlogger.warning(f"File {filename} exists but is empty") + + # If hdfs file exists and is non-empty + elif hdfs_file and file_size > 0: + output = True + validationlogger.info(f"File {filename} exists on HDFS and is non-empty") + + # If hdfs file exists and is empty + elif hdfs_file and file_size == 0: + output = False + validationlogger.warning(f"File {filename} exists on HDFS but is empty") + + # Raise error if file does not exist + else: + raise FileNotFoundError(f"File {filename} does not exist or is empty") + + return output From 28f3e2dff470d9f25f839156a9617d893edbaed1 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 19 Jun 2023 15:35:49 +0100 Subject: [PATCH 409/411] Added tests for check_file_exists to tests/test_utils_test_hdfs_mods.py --- tests/test_utils/test_hdfs_mods.py | 37 +++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/test_utils/test_hdfs_mods.py b/tests/test_utils/test_hdfs_mods.py index 84cbc10ed..a593f835a 100644 --- a/tests/test_utils/test_hdfs_mods.py +++ b/tests/test_utils/test_hdfs_mods.py @@ -4,10 +4,17 @@ import pandas as pd # Import modules to test +import os import sys +from src.utils.hdfs_mods import ( + read_hdfs_csv, + write_hdfs_csv, + hdfs_load_json, + check_file_exists, +) # noqa + sys.modules["mock_f"] = mock.Mock() -from src.utils.hdfs_mods import read_hdfs_csv, write_hdfs_csv, hdfs_load_json # noqa class TestReadCsv: @@ -107,3 +114,31 @@ def test_hdfs_load_json(self, mock_hdfs, mock_json): json_expout = self.expout_data() assert json_result == json_expout + + +class TestCheckFileExists: + @mock.patch("src.utils.hdfs_mods.hdfs") + def test_check_file_exists(self, mock_hdfs): + """Test the check_file_exists function.""" + + mock_hdfs.open.return_value.__enter__.return_value = sys.modules["mock_f"] + + # Act: use pytest to assert the result + # Create emptyfile.py if it doesn't already exist + empty_file = open("emptyfile.py", "w") + + # developer_config.yaml should exist and be non-empty + result_1 = check_file_exists( + "developer_config.yaml", "/home/cdsw/research-and-development/src/" + ) + result_3 = check_file_exists( + empty_file.name, "/home/cdsw/research-and-development/" + ) + + # Delete emptyfile.py after testing + os.remove(empty_file.name) + + # Assert + assert isinstance(result_1, bool) + assert result_1 + assert not result_3 From c4d793bcdc284edf3c9bacf21335744de1803f7a Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Mon, 19 Jun 2023 15:38:12 +0100 Subject: [PATCH 410/411] Removed test_check_file_exists from test_validation.py --- tests/test_data_validation/test_validation.py | 34 +------------------ 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/tests/test_data_validation/test_validation.py b/tests/test_data_validation/test_validation.py index 8e736cc7e..f5553d9a0 100644 --- a/tests/test_data_validation/test_validation.py +++ b/tests/test_data_validation/test_validation.py @@ -1,6 +1,6 @@ import pandas as pd 
import pytest -import os + from src.data_validation.validation import ( validate_post_col, validate_postcode_pattern, @@ -13,38 +13,6 @@ config = conf_obj.config_dict -def test_check_file_exists(): - """Test the check_file_exists function.""" - # Arrange - from src.data_validation.validation import check_file_exists - - # Act: use pytest to assert the result - # Create emptyfile.py if it doesn't already exist - empty_file = open("emptyfile.py", "w") - - # developer_config.yaml should exist and be non-empty - result_1 = check_file_exists( - "developer_config.yaml", "/home/cdsw/research-and-development/src/" - ) - result_3 = check_file_exists( - empty_file.name, "/home/cdsw/research-and-development/" - ) - - # Delete emptyfile.py after testing - os.remove(empty_file.name) - - # Assert - assert isinstance(result_1, bool) - assert result_1 - assert not result_3 - - # Assert: Negative test. Should fail when the answer is wrong - with pytest.raises(AssertionError): - assert not isinstance(result_1, bool) - with pytest.raises(FileNotFoundError): - assert not check_file_exists("Non_existant_file.txt") - - @pytest.fixture # noqa def test_data(): """'NP10 8XG', 'SW1P 4DF' are valid and real postcodes. From eb5fadeafd727c5c25dfc8b997e9e93d047b64c5 Mon Sep 17 00:00:00 2001 From: Roddy Macrae Date: Tue, 20 Jun 2023 14:40:29 +0100 Subject: [PATCH 411/411] Added wrappers and validationlogger line back into validation.py --- src/data_validation/validation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/data_validation/validation.py b/src/data_validation/validation.py index 08a9f60a1..f983597d9 100644 --- a/src/data_validation/validation.py +++ b/src/data_validation/validation.py @@ -3,7 +3,7 @@ import postcodes_uk import pandas as pd -from src.utils.wrappers import logger_creator, exception_wrap +from src.utils.wrappers import logger_creator, time_logger_wrap, exception_wrap from src.utils.helpers import Config_settings # Get the config @@ -33,6 +33,7 @@ def validate_postcode_pattern(pcode: str) -> bool: return valid_bool +@exception_wrap def get_masterlist(masterlist_path) -> pd.Series: """This function loads the masterlist of postcodes from a csv file @@ -43,6 +44,8 @@ def get_masterlist(masterlist_path) -> pd.Series: return masterlist +@time_logger_wrap +@exception_wrap def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: """This function checks if all postcodes in the specified DataFrame column are valid UK postcodes. It uses the `validate_postcode` function to @@ -97,6 +100,8 @@ def validate_post_col(df: pd.DataFrame, masterlist_path: str) -> bool: f"Invalid postcodes found: {combined_invalid_postcodes.to_list()}" ) + validationlogger.info("All postcodes validated....") + return True
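With the wrappers restored, validation.py reaches its final shape for this stretch of work. Its lowest-level building block is `validate_postcode_pattern`, which defers to the `postcodes_uk` package. A quick illustration (assuming the package config loads, as the unit tests above require), using the postcodes the test fixture describes as valid ('NP10 8XG', 'SW1P 4DF'), one that the docstring example lists as invalid, and the None case the function handles explicitly:

from src.data_validation.validation import validate_postcode_pattern

assert validate_postcode_pattern("NP10 8XG")     # valid pattern, real postcode
assert validate_postcode_pattern("SW1P 4DF")     # valid pattern, real postcode
assert not validate_postcode_pattern("HIJ 789")  # fails the UK postcode pattern
assert not validate_postcode_pattern(None)       # None is handled explicitly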