Merge pull request #68 from vkoves/issue-41-add-automated-tests
Issue 41 add automated tests for cleaning file for all years
vkoves authored Mar 13, 2024
2 parents 39b351b + 031e4ea commit 6f67f61
Showing 27 changed files with 3,650 additions and 3,362 deletions.
86 changes: 62 additions & 24 deletions README.md
@@ -36,71 +36,109 @@ GraphQL requires data key names to have no spaces or special characters, so ther
- [ ] Create ward page that shows data by ward (needs new data source)
- [ ] Figure out a way to rank buildings by opportunity for improvement (perhaps higher than avg. in category, uses a lot of natural gas?)


## Development

### Front-End Setup

Make sure you have [Yarn](https://yarnpkg.com/) installed, `cd` into the project directory (after cloning it) and run:

```bash
yarn install
```

### Known Issues

#### macOS libvips Error

If you encounter an error on macOS such as `sharp Prebuilt libvips 8.10.5 binaries are not yet available for darwin-arm64v8`, you'll need to install these dependencies separately. Install the [Brew package manager](https://brew.sh/), then run the following commands:

```bash
brew install --build-from-source gcc
xcode-select --install
brew install vips
```

### Running The Front-End

Run `yarn develop` to start a local dev server at `http://localhost:8080`

Happy coding 🎉🙌

### Run Front-End Linting

To run linting with auto-fix, run:

```bash
yarn lint-fix
```

## Deploys

This site deploys automatically via Netlify by running `gridsome build`.


## Tools

- [Python](https://www.python.org/) and [pandas](https://pandas.pydata.org/) for data processing
- [Leaflet](https://leafletjs.com/) and [leaflet.gridlayer.googlemutant](https://www.npmjs.com/package/leaflet.gridlayer.googlemutant) for maps

## Data Processing

### Python Setup (For Data Processing & Tests)

This project's Python data pipeline requires:

- pip
- python 3.9

To install our Python dependencies, from the root of the project, run:

```bash
pip install --no-cache-dir -r requirements.txt
```

### Run Data Processing

If you update the raw data CSVs or the data scripts that post-process them (for example, when
adding a new statistical analysis), you need to re-run the data processing. Make sure to follow
the "Python Setup" steps first.

Then, to process a new CSV file (expected at `src/data/source/ChicagoEnergyBenchmarking.csv`), run
the following from the project directory:

```bash
bash run_all.sh
```

### Run Data Processing Tests

Before running the tests, create (or refresh) the test data by running the following script from
the main project directory (it overwrites the existing test data file if present):

```bash
bash create_test_data.sh
```
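For reference, the shell script delegates to a Python module that subsets the source CSV into a small fixture. A minimal sketch of that kind of subsetting with pandas (the `make_test_subset` name and first-N-rows sampling are illustrative assumptions, not the project's actual implementation):

```python
import pandas as pd

def make_test_subset(src_path: str, out_path: str, n_rows: int = 100) -> None:
    # Take the first n_rows of the source CSV as a small, deterministic
    # fixture that the unit tests can load quickly.
    df = pd.read_csv(src_path)
    df.head(n_rows).to_csv(out_path, index=False)
```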

To run all tests, run the following from the project directory:

```bash
pytest
```

This assumes `pytest` is installed (see the Python Setup section above).

To run an individual unit test suite (where `XXX` is a suite name like
`test_clean_all_years`), run:

```bash
python3 -m pytest test/data/scripts/unit/XXX.py
```
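A suite like this contains plain pytest functions. As a rough illustration (the helper and test names below are hypothetical, not the project's actual tests), a unit test of a cleaning rule might look like:

```python
import pandas as pd

# Hypothetical mirror of one cleaning rule: keep only rows that report
# a positive GHG intensity.
def keep_positive_ghg(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[df['GHGIntensity'] > 0].copy()

def test_keep_positive_ghg_drops_zero_rows():
    df = pd.DataFrame({'ID': [1, 2], 'GHGIntensity': [5.0, 0.0]})
    result = keep_positive_ghg(df)
    assert list(result['ID']) == [1]
```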


**Important!** When you update the data, make sure to update the `LatestDataYear` in
`globals.vue`, as well as the filter year in all page queries.

Expand Down
Empty file added __init__.py
Empty file.
1 change: 1 addition & 0 deletions create_test_data.sh
@@ -0,0 +1 @@
python3 -m tests.data.scripts.create_test_data ./src/data/source/ChicagoEnergyBenchmarking.csv ./tests/data/source/test_src_data.csv
1 change: 1 addition & 0 deletions src/data/requirements.txt → requirements.txt
@@ -1,2 +1,3 @@
python-slugify==4.0.1
pandas==2.0.3
pytest==7.4.4
9 changes: 9 additions & 0 deletions run_all.sh
@@ -0,0 +1,9 @@
#!/bin/bash

python3 -m src.data.scripts.clean_and_pare_down_data_all_years

python3 -m src.data.scripts.process_data

python3 -m src.data.scripts.add_context_by_property_type

python3 -m src.data.scripts.clean_and_pare_down_data_current_year
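The same four stages can be driven from Python instead of the shell. A sketch (the `run_stages` helper is an assumption for illustration; the module paths are taken verbatim from the script above):

```python
import subprocess
import sys

# The four pipeline stages from run_all.sh, in order
STAGES = [
    'src.data.scripts.clean_and_pare_down_data_all_years',
    'src.data.scripts.process_data',
    'src.data.scripts.add_context_by_property_type',
    'src.data.scripts.clean_and_pare_down_data_current_year',
]

def run_stages(stages: list, python: str = sys.executable) -> None:
    # check=True stops the pipeline as soon as any stage exits non-zero
    for stage in stages:
        subprocess.run([python, '-m', stage], check=True)
```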
Empty file added src/__init__.py
Empty file.
Empty file added src/data/__init__.py
Empty file.
4,658 changes: 2,329 additions & 2,329 deletions src/data/dist/building-benchmarks.csv

Large diffs are not rendered by default.

9 changes: 0 additions & 9 deletions src/data/run_all.sh

This file was deleted.

Empty file added src/data/scripts/__init__.py
Empty file.
@@ -1,8 +1,12 @@
import pandas as pd
import json

from src.data.scripts.utils import get_data_file_path

out_dir = 'dist'
path_to_buildings_csv = get_data_file_path(out_dir, 'building-benchmarks.csv')
property_types_file_path = get_data_file_path(out_dir, 'property-types.json')
property_stats_file_path = get_data_file_path(out_dir, 'building-statistics-by-property-type.json')

# Columns we want to rank for and append ranks to each building's data
building_cols_to_rank = [
@@ -51,7 +55,7 @@
def generate_property_types():
# output property_types to a json file for use in the frontend
property_types_json = {"propertyTypes": list(property_types)}
with open(property_types_file_path, 'w', encoding='latin1') as json_file:
json.dump(property_types_json, json_file)

def calculateBuildingStatistics():
@@ -85,7 +89,7 @@ def calculateBuildingStatistics():

stats_by_property_type[property] = cur_property_type_stats

with open(property_stats_file_path, "w") as property_stats_file:
json.dump(stats_by_property_type, property_stats_file)

# Ranks buildings in relation to their property type, then re-exporting the file
79 changes: 0 additions & 79 deletions src/data/scripts/clean-and-pare-down-data_all_years.py

This file was deleted.

117 changes: 117 additions & 0 deletions src/data/scripts/clean_and_pare_down_data_all_years.py
@@ -0,0 +1,117 @@
import pandas as pd
from src.data.scripts.utils import get_and_clean_csv, get_data_file_path

file_dir = 'source'
building_emissions_file = 'ChicagoEnergyBenchmarking.csv'
data_out_file = 'ChicagoEnergyBenchmarkingAllNewestInstances.csv'

# Columns that should be strings because they are immutable identifiers
string_cols = [
'ChicagoEnergyRating',
'ZIPCode',
'Latitude',
'Longitude'
]

# Int columns that are numbers (and can get averaged) but should be rounded
int_cols = [
'NumberOfBuildings',
'ENERGYSTARScore',
# TODO: Move to string after figuring out why the X.0 is showing up
'Wards',
'CensusTracts',
# 'ZIPCode',
'CommunityAreas',
'HistoricalWards2003-2015'
]

replace_headers = {'Data Year': 'DataYear',
'ID': 'ID',
'Property Name': 'PropertyName',
'Reporting Status': 'ReportingStatus',
'Address': 'Address',
'ZIP Code': 'ZIPCode',
'Chicago Energy Rating': 'ChicagoEnergyRating',
'Exempt From Chicago Energy Rating': 'ExemptFromChicagoEnergyRating',
'Community Area': 'CommunityArea',
'Primary Property Type': 'PrimaryPropertyType',
'Gross Floor Area - Buildings (sq ft)': 'GrossFloorArea',
'Total GHG Emissions (Metric Tons CO2e)': 'TotalGHGEmissions',
'GHG Intensity (kg CO2e/sq ft)': 'GHGIntensity',
'Year Built': 'YearBuilt',
'# of Buildings': 'NumberOfBuildings',
'Water Use (kGal)': 'WaterUse',
'ENERGY STAR Score': 'ENERGYSTARScore',
'Electricity Use (kBtu)': 'ElectricityUse',
'Natural Gas Use (kBtu)': 'NaturalGasUse',
'District Steam Use (kBtu)': 'DistrictSteamUse',
'District Chilled Water Use (kBtu)': 'DistrictChilledWaterUse',
'All Other Fuel Use (kBtu)': 'AllOtherFuelUse',
'Site EUI (kBtu/sq ft)': 'SiteEUI',
'Source EUI (kBtu/sq ft)': 'SourceEUI',
'Weather Normalized Site EUI (kBtu/sq ft)': 'WeatherNormalizedSiteEUI',
'Weather Normalized Source EUI (kBtu/sq ft)': 'WeatherNormalizedSourceEUI',
'Latitude': 'Latitude',
'Longitude': 'Longitude',
'Location': 'Location',
'Row_ID': 'Row_ID',
'Wards': 'Wards',
'Community Areas': 'CommunityAreas',
'Zip Codes': 'ZipCodes',
'Census Tracts': 'CensusTracts',
'Historical Wards 2003-2015': 'HistoricalWards2003-2015' }

def rename_columns(building_data: pd.DataFrame) -> pd.DataFrame:
    return building_data.rename(columns=replace_headers)

def get_buildings_with_ghg_intensity(building_data: pd.DataFrame) -> pd.DataFrame:
    return building_data.loc[(building_data['GHGIntensity'] > 0)].copy()

def get_submitted_data(building_data: pd.DataFrame) -> pd.DataFrame:
    is_submitted = (building_data['ReportingStatus'] == 'Submitted')
    is_submitted_data = (building_data['ReportingStatus'] == 'Submitted Data')
    has_status_submitted = is_submitted | is_submitted_data
    return building_data.loc[has_status_submitted].copy()

def get_last_year_data(all_submitted_data: pd.DataFrame) -> pd.DataFrame:
    all_submitted_data = all_submitted_data.sort_values(by=['ID', 'DataYear'])
    all_recent_submitted_data = all_submitted_data.drop_duplicates(subset=['ID'], keep='last').copy()
    return all_recent_submitted_data

def fix_str_cols(all_recent_submitted_data: pd.DataFrame, renamed_building_data: pd.DataFrame) -> pd.DataFrame:
    # Mark columns that look like numbers but should be strings as such to prevent decimals showing
    # up (e.g. zipcode of 60614 or Ward 9)
    all_recent_submitted_data[string_cols] = renamed_building_data[string_cols].astype('string')
    return all_recent_submitted_data

def fix_int_cols(building_data: pd.DataFrame) -> pd.DataFrame:
    # Mark columns as ints that should never show a decimal, e.g. Number of Buildings
    building_data[int_cols] = building_data[int_cols].astype('Int64')
    return building_data

def output_to_csv(building_data: pd.DataFrame, dir: str) -> None:
    # Write the cleaned data to a UTF-8 CSV, without the pandas index column
    building_data.to_csv(dir, sep=',', encoding='utf-8', index=False)

def process(file_path: str) -> pd.DataFrame:
    building_data = get_and_clean_csv(file_path)

    building_data = rename_columns(building_data)

    buildings_with_ghg_intensity = get_buildings_with_ghg_intensity(building_data)

    all_submitted_data = get_submitted_data(buildings_with_ghg_intensity)

    all_recent_submitted_data = get_last_year_data(all_submitted_data)

    all_recent_submitted_data = fix_str_cols(all_recent_submitted_data, building_data)

    all_recent_submitted_data = fix_int_cols(all_recent_submitted_data)

    return all_recent_submitted_data

def main():
    processed = process(get_data_file_path(file_dir, building_emissions_file))
    output_to_csv(processed, get_data_file_path(file_dir, data_out_file))

if __name__ == '__main__':
    main()
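The `get_last_year_data` step depends on pandas keeping the final duplicate after a sort. A self-contained sketch of that behavior, using invented sample rows:

```python
import pandas as pd

# Two submissions for building 100 and one for building 200 (invented data);
# sorting by ID then DataYear puts each building's newest record last.
data = pd.DataFrame({
    'ID': [100, 100, 200],
    'DataYear': [2019, 2021, 2020],
    'GHGIntensity': [9.1, 7.4, 5.0],
})

# keep='last' then retains only the most recent submission per building
latest = data.sort_values(by=['ID', 'DataYear']).drop_duplicates(subset=['ID'], keep='last')
print(latest[['ID', 'DataYear']].to_dict('records'))
# → [{'ID': 100, 'DataYear': 2021}, {'ID': 200, 'DataYear': 2020}]
```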
