Skip to content

Commit

Permalink
Merge branch 'main' into generate-historic-data
Browse files Browse the repository at this point in the history
  • Loading branch information
vkoves committed May 8, 2024
2 parents 5ce3747 + cc4aebd commit 7e593df
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 124 deletions.
24 changes: 12 additions & 12 deletions tests/data/scripts/create_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
src_input_file = 'ChicagoEnergyBenchmarking.csv'
test_input_file = 'test_src_data.csv'

property_test_cases = ['United Center', 'Crown Hall', 'Art Institute', 'Marie Curie']
property_ids_to_include = [
'100856', # United Center
'256419', # Crown Hall
'160196', # The Art Institute of Chicago
'138730', # random property
'240068', # random property w/ submitted data and no GHGIntensity data
]

def write_test_sample(reader: csv.reader, writer: csv.writer, property_ids_to_include: List[str]) -> csv.writer:
    '''Copy the header row plus every data row whose property ID is in
    property_ids_to_include from reader to writer.

    reader  - csv.reader over the full ChicagoEnergyBenchmarking export
    writer  - csv.writer receiving the pared-down test sample
    property_ids_to_include - property IDs (as strings) to keep

    Raises EOFError if the source CSV's first row is empty.
    '''

    header_row = next(reader)
    if len(header_row) <= 0:
        raise EOFError('ChicagoEnergyBenchmarking CSV file is empty!')
    else:
        writer.writerow(header_row)
    for row in reader:
        # column index 1 holds the property ID in the source CSV
        property_id = row[1]
        if property_id in property_ids_to_include:
            writer.writerow(row)

def main():
# the first console argument is technically the python script so we skip that
Expand All @@ -41,7 +41,7 @@ def main():

csvfile = open(get_test_file_path(target_path), 'w')
test_file = csv.writer(csvfile)
write_test_sample(src_csv, test_file, property_test_cases)
write_test_sample(src_csv, test_file, property_ids_to_include)
print('Copied source data from', src_path)
print('Copied test data to', target_path)

Expand Down
255 changes: 149 additions & 106 deletions tests/data/scripts/unit/test_clean_all_years.py
Original file line number Diff line number Diff line change
@@ -1,123 +1,166 @@
import pytest
import shutil, os, pathlib, csv
import os
import csv
import pandas as pd
import numpy as np

from src.data.scripts.utils import get_and_clean_csv
from src.data.scripts import clean_and_pare_down_data_all_years as clean, process_data as proc
from tests.data.scripts.utils import get_test_file_path, get_src_file_path
from src.data.scripts import clean_and_pare_down_data_all_years
from tests.data.scripts.utils import get_test_file_path

# Directory names used to locate source vs. test data files
src_dir = 'src'
test_dir = 'tests'
# Raw benchmarking CSV consumed by the cleaning script
src_input_file = 'ChicagoEnergyBenchmarking.csv'
# Pared-down copy of the source data that these tests read
test_input_file = 'test_src_data.csv'
# File written by the CSV-output test
test_output_file = 'test_output.csv'

@pytest.fixture
def src_building_data() -> pd.DataFrame:
    '''Load the test input CSV into a cleaned DataFrame.'''

    path_to_test_data = get_test_file_path(test_input_file)
    assert os.path.exists(path_to_test_data)
    return get_and_clean_csv(path_to_test_data)

@pytest.fixture
def csv_file() -> csv.reader:
    '''Yield a csv.reader over the test input CSV.

    Yields from inside a with-block so the file handle is closed after the
    consuming test finishes (the original returned a reader over a file
    that was never closed).
    '''
    with open(get_test_file_path(test_input_file)) as filehandle:
        yield csv.reader(filehandle)

def test_csv_file_has_some_data(csv_file):
first_line = csv_file.__next__()
assert first_line
assert len(first_line) > 0

@pytest.mark.parametrize("test_input", [
    clean.string_cols,
    clean.int_cols,
    clean.replace_headers
])
def test_is_not_empty(test_input):
    '''Each column list / header mapping in the cleaning module is non-empty.'''
    assert len(test_input) > 0

def test_src_data_exists(src_building_data):
assert src_building_data is not None

@pytest.fixture
def test_columns_are_renamed(src_building_data) -> pd.DataFrame:
df = clean.rename_columns(src_building_data)
assert df is not None
assert not df.columns.equals(src_building_data.columns)
return df

def test_data_has_positive_ghg_data(test_columns_are_renamed):
df = clean.get_buildings_with_ghg_intensity(test_columns_are_renamed)
assert df is not None
assert np.all(df['GHGIntensity'] > 0)
def csv_reader() -> csv.reader:
'''return a csv.DictReader of our test data CSV'''

def test_data_has_submitted_status(test_columns_are_renamed):
df = clean.get_submitted_data(test_columns_are_renamed)
assert np.all(df['ReportingStatus'].str.contains('Submitted'))
csv_path = get_test_file_path(test_input_file)
with open(csv_path) as filehandle:
# yield here so that the context manager (with...)
# can cleanup the open filehandle after we're done with
# the csv.DictReader
yield csv.DictReader(filehandle)

@pytest.fixture
def test_has_last_year_of_data(test_columns_are_renamed) -> pd.DataFrame:
    '''Reduce the renamed data to one (latest-year) row per building ID.'''

    latest_only = clean.get_last_year_data(test_columns_are_renamed)
    # every ID should appear exactly once after the reduction
    assert (latest_only['ID'].value_counts() == 1).all()
    return latest_only

@pytest.fixture
def fixed_strings(test_has_last_year_of_data, test_columns_are_renamed):
    '''Result of clean.fix_str_cols on the last-year data vs. the renamed data.'''
    return clean.fix_str_cols(
        test_has_last_year_of_data, test_columns_are_renamed)

@pytest.fixture
def fixed_strings_all_years(test_columns_are_renamed):
    '''Result of clean.fix_str_cols applied to the renamed data against itself.'''
    return clean.fix_str_cols(
        test_columns_are_renamed, test_columns_are_renamed)

def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file):
    '''Each string-column value in the fixed dataframe should match the raw
    CSV value for the same (DataYear, ID) row, modulo GPS rounding.

    Changes from the original: removed a leftover debug print() in the
    assertion loop and renamed `id` so it no longer shadows the builtin.
    '''

    header_row = next(csv_file)
    str_col_positions = list(map(lambda col: fixed_strings_all_years.columns.get_loc(col), clean.string_cols))
    for csv_row in csv_file:
        # columns 0 and 1 of the raw CSV are DataYear and ID
        year, prop_id = csv_row[0], csv_row[1]
        row = fixed_strings_all_years[(fixed_strings_all_years['ID'].astype(str) == prop_id) & \
            (fixed_strings_all_years['DataYear'].astype(str) == year)]

        for col, csv_pos in zip(clean.string_cols, str_col_positions):
            # skip columns that are entirely NA for this row
            if all(pd.isna(row[col].to_numpy())):
                continue

            # The raw GPS in ChicagoEnergyBenchmarking.csv has 41.880451999999998, which gets
            # truncated, so we round to ignore that, since it's not a significant difference
            # TODO: Fix GPS inconsistency and drop rounding
            csv_value = csv_row[csv_pos]

            # If > 10 or < -10, we truncate 0 after rounding to 6 decimals. This means this applies
            # to GPS coordinates but not energy star ratings (e.g.)
            if (abs(float(csv_value)) > 10):
                csv_float = float(csv_value)
                csv_val_parsed = f'{csv_float:.9f}'.rstrip('0').rstrip('.')
            else:
                csv_val_parsed = csv_value

            assert row[col].to_numpy()[0] == csv_val_parsed

def test_lat_lon_become_strings(fixed_strings):
df = fixed_strings[['Latitude','Longitude']]
assert np.all(df.dtypes == 'string')

def test_int_values_remain_the_same_as_origin(test_has_last_year_of_data):
    '''Integer columns should end up as nullable Int64 after fix_int_cols.'''
    fixed = clean.fix_int_cols(test_has_last_year_of_data)
    assert (fixed[clean.int_cols].dtypes == 'Int64').all()

def test_csv_is_produced(test_has_last_year_of_data):
    '''clean.output_to_csv should write a CSV file to disk.'''
    destination = get_test_file_path(test_output_file)
    clean.output_to_csv(test_has_last_year_of_data, destination)
    assert os.path.exists(destination)
def processed_dataframe() -> pd.DataFrame:
'''Process our test data as per clean_and_pare_down_data_all_years.py
and return the resulting dataframe'''

@pytest.fixture
def process():
return clean.process(get_src_file_path(src_input_file), True)
input_filename = get_test_file_path(test_input_file)
df = clean_and_pare_down_data_all_years.process(input_filename, True)
assert df is not None
return df

def test_data_has_ranking_columns(process):
    '''Every building ranking column should appear in the processed data.'''
    for ranking_col in proc.building_cols_to_rank:
        assert ranking_col in process.columns
def test_data_has_positive_ghg_data(processed_dataframe):
'''confirm each property in the processed dataframe has non-zero GHGIntensity'''

df = processed_dataframe
assert all([ghg > 0 for ghg in df['GHGIntensity']])


def test_data_has_submitted_status(processed_dataframe):
'''confirm each property in the processed dataframe has a submitted status'''

df = processed_dataframe
for status in df['ReportingStatus']:
assert status in ('Submitted Data', 'Submitted')


def test_lat_long_are_unchanged(processed_dataframe, csv_reader):
'''confirm lat/long in the processed dataframe is unchanged from origin csv'''

df = processed_dataframe
df_lattitudes = [x for x in df['Latitude']]
df_longitudes = [x for x in df['Longitude']]
df_property_ids = [x for x in df['ID']]

for row in csv_reader:
csv_property_id = row['ID']
csv_lat = row['Latitude']
csv_long = row['Longitude']
if csv_property_id in df_property_ids:
i = df_property_ids.index(csv_property_id)
assert (csv_lat, csv_long) == (df_lattitudes[i], df_longitudes[i])


def test_one_entry_per_property(processed_dataframe):
'''confirm each property only has 1 entry in the processed dataframe'''

df = processed_dataframe
assert all([count == 1 for count in df['ID'].value_counts()])


def test_expected_columns_present(processed_dataframe):
'''confirm all expected columns are present in the processed dataframe'''

df = processed_dataframe
mandatory_columns = (
'DataYear',
'ID',
'PropertyName',
'ReportingStatus',
'Address',
'ZIPCode',
'ChicagoEnergyRating',
'ExemptFromChicagoEnergyRating',
'CommunityArea',
'PrimaryPropertyType',
'GrossFloorArea',
'TotalGHGEmissions',
'GHGIntensity',
'YearBuilt',
'NumberOfBuildings',
'WaterUse',
'ENERGYSTARScore',
'ElectricityUse',
'NaturalGasUse',
'DistrictSteamUse',
'DistrictChilledWaterUse',
'AllOtherFuelUse',
'SiteEUI',
'SourceEUI',
'WeatherNormalizedSiteEUI',
'WeatherNormalizedSourceEUI',
'Latitude',
'Longitude',
'Location',
'Row_ID',
'Wards',
'CommunityAreas',
'ZipCodes',
'CensusTracts',
'HistoricalWards2003-2015',
)
assert set(df.columns) == set(mandatory_columns)


def test_correct_year_selected(processed_dataframe):
'''confirm the correct DataYear is present in the processed dataframe
for a sample of properties'''

df = processed_dataframe

united_center_df = df[df['PropertyName']=='United Center']
united_center_df.reset_index(inplace=True, drop=True)
assert len(united_center_df) == 1
assert united_center_df.loc[0, 'DataYear'] == 2019

crown_hall_df = df[df['PropertyName']=='Crown Hall']
crown_hall_df.reset_index(inplace=True, drop=True)
assert len(crown_hall_df) == 1
assert crown_hall_df.loc[0, 'DataYear'] == 2021

bldg_138730_df = df[df['ID']==138730]
bldg_138730_df.reset_index(inplace=True, drop=True)
assert len(bldg_138730_df) == 1
assert bldg_138730_df.loc[0, 'DataYear'] == 2020


def test_property_count(processed_dataframe):
'''confirm the processed dataframe has the correct number of properties'''

df = processed_dataframe
assert len(df) == 4


def test_no_ghg_property_is_excluded(processed_dataframe):
'''confirm property with submitted data but no GHGIntensity data
ie excluded from the processed dataframe'''

df = processed_dataframe
# property ID 240068 is present in test source data but
# 2016-2022 submitted data has no GHGIntensity data
assert len(df[df['ID']=='240068']) == 0


def test_csv_is_produced(processed_dataframe):
    '''confirm clean_and_pare_down_data_all_years.output_to_csv creates
    a csv on disk'''

    destination = get_test_file_path(test_output_file)
    clean_and_pare_down_data_all_years.output_to_csv(processed_dataframe, destination)
    assert os.path.exists(destination)
8 changes: 4 additions & 4 deletions tests/data/source/test_output.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
DataYear,ID,PropertyName,ReportingStatus,Address,ZIPCode,ChicagoEnergyRating,ExemptFromChicagoEnergyRating,CommunityArea,PrimaryPropertyType,GrossFloorArea,YearBuilt,NumberOfBuildings,WaterUse,ENERGYSTARScore,ElectricityUse,NaturalGasUse,DistrictSteamUse,DistrictChilledWaterUse,AllOtherFuelUse,SiteEUI,SourceEUI,WeatherNormalizedSiteEUI,WeatherNormalizedSourceEUI,TotalGHGEmissions,GHGIntensity,Latitude,Longitude,Location,Row_ID,Wards,CommunityAreas,ZipCodes,CensusTracts,HistoricalWards2003-2015
2021,100856,United Center,Not Submitted,1901 W Madison St,60612,0.0,False,NEAR WEST SIDE,Indoor Arena,2289000.0,1994.0,1.0,,,,,,,,,,,,,,41.88067672,-87.67418207,"(41.88067672, -87.67418207)",2021-100856,46,29,21184,90,41
2021,160196,The Art Institute of Chicago,Not Submitted,111 S Michigan Ave,60603,0.0,False,LOOP,Museum,1008416.0,1892.0,1.0,,,,,,,,,,,,,,41.880452,-87.624229,"(41.880452, -87.624229)",2021-160196,36,38,14311,367,22
2021,256419,Crown Hall,Submitted,3360 S State Street,60616,1.0,False,DOUGLAS,College/University,54291.0,1955.0,1.0,,,1333307.2,0.0,451039945.6,0.0,,8332.4,10063.4,8332.4,10063.4,30138.8,555.1,41.842325,-87.62715344,"(41.842325, -87.62715344)",2021-256419,9,1,21194,377,12
2021,256458,United Center Office Building,Not Submitted,1901 W Madison St,60612,0.0,False,,,,,,,,,,,,,,,,,,,41.88125398,-87.67448493,"(41.88125398, -87.67448493)",2021-256458,46,29,21184,90,41
2019,100856,United Center,Submitted,1901 W Madison St,60612.0,2.0,False,NEAR WEST SIDE,Indoor Arena,960000.0,1994.0,2,206239.0,,102653875.6,15169580.2,,,,122.7,316.0,122.4,,17883.7,18.6,41.88067672,-87.67418207,"(41.88067672, -87.67418207)",2019-100856,46,29,21184,90,41
2020,138730,Grand Blvd Plaza,Submitted Data,5401 S WENTWORTH AVE,60609.0,3.0,False,FULLER PARK,Strip Mall,138730.0,1975.0,1,,,6245386.4,5872823.7,,,,87.4,170.5,87.9,172.0,1286.6,9.3,41.79622465,-87.63030493,"(41.79622465, -87.63030493)",2020-138730,9,3,14924,224,12
2022,160196,The Art Institute of Chicago,Submitted Data,111 S Michigan Ave,60603.0,1.0,False,LOOP,Museum,1008416.0,1892.0,1,,,80968968.1,158224778.6,0.0,0.0,,237.2,389.6,239.3,389.5,19068.8,18.9,41.880527821930805,-87.62420946585881,"(41.880527821930805, -87.62420946585881)",2022-160196,36,38,14311,367,22
2021,256419,Crown Hall,Submitted,3360 S State Street,60616.0,1.0,False,DOUGLAS,College/University,54291.0,1955.0,1,,,1333307.2,0.0,451039945.6,0.0,,8332.4,10063.4,8332.4,10063.4,30138.8,555.1,41.842325,-87.62715344,"(41.842325, -87.62715344)",2021-256419,9,1,21194,377,12
Loading

0 comments on commit 7e593df

Please sign in to comment.