Skip to content

Commit

Permalink
Merge branch 'main' into generate-historic-data
Browse files Browse the repository at this point in the history
  • Loading branch information
vkoves committed May 8, 2024
2 parents 5ce3747 + cc4aebd commit 7e593df
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 124 deletions.
24 changes: 12 additions & 12 deletions tests/data/scripts/create_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
src_input_file = 'ChicagoEnergyBenchmarking.csv'
test_input_file = 'test_src_data.csv'

property_test_cases = ['United Center', 'Crown Hall', 'Art Institute', 'Marie Curie']
property_ids_to_include = [
'100856', # United Center
'256419', # Crown Hall
'160196', # The Art Institute of Chicago
'138730', # random property
'240068', # random property w/ submitted data and no GHGIntensity data
]

def write_test_sample(reader: csv.reader, writer: csv.writer, property_ids_to_include: List[str]) -> csv.writer:
    '''Copy the header row plus every data row whose property ID is in
    property_ids_to_include from reader to writer.

    reader  - csv.reader over the full ChicagoEnergyBenchmarking export
    writer  - csv.writer receiving the pared-down test sample
    property_ids_to_include - property IDs (as strings) to keep

    Raises EOFError if the source CSV's first row is empty.
    '''

    header_row = next(reader)
    if len(header_row) <= 0:
        raise EOFError('ChicagoEnergyBenchmarking CSV file is empty!')
    else:
        writer.writerow(header_row)
    for row in reader:
        # column index 1 holds the property ID in the source CSV
        property_id = row[1]
        if property_id in property_ids_to_include:
            writer.writerow(row)

def main():
# the first console argument is technically the python script so we skip that
Expand All @@ -41,7 +41,7 @@ def main():

csvfile = open(get_test_file_path(target_path), 'w')
test_file = csv.writer(csvfile)
write_test_sample(src_csv, test_file, property_test_cases)
write_test_sample(src_csv, test_file, property_ids_to_include)
print('Copied source data from', src_path)
print('Copied test data to', target_path)

Expand Down
255 changes: 149 additions & 106 deletions tests/data/scripts/unit/test_clean_all_years.py
Original file line number Diff line number Diff line change
@@ -1,123 +1,166 @@
import pytest
import shutil, os, pathlib, csv
import os
import csv
import pandas as pd
import numpy as np

from src.data.scripts.utils import get_and_clean_csv
from src.data.scripts import clean_and_pare_down_data_all_years as clean, process_data as proc
from tests.data.scripts.utils import get_test_file_path, get_src_file_path
from src.data.scripts import clean_and_pare_down_data_all_years
from tests.data.scripts.utils import get_test_file_path

# Directory names used to locate source vs. test data files
src_dir = 'src'
test_dir = 'tests'
# Raw benchmarking CSV consumed by the cleaning script
src_input_file = 'ChicagoEnergyBenchmarking.csv'
# Pared-down copy of the source data that these tests read
test_input_file = 'test_src_data.csv'
# File written by the CSV-output test
test_output_file = 'test_output.csv'

@pytest.fixture
def src_building_data() -> pd.DataFrame:
    '''Load the test input CSV into a cleaned DataFrame.'''

    path_to_test_data = get_test_file_path(test_input_file)
    assert os.path.exists(path_to_test_data)
    return get_and_clean_csv(path_to_test_data)

@pytest.fixture
def csv_file() -> csv.reader:
    '''Yield a csv.reader over the test input CSV.

    Yields from inside a with-block so the file handle is closed after the
    consuming test finishes (the original returned a reader over a file
    that was never closed).
    '''
    with open(get_test_file_path(test_input_file)) as filehandle:
        yield csv.reader(filehandle)

def test_csv_file_has_some_data(csv_file):
first_line = csv_file.__next__()
assert first_line
assert len(first_line) > 0

@pytest.mark.parametrize("test_input", [
    clean.string_cols,
    clean.int_cols,
    clean.replace_headers
])
def test_is_not_empty(test_input):
    '''Each column list / header mapping in the cleaning module is non-empty.'''
    assert len(test_input) > 0

def test_src_data_exists(src_building_data):
assert src_building_data is not None

@pytest.fixture
def test_columns_are_renamed(src_building_data) -> pd.DataFrame:
df = clean.rename_columns(src_building_data)
assert df is not None
assert not df.columns.equals(src_building_data.columns)
return df

def test_data_has_positive_ghg_data(test_columns_are_renamed):
df = clean.get_buildings_with_ghg_intensity(test_columns_are_renamed)
assert df is not None
assert np.all(df['GHGIntensity'] > 0)
def csv_reader() -> csv.reader:
'''return a csv.DictReader of our test data CSV'''

def test_data_has_submitted_status(test_columns_are_renamed):
df = clean.get_submitted_data(test_columns_are_renamed)
assert np.all(df['ReportingStatus'].str.contains('Submitted'))
csv_path = get_test_file_path(test_input_file)
with open(csv_path) as filehandle:
# yield here so that the context manager (with...)
# can cleanup the open filehandle after we're done with
# the csv.DictReader
yield csv.DictReader(filehandle)

@pytest.fixture
def test_has_last_year_of_data(test_columns_are_renamed) -> pd.DataFrame:
    '''Reduce the renamed data to one (latest-year) row per building ID.'''

    latest_only = clean.get_last_year_data(test_columns_are_renamed)
    # every ID should appear exactly once after the reduction
    assert (latest_only['ID'].value_counts() == 1).all()
    return latest_only

@pytest.fixture
def fixed_strings(test_has_last_year_of_data, test_columns_are_renamed):
    '''Result of clean.fix_str_cols on the last-year data vs. the renamed data.'''
    return clean.fix_str_cols(
        test_has_last_year_of_data, test_columns_are_renamed)

@pytest.fixture
def fixed_strings_all_years(test_columns_are_renamed):
    '''Result of clean.fix_str_cols applied to the renamed data against itself.'''
    return clean.fix_str_cols(
        test_columns_are_renamed, test_columns_are_renamed)

def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file):
    '''Each string-column value in the fixed dataframe should match the raw
    CSV value for the same (DataYear, ID) row, modulo GPS rounding.

    Changes from the original: removed a leftover debug print() in the
    assertion loop and renamed `id` so it no longer shadows the builtin.
    '''

    header_row = next(csv_file)
    str_col_positions = list(map(lambda col: fixed_strings_all_years.columns.get_loc(col), clean.string_cols))
    for csv_row in csv_file:
        # columns 0 and 1 of the raw CSV are DataYear and ID
        year, prop_id = csv_row[0], csv_row[1]
        row = fixed_strings_all_years[(fixed_strings_all_years['ID'].astype(str) == prop_id) & \
            (fixed_strings_all_years['DataYear'].astype(str) == year)]

        for col, csv_pos in zip(clean.string_cols, str_col_positions):
            # skip columns that are entirely NA for this row
            if all(pd.isna(row[col].to_numpy())):
                continue

            # The raw GPS in ChicagoEnergyBenchmarking.csv has 41.880451999999998, which gets
            # truncated, so we round to ignore that, since it's not a significant difference
            # TODO: Fix GPS inconsistency and drop rounding
            csv_value = csv_row[csv_pos]

            # If > 10 or < -10, we truncate 0 after rounding to 6 decimals. This means this applies
            # to GPS coordinates but not energy star ratings (e.g.)
            if (abs(float(csv_value)) > 10):
                csv_float = float(csv_value)
                csv_val_parsed = f'{csv_float:.9f}'.rstrip('0').rstrip('.')
            else:
                csv_val_parsed = csv_value

            assert row[col].to_numpy()[0] == csv_val_parsed

def test_lat_lon_become_strings(fixed_strings):
df = fixed_strings[['Latitude','Longitude']]
assert np.all(df.dtypes == 'string')

def test_int_values_remain_the_same_as_origin(test_has_last_year_of_data):
    '''Integer columns should end up as nullable Int64 after fix_int_cols.'''
    fixed = clean.fix_int_cols(test_has_last_year_of_data)
    assert (fixed[clean.int_cols].dtypes == 'Int64').all()

def test_csv_is_produced(test_has_last_year_of_data):
    '''clean.output_to_csv should write a CSV file to disk.'''
    destination = get_test_file_path(test_output_file)
    clean.output_to_csv(test_has_last_year_of_data, destination)
    assert os.path.exists(destination)
def processed_dataframe() -> pd.DataFrame:
'''Process our test data as per clean_and_pare_down_data_all_years.py
and return the resulting dataframe'''

@pytest.fixture
def process():
return clean.process(get_src_file_path(src_input_file), True)
input_filename = get_test_file_path(test_input_file)
df = clean_and_pare_down_data_all_years.process(input_filename, True)
assert df is not None
return df

def test_data_has_ranking_columns(process):
    '''Every building ranking column should appear in the processed data.'''
    for ranking_col in proc.building_cols_to_rank:
        assert ranking_col in process.columns
def test_data_has_positive_ghg_data(processed_dataframe):
'''confirm each property in the processed dataframe has non-zero GHGIntensity'''

df = processed_dataframe
assert all([ghg > 0 for ghg in df['GHGIntensity']])


def test_data_has_submitted_status(processed_dataframe):
'''confirm each property in the processed dataframe has a submitted status'''

df = processed_dataframe
for status in df['ReportingStatus']:
assert status in ('Submitted Data', 'Submitted')


def test_lat_long_are_unchanged(processed_dataframe, csv_reader):
'''confirm lat/long in the processed dataframe is unchanged from origin csv'''

df = processed_dataframe
df_lattitudes = [x for x in df['Latitude']]
df_longitudes = [x for x in df['Longitude']]
df_property_ids = [x for x in df['ID']]

for row in csv_reader:
csv_property_id = row['ID']
csv_lat = row['Latitude']
csv_long = row['Longitude']
if csv_property_id in df_property_ids:
i = df_property_ids.index(csv_property_id)
assert (csv_lat, csv_long) == (df_lattitudes[i], df_longitudes[i])


def test_one_entry_per_property(processed_dataframe):
'''confirm each property only has 1 entry in the processed dataframe'''

df = processed_dataframe
assert all([count == 1 for count in df['ID'].value_counts()])


def test_expected_columns_present(processed_dataframe):
'''confirm all expected columns are present in the processed dataframe'''

df = processed_dataframe
mandatory_columns = (
'DataYear',
'ID',
'PropertyName',
'ReportingStatus',
'Address',
'ZIPCode',
'ChicagoEnergyRating',
'ExemptFromChicagoEnergyRating',
'CommunityArea',
'PrimaryPropertyType',
'GrossFloorArea',
'TotalGHGEmissions',
'GHGIntensity',
'YearBuilt',
'NumberOfBuildings',
'WaterUse',
'ENERGYSTARScore',
'ElectricityUse',
'NaturalGasUse',
'DistrictSteamUse',
'DistrictChilledWaterUse',
'AllOtherFuelUse',
'SiteEUI',
'SourceEUI',
'WeatherNormalizedSiteEUI',
'WeatherNormalizedSourceEUI',
'Latitude',
'Longitude',
'Location',
'Row_ID',
'Wards',
'CommunityAreas',
'ZipCodes',
'CensusTracts',
'HistoricalWards2003-2015',
)
assert set(df.columns) == set(mandatory_columns)


def test_correct_year_selected(processed_dataframe):
'''confirm the correct DataYear is present in the processed dataframe
for a sample of properties'''

df = processed_dataframe

united_center_df = df[df['PropertyName']=='United Center']
united_center_df.reset_index(inplace=True, drop=True)
assert len(united_center_df) == 1
assert united_center_df.loc[0, 'DataYear'] == 2019

crown_hall_df = df[df['PropertyName']=='Crown Hall']
crown_hall_df.reset_index(inplace=True, drop=True)
assert len(crown_hall_df) == 1
assert crown_hall_df.loc[0, 'DataYear'] == 2021

bldg_138730_df = df[df['ID']==138730]
bldg_138730_df.reset_index(inplace=True, drop=True)
assert len(bldg_138730_df) == 1
assert bldg_138730_df.loc[0, 'DataYear'] == 2020


def test_property_count(processed_dataframe):
'''confirm the processed dataframe has the correct number of properties'''

df = processed_dataframe
assert len(df) == 4


def test_no_ghg_property_is_excluded(processed_dataframe):
'''confirm property with submitted data but no GHGIntensity data
ie excluded from the processed dataframe'''

df = processed_dataframe
# property ID 240068 is present in test source data but
# 2016-2022 submitted data has no GHGIntensity data
assert len(df[df['ID']=='240068']) == 0


def test_csv_is_produced(processed_dataframe):
    '''confirm clean_and_pare_down_data_all_years.output_to_csv creates
    a csv on disk'''

    destination = get_test_file_path(test_output_file)
    clean_and_pare_down_data_all_years.output_to_csv(processed_dataframe, destination)
    assert os.path.exists(destination)
8 changes: 4 additions & 4 deletions tests/data/source/test_output.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
DataYear,ID,PropertyName,ReportingStatus,Address,ZIPCode,ChicagoEnergyRating,ExemptFromChicagoEnergyRating,CommunityArea,PrimaryPropertyType,GrossFloorArea,YearBuilt,NumberOfBuildings,WaterUse,ENERGYSTARScore,ElectricityUse,NaturalGasUse,DistrictSteamUse,DistrictChilledWaterUse,AllOtherFuelUse,SiteEUI,SourceEUI,WeatherNormalizedSiteEUI,WeatherNormalizedSourceEUI,TotalGHGEmissions,GHGIntensity,Latitude,Longitude,Location,Row_ID,Wards,CommunityAreas,ZipCodes,CensusTracts,HistoricalWards2003-2015
2021,100856,United Center,Not Submitted,1901 W Madison St,60612,0.0,False,NEAR WEST SIDE,Indoor Arena,2289000.0,1994.0,1.0,,,,,,,,,,,,,,41.88067672,-87.67418207,"(41.88067672, -87.67418207)",2021-100856,46,29,21184,90,41
2021,160196,The Art Institute of Chicago,Not Submitted,111 S Michigan Ave,60603,0.0,False,LOOP,Museum,1008416.0,1892.0,1.0,,,,,,,,,,,,,,41.880452,-87.624229,"(41.880452, -87.624229)",2021-160196,36,38,14311,367,22
2021,256419,Crown Hall,Submitted,3360 S State Street,60616,1.0,False,DOUGLAS,College/University,54291.0,1955.0,1.0,,,1333307.2,0.0,451039945.6,0.0,,8332.4,10063.4,8332.4,10063.4,30138.8,555.1,41.842325,-87.62715344,"(41.842325, -87.62715344)",2021-256419,9,1,21194,377,12
2021,256458,United Center Office Building,Not Submitted,1901 W Madison St,60612,0.0,False,,,,,,,,,,,,,,,,,,,41.88125398,-87.67448493,"(41.88125398, -87.67448493)",2021-256458,46,29,21184,90,41
2019,100856,United Center,Submitted,1901 W Madison St,60612.0,2.0,False,NEAR WEST SIDE,Indoor Arena,960000.0,1994.0,2,206239.0,,102653875.6,15169580.2,,,,122.7,316.0,122.4,,17883.7,18.6,41.88067672,-87.67418207,"(41.88067672, -87.67418207)",2019-100856,46,29,21184,90,41
2020,138730,Grand Blvd Plaza,Submitted Data,5401 S WENTWORTH AVE,60609.0,3.0,False,FULLER PARK,Strip Mall,138730.0,1975.0,1,,,6245386.4,5872823.7,,,,87.4,170.5,87.9,172.0,1286.6,9.3,41.79622465,-87.63030493,"(41.79622465, -87.63030493)",2020-138730,9,3,14924,224,12
2022,160196,The Art Institute of Chicago,Submitted Data,111 S Michigan Ave,60603.0,1.0,False,LOOP,Museum,1008416.0,1892.0,1,,,80968968.1,158224778.6,0.0,0.0,,237.2,389.6,239.3,389.5,19068.8,18.9,41.880527821930805,-87.62420946585881,"(41.880527821930805, -87.62420946585881)",2022-160196,36,38,14311,367,22
2021,256419,Crown Hall,Submitted,3360 S State Street,60616.0,1.0,False,DOUGLAS,College/University,54291.0,1955.0,1,,,1333307.2,0.0,451039945.6,0.0,,8332.4,10063.4,8332.4,10063.4,30138.8,555.1,41.842325,-87.62715344,"(41.842325, -87.62715344)",2021-256419,9,1,21194,377,12
Loading

0 comments on commit 7e593df

Please sign in to comment.