Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate Historic Data, Showing In Simple Table #94

Merged
merged 14 commits into from
May 8, 2024
32 changes: 29 additions & 3 deletions gridsome.server.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ const parse = require('csv-parse/sync').parse;
// Directory that the CSV data files below are read from
const DataDirectory = './src/data/dist/';

// CSV loaded by loadBuildingBenchmarkData into the 'Building' collection
const BuildingEmissionsDataFile = 'building-benchmarks.csv';
// CSV loaded by loadHistoricBenchmarkDat into the 'Benchmark' collection
const HistoricBenchmarkingDataFile = 'benchmarking-all-years.csv';

// This is an array equivalent of Object.keys(BuildingOwners), but this file can't use TypeScript
// and so can't import that file
Expand All @@ -34,6 +35,7 @@ module.exports = function(api) {
// Use the Data Store API here: https://gridsome.org/docs/data-store-api/
api.loadSource(async (actions) => {
loadBuildingBenchmarkData(actions);
loadHistoricBenchmarkDat(actions);
});

// Use the Pages API here: https://gridsome.org/docs/pages-api/
Expand All @@ -56,19 +58,19 @@ module.exports = function(api) {
* @param {unknown} actions The actions class?
*/
function loadBuildingBenchmarkData(actions) {
const input = readFileSync(`${DataDirectory}${BuildingEmissionsDataFile}`, 'utf8');
const latestBenchmarksRaw = readFileSync(`${DataDirectory}${BuildingEmissionsDataFile}`, 'utf8');

/**
* Load in building benchmarks and expose as Buildings collection
*/
const BuildingsData = parse(input, {
const LatestBenchmarksData = parse(latestBenchmarksRaw, {
columns: true,
skip_empty_lines: true,
});

const collection = actions.addCollection({typeName: 'Building'});

for (const building of BuildingsData) {
for (const building of LatestBenchmarksData) {
// Make a slugSource that is the property name or the address as a fallback (skip one letter
// names, e.g. '-)
building.slugSource = building.PropertyName.length > 1 ? building.PropertyName : building.Address;
Expand All @@ -80,3 +82,27 @@ function loadBuildingBenchmarkData(actions) {
collection.addNode(building);
}
}


/**
 * Read the historic (all years) benchmarking CSV and register every row of it as a node in a
 * 'Benchmark' collection.
 *
 * @param {unknown} actions The object handed to the api.loadSource callback; only its
 *    addCollection method is used here
 */
function loadHistoricBenchmarkDat(actions) {
  const csvPath = `${DataDirectory}${HistoricBenchmarkingDataFile}`;
  const csvText = readFileSync(csvPath, 'utf8');

  // `columns: true` asks csv-parse to return one object per row, keyed by the header row
  const parseOptions = {
    columns: true,
    skip_empty_lines: true,
  };
  const rows = parse(csvText, parseOptions);

  // Expose the historic benchmarks as their own collection, one node per CSV row
  const benchmarks = actions.addCollection({ typeName: 'Benchmark' });

  rows.forEach((row) => {
    benchmarks.addNode(row);
  });
}
13 changes: 13 additions & 0 deletions src/common-functions.vue
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ export interface IBuilding {
/** How GraphQL passes back a building */
export interface IBuildingNode { node: IBuilding }

/**
 * One entry of historic benchmarking data: an ID and a DataYear plus a set of metric fields, all
 * typed as strings.
 *
 * NOTE(review): presumably ID is the building's ID and the fields mirror columns of the historic
 * benchmarking CSV - confirm against the data and the query that produces these objects.
 */
export interface IHistoricData {
ID: string;
DataYear: string;
GrossFloorArea: string;
ChicagoEnergyRating: string;
ENERGYSTARScore: string;
SourceEUI: string;
ElectricityUse: string;
GHGIntensity: string;
NaturalGasUse: string;
DistrictSteamUse: string;
}

/**
* A constant for what we use as min and max values for flagged ranks
*/
Expand Down
4 changes: 2 additions & 2 deletions src/components/BuildingsTable.vue
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export default class BuildingsTable extends Vue {
</script>

<template>
<div class="table-cont">
<div class="buildings-table-cont">
<table :class="{ '-wide': showSquareFootage || showGasUse || showElectricityUse }">
<thead>
<tr>
Expand Down Expand Up @@ -196,7 +196,7 @@ export default class BuildingsTable extends Vue {
<style lang="scss">
// Make the whole table scroll in a constrained container so we can have a sticky header - CSS makes
// that impossible otherwise
.table-cont {
.buildings-table-cont {
width: 100%;
max-height: 80vh;
overflow: auto;
Expand Down
2 changes: 0 additions & 2 deletions src/components/StatTile.vue
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,6 @@ export default class StatTile extends Vue {
const statStdDeviation = this.stats[this.statKey]?.std;
const statMean = this.stats[this.statKey]?.mean;

console.log({ key: this.statKey, statStdDeviation, statMean });

if (this.building[this.statKey] === null || !statStdDeviation) {
return false;
}
Expand Down
19,136 changes: 19,136 additions & 0 deletions src/data/dist/benchmarking-all-years.csv

Large diffs are not rendered by default.

86 changes: 72 additions & 14 deletions src/data/scripts/clean_and_pare_down_data_all_years.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,17 @@
from src.data.scripts.utils import get_and_clean_csv, get_data_file_path

file_dir = 'source'
out_file_dir = 'dist'

# The source file we read from
building_emissions_file = 'ChicagoEnergyBenchmarking.csv'
data_out_file = 'ChicagoEnergyBenchmarkingAllNewestInstances.csv'

# The output file we generate that has all columns, but just for the latest year reported
newest_instances_out_filename = 'ChicagoEnergyBenchmarkingAllNewestInstances.csv'

# The output file we generate with limited columns but for ALL years (reported and non-reported),
# allowing us to track metrics (e.g. emissions, GHG intensity) over time, and reporting status
all_years_out_filename = 'benchmarking-all-years.csv'

# Columns that should be strings because they are immutable identifiers
string_cols = [
Expand All @@ -25,6 +34,30 @@
'HistoricalWards2003-2015'
]

# The columns we want to have in our historical data output - we need the ID (to filter by a
# particular building) and should then have columns of interest that change over time (so yes to
# 'GHGIntensity', no to 'YearBuilt').
#
# filter_cols_historic selects exactly these columns from the (renamed) data, so every name here
# must be a post-rename column name.
columns_to_track_over_time = [
'ID',
'DataYear',
'ReportingStatus',
'GrossFloorArea',
'TotalGHGEmissions',
'GHGIntensity',
'NumberOfBuildings',
'ChicagoEnergyRating',
'ENERGYSTARScore',
'ElectricityUse',
'NaturalGasUse',
'DistrictSteamUse',
'DistrictChilledWaterUse',
'AllOtherFuelUse',
'SiteEUI',
'SourceEUI',
'WeatherNormalizedSiteEUI',
'WeatherNormalizedSourceEUI',
]

replace_headers = {'Data Year': 'DataYear',
'ID': 'ID',
'Property Name': 'PropertyName',
Expand Down Expand Up @@ -65,22 +98,35 @@ def rename_columns(building_data: pd.DataFrame) -> pd.DataFrame:
return building_data.rename(columns=replace_headers)

def get_buildings_with_ghg_intensity(building_data: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose 'GHGIntensity' is greater than zero.

    Rows without a positive greenhouse gas intensity are likely empty or junk data. The result is
    an independent copy, so the input frame is left untouched.
    """
    has_positive_intensity = building_data['GHGIntensity'] > 0

    return building_data.loc[has_positive_intensity].copy()

def get_submitted_data(building_data: pd.DataFrame) -> pd.DataFrame:
    """Keep only the entries whose 'ReportingStatus' marks them as having reported data.

    Two spellings of the status count as submitted: 'Submitted' and 'Submitted Data'. The result
    is an independent copy of the matching rows.
    """
    submitted_statuses = ['Submitted', 'Submitted Data']
    has_status_submitted = building_data['ReportingStatus'].isin(submitted_statuses)

    return building_data.loc[has_status_submitted].copy()

def get_last_year_data(all_submitted_data: pd.DataFrame) -> pd.DataFrame:
    """Reduce the data to a single row per building: the one with the highest 'DataYear'.

    Rows are ordered by 'ID' and then 'DataYear', after which only the last row of each 'ID' is
    kept. The result is an independent copy, sorted by 'ID'.
    """
    ordered_by_building_and_year = all_submitted_data.sort_values(by=['ID', 'DataYear'])
    latest_per_building = ordered_by_building_and_year.drop_duplicates(subset=['ID'], keep='last')

    return latest_per_building.copy()


def filter_cols_historic(building_data: pd.DataFrame) -> pd.DataFrame:
    """Narrow the reporting entries down to the columns used in our historical data CSV.

    The columns kept are those listed in `columns_to_track_over_time`; all rows are retained.
    """
    historic_columns_only = building_data[columns_to_track_over_time]

    return historic_columns_only

def fix_str_cols(all_recent_submitted_data: pd.DataFrame, renamed_building_data: pd.DataFrame) -> pd.DataFrame:
    """Store the identifier-like columns as strings so they never show up with decimals.

    Columns that look like numbers but are really identifiers (e.g. zipcode of 60614 or Ward 9)
    are listed in `string_cols`. Their values are taken from `renamed_building_data`, converted
    to the pandas 'string' dtype and written onto `all_recent_submitted_data`.

    Note that `all_recent_submitted_data` is modified in place and that same object is returned.
    """
    identifiers_as_strings = renamed_building_data[string_cols].astype('string')
    all_recent_submitted_data[string_cols] = identifiers_as_strings

    return all_recent_submitted_data

Expand All @@ -89,29 +135,41 @@ def fix_int_cols(building_data: pd.DataFrame) -> pd.DataFrame:
return building_data

def output_to_csv(building_data: pd.DataFrame, dir: str) -> None:
    """Write the given data out as a comma-separated, UTF-8 encoded CSV file.

    `dir` is the path of the file to write (despite the name, it is passed straight to
    `DataFrame.to_csv` as the output target). The DataFrame index is not written.
    """
    csv_options = {
        'sep': ',',
        'encoding': 'utf-8',
        'index': False,
    }

    building_data.to_csv(dir, **csv_options)

def process(file_path: str, latest_year_only: bool) -> pd.DataFrame:
    """Process an input file, renaming columns and applying filters based on whether we are getting
    only the latest year for each building or all historic data.

    Args:
        file_path: Path of the source CSV, handed to `get_and_clean_csv`.
        latest_year_only: When True, keep only submitted entries and then only the latest
            reported year per building (all columns). When False, keep every year but only the
            columns used for the historical data output.

    Returns:
        The processed DataFrame, ready to be written out with `output_to_csv`.
    """
    building_data = get_and_clean_csv(file_path)

    building_data = rename_columns(building_data)

    # Fix the column types up front, before any filtering. Both arguments are the same frame
    # here; this used to be fix_str_cols(cleaned_data, building_data) when it ran after the
    # filtering.
    cleaned_data = fix_str_cols(building_data, building_data)
    cleaned_data = fix_int_cols(cleaned_data)

    # Filter the *cleaned* frame rather than restarting from building_data, so the type fixes
    # above are carried forward explicitly instead of relying on the helpers mutating
    # building_data in place.
    # NOTE(review): this filter also applies to the all-years output, so rows without a positive
    # GHG intensity are dropped there too - confirm that is intended for non-reported years.
    cleaned_data = get_buildings_with_ghg_intensity(cleaned_data)

    # Only filter to the latest reporting year if that's the file we're generating
    if latest_year_only:
        cleaned_data = get_submitted_data(cleaned_data)
        cleaned_data = get_last_year_data(cleaned_data)
    else:
        cleaned_data = filter_cols_historic(cleaned_data)

    return cleaned_data

def main():
    """Generate both output CSVs (latest year per building, and all years) from the source file."""
    latest_per_building = process(
        get_data_file_path(file_dir, building_emissions_file),
        latest_year_only=True,
    )
    every_year = process(
        get_data_file_path(file_dir, building_emissions_file),
        latest_year_only=False,
    )

    # The latest year data goes back into the source directory, since other processing steps
    # still get applied to it
    newest_instances_path = get_data_file_path(file_dir, newest_instances_out_filename)
    output_to_csv(latest_per_building, newest_instances_path)

    # The all years data is already in its final form (no ranks or stats are calculated off of it
    # yet), so it goes straight to the output directory
    all_years_path = get_data_file_path(out_file_dir, all_years_out_filename)
    output_to_csv(every_year, all_years_path)


if __name__ == '__main__':
    main()
Loading
Loading