Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Python Data Tests & Add to CI Via GitHub Actions #80

Merged
merged 6 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/eslint.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# ESLint is a tool for identifying and reporting on patterns
# found in ECMAScript/JavaScript code.
# More details at https://github.com/eslint/eslint
Expand Down
28 changes: 28 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# A GitHub action to run our data tests

name: Pytest Data Tests

on:
push:
branches: [ "main" ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ "main" ]

jobs:
pytest:
name: 'Pytest'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Test with pytest
run: |
pytest
30 changes: 23 additions & 7 deletions tests/data/scripts/unit/test_clean_all_years.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def src_building_data() -> pd.DataFrame:
@pytest.fixture
def csv_file() -> csv.reader:
csvfile = open(get_test_file_path(test_input_file))
return csv.reader(csvfile)
return csv.reader(csvfile)

def test_csv_file_has_some_data(csv_file):
first_line = csv_file.__next__()
Expand All @@ -48,7 +48,7 @@ def test_columns_are_renamed(src_building_data) -> pd.DataFrame:
return df

def test_data_has_positive_ghg_data(test_columns_are_renamed):
df = clean.get_all_ghg_data(test_columns_are_renamed)
df = clean.get_buildings_with_ghg_intensity(test_columns_are_renamed)
assert df is not None
assert np.all(df['GHGIntensity'] > 0)

Expand All @@ -64,12 +64,12 @@ def test_has_last_year_of_data(test_columns_are_renamed) -> pd.DataFrame:

@pytest.fixture
def fixed_strings(test_has_last_year_of_data, test_columns_are_renamed):
return clean.fix_str_cols(test_has_last_year_of_data,
return clean.fix_str_cols(test_has_last_year_of_data,
test_columns_are_renamed)

@pytest.fixture
def fixed_strings_all_years(test_columns_are_renamed):
return clean.fix_str_cols(test_columns_are_renamed,
return clean.fix_str_cols(test_columns_are_renamed,
test_columns_are_renamed)

def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file):
Expand All @@ -79,16 +79,32 @@ def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file)
year, id = csv_row[0], csv_row[1]
row = fixed_strings_all_years[(fixed_strings_all_years['ID'].astype(str) == id) & \
(fixed_strings_all_years['DataYear'].astype(str) == year)]

for col, csv_pos in zip(clean.string_cols, str_col_positions):
if all(pd.isna(row[col].to_numpy())):
continue
# print("df ", row[col].to_numpy(), "csv ", csv_row[csv_pos])
assert row[col].to_numpy()[0] == csv_row[csv_pos]

# The raw GPS in ChicagoEnergyBenchmarking.csv has 41.880451999999998, which gets
# truncated, so we round to ignore that, since it's not a significant difference
# TODO: Fix GPS inconsistency and drop rounding
csv_value = csv_row[csv_pos]


# If > 10 or < -10, we round to 9 decimals and then strip any trailing zeros (and a trailing
# decimal point). This means this applies to GPS coordinates but not energy star ratings (e.g.)
if (abs(float(csv_value)) > 10):
print("df ", row[col].to_numpy(), "csv ", csv_value)
csv_float = float(csv_value)
csv_val_parsed = f'{csv_float:.9f}'.rstrip('0').rstrip('.')
else:
csv_val_parsed = csv_value

assert row[col].to_numpy()[0] == csv_val_parsed

def test_lat_lon_become_strings(fixed_strings):
df = fixed_strings[['Latitude','Longitude']]
assert np.all(df.dtypes == 'string')

def test_int_values_remain_the_same_as_origin(test_has_last_year_of_data):
df = clean.fix_int_cols(test_has_last_year_of_data)
assert np.all(df[clean.int_cols].dtypes == 'Int64')
Expand Down
Loading