vkoves · vkoves · Apr 2, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/.github/workflows/eslint.yml b/.github/workflows/eslint.yml
@@ -1,7 +1,3 @@
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
 # ESLint is a tool for identifying and reporting on patterns
 # found in ECMAScript/JavaScript code.
 # More details at https://github.com/eslint/eslint

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -0,0 +1,28 @@
+# A GitHub Actions workflow that runs our pytest data tests
+
+name: Pytest Data Tests
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ "main" ]
+
+jobs:
+  pytest:
+    name: 'Pytest'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Test with pytest
+        run: |
+          pytest
diff --git a/tests/data/scripts/unit/test_clean_all_years.py b/tests/data/scripts/unit/test_clean_all_years.py
@@ -22,7 +22,7 @@ def src_building_data() -> pd.DataFrame:
 @pytest.fixture
 def csv_file() -> csv.reader:
     csvfile = open(get_test_file_path(test_input_file))
-    return csv.reader(csvfile) 
+    return csv.reader(csvfile)
 
 def test_csv_file_has_some_data(csv_file):
     first_line = csv_file.__next__()
@@ -48,7 +48,7 @@ def test_columns_are_renamed(src_building_data) -> pd.DataFrame:
     return df
 
 def test_data_has_positive_ghg_data(test_columns_are_renamed):
-    df = clean.get_all_ghg_data(test_columns_are_renamed)
+    df = clean.get_buildings_with_ghg_intensity(test_columns_are_renamed)
     assert df is not None
     assert np.all(df['GHGIntensity'] > 0)
 
@@ -64,12 +64,12 @@ def test_has_last_year_of_data(test_columns_are_renamed) -> pd.DataFrame:
 
 @pytest.fixture
 def fixed_strings(test_has_last_year_of_data, test_columns_are_renamed):
-    return clean.fix_str_cols(test_has_last_year_of_data, 
+    return clean.fix_str_cols(test_has_last_year_of_data,
                               test_columns_are_renamed)
 
 @pytest.fixture
 def fixed_strings_all_years(test_columns_are_renamed):
-    return clean.fix_str_cols(test_columns_are_renamed, 
+    return clean.fix_str_cols(test_columns_are_renamed,
                               test_columns_are_renamed)
 
 def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file):
@@ -79,16 +79,32 @@ def test_str_values_remain_the_same_as_origin(fixed_strings_all_years, csv_file)
         year, id = csv_row[0], csv_row[1]
         row = fixed_strings_all_years[(fixed_strings_all_years['ID'].astype(str) == id) & \
                                       (fixed_strings_all_years['DataYear'].astype(str) == year)]
+
         for col, csv_pos in zip(clean.string_cols, str_col_positions):
             if all(pd.isna(row[col].to_numpy())):
                 continue
-            # print("df ", row[col].to_numpy(), "csv ", csv_row[csv_pos])
-            assert row[col].to_numpy()[0] == csv_row[csv_pos]
+
+            # The raw GPS in ChicagoEnergyBenchmarking.csv has 41.880451999999998, which gets
+            # truncated, so we round to ignore that, since it's not a significant difference
+            # TODO: Fix GPS inconsistency and drop rounding
+            csv_value = csv_row[csv_pos]
+
+
+            # If the value is > 10 or < -10, we round it to 9 decimals and strip trailing zeros.
+            # This applies to GPS coordinates but not to, e.g., energy star ratings
+            if (abs(float(csv_value)) > 10):
+                print("df ", row[col].to_numpy(), "csv ", csv_value)
+                csv_float = float(csv_value)
+                csv_val_parsed = f'{csv_float:.9f}'.rstrip('0').rstrip('.')
+            else:
+                csv_val_parsed = csv_value
+
+            assert row[col].to_numpy()[0] == csv_val_parsed
 
 def test_lat_lon_become_strings(fixed_strings):
     df = fixed_strings[['Latitude','Longitude']]
     assert np.all(df.dtypes == 'string')
-    
+
 def test_int_values_remain_the_same_as_origin(test_has_last_year_of_data):
     df = clean.fix_int_cols(test_has_last_year_of_data)
     assert np.all(df[clean.int_cols].dtypes == 'Int64')