-
-
Notifications
You must be signed in to change notification settings - Fork 111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Eia176 wide table #3590
base: main
Are you sure you want to change the base?
Eia176 wide table #3590
Changes from 2 commits
b81ff5b
20ebdb9
9d9e590
b399e58
a921da2
1e9e482
a64a614
93002d0
a59aac4
3aee64e
c11545c
2824d7a
54523a4
df1eab8
489d657
9a71166
d3c7a8c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
"""Module to perform data cleaning functions on EIA176 data tables.""" | ||
|
||
import warnings | ||
|
||
import pandas as pd | ||
from dagster import ExperimentalWarning, asset, asset_check | ||
|
||
from pudl.logging_helpers import get_logger | ||
|
||
logger = get_logger(__name__) | ||
|
||
# Asset Checks are still Experimental, silence the warning since we use them | ||
# everywhere. | ||
warnings.filterwarnings("ignore", category=ExperimentalWarning) | ||
|
||
|
||
@asset | ||
def _core_eia176__data(raw_eia176__data: pd.DataFrame) -> pd.DataFrame: | ||
"""Take entity-attribute-value rows and convert to rows with primary key and one column per variable.""" | ||
raw_eia176__data["variable_name"] = ( | ||
raw_eia176__data["line"] + "_" + raw_eia176__data["atype"] | ||
) | ||
primary_key = ["report_year", "area", "id"] | ||
variable_names = list(raw_eia176__data.groupby("variable_name").count().index) | ||
wide_table = pd.DataFrame(columns=primary_key + variable_names) | ||
|
||
granular_data = raw_eia176__data[ | ||
(raw_eia176__data["company"] != " Total of All Companies") | ||
] | ||
for report_year, area, id_ in granular_data.groupby(primary_key).count().index: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
It looks to me like you're trying to manually do an unstack() operation (take the variable names in the `variable_name` column and turn them into their own columns). I was able to get a similarly shaped result across all years in ~5s by leveraging the built-in unstack():
primary_key = ["report_year", "area", "id"]
raw_eia176__data["variable_name"] = (
raw_eia176__data["line"] + "_" + raw_eia176__data["atype"]
)
# TODO should probably sanitize this company name somewhere beforehand
granular = raw_eia176__data.loc[
raw_eia176__data.company.str.strip().str.lower() != "total of all companies"
]
unstacked = (
granular
.drop(columns=["itemsort", "item", "atype", "line", "company"])
.set_index(primary_key + ["variable_name"])
.unstack(level="variable_name")
)
# columns is a weird multi-index with ("value", "actual column name") - clean that up
unstacked.columns = unstacked.columns.droplevel(0)
unstacked.columns.name = None # gets rid of "variable_name" name of columns index
# TODO instead of "first NA value we see in each column" applied willy-nilly, we could check to see if there are any conflicting non-null values using .count() first.
condensed = unstacked.groupby(level=primary_key).first().reset_index()
    return condensed
One sort of weird thing that I'm curious about - in #3501 it looks like we wanted to use
existing code, all years
They seem to be the same if you There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. I was looking for a built-in function and it wasn't a simple transpose. I've worked this in but need to get the pre-commit hooks passing. Will chip away at this over the next few days. |
||
# Get the data corresponding to one completed form EIA-176 | ||
form_data = granular_data[ | ||
(granular_data["report_year"] == report_year) | ||
& (granular_data["area"] == area) | ||
& (granular_data["id"] == id_) | ||
] | ||
|
||
wide_row = {"report_year": report_year, "area": area, "id": id_} | ||
|
||
# Translate each piece of data entered into the form into its own column | ||
for record in form_data.iterrows(): | ||
form_row = record[1] | ||
wide_row[form_row["variable_name"]] = form_row["value"] | ||
|
||
wide_table.loc[len(wide_table.index)] = wide_row | ||
|
||
return wide_table | ||
|
||
|
||
@asset_check(asset=_core_eia176__data, blocking=True) | ||
def validate_totals(): | ||
"""Compare reported and calculated totals for different geographical aggregates, report any differences.""" | ||
|
||
|
||
def _compare_totals( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was helpful for validating raw inputs before tackling how to work with them, but I think I want to adapt this to just validate the output of the transposition function above. Planning to do that and roll it into the |
||
reported_totals: pd.DataFrame, | ||
calculated_totals: pd.DataFrame, | ||
groupby_cols: list[str], | ||
) -> pd.DataFrame: | ||
"""Compare two dataframes representing reporting and calculated totals.""" | ||
reset_calculated = ( | ||
calculated_totals.sort_values(by=groupby_cols) | ||
.reset_index()[groupby_cols + ["value"]] | ||
.round(2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can use |
||
) | ||
|
||
reset_reported = ( | ||
reported_totals.sort_values(by=groupby_cols) | ||
.reset_index()[groupby_cols + ["value"]] | ||
.fillna(0) | ||
) | ||
|
||
return reset_calculated.compare(reset_reported) | ||
|
||
|
||
# TODO: Reasonable boundaries -- in a script/notebook in the 'validate' directory? How are those executed? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Other ideas on validations to cover here as an asset check? I also see reasonable boundaries in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't really know much about the actual semantics of the gas data - I think a reasonable thing to do is graph some of the different variables over time and see if anything jumps out as "suspicious", then bring that up and we can try to research that together. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Happy to just get min, max, and enumerated values based on data so far to validate against, maybe in a follow-up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This sounds like a perfect follow-up PR! |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import pandas as pd | ||
|
||
from pudl.transform.eia176 import _compare_totals, _core_eia176__data | ||
|
||
|
||
def get_test_df(): | ||
col_names = ["area", "atype", "company", "id", "line", "report_year", "value"] | ||
df = pd.DataFrame(columns=col_names) | ||
df.loc[0] = [ | ||
"New Mexico", | ||
"VL", | ||
"ZIA NATURAL GAS", | ||
"17635019NM", | ||
"1010", | ||
"2022", | ||
2013231.0, | ||
] | ||
df.loc[1] = [ | ||
"New Mexico", | ||
"VL", | ||
" Total of All Companies", | ||
"17635019NM", | ||
"1010", | ||
"2022", | ||
2013231.0, | ||
] | ||
|
||
return df | ||
|
||
|
||
def test_core_eia176__data(): | ||
eav_model = get_test_df() | ||
|
||
wide_table = _core_eia176__data(eav_model) | ||
assert wide_table.shape == (1, 4) | ||
row = wide_table.loc[0] | ||
assert list(row.index) == ["report_year", "area", "id", "1010_VL"] | ||
assert list(row.values) == ["2022", "New Mexico", "17635019NM", 2013231.0] | ||
|
||
|
||
def test_compare_totals_no_diff(): | ||
states_df = get_test_df() | ||
states_df.loc[2] = [ | ||
"New Mexico", | ||
"VL", | ||
"ANOTHER COMPANY", | ||
"12345679NM", | ||
"1010", | ||
"2022", | ||
1.0, | ||
] | ||
states_df.loc[1, "value"] += 1 | ||
|
||
reported_state_totals = states_df[states_df["company"] == " Total of All Companies"] | ||
state_companies_df = states_df[states_df["company"] != " Total of All Companies"] | ||
state_level_cols = ["report_year", "area", "line", "atype"] | ||
calculated_state_totals = state_companies_df.groupby(state_level_cols).sum() | ||
assert _compare_totals( | ||
reported_state_totals, calculated_state_totals, state_level_cols | ||
).empty | ||
|
||
|
||
# TODO: Test on specific details here | ||
def test_compare_totals_diff(): | ||
states_df = get_test_df() | ||
states_df.loc[2] = [ | ||
"New Mexico", | ||
"VL", | ||
"ANOTHER COMPANY", | ||
"12345679NM", | ||
"1010", | ||
"2022", | ||
1.0, | ||
] | ||
|
||
reported_state_totals = states_df[states_df["company"] == " Total of All Companies"] | ||
state_companies_df = states_df[states_df["company"] != " Total of All Companies"] | ||
state_level_cols = ["report_year", "area", "line", "atype"] | ||
calculated_state_totals = state_companies_df.groupby(state_level_cols).sum() | ||
assert not _compare_totals( | ||
reported_state_totals, calculated_state_totals, state_level_cols | ||
).empty | ||
|
||
|
||
# TODO: Implement, if we can even unit-test a function annotated as an asset check | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be able to! In my mind, asset checks are just assets, which can be called directly... |
||
def test_validate__totals(): | ||
pass |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI: They've graduated from "experimental" since you opened this draft PR, we can take this out! Benefits of waiting a month to review your PR 😅