Use read_csv_to_dataframe in validation #1419

Merged · 19 commits · Nov 9, 2023
3 changes: 3 additions & 0 deletions HISTORY.rst
@@ -49,6 +49,9 @@ Unreleased Changes
(`#1374 <https://github.com/natcap/invest/issues/1374>`_)
* Datastack archives will now be correctly extracted
(`#1308 <https://github.com/natcap/invest/issues/1308>`_)
* Validation of tables has been improved and standardized, which should
result in more readable validation errors.
(`#1379 <https://github.com/natcap/invest/issues/1379>`_)
* Updated to ``pygeoprocessing`` 2.4.2. This includes an update to
``pygeoprocessing.zonal_statistics``, which is now more correct on certain
edge cases. Aggregated model results may change slightly.
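Every substantive change in this diff follows the same pattern: the old `utils.read_csv_to_dataframe(path, spec)` took the arg spec as a single positional dict, while the new `validation.get_validated_dataframe(path, **spec)` unpacks that spec into keyword arguments. A minimal sketch of the two call shapes, using a made-up spec dict (the real ones live in each model's `MODEL_SPEC['args']`):

```python
from natcap.invest import utils, validation

# Hypothetical spec mirroring the shape of the MODEL_SPEC['args'] entries
# seen in this diff: an index column plus per-column type info.
spec = {
    'index_col': 'lucode',
    'columns': {
        'lucode': {'type': 'integer'},
        'demand': {'type': 'number'},
    },
}

# Before this PR (spec passed as one positional argument):
demand_df = utils.read_csv_to_dataframe('demand_table.csv', spec)

# After this PR (spec unpacked into keyword arguments, equivalent to
# get_validated_dataframe('demand_table.csv', index_col=..., columns=...)):
demand_df = validation.get_validated_dataframe('demand_table.csv', **spec)
```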
2 changes: 1 addition & 1 deletion Makefile
@@ -10,7 +10,7 @@ GIT_TEST_DATA_REPO_REV := da013683e80ea094fbb2309197e2488c02794da8

GIT_UG_REPO := https://github.com/natcap/invest.users-guide
GIT_UG_REPO_PATH := doc/users-guide
GIT_UG_REPO_REV := 1db6aa847e07b774700ad1432172c791c4729dde
GIT_UG_REPO_REV := 6d40e3c8e56cfb09e579c58312d653086e69d6c4

ENV = "./env"
ifeq ($(OS),Windows_NT)
13 changes: 7 additions & 6 deletions src/natcap/invest/annual_water_yield.py
@@ -526,8 +526,9 @@ def execute(args):
'Checking that watersheds have entries for every `ws_id` in the '
'valuation table.')
# Open/read in valuation parameters from CSV file
valuation_df = utils.read_csv_to_dataframe(
args['valuation_table_path'], MODEL_SPEC['args']['valuation_table_path'])
valuation_df = validation.get_validated_dataframe(
args['valuation_table_path'],
**MODEL_SPEC['args']['valuation_table_path'])
watershed_vector = gdal.OpenEx(
args['watersheds_path'], gdal.OF_VECTOR)
watershed_layer = watershed_vector.GetLayer()
@@ -645,15 +646,15 @@
'lulc': pygeoprocessing.get_raster_info(clipped_lulc_path)['nodata'][0]}

# Open/read in the csv file into a dictionary and add to arguments
bio_df = utils.read_csv_to_dataframe(args['biophysical_table_path'],
MODEL_SPEC['args']['biophysical_table_path'])
bio_df = validation.get_validated_dataframe(args['biophysical_table_path'],
**MODEL_SPEC['args']['biophysical_table_path'])
bio_lucodes = set(bio_df.index.values)
bio_lucodes.add(nodata_dict['lulc'])
LOGGER.debug(f'bio_lucodes: {bio_lucodes}')

if 'demand_table_path' in args and args['demand_table_path'] != '':
demand_df = utils.read_csv_to_dataframe(
args['demand_table_path'], MODEL_SPEC['args']['demand_table_path'])
demand_df = validation.get_validated_dataframe(
args['demand_table_path'], **MODEL_SPEC['args']['demand_table_path'])
demand_reclassify_dict = dict(
[(lucode, row['demand']) for lucode, row in demand_df.iterrows()])
demand_lucodes = set(demand_df.index.values)
4 changes: 2 additions & 2 deletions src/natcap/invest/carbon.py
@@ -364,8 +364,8 @@ def execute(args):
(_INTERMEDIATE_BASE_FILES, intermediate_output_dir),
(_TMP_BASE_FILES, output_dir)], file_suffix)

carbon_pool_df = utils.read_csv_to_dataframe(
args['carbon_pools_path'], MODEL_SPEC['args']['carbon_pools_path'])
carbon_pool_df = validation.get_validated_dataframe(
args['carbon_pools_path'], **MODEL_SPEC['args']['carbon_pools_path'])

try:
n_workers = int(args['n_workers'])
24 changes: 12 additions & 12 deletions src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py
@@ -570,9 +570,9 @@ def execute(args):
task_graph, n_workers, intermediate_dir, output_dir, suffix = (
_set_up_workspace(args))

snapshots = utils.read_csv_to_dataframe(
snapshots = validation.get_validated_dataframe(
args['landcover_snapshot_csv'],
MODEL_SPEC['args']['landcover_snapshot_csv']
**MODEL_SPEC['args']['landcover_snapshot_csv']
)['raster_path'].to_dict()

# Phase 1: alignment and preparation of inputs
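Note that call sites which post-process the returned dataframe, like the `['raster_path'].to_dict()` above, are unchanged: `get_validated_dataframe` still returns a pandas DataFrame indexed by the spec's `index_col`, so selecting one column and calling `.to_dict()` yields an index-to-value mapping. A small pandas-only illustration with invented data (the real snapshots table comes from `args['landcover_snapshot_csv']`):

```python
import pandas as pd

# Stand-in for the validated snapshots table; the spec's index_col
# ('snapshot_year' here, hypothetically) becomes the DataFrame index.
df = pd.DataFrame({
    'snapshot_year': [2000, 2010],
    'raster_path': ['lulc_2000.tif', 'lulc_2010.tif'],
}).set_index('snapshot_year')

snapshots = df['raster_path'].to_dict()
# {2000: 'lulc_2000.tif', 2010: 'lulc_2010.tif'}
```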
@@ -593,9 +593,9 @@

# We're assuming that the LULC initial variables and the carbon pool
# transient table are combined into a single lookup table.
biophysical_df = utils.read_csv_to_dataframe(
biophysical_df = validation.get_validated_dataframe(
args['biophysical_table_path'],
MODEL_SPEC['args']['biophysical_table_path'])
**MODEL_SPEC['args']['biophysical_table_path'])

# LULC Classnames are critical to the transition mapping, so they must be
# unique. This check is here in ``execute`` because it's possible that
@@ -963,9 +963,9 @@ def execute(args):
prices = None
if args.get('do_economic_analysis', False): # Do if truthy
if args.get('use_price_table', False):
prices = utils.read_csv_to_dataframe(
prices = validation.get_validated_dataframe(
args['price_table_path'],
MODEL_SPEC['args']['price_table_path']
**MODEL_SPEC['args']['price_table_path']
)['price'].to_dict()
else:
inflation_rate = float(args['inflation_rate']) * 0.01
@@ -1948,8 +1948,8 @@ def _read_transition_matrix(transition_csv_path, biophysical_df):
landcover transition, and the second contains accumulation rates for
the pool for the landcover transition.
"""
table = utils.read_csv_to_dataframe(
transition_csv_path, MODEL_SPEC['args']['landcover_transitions_table']
table = validation.get_validated_dataframe(
transition_csv_path, **MODEL_SPEC['args']['landcover_transitions_table']
).reset_index()

lulc_class_to_lucode = {}
@@ -2172,9 +2172,9 @@ def validate(args, limit_to=None):

if ("landcover_snapshot_csv" not in invalid_keys and
"landcover_snapshot_csv" in sufficient_keys):
snapshots = utils.read_csv_to_dataframe(
snapshots = validation.get_validated_dataframe(
args['landcover_snapshot_csv'],
MODEL_SPEC['args']['landcover_snapshot_csv']
**MODEL_SPEC['args']['landcover_snapshot_csv']
)['raster_path'].to_dict()

for snapshot_year, snapshot_raster_path in snapshots.items():
@@ -2204,8 +2204,8 @@
transitions_spec['columns']['[LULC CODE]']['options'].keys())
# lowercase options since utils call will lowercase table values
transition_options = [x.lower() for x in transition_options]
transitions_df = utils.read_csv_to_dataframe(
args['landcover_transitions_table'], transitions_spec)
transitions_df = validation.get_validated_dataframe(
args['landcover_transitions_table'], **transitions_spec)
transitions_mask = ~transitions_df.isin(transition_options) & ~transitions_df.isna()
if transitions_mask.any(axis=None):
transition_numpy_mask = transitions_mask.values
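The `validate` logic above (truncated by the diff view) checks every cell of the transitions table against the allowed transition options, after lowercasing, and flags anything that is filled in but not a valid option. A self-contained sketch of that masking idiom, with made-up options and data:

```python
import pandas as pd

transition_options = ['accum', 'disturb', 'ncc']  # made-up allowed values

transitions_df = pd.DataFrame({
    'forest': ['accum', 'typo!', None],
    'marsh': ['ncc', 'disturb', 'accum'],
})

# True where a cell has a value that is not an allowed option;
# empty cells are excluded via ~isna().
transitions_mask = (~transitions_df.isin(transition_options)
                    & ~transitions_df.isna())

if transitions_mask.any(axis=None):
    # numpy view of the mask, useful for locating offending cells
    transition_numpy_mask = transitions_mask.values
    rows, cols = transition_numpy_mask.nonzero()
    print(f'invalid cells at rows {rows}, columns {cols}')
    # invalid cells at rows [1], columns [0]  ('typo!' in 'forest')
```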
8 changes: 4 additions & 4 deletions src/natcap/invest/coastal_blue_carbon/preprocessor.py
@@ -180,9 +180,9 @@ def execute(args):
os.path.join(args['workspace_dir'], 'taskgraph_cache'),
n_workers, reporting_interval=5.0)

snapshots_dict = utils.read_csv_to_dataframe(
snapshots_dict = validation.get_validated_dataframe(
args['landcover_snapshot_csv'],
MODEL_SPEC['args']['landcover_snapshot_csv']
**MODEL_SPEC['args']['landcover_snapshot_csv']
)['raster_path'].to_dict()

# Align the raster stack for analyzing the various transitions.
@@ -213,9 +213,9 @@
target_path_list=aligned_snapshot_paths,
task_name='Align input landcover rasters')

landcover_df = utils.read_csv_to_dataframe(
landcover_df = validation.get_validated_dataframe(
args['lulc_lookup_table_path'],
MODEL_SPEC['args']['lulc_lookup_table_path'])
**MODEL_SPEC['args']['lulc_lookup_table_path'])

target_transition_table = os.path.join(
output_dir, TRANSITION_TABLE.format(suffix=suffix))
28 changes: 15 additions & 13 deletions src/natcap/invest/coastal_vulnerability.py
@@ -461,10 +461,19 @@
"Shore points with associated habitat data"),
"index_col": "shore_id",
"columns": {
# shore_id and R_hab come first so that they get
# matched before [HABITAT], which matches everything
"shore_id": {
"type": "integer",
"about": "Shore point ID"
},
"R_hab": {
Member Author: Moving this up so that the r_hab column is matched before we get to the [HABITAT] pattern, which matches everything.

Member: Thanks for pointing this out! Could you add an inline comment about this so we don't forget in the future?
"about": (
"Overall habitat exposure rank, the "
"result of equation (15)"),
"type": "number",
"units": u.none
},
"[HABITAT]": {
"about": (
"Habitat exposure rank for the given "
@@ -477,13 +486,6 @@
"rank defined in the Habitats Table input."),
"type": "number",
"units": u.none
},
"R_hab": {
"about": (
"Overall habitat exposure rank, the "
"result of equation (15)"),
"type": "number",
"units": u.none
}
}
}
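The ordering matters because column specs are matched against CSV headers in dict order, and a bracketed key like `[HABITAT]` acts as a wildcard; if it came first, it would capture `shore_id` and `R_hab` too. A sketch of that first-match-wins idea — an assumption about the spirit of the matching described in the comments above, not the validation module's actual code:

```python
# Spec keys in dict order; bracketed keys like '[HABITAT]' are wildcards.
column_specs = ['shore_id', 'R_hab', '[HABITAT]']

def match_column(header):
    """Return the first spec key that matches this CSV header."""
    for key in column_specs:
        if key.startswith('['):              # wildcard pattern
            return key                       # '[HABITAT]' matches everything
        if key.lower() == header.lower():    # case-insensitive exact match
            return key
    return None

print(match_column('r_hab'))   # 'R_hab' -- found before the wildcard
print(match_column('kelp'))    # '[HABITAT]'
```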
@@ -2302,8 +2304,8 @@ def _schedule_habitat_tasks(
list of pickle file path strings

"""
habitat_dataframe = utils.read_csv_to_dataframe(
habitat_table_path, MODEL_SPEC['args']['habitat_table_path']
habitat_dataframe = validation.get_validated_dataframe(
habitat_table_path, **MODEL_SPEC['args']['habitat_table_path']
).rename(columns={'protection distance (m)': 'distance'})

habitat_task_list = []
@@ -2831,8 +2833,8 @@ def assemble_results_and_calculate_exposure(
with open(pickle_path, 'rb') as file:
final_values_dict[var_name] = pickle.load(file)

habitat_df = utils.read_csv_to_dataframe(
habitat_protection_path, MODEL_SPEC['outputs']['intermediate'][
habitat_df = validation.get_validated_dataframe(
habitat_protection_path, **MODEL_SPEC['outputs']['intermediate'][
'contents']['habitats']['contents']['habitat_protection.csv']
).rename(columns={'r_hab': 'R_hab'})
output_layer.StartTransaction()
@@ -3459,8 +3461,8 @@ def _validate_habitat_table_paths(habitat_table_path):
Raises:
ValueError if any vector in the ``path`` column cannot be opened.
"""
habitat_dataframe = utils.read_csv_to_dataframe(
habitat_table_path, MODEL_SPEC['args']['habitat_table_path'])
habitat_dataframe = validation.get_validated_dataframe(
habitat_table_path, **MODEL_SPEC['args']['habitat_table_path'])
bad_paths = []
for habitat_row in habitat_dataframe.itertuples():
try:
12 changes: 6 additions & 6 deletions src/natcap/invest/crop_production_percentile.py
@@ -468,9 +468,9 @@ def execute(args):
None.

"""
crop_to_landcover_df = utils.read_csv_to_dataframe(
crop_to_landcover_df = validation.get_validated_dataframe(
args['landcover_to_crop_table_path'],
MODEL_SPEC['args']['landcover_to_crop_table_path'])
**MODEL_SPEC['args']['landcover_to_crop_table_path'])
bad_crop_name_list = []
for crop_name in crop_to_landcover_df.index:
crop_climate_bin_raster_path = os.path.join(
@@ -549,9 +549,9 @@
climate_percentile_yield_table_path = os.path.join(
args['model_data_path'],
_CLIMATE_PERCENTILE_TABLE_PATTERN % crop_name)
crop_climate_percentile_df = utils.read_csv_to_dataframe(
crop_climate_percentile_df = validation.get_validated_dataframe(
climate_percentile_yield_table_path,
MODEL_SPEC['args']['model_data_path']['contents'][
**MODEL_SPEC['args']['model_data_path']['contents'][
'climate_percentile_yield_tables']['contents'][
'[CROP]_percentile_yield_table.csv'])
yield_percentile_headers = [
@@ -707,9 +707,9 @@ def execute(args):

# both 'crop_nutrient.csv' and 'crop' are known data/header values for
# this model data.
nutrient_df = utils.read_csv_to_dataframe(
nutrient_df = validation.get_validated_dataframe(
os.path.join(args['model_data_path'], 'crop_nutrient.csv'),
MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
**MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
result_table_path = os.path.join(
output_dir, 'result_table%s.csv' % file_suffix)

16 changes: 8 additions & 8 deletions src/natcap/invest/crop_production_regression.py
@@ -495,13 +495,13 @@ def execute(args):

LOGGER.info(
"Checking if the landcover raster is missing lucodes")
crop_to_landcover_df = utils.read_csv_to_dataframe(
crop_to_landcover_df = validation.get_validated_dataframe(
args['landcover_to_crop_table_path'],
MODEL_SPEC['args']['landcover_to_crop_table_path'])
**MODEL_SPEC['args']['landcover_to_crop_table_path'])

crop_to_fertilization_rate_df = utils.read_csv_to_dataframe(
crop_to_fertilization_rate_df = validation.get_validated_dataframe(
args['fertilization_rate_table_path'],
MODEL_SPEC['args']['fertilization_rate_table_path'])
**MODEL_SPEC['args']['fertilization_rate_table_path'])

crop_lucodes = list(crop_to_landcover_df[_EXPECTED_LUCODE_TABLE_HEADER])

@@ -576,10 +576,10 @@
task_name='crop_climate_bin')
dependent_task_list.append(crop_climate_bin_task)

crop_regression_df = utils.read_csv_to_dataframe(
crop_regression_df = validation.get_validated_dataframe(
os.path.join(args['model_data_path'],
_REGRESSION_TABLE_PATTERN % crop_name),
MODEL_SPEC['args']['model_data_path']['contents'][
**MODEL_SPEC['args']['model_data_path']['contents'][
'climate_regression_yield_tables']['contents'][
'[CROP]_regression_yield_table.csv'])
for _, row in crop_regression_df.iterrows():
@@ -803,9 +803,9 @@ def execute(args):

# both 'crop_nutrient.csv' and 'crop' are known data/header values for
# this model data.
nutrient_df = utils.read_csv_to_dataframe(
nutrient_df = validation.get_validated_dataframe(
os.path.join(args['model_data_path'], 'crop_nutrient.csv'),
MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
**MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])

LOGGER.info("Generating report table")
crop_names = list(crop_to_landcover_df.index)
5 changes: 3 additions & 2 deletions src/natcap/invest/datastack.py
@@ -35,6 +35,7 @@
from osgeo import gdal

from . import utils
from . import validation

try:
from . import __version__
@@ -333,8 +334,8 @@ def build_datastack_archive(args, model_name, datastack_path):
contained_files_dir = os.path.join(
data_dir, f'{key}_csv_data')

dataframe = utils.read_csv_to_dataframe(
source_path, args_spec[key])
dataframe = validation.get_validated_dataframe(
source_path, **args_spec[key])
csv_source_dir = os.path.abspath(os.path.dirname(source_path))
for spatial_column_name in spatial_columns:
# Iterate through the spatial columns, identify the set of
12 changes: 6 additions & 6 deletions src/natcap/invest/forest_carbon_edge_effect.py
@@ -413,9 +413,9 @@ def execute(args):
# Map non-forest landcover codes to carbon biomasses
LOGGER.info('Calculating direct mapped carbon stocks')
carbon_maps = []
biophysical_df = utils.read_csv_to_dataframe(
biophysical_df = validation.get_validated_dataframe(
args['biophysical_table_path'],
MODEL_SPEC['args']['biophysical_table_path'])
**MODEL_SPEC['args']['biophysical_table_path'])
pool_list = [('c_above', True)]
if args['pools_to_calculate'] == 'all':
pool_list.extend([
@@ -624,8 +624,8 @@ def _calculate_lulc_carbon_map(

"""
# classify forest pixels from lulc
biophysical_df = utils.read_csv_to_dataframe(
biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path'])
biophysical_df = validation.get_validated_dataframe(
biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path'])

lucode_to_per_cell_carbon = {}
cell_size = pygeoprocessing.get_raster_info(
@@ -688,8 +688,8 @@ def _map_distance_from_tropical_forest_edge(

"""
# Build a list of forest lucodes
biophysical_df = utils.read_csv_to_dataframe(
biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path'])
biophysical_df = validation.get_validated_dataframe(
biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path'])
forest_codes = biophysical_df[biophysical_df['is_tropical_forest']].index.values

# Make a raster where 1 is non-forest landcover types and 0 is forest
16 changes: 8 additions & 8 deletions src/natcap/invest/habitat_quality.py
@@ -372,12 +372,12 @@ def execute(args):

LOGGER.info("Checking Threat and Sensitivity tables for compliance")
# Get CSVs as dictionaries and ensure the key is a string for threats.
threat_df = utils.read_csv_to_dataframe(
args['threats_table_path'], MODEL_SPEC['args']['threats_table_path']
threat_df = validation.get_validated_dataframe(
args['threats_table_path'], **MODEL_SPEC['args']['threats_table_path']
).fillna('')
sensitivity_df = utils.read_csv_to_dataframe(
sensitivity_df = validation.get_validated_dataframe(
args['sensitivity_table_path'],
MODEL_SPEC['args']['sensitivity_table_path'])
**MODEL_SPEC['args']['sensitivity_table_path'])

half_saturation_constant = float(args['half_saturation_constant'])

@@ -1086,12 +1086,12 @@ def validate(args, limit_to=None):
"sensitivity_table_path" not in invalid_keys and
"threat_raster_folder" not in invalid_keys):
# Get CSVs as dictionaries and ensure the key is a string for threats.
threat_df = utils.read_csv_to_dataframe(
threat_df = validation.get_validated_dataframe(
args['threats_table_path'],
MODEL_SPEC['args']['threats_table_path']).fillna('')
sensitivity_df = utils.read_csv_to_dataframe(
**MODEL_SPEC['args']['threats_table_path']).fillna('')
sensitivity_df = validation.get_validated_dataframe(
args['sensitivity_table_path'],
MODEL_SPEC['args']['sensitivity_table_path'])
**MODEL_SPEC['args']['sensitivity_table_path'])

# check that the threat names in the threats table match with the
# threats columns in the sensitivity table.
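The check described in the comment above compares the threat names (the threats table's index) against the per-threat columns of the sensitivity table. A minimal pandas sketch of that cross-check, with invented threat and column names:

```python
import pandas as pd

# index = threat names from the threats table
threat_df = pd.DataFrame(index=['roads', 'agriculture'])

# columns include one entry per threat in the sensitivity table
sensitivity_df = pd.DataFrame(columns=['lulc', 'habitat', 'roads', 'urban'])

missing = [t for t in threat_df.index if t not in sensitivity_df.columns]
if missing:
    print(f'threats missing from sensitivity table: {missing}')
    # threats missing from sensitivity table: ['agriculture']
```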