diff --git a/HISTORY.rst b/HISTORY.rst index 3608338b4..4554c375c 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -49,6 +49,9 @@ Unreleased Changes (`#1374 `_) * Datastack archives will now be correctly extracted (`#1308 `_) + * Validation of tables has been improved and standardized, which should + result in more readable validation errors. + (`#1379 `_) * Updated to ``pygeoprocessing`` 2.4.2. This includes an update to ``pygeoprocessing.zonal_statistics``, which is now more correct on certain edge cases. Aggregated model results may change slightly. diff --git a/Makefile b/Makefile index 39514077a..95df9b5b3 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ GIT_TEST_DATA_REPO_REV := da013683e80ea094fbb2309197e2488c02794da8 GIT_UG_REPO := https://github.com/natcap/invest.users-guide GIT_UG_REPO_PATH := doc/users-guide -GIT_UG_REPO_REV := 1db6aa847e07b774700ad1432172c791c4729dde +GIT_UG_REPO_REV := 6d40e3c8e56cfb09e579c58312d653086e69d6c4 ENV = "./env" ifeq ($(OS),Windows_NT) diff --git a/src/natcap/invest/annual_water_yield.py b/src/natcap/invest/annual_water_yield.py index 314ddb815..b0ebf1e9d 100644 --- a/src/natcap/invest/annual_water_yield.py +++ b/src/natcap/invest/annual_water_yield.py @@ -526,8 +526,9 @@ def execute(args): 'Checking that watersheds have entries for every `ws_id` in the ' 'valuation table.') # Open/read in valuation parameters from CSV file - valuation_df = utils.read_csv_to_dataframe( - args['valuation_table_path'], MODEL_SPEC['args']['valuation_table_path']) + valuation_df = validation.get_validated_dataframe( + args['valuation_table_path'], + **MODEL_SPEC['args']['valuation_table_path']) watershed_vector = gdal.OpenEx( args['watersheds_path'], gdal.OF_VECTOR) watershed_layer = watershed_vector.GetLayer() @@ -645,15 +646,15 @@ def execute(args): 'lulc': pygeoprocessing.get_raster_info(clipped_lulc_path)['nodata'][0]} # Open/read in the csv file into a dictionary and add to arguments - bio_df = utils.read_csv_to_dataframe(args['biophysical_table_path'], - MODEL_SPEC['args']['biophysical_table_path']) + bio_df = validation.get_validated_dataframe(args['biophysical_table_path'], + **MODEL_SPEC['args']['biophysical_table_path']) bio_lucodes = set(bio_df.index.values) bio_lucodes.add(nodata_dict['lulc']) LOGGER.debug(f'bio_lucodes: {bio_lucodes}') if 'demand_table_path' in args and args['demand_table_path'] != '': - demand_df = utils.read_csv_to_dataframe( - args['demand_table_path'], MODEL_SPEC['args']['demand_table_path']) + demand_df = validation.get_validated_dataframe( + args['demand_table_path'], **MODEL_SPEC['args']['demand_table_path']) demand_reclassify_dict = dict( [(lucode, row['demand']) for lucode, row in demand_df.iterrows()]) demand_lucodes = set(demand_df.index.values) diff --git a/src/natcap/invest/carbon.py b/src/natcap/invest/carbon.py index 0ea0025a7..9752426c8 100644 --- a/src/natcap/invest/carbon.py +++ b/src/natcap/invest/carbon.py @@ -364,8 +364,8 @@ def execute(args): (_INTERMEDIATE_BASE_FILES, intermediate_output_dir), (_TMP_BASE_FILES, output_dir)], file_suffix) - carbon_pool_df = utils.read_csv_to_dataframe( - args['carbon_pools_path'], MODEL_SPEC['args']['carbon_pools_path']) + carbon_pool_df = validation.get_validated_dataframe( + args['carbon_pools_path'], **MODEL_SPEC['args']['carbon_pools_path']) try: n_workers = int(args['n_workers']) diff --git a/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py b/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py index e610aef97..f1f7fc67a 100644 --- 
a/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py +++ b/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py @@ -570,9 +570,9 @@ def execute(args): task_graph, n_workers, intermediate_dir, output_dir, suffix = ( _set_up_workspace(args)) - snapshots = utils.read_csv_to_dataframe( + snapshots = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - MODEL_SPEC['args']['landcover_snapshot_csv'] + **MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() # Phase 1: alignment and preparation of inputs @@ -593,9 +593,9 @@ def execute(args): # We're assuming that the LULC initial variables and the carbon pool # transient table are combined into a single lookup table. - biophysical_df = utils.read_csv_to_dataframe( + biophysical_df = validation.get_validated_dataframe( args['biophysical_table_path'], - MODEL_SPEC['args']['biophysical_table_path']) + **MODEL_SPEC['args']['biophysical_table_path']) # LULC Classnames are critical to the transition mapping, so they must be # unique. This check is here in ``execute`` because it's possible that @@ -963,9 +963,9 @@ def execute(args): prices = None if args.get('do_economic_analysis', False): # Do if truthy if args.get('use_price_table', False): - prices = utils.read_csv_to_dataframe( + prices = validation.get_validated_dataframe( args['price_table_path'], - MODEL_SPEC['args']['price_table_path'] + **MODEL_SPEC['args']['price_table_path'] )['price'].to_dict() else: inflation_rate = float(args['inflation_rate']) * 0.01 @@ -1948,8 +1948,8 @@ def _read_transition_matrix(transition_csv_path, biophysical_df): landcover transition, and the second contains accumulation rates for the pool for the landcover transition. """ - table = utils.read_csv_to_dataframe( - transition_csv_path, MODEL_SPEC['args']['landcover_transitions_table'] + table = validation.get_validated_dataframe( + transition_csv_path, **MODEL_SPEC['args']['landcover_transitions_table'] ).reset_index() lulc_class_to_lucode = {} @@ -2172,9 +2172,9 @@ def validate(args, limit_to=None): if ("landcover_snapshot_csv" not in invalid_keys and "landcover_snapshot_csv" in sufficient_keys): - snapshots = utils.read_csv_to_dataframe( + snapshots = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - MODEL_SPEC['args']['landcover_snapshot_csv'] + **MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() for snapshot_year, snapshot_raster_path in snapshots.items(): @@ -2204,8 +2204,8 @@ def validate(args, limit_to=None): transitions_spec['columns']['[LULC CODE]']['options'].keys()) # lowercase options since utils call will lowercase table values transition_options = [x.lower() for x in transition_options] - transitions_df = utils.read_csv_to_dataframe( - args['landcover_transitions_table'], transitions_spec) + transitions_df = validation.get_validated_dataframe( + args['landcover_transitions_table'], **transitions_spec) transitions_mask = ~transitions_df.isin(transition_options) & ~transitions_df.isna() if transitions_mask.any(axis=None): transition_numpy_mask = transitions_mask.values diff --git a/src/natcap/invest/coastal_blue_carbon/preprocessor.py b/src/natcap/invest/coastal_blue_carbon/preprocessor.py index 9cef76c8f..03fda7714 100644 --- a/src/natcap/invest/coastal_blue_carbon/preprocessor.py +++ b/src/natcap/invest/coastal_blue_carbon/preprocessor.py @@ -180,9 +180,9 @@ def execute(args): os.path.join(args['workspace_dir'], 'taskgraph_cache'), n_workers, reporting_interval=5.0) - snapshots_dict = 
utils.read_csv_to_dataframe( + snapshots_dict = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - MODEL_SPEC['args']['landcover_snapshot_csv'] + **MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() # Align the raster stack for analyzing the various transitions. @@ -213,9 +213,9 @@ def execute(args): target_path_list=aligned_snapshot_paths, task_name='Align input landcover rasters') - landcover_df = utils.read_csv_to_dataframe( + landcover_df = validation.get_validated_dataframe( args['lulc_lookup_table_path'], - MODEL_SPEC['args']['lulc_lookup_table_path']) + **MODEL_SPEC['args']['lulc_lookup_table_path']) target_transition_table = os.path.join( output_dir, TRANSITION_TABLE.format(suffix=suffix)) diff --git a/src/natcap/invest/coastal_vulnerability.py b/src/natcap/invest/coastal_vulnerability.py index 3f591231c..ced65f9d7 100644 --- a/src/natcap/invest/coastal_vulnerability.py +++ b/src/natcap/invest/coastal_vulnerability.py @@ -461,10 +461,19 @@ "Shore points with associated habitat data"), "index_col": "shore_id", "columns": { + # shore_id and R_hab come first so that they get + # matched before [HABITAT], which matches everything "shore_id": { "type": "integer", "about": "Shore point ID" }, + "R_hab": { + "about": ( + "Overall habitat exposure rank, the " + "result of equation (15)"), + "type": "number", + "units": u.none + }, "[HABITAT]": { "about": ( "Habitat exposure rank for the given " @@ -477,13 +486,6 @@ "rank defined in the Habitats Table input."), "type": "number", "units": u.none - }, - "R_hab": { - "about": ( - "Overall habitat exposure rank, the " - "result of equation (15)"), - "type": "number", - "units": u.none } } } @@ -2302,8 +2304,8 @@ def _schedule_habitat_tasks( list of pickle file path strings """ - habitat_dataframe = utils.read_csv_to_dataframe( - habitat_table_path, MODEL_SPEC['args']['habitat_table_path'] + habitat_dataframe = validation.get_validated_dataframe( + habitat_table_path, **MODEL_SPEC['args']['habitat_table_path'] ).rename(columns={'protection distance (m)': 'distance'}) habitat_task_list = [] @@ -2831,8 +2833,8 @@ def assemble_results_and_calculate_exposure( with open(pickle_path, 'rb') as file: final_values_dict[var_name] = pickle.load(file) - habitat_df = utils.read_csv_to_dataframe( - habitat_protection_path, MODEL_SPEC['outputs']['intermediate'][ + habitat_df = validation.get_validated_dataframe( + habitat_protection_path, **MODEL_SPEC['outputs']['intermediate'][ 'contents']['habitats']['contents']['habitat_protection.csv'] ).rename(columns={'r_hab': 'R_hab'}) output_layer.StartTransaction() @@ -3459,8 +3461,8 @@ def _validate_habitat_table_paths(habitat_table_path): Raises: ValueError if any vector in the ``path`` column cannot be opened. """ - habitat_dataframe = utils.read_csv_to_dataframe( - habitat_table_path, MODEL_SPEC['args']['habitat_table_path']) + habitat_dataframe = validation.get_validated_dataframe( + habitat_table_path, **MODEL_SPEC['args']['habitat_table_path']) bad_paths = [] for habitat_row in habitat_dataframe.itertuples(): try: diff --git a/src/natcap/invest/crop_production_percentile.py b/src/natcap/invest/crop_production_percentile.py index c91fb5ae9..a2ad8e772 100644 --- a/src/natcap/invest/crop_production_percentile.py +++ b/src/natcap/invest/crop_production_percentile.py @@ -468,9 +468,9 @@ def execute(args): None. 
""" - crop_to_landcover_df = utils.read_csv_to_dataframe( + crop_to_landcover_df = validation.get_validated_dataframe( args['landcover_to_crop_table_path'], - MODEL_SPEC['args']['landcover_to_crop_table_path']) + **MODEL_SPEC['args']['landcover_to_crop_table_path']) bad_crop_name_list = [] for crop_name in crop_to_landcover_df.index: crop_climate_bin_raster_path = os.path.join( @@ -549,9 +549,9 @@ def execute(args): climate_percentile_yield_table_path = os.path.join( args['model_data_path'], _CLIMATE_PERCENTILE_TABLE_PATTERN % crop_name) - crop_climate_percentile_df = utils.read_csv_to_dataframe( + crop_climate_percentile_df = validation.get_validated_dataframe( climate_percentile_yield_table_path, - MODEL_SPEC['args']['model_data_path']['contents'][ + **MODEL_SPEC['args']['model_data_path']['contents'][ 'climate_percentile_yield_tables']['contents'][ '[CROP]_percentile_yield_table.csv']) yield_percentile_headers = [ @@ -707,9 +707,9 @@ def execute(args): # both 'crop_nutrient.csv' and 'crop' are known data/header values for # this model data. - nutrient_df = utils.read_csv_to_dataframe( + nutrient_df = validation.get_validated_dataframe( os.path.join(args['model_data_path'], 'crop_nutrient.csv'), - MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv']) + **MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv']) result_table_path = os.path.join( output_dir, 'result_table%s.csv' % file_suffix) diff --git a/src/natcap/invest/crop_production_regression.py b/src/natcap/invest/crop_production_regression.py index da29ec52f..4b10ffdd3 100644 --- a/src/natcap/invest/crop_production_regression.py +++ b/src/natcap/invest/crop_production_regression.py @@ -495,13 +495,13 @@ def execute(args): LOGGER.info( "Checking if the landcover raster is missing lucodes") - crop_to_landcover_df = utils.read_csv_to_dataframe( + crop_to_landcover_df = validation.get_validated_dataframe( args['landcover_to_crop_table_path'], - MODEL_SPEC['args']['landcover_to_crop_table_path']) + **MODEL_SPEC['args']['landcover_to_crop_table_path']) - crop_to_fertilization_rate_df = utils.read_csv_to_dataframe( + crop_to_fertilization_rate_df = validation.get_validated_dataframe( args['fertilization_rate_table_path'], - MODEL_SPEC['args']['fertilization_rate_table_path']) + **MODEL_SPEC['args']['fertilization_rate_table_path']) crop_lucodes = list(crop_to_landcover_df[_EXPECTED_LUCODE_TABLE_HEADER]) @@ -576,10 +576,10 @@ def execute(args): task_name='crop_climate_bin') dependent_task_list.append(crop_climate_bin_task) - crop_regression_df = utils.read_csv_to_dataframe( + crop_regression_df = validation.get_validated_dataframe( os.path.join(args['model_data_path'], _REGRESSION_TABLE_PATTERN % crop_name), - MODEL_SPEC['args']['model_data_path']['contents'][ + **MODEL_SPEC['args']['model_data_path']['contents'][ 'climate_regression_yield_tables']['contents'][ '[CROP]_regression_yield_table.csv']) for _, row in crop_regression_df.iterrows(): @@ -803,9 +803,9 @@ def execute(args): # both 'crop_nutrient.csv' and 'crop' are known data/header values for # this model data. 
- nutrient_df = utils.read_csv_to_dataframe( + nutrient_df = validation.get_validated_dataframe( os.path.join(args['model_data_path'], 'crop_nutrient.csv'), - MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv']) + **MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv']) LOGGER.info("Generating report table") crop_names = list(crop_to_landcover_df.index) diff --git a/src/natcap/invest/datastack.py b/src/natcap/invest/datastack.py index 4a9cd52d4..461a12d60 100644 --- a/src/natcap/invest/datastack.py +++ b/src/natcap/invest/datastack.py @@ -35,6 +35,7 @@ from osgeo import gdal from . import utils +from . import validation try: from . import __version__ @@ -333,8 +334,8 @@ def build_datastack_archive(args, model_name, datastack_path): contained_files_dir = os.path.join( data_dir, f'{key}_csv_data') - dataframe = utils.read_csv_to_dataframe( - source_path, args_spec[key]) + dataframe = validation.get_validated_dataframe( + source_path, **args_spec[key]) csv_source_dir = os.path.abspath(os.path.dirname(source_path)) for spatial_column_name in spatial_columns: # Iterate through the spatial columns, identify the set of diff --git a/src/natcap/invest/forest_carbon_edge_effect.py b/src/natcap/invest/forest_carbon_edge_effect.py index ee73f3ac4..bb0fb399d 100644 --- a/src/natcap/invest/forest_carbon_edge_effect.py +++ b/src/natcap/invest/forest_carbon_edge_effect.py @@ -413,9 +413,9 @@ def execute(args): # Map non-forest landcover codes to carbon biomasses LOGGER.info('Calculating direct mapped carbon stocks') carbon_maps = [] - biophysical_df = utils.read_csv_to_dataframe( + biophysical_df = validation.get_validated_dataframe( args['biophysical_table_path'], - MODEL_SPEC['args']['biophysical_table_path']) + **MODEL_SPEC['args']['biophysical_table_path']) pool_list = [('c_above', True)] if args['pools_to_calculate'] == 'all': pool_list.extend([ @@ -624,8 +624,8 @@ def _calculate_lulc_carbon_map( """ # classify forest pixels from lulc - biophysical_df = utils.read_csv_to_dataframe( - biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path']) + biophysical_df = validation.get_validated_dataframe( + biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path']) lucode_to_per_cell_carbon = {} cell_size = pygeoprocessing.get_raster_info( @@ -688,8 +688,8 @@ def _map_distance_from_tropical_forest_edge( """ # Build a list of forest lucodes - biophysical_df = utils.read_csv_to_dataframe( - biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path']) + biophysical_df = validation.get_validated_dataframe( + biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path']) forest_codes = biophysical_df[biophysical_df['is_tropical_forest']].index.values # Make a raster where 1 is non-forest landcover types and 0 is forest diff --git a/src/natcap/invest/habitat_quality.py b/src/natcap/invest/habitat_quality.py index 3f8de3f7e..993edeb9a 100644 --- a/src/natcap/invest/habitat_quality.py +++ b/src/natcap/invest/habitat_quality.py @@ -372,12 +372,12 @@ def execute(args): LOGGER.info("Checking Threat and Sensitivity tables for compliance") # Get CSVs as dictionaries and ensure the key is a string for threats. 
- threat_df = utils.read_csv_to_dataframe( - args['threats_table_path'], MODEL_SPEC['args']['threats_table_path'] + threat_df = validation.get_validated_dataframe( + args['threats_table_path'], **MODEL_SPEC['args']['threats_table_path'] ).fillna('') - sensitivity_df = utils.read_csv_to_dataframe( + sensitivity_df = validation.get_validated_dataframe( args['sensitivity_table_path'], - MODEL_SPEC['args']['sensitivity_table_path']) + **MODEL_SPEC['args']['sensitivity_table_path']) half_saturation_constant = float(args['half_saturation_constant']) @@ -1086,12 +1086,12 @@ def validate(args, limit_to=None): "sensitivity_table_path" not in invalid_keys and "threat_raster_folder" not in invalid_keys): # Get CSVs as dictionaries and ensure the key is a string for threats. - threat_df = utils.read_csv_to_dataframe( + threat_df = validation.get_validated_dataframe( args['threats_table_path'], - MODEL_SPEC['args']['threats_table_path']).fillna('') - sensitivity_df = utils.read_csv_to_dataframe( + **MODEL_SPEC['args']['threats_table_path']).fillna('') + sensitivity_df = validation.get_validated_dataframe( args['sensitivity_table_path'], - MODEL_SPEC['args']['sensitivity_table_path']) + **MODEL_SPEC['args']['sensitivity_table_path']) # check that the threat names in the threats table match with the # threats columns in the sensitivity table. diff --git a/src/natcap/invest/hra.py b/src/natcap/invest/hra.py index 99a8f035c..1b964f499 100644 --- a/src/natcap/invest/hra.py +++ b/src/natcap/invest/hra.py @@ -551,8 +551,7 @@ def execute(args): f" Missing from criteria table: {missing_from_criteria_table}" ) - criteria_df = pandas.read_csv(composite_criteria_table_path, - index_col=False) + criteria_df = utils.read_csv_to_dataframe(composite_criteria_table_path) # Because criteria may be spatial, we need to prepare those spatial inputs # as well. spatial_criteria_attrs = {} @@ -1786,8 +1785,8 @@ def _parse_info_table(info_table_path): info_table_path = os.path.abspath(info_table_path) try: - table = utils.read_csv_to_dataframe( - info_table_path, MODEL_SPEC['args']['info_table_path']) + table = validation.get_validated_dataframe( + info_table_path, **MODEL_SPEC['args']['info_table_path']) except ValueError as err: if 'Index has duplicate keys' in str(err): raise ValueError("Habitat and stressor names may not overlap.") @@ -1829,8 +1828,8 @@ def _parse_criteria_table(criteria_table_path, target_composite_csv_path): """ # This function requires that the table is read as a numpy array, so it's # easiest to read the table directly. - table = pandas.read_csv(criteria_table_path, header=None, sep=None, - engine='python').to_numpy() + table = utils.read_csv_to_dataframe( + criteria_table_path, header=None).to_numpy() # clean up any leading or trailing whitespace. for row_num in range(table.shape[0]): @@ -2379,8 +2378,8 @@ def _override_datastack_archive_criteria_table_path( the data dir. 
""" args_key = 'criteria_table_path' - criteria_table_array = pandas.read_csv( - criteria_table_path, header=None, sep=None, engine='python').to_numpy() + criteria_table_array = utils.read_csv_to_dataframe( + criteria_table_path, header=None).to_numpy() contained_data_dir = os.path.join(data_dir, f'{args_key}_data') known_rating_cols = set() diff --git a/src/natcap/invest/ndr/ndr.py b/src/natcap/invest/ndr/ndr.py index 9d7370631..4958a6252 100644 --- a/src/natcap/invest/ndr/ndr.py +++ b/src/natcap/invest/ndr/ndr.py @@ -1,4 +1,5 @@ """InVEST Nutrient Delivery Ratio (NDR) module.""" +import copy import itertools import logging import os @@ -574,9 +575,9 @@ def execute(args): if args['calc_' + nutrient_id]: nutrients_to_process.append(nutrient_id) - biophysical_df = utils.read_csv_to_dataframe( + biophysical_df = validation.get_validated_dataframe( args['biophysical_table_path'], - MODEL_SPEC['args']['biophysical_table_path']) + **MODEL_SPEC['args']['biophysical_table_path']) # these are used for aggregation in the last step field_pickle_map = {} @@ -1161,39 +1162,30 @@ def validate(args, limit_to=None): be an empty list if validation succeeds. """ + spec_copy = copy.deepcopy(MODEL_SPEC['args']) + # Check required fields given the state of ``calc_n`` and ``calc_p`` + nutrients_selected = [] + for nutrient_letter in ('n', 'p'): + if f'calc_{nutrient_letter}' in args and args[f'calc_{nutrient_letter}']: + nutrients_selected.append(nutrient_letter) + + for param in ['load', 'eff', 'crit_len']: + for nutrient in nutrients_selected: + spec_copy['biophysical_table_path']['columns'][f'{param}_{nutrient}'] = ( + spec_copy['biophysical_table_path']['columns'][f'{param}_[NUTRIENT]']) + spec_copy['biophysical_table_path']['columns'][f'{param}_{nutrient}']['required'] = True + spec_copy['biophysical_table_path']['columns'].pop(f'{param}_[NUTRIENT]') + + if 'n' in nutrients_selected: + spec_copy['biophysical_table_path']['columns']['proportion_subsurface_n'][ + 'required'] = True + validation_warnings = validation.validate( - args, MODEL_SPEC['args'], MODEL_SPEC['args_with_spatial_overlap']) - - invalid_keys = validation.get_invalid_keys(validation_warnings) - - LOGGER.debug('Starting logging for biophysical table') - if 'biophysical_table_path' not in invalid_keys: - # Check required fields given the state of ``calc_n`` and ``calc_p`` - nutrient_required_fields = ['lucode'] - nutrients_selected = set() - for nutrient_letter in ('n', 'p'): - if nutrient_letter == 'n': - nutrient_required_fields += ['proportion_subsurface_n'] - do_nutrient_key = f'calc_{nutrient_letter}' - if do_nutrient_key in args and args[do_nutrient_key]: - nutrients_selected.add(do_nutrient_key) - nutrient_required_fields += [ - f'load_{nutrient_letter}', - f'eff_{nutrient_letter}', - f'crit_len_{nutrient_letter}' - ] - if not nutrients_selected: - validation_warnings.append( - (['calc_n', 'calc_p'], MISSING_NUTRIENT_MSG)) - - # Check that these nutrient-specific keys are in the table - # validate has already checked all the other keys - error_msg = validation.check_csv( - args['biophysical_table_path'], - columns={key: '' for key in nutrient_required_fields}) - if error_msg: - validation_warnings.append( - (['biophysical_table_path'], error_msg)) + args, spec_copy, MODEL_SPEC['args_with_spatial_overlap']) + + if not nutrients_selected: + validation_warnings.append( + (['calc_n', 'calc_p'], MISSING_NUTRIENT_MSG)) return validation_warnings diff --git a/src/natcap/invest/pollination.py b/src/natcap/invest/pollination.py index 
c5e5fc369..f6fe6d221 100644 --- a/src/natcap/invest/pollination.py +++ b/src/natcap/invest/pollination.py @@ -1205,8 +1205,8 @@ def _parse_scenario_variables(args): else: farm_vector_path = None - guild_df = utils.read_csv_to_dataframe( - guild_table_path, MODEL_SPEC['args']['guild_table_path']) + guild_df = validation.get_validated_dataframe( + guild_table_path, **MODEL_SPEC['args']['guild_table_path']) LOGGER.info('Checking to make sure guild table has all expected headers') for header in _EXPECTED_GUILD_HEADERS: @@ -1217,9 +1217,9 @@ def _parse_scenario_variables(args): f"'{header}' but was unable to find one. Here are all the " f"headers from {guild_table_path}: {', '.join(guild_df.columns)}") - landcover_biophysical_df = utils.read_csv_to_dataframe( + landcover_biophysical_df = validation.get_validated_dataframe( landcover_biophysical_table_path, - MODEL_SPEC['args']['landcover_biophysical_table_path']) + **MODEL_SPEC['args']['landcover_biophysical_table_path']) biophysical_table_headers = landcover_biophysical_df.columns for header in _EXPECTED_BIOPHYSICAL_HEADERS: matches = re.findall(header, " ".join(biophysical_table_headers)) diff --git a/src/natcap/invest/recreation/recmodel_client.py b/src/natcap/invest/recreation/recmodel_client.py index f212ecb82..c695eef54 100644 --- a/src/natcap/invest/recreation/recmodel_client.py +++ b/src/natcap/invest/recreation/recmodel_client.py @@ -842,8 +842,8 @@ def _schedule_predictor_data_processing( 'line_intersect_length': _line_intersect_length, } - predictor_df = utils.read_csv_to_dataframe( - predictor_table_path, MODEL_SPEC['args']['predictor_table_path']) + predictor_df = validation.get_validated_dataframe( + predictor_table_path, **MODEL_SPEC['args']['predictor_table_path']) predictor_task_list = [] predictor_json_list = [] # tracks predictor files to add to shp @@ -1530,8 +1530,8 @@ def _validate_same_id_lengths(table_path): string message if IDs are too long """ - predictor_df = utils.read_csv_to_dataframe( - table_path, MODEL_SPEC['args']['predictor_table_path']) + predictor_df = validation.get_validated_dataframe( + table_path, **MODEL_SPEC['args']['predictor_table_path']) too_long = set() for p_id in predictor_df.index: if len(p_id) > 10: @@ -1559,12 +1559,12 @@ def _validate_same_ids_and_types( string message if any of the fields in 'id' and 'type' don't match between tables. 
""" - predictor_df = utils.read_csv_to_dataframe( - predictor_table_path, MODEL_SPEC['args']['predictor_table_path']) + predictor_df = validation.get_validated_dataframe( + predictor_table_path, **MODEL_SPEC['args']['predictor_table_path']) - scenario_predictor_df = utils.read_csv_to_dataframe( + scenario_predictor_df = validation.get_validated_dataframe( scenario_predictor_table_path, - MODEL_SPEC['args']['scenario_predictor_table_path']) + **MODEL_SPEC['args']['scenario_predictor_table_path']) predictor_pairs = set([ (p_id, row['type']) for p_id, row in predictor_df.iterrows()]) @@ -1589,8 +1589,8 @@ def _validate_same_projection(base_vector_path, table_path): """ # This will load the table as a list of paths which we can iterate through # without bothering the rest of the table structure - data_paths = utils.read_csv_to_dataframe( - table_path, MODEL_SPEC['args']['predictor_table_path'] + data_paths = validation.get_validated_dataframe( + table_path, **MODEL_SPEC['args']['predictor_table_path'] )['path'].tolist() base_vector = gdal.OpenEx(base_vector_path, gdal.OF_VECTOR) @@ -1640,8 +1640,8 @@ def _validate_predictor_types(table_path): string message if any value in the ``type`` column does not match a valid type, ignoring leading/trailing whitespace. """ - df = utils.read_csv_to_dataframe( - table_path, MODEL_SPEC['args']['predictor_table_path']) + df = validation.get_validated_dataframe( + table_path, **MODEL_SPEC['args']['predictor_table_path']) # ignore leading/trailing whitespace because it will be removed # when the type values are used valid_types = set({'raster_mean', 'raster_sum', 'point_count', diff --git a/src/natcap/invest/sdr/sdr.py b/src/natcap/invest/sdr/sdr.py index 5833ac174..9a8ad2d66 100644 --- a/src/natcap/invest/sdr/sdr.py +++ b/src/natcap/invest/sdr/sdr.py @@ -494,8 +494,9 @@ def execute(args): """ file_suffix = utils.make_suffix_string(args, 'results_suffix') - biophysical_df = utils.read_csv_to_dataframe( - args['biophysical_table_path'], MODEL_SPEC['args']['biophysical_table_path']) + biophysical_df = validation.get_validated_dataframe( + args['biophysical_table_path'], + **MODEL_SPEC['args']['biophysical_table_path']) # Test to see if c or p values are outside of 0..1 for key in ['usle_c', 'usle_p']: diff --git a/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py b/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py index e969be9e0..4f8d7396a 100644 --- a/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py +++ b/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py @@ -587,19 +587,19 @@ def execute(args): # fail early on a missing required rain events table if (not args['user_defined_local_recharge'] and not args['user_defined_climate_zones']): - rain_events_df = utils.read_csv_to_dataframe( + rain_events_df = validation.get_validated_dataframe( args['rain_events_table_path'], - MODEL_SPEC['args']['rain_events_table_path']) + **MODEL_SPEC['args']['rain_events_table_path']) - biophysical_df = utils.read_csv_to_dataframe( + biophysical_df = validation.get_validated_dataframe( args['biophysical_table_path'], - MODEL_SPEC['args']['biophysical_table_path']) + **MODEL_SPEC['args']['biophysical_table_path']) if args['monthly_alpha']: # parse out the alpha lookup table of the form (month_id: alpha_val) - alpha_month_map = utils.read_csv_to_dataframe( + alpha_month_map = validation.get_validated_dataframe( args['monthly_alpha_path'], - MODEL_SPEC['args']['monthly_alpha_path'] + **MODEL_SPEC['args']['monthly_alpha_path'] 
)['alpha'].to_dict() else: # make all 12 entries equal to args['alpha_m'] @@ -766,9 +766,9 @@ def execute(args): 'table_name': 'Climate Zone'} for month_id in range(N_MONTHS): if args['user_defined_climate_zones']: - cz_rain_events_df = utils.read_csv_to_dataframe( + cz_rain_events_df = validation.get_validated_dataframe( args['climate_zone_table_path'], - MODEL_SPEC['args']['climate_zone_table_path']) + **MODEL_SPEC['args']['climate_zone_table_path']) climate_zone_rain_events_month = ( cz_rain_events_df[MONTH_ID_TO_LABEL[month_id]].to_dict()) n_events_task = task_graph.add_task( diff --git a/src/natcap/invest/stormwater.py b/src/natcap/invest/stormwater.py index 5bd7f5366..ba10e31d2 100644 --- a/src/natcap/invest/stormwater.py +++ b/src/natcap/invest/stormwater.py @@ -486,8 +486,8 @@ def execute(args): # Build a lookup dictionary mapping each LULC code to its row # sort by the LULC codes upfront because we use the sorted list in multiple # places. it's more efficient to do this once. - biophysical_df = utils.read_csv_to_dataframe( - args['biophysical_table'], MODEL_SPEC['args']['biophysical_table'] + biophysical_df = validation.get_validated_dataframe( + args['biophysical_table'], **MODEL_SPEC['args']['biophysical_table'] ).sort_index() sorted_lucodes = biophysical_df.index.to_list() diff --git a/src/natcap/invest/urban_cooling_model.py b/src/natcap/invest/urban_cooling_model.py index 1f1f86b53..1581d7521 100644 --- a/src/natcap/invest/urban_cooling_model.py +++ b/src/natcap/invest/urban_cooling_model.py @@ -413,8 +413,9 @@ def execute(args): intermediate_dir = os.path.join( args['workspace_dir'], 'intermediate') utils.make_directories([args['workspace_dir'], intermediate_dir]) - biophysical_df = utils.read_csv_to_dataframe( - args['biophysical_table_path'], MODEL_SPEC['args']['biophysical_table_path']) + biophysical_df = validation.get_validated_dataframe( + args['biophysical_table_path'], + **MODEL_SPEC['args']['biophysical_table_path']) # cast to float and calculate relative weights # Use default weights for shade, albedo, eti if the user didn't provide @@ -1084,9 +1085,9 @@ def calculate_energy_savings( for field in target_building_layer.schema] type_field_index = fieldnames.index('type') - energy_consumption_df = utils.read_csv_to_dataframe( + energy_consumption_df = validation.get_validated_dataframe( energy_consumption_table_path, - MODEL_SPEC['args']['energy_consumption_table_path']) + **MODEL_SPEC['args']['energy_consumption_table_path']) target_building_layer.StartTransaction() last_time = time.time() diff --git a/src/natcap/invest/urban_flood_risk_mitigation.py b/src/natcap/invest/urban_flood_risk_mitigation.py index 7703dfcd1..7f865e5e3 100644 --- a/src/natcap/invest/urban_flood_risk_mitigation.py +++ b/src/natcap/invest/urban_flood_risk_mitigation.py @@ -298,9 +298,9 @@ def execute(args): task_name='align raster stack') # Load CN table - cn_df = utils.read_csv_to_dataframe( + cn_df = validation.get_validated_dataframe( args['curve_number_table_path'], - MODEL_SPEC['args']['curve_number_table_path']) + **MODEL_SPEC['args']['curve_number_table_path']) # make cn_table into a 2d array where first dim is lucode, second is # 0..3 to correspond to CN_A..CN_D @@ -640,9 +640,9 @@ def _calculate_damage_to_infrastructure_in_aoi( infrastructure_vector = gdal.OpenEx(structures_vector_path, gdal.OF_VECTOR) infrastructure_layer = infrastructure_vector.GetLayer() - damage_type_map = utils.read_csv_to_dataframe( + damage_type_map = validation.get_validated_dataframe( structures_damage_table, 
- MODEL_SPEC['args']['infrastructure_damage_loss_table_path'] + **MODEL_SPEC['args']['infrastructure_damage_loss_table_path'] )['damage'].to_dict() infrastructure_layer_defn = infrastructure_layer.GetLayerDefn() @@ -942,9 +942,9 @@ def validate(args, limit_to=None): if ("curve_number_table_path" not in invalid_keys and "curve_number_table_path" in sufficient_keys): # Load CN table. Resulting DF has index and CN_X columns only. - cn_df = utils.read_csv_to_dataframe( + cn_df = validation.get_validated_dataframe( args['curve_number_table_path'], - MODEL_SPEC['args']['curve_number_table_path']) + **MODEL_SPEC['args']['curve_number_table_path']) # Check for NaN values. nan_mask = cn_df.isna() if nan_mask.any(axis=None): diff --git a/src/natcap/invest/urban_nature_access.py b/src/natcap/invest/urban_nature_access.py index f02a44361..c0fc76296 100644 --- a/src/natcap/invest/urban_nature_access.py +++ b/src/natcap/invest/urban_nature_access.py @@ -924,9 +924,9 @@ def execute(args): aoi_reprojection_task, lulc_mask_task] ) - attr_table = utils.read_csv_to_dataframe( + attr_table = validation.get_validated_dataframe( args['lulc_attribute_table'], - MODEL_SPEC['args']['lulc_attribute_table']) + **MODEL_SPEC['args']['lulc_attribute_table']) kernel_paths = {} # search_radius, kernel path kernel_tasks = {} # search_radius, kernel task @@ -944,9 +944,9 @@ def execute(args): lucode_to_search_radii = list( urban_nature_attrs[['search_radius_m']].itertuples(name=None)) elif args['search_radius_mode'] == RADIUS_OPT_POP_GROUP: - pop_group_table = utils.read_csv_to_dataframe( + pop_group_table = validation.get_validated_dataframe( args['population_group_radii_table'], - MODEL_SPEC['args']['population_group_radii_table']) + **MODEL_SPEC['args']['population_group_radii_table']) search_radii = set(pop_group_table['search_radius_m'].unique()) # Build a dict of {pop_group: search_radius_m} search_radii_by_pop_group = pop_group_table['search_radius_m'].to_dict() @@ -1835,8 +1835,8 @@ def _reclassify_urban_nature_area( Returns: ``None`` """ - lulc_attribute_df = utils.read_csv_to_dataframe( - lulc_attribute_table, MODEL_SPEC['args']['lulc_attribute_table']) + lulc_attribute_df = validation.get_validated_dataframe( + lulc_attribute_table, **MODEL_SPEC['args']['lulc_attribute_table']) squared_pixel_area = abs( numpy.multiply(*_square_off_pixels(lulc_raster_path))) diff --git a/src/natcap/invest/utils.py b/src/natcap/invest/utils.py index d845444e0..9ee61a060 100644 --- a/src/natcap/invest/utils.py +++ b/src/natcap/invest/utils.py @@ -410,125 +410,48 @@ def expand_path(path, base_path): return os.path.abspath(os.path.join(os.path.dirname(base_path), path)) -def read_csv_to_dataframe(path, spec, **kwargs): +def read_csv_to_dataframe(path, **kwargs): """Return a dataframe representation of the CSV. - Wrapper around ``pandas.read_csv`` that performs some common data cleaning - based on information in the arg spec. - - Columns are filtered to just those that match a pattern in the spec. - Column names are lowercased and whitespace is stripped off. Empty rows are - dropped. 
Values in each column are processed and cast to an appropriate - dtype according to the type in the spec: - - - Values in raster, vector, csv, file, and directory columns are cast to - str, whitespace stripped, and expanded as paths relative to the input path - - Values in freestyle_string and option_string columns are cast to str, - whitespace stripped, and converted to lowercase - - Values in number, ratio, and percent columns are cast to float - - Values in integer columns are cast to int - - Values in boolean columns are cast to bool - - Empty or NA cells are returned as ``numpy.nan`` (for floats) or - ``pandas.NA`` (for all other types). - - Also sets custom defaults for some kwargs passed to ``pandas.read_csv``, - which you can override with kwargs: + Wrapper around ``pandas.read_csv`` that performs some common data cleaning. + Column names are lowercased and whitespace is stripped off. Empty rows and + columns are dropped. Sets custom defaults for some kwargs passed to + ``pandas.read_csv``, which you can override with kwargs: - sep=None: lets the Python engine infer the separator - engine='python': The 'python' engine supports the sep=None option. - encoding='utf-8-sig': 'utf-8-sig' handles UTF-8 with or without BOM. + - index_col=False: force pandas not to index by any column, useful in + case of trailing separators Args: path (str): path to a CSV file - spec (dict): dictionary specifying the structure of the CSV table **kwargs: additional kwargs will be passed to ``pandas.read_csv`` Returns: pandas.DataFrame with the contents of the given CSV """ - # build up a list of regex patterns to match columns against columns from - # the table that match a pattern in this list (after stripping whitespace - # and lowercasing) will be included in the dataframe - patterns = [] - for column in spec['columns']: - column = column.lower() - match = re.match(r'(.*)\[(.+)\](.*)', column) - if match: - # for column name patterns, convert it to a regex pattern - groups = match.groups() - patterns.append(f'{groups[0]}(.+){groups[2]}') - else: - # for regular column names, use the exact name as the pattern - patterns.append(column.replace('(', '\(').replace(')', '\)')) - try: - # set index_col=False to force pandas not to index by any column - # this is useful in case of trailing separators - # we'll explicitly set the index column later on df = pandas.read_csv( path, - index_col=False, - usecols=lambda col: any( - re.fullmatch(pattern, col.strip().lower()) for pattern in patterns - ), **{ + 'index_col': False, 'sep': None, 'engine': 'python', 'encoding': 'utf-8-sig', **kwargs }) except UnicodeDecodeError as error: - LOGGER.error( + raise ValueError( f'The file {path} must be encoded as UTF-8 or ASCII') - raise error + + # drop columns whose header is NA + df = df[[col for col in df.columns if not pandas.isna(col)]] # strip whitespace from column names and convert to lowercase # this won't work on integer types, which happens if you set header=None # however, there's little reason to use this function if there's no header - df.columns = df.columns.str.strip().str.lower() - - # drop any empty rows - df = df.dropna(how="all") - - available_cols = set(df.columns) - - for col_spec, pattern in zip(spec['columns'].values(), patterns): - matching_cols = [c for c in available_cols if re.match(pattern, c)] - available_cols -= set(matching_cols) - for col in matching_cols: - try: - if col_spec['type'] in ['csv', 'directory', 'file', 'raster', 'vector', {'vector', 'raster'}]: - df[col] = df[col].apply( - lambda 
p: p if pandas.isna(p) else expand_path(str(p).strip(), path)) - df[col] = df[col].astype(pandas.StringDtype()) - elif col_spec['type'] in {'freestyle_string', 'option_string'}: - df[col] = df[col].apply( - lambda s: s if pandas.isna(s) else str(s).strip().lower()) - df[col] = df[col].astype(pandas.StringDtype()) - elif col_spec['type'] in {'number', 'percent', 'ratio'}: - df[col] = df[col].astype(float) - elif col_spec['type'] == 'integer': - df[col] = df[col].astype(pandas.Int64Dtype()) - elif col_spec['type'] == 'boolean': - df[col] = df[col].astype('boolean') - except ValueError as err: - raise ValueError( - f'Value(s) in the "{col}" column of the table {path} ' - f'could not be interpreted as {col_spec["type"]}s. ' - f'Original error: {err}') - - # set the index column, if specified - if 'index_col' in spec and spec['index_col'] is not None: - index_col = spec['index_col'].lower() - try: - df = df.set_index(index_col, verify_integrity=True) - except KeyError: - # If 'index_col' is not a column then KeyError is raised for using - # it as the index column - LOGGER.error(f"The column '{index_col}' could not be found " - f"in the table {path}") - raise + df.columns = df.columns.astype(str).str.strip().str.lower() return df diff --git a/src/natcap/invest/validation.py b/src/natcap/invest/validation.py index b98e193de..12278b91e 100644 --- a/src/natcap/invest/validation.py +++ b/src/natcap/invest/validation.py @@ -33,8 +33,10 @@ 'MISSING_VALUE': gettext('Input is required but has no value'), 'MATCHED_NO_HEADERS': gettext('Expected the {header} "{header_name}" but did ' 'not find it'), + 'PATTERN_MATCHED_NONE': gettext('Expected to find at least one {header} matching ' + 'the pattern "{header_name}" but found none'), 'DUPLICATE_HEADER': gettext('Expected the {header} "{header_name}" only once ' - 'but found it {number} times'), + 'but found it {number} times'), 'NOT_A_NUMBER': gettext('Value "{value}" could not be interpreted as a number'), 'WRONG_PROJECTION_UNIT': gettext('Layer must be projected in this unit: ' '"{unit_a}" but found this unit: "{unit_b}"'), @@ -47,8 +49,6 @@ 'NOT_GDAL_RASTER': gettext('File could not be opened as a GDAL raster'), 'OVR_FILE': gettext('File found to be an overview ".ovr" file.'), 'NOT_GDAL_VECTOR': gettext('File could not be opened as a GDAL vector'), - 'NOT_CSV': gettext('File could not be opened as a CSV. 
File must be encoded as ' - 'a UTF-8 CSV.'), 'REGEXP_MISMATCH': gettext("Value did not match expected pattern {regexp}"), 'INVALID_OPTION': gettext("Value must be one of: {option_list}"), 'INVALID_VALUE': gettext('Value does not meet condition {condition}'), @@ -572,19 +572,111 @@ def check_boolean(value, **kwargs): return MESSAGES['NOT_BOOLEAN'].format(value=value) -def check_csv(filepath, rows=None, columns=None, **kwargs): +def get_validated_dataframe(csv_path, columns=None, rows=None, index_col=None, + read_csv_kwargs={}, **kwargs): + """Read a CSV into a dataframe that is guaranteed to match the spec.""" + + if not (columns or rows): + raise ValueError('One of columns or rows must be provided') + + # build up a list of regex patterns to match columns against columns from + # the table that match a pattern in this list (after stripping whitespace + # and lowercasing) will be included in the dataframe + axis = 'column' if columns else 'row' + + if rows: + read_csv_kwargs = read_csv_kwargs.copy() + read_csv_kwargs['header'] = None + + df = utils.read_csv_to_dataframe(csv_path, **read_csv_kwargs) + + if rows: + # swap rows and column + df = df.set_index(df.columns[0]).rename_axis(None, axis=0).T.reset_index(drop=True) + + spec = columns if columns else rows + + patterns = [] + for column in spec: + column = column.lower() + match = re.match(r'(.*)\[(.+)\](.*)', column) + if match: + # for column name patterns, convert it to a regex pattern + groups = match.groups() + patterns.append(f'{groups[0]}(.+){groups[2]}') + else: + # for regular column names, use the exact name as the pattern + patterns.append(column.replace('(', '\(').replace(')', '\)')) + + # select only the columns that match a pattern + df = df[[col for col in df.columns if any( + re.fullmatch(pattern, col) for pattern in patterns)]] + + # drop any empty rows + df = df.dropna(how="all").reset_index(drop=True) + + available_cols = set(df.columns) + + for (col_name, col_spec), pattern in zip(spec.items(), patterns): + matching_cols = [c for c in available_cols if re.fullmatch(pattern, c)] + if col_spec.get('required', True) is True and '[' not in col_name and not matching_cols: + raise ValueError(MESSAGES['MATCHED_NO_HEADERS'].format( + header=axis, + header_name=col_name)) + available_cols -= set(matching_cols) + for col in matching_cols: + try: + # frozenset needed to make the set hashable. A frozenset and set with the same members are equal. + if col_spec['type'] in {'csv', 'directory', 'file', 'raster', 'vector', frozenset({'vector', 'raster'})}: + df[col] = df[col].apply( + lambda p: p if pandas.isna(p) else utils.expand_path(str(p).strip(), csv_path)) + df[col] = df[col].astype(pandas.StringDtype()) + elif col_spec['type'] in {'freestyle_string', 'option_string'}: + df[col] = df[col].apply( + lambda s: s if pandas.isna(s) else str(s).strip().lower()) + df[col] = df[col].astype(pandas.StringDtype()) + elif col_spec['type'] in {'number', 'percent', 'ratio'}: + df[col] = df[col].astype(float) + elif col_spec['type'] == 'integer': + df[col] = df[col].astype(pandas.Int64Dtype()) + elif col_spec['type'] == 'boolean': + df[col] = df[col].astype('boolean') + else: + raise ValueError(f"Unknown type: {col_spec['type']}") + except Exception as err: + raise ValueError( + f'Value(s) in the "{col}" column could not be interpreted ' + f'as {col_spec["type"]}s. 
Original error: {err}') + + if any(df.columns.duplicated()): + duplicated_columns = df.columns[df.columns.duplicated()] + raise ValueError(MESSAGES['DUPLICATE_HEADER'].format( + header=axis, + header_name=duplicated_columns[0], + number=sum(df.columns == duplicated_columns[0]))) + + # set the index column, if specified + if index_col is not None: + index_col = index_col.lower() + try: + df = df.set_index(index_col, verify_integrity=True) + except KeyError: + # If 'index_col' is not a column then KeyError is raised for using + # it as the index column + LOGGER.error(f"The column '{index_col}' could not be found " + f"in the table {csv_path}") + raise + + return df + + + + +def check_csv(filepath, **kwargs): """Validate a table. Args: filepath (string): The string filepath to the table. - rows (dict): A dictionary spec of row names that are expected to exist - in the first column of the table. See the docstring of - ``check_headers`` for details on validation rules. No more than one - of `rows` and `columns` should be defined. - columns (dict): A dictionary spec of column names that are expected to - exist in the first row of the table. See the docstring of - ``check_headers`` for details on validation rules. No more than one - of `rows` and `columns` should be defined. Returns: A string error message if an error was found. ``None`` otherwise. @@ -593,28 +685,11 @@ def check_csv(filepath, rows=None, columns=None, **kwargs): file_warning = check_file(filepath, permissions='r') if file_warning: return file_warning - - try: - # Check if the file encoding is UTF-8 BOM first - encoding = None - if utils.has_utf8_bom(filepath): - encoding = 'utf-8-sig' - # engine=python handles unknown characters by replacing them with a - # replacement character, instead of raising an error - # use sep=None, engine='python' to infer what the separator is - dataframe = pandas.read_csv( - filepath, sep=None, engine='python', encoding=encoding, - header=None) - except Exception: - return MESSAGES['NOT_CSV'] - - # assume that at most one of `rows` and `columns` is defined - if columns: - headers = [str(name).strip() for name in dataframe.iloc[0]] - return check_headers(get_headers_to_validate(columns), headers, 'column') - elif rows: - headers = [str(name).strip() for name in dataframe.iloc[:, 0]] - return check_headers(get_headers_to_validate(rows), headers, 'row') + if 'columns' in kwargs or 'rows' in kwargs: + try: + get_validated_dataframe(filepath, **kwargs) + except Exception as e: + return str(e) def check_headers(expected_headers, actual_headers, header_type='header'): diff --git a/src/natcap/invest/wave_energy.py b/src/natcap/invest/wave_energy.py index dc0a9a7ad..be91ea839 100644 --- a/src/natcap/invest/wave_energy.py +++ b/src/natcap/invest/wave_energy.py @@ -287,26 +287,22 @@ }, "machine_param_path": { "type": "csv", - "rows": { - "capmax": { - "about": gettext("Maximum capacity for device."), - "type": "number", - "units": u.kilowatt - }, - "hsmax": { + # use columns because of the non standard format of this table, + # we cannot validate it with the rows as headers. + "columns": { + "name": { + "type": "freestyle_string", + "about": gettext( - "Upper limit of wave height for device operation. The " - "device shuts down when waves are higher than this."), - "type": "number", - "units": u.meter + "Name of the machine parameter. 
Expected parameters are: " + "'capmax' (maximum capacity for device, in kilowatts), " + "'hsmax' (upper limit of wave height for device operation, " + "in meters), and 'tpmax' (upper limit of wave period for " + "device operation, in seconds).") }, - "tpmax": { - "about": gettext( - "Upper limit of wave period for device operation. The " - "device shuts down when the wave period is longer " - "than this."), + "value": { "type": "number", - "units": u.second + "units": u.none, + "about": gettext("Value of the machine parameter.") } }, "about": gettext("Table of parameters for the wave energy machine in use."), @@ -335,52 +331,28 @@ }, "machine_econ_path": { "type": "csv", - "rows": { - "capmax": { - "type": "number", - "units": u.kilowatt, - "about": gettext("Maximum capacity of the device.") - }, - "cc": { - "type": "number", - "units": u.currency/u.kilowatt, - "about": gettext("Capital cost per device installed.") - }, - "cml": { - "type": "number", - "units": u.currency/u.meter, - "about": gettext("Cost of mooring lines.") - }, - "cul": { - "type": "number", - "units": u.currency/u.kilometer, - "about": gettext("Cost of underwater cable.") - }, - "col": { - "type": "number", - "units": u.currency/u.kilometer, - "about": gettext("Cost of overland transmission lines.") - }, - "omc": { - "type": "number", - "units": u.currency/u.kilowatt_hour, - "about": gettext("Operating and maintenance cost.") - }, - "p": { - "type": "number", - "units": u.currency/u.kilowatt_hour, - "about": gettext("Price of electricity.") - }, - "r": { - "type": "ratio", - "about": gettext("Discount rate.") + # use columns because of the non standard format of this table, + # we cannot validate it with the rows as headers. + "columns": { + "name": { + "type": "freestyle_string", + "about": gettext( + "Name of the machine parameter. Expected parameters are: " + "'capmax' (maximum capacity for device, in kilowatts), " + "'cc' (capital cost per device installed, $/kilowatt), " + "'cml' (cost of mooring lines, $/kilometer), " + "'cul' (cost of underwater cable, $/kilometer), " + "'col' (cost of overland transmission lines, $/kilometer), " + "'omc' (operating and maintenance cost, $/kilowatt hour), " + "'p' (price of electricity, $/kilowatt hour), " + "'r' (discount rate, between 0 and 1), " + "'smlpm' (number of slack lines required per machine)") }, - "smlpm": { + "value": { "type": "number", "units": u.none, - "about": gettext("Number of slack lines required per machine.") + "about": gettext("Value of the machine parameter.") } - }, "required": "valuation_container", "about": gettext( @@ -741,7 +713,7 @@ def execute(args): # arrays. 
Also store the amount of energy the machine produces # in a certain wave period/height state as a 2D array machine_perf_dict = {} - machine_perf_data = pandas.read_csv(args['machine_perf_path']) + machine_perf_data = utils.read_csv_to_dataframe(args['machine_perf_path']) # Get the wave period fields, starting from the second column of the table machine_perf_dict['periods'] = machine_perf_data.columns.values[1:] # Build up the height field by taking the first column of the table @@ -769,14 +741,21 @@ def execute(args): LOGGER.debug('Machine Performance Rows : %s', machine_perf_dict['periods']) LOGGER.debug('Machine Performance Cols : %s', machine_perf_dict['heights']) - machine_param_dict = _machine_csv_to_dict(args['machine_param_path']) + machine_param_dict = validation.get_validated_dataframe( + args['machine_param_path'], + index_col='name', + columns={ + 'name': {'type': 'option_string'}, + 'value': {'type': 'number'} + }, + )['value'].to_dict() # Check if required column fields are entered in the land grid csv file if 'land_gridPts_path' in args: # Create a grid_land_df dataframe for later use in valuation - grid_land_df = utils.read_csv_to_dataframe( + grid_land_df = validation.get_validated_dataframe( args['land_gridPts_path'], - MODEL_SPEC['args']['land_gridPts_path']) + **MODEL_SPEC['args']['land_gridPts_path']) missing_grid_land_fields = [] for field in ['id', 'type', 'lat', 'long', 'location']: if field not in grid_land_df.columns: @@ -788,7 +767,14 @@ def execute(args): 'Connection Points File: %s' % missing_grid_land_fields) if 'valuation_container' in args and args['valuation_container']: - machine_econ_dict = _machine_csv_to_dict(args['machine_econ_path']) + machine_econ_dict = validation.get_validated_dataframe( + args['machine_econ_path'], + index_col='name', + columns={ + 'name': {'type': 'option_string'}, + 'value': {'type': 'number'} + } + )['value'].to_dict() # Build up a dictionary of possible analysis areas where the key # is the analysis area selected and the value is a dictionary @@ -1626,42 +1612,6 @@ def _binary_wave_data_to_dict(wave_file_path): return wave_dict -def _machine_csv_to_dict(machine_csv_path): - """Create a dictionary from the table in machine csv file. - - The dictionary's keys are the 'NAME' from the machine table and its values - are from the corresponding 'VALUE' field. No need to check for missing - columns since the file is validated by validate() function. - - Args: - machine_csv_path (str): path to the input machine CSV file. - - Returns: - machine_dict (dict): a dictionary of keys from the first column of the - CSV file and corresponding values from the `VALUE` column. - - """ - machine_dict = {} - # make columns and indexes lowercased and strip whitespace - machine_data = utils.read_csv_to_dataframe( - machine_csv_path, - { - 'index_col': 'name', - 'columns': { - 'name': {'type': 'freestyle_string'}, - 'value': {'type': 'number'} - }}) - - # drop NaN indexed rows in dataframe - machine_data = machine_data[machine_data.index.notnull()] - LOGGER.debug('machine_data dataframe from %s: %s' % - (machine_csv_path, machine_data)) - machine_dict = machine_data.to_dict('index') - for key in machine_dict.keys(): - machine_dict[key] = machine_dict[key]['value'] - return machine_dict - - def _get_vector_spatial_ref(base_vector_path): """Get the spatial reference of an OGR vector (datasource). 
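Note on the wave_energy.py change above: the two-column NAME/VALUE machine parameter tables are now read through the same validated-dataframe call used elsewhere in the patch. A minimal sketch of that call follows; the temporary file, table contents, and values are illustrative stand-ins, not part of this patch.

    import os
    import tempfile

    from natcap.invest import validation

    # Made-up machine parameter table in the NAME/VALUE layout described above.
    csv_path = os.path.join(tempfile.mkdtemp(), 'machine_params.csv')
    with open(csv_path, 'w') as table:
        table.write('NAME,VALUE\n'
                    'CapMax,1000\n'
                    'HsMax,5.5\n'
                    'TpMax,16\n')

    # Headers and string values are lowercased and stripped, 'value' is cast
    # to float, and the 'name' column becomes the index, so the result
    # collapses cleanly into a parameter dictionary.
    machine_param_dict = validation.get_validated_dataframe(
        csv_path,
        index_col='name',
        columns={
            'name': {'type': 'option_string'},
            'value': {'type': 'number'},
        })['value'].to_dict()
    # machine_param_dict == {'capmax': 1000.0, 'hsmax': 5.5, 'tpmax': 16.0}
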
diff --git a/src/natcap/invest/wind_energy.py b/src/natcap/invest/wind_energy.py index bdb33ede6..f7fa4cc73 100644 --- a/src/natcap/invest/wind_energy.py +++ b/src/natcap/invest/wind_energy.py @@ -7,7 +7,6 @@ import tempfile import numpy -import pandas from scipy import integrate import shapely.wkb @@ -707,18 +706,21 @@ def execute(args): ] # Read the biophysical turbine parameters into a dictionary - bio_turbine_dict = _read_csv_wind_parameters( - args['turbine_parameters_path'], biophysical_params) - + turbine_dict = validation.get_validated_dataframe( + args['turbine_parameters_path'], + **MODEL_SPEC['args']['turbine_parameters_path'] + ).iloc[0].to_dict() # Read the biophysical global parameters into a dictionary - bio_global_params_dict = _read_csv_wind_parameters( - args['global_wind_parameters_path'], biophysical_params) + global_params_dict = validation.get_validated_dataframe( + args['global_wind_parameters_path'], + **MODEL_SPEC['args']['global_wind_parameters_path'] + ).iloc[0].to_dict() # Combine the turbine and global parameters into one dictionary - bio_parameters_dict = bio_global_params_dict.copy() - bio_parameters_dict.update(bio_turbine_dict) + parameters_dict = global_params_dict.copy() + parameters_dict.update(turbine_dict) - LOGGER.debug('Biophysical Turbine Parameters: %s', bio_parameters_dict) + LOGGER.debug('Biophysical Turbine Parameters: %s', parameters_dict) if ('valuation_container' not in args or args['valuation_container'] is False): @@ -727,33 +729,11 @@ def execute(args): LOGGER.info( 'Valuation Selected. Checking required parameters from CSV files.') - # Create a list of the valuation parameters we are looking for from the - # input files - valuation_turbine_params = ['turbine_cost', 'turbine_rated_pwr'] - # Read the biophysical turbine parameters into a dictionary - val_turbine_dict = _read_csv_wind_parameters( - args['turbine_parameters_path'], valuation_turbine_params) - - valuation_global_params = [ - 'carbon_coefficient', 'time_period', 'infield_cable_cost', - 'infield_cable_length', 'installation_cost', - 'miscellaneous_capex_cost', 'operation_maintenance_cost', - 'decommission_cost', 'ac_dc_distance_break', 'mw_coef_ac', - 'mw_coef_dc', 'cable_coef_ac', 'cable_coef_dc' - ] - # Read the biophysical global parameters into a dictionary - val_global_param_dict = _read_csv_wind_parameters( - args['global_wind_parameters_path'], valuation_global_params) - - # Combine the turbine and global parameters into one dictionary - val_parameters_dict = val_global_param_dict.copy() - val_parameters_dict.update(val_turbine_dict) - # If Price Table provided use that for price of energy, validate inputs - time = int(val_parameters_dict['time_period']) + time = parameters_dict['time_period'] if args['price_table']: - wind_price_df = utils.read_csv_to_dataframe( - args['wind_schedule'], MODEL_SPEC['args']['wind_schedule'] + wind_price_df = validation.get_validated_dataframe( + args['wind_schedule'], **MODEL_SPEC['args']['wind_schedule'] ).sort_index() # sort by year year_count = len(wind_price_df) @@ -773,17 +753,26 @@ def execute(args): # are the time steps for the lifespan of the farm and values # are adjusted based on the rate of change price_list = [] - for time_step in range(time + 1): + for time_step in range(int(time) + 1): price_list.append(wind_price * (1 + change_rate)**(time_step)) # Hub Height to use for setting Weibull parameters - hub_height = int(bio_parameters_dict['hub_height']) + hub_height = parameters_dict['hub_height'] LOGGER.debug('hub_height : %s', 
hub_height) # Read the wind energy data into a dictionary LOGGER.info('Reading in Wind Data into a dictionary') - wind_data = _read_csv_wind_data(args['wind_data_path'], hub_height) + wind_point_df = validation.get_validated_dataframe( + args['wind_data_path'], **MODEL_SPEC['args']['wind_data_path']) + wind_point_df.columns = wind_point_df.columns.str.upper() + # Calculate scale value at new hub height given reference values. + # See equation 3 in users guide + wind_point_df.rename(columns={'LAM': 'REF_LAM'}, inplace=True) + wind_point_df['LAM'] = wind_point_df.apply( + lambda row: row.REF_LAM * (hub_height / row.REF)**_ALPHA, axis=1) + wind_point_df.drop(['REF'], axis=1) # REF is not needed after calculation + wind_data = wind_point_df.to_dict('index') # so keys will be 0, 1, 2, ... # Compute Wind Density and Harvested Wind Energy, adding the values to the # points to the dictionary, and pickle the dictionary @@ -791,7 +780,7 @@ def execute(args): inter_dir, 'wind_data%s.pickle' % suffix) compute_density_harvested_task = task_graph.add_task( func=_compute_density_harvested_fields, - args=(wind_data, bio_parameters_dict, number_of_turbines, + args=(wind_data, parameters_dict, number_of_turbines, wind_data_pickle_path), target_path_list=[wind_data_pickle_path], task_name='compute_density_harvested_fields') @@ -1132,8 +1121,8 @@ def execute(args): LOGGER.info('Grid Points Provided. Reading in the grid points') # Read the grid points csv, and convert it to land and grid dictionary - grid_land_df = utils.read_csv_to_dataframe( - args['grid_points_path'], MODEL_SPEC['args']['grid_points_path']) + grid_land_df = validation.get_validated_dataframe( + args['grid_points_path'], **MODEL_SPEC['args']['grid_points_path']) # Convert the dataframes to dictionaries, using 'ID' (the index) as key grid_dict = grid_land_df[grid_land_df['type'] == 'grid'].to_dict('index') @@ -1306,7 +1295,7 @@ def execute(args): task_graph.add_task( func=_calculate_npv_levelized_rasters, args=(harvested_masked_path, final_dist_raster_path, npv_raster_path, - levelized_raster_path, val_parameters_dict, args, price_list), + levelized_raster_path, parameters_dict, args, price_list), target_path_list=[npv_raster_path, levelized_raster_path], task_name='calculate_npv_levelized_rasters', dependent_task_list=[final_dist_task]) @@ -1316,7 +1305,7 @@ def execute(args): # The amount of CO2 not released into the atmosphere, with the constant # conversion factor provided in the users guide by Rob Griffin - carbon_coef = float(val_parameters_dict['carbon_coefficient']) + carbon_coef = parameters_dict['carbon_coefficient'] task_graph.add_task( func=pygeoprocessing.raster_calculator, @@ -1335,7 +1324,7 @@ def execute(args): def _calculate_npv_levelized_rasters( base_harvested_raster_path, base_dist_raster_path, target_npv_raster_path, target_levelized_raster_path, - val_parameters_dict, args, price_list): + parameters_dict, args, price_list): """Calculate NPV and levelized rasters from harvested and dist rasters. Args: @@ -1352,7 +1341,7 @@ def _calculate_npv_levelized_rasters( store the unit price of energy that would be required to set the present value of the farm centered at each pixel equal to zero. - val_parameters_dict (dict): a dictionary of the turbine and biophysical + parameters_dict (dict): a dictionary of the turbine and biophysical global parameters. 
args (dict): a dictionary that contains information on @@ -1383,35 +1372,35 @@ def _calculate_npv_levelized_rasters( target_levelized_raster_path, gdal.OF_RASTER | gdal.GA_Update) levelized_band = levelized_raster.GetRasterBand(1) - # Get constants from val_parameters_dict to make it more readable + # Get constants from parameters_dict to make it more readable # The length of infield cable in km - infield_length = float(val_parameters_dict['infield_cable_length']) + infield_length = parameters_dict['infield_cable_length'] # The cost of infield cable in currency units per km - infield_cost = float(val_parameters_dict['infield_cable_cost']) + infield_cost = parameters_dict['infield_cable_cost'] # The cost of the foundation in currency units - foundation_cost = float(args['foundation_cost']) + foundation_cost = args['foundation_cost'] # The cost of each turbine unit in currency units - unit_cost = float(val_parameters_dict['turbine_cost']) + unit_cost = parameters_dict['turbine_cost'] # The installation cost as a decimal - install_cost = float(val_parameters_dict['installation_cost']) + install_cost = parameters_dict['installation_cost'] # The miscellaneous costs as a decimal factor of capex_arr - misc_capex_cost = float(val_parameters_dict['miscellaneous_capex_cost']) + misc_capex_cost = parameters_dict['miscellaneous_capex_cost'] # The operations and maintenance costs as a decimal factor of capex_arr - op_maint_cost = float(val_parameters_dict['operation_maintenance_cost']) + op_maint_cost = parameters_dict['operation_maintenance_cost'] # The discount rate as a decimal - discount_rate = float(args['discount_rate']) + discount_rate = args['discount_rate'] # The cost to decommission the farm as a decimal factor of capex_arr - decom = float(val_parameters_dict['decommission_cost']) + decom = parameters_dict['decommission_cost'] # The mega watt value for the turbines in MW - mega_watt = float(val_parameters_dict['turbine_rated_pwr']) + mega_watt = parameters_dict['turbine_rated_pwr'] # The distance at which AC switches over to DC power - circuit_break = float(val_parameters_dict['ac_dc_distance_break']) + circuit_break = parameters_dict['ac_dc_distance_break'] # The coefficients for the AC/DC megawatt and cable cost from the CAP # function - mw_coef_ac = float(val_parameters_dict['mw_coef_ac']) - mw_coef_dc = float(val_parameters_dict['mw_coef_dc']) - cable_coef_ac = float(val_parameters_dict['cable_coef_ac']) - cable_coef_dc = float(val_parameters_dict['cable_coef_dc']) + mw_coef_ac = parameters_dict['mw_coef_ac'] + mw_coef_dc = parameters_dict['mw_coef_dc'] + cable_coef_ac = parameters_dict['cable_coef_ac'] + cable_coef_dc = parameters_dict['cable_coef_dc'] # The total mega watt capacity of the wind farm where mega watt is the # turbines rated power @@ -1437,7 +1426,7 @@ def _calculate_npv_levelized_rasters( # Discount constant raised to the total time, a constant found in the NPV # calculation (1+i)^T - disc_time = disc_const**int(val_parameters_dict['time_period']) + disc_time = disc_const**parameters_dict['time_period'] LOGGER.debug('disc_time : %s', disc_time) for (harvest_block_info, harvest_block_data), (_, dist_block_data) in zip( @@ -1502,7 +1491,7 @@ def _calculate_npv_levelized_rasters( # the wind farm. Starting at year 1, because year 0 yields no revenue for year in range(1, len(price_list)): # currency units per kilowatt-hour of that year - currency_per_kwh = float(price_list[year]) + currency_per_kwh = price_list[year] # The revenue for the wind farm. 
The energy_val_arr is in kWh/yr rev_arr = energy_val_arr * currency_per_kwh @@ -1824,36 +1813,6 @@ def _calculate_land_to_grid_distance( LOGGER.info('Finished _calculate_land_to_grid_distance.') -def _read_csv_wind_parameters(csv_path, parameter_list): - """Construct a dictionary from a csv file given a list of keys. - - The list of keys corresponds to the parameters names in 'csv_path' which - are represented in the first column of the file. - - Args: - csv_path (str): a path to a CSV file where every row is a parameter - with the parameter name in the first column followed by the value - in the second column - parameter_list (list) : a List of strs that represent the parameter - names to be found in 'csv_path'. These strs will be the keys in - the returned dictionary - - Returns: a Dictionary where the 'parameter_list' strs are the - keys that have values pulled from 'csv_path' - - """ - # use the parameters in the first column as indices for the dataframe - # this doesn't benefit from `utils.read_csv_to_dataframe` because there - # is no header to strip whitespace - # use sep=None, engine='python' to infer what the separator is - wind_param_df = pandas.read_csv(csv_path, header=None, index_col=0) - # only get the required parameters and leave out the rest - wind_param_df = wind_param_df[wind_param_df.index.isin(parameter_list)] - wind_dict = wind_param_df.to_dict()[1] - - return wind_dict - - def _mask_by_distance(base_raster_path, min_dist, max_dist, out_nodata, target_raster_path): """Create a raster whose pixel values are bound by min and max distances. @@ -1950,37 +1909,8 @@ def _create_distance_raster(base_raster_path, base_vector_path, LOGGER.info("Finished _create_distance_raster") -def _read_csv_wind_data(wind_data_path, hub_height): - """Unpack the csv wind data into a dictionary. - - Args: - wind_data_path (str): a path for the csv wind data file with header - of: "LONG","LATI","LAM","K","REF" - hub_height (int): the hub height to use for calculating Weibull - parameters and wind energy values - - Returns: - A dictionary where the keys are lat/long tuples which point - to dictionaries that hold wind data at that location. - - """ - wind_point_df = utils.read_csv_to_dataframe( - wind_data_path, MODEL_SPEC['args']['wind_data_path']) - wind_point_df.columns = wind_point_df.columns.str.upper() - - # Calculate scale value at new hub height given reference values. - # See equation 3 in users guide - wind_point_df.rename(columns={'LAM': 'REF_LAM'}, inplace=True) - wind_point_df['LAM'] = wind_point_df.apply( - lambda row: row.REF_LAM * (hub_height / row.REF)**_ALPHA, axis=1) - wind_point_df.drop(['REF'], axis=1) # REF is not needed after calculation - wind_dict = wind_point_df.to_dict('index') # so keys will be 0, 1, 2, ... - - return wind_dict - - def _compute_density_harvested_fields( - wind_dict, bio_parameters_dict, number_of_turbines, + wind_dict, parameters_dict, number_of_turbines, target_pickle_path): """Compute the density and harvested energy based on scale and shape keys. @@ -1989,7 +1919,7 @@ def _compute_density_harvested_fields( keys ``LAM``, ``LATI``, ``K``, ``LONG``, ``REF_LAM``, and ``REF``, and numbers indicating their corresponding values. - bio_parameters_dict (dict): a dictionary where the 'parameter_list' + parameters_dict (dict): a dictionary where the 'parameter_list' strings are the keys that have values pulled from bio-parameters CSV. 
@@ -2009,20 +1939,20 @@ def _compute_density_harvested_fields( # The rated power is expressed in units of MW but the harvested energy # equation calls for it in terms of Wh. Thus we multiply by a million to # get to Wh. - rated_power = float(bio_parameters_dict['turbine_rated_pwr']) * 1000000 + rated_power = parameters_dict['turbine_rated_pwr'] * 1000000 # Get the rest of the inputs needed to compute harvested wind energy # from the dictionary so that it is in a more readable format - exp_pwr_curve = int(bio_parameters_dict['exponent_power_curve']) - air_density_standard = float(bio_parameters_dict['air_density']) - v_rate = float(bio_parameters_dict['rated_wspd']) - v_out = float(bio_parameters_dict['cut_out_wspd']) - v_in = float(bio_parameters_dict['cut_in_wspd']) - air_density_coef = float(bio_parameters_dict['air_density_coefficient']) - losses = float(bio_parameters_dict['loss_parameter']) + exp_pwr_curve = parameters_dict['exponent_power_curve'] + air_density_standard = parameters_dict['air_density'] + v_rate = parameters_dict['rated_wspd'] + v_out = parameters_dict['cut_out_wspd'] + v_in = parameters_dict['cut_in_wspd'] + air_density_coef = parameters_dict['air_density_coefficient'] + losses = parameters_dict['loss_parameter'] # Hub Height to use for setting Weibull parameters - hub_height = int(bio_parameters_dict['hub_height']) + hub_height = parameters_dict['hub_height'] # Compute the mean air density, given by CKs formulas mean_air_density = air_density_standard - air_density_coef * hub_height @@ -2202,8 +2132,8 @@ def _dictionary_to_point_vector( # For each inner dictionary (for each point) create a point and set its # fields for point_dict in base_dict_data.values(): - latitude = float(point_dict['lati']) - longitude = float(point_dict['long']) + latitude = point_dict['lati'] + longitude = point_dict['long'] geom = ogr.Geometry(ogr.wkbPoint) geom.AddPoint_2D(longitude, latitude) @@ -2469,8 +2399,8 @@ def _wind_data_to_point_vector(wind_data_pickle_path, # For each inner dictionary (for each point) create a point for point_dict in wind_data.values(): geom = ogr.Geometry(ogr.wkbPoint) - latitude = float(point_dict['LATI']) - longitude = float(point_dict['LONG']) + latitude = point_dict['LATI'] + longitude = point_dict['LONG'] # When projecting to WGS84, extents -180 to 180 are used for # longitude. 
In case input longitude is from -360 to 0 convert if longitude < -180: @@ -2770,17 +2700,23 @@ def validate(args, limit_to=None): """ validation_warnings = validation.validate(args, MODEL_SPEC['args'], MODEL_SPEC['args_with_spatial_overlap']) - invalid_keys = validation.get_invalid_keys(validation_warnings) sufficient_keys = validation.get_sufficient_keys(args) valid_sufficient_keys = sufficient_keys - invalid_keys if ('wind_schedule' in valid_sufficient_keys and 'global_wind_parameters_path' in valid_sufficient_keys): - year_count = pandas.read_csv(args['wind_schedule']).shape[0] - time = int(_read_csv_wind_parameters( - args['global_wind_parameters_path'], ['time_period'] - )['time_period']) + year_count = utils.read_csv_to_dataframe( + args['wind_schedule']).shape[0] + time = validation.get_validated_dataframe( + args['global_wind_parameters_path'], + index_col='0', + columns={ + '0': {'type': 'freestyle_string'}, + '1': {'type': 'number'} + }, + read_csv_kwargs={'header': None} + )['1']['time_period'] if year_count != time + 1: validation_warnings.append(( ['wind_schedule'], diff --git a/tests/test_coastal_blue_carbon.py b/tests/test_coastal_blue_carbon.py index 3bec14303..5af17b57e 100644 --- a/tests/test_coastal_blue_carbon.py +++ b/tests/test_coastal_blue_carbon.py @@ -13,6 +13,7 @@ import pandas import pygeoprocessing from natcap.invest import utils +from natcap.invest import validation from osgeo import gdal from osgeo import osr @@ -188,9 +189,9 @@ def test_transition_table(self): lulc_csv.write('0,mangrove,True\n') lulc_csv.write('1,parking lot,False\n') - landcover_df = utils.read_csv_to_dataframe( + landcover_df = validation.get_validated_dataframe( landcover_table_path, - preprocessor.MODEL_SPEC['args']['lulc_lookup_table_path']) + **preprocessor.MODEL_SPEC['args']['lulc_lookup_table_path']) target_table_path = os.path.join(self.workspace_dir, 'transition_table.csv') @@ -204,9 +205,9 @@ def test_transition_table(self): str(context.exception)) # Re-load the landcover table - landcover_df = utils.read_csv_to_dataframe( + landcover_df = validation.get_validated_dataframe( landcover_table_path, - preprocessor.MODEL_SPEC['args']['lulc_lookup_table_path']) + **preprocessor.MODEL_SPEC['args']['lulc_lookup_table_path']) preprocessor._create_transition_table( landcover_df, [filename_a, filename_b], target_table_path) @@ -618,9 +619,9 @@ def test_model_one_transition_no_analysis_year(self): args = TestCBC2._create_model_args(self.workspace_dir) args['workspace_dir'] = os.path.join(self.workspace_dir, 'workspace') - prior_snapshots = utils.read_csv_to_dataframe( + prior_snapshots = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] + **coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() baseline_year = min(prior_snapshots.keys()) baseline_raster = prior_snapshots[baseline_year] @@ -796,9 +797,9 @@ def test_model_no_transitions(self): args = TestCBC2._create_model_args(self.workspace_dir) args['workspace_dir'] = os.path.join(self.workspace_dir, 'workspace') - prior_snapshots = utils.read_csv_to_dataframe( + prior_snapshots = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] + **coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() baseline_year = min(prior_snapshots.keys()) baseline_raster = prior_snapshots[baseline_year] @@ -862,9 +863,9 @@ def 
test_validation(self): raster.write('not a raster') # Write over the landcover snapshot CSV - prior_snapshots = utils.read_csv_to_dataframe( + prior_snapshots = validation.get_validated_dataframe( args['landcover_snapshot_csv'], - coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] + **coastal_blue_carbon.MODEL_SPEC['args']['landcover_snapshot_csv'] )['raster_path'].to_dict() baseline_year = min(prior_snapshots) with open(args['landcover_snapshot_csv'], 'w') as snapshot_table: diff --git a/tests/test_datastack.py b/tests/test_datastack.py index 33a01de2f..45d2e590b 100644 --- a/tests/test_datastack.py +++ b/tests/test_datastack.py @@ -303,6 +303,7 @@ def test_archive_extraction(self): """Datastack: test archive extraction.""" from natcap.invest import datastack from natcap.invest import utils + from natcap.invest import validation params = { 'blank': '', @@ -379,14 +380,12 @@ def test_archive_extraction(self): self.assertTrue( filecmp.cmp(archive_params[key], params[key], shallow=False)) - spatial_csv_dict = utils.read_csv_to_dataframe( + spatial_csv_dict = validation.get_validated_dataframe( archive_params['spatial_table'], - { - 'index_col': 'id', - 'columns': { - 'id': {'type': 'integer'}, - 'path': {'type': 'file'} - } + index_col='id', + columns={ + 'id': {'type': 'integer'}, + 'path': {'type': 'file'} }).to_dict(orient='index') spatial_csv_dir = os.path.dirname(archive_params['spatial_table']) numpy.testing.assert_allclose( diff --git a/tests/test_habitat_quality.py b/tests/test_habitat_quality.py index 560793d2c..0e2bbebe3 100644 --- a/tests/test_habitat_quality.py +++ b/tests/test_habitat_quality.py @@ -1317,7 +1317,7 @@ def test_habitat_quality_validation_missing_sens_header(self): args['sensitivity_table_path'] = os.path.join( args['workspace_dir'], 'sensitivity_samp.csv') make_sensitivity_samp_csv( - args['sensitivity_table_path'], include_threat=False) + args['sensitivity_table_path'], include_threat=True) make_threats_raster( args['workspace_dir'], threat_values=[1, 1], @@ -1333,7 +1333,7 @@ def test_habitat_quality_validation_missing_sens_header(self): open_table.write( '0.04,0.7,threat_1,linear,,threat_1_c.tif,threat_1_f.tif\n') open_table.write( - '0.07,1.0,threat_2,exponential,,threat_2_c.tif,' + '0.07,1.0,threat_3,exponential,,threat_2_c.tif,' 'threat_2_f.tif\n') # At least one threat header is expected, so there should be a message diff --git a/tests/test_recreation.py b/tests/test_recreation.py index 19ea26595..21771479d 100644 --- a/tests/test_recreation.py +++ b/tests/test_recreation.py @@ -954,6 +954,7 @@ def test_existing_output_shapefiles(self): def test_existing_regression_coef(self): """Recreation test regression coefficients handle existing output.""" from natcap.invest.recreation import recmodel_client + from natcap.invest import validation # Initialize a TaskGraph taskgraph_db_dir = os.path.join( @@ -971,9 +972,9 @@ def test_existing_regression_coef(self): predictor_table_path = os.path.join(SAMPLE_DATA, 'predictors.csv') # make outputs to be overwritten - predictor_dict = utils.read_csv_to_dataframe( + predictor_dict = validation.get_validated_dataframe( predictor_table_path, - recmodel_client.MODEL_SPEC['args']['predictor_table_path'] + **recmodel_client.MODEL_SPEC['args']['predictor_table_path'] ).to_dict(orient='index') predictor_list = predictor_dict.keys() tmp_working_dir = tempfile.mkdtemp(dir=self.workspace_dir) diff --git a/tests/test_utils.py b/tests/test_utils.py index 74f69821b..3006b44fa 100644 --- a/tests/test_utils.py +++ 
b/tests/test_utils.py @@ -527,199 +527,7 @@ def tearDown(self): shutil.rmtree(self.workspace_dir) def test_read_csv_to_dataframe(self): - """utils: test the default behavior""" - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - header, - a, - b - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, - {'columns': {'header': {'type': 'freestyle_string'}}}) - # header and table values should be lowercased - self.assertEqual(df.columns[0], 'header') - self.assertEqual(df['header'][0], 'a') - self.assertEqual(df['header'][1], 'b') - - def test_unique_key_not_first_column(self): - """utils: test success when key field is not first column.""" - from natcap.invest import utils - csv_text = ("desc,lucode,val1,val2\n" - "corn,1,0.5,2\n" - "bread,2,1,4\n" - "beans,3,0.5,4\n" - "butter,4,9,1") - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write(csv_text) - - df = utils.read_csv_to_dataframe( - table_path, - { - 'index_col': 'lucode', - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}) - self.assertEqual(df.index.name, 'lucode') - self.assertEqual(list(df.index.values), [1, 2, 3, 4]) - self.assertEqual(df['desc'][2], 'bread') - - def test_non_unique_keys(self): - """utils: test error is raised if keys are not unique.""" - from natcap.invest import utils - csv_text = ("lucode,desc,val1,val2\n" - "1,corn,0.5,2\n" - "2,bread,1,4\n" - "2,beans,0.5,4\n" - "4,butter,9,1") - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write(csv_text) - - with self.assertRaises(ValueError): - utils.read_csv_to_dataframe( - table_path, - { - 'index_col': 'lucode', - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}) - - def test_missing_key_field(self): - """utils: test error is raised when missing key field.""" - from natcap.invest import utils - csv_text = ("luode,desc,val1,val2\n" - "1,corn,0.5,2\n" - "2,bread,1,4\n" - "3,beans,0.5,4\n" - "4,butter,9,1") - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write(csv_text) - - with self.assertRaises(KeyError): - utils.read_csv_to_dataframe( - table_path, - { - 'index_col': 'lucode', - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}) - - def test_nan_row(self): - """utils: test NaN row is dropped.""" - from natcap.invest import utils - csv_text = ("lucode,desc,val1,val2\n" - "1,corn,0.5,2\n" - ",,,\n" - "3,beans,0.5,4\n" - "4,butter,9,1") - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write(csv_text) - - result = utils.read_csv_to_dataframe( - table_path, - { - 'index_col': 'lucode', - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}).to_dict(orient='index') - expected_result = { - 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2}, - 3: {'desc': 'beans', 'val1': 0.5, 'val2': 4}, - 4: {'desc': 'butter', 'val1': 9, 'val2': 1}} - - self.assertDictEqual(result, expected_result) - - def 
test_column_subset(self): - """utils: test column subset is properly returned.""" - from natcap.invest import utils - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write( - "lucode,desc,val1,val2\n" - "1,corn,0.5,2\n" - "2,bread,1,4\n" - "3,beans,0.5,4\n" - "4,butter,9,1") - df = utils.read_csv_to_dataframe( - table_path, - { - 'columns': { - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - } - }) - self.assertEqual(list(df.columns), ['lucode', 'val1', 'val2']) - - def test_column_pattern_matching(self): - """utils: test column subset is properly returned.""" - from natcap.invest import utils - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write( - "lucode,grassland_value,forest_value,wetland_valueee\n" - "1,0.5,2\n" - "2,1,4\n" - "3,0.5,4\n" - "4,9,1") - df = utils.read_csv_to_dataframe( - table_path, { - 'columns': { - 'lucode': {'type': 'integer'}, - '[HABITAT]_value': {'type': 'number'} - } - }) - self.assertEqual( - list(df.columns), ['lucode', 'grassland_value', 'forest_value']) - - def test_trailing_comma(self): - """utils: test a trailing comma on first line is handled properly.""" - from natcap.invest import utils - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write( - "lucode,desc,val1,val2\n" - "1,corn,0.5,2,\n" - "2,bread,1,4\n" - "3,beans,0.5,4\n" - "4,butter,9,1") - result = utils.read_csv_to_dataframe( - table_path, - { - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}) - self.assertEqual(result['val2'][0], 2) - self.assertEqual(result['lucode'][1], 2) - - - def test_trailing_comma_second_line(self): - """utils: test a trailing comma on second line is handled properly.""" + """utils: read csv with no row or column specs provided""" from natcap.invest import utils csv_text = ("lucode,desc,val1,val2\n" "1,corn,0.5,2\n" @@ -730,216 +538,12 @@ def test_trailing_comma_second_line(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.read_csv_to_dataframe( - table_path, - { - 'index_col': 'lucode', - 'columns': { - 'desc': {'type': 'freestyle_string'}, - 'lucode': {'type': 'integer'}, - 'val1': {'type': 'number'}, - 'val2': {'type': 'number'} - }}).to_dict(orient='index') - - expected_result = { - 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2}, - 2: {'desc': 'bread', 'val1': 1, 'val2': 4}, - 3: {'desc': 'beans', 'val1': 0.5, 'val2': 4}, - 4: {'desc': 'butter', 'val1': 9, 'val2': 1}} - - self.assertDictEqual(result, expected_result) - - def test_csv_dialect_detection_semicolon_delimited(self): - """utils: test that we can parse semicolon-delimited CSVs.""" - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - header1;HEADER2;header3; - 1;2;3; - 4;FOO;bar; - """ - )) - - df = utils.read_csv_to_dataframe( - csv_file, - {'columns': { - 'header1': {'type': 'integer'}, - 'header2': {'type': 'freestyle_string'}, - 'header3': {'type': 'freestyle_string'} - } - }) - self.assertEqual(df['header2'][1], 'foo') - self.assertEqual(df['header3'][1], 'bar') - self.assertEqual(df['header1'][0], 1) - - def test_convert_cols_to_lower(self): - """utils: test that column names are converted to 
lowercase""" - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - header, - A, - b - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'header': {'type': 'freestyle_string'} - }}) - self.assertEqual(df['header'][0], 'a') - - def test_convert_vals_to_lower(self): - """utils: test that values are converted to lowercase""" - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - HEADER, - a, - b - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'header': {'type': 'freestyle_string'} - }}) - self.assertEqual(df.columns[0], 'header') - - def test_integer_type_columns(self): - """utils: integer column values are returned as integers.""" - from natcap.invest import utils - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - id,header, - 1,5.0, - 2,-1, - 3, - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'id': {'type': 'integer'}, - 'header': {'type': 'integer', 'na_allowed': True}}}) - self.assertIsInstance(df['header'][0], numpy.int64) - self.assertIsInstance(df['header'][1], numpy.int64) - # empty values are returned as pandas.NA - self.assertTrue(pd.isna(df['header'][2])) - - def test_float_type_columns(self): - """utils: float column values are returned as floats.""" - from natcap.invest import utils - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - h1,h2,h3 - 5,0.5,.4 - -1,-.3, - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'h1': {'type': 'number'}, - 'h2': {'type': 'ratio'}, - 'h3': {'type': 'percent', 'na_allowed': True}, - }}) - self.assertEqual(df['h1'].dtype, float) - self.assertEqual(df['h2'].dtype, float) - self.assertEqual(df['h3'].dtype, float) - # empty values are returned as numpy.nan - self.assertTrue(numpy.isnan(df['h3'][1])) - - def test_string_type_columns(self): - """utils: string column values are returned as strings.""" - from natcap.invest import utils - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - h1,h2,h3 - 1,a,foo - 2,b, - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'h1': {'type': 'freestyle_string'}, - 'h2': {'type': 'option_string'}, - 'h3': {'type': 'freestyle_string'}, - }}) - self.assertEqual(df['h1'][0], '1') - self.assertEqual(df['h2'][1], 'b') - # empty values are returned as NA - self.assertTrue(pd.isna(df['h3'][1])) - - def test_boolean_type_columns(self): - """utils: boolean column values are returned as booleans.""" - from natcap.invest import utils - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - index,h1 - a,1 - b,0 - c, - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'index': {'type': 'freestyle_string'}, - 'h1': {'type': 'bool', 'na_allowed': True}}}) - self.assertEqual(df['h1'][0], True) - self.assertEqual(df['h1'][1], False) - # empty values are returned as pandas.NA - self.assertTrue(pd.isna(df['h1'][2])) - - def test_expand_path_columns(self): - """utils: test values in path columns are expanded.""" 
- from natcap.invest import utils - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - f"""\ - bar,path - 1,foo.txt - 2,foo/bar.txt - 3,foo\\bar.txt - 4,{self.workspace_dir}/foo.txt - 5, - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, {'columns': { - 'bar': {'type': 'integer'}, - 'path': {'type': 'file'} - }}) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo.txt', - df['path'][0]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo{os.sep}bar.txt', - df['path'][1]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo\\bar.txt', - df['path'][2]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo.txt', - df['path'][3]) - # empty values are returned as empty strings - self.assertTrue(pd.isna(df['path'][4])) + df = utils.read_csv_to_dataframe(table_path) + self.assertEqual(list(df.columns), ['lucode', 'desc', 'val1', 'val2']) + self.assertEqual(df['lucode'][0], 1) + self.assertEqual(df['desc'][1], 'bread') + self.assertEqual(df['val1'][2], 0.5) + self.assertEqual(df['val2'][3], 1) def test_csv_utf8_encoding(self): """utils: test that CSV read correctly with UTF-8 encoding.""" @@ -955,16 +559,9 @@ def test_csv_utf8_encoding(self): """ )) lookup_dict = utils.read_csv_to_dataframe( - csv_file, - { - 'index_col': 'header1', - 'columns': { - 'header1': {'type': 'integer'}, - 'header2': {'type': 'integer'}, - 'header3': {'type': 'freestyle_string'} - }}).to_dict(orient='index') - self.assertEqual(lookup_dict[4]['header2'], 5) - self.assertEqual(lookup_dict[4]['header3'], 'foo') + csv_file).to_dict(orient='index') + self.assertEqual(lookup_dict[1]['header2'], 5) + self.assertEqual(lookup_dict[1]['header3'], 'FOO') def test_utf8_bom_encoding(self): """utils: test that CSV read correctly with UTF-8 BOM encoding.""" @@ -982,13 +579,7 @@ def test_utf8_bom_encoding(self): # confirm that the file has the BOM prefix with open(csv_file, 'rb') as file_obj: self.assertTrue(file_obj.read().startswith(codecs.BOM_UTF8)) - df = utils.read_csv_to_dataframe(csv_file, - { - 'columns': { - 'header1': {'type': 'integer'}, - 'header2': {'type': 'integer'}, - 'header3': {'type': 'freestyle_string'} - }}) + df = utils.read_csv_to_dataframe(csv_file) # assert the BOM prefix was correctly parsed and skipped self.assertEqual(df.columns[0], 'header1') self.assertEqual(df['header2'][1], 5) @@ -1005,15 +596,9 @@ def test_csv_latin_1_encoding(self): 4,5,FOO """ )) - df = utils.read_csv_to_dataframe( - csv_file, - {'columns': { - 'header 1': {'type': 'integer'}, - 'header 2': {'type': 'integer'}, - 'header 3': {'type': 'freestyle_string'} - }}) + df = utils.read_csv_to_dataframe(csv_file) self.assertEqual(df['header 2'][1], 5) - self.assertEqual(df['header 3'][1], 'foo') + self.assertEqual(df['header 3'][1], 'FOO') self.assertEqual(df['header 1'][0], 1) def test_csv_error_non_utf8_character(self): @@ -1029,16 +614,8 @@ def test_csv_error_non_utf8_character(self): 4,5,FÖÖ """ )) - with self.assertRaises(UnicodeDecodeError): - utils.read_csv_to_dataframe( - csv_file, - { - 'index_col': 'header1', - 'columns': { - 'header1': {'type': 'integer'}, - 'header2': {'type': 'integer'}, - 'header3': {'type': 'freestyle_string'} - }}) + with self.assertRaises(ValueError): + utils.read_csv_to_dataframe(csv_file) def test_override_default_encoding(self): """utils: test that you can override the default encoding kwarg""" @@ -1055,99 +632,31 @@ def test_override_default_encoding(self): bar """ )) - df = utils.read_csv_to_dataframe( - 
csv_file, { - 'columns': {'header': {'type': 'freestyle_string'} - }}, encoding='iso8859_5') + df = utils.read_csv_to_dataframe(csv_file, encoding='iso8859_5') # with the encoding specified, special characters should work - # and be lowercased - self.assertEqual(df['header'][0], 'fюю') + self.assertEqual(df['header'][0], 'fЮЮ') self.assertEqual(df['header'][1], 'bar') - def test_other_kwarg(self): - """utils: any other kwarg should be passed to pandas.read_csv""" + def test_csv_dialect_detection_semicolon_delimited(self): + """utils: test that we can parse semicolon-delimited CSVs.""" from natcap.invest import utils csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( """\ - h1;h2;h3 - a;b;c - d;e;f + header1;HEADER2;header3; + 1;2;3; + 4;FOO;bar; """ )) - # using sep=None with the default engine='python', - # it should infer what the separator is - df = utils.read_csv_to_dataframe( - csv_file, { - 'columns': { - 'h1': {'type': 'freestyle_string'}, - 'h2': {'type': 'freestyle_string'}, - 'h3': {'type': 'freestyle_string'} - }}, converters={'h2': lambda val: f'foo_{val}'}) - - self.assertEqual(df.columns[0], 'h1') - self.assertEqual(df['h2'][1], 'foo_e') - - def test_csv_with_integer_headers(self): - """ - utils: CSV with integer headers should be read into strings. - - This shouldn't matter for any of the models, but if a user inputs a CSV - with extra columns that are labeled with numbers, it should still work. - """ - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - """\ - 1,2,3 - a,b,c - d,e,f - """ - )) - df = utils.read_csv_to_dataframe( - csv_file, - {'columns': { - '1': {'type': 'freestyle_string'}, - '2': {'type': 'freestyle_string'}, - '3': {'type': 'freestyle_string'} - }}) - # expect headers to be strings - self.assertEqual(df.columns[0], '1') - self.assertEqual(df['1'][0], 'a') - - def test_removal_whitespace(self): - """utils: test that leading/trailing whitespace is removed.""" - from natcap.invest import utils + df = utils.read_csv_to_dataframe(csv_file) + self.assertEqual(df['header2'][1], 'FOO') + self.assertEqual(df['header3'][1], 'bar') + self.assertEqual(df['header1'][0], 1) - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(" Col1, Col2 ,Col3 \n") - file_obj.write(" val1, val2 ,val3 \n") - file_obj.write(" , 2 1 , ") - df = utils.read_csv_to_dataframe( - csv_file, { - 'columns': { - 'col1': {'type': 'freestyle_string'}, - 'col2': {'type': 'freestyle_string'}, - 'col3': {'type': 'freestyle_string'} - }}) - # header should have no leading / trailing whitespace - self.assertEqual(list(df.columns), ['col1', 'col2', 'col3']) - - # values should have no leading / trailing whitespace - self.assertEqual(df['col1'][0], 'val1') - self.assertEqual(df['col2'][0], 'val2') - self.assertEqual(df['col3'][0], 'val3') - self.assertEqual(df['col1'][1], '') - self.assertEqual(df['col2'][1], '2 1') - self.assertEqual(df['col3'][1], '') class CreateCoordinateTransformationTests(unittest.TestCase): diff --git a/tests/test_validation.py b/tests/test_validation.py index 8f9e10acf..2a5e4fa81 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,15 +1,17 @@ """Testing module for validation.""" -# encoding=UTF-8 -import tempfile -import unittest -from unittest.mock import Mock +import codecs import functools import os import 
shutil import string +import tempfile +import textwrap import time +import unittest +from unittest.mock import Mock import warnings +import numpy from osgeo import gdal, osr, ogr import pandas @@ -717,7 +719,8 @@ def test_csv_fieldnames(self): df.to_csv(target_file) self.assertEqual(None, validation.check_csv( - target_file, columns={'foo': {}, 'bar': {}})) + target_file, columns={ + 'foo': {'type': 'integer'}, 'bar': {'type': 'integer'}})) def test_csv_bom_fieldnames(self): """Validation: test that we can check fieldnames in a CSV with BOM.""" @@ -732,7 +735,8 @@ def test_csv_bom_fieldnames(self): df.to_csv(target_file, encoding='utf-8-sig') self.assertEqual(None, validation.check_csv( - target_file, columns={'foo': {}, 'bar': {}})) + target_file, columns={ + 'foo': {'type': 'integer'}, 'bar': {'type': 'integer'}})) def test_csv_missing_fieldnames(self): """Validation: test that we can check missing fieldnames in a CSV.""" @@ -765,7 +769,7 @@ def test_wrong_filetype(self): df.to_pickle(target_file) error_msg = validation.check_csv(target_file, columns={'field_a': {}}) - self.assertEqual(error_msg, validation.MESSAGES['NOT_CSV']) + self.assertIn('must be encoded as UTF-8', error_msg) def test_slow_to_open(self): """Test timeout by mocking a CSV that is slow to open""" @@ -837,6 +841,508 @@ def test_check_headers(self): self.assertEqual(result, None) +class TestGetValidatedDataframe(unittest.TestCase): + """Tests for validation.get_validated_dataframe.""" + def setUp(self): + """Create a new workspace to use for each test.""" + self.workspace_dir = tempfile.mkdtemp() + + def tearDown(self): + """Remove the workspace created for this test.""" + shutil.rmtree(self.workspace_dir) + + def test_get_validated_dataframe(self): + """validation: test the default behavior""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + header, , + a, , + b,c + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={'header': {'type': 'freestyle_string'}}) + # header and table values should be lowercased + self.assertEqual(df.columns[0], 'header') + self.assertEqual(df['header'][0], 'a') + self.assertEqual(df['header'][1], 'b') + + def test_unique_key_not_first_column(self): + """validation: test success when key field is not first column.""" + from natcap.invest import validation + csv_text = ("desc,lucode,val1,val2\n" + "corn,1,0.5,2\n" + "bread,2,1,4\n" + "beans,3,0.5,4\n" + "butter,4,9,1") + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write(csv_text) + + df = validation.get_validated_dataframe( + table_path, + index_col='lucode', + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }) + self.assertEqual(df.index.name, 'lucode') + self.assertEqual(list(df.index.values), [1, 2, 3, 4]) + self.assertEqual(df['desc'][2], 'bread') + + def test_non_unique_keys(self): + """validation: test error is raised if keys are not unique.""" + from natcap.invest import validation + csv_text = ("lucode,desc,val1,val2\n" + "1,corn,0.5,2\n" + "2,bread,1,4\n" + "2,beans,0.5,4\n" + "4,butter,9,1") + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write(csv_text) + + with self.assertRaises(ValueError): + validation.get_validated_dataframe( + table_path, + 
index_col='lucode', + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }) + + def test_missing_key_field(self): + """validation: test error is raised when missing key field.""" + from natcap.invest import validation + csv_text = ("luode,desc,val1,val2\n" + "1,corn,0.5,2\n" + "2,bread,1,4\n" + "3,beans,0.5,4\n" + "4,butter,9,1") + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write(csv_text) + + with self.assertRaises(ValueError): + validation.get_validated_dataframe( + table_path, + index_col='lucode', + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }) + + def test_column_subset(self): + """validation: test column subset is properly returned.""" + from natcap.invest import validation + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write( + "lucode,desc,val1,val2\n" + "1,corn,0.5,2\n" + "2,bread,1,4\n" + "3,beans,0.5,4\n" + "4,butter,9,1") + df = validation.get_validated_dataframe( + table_path, + columns={ + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }) + self.assertEqual(list(df.columns), ['lucode', 'val1', 'val2']) + + def test_column_pattern_matching(self): + """validation: test column subset is properly returned.""" + from natcap.invest import validation + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write( + "lucode,grassland_value,forest_value,wetland_valueee\n" + "1,0.5,2\n" + "2,1,4\n" + "3,0.5,4\n" + "4,9,1") + df = validation.get_validated_dataframe( + table_path, + columns={ + 'lucode': {'type': 'integer'}, + '[HABITAT]_value': {'type': 'number'} + }) + self.assertEqual( + list(df.columns), ['lucode', 'grassland_value', 'forest_value']) + + def test_trailing_comma(self): + """validation: test a trailing comma on first line is handled properly.""" + from natcap.invest import validation + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write( + "lucode,desc,val1,val2\n" + "1,corn,0.5,2,\n" + "2,bread,1,4\n" + "3,beans,0.5,4\n" + "4,butter,9,1") + result = validation.get_validated_dataframe( + table_path, + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }) + self.assertEqual(result['val2'][0], 2) + self.assertEqual(result['lucode'][1], 2) + + def test_trailing_comma_second_line(self): + """validation: test a trailing comma on second line is handled properly.""" + from natcap.invest import validation + csv_text = ("lucode,desc,val1,val2\n" + "1,corn,0.5,2\n" + "2,bread,1,4,\n" + "3,beans,0.5,4\n" + "4,butter,9,1") + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write(csv_text) + + result = validation.get_validated_dataframe( + table_path, + index_col='lucode', + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }).to_dict(orient='index') + + expected_result = { + 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2}, + 2: {'desc': 'bread', 'val1': 1, 'val2': 4}, + 3: {'desc': 'beans', 'val1': 0.5, 'val2': 4}, + 4: {'desc': 
'butter', 'val1': 9, 'val2': 1}} + + self.assertDictEqual(result, expected_result) + + def test_convert_cols_to_lower(self): + """validation: test that column names are converted to lowercase""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + header, + A, + b + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={'header': {'type': 'freestyle_string'}}) + self.assertEqual(df['header'][0], 'a') + + def test_convert_vals_to_lower(self): + """validation: test that values are converted to lowercase""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + HEADER, + a, + b + """ + )) + df = validation.get_validated_dataframe( + csv_file, columns={'header': {'type': 'freestyle_string'}}) + self.assertEqual(df.columns[0], 'header') + + def test_integer_type_columns(self): + """validation: integer column values are returned as integers.""" + from natcap.invest import validation + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + id,header, + 1,5.0, + 2,-1, + 3, + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'id': {'type': 'integer'}, + 'header': {'type': 'integer', 'na_allowed': True}}) + self.assertIsInstance(df['header'][0], numpy.int64) + self.assertIsInstance(df['header'][1], numpy.int64) + # empty values are returned as pandas.NA + self.assertTrue(pandas.isna(df['header'][2])) + + def test_float_type_columns(self): + """validation: float column values are returned as floats.""" + from natcap.invest import validation + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + h1,h2,h3 + 5,0.5,.4 + -1,-.3, + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'h1': {'type': 'number'}, + 'h2': {'type': 'ratio'}, + 'h3': {'type': 'percent', 'na_allowed': True}, + }) + self.assertEqual(df['h1'].dtype, float) + self.assertEqual(df['h2'].dtype, float) + self.assertEqual(df['h3'].dtype, float) + # empty values are returned as numpy.nan + self.assertTrue(numpy.isnan(df['h3'][1])) + + def test_string_type_columns(self): + """validation: string column values are returned as strings.""" + from natcap.invest import validation + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + h1,h2,h3 + 1,a,foo + 2,b, + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'h1': {'type': 'freestyle_string'}, + 'h2': {'type': 'option_string'}, + 'h3': {'type': 'freestyle_string'}, + }) + self.assertEqual(df['h1'][0], '1') + self.assertEqual(df['h2'][1], 'b') + # empty values are returned as NA + self.assertTrue(pandas.isna(df['h3'][1])) + + def test_boolean_type_columns(self): + """validation: boolean column values are returned as booleans.""" + from natcap.invest import validation + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + index,h1 + a,1 + b,0 + c, + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'index': {'type': 'freestyle_string'}, + 'h1': {'type': 'boolean', 'na_allowed': True}}) 
+ self.assertEqual(df['h1'][0], True) + self.assertEqual(df['h1'][1], False) + # empty values are returned as pandas.NA + self.assertTrue(pandas.isna(df['h1'][2])) + + def test_expand_path_columns(self): + """validation: test values in path columns are expanded.""" + from natcap.invest import validation + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + f"""\ + bar,path + 1,foo.txt + 2,foo/bar.txt + 3,foo\\bar.txt + 4,{self.workspace_dir}/foo.txt + 5, + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'bar': {'type': 'integer'}, + 'path': {'type': 'file'} + }) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo.txt', + df['path'][0]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo{os.sep}bar.txt', + df['path'][1]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo\\bar.txt', + df['path'][2]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo.txt', + df['path'][3]) + # empty values are returned as pandas.NA + self.assertTrue(pandas.isna(df['path'][4])) + + def test_other_kwarg(self): + """validation: any other kwarg should be passed to pandas.read_csv""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + h1;h2;h3 + a;b;c + d;e;f + """ + )) + # using sep=None with the default engine='python', + # it should infer what the separator is + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'h1': {'type': 'freestyle_string'}, + 'h2': {'type': 'freestyle_string'}, + 'h3': {'type': 'freestyle_string'}}, + read_csv_kwargs={'converters': {'h2': lambda val: f'foo_{val}'}}) + + self.assertEqual(df.columns[0], 'h1') + self.assertEqual(df['h2'][1], 'foo_e') + + def test_csv_with_integer_headers(self): + """ + validation: CSV with integer headers should be read into strings. + + This shouldn't matter for any of the models, but if a user inputs a CSV + with extra columns that are labeled with numbers, it should still work.
+ """ + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + 1,2,3 + a,b,c + d,e,f + """ + )) + df = validation.get_validated_dataframe( + csv_file, + columns={ + '1': {'type': 'freestyle_string'}, + '2': {'type': 'freestyle_string'}, + '3': {'type': 'freestyle_string'} + }) + # expect headers to be strings + self.assertEqual(df.columns[0], '1') + self.assertEqual(df['1'][0], 'a') + + def test_removal_whitespace(self): + """validation: test that leading/trailing whitespace is removed.""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(" Col1, Col2 ,Col3 \n") + file_obj.write(" val1, val2 ,val3 \n") + file_obj.write(" , 2 1 , ") + df = validation.get_validated_dataframe( + csv_file, + columns={ + 'col1': {'type': 'freestyle_string'}, + 'col2': {'type': 'freestyle_string'}, + 'col3': {'type': 'freestyle_string'} + }) + # header should have no leading / trailing whitespace + self.assertEqual(list(df.columns), ['col1', 'col2', 'col3']) + + # values should have no leading / trailing whitespace + self.assertEqual(df['col1'][0], 'val1') + self.assertEqual(df['col2'][0], 'val2') + self.assertEqual(df['col3'][0], 'val3') + self.assertEqual(df['col1'][1], '') + self.assertEqual(df['col2'][1], '2 1') + self.assertEqual(df['col3'][1], '') + + def test_nan_row(self): + """validation: test NaN row is dropped.""" + from natcap.invest import validation + csv_text = ("lucode,desc,val1,val2\n" + "1,corn,0.5,2\n" + ",,,\n" + "3,beans,0.5,4\n" + "4,butter,9,1") + table_path = os.path.join(self.workspace_dir, 'table.csv') + with open(table_path, 'w') as table_file: + table_file.write(csv_text) + + result = validation.get_validated_dataframe( + table_path, + index_col='lucode', + columns={ + 'desc': {'type': 'freestyle_string'}, + 'lucode': {'type': 'integer'}, + 'val1': {'type': 'number'}, + 'val2': {'type': 'number'} + }).to_dict(orient='index') + expected_result = { + 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2}, + 3: {'desc': 'beans', 'val1': 0.5, 'val2': 4}, + 4: {'desc': 'butter', 'val1': 9, 'val2': 1}} + + self.assertDictEqual(result, expected_result) + + def test_rows(self): + """validation: read csv with row headers instead of columns""" + from natcap.invest import validation + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write("row1, a ,b\n") + file_obj.write("row2,1,3\n") + df = validation.get_validated_dataframe( + csv_file, + rows={ + 'row1': {'type': 'freestyle_string'}, + 'row2': {'type': 'number'}, + }) + print(df) + # header should have no leading / trailing whitespace + self.assertEqual(list(df.columns), ['row1', 'row2']) + + self.assertEqual(df['row1'][0], 'a') + self.assertEqual(df['row1'][1], 'b') + self.assertEqual(df['row2'][0], 1) + self.assertEqual(df['row2'][1], 3) + self.assertEqual(df['row2'].dtype, float) + + class TestValidationFromSpec(unittest.TestCase): """Test Validation From Spec.""" diff --git a/tests/test_wind_energy.py b/tests/test_wind_energy.py index c34687330..c2b732fb0 100644 --- a/tests/test_wind_energy.py +++ b/tests/test_wind_energy.py @@ -167,30 +167,6 @@ def test_calculate_land_to_grid_distance(self): result_val = point_feat.GetField(field_index) numpy.testing.assert_allclose(result_val, exp_results[i]) - def test_read_csv_wind_parameters(self): - """WindEnergy: testing 
'read_csv_wind_parameter' function.""" - from natcap.invest import wind_energy - - csv_path = os.path.join( - SAMPLE_DATA, - 'global_wind_energy_parameters.csv') - - parameter_list = [ - 'air_density', 'exponent_power_curve', 'decommission_cost', - 'operation_maintenance_cost', 'miscellaneous_capex_cost'] - - result = wind_energy._read_csv_wind_parameters( - csv_path, parameter_list) - - expected_result = { - 'air_density': 1.225, - 'exponent_power_curve': 2, - 'decommission_cost': 0.037, - 'operation_maintenance_cost': .035, - 'miscellaneous_capex_cost': .05 - } - self.assertDictEqual(expected_result, result) - def test_wind_data_to_point_vector(self): """WindEnergy: testing 'wind_data_to_point_vector' function.""" from natcap.invest import wind_energy
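For orientation, below is a minimal usage sketch of the validation.get_validated_dataframe helper that the hunks above migrate to. It is an illustrative, hypothetical snippet, not part of the patch: the table path, column names, and values are invented, and only the call pattern (spec keys such as index_col, columns, rows, and read_csv_kwargs passed as keyword arguments, typically unpacked from a MODEL_SPEC['args'][...] entry) is taken from the diff.

import os
import tempfile

from natcap.invest import validation

# Write a small CSV to a temporary workspace (illustrative data only).
workspace = tempfile.mkdtemp()
table_path = os.path.join(workspace, 'table.csv')
with open(table_path, 'w') as table_file:
    table_file.write(
        "lucode,desc,val1\n"
        "1,corn,0.5\n"
        "2,bread,1\n")

# The column specs mirror the MODEL_SPEC-style dicts used throughout the diff;
# values are cast according to each column's 'type' and the dataframe is
# indexed by the column named in 'index_col'.
df = validation.get_validated_dataframe(
    table_path,
    index_col='lucode',
    columns={
        'lucode': {'type': 'integer'},
        'desc': {'type': 'freestyle_string'},
        'val1': {'type': 'number'},
    })
print(df.to_dict(orient='index'))
# e.g. {1: {'desc': 'corn', 'val1': 0.5}, 2: {'desc': 'bread', 'val1': 1.0}}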