diff --git a/HISTORY.rst b/HISTORY.rst index 08eb1b6b24..a2005147ea 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -44,12 +44,16 @@ Unreleased Changes * Updated the package installation instructions in the API docs for clarity and also to highlight the ease of installation through ``conda-forge``. https://github.com/natcap/invest/issues/1256 - * ``utils.build_lookup_from_csv`` now accepts kwargs for ``pandas.read_csv`` - (`#1319 `_) + * ``utils.build_lookup_from_csv`` has been deprecated and its functionality + has been merged into ``utils.read_csv_to_dataframe`` + (`#1319 `_), + (`#1327 `_) * Workbench * Fixed a bug where sampledata downloads failed silently (and progress bar became innacurate) if the Workbench did not have write permission to the download location. https://github.com/natcap/invest/issues/1070 +* Forest Carbon + * The biophysical table is now case-insensitive. * HRA * Fixed a bug in HRA where the model would error when all exposure and consequence criteria were skipped for a single habitat. The model now diff --git a/src/natcap/invest/annual_water_yield.py b/src/natcap/invest/annual_water_yield.py index 0f341e4f1c..1fa5a9a1ee 100644 --- a/src/natcap/invest/annual_water_yield.py +++ b/src/natcap/invest/annual_water_yield.py @@ -517,8 +517,8 @@ def execute(args): 'Checking that watersheds have entries for every `ws_id` in the ' 'valuation table.') # Open/read in valuation parameters from CSV file - valuation_params = utils.build_lookup_from_csv( - args['valuation_table_path'], 'ws_id') + valuation_params = utils.read_csv_to_dataframe( + args['valuation_table_path'], 'ws_id').to_dict(orient='index') watershed_vector = gdal.OpenEx( args['watersheds_path'], gdal.OF_VECTOR) watershed_layer = watershed_vector.GetLayer() @@ -636,15 +636,15 @@ def execute(args): 'lulc': pygeoprocessing.get_raster_info(clipped_lulc_path)['nodata'][0]} # Open/read in the csv file into a dictionary and add to arguments - bio_dict = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode', to_lower=True) + bio_dict = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') bio_lucodes = set(bio_dict.keys()) bio_lucodes.add(nodata_dict['lulc']) LOGGER.debug(f'bio_lucodes: {bio_lucodes}') if 'demand_table_path' in args and args['demand_table_path'] != '': - demand_dict = utils.build_lookup_from_csv( - args['demand_table_path'], 'lucode') + demand_dict = utils.read_csv_to_dataframe( + args['demand_table_path'], 'lucode').to_dict(orient='index') demand_reclassify_dict = dict( [(lucode, demand_dict[lucode]['demand']) for lucode in demand_dict]) diff --git a/src/natcap/invest/carbon.py b/src/natcap/invest/carbon.py index d4d1661585..b36a7b519c 100644 --- a/src/natcap/invest/carbon.py +++ b/src/natcap/invest/carbon.py @@ -366,8 +366,8 @@ def execute(args): (_INTERMEDIATE_BASE_FILES, intermediate_output_dir), (_TMP_BASE_FILES, output_dir)], file_suffix) - carbon_pool_table = utils.build_lookup_from_csv( - args['carbon_pools_path'], 'lucode') + carbon_pool_table = utils.read_csv_to_dataframe( + args['carbon_pools_path'], 'lucode').to_dict(orient='index') work_token_dir = os.path.join( intermediate_output_dir, '_taskgraph_working_dir') diff --git a/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py b/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py index 71787b2d9f..c8c1515d66 100644 --- a/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py +++ b/src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py @@ -584,8 +584,8 @@ 
def execute(args): # We're assuming that the LULC initial variables and the carbon pool # transient table are combined into a single lookup table. - biophysical_parameters = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'code') + biophysical_parameters = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'code').to_dict(orient='index') # LULC Classnames are critical to the transition mapping, so they must be # unique. This check is here in ``execute`` because it's possible that @@ -964,8 +964,9 @@ def execute(args): if args.get('use_price_table', False): prices = { year: values['price'] for (year, values) in - utils.build_lookup_from_csv( - args['price_table_path'], 'year').items()} + utils.read_csv_to_dataframe( + args['price_table_path'], 'year' + ).to_dict(orient='index').items()} else: inflation_rate = float(args['inflation_rate']) * 0.01 annual_price = float(args['price']) @@ -1985,7 +1986,8 @@ def _read_transition_matrix(transition_csv_path, biophysical_dict): landcover transition, and the second contains accumulation rates for the pool for the landcover transition. """ - table = utils.read_csv_to_dataframe(transition_csv_path, index_col=False) + table = utils.read_csv_to_dataframe( + transition_csv_path, convert_cols_to_lower=False, convert_vals_to_lower=False) lulc_class_to_lucode = {} max_lucode = 0 @@ -2030,6 +2032,10 @@ def _read_transition_matrix(transition_csv_path, biophysical_dict): # Strip any whitespace to eliminate leading/trailing whitespace row = row.str.strip() + # skip rows starting with a blank cell, these are part of the legend + if not row['lulc-class']: + continue + try: from_colname = str(row['lulc-class']).lower() from_lucode = lulc_class_to_lucode[from_colname] @@ -2239,8 +2245,7 @@ def _extract_snapshots_from_table(csv_path): """ table = utils.read_csv_to_dataframe( - csv_path, to_lower=True, index_col=False, - expand_path_cols=['raster_path']) + csv_path, convert_vals_to_lower=False, expand_path_cols=['raster_path']) output_dict = {} table.set_index("snapshot_year", drop=False, inplace=True) diff --git a/src/natcap/invest/coastal_blue_carbon/preprocessor.py b/src/natcap/invest/coastal_blue_carbon/preprocessor.py index 6f942e7f20..8b590151de 100644 --- a/src/natcap/invest/coastal_blue_carbon/preprocessor.py +++ b/src/natcap/invest/coastal_blue_carbon/preprocessor.py @@ -209,8 +209,8 @@ def execute(args): target_path_list=aligned_snapshot_paths, task_name='Align input landcover rasters') - landcover_table = utils.build_lookup_from_csv( - args['lulc_lookup_table_path'], 'code') + landcover_table = utils.read_csv_to_dataframe( + args['lulc_lookup_table_path'], 'code').to_dict(orient='index') target_transition_table = os.path.join( output_dir, TRANSITION_TABLE.format(suffix=suffix)) diff --git a/src/natcap/invest/coastal_vulnerability.py b/src/natcap/invest/coastal_vulnerability.py index 9dab22ff36..7ce14522ba 100644 --- a/src/natcap/invest/coastal_vulnerability.py +++ b/src/natcap/invest/coastal_vulnerability.py @@ -2315,7 +2315,7 @@ def _schedule_habitat_tasks( """ habitat_dataframe = utils.read_csv_to_dataframe( - habitat_table_path, to_lower=True, expand_path_cols=['path']) + habitat_table_path, convert_vals_to_lower=False, expand_path_cols=['path']) habitat_dataframe = habitat_dataframe.rename( columns={'protection distance (m)': 'distance'}) @@ -2834,7 +2834,8 @@ def assemble_results_and_calculate_exposure( with open(pickle_path, 'rb') as file: final_values_dict[var_name] = pickle.load(file) - habitat_df = 
utils.read_csv_to_dataframe(habitat_protection_path) + habitat_df = utils.read_csv_to_dataframe( + habitat_protection_path, convert_cols_to_lower=False, convert_vals_to_lower=False) output_layer.StartTransaction() for feature in output_layer: shore_id = feature.GetField(SHORE_ID_FIELD) @@ -3464,7 +3465,8 @@ def _validate_habitat_table_paths(habitat_table_path): ValueError if any vector in the ``path`` column cannot be opened. """ habitat_dataframe = utils.read_csv_to_dataframe( - habitat_table_path, expand_path_cols=['path']) + habitat_table_path, convert_cols_to_lower=False, convert_vals_to_lower=False, + expand_path_cols=['path']) bad_paths = [] for habitat_row in habitat_dataframe.itertuples(): try: diff --git a/src/natcap/invest/crop_production_percentile.py b/src/natcap/invest/crop_production_percentile.py index 600212118d..1cc5717ef3 100644 --- a/src/natcap/invest/crop_production_percentile.py +++ b/src/natcap/invest/crop_production_percentile.py @@ -458,8 +458,8 @@ def execute(args): None. """ - crop_to_landcover_table = utils.build_lookup_from_csv( - args['landcover_to_crop_table_path'], 'crop_name', to_lower=True) + crop_to_landcover_table = utils.read_csv_to_dataframe( + args['landcover_to_crop_table_path'], 'crop_name').to_dict(orient='index') bad_crop_name_list = [] for crop_name in crop_to_landcover_table: crop_climate_bin_raster_path = os.path.join( @@ -540,8 +540,8 @@ def execute(args): climate_percentile_yield_table_path = os.path.join( args['model_data_path'], _CLIMATE_PERCENTILE_TABLE_PATTERN % crop_name) - crop_climate_percentile_table = utils.build_lookup_from_csv( - climate_percentile_yield_table_path, 'climate_bin', to_lower=True) + crop_climate_percentile_table = utils.read_csv_to_dataframe( + climate_percentile_yield_table_path, 'climate_bin').to_dict(orient='index') yield_percentile_headers = [ x for x in list(crop_climate_percentile_table.values())[0] if x != 'climate_bin'] @@ -698,9 +698,10 @@ def execute(args): # both 'crop_nutrient.csv' and 'crop' are known data/header values for # this model data. 
- nutrient_table = utils.build_lookup_from_csv( + nutrient_table = utils.read_csv_to_dataframe( os.path.join(args['model_data_path'], 'crop_nutrient.csv'), - 'crop', to_lower=False) + 'crop', convert_cols_to_lower=False, convert_vals_to_lower=False + ).to_dict(orient='index') result_table_path = os.path.join( output_dir, 'result_table%s.csv' % file_suffix) diff --git a/src/natcap/invest/crop_production_regression.py b/src/natcap/invest/crop_production_regression.py index c25997877e..602070ec5a 100644 --- a/src/natcap/invest/crop_production_regression.py +++ b/src/natcap/invest/crop_production_regression.py @@ -484,11 +484,11 @@ def execute(args): LOGGER.info( "Checking if the landcover raster is missing lucodes") - crop_to_landcover_table = utils.build_lookup_from_csv( - args['landcover_to_crop_table_path'], 'crop_name', to_lower=True) + crop_to_landcover_table = utils.read_csv_to_dataframe( + args['landcover_to_crop_table_path'], 'crop_name').to_dict(orient='index') - crop_to_fertlization_rate_table = utils.build_lookup_from_csv( - args['fertilization_rate_table_path'], 'crop_name', to_lower=True) + crop_to_fertlization_rate_table = utils.read_csv_to_dataframe( + args['fertilization_rate_table_path'], 'crop_name').to_dict(orient='index') crop_lucodes = [ x[_EXPECTED_LUCODE_TABLE_HEADER] @@ -571,8 +571,8 @@ def execute(args): crop_regression_table_path = os.path.join( args['model_data_path'], _REGRESSION_TABLE_PATTERN % crop_name) - crop_regression_table = utils.build_lookup_from_csv( - crop_regression_table_path, 'climate_bin', to_lower=True) + crop_regression_table = utils.read_csv_to_dataframe( + crop_regression_table_path, 'climate_bin').to_dict(orient='index') for bin_id in crop_regression_table: for header in _EXPECTED_REGRESSION_TABLE_HEADERS: if crop_regression_table[bin_id][header.lower()] == '': @@ -796,9 +796,10 @@ def execute(args): # both 'crop_nutrient.csv' and 'crop' are known data/header values for # this model data. 
- nutrient_table = utils.build_lookup_from_csv( + nutrient_table = utils.read_csv_to_dataframe( os.path.join(args['model_data_path'], 'crop_nutrient.csv'), - 'crop', to_lower=False) + 'crop', convert_cols_to_lower=False, convert_vals_to_lower=False + ).to_dict(orient='index') LOGGER.info("Generating report table") result_table_path = os.path.join( diff --git a/src/natcap/invest/datastack.py b/src/natcap/invest/datastack.py index 1d264813eb..06b7b9a44b 100644 --- a/src/natcap/invest/datastack.py +++ b/src/natcap/invest/datastack.py @@ -336,7 +336,7 @@ def build_datastack_archive(args, model_name, datastack_path): data_dir, f'{key}_csv_data') dataframe = utils.read_csv_to_dataframe( - source_path, to_lower=True) + source_path, convert_vals_to_lower=False) csv_source_dir = os.path.abspath(os.path.dirname(source_path)) for spatial_column_name in spatial_columns: # Iterate through the spatial columns, identify the set of diff --git a/src/natcap/invest/forest_carbon_edge_effect.py b/src/natcap/invest/forest_carbon_edge_effect.py index 0c2559afcb..e38ead6eed 100644 --- a/src/natcap/invest/forest_carbon_edge_effect.py +++ b/src/natcap/invest/forest_carbon_edge_effect.py @@ -418,8 +418,8 @@ def execute(args): # Map non-forest landcover codes to carbon biomasses LOGGER.info('Calculating direct mapped carbon stocks') carbon_maps = [] - biophysical_table = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode', to_lower=False) + biophysical_table = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') biophysical_keys = [ x.lower() for x in list(biophysical_table.values())[0].keys()] pool_list = [('c_above', True)] @@ -630,8 +630,8 @@ def _calculate_lulc_carbon_map( """ # classify forest pixels from lulc - biophysical_table = utils.build_lookup_from_csv( - biophysical_table_path, 'lucode', to_lower=False) + biophysical_table = utils.read_csv_to_dataframe( + biophysical_table_path, 'lucode').to_dict(orient='index') lucode_to_per_cell_carbon = {} cell_size = pygeoprocessing.get_raster_info( @@ -696,8 +696,8 @@ def _map_distance_from_tropical_forest_edge( """ # Build a list of forest lucodes - biophysical_table = utils.build_lookup_from_csv( - biophysical_table_path, 'lucode', to_lower=False) + biophysical_table = utils.read_csv_to_dataframe( + biophysical_table_path, 'lucode').to_dict(orient='index') forest_codes = [ lucode for (lucode, ludata) in biophysical_table.items() if int(ludata['is_tropical_forest']) == 1] diff --git a/src/natcap/invest/habitat_quality.py b/src/natcap/invest/habitat_quality.py index 36ca91b8c2..5aa4c42172 100644 --- a/src/natcap/invest/habitat_quality.py +++ b/src/natcap/invest/habitat_quality.py @@ -380,11 +380,12 @@ def execute(args): LOGGER.info("Checking Threat and Sensitivity tables for compliance") # Get CSVs as dictionaries and ensure the key is a string for threats. 
threat_dict = { - str(key): value for key, value in utils.build_lookup_from_csv( - args['threats_table_path'], 'THREAT', to_lower=True, - expand_path_cols=['cur_path', 'fut_path', 'base_path']).items()} - sensitivity_dict = utils.build_lookup_from_csv( - args['sensitivity_table_path'], 'LULC', to_lower=True) + str(key): value for key, value in utils.read_csv_to_dataframe( + args['threats_table_path'], 'THREAT', + expand_path_cols=['cur_path', 'fut_path', 'base_path'] + ).to_dict(orient='index').items()} + sensitivity_dict = utils.read_csv_to_dataframe( + args['sensitivity_table_path'], 'LULC').to_dict(orient='index') half_saturation_constant = float(args['half_saturation_constant']) @@ -1156,11 +1157,12 @@ def validate(args, limit_to=None): # Get CSVs as dictionaries and ensure the key is a string for threats. threat_dict = { - str(key): value for key, value in utils.build_lookup_from_csv( - args['threats_table_path'], 'THREAT', to_lower=True, - expand_path_cols=['cur_path', 'fut_path', 'base_path']).items()} - sensitivity_dict = utils.build_lookup_from_csv( - args['sensitivity_table_path'], 'LULC', to_lower=True) + str(key): value for key, value in utils.read_csv_to_dataframe( + args['threats_table_path'], 'THREAT', + expand_path_cols=['cur_path', 'fut_path', 'base_path'] + ).to_dict(orient='index').items()} + sensitivity_dict = utils.read_csv_to_dataframe( + args['sensitivity_table_path'], 'LULC').to_dict(orient='index') # check that the threat names in the threats table match with the # threats columns in the sensitivity table. diff --git a/src/natcap/invest/hra.py b/src/natcap/invest/hra.py index af74b12fa2..b21d9f18e0 100644 --- a/src/natcap/invest/hra.py +++ b/src/natcap/invest/hra.py @@ -1845,7 +1845,8 @@ def _open_table_as_dataframe(table_path, **kwargs): return excel_df else: return utils.read_csv_to_dataframe( - table_path, to_lower=True, expand_path_cols=['path'], **kwargs) + table_path, convert_vals_to_lower=False, + expand_path_cols=['path'], **kwargs) def _parse_info_table(info_table_path): diff --git a/src/natcap/invest/ndr/ndr.py b/src/natcap/invest/ndr/ndr.py index 309436b518..fd8d39353b 100644 --- a/src/natcap/invest/ndr/ndr.py +++ b/src/natcap/invest/ndr/ndr.py @@ -619,8 +619,8 @@ def _validate_inputs(nutrients_to_process, lucode_to_parameters): if args['calc_' + nutrient_id]: nutrients_to_process.append(nutrient_id) - lucode_to_parameters = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode') + lucode_to_parameters = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') _validate_inputs(nutrients_to_process, lucode_to_parameters) diff --git a/src/natcap/invest/pollination.py b/src/natcap/invest/pollination.py index db859d37cc..77aff70095 100644 --- a/src/natcap/invest/pollination.py +++ b/src/natcap/invest/pollination.py @@ -1179,8 +1179,8 @@ def _parse_scenario_variables(args): else: farm_vector_path = None - guild_table = utils.build_lookup_from_csv( - guild_table_path, 'species', to_lower=True) + guild_table = utils.read_csv_to_dataframe( + guild_table_path, 'species').to_dict(orient='index') LOGGER.info('Checking to make sure guild table has all expected headers') guild_headers = list(guild_table.values())[0].keys() @@ -1192,8 +1192,8 @@ def _parse_scenario_variables(args): f"'{header}' but was unable to find one. 
Here are all the " f"headers from {guild_table_path}: {', '.join(guild_headers)}") - landcover_biophysical_table = utils.build_lookup_from_csv( - landcover_biophysical_table_path, 'lucode', to_lower=True) + landcover_biophysical_table = utils.read_csv_to_dataframe( + landcover_biophysical_table_path, 'lucode').to_dict(orient='index') biophysical_table_headers = ( list(landcover_biophysical_table.values())[0].keys()) for header in _EXPECTED_BIOPHYSICAL_HEADERS: diff --git a/src/natcap/invest/recreation/recmodel_client.py b/src/natcap/invest/recreation/recmodel_client.py index 71eadfe27c..7ef4c2c12d 100644 --- a/src/natcap/invest/recreation/recmodel_client.py +++ b/src/natcap/invest/recreation/recmodel_client.py @@ -853,8 +853,9 @@ def _schedule_predictor_data_processing( 'line_intersect_length': _line_intersect_length, } - predictor_table = utils.build_lookup_from_csv( - predictor_table_path, 'id', expand_path_cols=['path']) + predictor_table = utils.read_csv_to_dataframe( + predictor_table_path, 'id', expand_path_cols=['path'] + ).to_dict(orient='index') predictor_task_list = [] predictor_json_list = [] # tracks predictor files to add to shp @@ -1546,7 +1547,8 @@ def _validate_same_id_lengths(table_path): tables. """ - predictor_table = utils.build_lookup_from_csv(table_path, 'id') + predictor_table = utils.read_csv_to_dataframe( + table_path, 'id').to_dict(orient='index') too_long = set() for p_id in predictor_table: if len(p_id) > 10: @@ -1579,11 +1581,11 @@ def _validate_same_ids_and_types( tables. """ - predictor_table = utils.build_lookup_from_csv( - predictor_table_path, 'id') + predictor_table = utils.read_csv_to_dataframe( + predictor_table_path, 'id').to_dict(orient='index') - scenario_predictor_table = utils.build_lookup_from_csv( - scenario_predictor_table_path, 'id') + scenario_predictor_table = utils.read_csv_to_dataframe( + scenario_predictor_table_path, 'id').to_dict(orient='index') predictor_table_pairs = set([ (p_id, predictor_table[p_id]['type'].strip()) for p_id in predictor_table]) @@ -1616,7 +1618,7 @@ def _validate_same_projection(base_vector_path, table_path): # This will load the table as a list of paths which we can iterate through # without bothering the rest of the table structure data_paths = utils.read_csv_to_dataframe( - table_path, to_lower=True, expand_path_cols=['path'] + table_path, convert_vals_to_lower=False, expand_path_cols=['path'] ).squeeze('columns')['path'].tolist() base_vector = gdal.OpenEx(base_vector_path, gdal.OF_VECTOR) @@ -1673,7 +1675,7 @@ def _validate_predictor_types(table_path): ValueError if any value in the ``type`` column does not match a valid type, ignoring leading/trailing whitespace. 
""" - df = utils.read_csv_to_dataframe(table_path, to_lower=True) + df = utils.read_csv_to_dataframe(table_path, convert_vals_to_lower=False) # ignore leading/trailing whitespace because it will be removed # when the type values are used type_list = set([type.strip() for type in df['type']]) diff --git a/src/natcap/invest/sdr/sdr.py b/src/natcap/invest/sdr/sdr.py index c4cf4b6284..28fed6fbaa 100644 --- a/src/natcap/invest/sdr/sdr.py +++ b/src/natcap/invest/sdr/sdr.py @@ -498,8 +498,8 @@ def execute(args): """ file_suffix = utils.make_suffix_string(args, 'results_suffix') - biophysical_table = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode') + biophysical_table = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') # Test to see if c or p values are outside of 0..1 for table_key in ['usle_c', 'usle_p']: diff --git a/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py b/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py index 208d99e269..40715eb58a 100644 --- a/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py +++ b/src/natcap/invest/seasonal_water_yield/seasonal_water_yield.py @@ -562,11 +562,12 @@ def _execute(args): if (not args['user_defined_local_recharge'] and not args['user_defined_climate_zones']): rain_events_lookup = ( - utils.build_lookup_from_csv( - args['rain_events_table_path'], 'month')) + utils.read_csv_to_dataframe( + args['rain_events_table_path'], 'month' + ).to_dict(orient='index')) - biophysical_table = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode') + biophysical_table = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') bad_value_list = [] for lucode, value in biophysical_table.items(): @@ -592,8 +593,9 @@ def _execute(args): # parse out the alpha lookup table of the form (month_id: alpha_val) alpha_month_map = dict( (key, val['alpha']) for key, val in - utils.build_lookup_from_csv( - args['monthly_alpha_path'], 'month').items()) + utils.read_csv_to_dataframe( + args['monthly_alpha_path'], 'month' + ).to_dict(orient='index').items()) else: # make all 12 entries equal to args['alpha_m'] alpha_m = float(fractions.Fraction(args['alpha_m'])) @@ -761,8 +763,9 @@ def _execute(args): for month_id in range(N_MONTHS): if args['user_defined_climate_zones']: cz_rain_events_lookup = ( - utils.build_lookup_from_csv( - args['climate_zone_table_path'], 'cz_id')) + utils.read_csv_to_dataframe( + args['climate_zone_table_path'], 'cz_id' + ).to_dict(orient='index')) month_label = MONTH_ID_TO_LABEL[month_id] climate_zone_rain_events_month = dict([ (cz_id, cz_rain_events_lookup[cz_id][month_label]) for diff --git a/src/natcap/invest/stormwater.py b/src/natcap/invest/stormwater.py index 57d7ead1d8..afae65498c 100644 --- a/src/natcap/invest/stormwater.py +++ b/src/natcap/invest/stormwater.py @@ -482,8 +482,8 @@ def execute(args): task_name='align input rasters') # Build a lookup dictionary mapping each LULC code to its row - biophysical_dict = utils.build_lookup_from_csv( - args['biophysical_table'], 'lucode') + biophysical_dict = utils.read_csv_to_dataframe( + args['biophysical_table'], 'lucode').to_dict(orient='index') # sort the LULC codes upfront because we use the sorted list in multiple # places. it's more efficient to do this once. 
sorted_lucodes = sorted(biophysical_dict) diff --git a/src/natcap/invest/urban_cooling_model.py b/src/natcap/invest/urban_cooling_model.py index 755452aa89..39c4decda5 100644 --- a/src/natcap/invest/urban_cooling_model.py +++ b/src/natcap/invest/urban_cooling_model.py @@ -410,8 +410,8 @@ def execute(args): intermediate_dir = os.path.join( args['workspace_dir'], 'intermediate') utils.make_directories([args['workspace_dir'], intermediate_dir]) - biophysical_lucode_map = utils.build_lookup_from_csv( - args['biophysical_table_path'], 'lucode', to_lower=True) + biophysical_lucode_map = utils.read_csv_to_dataframe( + args['biophysical_table_path'], 'lucode').to_dict(orient='index') # cast to float and calculate relative weights # Use default weights for shade, albedo, eti if the user didn't provide @@ -1079,8 +1079,8 @@ def calculate_energy_savings( for field in target_building_layer.schema] type_field_index = fieldnames.index('type') - energy_consumption_table = utils.build_lookup_from_csv( - energy_consumption_table_path, 'type', to_lower=True) + energy_consumption_table = utils.read_csv_to_dataframe( + energy_consumption_table_path, 'type').to_dict(orient='index') target_building_layer.StartTransaction() last_time = time.time() diff --git a/src/natcap/invest/urban_flood_risk_mitigation.py b/src/natcap/invest/urban_flood_risk_mitigation.py index f047308a19..1fab33c3ac 100644 --- a/src/natcap/invest/urban_flood_risk_mitigation.py +++ b/src/natcap/invest/urban_flood_risk_mitigation.py @@ -306,8 +306,8 @@ def execute(args): task_name='align raster stack') # Load CN table - cn_table = utils.build_lookup_from_csv( - args['curve_number_table_path'], 'lucode') + cn_table = utils.read_csv_to_dataframe( + args['curve_number_table_path'], 'lucode').to_dict(orient='index') # make cn_table into a 2d array where first dim is lucode, second is # 0..3 to correspond to CN_A..CN_D @@ -648,8 +648,8 @@ def _calculate_damage_to_infrastructure_in_aoi( infrastructure_vector = gdal.OpenEx(structures_vector_path, gdal.OF_VECTOR) infrastructure_layer = infrastructure_vector.GetLayer() - damage_type_map = utils.build_lookup_from_csv( - structures_damage_table, 'type', to_lower=True) + damage_type_map = utils.read_csv_to_dataframe( + structures_damage_table, 'type').to_dict(orient='index') infrastructure_layer_defn = infrastructure_layer.GetLayerDefn() type_index = -1 diff --git a/src/natcap/invest/urban_nature_access.py b/src/natcap/invest/urban_nature_access.py index 528d69650d..48b4ae703f 100644 --- a/src/natcap/invest/urban_nature_access.py +++ b/src/natcap/invest/urban_nature_access.py @@ -902,8 +902,7 @@ def execute(args): aoi_reprojection_task, lulc_mask_task] ) - attr_table = utils.read_csv_to_dataframe( - args['lulc_attribute_table'], to_lower=True) + attr_table = utils.read_csv_to_dataframe(args['lulc_attribute_table']) kernel_paths = {} # search_radius, kernel path kernel_tasks = {} # search_radius, kernel task @@ -1749,8 +1748,8 @@ def _reclassify_urban_nature_area( Returns: ``None`` """ - attribute_table_dict = utils.build_lookup_from_csv( - lulc_attribute_table, key_field='lucode') + attribute_table_dict = utils.read_csv_to_dataframe( + lulc_attribute_table, 'lucode').to_dict(orient='index') squared_pixel_area = abs( numpy.multiply(*_square_off_pixels(lulc_raster_path))) diff --git a/src/natcap/invest/utils.py b/src/natcap/invest/utils.py index fd5e4a9fce..1f603eecb8 100644 --- a/src/natcap/invest/utils.py +++ b/src/natcap/invest/utils.py @@ -580,108 +580,6 @@ def _build_path(base_filename, path): return 
f_reg -def build_lookup_from_csv( - table_path, key_field, column_list=None, to_lower=True, **kwargs): - """Read a CSV table into a dictionary indexed by ``key_field``. - - Creates a dictionary from a CSV whose keys are unique entries in the CSV - table under the column named by ``key_field`` and values are dictionaries - indexed by the other columns in ``table_path`` including ``key_field`` - whose values are the values on that row of the CSV table. - - If an entire row is NA/NaN (including ``key_field``) then it is dropped - from the table and a warning is given of the dropped rows. - - Args: - table_path (string): path to a CSV file containing at - least the header key_field - key_field: (string): a column in the CSV file at `table_path` that - can uniquely identify each row in the table and sets the row index. - column_list (list): a list of column names to subset from the CSV - file, default=None - to_lower (bool): if True, converts all unicode in the CSV, - including headers and values to lowercase, otherwise uses raw - string values. default=True. - **kwargs: additional kwargs will be passed to ``utils.read_csv_from_dataframe`` - - Returns: - lookup_dict (dict): a dictionary of the form - {key_field_0: {csv_header_0: value0, csv_header_1: value1...}, - key_field_1: {csv_header_0: valuea, csv_header_1: valueb...}} - - if ``to_lower`` all strings including key_fields and values are - converted to lowercase unicode. - - Raise: - ValueError - If ValueError occurs during conversion to dictionary. - KeyError - If ``key_field`` is not present during ``set_index`` call. - """ - # Reassign to avoid mutation - col_list = column_list - # if a list of columns are provided to use and return, make sure - # 'key_field' is one of them. - if col_list and key_field not in col_list: - col_list.append(key_field) - - table = read_csv_to_dataframe( - table_path, to_lower=to_lower, index_col=False, **kwargs) - - # if 'to_lower`, case handling is done before trying to access the data. - # the columns are stripped of leading/trailing whitespace in - # ``read_csv_to_dataframe``, and also lowercased if ``to_lower`` so we only - # need to convert the rest of the table. - if to_lower: - key_field = key_field.lower() - # lowercase column names - if col_list: - col_list = [col.lower() for col in col_list] - # lowercase values - table = table.applymap( - lambda x: x.lower() if isinstance(x, str) else x) - - # Set 'key_field' as the index of the dataframe - try: - table.set_index(key_field, drop=False, inplace=True) - except KeyError: - # If 'key_field' is not a column then KeyError is raised for using - # it as the index column - LOGGER.error(f"'key_field' : '{key_field}' could not be found as a" - f" column in the table. Table path: {table_path}.") - raise - - # Subset dataframe by columns if desired - if col_list: - table = table.loc[:, col_list] - - # look for NaN values and warn if any are found. - table_na = table.isna() - if table_na.values.any(): - LOGGER.warning( - f"Empty or NaN values were found in the table: {table_path}.") - # look to see if an entire row is NA values - table_na_rows = table_na.all(axis=1) - na_rows = table_na_rows.index[table_na_rows].tolist() - # if a completely empty row, drop it - if na_rows: - LOGGER.warning( - "Encountered an entirely blank row on line(s)" - f" {[x+2 for x in na_rows]}. 
Dropping rows from table.") - table.dropna(how="all", inplace=True) - # fill the rest of empty or NaN values with empty string - table.fillna(value="", inplace=True) - try: - lookup_dict = table.to_dict(orient='index') - except ValueError: - # If 'key_field' is not unique then a value error is raised. - LOGGER.error(f"The 'key_field' : '{key_field}' column values are not" - f" unique: {table.index.tolist()}") - raise - - return lookup_dict - - def expand_path(path, base_path): """Check if a path is relative, and if so, expand it using the base path. @@ -700,7 +598,8 @@ def expand_path(path, base_path): def read_csv_to_dataframe( - path, to_lower=False, expand_path_cols=[], sep=None, engine='python', + path, index_col=False, usecols=None, convert_cols_to_lower=True, + convert_vals_to_lower=True, expand_path_cols=None, sep=None, engine='python', encoding='utf-8-sig', **kwargs): """Return a dataframe representation of the CSV. @@ -714,7 +613,15 @@ def read_csv_to_dataframe( Args: path (str): path to a CSV file - to_lower (bool): if True, convert all column names to lowercase + index_col (str): name of column to use as the dataframe index. If + ``convert_cols_to_lower``, this column name and the dataframe column names + will be lowercased before they are compared. If ``usecols`` + is defined, this must be included in ``usecols``. + usecols (list(str)): list of column names to subset from the dataframe. + If ``convert_cols_to_lower``, these names and the dataframe column names + will be lowercased before they are compared. + convert_cols_to_lower (bool): if True, convert all column names to lowercase + convert_vals_to_lower (bool): if True, convert all table values to lowercase expand_path_cols (list[string])): if provided, a list of the names of columns that contain paths to expand. Any relative paths in these columns will be expanded to absolute paths. It is assumed that @@ -732,23 +639,56 @@ def read_csv_to_dataframe( """ try: + # set index_col=False to force pandas not to index by any column + # this is useful in case of trailing separators + # we'll explicitly set the index column later on dataframe = pandas.read_csv( - path, sep=sep, engine=engine, encoding=encoding, **kwargs) + path, index_col=False, sep=sep, engine=engine, encoding=encoding, **kwargs) except UnicodeDecodeError as error: LOGGER.error( - f'{path} must be encoded as UTF-8 or ASCII') + f'The file {path} must be encoded as UTF-8 or ASCII') raise error + # strip whitespace from column names # this won't work on integer types, which happens if you set header=None # however, there's little reason to use this function if there's no header dataframe.columns = dataframe.columns.str.strip() - if to_lower: + + # convert column names to lowercase + if convert_cols_to_lower: dataframe.columns = dataframe.columns.str.lower() + # if 'to_lower`, case handling is done before trying to access the data. + # the columns are stripped of leading/trailing whitespace in + # ``read_csv_to_dataframe``, and also lowercased if ``to_lower`` so we only + # need to convert the rest of the table. + if index_col and isinstance(index_col, str): + index_col = index_col.lower() + # lowercase column names + if usecols: + usecols = [col.lower() for col in usecols] - # Remove values with leading ('^ +') and trailing (' +$') whitespace. - # Regular expressions using 'replace' only substitute on strings. 
- dataframe = dataframe.replace(r"^ +| +$", r"", regex=True) + # Subset dataframe by columns if desired + if usecols: + dataframe = dataframe[usecols] + # Set 'index_col' as the index of the dataframe + if index_col: + try: + dataframe = dataframe.set_index( + index_col, drop=False, verify_integrity=True) + except KeyError: + # If 'index_col' is not a column then KeyError is raised for using + # it as the index column + LOGGER.error(f"The column '{index_col}' could not be found " + f"in the table {path}") + raise + + # convert table values to lowercase + if convert_vals_to_lower: + dataframe = dataframe.applymap( + lambda x: x.lower() if isinstance(x, str) else x) + + # expand paths if expand_path_cols: for col in expand_path_cols: # allow for the case where a column is optional @@ -758,6 +698,17 @@ def read_csv_to_dataframe( # catch that before trying to expand them as paths lambda p: '' if pandas.isna(p) else expand_path(p, path)) + # drop any empty rows + dataframe = dataframe.dropna(how="all") + + # fill the rest of empty or NaN values with empty string + dataframe = dataframe.fillna(value="") + + # strip whitespace from table values + # Remove values with leading ('^ +') and trailing (' +$') whitespace. + # Regular expressions using 'replace' only substitute on strings. + dataframe = dataframe.replace(r"^ +| +$", r"", regex=True) + return dataframe diff --git a/src/natcap/invest/wave_energy.py b/src/natcap/invest/wave_energy.py index b56b29030c..a03c7e76f0 100644 --- a/src/natcap/invest/wave_energy.py +++ b/src/natcap/invest/wave_energy.py @@ -777,7 +777,7 @@ def execute(args): if 'land_gridPts_path' in args: # Create a grid_land_data dataframe for later use in valuation grid_land_data = utils.read_csv_to_dataframe( - args['land_gridPts_path'], to_lower=True) + args['land_gridPts_path'], convert_vals_to_lower=False) required_col_names = ['id', 'type', 'lat', 'long', 'location'] grid_land_data, missing_grid_land_fields = _get_validated_dataframe( args['land_gridPts_path'], required_col_names) @@ -1425,7 +1425,7 @@ def _get_validated_dataframe(csv_path, field_list): missing_fields (list): missing fields as string format in dataframe. 
""" - dataframe = utils.read_csv_to_dataframe(csv_path, to_lower=True) + dataframe = utils.read_csv_to_dataframe(csv_path, convert_vals_to_lower=False) missing_fields = [] for field in field_list: if field not in dataframe.columns: @@ -1670,7 +1670,7 @@ def _machine_csv_to_dict(machine_csv_path): machine_dict = {} # make columns and indexes lowercased and strip whitespace machine_data = utils.read_csv_to_dataframe( - machine_csv_path, to_lower=True, index_col=0) + machine_csv_path, 'name', convert_vals_to_lower=False) machine_data.index = machine_data.index.str.strip() machine_data.index = machine_data.index.str.lower() diff --git a/src/natcap/invest/wind_energy.py b/src/natcap/invest/wind_energy.py index cda30b7051..52fda89b85 100644 --- a/src/natcap/invest/wind_energy.py +++ b/src/natcap/invest/wind_energy.py @@ -754,8 +754,7 @@ def execute(args): # If Price Table provided use that for price of energy, validate inputs time = int(val_parameters_dict['time_period']) if args['price_table']: - wind_price_df = utils.read_csv_to_dataframe( - args['wind_schedule'], to_lower=True) + wind_price_df = utils.read_csv_to_dataframe(args['wind_schedule']) year_count = len(wind_price_df['year']) if year_count != time + 1: @@ -1135,7 +1134,7 @@ def execute(args): # Read the grid points csv, and convert it to land and grid dictionary grid_land_df = utils.read_csv_to_dataframe( - args['grid_points_path'], to_lower=True) + args['grid_points_path'], convert_vals_to_lower=False) # Make separate dataframes based on 'TYPE' grid_df = grid_land_df.loc[( @@ -1974,7 +1973,8 @@ def _read_csv_wind_data(wind_data_path, hub_height): to dictionaries that hold wind data at that location. """ - wind_point_df = utils.read_csv_to_dataframe(wind_data_path) + wind_point_df = utils.read_csv_to_dataframe( + wind_data_path, convert_cols_to_lower=False, convert_vals_to_lower=False) # Calculate scale value at new hub height given reference values. 
# See equation 3 in users guide diff --git a/tests/test_coastal_blue_carbon.py b/tests/test_coastal_blue_carbon.py index fc53dce873..9b42771ac2 100644 --- a/tests/test_coastal_blue_carbon.py +++ b/tests/test_coastal_blue_carbon.py @@ -151,10 +151,10 @@ def test_sample_data(self): pprint.pformat(non_suffixed_files))) expected_landcover_codes = set(range(0, 24)) - found_landcover_codes = set(utils.build_lookup_from_csv( + found_landcover_codes = set(utils.read_csv_to_dataframe( os.path.join(outputs_dir, 'carbon_biophysical_table_template_150225.csv'), - 'code').keys()) + 'code').to_dict(orient='index').keys()) self.assertEqual(expected_landcover_codes, found_landcover_codes) def test_transition_table(self): @@ -188,8 +188,8 @@ def test_transition_table(self): lulc_csv.write('0,mangrove,True\n') lulc_csv.write('1,parking lot,False\n') - landcover_table = utils.build_lookup_from_csv( - landcover_table_path, 'code') + landcover_table = utils.read_csv_to_dataframe( + landcover_table_path, 'code').to_dict(orient='index') target_table_path = os.path.join(self.workspace_dir, 'transition_table.csv') @@ -203,8 +203,8 @@ def test_transition_table(self): str(context.exception)) # Re-load the landcover table - landcover_table = utils.build_lookup_from_csv( - landcover_table_path, 'code') + landcover_table = utils.read_csv_to_dataframe( + landcover_table_path, 'code').to_dict(orient='index') preprocessor._create_transition_table( landcover_table, [filename_a, filename_b], target_table_path) diff --git a/tests/test_datastack.py b/tests/test_datastack.py index 762d60e02d..dbf30ec566 100644 --- a/tests/test_datastack.py +++ b/tests/test_datastack.py @@ -377,8 +377,9 @@ def test_archive_extraction(self): self.assertTrue( filecmp.cmp(archive_params[key], params[key], shallow=False)) - spatial_csv_dict = utils.build_lookup_from_csv( - archive_params['spatial_table'], 'ID', to_lower=True) + spatial_csv_dict = utils.read_csv_to_dataframe( + archive_params['spatial_table'], 'ID', + convert_cols_to_lower=True, convert_vals_to_lower=True).to_dict(orient='index') spatial_csv_dir = os.path.dirname(archive_params['spatial_table']) numpy.testing.assert_allclose( pygeoprocessing.raster_to_numpy_array( diff --git a/tests/test_recreation.py b/tests/test_recreation.py index c02f419179..a5ae82b346 100644 --- a/tests/test_recreation.py +++ b/tests/test_recreation.py @@ -971,8 +971,9 @@ def test_existing_regression_coef(self): predictor_table_path = os.path.join(SAMPLE_DATA, 'predictors.csv') # make outputs to be overwritten - predictor_dict = utils.build_lookup_from_csv( - predictor_table_path, 'id') + predictor_dict = utils.read_csv_to_dataframe( + predictor_table_path, 'id', + convert_cols_to_lower=True, convert_vals_to_lower=True).to_dict(orient='index') predictor_list = predictor_dict.keys() tmp_working_dir = tempfile.mkdtemp(dir=self.workspace_dir) empty_json_list = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index b1fb77b2fd..eb75c186e0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -599,8 +599,8 @@ def test_prepare_workspace(self): self.assertTrue('Elapsed time:' in logfile_text) -class BuildLookupFromCSVTests(unittest.TestCase): - """Tests for natcap.invest.utils.build_lookup_from_csv.""" +class ReadCSVToDataframeTests(unittest.TestCase): + """Tests for natcap.invest.utils.read_csv_to_dataframe.""" def setUp(self): """Make temporary directory for workspace.""" @@ -610,25 +610,25 @@ def tearDown(self): """Delete workspace.""" shutil.rmtree(self.workspace_dir) - def 
test_build_lookup_from_csv(self): - """utils: test build_lookup_from_csv.""" + def test_read_csv_to_dataframe(self): + """utils: test the default behavior""" from natcap.invest import utils - table_str = 'a,b,foo,bar,_\n0.0,x,-1,bar,apple\n' - table_path = os.path.join(self.workspace_dir, 'table.csv') - with open(table_path, 'w') as table_file: - table_file.write(table_str) - result = utils.build_lookup_from_csv( - table_path, 'a', to_lower=True) - expected_dict = { - 0.0: { - 'a': 0.0, - 'b': 'x', - 'foo': -1.0, - 'bar': 'bar', - '_': 'apple' - }, - } - self.assertDictEqual(result, expected_dict) + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + """\ + HEADER, + A, + b + """ + )) + df = utils.read_csv_to_dataframe(csv_file) + # header and table values should be lowercased + self.assertEqual(df.columns[0], 'header') + self.assertEqual(df['header'][0], 'a') + self.assertEqual(df['header'][1], 'b') def test_unique_key_not_first_column(self): """utils: test success when key field is not first column.""" @@ -642,8 +642,8 @@ def test_unique_key_not_first_column(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True) + result = utils.read_csv_to_dataframe( + table_path, 'lucode').to_dict(orient='index') expected_result = { 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2, 'lucode': 1}, 2: {'desc': 'bread', 'val1': 1, 'val2': 4, 'lucode': 2}, @@ -665,7 +665,7 @@ def test_non_unique_keys(self): table_file.write(csv_text) with self.assertRaises(ValueError): - utils.build_lookup_from_csv(table_path, 'lucode', to_lower=True) + utils.read_csv_to_dataframe(table_path, 'lucode') def test_missing_key_field(self): """utils: test error is raised when missing key field.""" @@ -680,7 +680,7 @@ def test_missing_key_field(self): table_file.write(csv_text) with self.assertRaises(KeyError): - utils.build_lookup_from_csv(table_path, 'lucode', to_lower=True) + utils.read_csv_to_dataframe(table_path, 'lucode') def test_nan_holes(self): """utils: test empty strings returned when missing data is present.""" @@ -694,8 +694,8 @@ def test_nan_holes(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True) + result = utils.read_csv_to_dataframe( + table_path, 'lucode').to_dict(orient='index') expected_result = { 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2, 'lucode': 1}, 2: {'desc': '', 'val1': 1, 'val2': 4, 'lucode': 2}, @@ -716,8 +716,8 @@ def test_nan_row(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True) + result = utils.read_csv_to_dataframe( + table_path, 'lucode').to_dict(orient='index') expected_result = { 1.0: {'desc': 'corn', 'val1': 0.5, 'val2': 2, 'lucode': 1.0}, 3.0: {'desc': 'beans', 'val1': 0.5, 'val2': 4, 'lucode': 3.0}, @@ -737,8 +737,9 @@ def test_column_subset(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True, column_list=['val1', 'val2']) + result = utils.read_csv_to_dataframe( + table_path, 'lucode', + usecols=['lucode', 'val1', 'val2']).to_dict(orient='index') expected_result = { 1: {'val1': 0.5, 'val2': 2, 'lucode': 1}, @@ -760,8 +761,8 @@ def test_trailing_comma(self): with open(table_path, 'w') as table_file: 
table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True) + result = utils.read_csv_to_dataframe( + table_path, 'lucode').to_dict(orient='index') expected_result = { 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2, 'lucode': 1}, @@ -783,8 +784,8 @@ def test_trailing_comma_second_line(self): with open(table_path, 'w') as table_file: table_file.write(csv_text) - result = utils.build_lookup_from_csv( - table_path, 'lucode', to_lower=True) + result = utils.read_csv_to_dataframe( + table_path, 'lucode').to_dict(orient='index') expected_result = { 1: {'desc': 'corn', 'val1': 0.5, 'val2': 2, 'lucode': 1}, @@ -801,15 +802,15 @@ def test_results_lowercase_non_numeric(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header1,HEADER2,header3 1,2,bar 4,5,FOO """ - ).strip()) + )) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header1', to_lower=True) + lookup_dict = utils.read_csv_to_dataframe( + csv_file, 'header1').to_dict(orient='index') self.assertEqual(lookup_dict[4]['header3'], 'foo') self.assertEqual(lookup_dict[1]['header2'], 2) @@ -821,15 +822,16 @@ def test_results_uppercase_numeric_cast(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header1,HEADER2,header3,missing_column, 1,2,3, 4,FOO,bar, """ - ).strip()) + )) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header1', to_lower=False) + lookup_dict = utils.read_csv_to_dataframe( + csv_file, 'header1', + convert_cols_to_lower=False, convert_vals_to_lower=False).to_dict(orient='index') self.assertEqual(lookup_dict[4]['HEADER2'], 'FOO') self.assertEqual(lookup_dict[4]['header3'], 'bar') @@ -842,15 +844,16 @@ def test_csv_dialect_detection_semicolon_delimited(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header1;HEADER2;header3; 1;2;3; 4;FOO;bar; """ - ).strip()) + )) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header1', to_lower=False) + lookup_dict = utils.read_csv_to_dataframe( + csv_file, 'header1', + convert_cols_to_lower=False, convert_vals_to_lower=False).to_dict(orient='index') self.assertEqual(lookup_dict[4]['HEADER2'], 'FOO') self.assertEqual(lookup_dict[4]['header3'], 'bar') @@ -863,14 +866,14 @@ def test_csv_utf8_encoding(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with open(csv_file, 'w', encoding='utf-8') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header1,HEADER2,header3 1,2,bar 4,5,FOO """ - ).strip()) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header1') + )) + lookup_dict = utils.read_csv_to_dataframe( + csv_file, 'header1').to_dict(orient='index') self.assertEqual(lookup_dict[4]['header2'], 5) self.assertEqual(lookup_dict[4]['header3'], 'foo') self.assertEqual(lookup_dict[1]['header1'], 1) @@ -883,18 +886,18 @@ def test_csv_utf8_bom_encoding(self): # writing with utf-8-sig will prepend the BOM with open(csv_file, 'w', encoding='utf-8-sig') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header1,HEADER2,header3 1,2,bar 4,5,FOO """ - ).strip()) + )) # confirm that the file has the BOM prefix with open(csv_file, 'rb') as file_obj: self.assertTrue(file_obj.read().startswith(codecs.BOM_UTF8)) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header1') + lookup_dict = utils.read_csv_to_dataframe( + 
csv_file, 'header1').to_dict(orient='index') # assert the BOM prefix was correctly parsed and skipped self.assertEqual(lookup_dict[4]['header2'], 5) self.assertEqual(lookup_dict[4]['header3'], 'foo') @@ -907,15 +910,15 @@ def test_csv_latin_1_encoding(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with codecs.open(csv_file, 'w', encoding='iso-8859-1') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header 1,HEADER 2,header 3 1,2,bar1 4,5,FOO """ - ).strip()) + )) - lookup_dict = utils.build_lookup_from_csv( - csv_file, 'header 1') + lookup_dict = utils.read_csv_to_dataframe( + csv_file, 'header 1').to_dict(orient='index') self.assertEqual(lookup_dict[4]['header 2'], 5) self.assertEqual(lookup_dict[4]['header 3'], 'foo') @@ -928,14 +931,14 @@ def test_csv_error_non_utf8_character(self): csv_file = os.path.join(self.workspace_dir, 'csv.csv') with codecs.open(csv_file, 'w', encoding='iso-8859-1') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header 1,HEADER 2,header 3 1,2,bar1 4,5,FÖÖ """ - ).strip()) + )) with self.assertRaises(UnicodeDecodeError): - utils.build_lookup_from_csv(csv_file, 'header 1') + utils.read_csv_to_dataframe(csv_file, 'header 1') def test_expand_path(self): """utils: test path expansion function.""" @@ -954,69 +957,29 @@ def test_expand_path(self): f'{self.workspace_dir}{os.sep}foo.txt', utils.expand_path(f'{self.workspace_dir}{os.sep}foo.txt', base_path)) - def test_expand_path_columns(self): - """utils: test path expansion feature of read_csv_to_dataframe.""" - from natcap.invest import utils - - csv_file = os.path.join(self.workspace_dir, 'csv.csv') - with open(csv_file, 'w') as file_obj: - file_obj.write(textwrap.dedent( - f""" - bar,path - 1,foo.txt - 2,foo/bar.txt - 3,foo\\bar.txt - 4,{self.workspace_dir}/foo.txt - """ - ).strip()) - df = utils.read_csv_to_dataframe(csv_file, expand_path_cols=['path']) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo.txt', - df['path'][0]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo{os.sep}bar.txt', - df['path'][1]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo\\bar.txt', - df['path'][2]) - self.assertEqual( - f'{self.workspace_dir}{os.sep}foo.txt', - df['path'][3]) - - - -class ReadCSVToDataframeTests(unittest.TestCase): - """Tests for natcap.invest.utils.read_csv_to_dataframe.""" - - def setUp(self): - """Make temporary directory for workspace.""" - self.workspace_dir = tempfile.mkdtemp() - - def tearDown(self): - """Delete workspace.""" - shutil.rmtree(self.workspace_dir) - - def test_read_csv_to_dataframe(self): - """utils: test the default behavior""" + def test_convert_cols_to_lower(self): + """utils: test that to_lower=True makes headers lowercase""" from natcap.invest import utils csv_file = os.path.join(self.workspace_dir, 'csv.csv') with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ HEADER, A, b """ - ).strip()) - df = utils.read_csv_to_dataframe(csv_file) - # case of header and table values shouldn't change - self.assertEqual(df.columns[0], 'HEADER') - self.assertEqual(df['HEADER'][0], 'A') - self.assertEqual(df['HEADER'][1], 'b') + )) + df = utils.read_csv_to_dataframe( + csv_file, convert_cols_to_lower=True, convert_vals_to_lower=False) + # header should be lowercase + self.assertEqual(df.columns[0], 'header') + # case of table values shouldn't change + self.assertEqual(df['header'][0], 'A') + self.assertEqual(df['header'][1], 'b') - def test_to_lower(self): + def test_convert_vals_to_lower(self): """utils: test that 
to_lower=True makes headers lowercase""" from natcap.invest import utils @@ -1024,18 +987,19 @@ def test_to_lower(self): with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ HEADER, A, b """ - ).strip()) - df = utils.read_csv_to_dataframe(csv_file, to_lower=True) - # header should be lowercase - self.assertEqual(df.columns[0], 'header') - # case of table values shouldn't change - self.assertEqual(df['header'][0], 'A') - self.assertEqual(df['header'][1], 'b') + )) + df = utils.read_csv_to_dataframe( + csv_file, convert_cols_to_lower=False, convert_vals_to_lower=True) + # header should still be uppercase + self.assertEqual(df.columns[0], 'HEADER') + # case of table values should change + self.assertEqual(df['HEADER'][0], 'a') + self.assertEqual(df['HEADER'][1], 'b') def test_utf8_bom_encoding(self): """utils: test that CSV read correctly with UTF-8 BOM encoding.""" @@ -1045,12 +1009,12 @@ def test_utf8_bom_encoding(self): # writing with utf-8-sig will prepend the BOM with open(csv_file, 'w', encoding='utf-8-sig') as file_obj: file_obj.write(textwrap.dedent( - """ - header1,HEADER2,header3 + """\ + header1,header2,header3 1,2,bar 4,5,FOO """ - ).strip()) + )) # confirm that the file has the BOM prefix with open(csv_file, 'rb') as file_obj: self.assertTrue(file_obj.read().startswith(codecs.BOM_UTF8)) @@ -1058,7 +1022,7 @@ def test_utf8_bom_encoding(self): df = utils.read_csv_to_dataframe(csv_file) # assert the BOM prefix was correctly parsed and skipped self.assertEqual(df.columns[0], 'header1') - self.assertEqual(df['HEADER2'][1], 5) + self.assertEqual(df['header2'][1], 5) def test_override_default_encoding(self): """utils: test that you can override the default encoding kwarg""" @@ -1069,15 +1033,16 @@ def test_override_default_encoding(self): # encode with ISO Cyrillic, include a non-ASCII character with open(csv_file, 'w', encoding='iso8859_5') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ header, fЮЮ, bar """ - ).strip()) + )) df = utils.read_csv_to_dataframe(csv_file, encoding='iso8859_5') # with the encoding specified, special characters should work - self.assertEqual(df['header'][0], 'fЮЮ') + # and be lowercased + self.assertEqual(df['header'][0], 'fюю') self.assertEqual(df['header'][1], 'bar') def test_other_kwarg(self): @@ -1088,12 +1053,12 @@ def test_other_kwarg(self): with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ h1;h2;h3 a;b;c d;e;f """ - ).strip()) + )) # using sep=None with the default engine='python', # it should infer what the separator is df = utils.read_csv_to_dataframe(csv_file, sep=None) @@ -1114,12 +1079,12 @@ def test_csv_with_integer_headers(self): with open(csv_file, 'w') as file_obj: file_obj.write(textwrap.dedent( - """ + """\ 1,2,3 a,b,c d,e,f """ - ).strip()) + )) df = utils.read_csv_to_dataframe(csv_file) # expect headers to be strings self.assertEqual(df.columns[0], '1') @@ -1135,7 +1100,7 @@ def test_removal_whitespace(self): file_obj.write(" Col1, Col2 ,Col3 \n") file_obj.write(" val1, val2 ,val3 \n") file_obj.write(" , 2 1 , ") - df = utils.read_csv_to_dataframe(csv_file) + df = utils.read_csv_to_dataframe(csv_file, convert_cols_to_lower=False) # header should have no leading / trailing whitespace self.assertEqual(df.columns[0], 'Col1') self.assertEqual(df.columns[1], 'Col2') @@ -1148,6 +1113,36 @@ def test_removal_whitespace(self): self.assertEqual(df['Col2'][1], '2 1') self.assertEqual(df['Col3'][1], '') + def test_expand_path_columns(self): + """utils: test path expansion 
feature of read_csv_to_dataframe.""" + from natcap.invest import utils + + csv_file = os.path.join(self.workspace_dir, 'csv.csv') + with open(csv_file, 'w') as file_obj: + file_obj.write(textwrap.dedent( + f"""\ + bar,path + 1,foo.txt + 2,foo/bar.txt + 3,foo\\bar.txt + 4,{self.workspace_dir}/foo.txt + """ + )) + df = utils.read_csv_to_dataframe( + csv_file, expand_path_cols=['path'], convert_vals_to_lower=False) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo.txt', + df['path'][0]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo{os.sep}bar.txt', + df['path'][1]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo\\bar.txt', + df['path'][2]) + self.assertEqual( + f'{self.workspace_dir}{os.sep}foo.txt', + df['path'][3]) + class CreateCoordinateTransformationTests(unittest.TestCase): """Tests for natcap.invest.utils.create_coordinate_transformer."""
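
The pattern applied throughout this patch replaces ``utils.build_lookup_from_csv(table_path, key_field)`` with ``utils.read_csv_to_dataframe(table_path, key_field).to_dict(orient='index')``, which produces the same nested dictionary keyed by the index column (the index column itself is kept in each inner dict because the index is set with ``drop=False``). A minimal sketch of the equivalence; the file name and column name are hypothetical, not taken from the patch:

    from natcap.invest import utils

    # Before (removed by this patch):
    # bio_dict = utils.build_lookup_from_csv(
    #     'biophysical.csv', 'lucode', to_lower=True)

    # After: read the CSV into a DataFrame indexed by 'lucode', then convert
    # it to {lucode: {column_name: value, ...}, ...}, one inner dict per row.
    bio_dict = utils.read_csv_to_dataframe(
        'biophysical.csv', 'lucode').to_dict(orient='index')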
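
The reworked ``utils.read_csv_to_dataframe`` accepts ``index_col``, ``usecols``, ``convert_cols_to_lower``, ``convert_vals_to_lower``, and ``expand_path_cols`` in place of the old ``to_lower`` flag. A usage sketch based on the signature introduced here; the table name and columns are invented for illustration, and passing ``convert_vals_to_lower=False`` mirrors the call sites in this patch that expand path columns, so that path values are not lowercased:

    from natcap.invest import utils

    # Hypothetical table with a 'habitat' column and a 'path' column holding
    # raster paths relative to the CSV's own location.
    habitat_df = utils.read_csv_to_dataframe(
        'habitat_table.csv',
        index_col='habitat',           # index the frame by the 'habitat' column
        usecols=['habitat', 'path'],   # keep only these columns
        convert_cols_to_lower=True,    # lowercase the column headers
        convert_vals_to_lower=False,   # leave cell values (the paths) as-is
        expand_path_cols=['path'])     # expand relative paths to absolute paths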