Skip to content

Commit 9075ba8

Browse files
committed
building regression data with new response vars
1 parent 439889c commit 9075ba8

File tree

3 files changed

+124
-84
lines changed

3 files changed

+124
-84
lines changed

src/natcap/invest/recreation/recmodel_client.py

+120-81
Original file line numberDiff line numberDiff line change
@@ -348,19 +348,15 @@
348348
# Have 5 seconds between timed progress outputs
349349
LOGGER_TIME_DELAY = 5
350350

351-
# For now, this is the field name we use to mark the photo user "days"
352-
RESPONSE_ID_DICT = {
353-
'photos': 'PUD_YR_AVG',
354-
'tweets': 'TUD_YR_AVG'
355-
}
351+
RESPONSE_VARIABLE_ID = 'sumPUD_TUD'
356352
SCENARIO_RESPONSE_ID = 'UD_EST'
357353

358354
_OUTPUT_BASE_FILES = {
359355
'pud_results_path': 'pud_results.shp',
360356
'pud_monthly_table_path': 'pud_monthly_table.csv',
361357
'tud_results_path': 'tud_results.shp',
362358
'tud_monthly_table_path': 'tud_monthly_table.csv',
363-
'predictor_vector_path': 'predictor_data.shp',
359+
'regression_vector_path': 'regression_data.shp',
364360
'scenario_results_path': 'scenario_results.shp',
365361
'regression_coefficients': 'regression_coefficients.txt',
366362
}
@@ -525,35 +521,44 @@ def execute(args):
525521
task_name='prepare response polygons for geoprocessing')
526522

527523
# Build predictor data
528-
build_predictor_data_task = _schedule_predictor_data_processing(
524+
predictor_task_list, predictor_json_list = _schedule_predictor_data_processing(
529525
file_registry['local_aoi_path'],
530526
file_registry['response_polygons_lookup'],
531527
prepare_response_polygons_task,
532528
args['predictor_table_path'],
533-
file_registry['predictor_vector_path'],
529+
# file_registry['predictor_vector_path'],
534530
intermediate_dir, task_graph)
535531

532+
assemble_regression_data_task = task_graph.add_task(
533+
func=_assemble_regression_data,
534+
args=(file_registry['pud_results_path'],
535+
file_registry['tud_results_path'],
536+
predictor_json_list,
537+
file_registry['regression_vector_path']),
538+
target_path_list=[file_registry['regression_vector_path']],
539+
dependent_task_list=predictor_task_list + [user_days_task],
540+
task_name='assemble predictor data')
541+
536542
# Compute the regression
537543
coefficient_json_path = os.path.join(
538544
intermediate_dir, 'predictor_estimates.json')
539545
compute_regression_task = task_graph.add_task(
540546
func=_compute_and_summarize_regression,
541-
args=(file_registry['results_path'],
542-
RESPONSE_ID_DICT[args['visitation_proxy']],
543-
file_registry['predictor_vector_path'],
547+
args=(file_registry['regression_vector_path'],
548+
RESPONSE_VARIABLE_ID,
549+
args['predictor_table_path'],
544550
file_registry['server_version'],
545551
coefficient_json_path,
546552
file_registry['regression_coefficients']),
547553
target_path_list=[file_registry['regression_coefficients'],
548554
coefficient_json_path],
549-
dependent_task_list=[
550-
user_days_task, build_predictor_data_task],
555+
dependent_task_list=[assemble_regression_data_task],
551556
task_name='compute regression')
552557

553558
if ('scenario_predictor_table_path' in args and
554559
args['scenario_predictor_table_path'] != ''):
555560
utils.make_directories([scenario_dir])
556-
build_scenario_data_task = _schedule_predictor_data_processing(
561+
build_scenario_data_task, predictor_json_list = _schedule_predictor_data_processing(
557562
file_registry['local_aoi_path'],
558563
file_registry['response_polygons_lookup'],
559564
prepare_response_polygons_task,
@@ -637,8 +642,8 @@ def _retrieve_user_days(
637642

638643
dataset_list = ['flickr', 'twitter']
639644
acronym_lookup = {
640-
'flickr': 'pud',
641-
'twitter': 'tud'
645+
'flickr': 'PUD',
646+
'twitter': 'TUD'
642647
}
643648
results = recmodel_manager.calculate_userdays(
644649
zip_file_binary, start_year, end_year, dataset_list)
@@ -791,7 +796,7 @@ def _generate_polygon(col_index, row_index):
791796
def _schedule_predictor_data_processing(
792797
response_vector_path, response_polygons_pickle_path,
793798
prepare_response_polygons_task,
794-
predictor_table_path, out_predictor_vector_path,
799+
predictor_table_path,
795800
working_dir, task_graph):
796801
"""Summarize spatial predictor data by polygons in the response vector.
797802
@@ -891,15 +896,16 @@ def _schedule_predictor_data_processing(
891896
dependent_task_list=[prepare_response_polygons_task],
892897
task_name=f'predictor {predictor_id}'))
893898

894-
assemble_predictor_data_task = task_graph.add_task(
895-
func=_json_to_shp_table,
896-
args=(response_vector_path, out_predictor_vector_path,
897-
predictor_json_list),
898-
target_path_list=[out_predictor_vector_path],
899-
dependent_task_list=predictor_task_list,
900-
task_name='assemble predictor data')
899+
return predictor_task_list, predictor_json_list
900+
# assemble_predictor_data_task = task_graph.add_task(
901+
# func=_json_to_shp_table,
902+
# args=(response_vector_path, out_predictor_vector_path,
903+
# predictor_json_list),
904+
# target_path_list=[out_predictor_vector_path],
905+
# dependent_task_list=predictor_task_list,
906+
# task_name='assemble predictor data')
901907

902-
return assemble_predictor_data_task
908+
# return assemble_predictor_data_task
903909

904910

905911
def _prepare_response_polygons_lookup(
@@ -918,52 +924,86 @@ def _prepare_response_polygons_lookup(
918924
pickle.dump(response_polygons_lookup, pickle_file)
919925

920926

921-
def _json_to_shp_table(
922-
response_vector_path, predictor_vector_path,
923-
predictor_json_list):
924-
"""Create a shapefile and a field with data from each json file.
927+
def _assemble_regression_data(
928+
pud_vector_path, tud_vector_path,
929+
predictor_json_list, target_vector_path):
930+
"""Create a vector with data for each predictor and response variables.
925931
926932
Args:
927-
response_vector_path (string): Path to the response vector polygon
928-
shapefile.
929-
predictor_vector_path (string): a copy of ``response_vector_path``.
930-
One field will be added for each json file, and all other
931-
fields will be deleted.
933+
pud_vector_path (string): Path to the vector polygon
934+
layer with PUD_YR_AVG.
935+
tud_vector_path (string): Path to the vector polygon
936+
layer with TUD_YR_AVG.
932937
predictor_json_list (list): list of json filenames, one for each
933938
predictor dataset. A json file will look like this,
934939
{0: 0.0, 1: 0.0}
935940
Keys match FIDs of ``response_vector_path``.
941+
target_vector_path (string): a copy of the geometry from ``pud_vector_path``.
942+
Fields include all data needed to compute the linear regression:
943+
* one field for each predictor,
944+
* PUD_YR_AVG
945+
* TUD_YR_AVG
946+
* PUD_plus_TUD (the response variable for linear regression)
936947
937948
Returns:
938949
None
939950
940951
"""
941952
driver = gdal.GetDriverByName('ESRI Shapefile')
942-
if os.path.exists(predictor_vector_path):
943-
driver.Delete(predictor_vector_path)
944-
response_vector = gdal.OpenEx(
945-
response_vector_path, gdal.OF_VECTOR | gdal.GA_Update)
946-
predictor_vector = driver.CreateCopy(
947-
predictor_vector_path, response_vector)
948-
response_vector = None
949-
950-
layer = predictor_vector.GetLayer()
953+
if os.path.exists(target_vector_path):
954+
driver.Delete(target_vector_path)
955+
pud_vector = gdal.OpenEx(
956+
pud_vector_path, gdal.OF_VECTOR | gdal.GA_Update)
957+
target_vector = driver.CreateCopy(
958+
target_vector_path, pud_vector)
959+
tud_vector = gdal.OpenEx(
960+
tud_vector_path, gdal.OF_VECTOR | gdal.GA_Update)
961+
tud_layer = tud_vector.GetLayer()
962+
963+
layer = target_vector.GetLayer()
951964
layer_defn = layer.GetLayerDefn()
952965

966+
def _create_field(fieldname):
967+
# Create a new field for the predictor
968+
# Delete the field first if it already exists
969+
field_index = layer.FindFieldIndex(
970+
str(fieldname), 1)
971+
if field_index >= 0:
972+
layer.DeleteField(field_index)
973+
field = ogr.FieldDefn(str(fieldname), ogr.OFTReal)
974+
field.SetWidth(24)
975+
field.SetPrecision(11)
976+
layer.CreateField(field)
977+
978+
tud_variable_id = 'TUD_YR_AVG'
979+
pud_variable_id = 'PUD_YR_AVG'
980+
_create_field(tud_variable_id)
981+
_create_field(RESPONSE_VARIABLE_ID)
982+
983+
for feature in layer:
984+
tud_feature = tud_layer.GetFeature(feature.GetFID())
985+
tud_yr_avg = tud_feature.GetField(tud_variable_id)
986+
feature.SetField(tud_variable_id, tud_yr_avg)
987+
feature.SetField(
988+
RESPONSE_VARIABLE_ID,
989+
feature.GetField(pud_variable_id) + tud_yr_avg)
990+
layer.SetFeature(feature)
991+
953992
predictor_id_list = []
954993
for json_filename in predictor_json_list:
955994
predictor_id = os.path.basename(os.path.splitext(json_filename)[0])
956995
predictor_id_list.append(predictor_id)
957996
# Create a new field for the predictor
958997
# Delete the field first if it already exists
959-
field_index = layer.FindFieldIndex(
960-
str(predictor_id), 1)
961-
if field_index >= 0:
962-
layer.DeleteField(field_index)
963-
predictor_field = ogr.FieldDefn(str(predictor_id), ogr.OFTReal)
964-
predictor_field.SetWidth(24)
965-
predictor_field.SetPrecision(11)
966-
layer.CreateField(predictor_field)
998+
# field_index = layer.FindFieldIndex(
999+
# str(predictor_id), 1)
1000+
# if field_index >= 0:
1001+
# layer.DeleteField(field_index)
1002+
# predictor_field = ogr.FieldDefn(str(predictor_id), ogr.OFTReal)
1003+
# predictor_field.SetWidth(24)
1004+
# predictor_field.SetPrecision(11)
1005+
# layer.CreateField(predictor_field)
1006+
_create_field(predictor_id)
9671007

9681008
with open(json_filename, 'r') as file:
9691009
predictor_results = json.load(file)
@@ -973,20 +1013,22 @@ def _json_to_shp_table(
9731013
layer.SetFeature(feature)
9741014

9751015
# Get all the fieldnames. If they are not in the predictor_id_list,
976-
# get their index and delete
1016+
# or the userday variables, find and delete them.
1017+
field_list = predictor_id_list + [
1018+
RESPONSE_VARIABLE_ID, tud_variable_id, pud_variable_id]
9771019
n_fields = layer_defn.GetFieldCount()
9781020
fieldnames = []
9791021
for idx in range(n_fields):
9801022
field_defn = layer_defn.GetFieldDefn(idx)
9811023
fieldnames.append(field_defn.GetName())
9821024
for field_name in fieldnames:
983-
if field_name not in predictor_id_list:
1025+
if field_name not in field_list:
9841026
idx = layer.FindFieldIndex(field_name, 1)
9851027
layer.DeleteField(idx)
9861028
layer_defn = None
9871029
layer = None
988-
predictor_vector.FlushCache()
989-
predictor_vector = None
1030+
target_vector.FlushCache()
1031+
target_vector = None
9901032

9911033

9921034
def _raster_sum_mean(
@@ -1266,13 +1308,13 @@ def _ogr_to_geometry_list(vector_path):
12661308

12671309

12681310
def _compute_and_summarize_regression(
1269-
response_vector_path, response_id, predictor_vector_path, server_version_path,
1311+
data_vector_path, response_id, predictor_table_path, server_version_path,
12701312
target_coefficient_json_path, target_regression_summary_path):
12711313
"""Compute a regression and summary statistics and generate a report.
12721314
12731315
Args:
1274-
response_vector_path (string): path to polygon vector containing the
1275-
RESPONSE_ID field.
1316+
data_vector_path (string): path to polygon vector containing the
1317+
RESPONSE_ID field and predictor data
12761318
predictor_vector_path (string): path to polygon vector containing
12771319
fields for each predictor variable. Geometry is identical to that
12781320
of 'response_vector_path'.
@@ -1288,9 +1330,13 @@ def _compute_and_summarize_regression(
12881330
None
12891331
12901332
"""
1333+
predictor_df = validation.get_validated_dataframe(
1334+
predictor_table_path, **MODEL_SPEC['args']['predictor_table_path'])
1335+
predictor_list = predictor_df.index
1336+
import pdb; pdb.set_trace()
12911337
predictor_id_list, coefficients, ssres, r_sq, r_sq_adj, std_err, dof, se_est = (
12921338
_build_regression(
1293-
response_vector_path, predictor_vector_path, response_id))
1339+
data_vector_path, predictor_list, response_id))
12941340

12951341
# Generate a nice looking regression result and write to log and file
12961342
coefficients_string = ' estimate stderr t value\n'
@@ -1331,7 +1377,7 @@ def _compute_and_summarize_regression(
13311377

13321378

13331379
def _build_regression(
1334-
response_vector_path, predictor_vector_path,
1380+
data_vector_path, predictor_id_list,
13351381
response_id):
13361382
"""Multiple least-squares regression with log-transformed response.
13371383
@@ -1368,44 +1414,37 @@ def _build_regression(
13681414
13691415
"""
13701416
LOGGER.info("Computing regression")
1371-
response_vector = gdal.OpenEx(response_vector_path, gdal.OF_VECTOR)
1372-
response_layer = response_vector.GetLayer()
1373-
1374-
predictor_vector = gdal.OpenEx(predictor_vector_path, gdal.OF_VECTOR)
1375-
predictor_layer = predictor_vector.GetLayer()
1376-
predictor_layer_defn = predictor_layer.GetLayerDefn()
1417+
data_vector = gdal.OpenEx(data_vector_path, gdal.OF_VECTOR)
1418+
data_layer = data_vector.GetLayer()
13771419

1378-
n_features = predictor_layer.GetFeatureCount()
1379-
# Not sure what would cause this to be untrue, but if it ever is,
1380-
# we sure want to know about it.
1381-
assert(n_features == response_layer.GetFeatureCount())
1420+
n_features = data_layer.GetFeatureCount()
13821421

13831422
# Response data matrix
13841423
response_array = numpy.empty((n_features, 1))
1385-
for row_index, feature in enumerate(response_layer):
1424+
for row_index, feature in enumerate(data_layer):
13861425
response_array[row_index, :] = feature.GetField(str(response_id))
13871426
response_array = numpy.log1p(response_array)
13881427

13891428
# Y-Intercept data matrix
13901429
intercept_array = numpy.ones_like(response_array)
13911430

13921431
# Predictor data matrix
1393-
n_predictors = predictor_layer_defn.GetFieldCount()
1394-
predictor_matrix = numpy.empty((n_features, n_predictors))
1395-
predictor_names = []
1396-
for idx in range(n_predictors):
1397-
field_defn = predictor_layer_defn.GetFieldDefn(idx)
1398-
field_name = field_defn.GetName()
1399-
predictor_names.append(field_name)
1400-
for row_index, feature in enumerate(predictor_layer):
1432+
# n_predictors = predictor_layer_defn.GetFieldCount()
1433+
predictor_matrix = numpy.empty((n_features, len(predictor_id_list)))
1434+
# predictor_names = []
1435+
# for idx in range(n_predictors):
1436+
# field_defn = predictor_layer_defn.GetFieldDefn(idx)
1437+
# field_name = field_defn.GetName()
1438+
# predictor_names.append(field_name)
1439+
for row_index, feature in enumerate(data_layer):
14011440
predictor_matrix[row_index, :] = numpy.array(
1402-
[feature.GetField(str(key)) for key in predictor_names])
1441+
[feature.GetField(str(key)) for key in predictor_id_list])
14031442

14041443
# If some predictor has no data across all features, drop that predictor:
14051444
valid_pred = ~numpy.isnan(predictor_matrix).all(axis=0)
14061445
predictor_matrix = predictor_matrix[:, valid_pred]
14071446
predictor_names = [
1408-
pred for (pred, valid) in zip(predictor_names, valid_pred)
1447+
pred for (pred, valid) in zip(predictor_id_list, valid_pred)
14091448
if valid]
14101449
n_predictors = predictor_matrix.shape[1]
14111450

src/natcap/invest/recreation/recmodel_server.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def __init__(
153153
# self.global_cache_dir = global_cache
154154
self.min_year = min_year
155155
self.max_year = max_year
156-
self.acronym = 'pud' if dataset_name == 'flickr' else 'tud'
156+
self.acronym = 'PUD' if dataset_name == 'flickr' else 'TUD'
157157

158158
def get_valid_year_range(self):
159159
"""Return the min and max year queriable.

tests/test_recreation.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,8 @@ def setUp(self):
507507
"""Setup workspace."""
508508
from natcap.invest.recreation import recmodel_server
509509

510-
self.workspace_dir = tempfile.mkdtemp()
510+
# self.workspace_dir = tempfile.mkdtemp()
511+
self.workspace_dir = 'test_ws'
511512
self.resampled_data_path = os.path.join(
512513
self.workspace_dir, 'resampled_data.csv')
513514
_resample_csv(
@@ -554,7 +555,7 @@ def setUp(self):
554555
def tearDown(self):
555556
"""Delete workspace."""
556557
self.server_process.terminate()
557-
shutil.rmtree(self.workspace_dir, ignore_errors=True)
558+
# shutil.rmtree(self.workspace_dir, ignore_errors=True)
558559

559560
def test_all_metrics_local_server(self):
560561
"""Recreation test with all but trivial predictor metrics.

0 commit comments

Comments
 (0)