348
348
# Have 5 seconds between timed progress outputs
349
349
LOGGER_TIME_DELAY = 5
350
350
351
- # For now, this is the field name we use to mark the photo user "days"
352
- RESPONSE_ID_DICT = {
353
- 'photos' : 'PUD_YR_AVG' ,
354
- 'tweets' : 'TUD_YR_AVG'
355
- }
351
+ RESPONSE_VARIABLE_ID = 'sumPUD_TUD'
356
352
SCENARIO_RESPONSE_ID = 'UD_EST'
357
353
358
354
_OUTPUT_BASE_FILES = {
359
355
'pud_results_path' : 'pud_results.shp' ,
360
356
'pud_monthly_table_path' : 'pud_monthly_table.csv' ,
361
357
'tud_results_path' : 'tud_results.shp' ,
362
358
'tud_monthly_table_path' : 'tud_monthly_table.csv' ,
363
- 'predictor_vector_path ' : 'predictor_data .shp' ,
359
+ 'regression_vector_path ' : 'regression_data .shp' ,
364
360
'scenario_results_path' : 'scenario_results.shp' ,
365
361
'regression_coefficients' : 'regression_coefficients.txt' ,
366
362
}
@@ -525,35 +521,44 @@ def execute(args):
525
521
task_name = 'prepare response polygons for geoprocessing' )
526
522
527
523
# Build predictor data
528
- build_predictor_data_task = _schedule_predictor_data_processing (
524
+ predictor_task_list , predictor_json_list = _schedule_predictor_data_processing (
529
525
file_registry ['local_aoi_path' ],
530
526
file_registry ['response_polygons_lookup' ],
531
527
prepare_response_polygons_task ,
532
528
args ['predictor_table_path' ],
533
- file_registry ['predictor_vector_path' ],
529
+ # file_registry['predictor_vector_path'],
534
530
intermediate_dir , task_graph )
535
531
532
+ assemble_regression_data_task = task_graph .add_task (
533
+ func = _assemble_regression_data ,
534
+ args = (file_registry ['pud_results_path' ],
535
+ file_registry ['tud_results_path' ],
536
+ predictor_json_list ,
537
+ file_registry ['regression_vector_path' ]),
538
+ target_path_list = [file_registry ['regression_vector_path' ]],
539
+ dependent_task_list = predictor_task_list + [user_days_task ],
540
+ task_name = 'assemble predictor data' )
541
+
536
542
# Compute the regression
537
543
coefficient_json_path = os .path .join (
538
544
intermediate_dir , 'predictor_estimates.json' )
539
545
compute_regression_task = task_graph .add_task (
540
546
func = _compute_and_summarize_regression ,
541
- args = (file_registry ['results_path ' ],
542
- RESPONSE_ID_DICT [ args [ 'visitation_proxy' ]] ,
543
- file_registry [ 'predictor_vector_path ' ],
547
+ args = (file_registry ['regression_vector_path ' ],
548
+ RESPONSE_VARIABLE_ID ,
549
+ args [ 'predictor_table_path ' ],
544
550
file_registry ['server_version' ],
545
551
coefficient_json_path ,
546
552
file_registry ['regression_coefficients' ]),
547
553
target_path_list = [file_registry ['regression_coefficients' ],
548
554
coefficient_json_path ],
549
- dependent_task_list = [
550
- user_days_task , build_predictor_data_task ],
555
+ dependent_task_list = [assemble_regression_data_task ],
551
556
task_name = 'compute regression' )
552
557
553
558
if ('scenario_predictor_table_path' in args and
554
559
args ['scenario_predictor_table_path' ] != '' ):
555
560
utils .make_directories ([scenario_dir ])
556
- build_scenario_data_task = _schedule_predictor_data_processing (
561
+ build_scenario_data_task , predictor_json_list = _schedule_predictor_data_processing (
557
562
file_registry ['local_aoi_path' ],
558
563
file_registry ['response_polygons_lookup' ],
559
564
prepare_response_polygons_task ,
@@ -637,8 +642,8 @@ def _retrieve_user_days(
637
642
638
643
dataset_list = ['flickr' , 'twitter' ]
639
644
acronym_lookup = {
640
- 'flickr' : 'pud ' ,
641
- 'twitter' : 'tud '
645
+ 'flickr' : 'PUD ' ,
646
+ 'twitter' : 'TUD '
642
647
}
643
648
results = recmodel_manager .calculate_userdays (
644
649
zip_file_binary , start_year , end_year , dataset_list )
@@ -791,7 +796,7 @@ def _generate_polygon(col_index, row_index):
791
796
def _schedule_predictor_data_processing (
792
797
response_vector_path , response_polygons_pickle_path ,
793
798
prepare_response_polygons_task ,
794
- predictor_table_path , out_predictor_vector_path ,
799
+ predictor_table_path ,
795
800
working_dir , task_graph ):
796
801
"""Summarize spatial predictor data by polygons in the response vector.
797
802
@@ -891,15 +896,16 @@ def _schedule_predictor_data_processing(
891
896
dependent_task_list = [prepare_response_polygons_task ],
892
897
task_name = f'predictor { predictor_id } ' ))
893
898
894
- assemble_predictor_data_task = task_graph .add_task (
895
- func = _json_to_shp_table ,
896
- args = (response_vector_path , out_predictor_vector_path ,
897
- predictor_json_list ),
898
- target_path_list = [out_predictor_vector_path ],
899
- dependent_task_list = predictor_task_list ,
900
- task_name = 'assemble predictor data' )
899
+ return predictor_task_list , predictor_json_list
900
+ # assemble_predictor_data_task = task_graph.add_task(
901
+ # func=_json_to_shp_table,
902
+ # args=(response_vector_path, out_predictor_vector_path,
903
+ # predictor_json_list),
904
+ # target_path_list=[out_predictor_vector_path],
905
+ # dependent_task_list=predictor_task_list,
906
+ # task_name='assemble predictor data')
901
907
902
- return assemble_predictor_data_task
908
+ # return assemble_predictor_data_task
903
909
904
910
905
911
def _prepare_response_polygons_lookup (
@@ -918,52 +924,86 @@ def _prepare_response_polygons_lookup(
918
924
pickle .dump (response_polygons_lookup , pickle_file )
919
925
920
926
921
- def _json_to_shp_table (
922
- response_vector_path , predictor_vector_path ,
923
- predictor_json_list ):
924
- """Create a shapefile and a field with data from each json file .
927
+ def _assemble_regression_data (
928
+ pud_vector_path , tud_vector_path ,
929
+ predictor_json_list , target_vector_path ):
930
+ """Create a vector with data for each predictor and response variables .
925
931
926
932
Args:
927
- response_vector_path (string): Path to the response vector polygon
928
- shapefile.
929
- predictor_vector_path (string): a copy of ``response_vector_path``.
930
- One field will be added for each json file, and all other
931
- fields will be deleted.
933
+ pud_vector_path (string): Path to the vector polygon
934
+ layer with PUD_YR_AVG.
935
+ tud_vector_path (string): Path to the vector polygon
936
+ layer with TUD_YR_AVG.
932
937
predictor_json_list (list): list of json filenames, one for each
933
938
predictor dataset. A json file will look like this,
934
939
{0: 0.0, 1: 0.0}
935
940
Keys match FIDs of the features in ``pud_vector_path``.
941
+ target_vector_path (string): path to the output vector, created as a copy of ``pud_vector_path``.
942
+ Fields include all data needed to compute the linear regression:
943
+ * one field for each predictor,
944
+ * PUD_YR_AVG
945
+ * TUD_YR_AVG
946
+ * sumPUD_TUD (the response variable for linear regression)
936
947
937
948
Returns:
938
949
None
939
950
940
951
"""
941
952
driver = gdal .GetDriverByName ('ESRI Shapefile' )
942
- if os .path .exists (predictor_vector_path ):
943
- driver .Delete (predictor_vector_path )
944
- response_vector = gdal .OpenEx (
945
- response_vector_path , gdal .OF_VECTOR | gdal .GA_Update )
946
- predictor_vector = driver .CreateCopy (
947
- predictor_vector_path , response_vector )
948
- response_vector = None
949
-
950
- layer = predictor_vector .GetLayer ()
953
+ if os .path .exists (target_vector_path ):
954
+ driver .Delete (target_vector_path )
955
+ pud_vector = gdal .OpenEx (
956
+ pud_vector_path , gdal .OF_VECTOR | gdal .GA_Update )
957
+ target_vector = driver .CreateCopy (
958
+ target_vector_path , pud_vector )
959
+ tud_vector = gdal .OpenEx (
960
+ tud_vector_path , gdal .OF_VECTOR | gdal .GA_Update )
961
+ tud_layer = tud_vector .GetLayer ()
962
+
963
+ layer = target_vector .GetLayer ()
951
964
layer_defn = layer .GetLayerDefn ()
952
965
966
+ def _create_field (fieldname ):
967
+ # Create a new field for the predictor
968
+ # Delete the field first if it already exists
969
+ field_index = layer .FindFieldIndex (
970
+ str (fieldname ), 1 )
971
+ if field_index >= 0 :
972
+ layer .DeleteField (field_index )
973
+ field = ogr .FieldDefn (str (fieldname ), ogr .OFTReal )
974
+ field .SetWidth (24 )
975
+ field .SetPrecision (11 )
976
+ layer .CreateField (field )
977
+
978
+ tud_variable_id = 'TUD_YR_AVG'
979
+ pud_variable_id = 'PUD_YR_AVG'
980
+ _create_field (tud_variable_id )
981
+ _create_field (RESPONSE_VARIABLE_ID )
982
+
983
+ for feature in layer :
984
+ tud_feature = tud_layer .GetFeature (feature .GetFID ())
985
+ tud_yr_avg = tud_feature .GetField (tud_variable_id )
986
+ feature .SetField (tud_variable_id , tud_yr_avg )
987
+ feature .SetField (
988
+ RESPONSE_VARIABLE_ID ,
989
+ feature .GetField (pud_variable_id ) + tud_yr_avg )
990
+ layer .SetFeature (feature )
991
+
953
992
predictor_id_list = []
954
993
for json_filename in predictor_json_list :
955
994
predictor_id = os .path .basename (os .path .splitext (json_filename )[0 ])
956
995
predictor_id_list .append (predictor_id )
957
996
# Create a new field for the predictor
958
997
# Delete the field first if it already exists
959
- field_index = layer .FindFieldIndex (
960
- str (predictor_id ), 1 )
961
- if field_index >= 0 :
962
- layer .DeleteField (field_index )
963
- predictor_field = ogr .FieldDefn (str (predictor_id ), ogr .OFTReal )
964
- predictor_field .SetWidth (24 )
965
- predictor_field .SetPrecision (11 )
966
- layer .CreateField (predictor_field )
998
+ # field_index = layer.FindFieldIndex(
999
+ # str(predictor_id), 1)
1000
+ # if field_index >= 0:
1001
+ # layer.DeleteField(field_index)
1002
+ # predictor_field = ogr.FieldDefn(str(predictor_id), ogr.OFTReal)
1003
+ # predictor_field.SetWidth(24)
1004
+ # predictor_field.SetPrecision(11)
1005
+ # layer.CreateField(predictor_field)
1006
+ _create_field (predictor_id )
967
1007
968
1008
with open (json_filename , 'r' ) as file :
969
1009
predictor_results = json .load (file )
@@ -973,20 +1013,22 @@ def _json_to_shp_table(
973
1013
layer .SetFeature (feature )
974
1014
975
1015
# Get all the fieldnames. If they are not in the predictor_id_list,
976
- # get their index and delete
1016
+ # or the userday variables, find and delete them.
1017
+ field_list = predictor_id_list + [
1018
+ RESPONSE_VARIABLE_ID , tud_variable_id , pud_variable_id ]
977
1019
n_fields = layer_defn .GetFieldCount ()
978
1020
fieldnames = []
979
1021
for idx in range (n_fields ):
980
1022
field_defn = layer_defn .GetFieldDefn (idx )
981
1023
fieldnames .append (field_defn .GetName ())
982
1024
for field_name in fieldnames :
983
- if field_name not in predictor_id_list :
1025
+ if field_name not in field_list :
984
1026
idx = layer .FindFieldIndex (field_name , 1 )
985
1027
layer .DeleteField (idx )
986
1028
layer_defn = None
987
1029
layer = None
988
- predictor_vector .FlushCache ()
989
- predictor_vector = None
1030
+ target_vector .FlushCache ()
1031
+ target_vector = None
990
1032
991
1033
992
1034
def _raster_sum_mean (
@@ -1266,13 +1308,13 @@ def _ogr_to_geometry_list(vector_path):
1266
1308
1267
1309
1268
1310
def _compute_and_summarize_regression (
1269
- response_vector_path , response_id , predictor_vector_path , server_version_path ,
1311
+ data_vector_path , response_id , predictor_table_path , server_version_path ,
1270
1312
target_coefficient_json_path , target_regression_summary_path ):
1271
1313
"""Compute a regression and summary statistics and generate a report.
1272
1314
1273
1315
Args:
1274
- response_vector_path (string): path to polygon vector containing the
1275
- RESPONSE_ID field.
1316
+ data_vector_path (string): path to polygon vector containing the
1317
+ RESPONSE_ID field and predictor data
1276
1318
predictor_table_path (string): path to the predictor table. It is
1277
1319
validated against ``MODEL_SPEC`` and its index supplies the
1278
1320
predictor ids whose fields are read from ``data_vector_path``.
@@ -1288,9 +1330,13 @@ def _compute_and_summarize_regression(
1288
1330
None
1289
1331
1290
1332
"""
1333
+ predictor_df = validation .get_validated_dataframe (
1334
+ predictor_table_path , ** MODEL_SPEC ['args' ]['predictor_table_path' ])
1335
+ predictor_list = predictor_df .index
1336
+ import pdb ; pdb .set_trace ()
1291
1337
predictor_id_list , coefficients , ssres , r_sq , r_sq_adj , std_err , dof , se_est = (
1292
1338
_build_regression (
1293
- response_vector_path , predictor_vector_path , response_id ))
1339
+ data_vector_path , predictor_list , response_id ))
1294
1340
1295
1341
# Generate a nice looking regression result and write to log and file
1296
1342
coefficients_string = ' estimate stderr t value\n '
@@ -1331,7 +1377,7 @@ def _compute_and_summarize_regression(
1331
1377
1332
1378
1333
1379
def _build_regression (
1334
- response_vector_path , predictor_vector_path ,
1380
+ data_vector_path , predictor_id_list ,
1335
1381
response_id ):
1336
1382
"""Multiple least-squares regression with log-transformed response.
1337
1383
@@ -1368,44 +1414,37 @@ def _build_regression(
1368
1414
1369
1415
"""
1370
1416
LOGGER .info ("Computing regression" )
1371
- response_vector = gdal .OpenEx (response_vector_path , gdal .OF_VECTOR )
1372
- response_layer = response_vector .GetLayer ()
1373
-
1374
- predictor_vector = gdal .OpenEx (predictor_vector_path , gdal .OF_VECTOR )
1375
- predictor_layer = predictor_vector .GetLayer ()
1376
- predictor_layer_defn = predictor_layer .GetLayerDefn ()
1417
+ data_vector = gdal .OpenEx (data_vector_path , gdal .OF_VECTOR )
1418
+ data_layer = data_vector .GetLayer ()
1377
1419
1378
- n_features = predictor_layer .GetFeatureCount ()
1379
- # Not sure what would cause this to be untrue, but if it ever is,
1380
- # we sure want to know about it.
1381
- assert (n_features == response_layer .GetFeatureCount ())
1420
+ n_features = data_layer .GetFeatureCount ()
1382
1421
1383
1422
# Response data matrix
1384
1423
response_array = numpy .empty ((n_features , 1 ))
1385
- for row_index , feature in enumerate (response_layer ):
1424
+ for row_index , feature in enumerate (data_layer ):
1386
1425
response_array [row_index , :] = feature .GetField (str (response_id ))
1387
1426
response_array = numpy .log1p (response_array )
1388
1427
1389
1428
# Y-Intercept data matrix
1390
1429
intercept_array = numpy .ones_like (response_array )
1391
1430
1392
1431
# Predictor data matrix
1393
- n_predictors = predictor_layer_defn .GetFieldCount ()
1394
- predictor_matrix = numpy .empty ((n_features , n_predictors ))
1395
- predictor_names = []
1396
- for idx in range (n_predictors ):
1397
- field_defn = predictor_layer_defn .GetFieldDefn (idx )
1398
- field_name = field_defn .GetName ()
1399
- predictor_names .append (field_name )
1400
- for row_index , feature in enumerate (predictor_layer ):
1432
+ # n_predictors = predictor_layer_defn.GetFieldCount()
1433
+ predictor_matrix = numpy .empty ((n_features , len ( predictor_id_list ) ))
1434
+ # predictor_names = []
1435
+ # for idx in range(n_predictors):
1436
+ # field_defn = predictor_layer_defn.GetFieldDefn(idx)
1437
+ # field_name = field_defn.GetName()
1438
+ # predictor_names.append(field_name)
1439
+ for row_index , feature in enumerate (data_layer ):
1401
1440
predictor_matrix [row_index , :] = numpy .array (
1402
- [feature .GetField (str (key )) for key in predictor_names ])
1441
+ [feature .GetField (str (key )) for key in predictor_id_list ])
1403
1442
1404
1443
# If some predictor has no data across all features, drop that predictor:
1405
1444
valid_pred = ~ numpy .isnan (predictor_matrix ).all (axis = 0 )
1406
1445
predictor_matrix = predictor_matrix [:, valid_pred ]
1407
1446
predictor_names = [
1408
- pred for (pred , valid ) in zip (predictor_names , valid_pred )
1447
+ pred for (pred , valid ) in zip (predictor_id_list , valid_pred )
1409
1448
if valid ]
1410
1449
n_predictors = predictor_matrix .shape [1 ]
1411
1450
0 commit comments