-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstage2_setup_gpkg.sh
executable file
·184 lines (168 loc) · 7.7 KB
/
stage2_setup_gpkg.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env bash
#
# Set up a working GeoPackage database for use with QGIS in creating subsets.
# It's must faster to do these operations by directly running spatial queries
# against the GeoPackage and creating new tables vs calculating centroids or
# subsets directly in R / Python / QGIS.
#
# Contact: Edgar Castro <edgar_castro@g.harvard.edu>
# Input directories
GEO_DIR="$(dirname "$0")/output/geo"
GRIDS_DIR="$(dirname "$0")/output/grids"
CROSSWALKS_DIR="$(dirname "$0")/output/crosswalks"
# Need to write to home then move to the current directory since SMB mounts
# don't play well with GeoPackages
WORKING_GPKG_TEMP=/home/edgar/working.gpkg
WORKING_GPKG_FINAL="$(dirname "$0")/output/working.gpkg"
REGIONS=(
NE1 "'09', '23', '25', '33', '44', '50'"
MA2 "'34', '36', '42'"
NEC3 "'17', '18', '26', '39', '55'"
WNC4 "'19', '20', '27', '29', '31', '38', '46'"
SA5 "'10', '11', '12', '13', '24', '45', '37', '51', '54'"
ESC6 "'01', '21', '28', '47'"
WSC7 "'05', '22', '40', '48'"
M8 "'04', '08', '16', '30', '32', '35', '49', '56'"
P9 "'06', '41', '53'"
)
n_regions=${#REGIONS[@]}
mkdir -p "$CROSSWALKS_DIR"
# Given a GeoPackage layer of all Census blocks in the U.S., create subsets of
# the blocks in different tables according to region
function create_regional_subsets() {
blocks_gpkg="$1" # e.g. blocks_2000.gpkg / blocks_2010.gpkg / blocks_2020.gpkg
blocks_layer="$2" # e.g. blocks_2000 / blocks_2010 / blocks_2020
geoid_field="$3" # e.g. BLKIDFP00 / GEOID10 / GEOID20
for i in $(seq 1 $[n_regions / 2])
do
region=${REGIONS[($i - 1) * 2]}
fips_codes=${REGIONS[($i - 1) * 2 + 1]}
output_table=${blocks_layer}_${region}
sql_code="SELECT *
FROM ${blocks_layer}
WHERE substr(${geoid_field}, 1, 2) IN (${fips_codes})"
echo -e "Creating table ${output_table} using:"
sed "s/^[ \t]*/> /" <<< "$sql_code"
ogr2ogr -f GPKG "$blocks_gpkg" "$blocks_gpkg" \
-nln "$output_table" \
-update \
-sql "$sql_code"
done
}
# Given a GeoPackage layer of all Census blocks in the U.S., create subsets of
# *centroids* of the blocks in different tables according to region
function create_regional_centroids() {
blocks_gpkg="$1" # e.g. blocks_2000.gpkg / blocks_2010.gpkg / blocks_2020.gpkg
blocks_layer="$2" # e.g. blocks_2000 / blocks_2010 / blocks_2020
geoid_field="$3" # e.g. BLKIDFP00 / GEOID10 / GEOID20
for i in $(seq 1 $[n_regions / 2])
do
region=${REGIONS[($i - 1) * 2]}
fips_codes=${REGIONS[($i - 1) * 2 + 1]}
output_table=${blocks_layer}_centroids_${region}
sql_code="SELECT ${geoid_field}, ST_Centroid(geom)
FROM ${blocks_layer}
WHERE substr(${geoid_field}, 1, 2) IN (${fips_codes})"
echo -e "Creating table ${output_table} using:"
sed "s/^[ \t]*/> /" <<< "$sql_code"
ogr2ogr -f GPKG "$blocks_gpkg" "$blocks_gpkg" \
-nln "$output_table" \
-update \
-sql "$sql_code"
done
}
# DEPRECATED - use QGIS or PostGIS instead (much faster)
# Point-in-polygon spatial join of grid cells within Census blocks across the
# entire U.S.
function generate_crosswalks_within() {
working_gpkg="$1" # e.g. working.gpkg
blocks_layer="$2" # e.g. blocks_2000 / blocks_2010 / blocks_2020
geoid_field="$3" # e.g. BLKIDFP00 / GEOID10 / GEOID20
output_dir="$4" # e.g. "." / crosswalks
mkdir -p "$output_dir"
for i in $(seq 1 $[n_regions / 2])
do
region=${REGIONS[($i - 1) * 2]}
fips_codes=${REGIONS[($i - 1) * 2 + 1]}
region_layer=${region}_urban
output_file="${output_dir}/${region}_within_${blocks_layer}.csv"
output_table=${blocks_layer}_centroids_${region}
sql_code="SELECT ${region_layer}.gridcell_idx, ${blocks_layer}.${geoid_field}
FROM ${region_layer}
LEFT JOIN ${blocks_layer}
ON ST_Within(${region_layer}.geom, ${blocks_layer}.geom)
WHERE substr(${blocks_layer}.BLKIDFP00, 1, 2) IN (${fips_codes})"
echo -e "Creating file ${output_file} using:"
sed "s/^[ \t]*/> /" <<< "$sql_code"
ogr2ogr -f CSV "$output_file" "$working_gpkg" -sql "$sql_code"
done
}
# DEPRECATED - use QGIS or PostGIS instead (much faster)
# Point-in-polygon spatial join of grid cells within Census blocks across the
# U.S., using the regional subsets created by `create_regional_subsets()`
function generate_crosswalks_within_subsets() {
working_gpkg="$1" # e.g. working.gpkg
blocks_layer_prefix="$2" # e.g. blocks_2000 / blocks_2010 / blocks_2020
geoid_field="$3" # e.g. BLKIDFP00 / GEOID10 / GEOID20
output_dir="$4" # e.g. "." / crosswalks
mkdir -p "$output_dir"
for i in $(seq 1 $[n_regions / 2])
do
region=${REGIONS[($i - 1) * 2]}
blocks_layer="${blocks_layer_prefix}_${region}"
fips_codes=${REGIONS[($i - 1) * 2 + 1]}
region_layer=${region}_urban
output_file="${output_dir}/${region}_within_${blocks_layer}.csv"
output_table=${blocks_layer}_centroids_${region}
sql_code="SELECT ${region_layer}.gridcell_idx, ${blocks_layer}.${geoid_field}
FROM ${region_layer}
LEFT JOIN ${blocks_layer}
ON ST_Within(${region_layer}.geom, ${blocks_layer}.geom)"
echo -e "Creating file ${output_file} using:"
sed "s/^[ \t]*/> /" <<< "$sql_code"
ogr2ogr -f CSV "$output_file" "$working_gpkg" -sql "$sql_code"
done
}
# Import layers into the working GeoPackage
#find "$GRIDS_DIR"/* "$GEO_DIR"/blocks_{2000,2010,2020}.gpkg |
echo "Importing layers"
find "$GEO_DIR"/blocks_{2000,2010,2020}.gpkg |
sort |
while read gpkg
do
echo "$gpkg"
layer_name=$(basename $gpkg .gpkg)
echo "importing ${gpkg}::${layer_name} => ${WORKING_GPKG_TEMP}"
if [ -f "$WORKING_GPKG_TEMP" ]
then
ogr2ogr -f GPKG "$WORKING_GPKG_TEMP" "$gpkg" -nln "$layer_name" -update
else
ogr2ogr -f GPKG "$WORKING_GPKG_TEMP" "$gpkg" -nln "$layer_name"
fi
done
# Create Census block subsets
echo "Creating Census block subsets"
create_regional_subsets "$WORKING_GPKG_TEMP" blocks_2000 BLKIDFP00
create_regional_subsets "$WORKING_GPKG_TEMP" blocks_2010 GEOID10
create_regional_subsets "$WORKING_GPKG_TEMP" blocks_2020 GEOID20
# Create Census block centroid subsets
echo "Creating Census block centroid subsets"
create_regional_centroids "$WORKING_GPKG_TEMP" blocks_2000 BLKIDFP00
create_regional_centroids "$WORKING_GPKG_TEMP" blocks_2010 GEOID10
create_regional_centroids "$WORKING_GPKG_TEMP" blocks_2020 GEOID20
# Drop the non-subsetted data
ogrinfo "$WORKING_GPKG_TEMP" -sql "DROP TABLE blocks_2000"
ogrinfo "$WORKING_GPKG_TEMP" -sql "DROP TABLE blocks_2010"
ogrinfo "$WORKING_GPKG_TEMP" -sql "DROP TABLE blocks_2020"
ogrinfo "$WORKING_GPKG_TEMP" -sql "VACUUM"
# DEPRECATED - use QGIS or PostGIS instead (much faster)
# generate_crosswalks_within "$WORKING_GPKG_TEMP" blocks_2000 BLKIDFP00 "$CROSSWALKS_DIR"
# generate_crosswalks_within "$WORKING_GPKG_TEMP" blocks_2010 GEOID10 "$CROSSWALKS_DIR"
# generate_crosswalks_within "$WORKING_GPKG_TEMP" blocks_2020 GEOID20 "$CROSSWALKS_DIR"
# DEPRECATED - use QGIS or PostGIS instead (much faster)
# generate_crosswalks_within_subsets "$WORKING_GPKG_TEMP" blocks_2000 BLKIDFP00 "$CROSSWALKS_DIR"
# generate_crosswalks_within_subsets "$WORKING_GPKG_TEMP" blocks_2010 GEOID10 "$CROSSWALKS_DIR"
# generate_crosswalks_within_subsets "$WORKING_GPKG_TEMP" blocks_2020 GEOID20 "$CROSSWALKS_DIR"
# Finished - move the GeoPackage to the project directory
echo "Moving GeoPackage"
mv -v "$WORKING_GPKG_TEMP" "$WORKING_GPKG_FINAL"