-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsingle_species_pipeline.R
310 lines (234 loc) · 8.54 KB
/
single_species_pipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#source(here::here("combine/FS","pat.R"))
#devtools::install_github("https://github.com/hferg/sdmpl", auth_token = GITHUB_PAT)
#library(sdmpl)
library(CoordinateCleaner)
library(dplyr)
library(here)
library(lwgeom)
library(raster)
library(readr)
library(sf)
library(caret)
library(randomForest)
source("SA_pipeline_functions.R")
# Data Preparation
sp_name <-
"Martes_martes" #add species name with underscore between genus and species
#We are only interested in species occurrences in the study region - covered in the extents below
xmin <- -13
xmax <- 2
ymin <- 50
ymax <- 62
ext_occ <- extent(xmin, xmax, ymin, ymax)
min_occ <-
20 # minimum number of occurrence points needed for the species to be included in the models
bioclim_layers <-
c(1, 5, 6, 13, 14)#the number of the bioclim layers to be included as environmental variables - https://worldclim.org/bioclim
lapply(bioclim_layers, chelsa_bioclim_get) #this will download the chelsa bioclim data for the layers that you have selected
###Loading and cropping the environmental data
env_layers <-
loadBioclim(path = ("CHELSA/"), extension = ".tif") #add the path linking to your environment variables. CHELSA bioclim variables from here: http://chelsa-climate.org/downloads/
env_crop <- raster::crop(env_layers, ext_occ)
#This code creates this directory if itdoes not already exist
if (!dir.exists(here::here("points/raw"))) {
dir.create(here::here("points/raw/"), recursive = TRUE)
}
spxy_out <- gbifData(
sp_name = sp_name,
ext_occ = ext_occ,#area over which occurrence points will be downloaded
out_dir = here::here("points/raw"),#where points will be saved
min_occ = min_occ
)
##Coordinate Cleaner
if (!dir.exists(here::here("points/cleaned_raw"))) {
dir.create(here::here("points/cleaned_raw/"))
}
###Cleaning the coordinates using the CoordinateCleaner package - https://cran.r-project.org/web/packages/CoordinateCleaner/CoordinateCleaner.pdf
cc_wrapper(
sp_name = sp_name,
in_dir = here::here("points/raw"),
out_dir = here::here("points/cleaned_raw")
)
###Rarefy Points - so there is only one occurrence point per grid cell
if (!dir.exists(here::here("points/rarefied"))) {
dir.create(here::here("points/rarefied/"))
}
ref_map <- env_crop[[1]]
ref_map[!is.na(ref_map)] <- 0 #ref_map should be full of non-1 values
rarefyPoints(
sp_name = sp_name,
in_dir = here::here("points/cleaned_raw"),
out_dir = here::here("points/rarefied/"),
ref_map = ref_map
)
###Extract Data for presence points
if (!dir.exists(here::here("environmental/presence/"))) {
dir.create(here::here("environmental/presence/"), recursive = TRUE)
}
ras_extract(
sp_name = sp_name,
in_dir = here::here("points/rarefied"),
out_dir = here::here("environmental/presence"),
raster_in = env_crop
)
###Background Data - create background and pseudoabsence points - need to think about buffer and density values
if (!dir.exists(here::here("points/background/"))) {
dir.create(here::here("points/background/"))
}
if (!dir.exists(here::here("points/pseudoabsence/"))) {
dir.create(here::here("points/pseudoabsence/"))
}
wwf_ecoregions_get() # this should download the wwf ecoregions data and put it in the right place for you.
ecoreg <-
sf::st_read(here::here("WWF_Ecoregions/wwf_terr_ecos.shp")) %>%
sf::st_crop(., ext_occ) %>% ##cropping to the area of intereste=
dplyr::select(OBJECTID, ECO_NAME) ##just selecting out the columns we're interested in
background_sampler(
sp_name = sp_name,
in_dir = here::here("points/rarefied"),
out_dir = here::here("points/background"),
dens_abs = "density",
density = 100,
type = "background",
polygon = ecoreg
)
background_sampler(
sp_name = sp_name,
in_dir = here::here("points/rarefied"),
out_dir = here::here("points/pseudoabsence"),
dens_abs = "density",
density = 100,
type = "pseudoabsence",
buffer = 100,
polygon = ecoreg
)
###Extract environmental data for background points
if (!dir.exists(here::here("environmental/background/"))) {
dir.create(here::here("environmental/background/"))
}
ras_extract(
sp_name = sp_name,
in_dir = here::here("points/background"),
out_dir = here::here("environmental/background"),
raster_in = env_crop
)
###Extract environmental data for pseudoabsence points
if (!dir.exists(here::here("environmental/pseudoabsence/"))) {
dir.create(here::here("environmental/pseudoabsence/"))
}
ras_extract(
sp_name = sp_name,
in_dir = here::here("points/pseudoabsence"),
out_dir = here::here("environmental/pseudoabsence/"),
raster_in = env_crop
)
if (!dir.exists(here::here("predictions/bioclim/"))) {
dir.create(here::here("predictions/bioclim"), recursive = TRUE)
}
if (!dir.exists(here::here("predictions/glm/"))) {
dir.create(here::here("predictions/glm"), recursive = TRUE)
}
if (!dir.exists(here::here("predictions/rf/"))) {
dir.create(here::here("predictions/rf"), recursive = TRUE)
}
###Fit Bioclim
fitBC(
sp_name = sp_name,
pres_dir = here::here("environmental/presence/"),
backg_dir = here::here("environmental/pseudoabsence/"),
predictor_names = bioclim_layers,
predictors = env_crop,
pred_out_dir = here::here("predictions/bioclim/"),
eval_out_dir = here::here("evaluation/bioclim/"),
overwrite = TRUE,
eval = TRUE
)
###Fit GLM
fitGLM(
sp_name = sp_name,
pres_dir = here::here("environmental/presence/"),
backg_dir = here::here("environmental/pseudoabsence/"),
predictor_names = bioclim_layers,
predictors = env_crop,
pred_out_dir = here::here("predictions/glm/"),
eval_out_dir = here::here("evaluation/glm/"),
overwrite = TRUE,
eval = TRUE
)
###Fit RF
fitRF(
sp_name = sp_name,
pres_dir = here::here("environmental/presence/"),
backg_dir = here::here("environmental/pseudoabsence/"),
predictor_names = bioclim_layers,
predictors = env_crop,
pred_out_dir = here::here("predictions/rf/"),
eval_out_dir = here::here("evaluation/rf/"),
overwrite = TRUE,
eval = TRUE
)
###Get Evaluations and AUCs
eval_files <-
list.files(
here::here("evaluation/"),
full.names = TRUE,
recursive = TRUE,
pattern = paste0("*", sp_name)
)
evals_out <- lapply(eval_files, get_eval, threshold = "tss")
eval_df <- do.call(rbind, evals_out)
eval_df$sp_name <- as.character(eval_df$sp_name)
eval_df
###Plotting the Model Results
bc_plot <- raster(paste0(here::here("predictions/bioclim/"), "/", sp_name, "_bioclim.tif"))
glm_plot <- raster(paste0(here::here("predictions/glm/"), "/", sp_name, "_glm.tif"))
rf_plot <- raster(paste0(here::here("predictions/rf/"), "/", sp_name, "_rf.tif"))
cuts=seq(0,1,0.05)#set breaks
pal <- colorRampPalette(c("grey","forestgreen","darkgreen"))
par(mfrow = c(2,2)) #change this to c(1,1) if you don't want all the plots in a pane
plot(bc_plot, main = "Bioclim", breaks=cuts, col = pal(length(cuts)))
plot(glm_plot, main = "GLM", breaks=cuts, col = pal(length(cuts)))
plot(rf_plot, main = "RF", breaks=cuts, col = pal(length(cuts)))
### Plotting the model results with thresholds from eval_df
plot(bc_plot > eval_df$threshold[which(eval_df$model == "bioclim")], main = "Bioclim")
plot(glm_plot > eval_df$threshold[which(eval_df$model == "glm")], main = "GLM")
plot(rf_plot > eval_df$threshold[which(eval_df$model == "rf")], main = "RF")
###Build Ensemble Model
if (!dir.exists(here::here("predictions/ensemble/majority_pa"))) {
dir.create(here::here("predictions/ensemble/majority_pa"),
recursive = TRUE)
}
if (!dir.exists(here::here("predictions/ensemble/weighted"))) {
dir.create(here::here("predictions/ensemble/weighted"))
}
preds <-
list.files(
here::here("predictions"),
full.names = TRUE,
recursive = TRUE,
pattern = paste0("*", sp_name)
)
preds <- preds[!grepl("/ensemble/", preds)]
#maj_pa takes the thresholded outputs from the models and stacks them on top of eachother.
#For each cell if the majority of the models predict a presence then the output of maj_pa shows a predicted presence
maj_pa <- ensemble_model(
sp_name = sp_name,
eval_df = eval_df,
preds = preds,
method = "majority_pa",
out_dir = here::here("predictions/ensemble")
)
#weighted takes the outputs from the models and applies a weighted average
#to them, so that the best performing model (based on AUC) has a bigger influence on the output.
weighted <- ensemble_model(
sp_name = sp_name,
eval_df = eval_df,
preds = preds,
method = "weighted",
out_dir = here::here("predictions/ensemble")
)
xy <- read.csv(paste0(here::here("points/rarefied/"), "/", sp_name, ".csv"))
plot(maj_pa, main = "Majority Presence/Absence")
points(xy$x, xy$y)
plot(weighted, main = "Weighted Ensemble Model", breaks=cuts, col = pal(10))
points(xy$x, xy$y)