Skip to content

Commit ef419f8

Browse files
committed
comparison CRPS- and LogS-based boosting
1 parent d98b932 commit ef419f8

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
## Data preparation: Run
2+
3+
# rm(list=ls())
4+
#
5+
# data_dir <- "/media/sebastian/Elements/Postproc_NN/data/"
6+
# load(paste0(data_dir, "data_all.Rdata"))
7+
#
8+
# # remove sm data (missing values)
9+
# data$sm_mean <- NULL
10+
# data$sm_var <- NULL
11+
# head(data)
12+
#
13+
# library(scoringRules)
14+
# library(lubridate)
15+
# library(crch)
16+
#
17+
# train_end <- as.Date("2016-01-01 00:00 UTC") - days(2)
18+
# train_start <- data$date[1]
19+
#
20+
# data_train_all <- subset(data, date >= train_start & date <= train_end)
21+
#
22+
# eval_start <- as.Date("2016-01-01 00:00 UTC")
23+
# eval_end <- as.Date("2016-12-31 00:00 UTC")
24+
# eval_dates <- seq(eval_start, eval_end, by = "1 day")
25+
#
26+
# data_eval_all <- subset(data, date >= eval_start & date <= eval_end)
27+
#
28+
# out_loc <- rep(NA, nrow(data_eval_all))
29+
# out_sc <- rep(NA, nrow(data_eval_all))
30+
#
31+
# stations_list <- unique(data$station)
32+
33+
## Logs: run
34+
35+
# ncoef_loc <- NULL
36+
# ncoef_sc <- NULL
37+
#
38+
# for(this_station in stations_list){
39+
# # progress indicator
40+
# progind <- which(stations_list == this_station)
41+
# if(progind %% 10 == 0){
42+
# cat(progind, "of", length(stations_list), "started at", paste(Sys.time()), "\n")
43+
# }
44+
#
45+
# # data_train <- subset(data, date >= train_start & date <= train_end & station == this_station)
46+
# data_train <- subset(data_train_all, station == this_station)
47+
#
48+
# # remove incomplete cases (= NA obs or fc)
49+
# data_train <- data_train[complete.cases(data_train), ]
50+
# if(nrow(data_train) < 10){
51+
# next
52+
# }
53+
#
54+
# ## NOTE: boosting is only implemented for link.scale = "log", otherwise cryptic error message
55+
# crch_model <- crch(obs ~ .|.,
56+
# data = data_train[,-which(names(data) %in% c("date", "station"))],
57+
# dist = "gaussian",
58+
# link.scale = "log",
59+
# method = "boosting",
60+
# maxit = 1000,
61+
# nu = 0.05,
62+
# mstop = "aic")
63+
#
64+
# ncoef_loc[progind] <- sum(crch_model$coefficients$location > 0)
65+
# ncoef_sc[progind] <- sum(crch_model$coefficients$scale > 0)
66+
# }
67+
#
68+
# save(ncoef_loc, ncoef_sc, file = "ncoef_LogS.Rdata")
69+
70+
## CRPS: run
71+
72+
# ncoef_loc <- NULL
73+
# ncoef_sc <- NULL
74+
#
75+
# for(this_station in stations_list){
76+
# # progress indicator
77+
# progind <- which(stations_list == this_station)
78+
# if(progind %% 10 == 0){
79+
# cat(progind, "of", length(stations_list), "started at", paste(Sys.time()), "\n")
80+
# }
81+
#
82+
# # data_train <- subset(data, date >= train_start & date <= train_end & station == this_station)
83+
# data_train <- subset(data_train_all, station == this_station)
84+
#
85+
# # remove incomplete cases (= NA obs or fc)
86+
# data_train <- data_train[complete.cases(data_train), ]
87+
# if(nrow(data_train) < 10){
88+
# next
89+
# }
90+
#
91+
# ## NOTE: boosting is only implemented for link.scale = "log", otherwise cryptic error message
92+
# crch_model <- crch(obs ~ .|.,
93+
# data = data_train[,-which(names(data) %in% c("date", "station"))],
94+
# dist = "gaussian",
95+
# link.scale = "log",
96+
# method = "boosting",
97+
# type = "crps",
98+
# maxit = 1000,
99+
# nu = 0.05,
100+
# mstop = "aic")
101+
#
102+
# ncoef_loc[progind] <- sum(crch_model$coefficients$location > 0)
103+
# ncoef_sc[progind] <- sum(crch_model$coefficients$scale > 0)
104+
# }
105+
# save(ncoef_loc, ncoef_sc, file = "ncoef_CRPS.Rdata")
106+
107+
## Plot
108+
rm(list=ls())
109+
110+
load("ncoef_CRPS.Rdata")
111+
ncoef_loc_crps <- ncoef_loc
112+
ncoef_sc_crps <- ncoef_sc
113+
114+
load("ncoef_LogS.Rdata")
115+
ncoef_loc_logs <- ncoef_loc
116+
ncoef_sc_logs <- ncoef_sc
117+
118+
pdf("ncoef_boosting_CRPS-LogS.pdf", width = 10, height = 5, pointsize = 12)
119+
120+
par(mfrow=c(1,2))
121+
122+
# plot for location parameter
123+
ploc_logs <- hist(ncoef_loc_logs, breaks = seq(0,25,1), plot = FALSE)
124+
ploc_crps <- hist(ncoef_loc_crps, breaks = seq(0,25,1), plot = FALSE)
125+
plot(ploc_logs, col=rgb(0,0,1,1/4), xlim=c(0,25), ylim = c(0,150),
126+
main = "Location", xlab = "Number of selected predictors") # first histogram
127+
plot(ploc_crps, col=rgb(1,0,0,1/4), xlim=c(0,25), add=T) # second
128+
legend("topright", c("LogS", "CRPS"), fill=c(rgb(0,0,1,1/4), rgb(1,0,0,1/4)))
129+
130+
# plot for scale parameter
131+
psc_logs <- hist(ncoef_sc_logs, breaks = seq(0,25,1), plot = FALSE)
132+
psc_crps <- hist(ncoef_sc_crps, breaks = seq(0,25,1), plot = FALSE)
133+
plot(psc_logs, col=rgb(0,0,1,1/4), xlim=c(0,25), ylim = c(0,150),
134+
main = "Scale", xlab = "Number of selected predictors") # first histogram
135+
plot(psc_crps, col=rgb(1,0,0,1/4), xlim=c(0,25), add=T) # second
136+
legend("topright", c("LogS", "CRPS"), fill=c(rgb(0,0,1,1/4), rgb(1,0,0,1/4)))
137+
138+
dev.off()

results/revision/bias_seasonality.R

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
rm(list=ls())
2+
3+
library(lubridate)
4+
5+
data_dir <- "/media/sebastian/Elements/Postproc_NN/data/"
6+
load(paste0(data_dir, "data_ensfc.Rdata"))
7+
8+
data_eval_all <- data_ens
9+
ind_use <- which(!is.na(data_eval_all$obs))
10+
data_eval <- data_eval_all[ind_use,]
11+
12+
library(scoringRules)
13+
14+
fc_matrix <- as.matrix(data_eval[,4:ncol(data_eval)])
15+
bias_sample <- function(y, dat){
16+
apply(dat, 1, mean) - y
17+
}
18+
bias_ens <- bias_sample(y = data_eval$obs, dat = fc_matrix)
19+
20+
df <- data.frame(date = data_eval$date, bias = bias_ens)
21+
22+
df$year <- year(df$date)
23+
df$month <- month(df$date)
24+
25+
bias_monthlymean <- matrix(NA, ncol = 12, nrow = length(c(2007:2016)))
26+
bias_monthlyqlow <- matrix(NA, ncol = 12, nrow = length(c(2007:2016)))
27+
bias_monthlyqhigh <- matrix(NA, ncol = 12, nrow = length(c(2007:2016)))
28+
29+
for(yy in c(2007:2016)){
30+
print(yy)
31+
dfy <- subset(df, year == yy)
32+
mean_mm <- NULL
33+
# low_mm <- NULL
34+
# high_mm <- NULL
35+
for(mm in 1:12){
36+
dfy_mm <- subset(dfy, month == mm)
37+
mean_mm[mm] <- mean(dfy_mm$bias)
38+
# low_mm[mm] <- quantile(dfy_mm$bias, 0.05)
39+
# high_mm[mm] <- quantile(dfy_mm$bias, 0.95)
40+
}
41+
bias_monthlymean[which(c(2007:2016) == yy),] <- mean_mm
42+
# bias_monthlyqlow[which(c(2007:2016) == yy),] <- low_mm
43+
# bias_monthlyqhigh[which(c(2007:2016) == yy),] <- high_mm
44+
}
45+
46+
47+
pdf("bias_seasonality.pdf", width = 10, height = 5, pointsize = 10)
48+
49+
par(mfrow=c(1,2))
50+
51+
colors <- rainbow(length(c(2007:2016)))
52+
plot(bias_monthlymean[1,], type = "l", col = colors[1], ylim = range(c(bias_monthlymean)+c(0,0.5)),
53+
xlab = "Month", ylab = "Monthly mean bias of ensemble", axes = FALSE)
54+
axis(1, at = 1:12, labels = month.abb)
55+
axis(2)
56+
for(row in 2:nrow(bias_monthlymean)){
57+
lines(bias_monthlymean[row,], col = colors[row])
58+
}
59+
legend("top", col = colors, lty = 1, legend = c(2007:2016), ncol = 4, bty = "n")
60+
61+
62+
63+
indpl <- 9:10
64+
plot(bias_monthlymean[indpl,][1,], type = "l", col = colors[indpl][1], ylim = range(c(bias_monthlymean))+c(0,0.5),
65+
xlab = "Month", ylab = "Monthly mean bias of ensemble", axes = FALSE)
66+
axis(1, at = 1:12, labels = month.abb)
67+
axis(2)
68+
for(row in 2:nrow(bias_monthlymean[indpl,])){
69+
lines(bias_monthlymean[indpl,][row,], col = colors[indpl][row])
70+
}
71+
legend("top", col = colors[indpl], lty = 1, legend = c(2007:2016)[indpl], ncol = 3, bty = "n")
72+
73+
dev.off()
74+

0 commit comments

Comments
 (0)