
Commit c9f09b9

committed
small
1 parent 5371606 commit c9f09b9

File tree

3 files changed: +78 −51 lines changed


content/model-improvement/assessment-model-improvement.qmd

+3-1
@@ -104,7 +104,9 @@ Follow the [Evaluate your model with resampling section](https://www.tidymodels.
 
 Compute the RMSE for both models again. Of course nothing changes for the null model. Compare the new RMSE estimates obtained through CV with those obtained earlier. What did and didn't change?
 
-Run the code again that creates the CV folds and does the fitting. This time, choose a different value for the random seed. The RMSE values for the CV fits will change. That's just due to the randomness in the data splitting. If we had more data, we would expect less variability. The overall pattern between the changes in RMSE for the fits to the training data without CV and what we see with CV should still be the same.
+Also look at the standard error for the RMSE. Since you are now resampling, you get not just a single RMSE estimate but one for each sample, so you can look at the variation in RMSE. This gives you a good indication of how robust your model performance is.
+
+Finally, run the code again that creates the CV folds and does the fitting. This time, choose a different value for the random seed. The RMSE values for the CV fits will change. That's just due to the randomness in the data splitting. If we had more data, we would expect less variability. The overall pattern between the changes in RMSE for the fits to the training data without CV and what we see with CV should still be the same.
 
 ::: note
 If you want to get more robust RMSE estimates with CV, you can set `repeats` to some value. That creates more samples by repeating the whole CV procedure several times. In theory this gives more robust results. You might encounter some warning messages. This is likely because occasionally, by chance, the data is split in a way that some information (e.g., a certain value for `SEX` in our data) is missing from one of the folds. That can cause issues.
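The repeated-CV setup described in the note can be sketched with tidymodels. This is a minimal illustration, not the exercise's actual code: it uses the built-in `mtcars` data and a simple `mpg ~ wt` model as stand-ins, since `train_data` and the exercise's workflows are not available here.

```r
library(tidymodels)

set.seed(123)
# 5-fold CV repeated 3 times yields 15 resamples instead of 5
folds <- vfold_cv(mtcars, v = 5, repeats = 3)

# stand-in workflow; the exercise uses its own outcome/predictors
wf <- workflow() %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  add_formula(mpg ~ wt)

fit_cv <- fit_resamples(wf, resamples = folds)

# mean RMSE/R^2 across resamples, plus std_err: the standard error
# of the metric that the text suggests inspecting
collect_metrics(fit_cv)
```

With `repeats` set, `collect_metrics()` averages over all repeats, which is what tends to stabilize the RMSE estimate.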

myresources/improvement-exercise-solution/improvement-exercise.R

+75-50
@@ -26,9 +26,9 @@ Ntest = nrow(test_data)
 ## ---- model1 --------
 mod <- linear_reg() %>% set_engine("lm")
 wflow1 <-
-workflow() %>%
-add_model(mod) %>%
-add_formula(Y ~ DOSE)
+  workflow() %>%
+  add_model(mod) %>%
+  add_formula(Y ~ DOSE)
 fit1 <- wflow1 %>% fit(data = train_data)
 
 
@@ -46,20 +46,20 @@ pred0 <- rep(mean(train_data$Y),Ntrain)
 ## ---- rmse --------
 # Compute the RMSE and R squared for model 1
 rmse_train_1 <- bind_cols(train_data, pred1) %>%
-rmse(truth = Y, estimate = .pred)
+  rmse(truth = Y, estimate = .pred)
 
 # Compute the RMSE and R squared for model 2
 rmse_train_2 <- bind_cols(train_data, pred2) %>%
-rmse(truth = Y, estimate = .pred)
+  rmse(truth = Y, estimate = .pred)
 
 # Compute RMSE for a dumb null model
 rmse_train_0 <- rmse_vec(truth = train_data$Y, estimate = pred0)
 
 # Print the results
 metrics = data.frame(model = c("null model","model 1","model 2"),
-rmse = c(rmse_train_0,
-rmse_train_1$.estimate,
-rmse_train_2$.estimate) )
+  rmse = c(rmse_train_0,
+           rmse_train_1$.estimate,
+           rmse_train_2$.estimate) )
 print(metrics)
 
 ## ---- cross-validation --------
@@ -74,35 +74,35 @@ rmse_cv_2 <- collect_metrics(fit2_cv)$mean[1]
 
 # Print the results
 metrics_cv = data.frame(model = c("null","model 1","model 2"),
-rmse = c(rmse_train_0, rmse_cv_1, rmse_cv_2) )
+  rmse = c(rmse_train_0, rmse_cv_1, rmse_cv_2) )
 print(metrics_cv)
 
 ## ---- obs-pred-plot --------
 pred0a <- data.frame(predicted = pred0, model = "model 0")
-pred1a <- data.frame(predicted = as.numeric(unlist(pred1)), model = "model 1")
-pred2a <- data.frame(predicted = as.numeric(unlist(pred2)), model = "model 2")
+pred1a <- data.frame(predicted = pred1$.pred, model = "model 1")
+pred2a <- data.frame(predicted = pred2$.pred, model = "model 2")
 
 plot_data <- bind_rows(pred0a,pred1a,pred2a) %>%
-mutate(observed = rep(train_data$Y,3))
+  mutate(observed = rep(train_data$Y,3))
 
 p1 <- plot_data %>% ggplot() +
-geom_point(aes(x = observed, y = predicted, color = model, shape = model)) +
-labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
-geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
-scale_x_continuous(limits=c(0,5000)) +
-scale_y_continuous(limits=c(0,5000)) +
-theme_minimal()
+  geom_point(aes(x = observed, y = predicted, color = model, shape = model)) +
+  labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
+  scale_x_continuous(limits=c(0,5000)) +
+  scale_y_continuous(limits=c(0,5000)) +
+  theme_minimal()
 plot(p1)
 
 
 ## ---- residuals-plot --------
 plot_data1 <- plot_data |> mutate(residuals = predicted-observed) |> filter(model == "model 2")
 p1a <- plot_data1 %>% ggplot() +
-geom_point(aes(x = predicted, y = residuals, color = model, shape = model)) +
-labs(x = "Predicted", y = "Residuals", title = "Residuals vs Predicted") +
-geom_abline(intercept = 0, slope = 0, linetype = "dashed", color = "black") +
-scale_y_continuous(limits=c(-2500,2500)) +
-theme_minimal()
+  geom_point(aes(x = predicted, y = residuals, color = model, shape = model)) +
+  labs(x = "Predicted", y = "Residuals", title = "Residuals vs Predicted") +
+  geom_abline(intercept = 0, slope = 0, linetype = "dashed", color = "black") +
+  scale_y_continuous(limits=c(-2500,2500)) +
+  theme_minimal()
 plot(p1a)
 
 
@@ -112,52 +112,77 @@ plot(p1a)
 Nsamp = 100 #number of samples
 set.seed(rngseed)
 # create samples
-dat_bs <- train_data |> rsample::bootstraps(times = Nsamp, apparent = TRUE)
+dat_bs <- train_data |> rsample::bootstraps(times = Nsamp)
 
 #set up empty arrays to store predictions for each sample
 pred_bs = array(0, dim=c(Nsamp,Ntrain))
 
 #loop over each bootstrap sample, fit model, then predict and record predictions
-for (i in 1:Nsamp)
-{
-dat_sample = rsample::analysis(dat_bs$splits[[i]])
-fit_bs <- wflow2 |> fit(data = dat_sample)
-pred_bs[i,] <- fit_bs %>% predict(train_data) %>% unlist()
+for (i in 1:Nsamp) {
+  dat_sample = rsample::analysis(dat_bs$splits[[i]])
+  fit_bs <- wflow2 |> fit(data = dat_sample)
+  pred_df <- fit_bs %>% predict(train_data)
+  pred_bs[i,] <- pred_df$.pred %>% unlist()
 }
 
-#compute mean and 89% confidence interval for predictions
+#compute median and 89% confidence interval for predictions
 preds <- pred_bs |> apply(2, quantile, c(0.055, 0.5, 0.945)) |> t()
 
 
 #make plot showing uncertainty
-plot_data2 <- data.frame(median = preds[,2], lb = preds[,1],
-ub = preds[,3], observed = rep(train_data$Y,3), mean = pred2a$predicted)
+plot_data2 <- data.frame(
+  median = preds[,2],
+  lb = preds[,1],
+  ub = preds[,3],
+  observed = rep(train_data$Y,3),
+  mean = pred2a$predicted
+)
 
 p2 <- plot_data2 %>% ggplot() +
-geom_point(aes(x = observed, y = median), shape = 5, color = "blue") +
-geom_point(aes(x = observed, y = lb), shape = 4, color = "red") +
-geom_point(aes(x = observed, y = ub), shape = 4, color = "red") +
-geom_point(aes(x = observed, y = mean), shape = 6, color = "black") +
-labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
-geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
-scale_x_continuous(limits=c(0,5000)) +
-scale_y_continuous(limits=c(0,5000)) +
-theme_minimal()
+  geom_errorbar(aes(x = observed, ymin = lb, ymax = ub), width = 25) +
+  geom_point(
+    aes(x = observed, y = median, color = "median"),
+    shape = 5
+  ) +
+  geom_point(
+    aes(x = observed, y = mean, color = "mean"),
+    shape = 6
+  ) +
+  labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
+  scale_x_continuous(limits=c(0,5000)) +
+  scale_y_continuous(limits=c(0,5000)) +
+  scale_color_manual(name = "stat", values = c("orange", "blue")) +
+  theme_minimal()
 plot(p2)
 
 
 
 ## ---- final testing --------
-predf <- fit2 %>% predict(test_data)
+predf <- fit2 %>% predict(test_data)
 plot_f <- predf %>% mutate(observed = rep(test_data$Y,1)) %>% rename(predicted = .pred)
 
-p3 <- ggplot() +
-geom_point(aes(x = observed, y = predicted), data = plot_data, color="black") +
-geom_point(aes(x = observed, y = predicted), data = plot_f, color="red", shape = 15) +
-labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
-geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
-scale_x_continuous(limits=c(0,5000)) +
-scale_y_continuous(limits=c(0,5000)) +
-theme_minimal()
+final_plot_data <-
+  dplyr::bind_rows(
+    "train" = dplyr::filter(plot_data, model == "model 2"),
+    "test" = plot_f,
+    .id = "set"
+  ) |>
+  tibble::tibble() |>
+  dplyr::select(-model)
+
+p3 <- ggplot(final_plot_data) +
+  aes(
+    x = observed,
+    y = predicted,
+    color = set,
+    shape = set
+  ) +
+  geom_point() +
+  labs(x = "Observed", y = "Predicted", title = "Predicted vs Observed") +
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
+  scale_x_continuous(limits=c(0,5000)) +
+  scale_y_continuous(limits=c(0,5000)) +
+  theme_minimal()
 plot(p3)
 
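The `apply(2, quantile, ...)` step in the bootstrap hunk works because each column of `pred_bs` holds all bootstrap predictions for one observation. A tiny base-R illustration with made-up numbers (4 bootstrap replicates, 3 observations):

```r
# rows = bootstrap replicates, columns = observations; values are made up
pred_bs <- matrix(c(10, 12, 11, 13,
                    20, 22, 21, 23,
                    30, 32, 31, 33), nrow = 4)

# per-observation lower bound, median, and upper bound of an 89% interval
preds <- pred_bs |> apply(2, quantile, c(0.055, 0.5, 0.945)) |> t()
preds  # 3 rows (observations) x 3 columns ("5.5%", "50%", "94.5%")
```

The transpose puts observations back in rows, so `preds[,2]` is the bootstrap median used for plotting.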
File renamed without changes.

0 commit comments