tidymodels
diff --git a/‎NAMESPACE
+12 b/‎NAMESPACE
+12
diff --git a/‎NEWS.md
+5 b/‎NEWS.md
+5
diff --git a/‎R/compat-vctrs-helpers.R
+2-1 b/‎R/compat-vctrs-helpers.R
+2-1
diff --git a/‎R/spatial_nndm_cv.R
+269 b/‎R/spatial_nndm_cv.R
+269
@@ -15,6 +15,7 @@ S3method(vec_cast,data.frame.spatial_block_cv)
 S3method(vec_cast,data.frame.spatial_buffer_vfold_cv)
 S3method(vec_cast,data.frame.spatial_clustering_cv)
 S3method(vec_cast,data.frame.spatial_leave_location_out_cv)
+S3method(vec_cast,data.frame.spatial_nndm_cv)
 S3method(vec_cast,spatial_block_cv.data.frame)
 S3method(vec_cast,spatial_block_cv.spatial_block_cv)
 S3method(vec_cast,spatial_block_cv.tbl_df)
@@ -27,14 +28,19 @@ S3method(vec_cast,spatial_clustering_cv.tbl_df)
 S3method(vec_cast,spatial_leave_location_out_cv.data.frame)
 S3method(vec_cast,spatial_leave_location_out_cv.spatial_leave_location_out_cv)
 S3method(vec_cast,spatial_leave_location_out_cv.tbl_df)
+S3method(vec_cast,spatial_nndm_cv.data.frame)
+S3method(vec_cast,spatial_nndm_cv.spatial_nndm_cv)
+S3method(vec_cast,spatial_nndm_cv.tbl_df)
 S3method(vec_cast,tbl_df.spatial_block_cv)
 S3method(vec_cast,tbl_df.spatial_buffer_vfold_cv)
 S3method(vec_cast,tbl_df.spatial_clustering_cv)
 S3method(vec_cast,tbl_df.spatial_leave_location_out_cv)
+S3method(vec_cast,tbl_df.spatial_nndm_cv)
 S3method(vec_ptype2,data.frame.spatial_block_cv)
 S3method(vec_ptype2,data.frame.spatial_buffer_vfold_cv)
 S3method(vec_ptype2,data.frame.spatial_clustering_cv)
 S3method(vec_ptype2,data.frame.spatial_leave_location_out_cv)
+S3method(vec_ptype2,data.frame.spatial_nndm_cv)
 S3method(vec_ptype2,spatial_block_cv.data.frame)
 S3method(vec_ptype2,spatial_block_cv.spatial_block_cv)
 S3method(vec_ptype2,spatial_block_cv.tbl_df)
@@ -47,21 +53,27 @@ S3method(vec_ptype2,spatial_clustering_cv.tbl_df)
 S3method(vec_ptype2,spatial_leave_location_out_cv.data.frame)
 S3method(vec_ptype2,spatial_leave_location_out_cv.spatial_leave_location_out_cv)
 S3method(vec_ptype2,spatial_leave_location_out_cv.tbl_df)
+S3method(vec_ptype2,spatial_nndm_cv.data.frame)
+S3method(vec_ptype2,spatial_nndm_cv.spatial_nndm_cv)
+S3method(vec_ptype2,spatial_nndm_cv.tbl_df)
 S3method(vec_ptype2,tbl_df.spatial_block_cv)
 S3method(vec_ptype2,tbl_df.spatial_buffer_vfold_cv)
 S3method(vec_ptype2,tbl_df.spatial_clustering_cv)
 S3method(vec_ptype2,tbl_df.spatial_leave_location_out_cv)
+S3method(vec_ptype2,tbl_df.spatial_nndm_cv)
 S3method(vec_restore,spatial_block_cv)
 S3method(vec_restore,spatial_buffer_vfold_cv)
 S3method(vec_restore,spatial_clustering_cv)
 S3method(vec_restore,spatial_leave_location_out_cv)
+S3method(vec_restore,spatial_nndm_cv)
 export(analysis)
 export(assessment)
 export(autoplot)
 export(spatial_block_cv)
 export(spatial_buffer_vfold_cv)
 export(spatial_clustering_cv)
 export(spatial_leave_location_out_cv)
+export(spatial_nndm_cv)
 import(sf)
 import(vctrs)
 importFrom(dplyr,dplyr_reconstruct)
 
@@ -1,5 +1,10 @@
 # spatialsample (development version)
 
+* `spatial_nndm_cv()` is a new function for nearest neighbor distance matching
+  cross-validation, as described in Milà et al. 2022 
+  (doi: 10.1111/2041-210X.13851). NNDM was first implemented in CAST
+  (https://cran.r-project.org/package=CAST).
+
 # spatialsample 0.3.0
 
 ## Breaking changes
 
@@ -17,7 +17,8 @@ delayedAssign("rset_subclasses", {
         spatial_block_cv              = spatial_block_cv(test_data()),
         spatial_clustering_cv         = spatial_clustering_cv(test_data()),
         spatial_buffer_vfold_cv       = spatial_buffer_vfold_cv(test_data(), radius = 1, buffer = 1),
-        spatial_leave_location_out_cv = spatial_leave_location_out_cv(test_data(), idx)
+        spatial_leave_location_out_cv = spatial_leave_location_out_cv(test_data(), idx),
+        spatial_nndm_cv               = spatial_nndm_cv(test_data()[1:500, ], test_data()[501:682, ])
       )
     )
   } else {
 
@@ -0,0 +1,269 @@
+#' Nearest neighbor distance matching (NNDM) cross-validation
+#'
+#' NNDM is a variant of leave-one-out cross-validation which assigns each
+#' observation to a single assessment fold, and then attempts to remove data
+#' from each analysis fold until the nearest neighbor distance distribution
+#' between assessment and analysis folds matches the nearest neighbor distance
+#' distribution between training data and the locations a model will be used to
+#' predict.
+#' Proposed by Milà et al. (2022), this method aims to provide accurate
+#' estimates of how well models will perform in the locations they will actually
+#' be predicting. This method was originally implemented in the CAST package.
+#'
+#' Note that, as a form of leave-one-out cross-validation, this method can be
+#' rather slow for larger data (and fitting models to these resamples will be
+#' even slower).
+#'
+#' @param data An object of class `sf` or `sfc`.
+#' @param prediction_sites An `sf` or `sfc` object describing the areas to be
+#' predicted. If `prediction_sites` are all points, then those points are
+#' treated as the intended prediction points when calculating target nearest
+#' neighbor distances. If any element of `prediction_sites` is not a single
+#' point, then points are sampled from within the bounding box of
+#' `prediction_sites` and those points are then used as the intended prediction
+#' points.
+#' @param ... Additional arguments passed to [sf::st_sample()]. Note that the
+#' number of points to sample is controlled by `prediction_sample_size`; trying
+#' to pass `size` via `...` will cause an error.
+#' @param autocorrelation_range A numeric of length 1 representing the landscape
+#' autocorrelation range ("phi" in the terminology of Milà et al. (2022)). If
+#' `NULL`, the default, the autocorrelation range is assumed to be the distance
+#' between the opposite corners of the bounding box of `prediction_sites`.
+#' @param prediction_sample_size A numeric of length 1: the number of points to
+#' sample when `prediction_sites` is not only composed of points. Note that this
+#' argument is passed to `size` in [sf::st_sample()], meaning that no elements
+#' of `...` can be named `size`.
+#' @param min_analysis_proportion The minimum proportion of `data` that must
+#' remain after removing points to match nearest neighbor distances. This
+#' function will stop removing data from analysis sets once only
+#' `min_analysis_proportion` of the original data remains in analysis sets, even
+#' if the nearest neighbor distances between analysis and assessment sets are
+#' still lower than those between training and prediction locations.
+#'
+#' @return A tibble with classes `spatial_nndm_cv`, `spatial_rset`, `rset`,
+#'   `tbl_df`, `tbl`, and `data.frame`. The results include a column for the
+#'   data split objects and an identification variable `id`.
+#'
+#' @references
+#' C. Milà, J. Mateu, E. Pebesma, and H. Meyer. 2022. "Nearest Neighbour
+#' Distance Matching Leave-One-Out Cross-Validation for map validation." Methods
+#' in Ecology and Evolution 2022:13, pp 1304–1316.
+#' doi: 10.1111/2041-210X.13851.
+#'
+#' H. Meyer and E. Pebesma. 2022. "Machine learning-based global maps of
+#' ecological variables and the challenge of assessing them."
+#' Nature Communications 13, pp 2208. doi: 10.1038/s41467-022-29838-9.
+#'
+#' @examplesIf rlang::is_installed("modeldata")
+#' data(ames, package = "modeldata")
+#' ames_sf <- sf::st_as_sf(ames, coords = c("Longitude", "Latitude"), crs = 4326)
+#'
+#' # Using a small subset of the data, to make the example run faster:
+#' spatial_nndm_cv(ames_sf[1:200, ], ames_sf[2001:2200, ])
+#'
+#' @export
+spatial_nndm_cv <- function(data, prediction_sites, ...,
+                            autocorrelation_range = NULL,
+                            prediction_sample_size = 1000,
+                            min_analysis_proportion = 0.5) {
+  # Data validation: check that all dots are used,
+  # that data and prediction_sites are sf objects,
+  # that data has a CRS and s2 is enabled if necessary
+  rlang::check_dots_used()
+
+  standard_checks(data, "`spatial_nndm_cv()`", rlang::current_env())
+  if (!is_sf(prediction_sites)) {
+    rlang::abort(
+      c(
+        glue::glue("`spatial_nndm_cv()` currently only supports `sf` objects."),
+        i = "Try converting `prediction_sites` to an `sf` object via `sf::st_as_sf()`."
+      )
+    )
+  }
+
+  # sf::st_distance won't reproject automatically, so if prediction_sites
+  # isn't already aligned with data, reproject coordinates to prevent
+  # distance calculations from failing
+  if (!isTRUE(sf::st_crs(prediction_sites) == sf::st_crs(data))) {
+    rlang::warn(
+      c(
+        "Reprojecting `prediction_sites` to match the CRS of `data`.",
+        i = "Reproject `prediction_sites` and `data` to share a CRS to avoid this warning."
+      )
+    )
+    if (is.na(sf::st_crs(prediction_sites))) {
+      # No CRS at all: there is nothing to transform from, so the CRS of
+      # `data` is simply assigned
+      prediction_sites <- sf::st_set_crs(prediction_sites, sf::st_crs(data))
+    } else {
+      prediction_sites <- sf::st_transform(prediction_sites, sf::st_crs(data))
+    }
+  }
+
+  # Attributes that will be attached to the rset object
+  # Importantly this is before we sample prediction_sites
+  # or compute autocorrelation_range,
+  # primarily for compatibility with rsample::reshuffle_rset()
+  cv_att <- list(
+    prediction_sites = prediction_sites,
+    prediction_sample_size = prediction_sample_size,
+    autocorrelation_range = autocorrelation_range,
+    min_analysis_proportion = min_analysis_proportion,
+    ...
+  )
+
+  ######## Actual processing begins here ########
+  # "If any element of `prediction_sites` is not a single point,
+  # then points are sampled from within the bounding box of `prediction_sites`"
+  # Because an sf object can contain multiple geometry types,
+  # we check both for length > 1 (in order to avoid the "condition has length"
+  # error) and to see if the input is already only points
+  pred_geometry <- unique(sf::st_geometry_type(prediction_sites))
+  if (length(pred_geometry) > 1 || pred_geometry != "POINT") {
+    prediction_sites <- sf::st_sample(
+      x = sf::st_as_sfc(sf::st_bbox(prediction_sites)),
+      size = prediction_sample_size,
+      ...
+    )
+  }
+
+  # Set autocorrelation range, if not specified, to be the distance between
+  # the bottom-left and upper-right corners of prediction_sites --
+  # the idea being that this is the maximum relevant distance for
+  # autocorrelation, and there's limited harm in assuming too long a range
+  # (at least, versus too short)
+  #
+  # We do this after sampling for 1:1 compatibility with CAST
+  if (is.null(autocorrelation_range)) {
+    bbox <- sf::st_bbox(prediction_sites)
+
+    # st_distance() on the two corner points gives a 2 x 2 matrix;
+    # element [2] is the corner-to-corner distance
+    autocorrelation_range <- sf::st_distance(
+      sf::st_as_sf(
+        data.frame(
+          lon = bbox[c("xmin", "xmax")],
+          lat = bbox[c("ymin", "ymax")]
+        ),
+        coords = c("lon", "lat"),
+        crs = sf::st_crs(prediction_sites)
+      )
+    )[2]
+  }
+
+  # Distance from each prediction point to its nearest training point
+  dist_to_nn_prediction <- apply(
+    sf::st_distance(prediction_sites, data),
+    1,
+    min
+  )
+
+  distance_matrix <- sf::st_distance(data)
+
+  # We've enforced that prediction_sites and data are in the same CRS;
+  # therefore nearest_neighbors and distance_matrix are in the same units
+  # Force autocorrelation_range into the same units:
+  units(autocorrelation_range) <- units(distance_matrix)
+
+  # We're guaranteed to be working in one set of units now,
+  # which means we should be able to drop units entirely at this point
+  # (which should make some of the logic here easier)
+  units(autocorrelation_range) <- NULL
+  units(distance_matrix) <- NULL
+
+  # A point is never its own neighbor
+  diag(distance_matrix) <- NA
+  dist_to_nn_training <- apply(distance_matrix, 1, min, na.rm = TRUE)
+
+  current_neighbor <- list(
+    distance = min(dist_to_nn_training),
+    row = which.min(dist_to_nn_training)[1]
+  )
+  current_neighbor$col <- which.min(distance_matrix[current_neighbor$row, ])
+
+  n_training <- nrow(data)
+
+  # Core loop: try to match the empirical nearest neighbor distribution curves
+  # (adjusting the training:training curve to that of prediction:training)
+  while (current_neighbor$distance <= autocorrelation_range) {
+    # Proportion of training data with a neighbor in training
+    # closer than current_neighbor$distance if we removed one additional point
+    # (hence 1 / n_training)
+    prop_close_training <-
+      mean(dist_to_nn_training <= current_neighbor$distance) - (1 / n_training)
+    # Proportion of prediction data with a neighbor in training data
+    # closer than current_neighbor$distance
+    prop_close_prediction <- mean(
+      dist_to_nn_prediction <= current_neighbor$distance
+    )
+
+    # How much data remains in analysis sets?
+    prop_remaining <- sum(
+      !is.na(distance_matrix[current_neighbor$row, ])
+    ) / n_training
+
+    # Both operands are scalars, so use the short-circuiting `&&`
+    if ((prop_close_training >= prop_close_prediction) &&
+      (prop_remaining > min_analysis_proportion)) {
+
+      # Remove nearest neighbors from analysis sets until the % of points with
+      # an NN in analysis at distance X in analysis ~= the % of points
+      # in predict with NN in train at distance X
+      distance_matrix[current_neighbor$row, current_neighbor$col] <- NA
+
+      # Only this row of the matrix changed, so only its nearest neighbor
+      # distance needs recomputing (rather than every row's minimum)
+      dist_to_nn_training[current_neighbor$row] <- min(
+        distance_matrix[current_neighbor$row, ],
+        na.rm = TRUE
+      )
+
+      # Then update "distance X" to be the next nearest neighbor
+      #
+      # We just set the distance at current_neighbor to NA,
+      # so using >= won't just select the same neighbor over and over again
+      current_neighbor <- find_next_neighbor(
+        current_neighbor,
+        dist_to_nn_training,
+        distance_matrix,
+        equal_distance_ok = TRUE
+      )
+    } else {
+      # If prop_close_training < prop_close_prediction,
+      # we don't need to remove the current point;
+      # as such, we need to find a distance >, rather than >=,
+      # to the current neighbor
+      # (or else we'd loop on this point forever)
+      current_neighbor <- find_next_neighbor(
+        current_neighbor,
+        dist_to_nn_training,
+        distance_matrix,
+        equal_distance_ok = FALSE
+      )
+    }
+
+    # Stop once no nearest neighbor distance exceeds the current one
+    if (!any(dist_to_nn_training > current_neighbor$distance)) {
+      break
+    }
+  }
+
+  # One fold per observation: the observation itself is the assessment set,
+  # and every point still present in its row of the matrix is in analysis
+  indices <- purrr::map(
+    seq_len(nrow(distance_matrix)),
+    function(i) {
+      list(
+        analysis = which(!is.na(distance_matrix[i, ])),
+        assessment = i
+      )
+    }
+  )
+
+  split_objs <- purrr::map(
+    indices,
+    make_splits,
+    data = data,
+    class = c("spatial_nndm_split", "spatial_rsplit")
+  )
+
+  new_rset(
+    splits = split_objs,
+    ids = names0(length(split_objs), "Fold"),
+    attrib = cv_att,
+    subclass = c("spatial_nndm_cv", "spatial_rset", "rset")
+  )
+}
+
+# Advance `current_neighbor` to the next nearest-neighbor distance.
+#
+# `current_neighbor` is a list with elements `distance`, `row` and `col`.
+# The new distance is the smallest value of `dist_to_nn_training` that is
+# greater than the current distance (or greater than or equal to it when
+# `equal_distance_ok = TRUE`). `row` becomes the first position in
+# `dist_to_nn_training` holding that value, and `col` every column of that
+# row of `distance_matrix` equal to it. Returns the updated list.
+find_next_neighbor <- function(current_neighbor,
+                               dist_to_nn_training,
+                               distance_matrix,
+                               equal_distance_ok = FALSE) {
+  threshold <- current_neighbor$distance
+
+  # Candidates are distances past (or at, when ties are allowed) the threshold
+  is_candidate <- if (equal_distance_ok) {
+    dist_to_nn_training >= threshold
+  } else {
+    dist_to_nn_training > threshold
+  }
+
+  next_distance <- min(dist_to_nn_training[is_candidate])
+  next_row <- which(dist_to_nn_training == next_distance)[1]
+
+  current_neighbor$distance <- next_distance
+  current_neighbor$row <- next_row
+  current_neighbor$col <- which(distance_matrix[next_row, ] == next_distance)
+  current_neighbor
+}
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,8 @@ delayedAssign("rset_subclasses", {`
`17`	`17`	`spatial_block_cv = spatial_block_cv(test_data()),`
`18`	`18`	`spatial_clustering_cv = spatial_clustering_cv(test_data()),`
`19`	`19`	`spatial_buffer_vfold_cv = spatial_buffer_vfold_cv(test_data(), radius = 1, buffer = 1),`
`20`		`- spatial_leave_location_out_cv = spatial_leave_location_out_cv(test_data(), idx)`
	`20`	`+ spatial_leave_location_out_cv = spatial_leave_location_out_cv(test_data(), idx),`
	`21`	`+ spatial_nndm_cv = spatial_nndm_cv(test_data()[1:500, ], test_data()[501:682, ])`
`21`	`22`	`)`
`22`	`23`	`)`
`23`	`24`	`} else {`