Skip to content

Commit 822d8c5

Browse files
authored
Merge pull request #53 from BiodiversiteQuebec/feature/inject-teledetection-data
🐛 fix invalid date validation
2 parents d5b7b0d + 1f9be74 commit 822d8c5

6 files changed

+216
-68
lines changed

R/coleo_validate.R

+130-67
Original file line numberDiff line numberDiff line change
@@ -271,13 +271,13 @@ coleo_validate <- function(data, media_path = NULL) {
271271
# - Test only for campaigns with obs_species
272272
#------------------------------------------------------------------------
273273
# Identify columns that need and need not to be NA if campaigns are empty
274-
no_na_tbls <- c("cells", "sites", "campaigns", "efforts", "environments", "devices", "lures", "traps", "landmarks", "samples", "thermographs")
275-
which_no_na_tbls <- sapply(no_na_tbls, function(x) grepl(x, dat_names) |> which()) |> unlist() |> unique()
276-
na_cols <- dat_names[-which_no_na_tbls]
274+
no_na_tbls <- c("cells", "sites", "campaigns", "efforts", "environments", "devices", "lures", "traps", "landmarks", "samples", "thermographs")
275+
which_no_na_tbls <- sapply(no_na_tbls, function(x) grepl(x, dat_names) |> which()) |> unlist() |> unique()
276+
na_cols <- dat_names[-which_no_na_tbls]
277+
no_obs <- 0
277278
if ("obs_species_taxa_name" %in% dat_names) {
278279
# Loop through rows to validate that observations related fields are NA if no observations
279280
row_not_empty <- c()
280-
no_obs <- 0
281281
for (row in 1:nrow(data)) {
282282
is_obs_na <- data$obs_species_taxa_name[row] |> is.na()
283283
## If no observation, then all fields of taxonomic level equal or lower to the observation need to be NA
@@ -454,80 +454,26 @@ coleo_validate <- function(data, media_path = NULL) {
454454
grepl("_date", tbl$noms_de_colonnes, fixed = TRUE)]
455455
cols_date <- cols_date_name[cols_date_name %in% dat_names]
456456

457-
458457
if(length(cols_date) > 0) {
458+
## Validate number of digits
459+
date_digits_message <- coleo_validate_date_digits(data, dat_names, cols_date)
460+
if (!is.na(date_digits_message)) warning(date_digits_message)
459461

460-
# Function to check if a date has the right number of digits
461-
has_valid_digits <- function(date) {
462-
date_parts <- strsplit(date, "-", fixed = TRUE)[[1]]
463-
all(nchar(date_parts) == c(4, 2, 2))
464-
}
465-
466-
# Check date columns
467-
cols_ndigits <- sapply(cols_date, function(x) {
468-
dates <- data[[x]]
469-
non_na_dates <- dates[!is.na(dates)]
470-
471-
# Check if all dates have 3 parts (year, month, day)
472-
all_dates_valid <- all(sapply(non_na_dates, function(date) sum(length(strsplit(date, "-", fixed = TRUE)[[1]])) == 3))
473-
474-
# Check if all dates have the right number of digits
475-
all_digits_valid <- all(sapply(non_na_dates, has_valid_digits))
476-
477-
all_dates_valid && all_digits_valid
478-
})
479-
480-
is_ndigits_valid <- all(cols_ndigits)
481-
482-
## Remove columns with all NA
483-
all_na = sapply(cols_date, function(x) {
484-
dates <- data[[x]]
485-
non_na_dates <- dates[!is.na(dates)]
486-
length(non_na_dates) == 0
487-
})
488-
non_na_date_cols <- cols_date[!all_na]
489-
is_na <- any(is.na(data[cols_date[cols_date %in% req_columns]])) | any(is.na(data[non_na_date_cols]))
490-
491-
if(!is_ndigits_valid) warning("--------------------------------------------------\nV\u00E9rifiez le format des valeurs de dates. Les dates doivent \u00EAtre du format YYYY-MM-DD.\n\n")
492-
if(is_na) warning("--------------------------------------------------\nCertaines valeurs de date sont manquantes ou NA. Les lignes sans valeurs dans les colonnes campaigns_opened_at et observations_date_obs ne seront pas injectées dans leurs tables respectives.\n\n")
462+
## Check if required date columns has NA values
463+
date_na_message <- coleo_validate_required_date_na(data, cols_date, tbl)
464+
if (!is.na(date_na_message)) warning(date_na_message)
493465
}
494466

495-
496467
#------------------------------------------------------------------------
497468
# Diagnistics
498469
#
499470
# - Check that dates are within a decent range
500471
# - Check that hours are within a decent range
501472
# - Check number of campaigns, empty campaigns, observations
502473
#------------------------------------------------------------------------
503-
# Check that the values are within a decent range -----------------------
504-
if(length(non_na_date_cols) > 0) {
505-
# Year
506-
range_year <- sapply(data[non_na_date_cols], function(x) {
507-
split <- strsplit(unlist(x), "-", fixed = TRUE)
508-
split <- split[!is.na(split)]
509-
range(as.numeric(sapply(split, `[[`, 1)))
510-
}) |>
511-
range()
512-
# Month
513-
range_month <- sapply(data[non_na_date_cols], function(x) {
514-
split <- strsplit(unlist(x), "-", fixed = TRUE)
515-
split <- split[!is.na(split)]
516-
range(as.numeric(sapply(split, `[[`, 2)))
517-
}) |>
518-
range()
519-
# Day
520-
range_day <- sapply(data[non_na_date_cols], function(x) {
521-
split <- strsplit(unlist(x), "-", fixed = TRUE)
522-
split <- split[!is.na(split)]
523-
range(as.numeric(sapply(split, `[[`, 3)))
524-
}) |>
525-
range()
526-
527-
message(paste0("==================================================\n\nValidation diagnostique :\n",
528-
if ("obs_species_taxa_name" %in% dat_names) paste0("\n- V\u00E9rifiez les lignes qui repr\u00E9sentent des campagnes vides : il y a ", no_obs, " lignes sans observations. Celles-ci entraineront une erreur lors de l'injection des observations.\n"),
529-
"\n- V\u00E9rifiez que l'intervalle des dates", paste0(" inject\u00E9", "es "), "correspond aux attentes. Les valeurs de dates des colonnes ", paste0(cols_date, collapse = ", "), " se trouvent dans l'intervalle de", paste0(" l'ann\u00E9", "e "), range_year[1], " \u00E0 ", range_year[2], " du mois ", range_month[1], " \u00E0 ", range_month[2], " et du jour ", range_day[1], " \u00E0 ", range_day[2], ".\n\n- Si les", paste0(" donn\u00E9", "es"), " sont bonnes et qu'aucun autre message n'apparait, vous pouvez", paste0(" proc\u00E9", "der"), " \u00e0 l'injection des", paste0(" donn\u00E9", "es."), '\n'))
530-
}
474+
# Check that the dates values are within a decent range ---------------------
475+
date_range_message <- coleo_validate_date_range(data, cols_date)
476+
if (!is.na(date_range_message)) message(date_range_message)
531477

532478
# Check number of entries per table -------------------------------------
533479
message("---\n\nRésumé des injections par table :\n")
@@ -678,3 +624,120 @@ missing_obs <- function(data, nvals){
678624

679625
return(message)
680626
}
627+
628+
629+
#' Validation du format des dates
630+
#'
631+
#'
632+
#' @param data Le dataframe contenant les données à injecter.
633+
#' @param dat_names Les noms des colonnes du dataframe.
634+
#' @param cols_date Les noms des colonnes contenant des date.
635+
#'
636+
#' @return Le message de validation.
637+
#'
638+
coleo_validate_date_digits <- function(data, dat_names, cols_date) {
639+
# Initiate message
640+
message <- NA
641+
642+
# Helper function to check if a date has the right number of digits
643+
has_valid_digits <- function(date) {
644+
date_parts <- strsplit(date, "-", fixed = TRUE)[[1]]
645+
all(nchar(date_parts) == c(4, 2, 2))
646+
}
647+
648+
# Check date columns
649+
cols_ndigits <- sapply(cols_date, function(x) {
650+
dates <- data[[x]]
651+
non_na_dates <- dates[!is.na(dates)]
652+
653+
# Check if all dates have 3 parts (year, month, day)
654+
all_dates_valid <- all(sapply(non_na_dates, function(date) sum(length(strsplit(date, "-", fixed = TRUE)[[1]])) == 3))
655+
656+
# Check if all dates have the right number of digits
657+
all_digits_valid <- all(sapply(non_na_dates, has_valid_digits))
658+
659+
all_dates_valid && all_digits_valid
660+
})
661+
is_ndigits_valid <- all(cols_ndigits)
662+
663+
664+
if(!is_ndigits_valid) message <- "--------------------------------------------------\nV\u00E9rifiez le format des valeurs de dates. Les dates doivent \u00EAtre du format YYYY-MM-DD.\n\n"
665+
666+
return(message)
667+
}
668+
669+
670+
#' Validation de la présence de valeurs NA dans les colonnes de date
671+
#'
672+
#'
673+
#' @param data Le dataframe contenant les données à injecter.
674+
#' @param cols_date Les noms des colonnes contenant des date.
675+
#' @param tbl La table de la base de données à laquelle les données seront injectées.
676+
#'
677+
#' @return Le message de validation.
678+
#'
679+
coleo_validate_required_date_na <- function(data, cols_date, tbl) {
680+
# Initiate message
681+
message <- NA
682+
683+
all_na <- coleo_validate_empty_cols(data, cols_date)
684+
685+
non_na_date_cols <- cols_date[!all_na]
686+
req_cols <- tbl[tbl$colonne_requise==TRUE,]$noms_de_colonnes
687+
is_na <- any(is.na(data[cols_date[cols_date %in% req_cols]])) | any(is.na(data[non_na_date_cols]))
688+
689+
690+
if(is_na) message <- ("--------------------------------------------------\nCertaines valeurs de date sont manquantes ou NA. Les lignes sans valeurs dans les colonnes campaigns_opened_at, observations_date_obs ou remote_sensing_events_date_start ne seront pas injectées dans leurs tables respectives.\n\n")
691+
692+
return(message)
693+
}
694+
695+
696+
#' Validation des colonnes vides
697+
#'
698+
#'
699+
#' @param data Le dataframe contenant les données à injecter.
700+
#' @param columns Les noms des colonnes à valider.
701+
#'
702+
#' @return Le vecteur booléen représentant si la colonne est vide.
703+
#'
704+
coleo_validate_empty_cols <- function(data, columns) {
705+
all_na <- sapply(columns, function(x) {
706+
cols <- data[[x]]
707+
non_na_cols <- cols[!is.na(cols)]
708+
length(non_na_cols) == 0
709+
})
710+
711+
return(all_na)
712+
}
713+
714+
715+
#' Validation de la présence de valeurs NA dans les colonnes de date
716+
#'
717+
#'
718+
#' @param data Le dataframe contenant les données à injecter.
719+
#' @param cols_date Les noms des colonnes contenant des dates.
720+
#' @param no_obs Le nombre d'observations sans taxon.
721+
#'
722+
#' @return Le message de validation.
723+
#'
724+
coleo_validate_date_range <- function(data, cols_date, no_obs = 0) {
725+
dates <- unlist(data[cols_date])
726+
dates <- dates[!is.na(dates)]
727+
# Extract date parts for valid dates
728+
split <- strsplit(dates, "-", fixed = TRUE)
729+
split <- split[sapply(split, length) == 3]
730+
if (length(split) == 0) return(NA)
731+
# Year
732+
range_year <- range(as.numeric(sapply(split, `[[`, 1)))
733+
# Month
734+
range_month <- range(as.numeric(sapply(split, `[[`, 2)))
735+
# Day
736+
range_day <- range(as.numeric(sapply(split, `[[`, 3)))
737+
738+
message <- paste0("==================================================\n\nValidation diagnostique :\n",
739+
if ("obs_species_taxa_name" %in% names(data)) paste0("\n- V\u00E9rifiez les lignes qui repr\u00E9sentent des campagnes vides : il y a ", no_obs, " lignes sans observations. Celles-ci entraineront une erreur lors de l'injection des observations.\n"),
740+
"\n- V\u00E9rifiez que l'intervalle des dates", paste0(" inject\u00E9", "es "), "correspond aux attentes. Les valeurs de dates des colonnes ", paste0(cols_date, collapse = ", "), " se trouvent dans l'intervalle de", paste0(" l'ann\u00E9", "e "), range_year[1], " \u00E0 ", range_year[2], " du mois ", range_month[1], " \u00E0 ", range_month[2], " et du jour ", range_day[1], " \u00E0 ", range_day[2], ".\n\n- Si les", paste0(" donn\u00E9", "es"), " sont bonnes et qu'aucun autre message n'apparait, vous pouvez", paste0(" proc\u00E9", "der"), " \u00e0 l'injection des", paste0(" donn\u00E9", "es."), '\n')
741+
742+
return(message)
743+
}

man/coleo_validate_date_digits.Rd

+21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/coleo_validate_date_range.Rd

+21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/coleo_validate_empty_cols.Rd

+19
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/coleo_validate_required_date_na.Rd

+21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-coleo_validate.R

+4-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ test_that("coleo_validate", {
155155
## Test that date format respects the YYYY-MM-DD convention
156156
dat_test <- dat
157157
dat_test$observations_date_obs <- "95-05-15"
158-
158+
testthat::expect_warning(coleo_validate(dat_test),
159+
regexp = "Vérifiez le format des valeurs de dates.*")
160+
dat_test <- dat
161+
dat_test$observations_date_obs <- "2020_05_15"
159162
testthat::expect_warning(coleo_validate(dat_test),
160163
regexp = "Vérifiez le format des valeurs de dates.*")
161164

0 commit comments

Comments
 (0)