Skip to content

Commit 2e018c3

Browse files
committed
✨ refactor for teledetection data validation #49
1 parent 426f3fc commit 2e018c3

7 files changed

+127
-66
lines changed

R/coleo_format_utils.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
#' @export
99
#'
1010
coleo_format <- function(dataFrame) {
11-
# DataFrame infos
12-
campaign_type <- unique(dataFrame$campaigns_type)
13-
assertthat::assert_that(length(campaign_type) == 1,
14-
msg = "Le type de campagne doit être unique")
11+
# Get required columns for the campaign type
12+
campaign_type <- coleo_return_campaign_type(dataFrame)
1513
data_cols <- coleo_return_cols(campaign_type)
1614

1715
# Format columns

R/coleo_get_required_tables.R

+27-25
Large diffs are not rendered by default.

R/coleo_prep_input_data.R

+4-3
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ coleo_prep_input_data <- function(df, db_table, schema = "public") {
2020
# Convert character NAs to actual NAs
2121
# df[df == "NA"] <- NA
2222

23-
# Add cell_id to sites table
24-
if (db_table == "sites") {
23+
# Add cell_id to required table
24+
if (db_table == "sites" | db_table == "remote_sensing_events") {
2525
df <- df |>
2626
dplyr::nest_by(cells_cell_code) |>
2727
dplyr::mutate(coleo_id = list(coleo_request_by_code(human_code = cells_cell_code, table = "cells", schema = schema)),
2828
cell_id = coleo_extract_id(coleo_id)) |>
2929
dplyr::select(-cells_cell_code, -coleo_id) |>
3030
dplyr::relocate(cell_id) |>
3131
tidyr::unnest(cols = c(data)) |>
32-
dplyr::ungroup()
32+
dplyr::ungroup() |>
33+
suppressMessages()
3334
}
3435

3536
# Campaigns table specific manipulations

R/coleo_read.R

+10-10
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ coleo_read <- function(filePath) {
1616
# Catch the file extension
1717
ex <- strsplit(basename(filePath), split="\\.")[[1]][2]
1818

19+
# Test that the extension is acceptable
20+
if (!ex %in% c("csv", "xls", "xlsx", "shp")) {
21+
stop("Seuls les documents csv, shp et les gabarits coleo sont pris en charge au moment. Veuillez soumettre les donn\u00e9es dans un format support\u00e9.")
22+
}
23+
1924
# Read the data
2025
if (ex == "csv") dataFile <- coleo_read_csv(filePath)
2126
else if (ex %in% c("xls", "xlsx")) dataFile <- coleo_read_template(filePath)
@@ -29,20 +34,15 @@ coleo_read <- function(filePath) {
2934
# Format the data
3035
# - Except for shapefiles
3136
#-----------------------------------------------------------------------------
32-
# Test that the extension is acceptable
33-
if (!ex %in% c("csv", "xls", "xlsx")) stop("Seuls les documents csv, shp et les gabarits coleo sont pris en charge au moment. Veuillez soumettre les donn\u00e9es dans un format support\u00e9.")
34-
else if ("campaigns_type" %in% names(dataFile)) {
35-
# Format dataset
36-
dataFile <- coleo_format(dataFile)
37-
38-
return(dataFile)
39-
} else if ("sites_type" %in% colnames(dataFile)) {
37+
if ("sites_type" %in% colnames(dataFile)) {
4038
# Format sites dataset
4139
dataFile$sites_lat <- as.numeric(dataFile$sites_lat)
4240
dataFile$sites_lon <- as.numeric(dataFile$sites_lon)
4341

44-
return(dataFile)
45-
}
42+
} else {
43+
# Format dataset
44+
dataFile <- coleo_format(dataFile)
45+
}
4646

4747
return(dataFile)
4848
}

R/coleo_return.R

+27-2
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,14 @@ coleo_return_required_name_table <- function(db_table) {
5151

5252

5353

54-
#' Retourne un vecteur contenant les noms valides de campagnes
54+
#' Retourne un vecteur contenant les noms valides de campagnes incluant les indicateurs de télédétection
5555
#'
5656
#'
5757
#' @return vecteur de charactères contenant tous les types de capagnes valides
5858
#' @export
5959
#'
6060
coleo_return_valid_campaigns <- function(){
61-
campaigns <- coleo_get_enum_values("enum_campaigns_type")
61+
campaigns <- c(coleo_get_enum_values("enum_campaigns_type"), coleo_get_enum_values("enum_remote_sensing_indicators_name"))
6262

6363
return(campaigns)
6464
}
@@ -97,3 +97,28 @@ coleo_return_required_tables <- function(camp_type) {
9797

9898
return(tbls)
9999
}
100+
101+
102+
#' Retourne le 'campaign_type' d'un jeu de données pour tous types d'inventaires.
103+
#'
104+
#' @param data un jeu de données
105+
#'
106+
#' @return le 'campaign_type' du jeu de données
107+
#'
108+
#' @export
109+
#'
110+
coleo_return_campaign_type <- function(data) {
111+
# Get required columns for the campaign type
112+
if (is.null(data$campaigns_type)) {
113+
# Remote sensing campaigns
114+
campaign_type <- unique(data$remote_sensing_indicators_name)
115+
} else {
116+
campaign_type <- unique(data$campaigns_type)
117+
}
118+
119+
# Check if there is a single campaign type
120+
if (length(campaign_type) > 1) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type (ou remote_sensing_indicators_name) sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
121+
if (length(campaign_type) == 0) stop("V\u00E9rifiez qu'une colonne contient le type d'inventaire (campaigns_type ou remote_sensing_indicators_name) et que son nom de colonne correspond \u00e0 campaigns_type \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
122+
123+
return(campaign_type)
124+
}

R/coleo_return_cols.R

+18-9
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,8 @@ coleo_return_cols <- function(campaign_type, required.columns = FALSE) {
1010
#-------------------------------------------------------------------------------
1111
# Vérifier que campaign_type est un choix valide
1212
#-------------------------------------------------------------------------------
13-
all_camp_types <- coleo_request_general("rpc/get_enum_values",
14-
'enum_type' = "enum_campaigns_type", response_as_df = TRUE) |>
15-
unlist(use.names = FALSE)
16-
assertthat::assert_that(campaign_type %in% all_camp_types,
17-
msg = "Entrez un type de campagne valide")
13+
campaigns <- coleo_return_valid_campaigns()
14+
if (!campaign_type %in% campaigns) stop("Entrez un type de campagne valide")
1815
#-------------------------------------------------------------------------------
1916
# Tables requises pour un type de campagne
2017
#-------------------------------------------------------------------------------
@@ -63,14 +60,26 @@ coleo_return_cols <- function(campaign_type, required.columns = FALSE) {
6360
classe = classe,
6461
valeurs_acceptees = valeurs_acceptees))
6562
#-------------------------------------------------------------------------------
66-
# site_code est requis pour l'injection
63+
# site_code ou cell_code est requis pour l'injection
6764
#-------------------------------------------------------------------------------
68-
site_code_row <- data.frame(table = "sites",
69-
noms_de_champs = "site_code",
65+
is_remote_sensing <- campaign_type %in% coleo_get_enum_values("enum_remote_sensing_indicators_name")
66+
location_code_row <- data.frame(table = ifelse(is_remote_sensing, "cells", "sites"),
67+
noms_de_champs = ifelse(is_remote_sensing, "cell_code", "site_code"),
7068
colonne_requise = "TRUE",
7169
classe = "character",
7270
valeurs_acceptees = NA_character_)
73-
df <- rbind(df,site_code_row)
71+
df <- rbind(df, location_code_row)
72+
#-------------------------------------------------------------------------------
73+
# remote_sensing_indicators_name est requis pour l'injection des inventaires de télédétection
74+
#-------------------------------------------------------------------------------
75+
if (is_remote_sensing) {
76+
remote_sensing_row <- data.frame(table = "remote_sensing_indicators",
77+
noms_de_champs = "name",
78+
colonne_requise = "TRUE",
79+
classe = "character",
80+
valeurs_acceptees = coleo_get_enum_values("enum_remote_sensing_indicators_name"))
81+
df <- rbind(df, remote_sensing_row)
82+
}
7483
#-------------------------------------------------------------------------------
7584
# Special column class pour injection
7685
#-------------------------------------------------------------------------------

R/coleo_validate.R

+39-13
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,9 @@ coleo_validate <- function(data, media_path = NULL) {
1818
#------------------------------------------------------------------------
1919
# Check that there is a campaign type column and that it contains a unique value
2020
#------------------------------------------------------------------------
21-
# Pursue only if there is a campaign type column
22-
if(!assertthat::has_name(data, "campaigns_type")) stop("V\u00E9rifiez qu'une colonne contient le type de campagne et que son nom de colonne correspond \u00e0 campaigns_type \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
23-
24-
# Pursue only if there is a campaign type value
25-
campaign_type <- unique(data$campaigns_type)
21+
campaign_type <- coleo_return_campaign_type(data)
2622
campaigns <- coleo_return_valid_campaigns()
27-
if(!(length(campaign_type) == 1 && campaign_type %in% campaigns)) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
23+
if(!campaign_type %in% campaigns) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
2824

2925

3026
#------------------------------------------------------------------------
@@ -65,7 +61,7 @@ coleo_validate <- function(data, media_path = NULL) {
6561
# Check that the imported data has all of the required columns
6662
#------------------------------------------------------------------------
6763
# Compare required column names to present columns ----------------------
68-
req_columns <- coleo_return_cols(campaign_type, required.columns = TRUE)$noms_de_colonnes
64+
req_columns <- tbl[tbl$colonne_requise==TRUE,]$noms_de_colonnes
6965
req_col_diff <- setdiff(req_columns, true_nms)
7066
# Remove media table names from required columns
7167
media_names <- grepl("media_", req_col_diff, fixed = TRUE)
@@ -143,6 +139,23 @@ coleo_validate <- function(data, media_path = NULL) {
143139
erroneous_cols <- dat_names[!class_of_col]
144140
if(!all(class_of_col)) warning("--------------------------------------------------\nV\u00E9rifiez la classe des colonnes. Ces colonnes sont ", paste0("probl","\U00E9","matiques "), " : \n", paste0(erroneous_cols, collapse = ", "), "\n\n")
145141

142+
#------------------------------------------------------------------------
143+
# Check that all cells exists in coleo
144+
#------------------------------------------------------------------------
145+
if ("cells_cell_code" %in% dat_names) {
146+
existing_cells <- coleo_request_general(endpoint = "cells", response_as_df = TRUE, schema = 'public')
147+
148+
are_cells_exists <- all(unique(data$cells_cell_code) %in% existing_cells$cell_code)
149+
# Missing cells ---------------------------------------------------------
150+
cells_x <- which(!unique(data$cells_cell_code) %in% existing_cells$cell_code)
151+
if (length(cells_x) > 10) {
152+
missing_cells <- paste0(paste0(unique(data$cells_cell_code)[cells_x[1:10]], collapse = ", "), " [...",length(cells_x)-10," tronquées]")
153+
} else {
154+
missing_cells <- paste0(unique(data$cells_cell_code)[cells_x], collapse = ", ")
155+
}
156+
157+
if(!are_cells_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les cellules ", missing_cells, " de la colonne cells_cell_code ou injectez ces cellules dans la table cells de coleo. Ces cellules n'existent pas dans coleo.\n\n")
158+
}
146159

147160
#------------------------------------------------------------------------
148161
# Check that all sites exists in coleo
@@ -152,8 +165,13 @@ coleo_validate <- function(data, media_path = NULL) {
152165
are_sites_exists <- all(unique(data$sites_site_code) %in% existing_sites$site_code)
153166
# Missing sites ---------------------------------------------------------
154167
sites_x <- which(!unique(data$sites_site_code) %in% existing_sites$site_code)
168+
if (length(sites_x) > 10) {
169+
missing_sites <- paste0(paste0(unique(data$sites_site_code)[sites_x[1:10]], collapse = ", ")," [...",length(sites_x)-10," tronqués]")
170+
} else {
171+
missing_sites <- paste0(unique(data$sites_site_code)[sites_x], collapse = ", ")
172+
}
155173

156-
if(!are_sites_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les sites ", paste0(unique(data$sites_site_code)[sites_x], collapse = ", "), " de la colonne sites_site_code ou injectez ces sites dans la table sites de coleo. Ces sites n'existent pas dans coleo.\n\n")
174+
if(!are_sites_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les sites ", missing_sites, " de la colonne sites_site_code ou injectez ces sites dans la table sites de coleo. Ces sites n'existent pas dans coleo.\n\n")
157175

158176

159177
#------------------------------------------------------------------------
@@ -460,7 +478,15 @@ coleo_validate <- function(data, media_path = NULL) {
460478
})
461479

462480
is_ndigits_valid <- all(cols_ndigits)
463-
is_na <- any(is.na(data[cols_date]))
481+
482+
## Remove columns with all NA
483+
all_na = sapply(cols_date, function(x) {
484+
dates <- data[[x]]
485+
non_na_dates <- dates[!is.na(dates)]
486+
length(non_na_dates) == 0
487+
})
488+
non_na_date_cols <- cols_date[!all_na]
489+
is_na <- any(is.na(data[cols_date[cols_date %in% req_columns]])) | any(is.na(data[non_na_date_cols]))
464490

465491
if(!is_ndigits_valid) warning("--------------------------------------------------\nV\u00E9rifiez le format des valeurs de dates. Les dates doivent \u00EAtre du format YYYY-MM-DD.\n\n")
466492
if(is_na) warning("--------------------------------------------------\nCertaines valeurs de date sont manquantes ou NA. Les lignes sans valeurs dans les colonnes campaigns_opened_at et observations_date_obs ne seront pas injectées dans leurs tables respectives.\n\n")
@@ -475,23 +501,23 @@ coleo_validate <- function(data, media_path = NULL) {
475501
# - Check number of campaigns, empty campaigns, observations
476502
#------------------------------------------------------------------------
477503
# Check that the values are within a decent range -----------------------
478-
if(length(cols_date) > 0) {
504+
if(length(non_na_date_cols) > 0) {
479505
# Year
480-
range_year <- sapply(data[cols_date], function(x) {
506+
range_year <- sapply(data[non_na_date_cols], function(x) {
481507
split <- strsplit(unlist(x), "-", fixed = TRUE)
482508
split <- split[!is.na(split)]
483509
range(as.numeric(sapply(split, `[[`, 1)))
484510
}) |>
485511
range()
486512
# Month
487-
range_month <- sapply(data[cols_date], function(x) {
513+
range_month <- sapply(data[non_na_date_cols], function(x) {
488514
split <- strsplit(unlist(x), "-", fixed = TRUE)
489515
split <- split[!is.na(split)]
490516
range(as.numeric(sapply(split, `[[`, 2)))
491517
}) |>
492518
range()
493519
# Day
494-
range_day <- sapply(data[cols_date], function(x) {
520+
range_day <- sapply(data[non_na_date_cols], function(x) {
495521
split <- strsplit(unlist(x), "-", fixed = TRUE)
496522
split <- split[!is.na(split)]
497523
range(as.numeric(sapply(split, `[[`, 3)))

0 commit comments

Comments
 (0)