Skip to content

Commit b73dbb2

Browse files
authored
Merge pull request #51 from BiodiversiteQuebec/feature/validate-teledetection-data
✨ refactor for teledetection data validation #49
2 parents 426f3fc + 0ae5f7a commit b73dbb2

13 files changed

+173
-73
lines changed

DESCRIPTION

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: rcoleo
22
Title: rcoleo
3-
Version: 3.0.0
3+
Version: 3.1.0
44
Authors@R: c(
55
person("Steve", "Vissault", , "steve.vissault@usherbrooke.ca", role = "aut",
66
comment = c(ORCID = "0000-0002-0866-4376")),
@@ -14,8 +14,8 @@ Description: Injection et retrait des données collectées dans le cadre du
1414
programme de suivi de la biodiversité via l'API de Coléo, un système
1515
d'information sur la biodiversité du Québec.
1616
License: MIT + file LICENSE
17-
URL: https://github.com/ReseauBiodiversiteQuebec/rcoleo
18-
BugReports: https://github.com/ReseauBiodiversiteQuebec/rcoleo/issues
17+
URL: https://github.com/biodiversitequebec/rcoleo
18+
BugReports: https://github.com/biodiversitequebec/rcoleo/issues
1919
Depends:
2020
R (>= 4.1.0)
2121
Imports:

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export(coleo_request_by_code)
2020
export(coleo_request_data)
2121
export(coleo_request_general)
2222
export(coleo_resp_df)
23+
export(coleo_return_campaign_type)
2324
export(coleo_return_cols)
2425
export(coleo_return_required_tables)
2526
export(coleo_return_valid_campaigns)

R/coleo_format_utils.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
#' @export
99
#'
1010
coleo_format <- function(dataFrame) {
11-
# DataFrame infos
12-
campaign_type <- unique(dataFrame$campaigns_type)
13-
assertthat::assert_that(length(campaign_type) == 1,
14-
msg = "Le type de campagne doit être unique")
11+
# Get required columns for the campaign type
12+
campaign_type <- coleo_return_campaign_type(dataFrame)
1513
data_cols <- coleo_return_cols(campaign_type)
1614

1715
# Format columns

R/coleo_get_required_tables.R

+27-25
Large diffs are not rendered by default.

R/coleo_prep_input_data.R

+4-3
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ coleo_prep_input_data <- function(df, db_table, schema = "public") {
2020
# Convert character NAs to actual NAs
2121
# df[df == "NA"] <- NA
2222

23-
# Add cell_id to sites table
24-
if (db_table == "sites") {
23+
# Add cell_id to required table
24+
if (db_table == "sites" | db_table == "remote_sensing_events") {
2525
df <- df |>
2626
dplyr::nest_by(cells_cell_code) |>
2727
dplyr::mutate(coleo_id = list(coleo_request_by_code(human_code = cells_cell_code, table = "cells", schema = schema)),
2828
cell_id = coleo_extract_id(coleo_id)) |>
2929
dplyr::select(-cells_cell_code, -coleo_id) |>
3030
dplyr::relocate(cell_id) |>
3131
tidyr::unnest(cols = c(data)) |>
32-
dplyr::ungroup()
32+
dplyr::ungroup() |>
33+
suppressMessages()
3334
}
3435

3536
# Campaigns table specific manipulations

R/coleo_read.R

+10-10
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ coleo_read <- function(filePath) {
1616
# Catch the file extension
1717
ex <- strsplit(basename(filePath), split="\\.")[[1]][2]
1818

19+
# Test that the extension is acceptable
20+
if (!ex %in% c("csv", "xls", "xlsx", "shp")) {
21+
stop("Seuls les documents csv, shp et les gabarits coleo sont pris en charge au moment. Veuillez soumettre les donn\u00e9es dans un format support\u00e9.")
22+
}
23+
1924
# Read the data
2025
if (ex == "csv") dataFile <- coleo_read_csv(filePath)
2126
else if (ex %in% c("xls", "xlsx")) dataFile <- coleo_read_template(filePath)
@@ -29,20 +34,15 @@ coleo_read <- function(filePath) {
2934
# Format the data
3035
# - Except for shapefiles
3136
#-----------------------------------------------------------------------------
32-
# Test that the extension is acceptable
33-
if (!ex %in% c("csv", "xls", "xlsx")) stop("Seuls les documents csv, shp et les gabarits coleo sont pris en charge au moment. Veuillez soumettre les donn\u00e9es dans un format support\u00e9.")
34-
else if ("campaigns_type" %in% names(dataFile)) {
35-
# Format dataset
36-
dataFile <- coleo_format(dataFile)
37-
38-
return(dataFile)
39-
} else if ("sites_type" %in% colnames(dataFile)) {
37+
if ("sites_type" %in% colnames(dataFile)) {
4038
# Format sites dataset
4139
dataFile$sites_lat <- as.numeric(dataFile$sites_lat)
4240
dataFile$sites_lon <- as.numeric(dataFile$sites_lon)
4341

44-
return(dataFile)
45-
}
42+
} else {
43+
# Format dataset
44+
dataFile <- coleo_format(dataFile)
45+
}
4646

4747
return(dataFile)
4848
}

R/coleo_return.R

+27-2
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,14 @@ coleo_return_required_name_table <- function(db_table) {
5151

5252

5353

54-
#' Retourne un vecteur contenant les noms valides de campagnes
54+
#' Retourne un vecteur contenant les noms valides de campagnes incluant les indicateurs de télédétection
5555
#'
5656
#'
5757
#' @return vecteur de charactères contenant tous les types de capagnes valides
5858
#' @export
5959
#'
6060
coleo_return_valid_campaigns <- function(){
61-
campaigns <- coleo_get_enum_values("enum_campaigns_type")
61+
campaigns <- c(coleo_get_enum_values("enum_campaigns_type"), coleo_get_enum_values("enum_remote_sensing_indicators_name"))
6262

6363
return(campaigns)
6464
}
@@ -97,3 +97,28 @@ coleo_return_required_tables <- function(camp_type) {
9797

9898
return(tbls)
9999
}
100+
101+
102+
#' Retourne le 'campaign_type' d'un jeu de données pour tous types d'inventaires.
103+
#'
104+
#' @param data un jeu de données
105+
#'
106+
#' @return le 'campaign_type' du jeu de données
107+
#'
108+
#' @export
109+
#'
110+
coleo_return_campaign_type <- function(data) {
111+
# Get required columns for the campaign type
112+
if (is.null(data$campaigns_type)) {
113+
# Remote sensing campaigns
114+
campaign_type <- unique(data$remote_sensing_indicators_name)
115+
} else {
116+
campaign_type <- unique(data$campaigns_type)
117+
}
118+
119+
# Check if there is a single campaign type
120+
if (length(campaign_type) > 1) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type (ou remote_sensing_indicators_name) sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
121+
if (length(campaign_type) == 0) stop("V\u00E9rifiez qu'une colonne contient le type d'inventaire (campaigns_type ou remote_sensing_indicators_name) et que son nom de colonne correspond \u00e0 campaigns_type \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
122+
123+
return(campaign_type)
124+
}

R/coleo_return_cols.R

+18-9
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,8 @@ coleo_return_cols <- function(campaign_type, required.columns = FALSE) {
1010
#-------------------------------------------------------------------------------
1111
# Vérifier que campaign_type est un choix valide
1212
#-------------------------------------------------------------------------------
13-
all_camp_types <- coleo_request_general("rpc/get_enum_values",
14-
'enum_type' = "enum_campaigns_type", response_as_df = TRUE) |>
15-
unlist(use.names = FALSE)
16-
assertthat::assert_that(campaign_type %in% all_camp_types,
17-
msg = "Entrez un type de campagne valide")
13+
campaigns <- coleo_return_valid_campaigns()
14+
if (!campaign_type %in% campaigns) stop("Entrez un type de campagne valide")
1815
#-------------------------------------------------------------------------------
1916
# Tables requises pour un type de campagne
2017
#-------------------------------------------------------------------------------
@@ -63,14 +60,26 @@ coleo_return_cols <- function(campaign_type, required.columns = FALSE) {
6360
classe = classe,
6461
valeurs_acceptees = valeurs_acceptees))
6562
#-------------------------------------------------------------------------------
66-
# site_code est requis pour l'injection
63+
# site_code ou cell_code est requis pour l'injection
6764
#-------------------------------------------------------------------------------
68-
site_code_row <- data.frame(table = "sites",
69-
noms_de_champs = "site_code",
65+
is_remote_sensing <- campaign_type %in% coleo_get_enum_values("enum_remote_sensing_indicators_name")
66+
location_code_row <- data.frame(table = ifelse(is_remote_sensing, "cells", "sites"),
67+
noms_de_champs = ifelse(is_remote_sensing, "cell_code", "site_code"),
7068
colonne_requise = "TRUE",
7169
classe = "character",
7270
valeurs_acceptees = NA_character_)
73-
df <- rbind(df,site_code_row)
71+
df <- rbind(df, location_code_row)
72+
#-------------------------------------------------------------------------------
73+
# remote_sensing_indicators_name est requis pour l'injection des inventaires de télédétection
74+
#-------------------------------------------------------------------------------
75+
if (is_remote_sensing) {
76+
remote_sensing_row <- data.frame(table = "remote_sensing_indicators",
77+
noms_de_champs = "name",
78+
colonne_requise = "TRUE",
79+
classe = "character",
80+
valeurs_acceptees = coleo_get_enum_values("enum_remote_sensing_indicators_name"))
81+
df <- rbind(df, remote_sensing_row)
82+
}
7483
#-------------------------------------------------------------------------------
7584
# Special column class pour injection
7685
#-------------------------------------------------------------------------------

R/coleo_validate.R

+39-13
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,9 @@ coleo_validate <- function(data, media_path = NULL) {
1818
#------------------------------------------------------------------------
1919
# Check that there is a campaign type column and that it contains a unique value
2020
#------------------------------------------------------------------------
21-
# Pursue only if there is a campaign type column
22-
if(!assertthat::has_name(data, "campaigns_type")) stop("V\u00E9rifiez qu'une colonne contient le type de campagne et que son nom de colonne correspond \u00e0 campaigns_type \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
23-
24-
# Pursue only if there is a campaign type value
25-
campaign_type <- unique(data$campaigns_type)
21+
campaign_type <- coleo_return_campaign_type(data)
2622
campaigns <- coleo_return_valid_campaigns()
27-
if(!(length(campaign_type) == 1 && campaign_type %in% campaigns)) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
23+
if(!campaign_type %in% campaigns) stop("V\u00E9rifiez que toutes les valeurs de la colonne campaigns_type sont identiques et que la valeur est un type de campagne valide. \nLe type de campagne est n\u00E9cessaire pour les prochaines \u00E9tapes de validation.\n\n")
2824

2925

3026
#------------------------------------------------------------------------
@@ -65,7 +61,7 @@ coleo_validate <- function(data, media_path = NULL) {
6561
# Check that the imported data has all of the required columns
6662
#------------------------------------------------------------------------
6763
# Compare required column names to present columns ----------------------
68-
req_columns <- coleo_return_cols(campaign_type, required.columns = TRUE)$noms_de_colonnes
64+
req_columns <- tbl[tbl$colonne_requise==TRUE,]$noms_de_colonnes
6965
req_col_diff <- setdiff(req_columns, true_nms)
7066
# Remove media table names from required columns
7167
media_names <- grepl("media_", req_col_diff, fixed = TRUE)
@@ -143,6 +139,23 @@ coleo_validate <- function(data, media_path = NULL) {
143139
erroneous_cols <- dat_names[!class_of_col]
144140
if(!all(class_of_col)) warning("--------------------------------------------------\nV\u00E9rifiez la classe des colonnes. Ces colonnes sont ", paste0("probl","\U00E9","matiques "), " : \n", paste0(erroneous_cols, collapse = ", "), "\n\n")
145141

142+
#------------------------------------------------------------------------
143+
# Check that all cells exists in coleo
144+
#------------------------------------------------------------------------
145+
if ("cells_cell_code" %in% dat_names) {
146+
existing_cells <- coleo_request_general(endpoint = "cells", response_as_df = TRUE, schema = 'public')
147+
148+
are_cells_exists <- all(unique(data$cells_cell_code) %in% existing_cells$cell_code)
149+
# Missing cells ---------------------------------------------------------
150+
cells_x <- which(!unique(data$cells_cell_code) %in% existing_cells$cell_code)
151+
if (length(cells_x) > 10) {
152+
missing_cells <- paste0(paste0(unique(data$cells_cell_code)[cells_x[1:10]], collapse = ", "), " [...",length(cells_x)-10," tronquées]")
153+
} else {
154+
missing_cells <- paste0(unique(data$cells_cell_code)[cells_x], collapse = ", ")
155+
}
156+
157+
if(!are_cells_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les cellules ", missing_cells, " de la colonne cells_cell_code ou injectez ces cellules dans la table cells de coleo. Ces cellules n'existent pas dans coleo.\n\n")
158+
}
146159

147160
#------------------------------------------------------------------------
148161
# Check that all sites exists in coleo
@@ -152,8 +165,13 @@ coleo_validate <- function(data, media_path = NULL) {
152165
are_sites_exists <- all(unique(data$sites_site_code) %in% existing_sites$site_code)
153166
# Missing sites ---------------------------------------------------------
154167
sites_x <- which(!unique(data$sites_site_code) %in% existing_sites$site_code)
168+
if (length(sites_x) > 10) {
169+
missing_sites <- paste0(paste0(unique(data$sites_site_code)[sites_x[1:10]], collapse = ", ")," [...",length(sites_x)-10," tronqués]")
170+
} else {
171+
missing_sites <- paste0(unique(data$sites_site_code)[sites_x], collapse = ", ")
172+
}
155173

156-
if(!are_sites_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les sites ", paste0(unique(data$sites_site_code)[sites_x], collapse = ", "), " de la colonne sites_site_code ou injectez ces sites dans la table sites de coleo. Ces sites n'existent pas dans coleo.\n\n")
174+
if(!are_sites_exists) warning("--------------------------------------------------\n", paste0("V","\U00E9","rifiez")," les sites ", missing_sites, " de la colonne sites_site_code ou injectez ces sites dans la table sites de coleo. Ces sites n'existent pas dans coleo.\n\n")
157175

158176

159177
#------------------------------------------------------------------------
@@ -460,7 +478,15 @@ coleo_validate <- function(data, media_path = NULL) {
460478
})
461479

462480
is_ndigits_valid <- all(cols_ndigits)
463-
is_na <- any(is.na(data[cols_date]))
481+
482+
## Remove columns with all NA
483+
all_na = sapply(cols_date, function(x) {
484+
dates <- data[[x]]
485+
non_na_dates <- dates[!is.na(dates)]
486+
length(non_na_dates) == 0
487+
})
488+
non_na_date_cols <- cols_date[!all_na]
489+
is_na <- any(is.na(data[cols_date[cols_date %in% req_columns]])) | any(is.na(data[non_na_date_cols]))
464490

465491
if(!is_ndigits_valid) warning("--------------------------------------------------\nV\u00E9rifiez le format des valeurs de dates. Les dates doivent \u00EAtre du format YYYY-MM-DD.\n\n")
466492
if(is_na) warning("--------------------------------------------------\nCertaines valeurs de date sont manquantes ou NA. Les lignes sans valeurs dans les colonnes campaigns_opened_at et observations_date_obs ne seront pas injectées dans leurs tables respectives.\n\n")
@@ -475,23 +501,23 @@ coleo_validate <- function(data, media_path = NULL) {
475501
# - Check number of campaigns, empty campaigns, observations
476502
#------------------------------------------------------------------------
477503
# Check that the values are within a decent range -----------------------
478-
if(length(cols_date) > 0) {
504+
if(length(non_na_date_cols) > 0) {
479505
# Year
480-
range_year <- sapply(data[cols_date], function(x) {
506+
range_year <- sapply(data[non_na_date_cols], function(x) {
481507
split <- strsplit(unlist(x), "-", fixed = TRUE)
482508
split <- split[!is.na(split)]
483509
range(as.numeric(sapply(split, `[[`, 1)))
484510
}) |>
485511
range()
486512
# Month
487-
range_month <- sapply(data[cols_date], function(x) {
513+
range_month <- sapply(data[non_na_date_cols], function(x) {
488514
split <- strsplit(unlist(x), "-", fixed = TRUE)
489515
split <- split[!is.na(split)]
490516
range(as.numeric(sapply(split, `[[`, 2)))
491517
}) |>
492518
range()
493519
# Day
494-
range_day <- sapply(data[cols_date], function(x) {
520+
range_day <- sapply(data[non_na_date_cols], function(x) {
495521
split <- strsplit(unlist(x), "-", fixed = TRUE)
496522
split <- split[!is.na(split)]
497523
range(as.numeric(sapply(split, `[[`, 3)))

man/coleo_return_campaign_type.Rd

+17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/coleo_return_valid_campaigns.Rd

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-coleo_return.R

+13
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ test_that("coleo_return_* functions return object of the right type", {
22

33
# coleo_return_valid_campaigns
44
expect_vector(coleo_return_valid_campaigns())
5+
valid_campaigns <- coleo_return_valid_campaigns()
6+
expect_true(all(c("ADNe", "NDSI") %in% valid_campaigns))
57

68
# coleo_return_valid_site_types
79
expect_vector(coleo_return_valid_site_types())
@@ -23,4 +25,15 @@ test_that("coleo_return_* functions return object of the right type", {
2325
names(df_col_names) <- coleo_get_column_names(table)$column_name
2426
expect_mapequal(coleo_return_rename_vec_input_to_db(table), df_col_names)
2527

28+
# Test coleo_return_campaign_type
29+
## Returns the campaign type from a data frame
30+
c_type <- coleo_return_campaign_type(data.frame(campaigns_type = "ADNe", campaigns_date_start = "2020-07-22"))
31+
expect_equal(c_type, "ADNe")
32+
## Returns the remote sensing indicator from a data frame
33+
rs_type <- coleo_return_campaign_type(data.frame(remote_sensing_indicators_name = "NDSI", remote_sensing_events_date_start = "2020-07-22"))
34+
expect_equal(rs_type, "NDSI")
35+
## Throws an error if the campaign type is not present in the data frame
36+
expect_error(coleo_return_campaign_type(data.frame(campaigns_date_start = "2020-07-22")), "Vérifiez qu'une colonne contient le type d'inventaire")
37+
## Throws an error if more than one campaign type is present in the data frame
38+
expect_error(coleo_return_campaign_type(data.frame(campaigns_type = c("ADNe", "NDSI"), campaigns_date_start = "2020-07-22")), "Vérifiez que toutes les valeurs de la colonne campaigns_type*")
2639
})

tests/testthat/test-coleo_validate.R

+10-2
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,21 @@ test_that("coleo_validate", {
2121
## Test for missing campaign_type column
2222
dat_test <- subset(dat, select = -c(campaigns_type))
2323
testthat::expect_error(coleo_validate(dat_test),
24-
regexp = "V\U00E9rifiez qu'une colonne contient le type de campagne.*")
24+
regexp = "V\U00E9rifiez qu'une colonne contient le type d'inventaire*")
2525

2626
## Test for multiple values within the campaign_type column
2727
dat_test <- dat
2828
dat_test$campaigns_type <- c("sol", "insectes_sol","insectes_sol","insectes_sol","insectes_sol","insectes_sol")
2929
testthat::expect_error(coleo_validate(dat_test),
30-
regexp = "V\U00E9rifiez que toutes les valeurs de la colonne campaigns_type sont identiques.*")
30+
regexp = "V\U00E9rifiez que toutes les valeurs de la colonne campaigns_type.*")
31+
32+
## Test that remote_sensing indicators are recognized as campaigns types (1.e., 1 entry in remote_sensing_events table)
33+
dat_test <- data.frame(remote_sensing_indicators_name = c("NDSI", "NDSI"),
34+
cells_cell_code = c("105_101", "105_101"),
35+
remote_sensing_events_date_start = c("2020-07-22", "2020-07-22"),
36+
remote_sensing_obs_metric = c("max", "min"),
37+
remote_sensing_obs_value = c(0.5, 0.2))
38+
testthat::expect_message(coleo_validate(dat_test), regexp = "*remote_sensing_events : 1*")
3139

3240
## Test that the imported data has all of the required columns
3341
dat_test <- subset(dat, select = -c(observations_date_obs))

0 commit comments

Comments
 (0)