Merge branch 'fix-51-premature-na' into main

cpauvert · cpauvert · commit 66572fd71e5d · 2025-02-21T14:41:34.000+01:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: maldipickr
 Title: Dereplicate and Cherry-Pick Mass Spectrometry Spectra
-Version: 1.3.2
+Version: 1.3.2.9000
 Authors@R: c(
     person("Charlie", "Pauvert", , "cpauvert@ukaachen.de", role = c("aut", "cre", "cph"),
            comment = c(ORCID = "0000-0001-9832-2507")),
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# maldipickr (development version)
+
+## Fixed
+
+* Fix unwanted conversion of sample names to `NA` when they were well numbers like E1 or E2 (#51, thanks for spotting this @sarah-lital)
+
 # maldipickr 1.3.2
 
 ## Fixed
diff --git a/R/read_biotyper_report.R b/R/read_biotyper_report.R
@@ -45,7 +45,7 @@
 #' @param long_format A logical indicating whether the table is in the long format (many rows) or wide format (many columns) when showing all the hits. This option has no effect when `best_hits = TRUE`.
 #'
 #' @return
-#' A tibble of 7 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
+#' A tibble of 8 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
 #'
 #' @seealso [read_many_biotyper_reports]
 #'
@@ -77,14 +77,25 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
     path,
     col.names = c("name", "sample_name", prep_names$col_names),
     sep = ";", header = FALSE,
-    na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
+    na = c("NA", "") # E1/E2 in taxid columns are set to NA later by sanitize_taxid() (#51)
   )
   no_peak_lgl <- breport$bruker_01_species == "no peaks found"
 
+
+  # Sample names such as E1 or E2 must not become NA at import (#51): replace
+  #  the values listed in `vec_to_convert` by NA in a taxid vector only, then
+  #  coerce it to numeric. Matching is on whole values, never on substrings.
+  sanitize_taxid <- function(vec_taxid, vec_to_convert){
+    taxid_chr <- as.character(vec_taxid)
+    as.numeric(replace(taxid_chr, taxid_chr %in% vec_to_convert, NA))
+  }
   # Remove the spot name for which no peaks were detected, and warn the user
   breport <- tibble::as_tibble(breport) %>%
     # Empty sample_name are considered logical and this is undesirable
-    dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
+    dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>% 
+    dplyr::mutate(
+      dplyr::across(tidyselect::contains("taxid"), ~ sanitize_taxid(.x, c("E1","E2")))
+      ) %>%
     dplyr::filter(.data$bruker_01_species != "no peaks found")
   if (sum(no_peak_lgl) > 0) {
     warning(
diff --git a/dev/import-data.Rmd b/dev/import-data.Rmd
@@ -110,7 +110,7 @@ After inflating the template
 #' @param long_format A logical indicating whether the table is in the long format (many rows) or wide format (many columns) when showing all the hits. This option has no effect when `best_hits = TRUE`.
 #'
 #' @return
-#' A tibble of 7 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
+#' A tibble of 8 columns (`best_hits = TRUE`) or 52 columns (`best_hits = FALSE`). See Details for the description of the columns.
 #'
 #' @seealso [read_many_biotyper_reports]
 #'
@@ -136,14 +136,25 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
     path,
     col.names = c("name", "sample_name", prep_names$col_names),
     sep = ";", header = FALSE,
-    na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
+    na = c("NA", "") # E1/E2 in taxid columns are set to NA later by sanitize_taxid() (#51)
   )
   no_peak_lgl <- breport$bruker_01_species == "no peaks found"
 
+
+  # Sample names such as E1 or E2 must not become NA at import (#51): replace
+  #  the values listed in `vec_to_convert` by NA in a taxid vector only, then
+  #  coerce it to numeric. Matching is on whole values, never on substrings.
+  sanitize_taxid <- function(vec_taxid, vec_to_convert){
+    taxid_chr <- as.character(vec_taxid)
+    as.numeric(replace(taxid_chr, taxid_chr %in% vec_to_convert, NA))
+  }
   # Remove the spot name for which no peaks were detected, and warn the user
   breport <- tibble::as_tibble(breport) %>%
     # Empty sample_name are considered logical and this is undesirable
-    dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
+    dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>% 
+    dplyr::mutate(
+      dplyr::across(tidyselect::contains("taxid"), ~ sanitize_taxid(.x, c("E1","E2")))
+      ) %>%
     dplyr::filter(.data$bruker_01_species != "no peaks found")
   if (sum(no_peak_lgl) > 0) {
     warning(
@@ -280,7 +291,7 @@ After inflating the template
 # Test with a correct and empty datasets in "inst/"
 biotyper <- system.file("biotyper.csv", package = "maldipickr")
 biotyper_empty <- system.file("biotyper_empty.csv", package = "maldipickr")
-
+biotyper_fixNA_51 <- system.file("biotyper_fixNA_51.csv", package = "maldipickr")
 # Apply test on my function
 test_that("read_biotyper_report works properly with correct dataset and best hits", {
   expect_equal(
@@ -324,6 +335,18 @@ test_that("read_biotyper_report is empty when no peaks are found", {
     nrow(out), 0
   )
 })
+test_that("read_biotyper_report works properly when sample can be named E1, E2 which used to be NA values", {
+  # Regression test for #51: spots are named E1 and E2 in the fixture, and
+  # the second row carries "E1" in its taxid fields.
+  fix_51 <- read_biotyper_report(biotyper_fixNA_51)
+  expect_equal(nrow(fix_51), 2)
+  expect_equal(ncol(fix_51), 8)
+  # The numeric taxid is kept, the "E1" taxid becomes NA
+  expect_equal(
+    fix_51[["bruker_taxid"]],
+    c(1351, NA)
+  )
+})
 ```
 
 ## Importing multiple reports
diff --git a/inst/biotyper_fixNA_51.csv b/inst/biotyper_fixNA_51.csv
@@ -0,0 +1,2 @@
+E1;;+++;Enterococcus faecalis;1351;0ae85b21-2783-4fe6-9c04-96877702d15a;2.24;+++;1_Enterococcus faecalis;145093567;3d377ee2-e73f-433a-8246-9cc0ae324d03;2.21;+++;Enterococcus faecalis;1351;0bba480e-241f-4e83-b6f5-58e80a06389e;2.21;+++;Enterococcus faecalis;1351;a4ab8bfd-4128-4b50-9270-292f99bdad8d;2.13;+++;Enterococcus faecalis;1351;1cc3022d-bf5d-422d-aa5a-c1c5889e84de;2.08;+++;Enterococcus faecalis;1351;4cea4049-e540-44da-a3c7-28a257f47036;2.03;+++;Enterococcus faecalis;1351;f3aaad09-d984-4b9c-b55c-f2437ed1217d;2.00;+;Enterococcus faecalis;1351;46d48674-c407-48fe-b85b-300977b0558f;1.91;+;Enterococcus faecalis;1351;a82c73c8-b1b2-4bda-bb0d-2cae2de47f5c;1.90;+;Enterococcus faecalis;1351;ba0bb1df-5786-450a-ac91-fe4096b1a3e1;1.80
+E2;;-;not reliable identification;E1;3f402e73-bcef-40b1-9014-8a28878f12c5;1.62;-;not reliable identification;E1;631e1d31-81bd-4f97-b16b-195a5e43cfa9;1.40;-;not reliable identification;E1;13a7f9d9-6248-415b-bb12-ca4fe9082a8c;1.39;-;not reliable identification;E1;beca17ff-a591-440f-8f9e-7ef5b9c3d1ce;1.38;-;not reliable identification;E1;4a752505-cecf-4f86-96bf-5dc9de266913;1.37;-;not reliable identification;E1;689fb704-a4f4-46d6-92c7-0e5936a639a5;1.34;-;not reliable identification;E1;f2cfe654-ff28-4488-b0e6-d60e062cb065;1.34;-;not reliable identification;E1;bd6f9303-3d16-4cd8-8fa9-98aefe46db9f;1.33;-;not reliable identification;E1;79b6a458-3dc6-45a2-a8b6-acc0e5e8c730;1.31;-;not reliable identification;E1;28e961b0-791d-487c-8e95-76833bf55e44;1.31
diff --git a/man/read_biotyper_report.Rd b/man/read_biotyper_report.Rd
diff --git a/renv.lock b/renv.lock
@@ -712,7 +712,7 @@
     },
     "knitr": {
       "Package": "knitr",
-      "Version": "1.47",
+      "Version": "1.49",
       "Source": "Repository",
       "Repository": "RSPM",
       "Requirements": [
@@ -724,7 +724,7 @@
         "xfun",
         "yaml"
       ],
-      "Hash": "7c99b2d55584b982717fcc0950378612"
+      "Hash": "9fcb189926d93c636dea94fbe4f44480"
     },
     "later": {
       "Package": "later",
@@ -1138,14 +1138,14 @@
     },
     "rlang": {
       "Package": "rlang",
-      "Version": "1.1.4",
+      "Version": "1.1.5",
       "Source": "Repository",
       "Repository": "RSPM",
       "Requirements": [
         "R",
         "utils"
       ],
-      "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1"
+      "Hash": "724dcc1490cd7071ee75ca2994a5446e"
     },
     "rmarkdown": {
       "Package": "rmarkdown",
@@ -1172,7 +1172,7 @@
     },
     "roxygen2": {
       "Package": "roxygen2",
-      "Version": "7.2.3",
+      "Version": "7.3.2",
       "Source": "Repository",
       "Repository": "RSPM",
       "Requirements": [
@@ -1194,7 +1194,7 @@
         "withr",
         "xml2"
       ],
-      "Hash": "7b153c746193b143c14baa072bae4e27"
+      "Hash": "6ee25f9054a70f44d615300ed531ba8d"
     },
     "rprojroot": {
       "Package": "rprojroot",
@@ -1611,15 +1611,16 @@
     },
     "xfun": {
       "Package": "xfun",
-      "Version": "0.44",
+      "Version": "0.51",
       "Source": "Repository",
       "Repository": "RSPM",
       "Requirements": [
+        "R",
         "grDevices",
         "stats",
         "tools"
       ],
-      "Hash": "317a0538d32f4a009658bcedb7923f4b"
+      "Hash": "e1a3c06389a46d065c18bd4bbc27c64c"
     },
     "xml2": {
       "Package": "xml2",
diff --git a/tests/testthat/test-read_biotyper_report.R b/tests/testthat/test-read_biotyper_report.R
@@ -7,7 +7,7 @@
 # Test with a correct and empty datasets in "inst/"
 biotyper <- system.file("biotyper.csv", package = "maldipickr")
 biotyper_empty <- system.file("biotyper_empty.csv", package = "maldipickr")
-
+biotyper_fixNA_51 <- system.file("biotyper_fixNA_51.csv", package = "maldipickr")
 # Apply test on my function
 test_that("read_biotyper_report works properly with correct dataset and best hits", {
   expect_equal(
@@ -51,3 +51,15 @@ test_that("read_biotyper_report is empty when no peaks are found", {
     nrow(out), 0
   )
 })
+test_that("read_biotyper_report works properly when sample can be named E1, E2 which used to be NA values", {
+  # Regression test for #51: spots are named E1 and E2 in the fixture, and
+  # the second row carries "E1" in its taxid fields.
+  fix_51 <- read_biotyper_report(biotyper_fixNA_51)
+  expect_equal(nrow(fix_51), 2)
+  expect_equal(ncol(fix_51), 8)
+  # The numeric taxid is kept, the "E1" taxid becomes NA
+  expect_equal(
+    fix_51[["bruker_taxid"]],
+    c(1351, NA)
+  )
+})

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+E1;;+++;Enterococcus faecalis;1351;0ae85b21-2783-4fe6-9c04-96877702d15a;2.24;+++;1_Enterococcus faecalis;145093567;3d377ee2-e73f-433a-8246-9cc0ae324d03;2.21;+++;Enterococcus faecalis;1351;0bba480e-241f-4e83-b6f5-58e80a06389e;2.21;+++;Enterococcus faecalis;1351;a4ab8bfd-4128-4b50-9270-292f99bdad8d;2.13;+++;Enterococcus faecalis;1351;1cc3022d-bf5d-422d-aa5a-c1c5889e84de;2.08;+++;Enterococcus faecalis;1351;4cea4049-e540-44da-a3c7-28a257f47036;2.03;+++;Enterococcus faecalis;1351;f3aaad09-d984-4b9c-b55c-f2437ed1217d;2.00;+;Enterococcus faecalis;1351;46d48674-c407-48fe-b85b-300977b0558f;1.91;+;Enterococcus faecalis;1351;a82c73c8-b1b2-4bda-bb0d-2cae2de47f5c;1.90;+;Enterococcus faecalis;1351;ba0bb1df-5786-450a-ac91-fe4096b1a3e1;1.80
	`2`	+E2;;-;not reliable identification;E1;3f402e73-bcef-40b1-9014-8a28878f12c5;1.62;-;not reliable identification;E1;631e1d31-81bd-4f97-b16b-195a5e43cfa9;1.40;-;not reliable identification;E1;13a7f9d9-6248-415b-bb12-ca4fe9082a8c;1.39;-;not reliable identification;E1;beca17ff-a591-440f-8f9e-7ef5b9c3d1ce;1.38;-;not reliable identification;E1;4a752505-cecf-4f86-96bf-5dc9de266913;1.37;-;not reliable identification;E1;689fb704-a4f4-46d6-92c7-0e5936a639a5;1.34;-;not reliable identification;E1;f2cfe654-ff28-4488-b0e6-d60e062cb065;1.34;-;not reliable identification;E1;bd6f9303-3d16-4cd8-8fa9-98aefe46db9f;1.33;-;not reliable identification;E1;79b6a458-3dc6-45a2-a8b6-acc0e5e8c730;1.31;-;not reliable identification;E1;28e961b0-791d-487c-8e95-76833bf55e44;1.31