tidyverse · krlmlr · Dec 31, 2024 · Dec 30, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -113,6 +113,8 @@ export("%>%")
 export(as_duck_tbl)
 export(as_duckplyr_df)
 export(as_duckplyr_tibble)
+export(compute_csv)
+export(compute_parquet)
 export(df_from_csv)
 export(df_from_file)
 export(df_from_parquet)

diff --git a/R/compute-rd.R b/R/compute-rd.R
@@ -0,0 +1,24 @@
+#' @title Compute results (duckplyr)
+#'
+#' @description  This is a method for the [`dplyr::compute()`] generic.
+#' For a (lazy) duckplyr frame,
+#' `compute()` executes a query and stores the result in a (temporary) table,
+#' or in a Parquet or CSV file.
+#' The result is a duckplyr frame that can be used with subsequent dplyr verbs.
+#'
+#' @inheritParams dplyr::compute
+#' @param lazy Set to `TRUE` to return a lazy or `FALSE` to return an eager data frame,
+#'   see [duck_tbl()].  The default is to inherit the laziness of the input.
+#' @param name The name of the table to store the result in.
+#' @param schema_name The schema to store the result in, defaults to the current schema.
+#' @param temporary Set to `FALSE` to store the result in a permanent table.
+#' @examples
+#' library("duckplyr")
+#' df <- duck_tbl(x = c(1, 2))
+#' df <- mutate(df, y = 2)
+#' df <- compute(df)
+#' explain(df)
+#' @seealso [`dplyr::collect()`]
+#' @rdname compute.duckplyr_df
+#' @name compute.duckplyr_df
+NULL
diff --git a/R/compute.R b/R/compute.R
@@ -1,10 +1,45 @@
 # Generated by 02-duckplyr_df-methods.R
+#' @rdname compute.duckplyr_df
 #' @export
-compute.duckplyr_df <- function(x, ...) {
+compute.duckplyr_df <- function(
+  x,
+  ...,
+  lazy = NULL,
+  name = NULL,
+  schema_name = NULL,
+  temporary = TRUE
+) {
   # Our implementation
   rel_try(NULL,
-    "No relational implementation for compute()" = TRUE,
+    "Needs duckdb >= 1.1.3.9029" = !is_installed("duckdb", version = "1.1.3.9029"),
     {
+      if (is.null(lazy)) {
+        lazy <- is_lazy_duckplyr_df(x)
+      }
+      if (is.null(schema_name)) {
+        schema_name <- ""
+      }
+      if (is.null(name)) {
+        if (isTRUE(temporary)) {
+          name <- unique_table_name()
+        } else {
+          cli::cli_abort("{.arg name} must be provided if {.arg temporary} is {.value FALSE}")
+        }
+      }
+
+      rel <- duckdb_rel_from_df(x)
+
+      duckdb$rel_to_table(rel, schema_name, name, temporary)
+
+      # API inconsistency: order of name and schema_name
+      out_rel <- duckdb$rel_from_table(get_default_duckdb_connection(), name, schema_name)
+
+      out <- duckplyr_reconstruct(out_rel, x)
+
+      if (is_lazy_duckplyr_df(out) != lazy) {
+        out <- as_duck_tbl(out, .lazy = lazy)
+      }
+
       return(out)
     }
   )

diff --git a/R/compute_file.R b/R/compute_file.R
@@ -0,0 +1,85 @@
+#' @title Compute results to a file
+#'
+#' @description
+#' These functions apply to (lazy) duckplyr frames.
+#' They execute a query and store the results in a flat file.
+#' The result is a duckplyr frame that can be used with subsequent dplyr verbs.
+#'
+#' `compute_parquet()` creates a Parquet file.
+#'
+#' @inheritParams rlang::args_dots_empty
+#' @inheritParams compute.duckplyr_df
+#' @param path The path to store the result in.
+#' @param options A list of additional options to pass when writing the output file,
+#'   see <https://duckdb.org/docs/data/parquet/overview#writing-to-parquet-files>
+#'   or <https://duckdb.org/docs/data/csv/overview#writing-using-the-copy-statement>
+#'   for details.
+#'
+#' @export
+#' @examples
+#' library("duckplyr")
+#' df <- data.frame(x = c(1, 2))
+#' df <- mutate(df, y = 2)
+#' path <- tempfile(fileext = ".parquet")
+#' df <- compute_parquet(df, path)
+#' explain(df)
+#' @seealso [compute.duckplyr_df()], [dplyr::collect()]
+#' @name compute_file
+compute_parquet <- function(x, path, ..., lazy = NULL, options = NULL) {
+  check_dots_empty()
+
+  if (!is.null(options)) {
+    check_installed("duckdb", "1.1.3.9028")
+  } else {
+    options <- list()
+  }
+
+  if (is.null(lazy)) {
+    lazy <- is_lazy_duckplyr_df(x)
+  }
+
+  rel <- duckdb_rel_from_df(x)
+
+  if (is_installed("duckdb", version = "1.1.3.9028")) {
+    duckdb$rel_to_parquet(rel, path, options)
+  } else {
+    duckdb$rel_to_parquet(rel, path)
+  }
+
+  # If the path is a directory, we assume that the user wants to write multiple files
+  if (dir.exists(path)) {
+    path <- file.path(path, "**", "**.parquet")
+  }
+
+  duck_parquet(path, lazy = lazy)
+}
+
+#' compute_csv()
+#'
+#' `compute_csv()` creates a CSV file.
+#' @rdname compute_file
+#' @export
+compute_csv <- function(x, path, ..., lazy = NULL, options = NULL) {
+  check_dots_empty()
+
+  check_installed("duckdb", "1.1.3.9028")
+
+  if (is.null(options)) {
+    options <- list()
+  }
+
+  if (is.null(lazy)) {
+    lazy <- is_lazy_duckplyr_df(x)
+  }
+
+  rel <- duckdb_rel_from_df(x)
+
+  duckdb$rel_to_csv(rel, path, options)
+
+  # If the path is a directory, we assume that the user wants to write multiple files
+  if (dir.exists(path)) {
+    path <- file.path(path, "**", "**.csv")
+  }
+
+  duck_csv(path, lazy = lazy)
+}
diff --git a/R/lazy.R b/R/lazy.R
@@ -16,6 +16,10 @@ add_lazy_duckplyr_df_class <- function(x) {
   x
 }
 
+is_lazy_duckplyr_df <- function(x) {
+  inherits(x, "lazy_duckplyr_df")
+}
+
 as_eager_duckplyr_df <- function(x) {
   if (!inherits(x, "lazy_duckplyr_df")) {
     return(x)

diff --git a/R/unique_table_name.R b/R/unique_table_name.R
@@ -0,0 +1,6 @@
+# From dbplyr
+unique_table_name <- function(prefix = "") {
+  vals <- c(letters, LETTERS, 0:9)
+  name <- paste0(sample(vals, 10, replace = TRUE), collapse = "")
+  paste0(prefix, "duckplyr_", name)
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -19,6 +19,8 @@ reference:
 
 - subtitle: Connecting, copying, and retrieving data
   contents:
+  - compute.duckplyr_df
+  - compute_file
   # - collect.duckplyr_df
   - pull.duckplyr_df
   - explain.duckplyr_df

diff --git a/man/compute.duckplyr_df.Rd b/man/compute.duckplyr_df.Rd
diff --git a/man/compute_file.Rd b/man/compute_file.Rd
diff --git a/tests/testthat/_snaps/compute.md b/tests/testthat/_snaps/compute.md
@@ -0,0 +1,17 @@
+# compute()
+
+    Code
+      duckdb_rel_from_df(out)
+    Message
+      DuckDB Relation: 
+      ---------------------
+      --- Relation Tree ---
+      ---------------------
+      Scan Table [duckplyr_4hYuvhNS26]
+
+      ---------------------
+      -- Result Columns  --
+      ---------------------
+      - x (DOUBLE)
+
+
diff --git a/tests/testthat/test-compute.R b/tests/testthat/test-compute.R
@@ -0,0 +1,13 @@
+test_that("compute()", {
+  skip_if_not_installed("duckdb", "1.1.3.9029")
+  set.seed(20241230)
+
+  df <- duck_tbl(x = c(1, 2))
+  out <- compute(df)
+  expect_snapshot({
+    duckdb_rel_from_df(out)
+  })
+
+  expect_identical(out, as_duck_tbl(df))
+  expect_false(is_lazy_duckplyr_df(out))
+})
diff --git a/tests/testthat/test-compute_file.R b/tests/testthat/test-compute_file.R
@@ -0,0 +1,44 @@
+test_that("compute_parquet()", {
+  skip_if_not_installed("duckdb", "1.1.3.9028")
+
+  df <- data.frame(x = c(1, 2))
+  withr::defer(unlink("test.parquet"))
+  out <- compute_parquet(df, path = "test.parquet")
+
+  expect_identical(out, as_duck_tbl(df))
+  expect_false(is_lazy_duckplyr_df(out))
+})
+
+test_that("compute_parquet() with options", {
+  skip_if_not_installed("duckdb", "1.1.3.9028")
+
+  df <- data.frame(x = c(1, 2), a = c("a", "b"))
+  withr::defer(unlink("test", recursive = TRUE))
+  dir.create("test")
+  out <- compute_parquet(df, path = "test", options = list(partition_by = "a"))
+
+  expect_identical(out, as_duck_tbl(df))
+  expect_false(is_lazy_duckplyr_df(out))
+})
+
+test_that("compute_csv()", {
+  skip_if_not_installed("duckdb", "1.1.3.9028")
+
+  df <- data.frame(x = c(1, 2))
+  withr::defer(unlink("test.csv"))
+  out <- compute_csv(df, path = "test.csv")
+
+  expect_identical(out, as_duck_tbl(df))
+  expect_false(is_lazy_duckplyr_df(out))
+})
+
+test_that("compute_csv() lazy", {
+  skip_if_not_installed("duckdb", "1.1.3.9028")
+
+  df <- data.frame(x = c(1, 2))
+  withr::defer(unlink("test.csv"))
+  out <- compute_csv(df, path = "test.csv", lazy = TRUE)
+
+  expect_true(is_lazy_duckplyr_df(out))
+  expect_identical(collect(out), as_duck_tbl(df))
+})