Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: New compute_parquet() and compute_csv(), implement compute.duckplyr_df() #430

Merged
merged 1 commit into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ export("%>%")
export(as_duck_tbl)
export(as_duckplyr_df)
export(as_duckplyr_tibble)
export(compute_csv)
export(compute_parquet)
export(df_from_csv)
export(df_from_file)
export(df_from_parquet)
Expand Down
24 changes: 24 additions & 0 deletions R/compute-rd.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#' @title Compute results (duckplyr)
#'
#' @description This is a method for the [`dplyr::compute()`] generic.
#' For a (lazy) duckplyr frame,
#' `compute()` executes the query and stores the result in a (temporary) table.
#' Use [compute_parquet()] or [compute_csv()] to store the result
#' in a Parquet or CSV file instead.
#' The result is a duckplyr frame that can be used with subsequent dplyr verbs.
#'
#' @inheritParams dplyr::compute
#' @param lazy Set to `TRUE` to return a lazy or `FALSE` to return an eager data frame,
#' see [duck_tbl()]. The default is to inherit the laziness of the input.
#' @param name The name of the table to store the result in.
#' If `NULL` and `temporary` is `TRUE`, a random unique name is generated;
#' a name must be provided if `temporary` is `FALSE`.
#' @param schema_name The schema to store the result in, defaults to the current schema.
#' @param temporary Set to `FALSE` to store the result in a permanent table.
#' @return A duckplyr frame.
#' @examples
#' library("duckplyr")
#' df <- duck_tbl(x = c(1, 2))
#' df <- mutate(df, y = 2)
#' df <- compute(df)
#' explain(df)
#' @seealso [`dplyr::collect()`]
#' @rdname compute.duckplyr_df
#' @name compute.duckplyr_df
NULL
39 changes: 37 additions & 2 deletions R/compute.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,45 @@
# Generated by 02-duckplyr_df-methods.R
#' @rdname compute.duckplyr_df
#' @export
compute.duckplyr_df <- function(x, ...) {
compute.duckplyr_df <- function(
x,
...,
lazy = NULL,
name = NULL,
schema_name = NULL,
temporary = TRUE
) {
# Our implementation
rel_try(NULL,
"No relational implementation for compute()" = TRUE,
"Needs duckdb >= 1.1.3.9029" = !is_installed("duckdb", version = "1.1.3.9029"),
{
if (is.null(lazy)) {
lazy <- is_lazy_duckplyr_df(x)
}
if (is.null(schema_name)) {
schema_name <- ""
}
if (is.null(name)) {
if (isTRUE(temporary)) {
name <- unique_table_name()
} else {
cli::cli_abort("{.arg name} must be provided if {.arg temporary} is {.value FALSE}")
}
}

rel <- duckdb_rel_from_df(x)

duckdb$rel_to_table(rel, schema_name, name, temporary)

# API inconsistency: order of name and schema_name
out_rel <- duckdb$rel_from_table(get_default_duckdb_connection(), name, schema_name)

out <- duckplyr_reconstruct(out_rel, x)

if (is_lazy_duckplyr_df(out) != lazy) {
out <- as_duck_tbl(out, .lazy = lazy)
}

return(out)
}
)
Expand Down
85 changes: 85 additions & 0 deletions R/compute_file.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#' @title Compute results to a file
#'
#' @description
#' These functions apply to (lazy) duckplyr frames.
#' They execute a query and store the results in a flat file.
#' The result is a duckplyr frame that can be used with subsequent dplyr verbs.
#'
#' `compute_parquet()` creates a Parquet file.
#'
#' @inheritParams rlang::args_dots_empty
#' @inheritParams compute.duckplyr_df
#' @param path The path to store the result in.
#' @param options A list of additional options to pass to create the storage format,
#' see <https://duckdb.org/docs/data/parquet/overview#writing-to-parquet-files>
#' or <https://duckdb.org/docs/data/csv/overview#writing-using-the-copy-statement>
#' for details.
#'
#' @export
#' @examples
#' library("duckplyr")
#' df <- data.frame(x = c(1, 2))
#' df <- mutate(df, y = 2)
#' path <- tempfile(fileext = ".parquet")
#' df <- compute_parquet(df, path)
#' explain(df)
#' @seealso [compute.duckplyr_df()], [dplyr::collect()]
#' @name compute_file
compute_parquet <- function(x, path, ..., lazy = NULL, options = NULL) {
  check_dots_empty()

  # User-supplied options need duckdb >= 1.1.3.9028; without options,
  # an empty list is used so that older versions remain usable.
  if (is.null(options)) {
    options <- list()
  } else {
    check_installed("duckdb", "1.1.3.9028")
  }

  # By default, the result is lazy exactly when the input is lazy
  lazy <- if (is.null(lazy)) is_lazy_duckplyr_df(x) else lazy

  rel <- duckdb_rel_from_df(x)

  # Only duckdb >= 1.1.3.9028 is called with the options argument
  accepts_options <- is_installed("duckdb", version = "1.1.3.9028")
  if (!accepts_options) {
    duckdb$rel_to_parquet(rel, path)
  } else {
    duckdb$rel_to_parquet(rel, path, options)
  }

  # If the path is a directory, we assume that the user wants to write
  # multiple files, and read all of them back with a recursive glob
  read_path <- if (dir.exists(path)) {
    file.path(path, "**", "**.parquet")
  } else {
    path
  }

  duck_parquet(read_path, lazy = lazy)
}

#' compute_csv()
#'
#' `compute_csv()` creates a CSV file.
#' @rdname compute_file
#' @export
compute_csv <- function(x, path, ..., lazy = NULL, options = NULL) {
  check_dots_empty()

  # Unlike compute_parquet(), this version requirement is unconditional
  check_installed("duckdb", "1.1.3.9028")

  # No options given: pass an empty list to the writer
  csv_options <- if (is.null(options)) list() else options

  # By default, the result is lazy exactly when the input is lazy
  lazy <- if (is.null(lazy)) is_lazy_duckplyr_df(x) else lazy

  rel <- duckdb_rel_from_df(x)
  duckdb$rel_to_csv(rel, path, csv_options)

  # If the path is a directory, we assume that the user wants to write
  # multiple files, and read all of them back with a recursive glob
  read_path <- if (dir.exists(path)) {
    file.path(path, "**", "**.csv")
  } else {
    path
  }

  duck_csv(read_path, lazy = lazy)
}
4 changes: 4 additions & 0 deletions R/lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ add_lazy_duckplyr_df_class <- function(x) {
x
}

# Returns TRUE if `x` inherits from the "lazy_duckplyr_df" class, FALSE otherwise.
is_lazy_duckplyr_df <- function(x) {
  lazy_class <- "lazy_duckplyr_df"
  inherits(x, what = lazy_class)
}

as_eager_duckplyr_df <- function(x) {
if (!inherits(x, "lazy_duckplyr_df")) {
return(x)
Expand Down
6 changes: 6 additions & 0 deletions R/unique_table_name.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# From dbplyr
# Builds a random table name: `prefix`, followed by "duckplyr_", followed by
# ten characters drawn with replacement from a-z, A-Z and 0-9.
unique_table_name <- function(prefix = "") {
  alphabet <- c(letters, LETTERS, 0:9)
  drawn <- sample(alphabet, size = 10, replace = TRUE)
  suffix <- paste0(drawn, collapse = "")
  paste0(prefix, "duckplyr_", suffix)
}
2 changes: 2 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ reference:

- subtitle: Connecting, copying, and retrieving data
contents:
- compute.duckplyr_df
- compute_file
# - collect.duckplyr_df
- pull.duckplyr_df
- explain.duckplyr_df
Expand Down
41 changes: 41 additions & 0 deletions man/compute.duckplyr_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions man/compute_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions tests/testthat/_snaps/compute.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# compute()

Code
duckdb_rel_from_df(out)
Message
DuckDB Relation:
---------------------
--- Relation Tree ---
---------------------
Scan Table [duckplyr_4hYuvhNS26]

---------------------
-- Result Columns --
---------------------
- x (DOUBLE)


13 changes: 13 additions & 0 deletions tests/testthat/test-compute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# compute() on a duck_tbl(): the result is read back from a table,
# holds the same data as the input, and is not lazy.
test_that("compute()", {
skip_if_not_installed("duckdb", "1.1.3.9029")
# Fix the RNG: the snapshot records a table name of the form
# "duckplyr_<random characters>", which must be reproducible across runs.
set.seed(20241230)

df <- duck_tbl(x = c(1, 2))
out <- compute(df)
# The relation tree is expected to show a scan of the generated table
expect_snapshot({
duckdb_rel_from_df(out)
})

expect_identical(out, as_duck_tbl(df))
expect_false(is_lazy_duckplyr_df(out))
})
44 changes: 44 additions & 0 deletions tests/testthat/test-compute_file.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# compute_parquet() round-trips a plain data frame through a single Parquet
# file and returns an eager duckplyr frame with the same data.
test_that("compute_parquet()", {
  skip_if_not_installed("duckdb", "1.1.3.9028")

  df <- data.frame(x = c(1, 2))
  # Use a self-cleaning temporary path instead of a fixed file name in the
  # working directory, so the test leaves nothing behind and cannot collide
  # with other tests or pre-existing files.
  path <- withr::local_tempfile(fileext = ".parquet")
  out <- compute_parquet(df, path = path)

  expect_identical(out, as_duck_tbl(df))
  expect_false(is_lazy_duckplyr_df(out))
})

# compute_parquet() with `options`: writing into an existing directory with
# `partition_by` and reading the result back.
test_that("compute_parquet() with options", {
  skip_if_not_installed("duckdb", "1.1.3.9028")

  df <- data.frame(x = c(1, 2), a = c("a", "b"))
  # A fresh temporary directory (created now, removed when the test exits)
  # replaces the fixed "test" directory in the working directory.
  path <- withr::local_tempdir()
  out <- compute_parquet(df, path = path, options = list(partition_by = "a"))

  expect_identical(out, as_duck_tbl(df))
  expect_false(is_lazy_duckplyr_df(out))
})

# compute_csv() round-trips a plain data frame through a CSV file and returns
# an eager duckplyr frame with the same data.
test_that("compute_csv()", {
  skip_if_not_installed("duckdb", "1.1.3.9028")

  df <- data.frame(x = c(1, 2))
  # Use a self-cleaning temporary path instead of a fixed file name in the
  # working directory, so the test leaves nothing behind and cannot collide
  # with other tests or pre-existing files.
  path <- withr::local_tempfile(fileext = ".csv")
  out <- compute_csv(df, path = path)

  expect_identical(out, as_duck_tbl(df))
  expect_false(is_lazy_duckplyr_df(out))
})

# compute_csv() with `lazy = TRUE`: the result is a lazy duckplyr frame whose
# collected data matches the input.
test_that("compute_csv() lazy", {
  skip_if_not_installed("duckdb", "1.1.3.9028")

  df <- data.frame(x = c(1, 2))
  # Use a self-cleaning temporary path instead of a fixed file name in the
  # working directory, so the test leaves nothing behind and cannot collide
  # with other tests or pre-existing files.
  path <- withr::local_tempfile(fileext = ".csv")
  out <- compute_csv(df, path = path, lazy = TRUE)

  expect_true(is_lazy_duckplyr_df(out))
  expect_identical(collect(out), as_duck_tbl(df))
})
Loading