Skip to content

Commit 5f7a435

Browse files
committed
new faster and more flexible split.data.table, closes #1389, #448
1 parent c9f500d commit 5f7a435

File tree

6 files changed

+587
-7
lines changed

6 files changed

+587
-7
lines changed

NAMESPACE

+2-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ export(fintersect)
3535
export(fsetdiff)
3636
export(funion)
3737
export(fsetequal)
38+
S3method(all.equal, data.table)
3839

3940
S3method("[", data.table)
4041
S3method("[<-", data.table)
@@ -62,6 +63,7 @@ S3method(as.matrix, data.table)
6263
#S3method(cbind, data.table)
6364
#S3method(rbind, data.table)
6465
export(.rbind.data.table)
66+
S3method(split, data.table)
6567
S3method(dim, data.table)
6668
S3method(dimnames, data.table)
6769
S3method("dimnames<-", data.table)
@@ -75,7 +77,6 @@ S3method(within, data.table)
7577
S3method(is.na, data.table)
7678
S3method(format, data.table)
7779
S3method(Ops, data.table)
78-
S3method(all.equal, data.table)
7980

8081
S3method(anyDuplicated, data.table)
8182

R/data.table.R

+68-5
Original file line numberDiff line numberDiff line change
@@ -2171,11 +2171,74 @@ Ops.data.table <- function(e1, e2 = NULL)
21712171
ans
21722172
}
21732173

2174-
2175-
split.data.table <- function(...) {
2176-
if (cedta() && getOption("datatable.dfdispatchwarn")) # or user can use suppressWarnings
2177-
warning("split is inefficient. It copies memory. Please use [,j,by=list(...)] syntax. See data.table FAQ.")
2178-
NextMethod() # allow user to do it though, split object will be data.table's with 'NA' repeated in row.names silently
2174+
# split.data.table: split a data.table into a named list of data.tables.
#
# Two mutually exclusive modes:
#   * `f` supplied  - row indices are split with split(seq_len(nrow(x)), f, drop, ...)
#                     and every index group is used to subset `x` (same approach as
#                     split.data.frame). `by` must not be supplied together with `f`.
#   * `by` supplied - character vector of column names of `x`; only atomic columns
#                     are accepted. A `[` call on `x` is built and evaluated.
#
# Arguments:
#   x        must be a data.table, otherwise an error is raised.
#   f        external grouping passed to split(); a zero-length `f` with nrow(x) > 0
#            is an error.
#   drop     logical. In `by` mode, when FALSE and a relevant `by` column is a factor,
#            groups are produced by joining on all level combinations (see make.levels),
#            so combinations absent from the data yield `.SD[0L]`.
#   by       column names to split by.
#   sorted   logical. Without a join, TRUE uses `keyby` and FALSE uses `by`; with a
#            join it is passed to CJ() and decides whether data order is restored.
#   keep.by  logical. When FALSE the grouping columns are excluded from `.SDcols`.
#   flatten  logical. TRUE groups by all `by` columns at once and names the elements
#            by pasting the group values with "."; FALSE splits by by[1L] only and
#            recurses on the remaining columns, giving nested lists.
#   ...      forwarded to split() in `f` mode only.
#   verbose  when TRUE, the constructed call is printed before evaluation.
#
# Errors: non data.table `x`; neither `f` nor `by` supplied; both supplied;
# reserved column names '.ll.tech.split' / '.nm.tech.split'; `by` not in names(x);
# non-atomic `by` columns.
split.data.table <- function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) {
2175+
if (!is.data.table(x)) stop("x argument must be a data.table")
2176+
stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by), is.logical(flatten))
2177+
# split the data.frame way, using `f` and not the `by` argument
2178+
if (!missing(f)) {
2179+
if (!length(f) && nrow(x))
2180+
stop("group length is 0 but data nrow > 0")
2181+
if (!missing(by))
2182+
stop("passing 'f' argument together with 'by' is not allowed, use 'by' when split by column in data.table and 'f' when split by external factor")
2183+
# same as split.data.frame - handling all exceptions, factor orders etc. in a single stream of processing was a nightmare for factor and drop consistency
2184+
return(lapply(split(x = seq_len(nrow(x)), f = f, drop = drop, ...), function(ind) x[ind]))
2185+
}
2186+
if (missing(by)) stop("you must provide 'by' or 'f' arguments")
2187+
# check column names reserved for temporary columns used during processing
2188+
if (".ll.tech.split" %in% names(x)) stop("column '.ll.tech.split' is reserved for split.data.table processing")
2189+
if (".nm.tech.split" %in% by) stop("column '.nm.tech.split' is reserved for split.data.table processing")
2190+
if (!all(by %in% names(x))) stop("argument 'by' must refer to data.table column names")
2191+
if (!all(by.atomic <- sapply(by, function(.by) is.atomic(x[[.by]])))) stop(sprintf("argument 'by' must refer only to atomic type columns, classes of '%s' columns are not atomic type", paste(by[!by.atomic], collapse=", ")))
2192+
# result is a list of data.tables (flatten) or a list of lists of ... data.tables
2193+
# make.levels builds the data.table used as `i` in the join path: the cross join (CJ)
# of, per column, the factor levels (factor columns) or the unique values (other columns).
make.levels = function(x, cols, sorted) {
2194+
by.order = if (!sorted) x[, funique(.SD), .SDcols=cols] # remember order of data; only needed when sorted=FALSE (NULL otherwise)
2195+
ul = lapply(setNames(nm=cols), function(col) if (!is.factor(x[[col]])) unique(x[[col]]) else levels(x[[col]]))
2196+
r = do.call("CJ", c(ul, sorted=sorted, unique=TRUE))
2197+
if (!sorted && nrow(by.order)) {
2198+
ii = r[by.order, on=cols, which=TRUE]
2199+
r = rbindlist(list(
2200+
r[ii], # original order from data
2201+
r[-ii] # empty levels at the end
2202+
))
2203+
}
2204+
r
2205+
}
2206+
# first grouping column; the only one used at this level when flatten=FALSE
.by = by[1L]
2207+
# this builds the data.table call - it is much cleaner than handling each case one by one
2208+
dtq = as.list(call("[", as.name("x")))
2209+
join = FALSE
2210+
flatten_any = flatten && any(sapply(by, function(col) is.factor(x[[col]])))
2211+
nested_current = !flatten && is.factor(x[[.by]])
2212+
# a join on all level combinations is only used when drop=FALSE and a factor column is involved at this level
if (!drop && (flatten_any || nested_current)) {
2213+
dtq[["i"]] = substitute(make.levels(x, cols=.cols, sorted=.sorted), list(.cols=if (flatten) by else .by, .sorted=sorted))
2214+
join = TRUE
2215+
}
2216+
# `j` wraps each group's .SD in a list column named .ll.tech.split
dtq[["j"]] = substitute(
2217+
list(.ll.tech.split=list(.expr)),
2218+
list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # could be simplified once `nomatch` accepts NULL (#857)?
2219+
)
2220+
by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L]
2221+
dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`.
2222+
.expr,
2223+
list(.expr = if(join) as.name(".EACHI") else if (flatten) by else .by)
2224+
)
2225+
dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by)
2226+
if (join) dtq[["on"]] = if (flatten) by else .by
2227+
dtq = as.call(dtq)
2228+
if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="")
2229+
tmp = eval(dtq)
2230+
# add names to the list: values of by[1L] when not flattening, otherwise the group values of all `by` columns pasted with "."
2231+
setattr(ll <- tmp$.ll.tech.split,
2232+
"names",
2233+
as.character(
2234+
if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(.SD), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split
2235+
))
2236+
# handle nested split: either finish here (setting '.data.table.locked' to NULL on every element)
# or recurse into each element with the remaining `by` columns
# NOTE(review): `verbose` is not forwarded in the recursive call, so nested levels fall back to getOption("datatable.verbose")
2237+
if (flatten || length(by) == 1L) return(
2238+
lapply(ll, setattr, '.data.table.locked', NULL)
2239+
) else if (length(by) > 1L) return(
2240+
lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten)
2241+
)
21792242
}
21802243

21812244
# TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@
7171
27. `by` understands `colA:colB` syntax now, like `.SDcols` does, [#1395](https://github.com/Rdatatable/data.table/issues/1395). Thanks @franknarf1.
7272

7373
28. Joins (and binary search based subsets) using the `on=` argument now reuse existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki.
74+
75+
29. New `split` method for data.table. It is faster, more flexible, and consistent with the data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).
7476

7577
#### BUG FIXES
7678

0 commit comments

Comments
 (0)