diff --git a/NEWS.md b/NEWS.md index d2c13959c0..bda4d27d90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,9 @@ * Fix incorrect handling of column metadata in `insertcols!` and `insertcols` ([#3220](https://github.com/JuliaData/DataFrames.jl/pull/3220)) +* Correctly handle `GroupedDataFrame` with no groups in multi-column + operation specification syntax + ([#3122](https://github.com/JuliaData/DataFrames.jl/issues/3122)) ## Display improvements diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index fc1d60fec9..5e34a2e175 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -30,15 +30,24 @@ object from your data frame using the `groupby` function that takes two argument (1) a data frame to be grouped, and (2) a set of columns to group by. Operations can then be applied on each group using one of the following functions: -* `combine`: does not put restrictions on number of rows returned, the order of rows - is specified by the order of groups in `GroupedDataFrame`; it is typically used - to compute summary statistics by group; +* `combine`: does not put restrictions on number of rows returned per group; + the returned values are vertically concatenated following order of groups in + `GroupedDataFrame`; it is typically used to compute summary statistics by group; + for `GroupedDataFrame` if grouping columns are kept they are put as first columns + in the result; * `select`: return a data frame with the number and order of rows exactly the same as the source data frame, including only new calculated columns; `select!` is an in-place version of `select`; * `transform`: return a data frame with the number and order of rows exactly the same as the source data frame, including all columns from the source and new calculated columns; - `transform!` is an in-place version of `transform`. 
+ `transform!` is an in-place version of `transform`; + existing columns in the source data frame are put as first columns in the result; + +As a special case, if a `GroupedDataFrame` that has zero groups is passed then +the result of the operation is determined by performing a single call to the +transformation function with a 0-row argument passed to it. The output of this +operation is only used to identify the number and type of produced columns, but +the result has zero rows. All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 0d1363fe89..9e32989c68 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -37,19 +37,26 @@ const TRANSFORMATION_COMMON_RULES = (1) a data frame to be grouped, and (2) a set of columns to group by. Operations can then be applied on each group using one of the following functions: - * `combine`: does not put restrictions on number of rows returned, the order of rows - is specified by the order of groups in `GroupedDataFrame`; it is typically used - to compute summary statistics by group; for `GroupedDataFrame` if grouping columns - are kept they are put as first columns in the result; + * `combine`: does not put restrictions on number of rows returned per group; + the returned values are vertically concatenated following order of groups in + `GroupedDataFrame`; it is typically used to compute summary statistics by group; + for `GroupedDataFrame` if grouping columns are kept they are put as first columns + in the result; * `select`: return a data frame with the number and order of rows exactly the same as the source data frame, including only new calculated columns; `select!` is an in-place version of `select`; for `GroupedDataFrame` if grouping columns are kept they are put as first columns in the 
result; * `transform`: return a data frame with the number and order of rows exactly the same as the source data frame, including all columns from the source and new calculated columns; - `transform!` is an in-place version of `transform`; for `GroupedDataFrame` + `transform!` is an in-place version of `transform`; existing columns in the source data frame are put as first columns in the result; + As a special case, if a `GroupedDataFrame` that has zero groups is passed then + the result of the operation is determined by performing a single call to the + transformation function with a 0-row argument passed to it. The output of this + operation is only used to identify the number and type of produced columns, but + the result has zero rows. + All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: 1. standard column selectors (integers, `Symbol`s, strings, vectors of integers, diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl index 8a922d9547..e7e256cac1 100644 --- a/src/groupeddataframe/complextransforms.jl +++ b/src/groupeddataframe/complextransforms.jl @@ -28,7 +28,6 @@ function _combine_with_first((first,)::Ref{Any}, @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame} @assert f isa Base.Callable @assert incols isa Union{Nothing, AbstractVector, Tuple, NamedTuple} - @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame} extrude = false lgd = length(gd) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 54580289c4..b627b7ba5d 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -486,6 +486,23 @@ function _combine_process_pair_symbol(optional_i::Bool, end end +@noinline function expand_res_astable(res, kp1, emptyres::Bool) + prepend = all(x -> x isa Integer, kp1) + if !(prepend 
|| all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) + throw(ArgumentError("keys of the returned elements must be " * + "`Symbol`s, strings or integers")) + end + if any(x -> !isequal(keys(x), kp1), res) + throw(ArgumentError("keys of the returned elements must be equal")) + end + outcols = [[x[n] for x in res] for n in kp1] + # make sure we only infer column names and types for empty res, but do not + # produce values that were generated when computing firstres + emptyres && foreach(empty!, outcols) + nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1] + return outcols, nms +end + # perform a transformation specified using the Pair notation with multiple output columns function _combine_process_pair_astable(optional_i::Bool, gd::GroupedDataFrame, @@ -506,19 +523,15 @@ function _combine_process_pair_astable(optional_i::Bool, firstmulticol, NOTHING_IDX_AGG, threads) @assert length(outcol_vec) == 1 res = outcol_vec[1] - @assert length(res) > 0 - - kp1 = keys(res[1]) - prepend = all(x -> x isa Integer, kp1) - if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1)) - throw(ArgumentError("keys of the returned elements must be " * - "`Symbol`s, strings or integers")) - end - if any(x -> !isequal(keys(x), kp1), res) - throw(ArgumentError("keys of the returned elements must be identical")) + if isempty(res) + emptyres = true + res = firstres + else + emptyres = false end - outcols = [[x[n] for x in res] for n in kp1] - nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1] + kp1 = isempty(res) ? 
() : keys(res[1]) + + outcols, nms = expand_res_astable(res, kp1, emptyres) else if !firstmulticol firstres = Tables.columntable(firstres) @@ -527,9 +540,8 @@ function _combine_process_pair_astable(optional_i::Bool, end idx, outcols, nms = _combine_multicol(Ref{Any}(firstres), Ref{Any}(fun), gd, wincols, threads) - if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame, - NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) + NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}}) lock(gd.lazy_lock) do # if idx_agg was not computed yet it is nothing # in this case if we are not passed a vector compute it. @@ -541,8 +553,8 @@ function _combine_process_pair_astable(optional_i::Bool, idx = idx_agg[] end end - @assert length(outcols) == length(nms) end + @assert length(outcols) == length(nms) if out_col_name isa AbstractVector{Symbol} if length(out_col_name) != length(nms) throw(ArgumentError("Number of returned columns is $(length(nms)) " * diff --git a/test/grouping.jl b/test/grouping.jl index d26f09cb22..670179ff87 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4312,4 +4312,70 @@ end @test_throws ArgumentError gdf[Not([true true true true])] end +@testset "aggregation of empty GroupedDataFrame with table output" begin + df = DataFrame(:a => Int[]) + gdf = groupby(df, :a) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x=Int[], y=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[])) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=Int[], b=Int[])) + @test_throws 
ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=Int[], x=Int[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=Int[], x=Vector{Int}[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + + df = DataFrame(:a => [1, 2]) + gdf = groupby(df, :a)[2:1] + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x=Int[], y=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[])) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[])) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=Int[], b=Int[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=Int[], x=Int[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=Int[], x=Vector{Int}[], y=Any[])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + + df = DataFrame(:a => [1, 2]) + gdf = groupby(df, 
:a) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b), + DataFrame(a=1:2, x=[1, 1], y=["a", "a"], b=1:2)) + @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b), + DataFrame(a=1:2, x1=[1, 1], x2=["a", "a"], b=1:2)) + @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b), + DataFrame(a=1:2, x1=['a', 'a'], x2=['b', 'b'], b=1:2)) + # test below errors because keys for strings do not support == comparison + @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b) + @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b), + DataFrame(a=1:2, b=1:2)) + @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable) + @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable), + DataFrame(a=[1, 1, 2, 2], x=[1, 3, 1, 3], y=Any[2, "a", 2, "a"])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable), + DataFrame(a=[1, 1, 2, 2], x=[[1], [3], [1], [3]], y=Any[2, "a", 2, "a"])) + @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]), + DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"])) + @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3]) + @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable) +end + end # module