diff --git a/NEWS.md b/NEWS.md index 0146ba5386..170b6d929c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ and documented in the manual for `AbstractDataFrame`, `DataFrameRow`, `DataFrameRows`, `DataFrameColumns`, `GroupedDataFrame`, `GroupKeys`, and `GroupKey` ([#2573](https://github.com/JuliaData/DataFrames.jl/pull/2573)) +* add `subset` and `subset!` functions that allow subsetting rows + ([#2496](https://github.com/JuliaData/DataFrames.jl/pull/2496)) ## Deprecated diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 128f0ac9e7..d6d327c94c 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -102,6 +102,8 @@ first last only nonunique +subset +subset! unique unique! ``` diff --git a/docs/src/man/comparisons.md b/docs/src/man/comparisons.md index 64ba636af1..fabfa5ef36 100644 --- a/docs/src/man/comparisons.md +++ b/docs/src/man/comparisons.md @@ -12,6 +12,11 @@ df = DataFrame(grp = repeat(1:2, 3), x = 6:-1:1, y = 4:9, z = [3:7; missing], id df2 = DataFrame(grp = [1, 3], w = [10, 11]) ``` +Note that in the comparisons presented below predicates like `x -> x >= 1` can +be more compactly written as `>=(1)`. The latter form has an additional benefit +that it is compiled only once per Julia session (as opposed to `x -> x >= 1` +which defines a new anonymous function every time it is introduced). 
+ ## Comparison with the Python package pandas The following table compares the main functions of DataFrames.jl with the Python package pandas (version 1.1.0): @@ -204,7 +209,7 @@ df <- tibble(grp = rep(1:2, 3), x = 6:1, y = 4:9, | Rename columns | `rename(df, x_new = x)` | `rename(df, :x => :x_new)` | | Pick columns | `select(df, x, y)` | `select(df, :x, :y)` | | Pick & transform columns | `transmute(df, mean(x), y)` | `select(df, :x => mean, :y)` | -| Pick rows | `filter(df, x >= 1)` | `filter(:x => >=(1), df)` | +| Pick rows | `filter(df, x >= 1)` | `subset(df, :x => ByRow(x -> x >= 1))` | | Sort rows | `arrange(df, x)` | `sort(df, :x)` | As in dplyr, some of these functions can be applied to grouped data frames, in which case they operate by group: @@ -240,7 +245,7 @@ The following table compares the main functions of DataFrames.jl with Stata: | Add new columns | `egen x_mean = mean(x)` | `transform!(df, :x => mean => :x_mean)` | | Rename columns | `rename x x_new` | `rename!(df, :x => :x_new)` | | Pick columns | `keep x y` | `select!(df, :x, :y)` | -| Pick rows | `keep if x >= 1` | `filter!(:x => >=(1), df)` | +| Pick rows | `keep if x >= 1` | `subset!(df, :x => ByRow(x -> x >= 1))` | | Sort rows | `sort x` | `sort!(df, :x)` | Note that the suffix `!` (i.e. 
`transform!`, `select!`, etc) ensures that the operation transforms the dataframe in place, as in Stata diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 0715c40a3e..7417259461 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -64,6 +64,8 @@ export AbstractDataFrame, select, semijoin, stack, + subset, + subset!, transform, transform!, unique!, @@ -104,6 +106,7 @@ include("dataframerow/utils.jl") include("other/broadcasting.jl") include("abstractdataframe/selection.jl") +include("abstractdataframe/subset.jl") include("abstractdataframe/iteration.jl") include("abstractdataframe/join.jl") include("abstractdataframe/reshape.jl") diff --git a/src/abstractdataframe/subset.jl b/src/abstractdataframe/subset.jl new file mode 100644 index 0000000000..6f0cdfca31 --- /dev/null +++ b/src/abstractdataframe/subset.jl @@ -0,0 +1,283 @@ +# subset allows a transformation specification without a target column name or a column + +_process_subset_pair(i::Int, a::ColumnIndex) = a => Symbol(:x, i) +_process_subset_pair(i::Int, @nospecialize(a::Pair{<:Any, <:Base.Callable})) = + first(a) => last(a) => Symbol(:x, i) +_process_subset_pair(i::Int, a) = + throw(ArgumentError("condition specifier $a is not supported by `subset`")) + +_and() = throw(ArgumentError("at least one condition must be passed")) +_and(x::Bool) = x +_and(x::Bool, y::Bool...) = x && _and(y...) + +function _and(x::Any...) 
+ loc = findfirst(x -> !(x isa Bool), x) + # we know x has positive length and must contain non-boolean + @assert !isnothing(loc) + xv = x[loc] + if ismissing(xv) + throw(ArgumentError("missing was returned in condition number $loc " * + "but only true or false are allowed; pass " * + "skipmissing=true to skip missing values")) + else + throw(ArgumentError("value $xv was returned in condition number $loc " * + "but only true or false are allowed")) + end +end + +_and_missing() = throw(ArgumentError("at least one condition must be passed")) +_and_missing(x::Bool) = x +_and_missing(x::Bool, y::Union{Bool, Missing}...) = x && _and_missing(y...) +_and_missing(x::Missing, y::Union{Bool, Missing}...) = false + +# Fallback method: reached only when some argument is neither Bool nor missing; +# it reports the position and value of the first such argument. +function _and_missing(x::Any...) + loc = findfirst(x -> !(x isa Union{Bool, Missing}), x) + # we know x has positive length and must contain a value that is + # neither Bool nor missing + @assert !isnothing(loc) + xv = x[loc] + throw(ArgumentError("value $xv was returned in condition number $loc " * + "but only true, false, or missing are allowed")) +end + + +# Note that _get_subset_conditions will have a large compilation time +# if more than 32 conditions are passed as `args`. +function _get_subset_conditions(df::Union{AbstractDataFrame, GroupedDataFrame}, + @nospecialize(args), skipmissing::Bool) + conditions = Any[_process_subset_pair(i, a) for (i, a) in enumerate(args)] + + isempty(conditions) && throw(ArgumentError("at least one condition must be passed")) + + if df isa AbstractDataFrame + df_conditions = select(df, conditions..., copycols=!(df isa DataFrame)) + else + df_conditions = select(df, conditions..., + copycols=!(parent(df) isa DataFrame), keepkeys=false) + end + + @assert ncol(df_conditions) == length(conditions) + + if skipmissing + cond = _and_missing.(eachcol(df_conditions)...) + else + cond = _and.(eachcol(df_conditions)...) 
+ end + + @assert eltype(cond) === Bool + return cond +end + +""" + subset(df::AbstractDataFrame, args...; skipmissing::Bool=false, view::Bool=false) + subset(gdf::GroupedDataFrame, args...; skipmissing::Bool=false, view::Bool=false, + ungroup::Bool=true) + +Return a copy of data frame `df` or parent of `gdf` containing only rows for +which all values produced by transformation(s) `args` for a given row are `true`. + +Each argument passed in `args` can be either a single column selector or a +`source_columns => function` transformation specifier following the rules +described for [`select`](@ref). + +Note that as opposed to [`filter`](@ref) the `subset` function works on whole +columns (or all rows in groups for `GroupedDataFrame`). + +If `skipmissing=false` (the default) `args` are required to produce vectors +containing only `Bool` values. If `skipmissing=true`, additionally `missing` is +allowed and it is treated as `false` (i.e. rows for which one of the conditions +returns `missing` are skipped). + +If `view=true` a `SubDataFrame` view is returned instead of a `DataFrame`. + +If `ungroup=false` the resulting data frame is re-grouped based on the same +grouping columns as `gdf` and a `GroupedDataFrame` is returned. + +If a `GroupedDataFrame` is passed then it must include all groups present in the +`parent` data frame, like in [`select!`](@ref). + +See also: [`subset!`](@ref), [`filter`](@ref), [`select`](@ref) + +# Examples + +``` +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], + z=[true, true, missing, missing], v=[1, 2, 11, 12]) +4×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 1 true true true 1 + 2 │ 2 false true true 2 + 3 │ 3 true false missing 11 + 4 │ 4 false false missing 12 + +julia> subset(df, :x) +2×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? 
Int64 +─────┼──────────────────────────────────── + 1 │ 1 true true true 1 + 2 │ 3 true false missing 11 + +julia> subset(df, :v => x -> x .> 3) +2×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 3 true false missing 11 + 2 │ 4 false false missing 12 + +julia> subset(df, :x, :y => ByRow(!)) +1×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼──────────────────────────────────── + 1 │ 3 true false missing 11 + +julia> subset(df, :x, :z, skipmissing=true) +1×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────── + 1 │ 1 true true true 1 + +julia> subset(df, :x, :z) +ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values + +julia> subset(groupby(df, :y), :v => x -> x .> minimum(x)) +2×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? Int64 +─────┼───────────────────────────────────── + 1 │ 2 false true true 2 + 2 │ 4 false false missing 12 +``` +""" +function subset(df::AbstractDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false) + row_selector = _get_subset_conditions(df, args, skipmissing) + return view ? Base.view(df, row_selector, :) : df[row_selector, :] +end + +function subset(gdf::GroupedDataFrame, @nospecialize(args...); + skipmissing::Bool=false, view::Bool=false, + ungroup::Bool=true) + row_selector = _get_subset_conditions(gdf, args, skipmissing) + df = parent(gdf) + res = view ? Base.view(df, row_selector, :) : df[row_selector, :] + # TODO: in some cases it might be faster to groupby gdf.groups[row_selector] + return ungroup ? 
res : groupby(res, groupcols(gdf)) +end + +""" + subset!(df::AbstractDataFrame, args...; skipmissing::Bool=false) + subset!(gdf::GroupedDataFrame{DataFrame}, args...; skipmissing::Bool=false, + ungroup::Bool=true) + +Update data frame `df` or the parent of `gdf` in place to contain only rows for +which all values produced by transformation(s) `args` for a given row are `true`. + +Each argument passed in `args` can be either a single column selector or a +`source_columns => function` transformation specifier following the rules +described for [`select`](@ref). + +Note that as opposed to [`filter!`](@ref) the `subset!` function works on whole +columns (or all rows in groups for `GroupedDataFrame`). + +If `skipmissing=false` (the default) `args` are required to produce vectors +containing only `Bool` values. If `skipmissing=true`, additionally `missing` is +allowed and it is treated as `false` (i.e. rows for which one of the conditions +returns `missing` are skipped). + +If `ungroup=false` the resulting data frame is re-grouped based on the same +grouping columns as `gdf` and a `GroupedDataFrame` is returned. + +If a `GroupedDataFrame` is passed then it must include all groups present in the +`parent` data frame, like in [`select!`](@ref). 
+ +See also: [`subset`](@ref), [`filter!`](@ref), [`select!`](@ref) + +# Examples + +``` +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false]) +4×3 DataFrame + Row │ id x y + │ Int64 Bool Bool +─────┼───────────────────── + 1 │ 1 true true + 2 │ 2 false true + 3 │ 3 true false + 4 │ 4 false false + +julia> subset!(df, :x, :y => ByRow(!)); + +julia> df +1×3 DataFrame + Row │ id x y + │ Int64 Bool Bool +─────┼──────────────────── + 1 │ 3 true false + +julia> df = DataFrame(id=1:4, y=[true, true, false, false], v=[1, 2, 11, 12]); + +julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); + +julia> df +2×3 DataFrame + Row │ id y v + │ Int64 Bool Int64 +─────┼───────────────────── + 1 │ 2 true 2 + 2 │ 4 false 12 + +julia> df = DataFrame(id=1:4, x=[true, false, true, false], + z=[true, true, missing, missing], v=1:4) +4×4 DataFrame + Row │ id x z v + │ Int64 Bool Bool? Int64 +─────┼────────────────────────────── + 1 │ 1 true true 1 + 2 │ 2 false true 2 + 3 │ 3 true missing 3 + 4 │ 4 false missing 4 + +julia> subset!(df, :x, :z) +ERROR: ArgumentError: missing was returned in condition number 2 but only true or false are allowed; pass skipmissing=true to skip missing values + +julia> subset!(df, :x, :z, skipmissing=true); + +julia> df +1×4 DataFrame + Row │ id x z v + │ Int64 Bool Bool? Int64 +─────┼─────────────────────────── + 1 │ 1 true true 1 + +julia> df = DataFrame(id=1:4, x=[true, false, true, false], y=[true, true, false, false], + z=[true, true, missing, missing], v=[1, 2, 11, 12]); + +julia> subset!(groupby(df, :y), :v => x -> x .> minimum(x)); + +julia> df +2×5 DataFrame + Row │ id x y z v + │ Int64 Bool Bool Bool? 
Int64 +─────┼───────────────────────────────────── + 1 │ 2 false true true 2 + 2 │ 4 false false missing 12 +``` +""" +function subset!(df::AbstractDataFrame, @nospecialize(args...); skipmissing::Bool=false) + row_selector = _get_subset_conditions(df, args, skipmissing) + return delete!(df, findall(!, row_selector)) +end + +function subset!(gdf::GroupedDataFrame, @nospecialize(args...); skipmissing::Bool=false, + ungroup::Bool=true) + row_selector = _get_subset_conditions(gdf, args, skipmissing) + df = parent(gdf) + res = delete!(df, findall(!, row_selector)) + # TODO: in some cases it might be faster to groupby gdf.groups[row_selector] + return ungroup ? res : groupby(res, groupcols(gdf)) +end diff --git a/test/grouping.jl b/test/grouping.jl index a2df4868ae..cfa3ae2a25 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3272,4 +3272,245 @@ end @test df == df2 end +@testset "subset and subset!" begin + refdf = DataFrame(x = repeat(Any[true, false], 4), + y = repeat([true, false, missing, missing], 2), + z = repeat([1, 2, 3, 3], 2), + id = 1:8) + + for df in (copy(refdf), @view copy(refdf)[1:end-1, :]) + df2 = copy(df) + @test subset(df, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 + @test subset(df, :x) isa DataFrame + @test subset(df, :x, view=true) ≅ filter(:x => identity, df) + @test subset(df, :x, view=true) isa SubDataFrame + @test_throws ArgumentError subset(df, :y) + @test_throws ArgumentError subset(df, :y, :x) + @test subset(df, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :y, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(df, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test 
subset(df, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(df, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test subset(df, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(df) + @test isempty(subset(df, :x, :x => ByRow(!))) + @test_throws ArgumentError subset(df, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(df, :x => x -> true, :x => x -> 2) + end + + for df in (copy(refdf), @view copy(refdf)[1:end-1, :]), + gdf in (groupby_checked(df, :z), groupby_checked(df, :z)[[3, 2, 1]]) + df2 = copy(df) + @test subset(gdf, :x) ≅ filter(:x => identity, df) + @test df ≅ df2 + @test subset(gdf, :x) isa DataFrame + @test subset(gdf, :x, ungroup=false) ≅ + groupby_checked(filter(:x => identity, df), :z) + @test subset(gdf, :x, ungroup=false) isa GroupedDataFrame{DataFrame} + @test subset(gdf, :x, view=true) ≅ filter(:x => identity, df) + @test subset(gdf, :x, view=true) isa SubDataFrame + @test subset(gdf, :x, view=true, ungroup=false) ≅ + groupby_checked(filter(:x => identity, df), :z) + @test subset(gdf, :x, view=true, ungroup=false) isa GroupedDataFrame{<:SubDataFrame} + @test_throws ArgumentError subset(gdf, :y) + @test_throws ArgumentError subset(gdf, :y, :x) + @test subset(gdf, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, :y, skipmissing=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :y, 
:y, skipmissing=true, view=true) ≅ filter(:y => x -> x === true, df) + @test subset(gdf, :x, :y, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :y, :x, skipmissing=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, skipmissing=true, view=true) ≅ + filter([:x, :y] => (x, y) -> x && y === true, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true, view=true) ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4))) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test subset(gdf, :x, :id => ByRow(<(4)), view=true) ≅ + filter([:x, :id] => (x, id) -> x && id < 4, df) + @test_throws ArgumentError subset(gdf) + @test isempty(subset(gdf, :x, :x => ByRow(!))) + @test_throws ArgumentError subset(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset(gdf, :x => x -> true, :x => x -> 2) + end + + df = copy(refdf) + @test subset!(df, :x) === df + @test subset!(df, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df, :y) + @test df ≅ refdf + df = copy(refdf) + @test subset!(df, :y, skipmissing=true) === df + @test subset!(df, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, skipmissing=true) === df + @test subset!(df, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + @test subset!(df, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + @test subset!(df, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x 
&& id < 4, refdf) + df = copy(refdf) + @test_throws ArgumentError subset!(df) + df = copy(refdf) + @test isempty(subset!(df, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + @test_throws ArgumentError subset!(df, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(df, :x => x -> true, :x => x -> 2) + + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x) === df + + df = copy(refdf) + gdf = groupby_checked(df, :z) + gdf2 = subset!(gdf, :x, ungroup=false) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test parent(gdf2) === df + @test gdf2 ≅ groupby_checked(df, :z) ≅ groupby_checked(filter(:x => identity, refdf), :z) + + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) === df + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby_checked(df, :z) + 
@test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + df = copy(refdf) + gdf = groupby_checked(df, :z) + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) + + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x) ≅ df ≅ filter(:x => identity, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :y) + @test df ≅ refdf + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test subset!(gdf, :y, skipmissing=true) ≅ df ≅ filter(:y => x -> x === true, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, skipmissing=true) ≅ df ≅ + filter([:x, :y] => (x, y) -> x && y === true, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :y, :id => ByRow(<(4)), skipmissing=true) ≅ df ≅ + filter([:x, :y, :id] => (x, y, id) -> x && y === true && id < 4, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test subset!(gdf, :x, :id => ByRow(<(4))) ≅ df ≅ + filter([:x, :id] => (x, id) -> x && id < 4, refdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf) + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test isempty(subset!(gdf, :x, :x => ByRow(!))) + @test isempty(df) + + df = copy(refdf) + gdf = groupby_checked(df, :z)[[3, 2, 1]] + @test_throws ArgumentError subset!(gdf, :x => x -> false, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> missing) + @test_throws ArgumentError subset!(gdf, :x => x -> true, :x => x -> 2) + + @test_throws ArgumentError subset!(view(refdf, :, :), :x) + @test_throws ArgumentError subset!(groupby_checked(view(refdf, :, :), :z), :x) + + df = 
DataFrame(g=[2, 2, 1, 1, 1, 1, 3, 3, 3], x = 1:9) + @test subset(df, :x => x -> x .< mean(x)) == DataFrame(g=[2, 2, 1, 1], x = 1:4) + @test subset(groupby_checked(df, :g), :x => x -> x .< mean(x)) == + DataFrame(g=[2, 1, 1, 3], x=[1, 3, 4, 7]) + + @test_throws ArgumentError subset(df, :x => x -> missing) + @test isempty(subset(df, :x => x -> missing, skipmissing=true)) + @test isempty(subset(df, :x => x -> false)) + @test subset(df, :x => x -> true) ≅ df + @test_throws ArgumentError subset(df, :x => x -> (a=x,)) + @test_throws ArgumentError subset(df, :x => (x -> (a=x,)) => AsTable) + + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=missing, y=false), :x) + @test_throws ArgumentError subset(DataFrame(x=false, y=missing), :y) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y) + + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :x, :y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :x, :y, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=1, y=false), :y, :x, skipmissing=true) + @test_throws ArgumentError subset(DataFrame(x=false, y=1), :y, skipmissing=true) + + @test_throws ArgumentError DataFrames._and() + @test_throws ArgumentError DataFrames._and_missing() +end + +@testset "make sure we handle idx correctly when groups are reordered" begin + df = DataFrame(g=[2, 2, 1, 1, 1], id = 1:5) + @test select(df, :g, :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g), :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g, sort=true), :id, :id => ByRow(identity) => :id2) == + select(groupby_checked(df, :g)[[2,1]], 
:id, :id => ByRow(identity) => :id2) == + [df DataFrame(id2=df.id)] +end + end # module