diff --git a/docs/src/index.md b/docs/src/index.md index bf315786..f1139d68 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,7 +12,7 @@ in C#. In addition, DataFramesMeta provides * `@orderby`, for sorting data frames -* `@where`, for keeping rows of a DataFrame matching a given condition +* `@subset` and `@subset!`, for keeping rows of a data frame matching a given condition * `@by`, for grouping and combining a data frame in a single step * `@with`, for working with the columns of a data frame with high performance and convenient syntax @@ -96,18 +96,21 @@ gd = groupby(df, :x); @transform!(gd, y = 2 .* :y .* first(:y)) ``` -## `@where` +## `@subset` and `@subset!` Select row subsets. Operates on both a `DataFrame` and a `GroupedDataFrame`. +`@subset` always returns a freshly-allocated data frame whereas +`@subset!` modifies the data frame in-place. ```julia +using Statistics df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); gd = groupby(df, :x); outside_var = 1; -@where(df, :x .> 1) -@where(df, :x .> outside_var) -@where(df, :x .> outside_var, :y .< 102) # the two expressions are "and-ed" -@where(gd, :x .> mean(:x)) +@subset(df, :x .> 1) +@subset(df, :x .> outside_var) +@subset(df, :x .> outside_var, :y .< 102) # the two expressions are "and-ed" +@subset(gd, :x .> mean(:x)) ``` ## `@combine` @@ -300,7 +303,7 @@ The following macros accept `@byrow`: * `@transform` and `@transform!`, `@select`, `@select!`, and `@combine`. `@byrow` can be used in the left hand side of expressions, e.g. `@select(df, @byrow z = :x * :y)`. -* `@where` and `@orderby`, with syntax of the form `@where(df, @byrow :x > :y)` +* `@subset`, `@subset!` and `@orderby`, with syntax of the form `@subset(df, @byrow :x > :y)` * `@with`, where the anonymous function created by `@with` is wrapped in `ByRow`, as in `@with(df, @byrow :x * :y)`. 
diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index c1f4ad20..f2974d5b 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -7,11 +7,14 @@ using MacroTools @reexport using DataFrames # Basics: -export @with, @where, @orderby, @transform, @by, @combine, @select, - @transform!, @select!, +export @with, + @subset, @subset!, + @orderby, + @by, @combine, + @transform, @select, @transform!, @select!, @eachrow, @eachrow!, @byrow, - @based_on # deprecated + @based_on, @where # deprecated include("parsing.jl") include("macros.jl") diff --git a/src/macros.jl b/src/macros.jl index 83e6b3e0..6326c287 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -415,52 +415,191 @@ macro with(d, body) end - ############################################################################## ## -## @where - select row subsets +## @subset and @subset! - select row subsets ## ############################################################################## -function where_helper(x, args...) - exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = true, wrap_byrow = wrap_byrow) for ex in exprs) +function subset_helper(x, args...) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; no_dest=true, outer_flags=outer_flags) for ex in exprs) quote - $where($x, $(t...)) + $subset($x, $(t...); skipmissing=true) end end -function df_to_bool(res::AbstractDataFrame) - if any(t -> !(t isa AbstractVector{<:Union{Missing, Bool}}), eachcol(res)) - throw(ArgumentError("All arguments in @where must return an " * - "AbstractVector{<:Union{Missing, Bool}}")) +function where_helper(x, args...) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; no_dest=true, outer_flags=outer_flags) for ex in exprs) + quote + $subset($x, $(t...); skipmissing=true) end +end + +""" + @subset(d, i...) + +Select row subsets in `AbstractDataFrame`s and `GroupedDataFrame`s. 
+ +### Arguments + +* `d` : an AbstractDataFrame or GroupedDataFrame +* `i...` : expression for selecting rows + +Multiple `i` expressions are "and-ed" together. + +If given a `GroupedDataFrame`, `@subset` applies transformations by +group, and returns a fresh `DataFrame` containing the rows +for which the generated values are all `true`. + +Inputs to `@subset` can come in two formats: a `begin ... end` block, in which case each +line is a separate selector, or as multiple arguments. +For example the following two statements are equivalent: + +```julia +@subset df begin + :x .> 1 + :y .< 2 +end +``` + +and + +``` +@subset(df, :x .> 1, :y .< 2) +``` + +!!! note + `@subset` treats `missing` values as `false` when filtering rows. + Unlike `DataFrames.subset` and other Boolean operations with + `missing`, `@subset` will *not* error on missing values, and + will only keep `true` values. + +If an expression provided to `@subset` begins with `@byrow`, operations +are applied "by row" along the data frame. To avoid writing `@byrow` multiple +times, `@subset` also allows `@byrow` to be placed at the beginning of a block of +operations. For example, the following two statements are equivalent. 
+ +``` +@subset df @byrow begin + :x > 1 + :y < 2 +end +``` + +and + +``` +@subset df begin + @byrow :x > 1 + @byrow :y < 2 +end +``` + +### Examples + +```jldoctest +julia> using DataFramesMeta, Statistics + +julia> df = DataFrame(x = 1:3, y = [2, 1, 2]); + +julia> globalvar = [2, 1, 0]; + +julia> @subset(df, :x .> 1) +2×2 DataFrame + Row │ x y + │ Int64 Int64 +─────┼────────────── + 1 │ 2 1 + 2 │ 3 2 - return reduce((x, y) -> x .& y, eachcol(res)) .=== true +julia> @subset(df, :x .> globalvar) +2×2 DataFrame + Row │ x y + │ Int64 Int64 +─────┼────────────── + 1 │ 2 1 + 2 │ 3 2 + +julia> @subset df begin + :x .> globalvar + :y .== 3 end +0×2 DataFrame + +julia> d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1, + 2, 1, 1, 2, 2, 2, 3, 1, 1, 2]); + +julia> g = groupby(d, :x); -function where(df::AbstractDataFrame, @nospecialize(args...)) - res = DataFrames.select(df, args...; copycols = false) - tokeep = df_to_bool(res) - df[tokeep, :] +julia> @subset(g, :n .> mean(:n)) +8×2 DataFrame + Row │ n x + │ Int64 Int64 +─────┼────────────── + 1 │ 12 1 + 2 │ 13 1 + 3 │ 15 2 + 4 │ 16 2 + 5 │ 17 3 + 6 │ 18 1 + 7 │ 19 1 + 8 │ 20 2 + +julia> @subset g begin + :n .> mean(:n) + :n .< 20 + end +7×2 DataFrame + Row │ n x + │ Int64 Int64 +─────┼────────────── + 1 │ 12 1 + 2 │ 13 1 + 3 │ 15 2 + 4 │ 16 2 + 5 │ 17 3 + 6 │ 18 1 + 7 │ 19 1 + +julia> d = DataFrame(a = [1, 2, missing], b = ["x", "y", missing]); + +julia> @subset(d, :a .== 1) +1×2 DataFrame +│ Row │ a │ b │ +│ │ Int64? │ String? │ +├─────┼────────┼─────────┤ +│ 1 │ 1 │ x │ +``` +""" +macro subset(x, args...) + esc(subset_helper(x, args...)) end -function where(gd::GroupedDataFrame, @nospecialize(args...)) - res = DataFrames.select(gd, args...; copycols = false, keepkeys = false) - tokeep = df_to_bool(res) - parent(gd)[tokeep, :] +""" + @where(x, args...) + +Deprecated version of `@subset`, see `?@subset` for details. +""" +macro where(x, args...) + @warn "`@where is deprecated, use `@subset` with `@skipmissing` instead." 
+ esc(where_helper(x, args...)) end -function where(df::SubDataFrame, @nospecialize(args...)) - res = DataFrames.select(df, args...) - tokeep = df_to_bool(res) - df[tokeep, :] +function subset!_helper(x, args...) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; no_dest=true, outer_flags=outer_flags) for ex in exprs) + quote + $subset!($x, $(t...); skipmissing=true) + end end """ - @where(d, i...) + @subset!(d, i...) -Select row subsets in `AbstractDataFrame`s and `GroupedDataFrame`s. +Select row subsets in `AbstractDataFrame`s and `GroupedDataFrame`s, +mutating the underlying data-frame in-place. ### Arguments @@ -469,16 +608,16 @@ Select row subsets in `AbstractDataFrame`s and `GroupedDataFrame`s. Multiple `i` expressions are "and-ed" together. -If given a `GroupedDataFrame`, `@where` applies transformations by +If given a `GroupedDataFrame`, `@subset!` applies transformations by group, and returns a fresh `DataFrame` containing the rows for which the generated values are all `true`. -Inputs to `@where` can come in two formats: a `begin ... end` block, in which case each +Inputs to `@subset!` can come in two formats: a `begin ... end` block, in which case each line is a separate selector, or as multiple arguments. For example the following two statements are equivalent: ```julia -@where df begin +@subset! df begin :x .> 1 :y .< 2 end @@ -487,22 +626,22 @@ end and ``` -@where(df, :x .> 1, :y .< 2) +@subset!(df, :x .> 1, :y .< 2) ``` !!! note - `@where` treats `missing` values as `false` when filtering rows. - Unlike `DataFrames.filter` and other boolean operations with - `missing`, `@where` will *not* error on missing values, and + `@subset!` treats `missing` values as `false` when filtering rows. + Unlike `DataFrames.subset!` and other Boolean operations with + `missing`, `@subset!` will *not* error on missing values, and will only keep `true` values. 
-If an expression provided to `@where` begins with `@byrow`, operations +If an expression provided to `@subset!` begins with `@byrow`, operations are applied "by row" along the data frame. To avoid writing `@byrow` multiple times, `@orderby` also allows `@byrow`to be placed at the beginning of a block of operations. For example, the following two statements are equivalent. ``` -@where df @byrow begin +@subset! df @byrow begin :x > 1 :y < 2 end @@ -511,7 +650,7 @@ end and ``` -@orderby df +@subset! df @byrow :x > 1 @byrow :y < 2 end @@ -526,7 +665,7 @@ julia> df = DataFrame(x = 1:3, y = [2, 1, 2]); julia> globalvar = [2, 1, 0]; -julia> @where(df, :x .> 1) +julia> @subset!(df, :x .> 1) 2×2 DataFrame Row │ x y │ Int64 Int64 @@ -534,7 +673,7 @@ julia> @where(df, :x .> 1) 1 │ 2 1 2 │ 3 2 -julia> @where(df, :x .> globalvar) +julia> @subset!(df, :x .> globalvar) 2×2 DataFrame Row │ x y │ Int64 Int64 @@ -542,7 +681,7 @@ julia> @where(df, :x .> globalvar) 1 │ 2 1 2 │ 3 2 -julia> @where df begin +julia> @subset! df begin :x .> globalvar :y .== 3 end @@ -553,7 +692,7 @@ julia> d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1, julia> g = groupby(d, :x); -julia> @where(g, :n .> mean(:n)) +julia> @subset!(g, :n .> mean(:n)) 8×2 DataFrame Row │ n x │ Int64 Int64 @@ -567,7 +706,7 @@ julia> @where(g, :n .> mean(:n)) 7 │ 19 1 8 │ 20 2 -julia> @where g begin +julia> @subset! g begin :n .> mean(:n) :n .< 20 end @@ -585,7 +724,7 @@ julia> @where g begin julia> d = DataFrame(a = [1, 2, missing], b = ["x", "y", missing]); -julia> @where(d, :a .== 1) +julia> @subset!(d, :a .== 1) 1×2 DataFrame │ Row │ a │ b │ │ │ Int64? │ String? │ @@ -593,10 +732,11 @@ julia> @where(d, :a .== 1) │ 1 │ 1 │ x │ ``` """ -macro where(x, args...) - esc(where_helper(x, args...)) +macro subset!(x, args...) 
+ esc(subset!_helper(x, args...)) end + ############################################################################## ## ## @orderby @@ -604,8 +744,8 @@ end ############################################################################## function orderby_helper(x, args...) - exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = true, wrap_byrow = wrap_byrow) for ex in exprs) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; gensym_names = true, outer_flags = outer_flags) for ex in exprs) quote $DataFramesMeta.orderby($x, $(t...)) end @@ -768,8 +908,8 @@ end function transform_helper(x, args...) - exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.transform($x, $(t...)) end @@ -886,8 +1026,8 @@ end function transform!_helper(x, args...) - exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.transform!($x, $(t...)) end @@ -981,8 +1121,8 @@ end ############################################################################## function select_helper(x, args...) - exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.select($x, $(t...)) end @@ -1095,8 +1235,8 @@ end ############################################################################## function select!_helper(x, args...) 
- exprs, wrap_byrow = create_args_vector(args...) - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + exprs, outer_flags = create_args_vector(args...) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.select!($x, $(t...)) end @@ -1206,7 +1346,7 @@ end function combine_helper(x, args...; deprecation_warning = false) deprecation_warning && @warn "`@based_on` is deprecated. Use `@combine` instead." - exprs, wrap_byrow = create_args_vector(args...) + exprs, outer_flags = create_args_vector(args...) fe = first(exprs) if length(exprs) == 1 && @@ -1218,7 +1358,7 @@ function combine_helper(x, args...; deprecation_warning = false) exprs = ((:(cols(AsTable) = $fe)),) end - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.combine($x, $(t...)) @@ -1327,7 +1467,7 @@ end function by_helper(x, what, args...) # Only allow one argument when returning a Table object # Only allow one argument when returning a Table object - exprs, wrap_byrow = create_args_vector(args...) + exprs, outer_flags = create_args_vector(args...) fe = first(exprs) if length(exprs) == 1 && !(fe isa QuoteNode || onearg(fe, :cols)) && @@ -1338,7 +1478,7 @@ function by_helper(x, what, args...) exprs = ((:(cols(AsTable) = $fe)),) end - t = (fun_to_vec(ex; gensym_names = false, wrap_byrow = wrap_byrow) for ex in exprs) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) quote $DataFrames.combine($groupby($x, $what), $(t...)) diff --git a/src/parsing.jl b/src/parsing.jl index cd77f41c..e5a4427d 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -183,7 +183,10 @@ end # We need wrap_byrow as a keyword argument here in case someone # uses `@transform df @byrow begin ... end`, which we # deal with outside of this function. 
-function fun_to_vec(ex::Expr; gensym_names::Bool=false, no_dest::Bool=false, wrap_byrow::Bool=false) +function fun_to_vec(ex::Expr; + gensym_names::Bool=false, + outer_flags::Union{NamedTuple, Nothing}=nothing, + no_dest::Bool=false) # classify the type of expression # :x # handled via dispatch # cols(:x) # handled as though above @@ -201,15 +204,16 @@ function fun_to_vec(ex::Expr; gensym_names::Bool=false, no_dest::Bool=false, wra # cols(y) = :x + 1 # re-write as complicated col, but RHS is :block # cols(:y) = cols(:x) + 1 # re-write as complicated call, RHS is block, use cols # `@byrow` before any of the above - ex, flags = extract_macro_flags(MacroTools.unblock(ex)) + ex, inner_flags = extract_macro_flags(MacroTools.unblock(ex)) # Use tuple syntax in future when we add more flags - wrap_byrow_t = flags[Symbol("@byrow")][] + inner_wrap_byrow = inner_flags[Symbol("@byrow")][] + outer_wrap_byrow = outer_flags === nothing ? false : outer_flags[Symbol("@byrow")][] - if wrap_byrow_t && wrap_byrow + if inner_wrap_byrow && outer_wrap_byrow throw(ArgumentError("Redundant @byrow calls.")) else - wrap_byrow = wrap_byrow || wrap_byrow_t + wrap_byrow = inner_wrap_byrow || outer_wrap_byrow end if gensym_names @@ -304,7 +308,10 @@ function fun_to_vec(ex::Expr; gensym_names::Bool=false, no_dest::Bool=false, wra throw(ArgumentError("This path should not be reached")) end -fun_to_vec(ex::QuoteNode; no_dest::Bool=false, gensym_names::Bool=false, wrap_byrow::Bool=false) = ex +fun_to_vec(ex::QuoteNode; + no_dest::Bool=false, + gensym_names::Bool=false, + outer_flags::Union{NamedTuple, Nothing}=nothing) = ex function make_source_concrete(x::AbstractVector) if isempty(x) || isconcretetype(eltype(x)) @@ -332,45 +339,23 @@ function replace_dotted!(e, membernames) Expr(:., x_new, y_new) end -""" - create_args_vector(args...) 
-> vec, wrap_byrow - -Given multiple arguments which can be any type -of expression-like object (`Expr`, `QuoteNode`, etc.), -puts them into a single array, removing line numbers. -""" function create_args_vector(args...) create_args_vector(Expr(:block, args...)) end """ - create_args_vector(arg) -> vec, wrap_byrow + create_args_vector(arg) -> vec, outer_flags -Normalize a single input to a vector of expressions, -with a `wrap_byrow` flag indicating that the -expressions should operate by row. +Given an expression return a vector of operations +and a `NamedTuple` of the macro-flags that appear +in the expression. -If `arg` is a single `:block`, it is unnested. -Otherwise, return a single-element array. -Also removes line numbers. - -If `arg` is of the form `@byrow ...`, then -`wrap_byrow` is returned as `true`. +If a `:block` expression, return the `args` of +the block as an array. If a simple expression, +wrap the expression in a one-element vector. """ function create_args_vector(arg) - if arg isa Expr && is_macro_head(arg, "@byrow") - wrap_byrow = true - largs = length(arg.args) - if largs == 2 - throw(ArgumentError("No transformations supplied with `@byrow`")) - elseif largs == 3 - arg = arg.args[3] - else - arg = Expr(:block, arg.args[3:end]...) 
- end - else - wrap_byrow = false - end + arg, outer_flags = extract_macro_flags(MacroTools.unblock(arg)) if arg isa Expr && arg.head == :block x = MacroTools.rmlines(arg).args @@ -378,8 +363,5 @@ function create_args_vector(arg) x = Any[arg] end - if wrap_byrow && any(t -> is_macro_head(t, "@byrow"), x) - throw(ArgumentError("Redundant `@byrow` calls.")) - end - return x, wrap_byrow + return x, outer_flags end diff --git a/test/dataframes.jl b/test/dataframes.jl index b8767d6d..84e3c770 100644 --- a/test/dataframes.jl +++ b/test/dataframes.jl @@ -595,72 +595,6 @@ end @test @with(df, cols("A")) === df.A end -@testset "where" begin - df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) - - x = [2, 1, 0, 0] - - @test @where(df, :A .> 1) == df[(df.A .> 1) .=== true,:] - @test @where(df, :B .> 1) == df[df.B .> 1,:] - @test @where(df, :A .> x) == df[(df.A .> x) .=== true,:] - @test @where(df, :B .> x) ≅ df[df.B .> x,:] - @test @where(df, :A .> :B, :B .> mean(:B)) == DataFrame(A = 3, B = 2) - @test @where(df, :A .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] - @test @where(df, :A .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] - - @test @where(df, :A .> 1).A isa Vector{Union{Missing, Int}} - - @test @where(df, cols(:A) .> 1) == df[(df.A .> 1) .=== true,:] - @test @where(df, cols(:B) .> 1) == df[df.B .> 1,:] - @test @where(df, cols(:A) .> x) == df[(df.A .> x) .=== true,:] - @test @where(df, cols(:B) .> x) ≅ df[df.B .> x,:] - @test @where(df, cols(:A) .> :B, cols(:B) .> mean(:B)) == DataFrame(A = 3, B = 2) - @test @where(df, cols(:A) .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] - @test @where(df, cols(:A) .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] - - @test @where(df, :A .> 1, :A .<= 2) == DataFrame(A = 2, B = 1) - - subdf = @view df[df.B .== 2, :] - - @test @where(subdf, :A .== 3) == DataFrame(A = 3, B = 2) -end - -@testset "where with :block" begin - df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 
2, 1]) - - d = @where df begin - :A .> 1 - :B .> 1 - end - @test d ≅ @where(df, :A .> 1, :B .> 1) - - d = @where df begin - cols(:A) .> 1 - :B .> 1 - end - @test d ≅ @where(df, :A .> 1, :B .> 1) - - d = @where df begin - :A .> 1 - cols(:B) .> 1 - end - @test d ≅ @where(df, :A .> 1, :B .> 1) - - d = @where df begin - begin - :A .> 1 - end - :B .> 1 - end - @test d ≅ @where(df, :A .> 1, :B .> 1) - - d = @where df begin - :A .> 1 - @. :B > 1 - end - @test d ≅ @where(df, :A .> 1, :B .> 1) -end - @testset "orderby" begin df = DataFrame( g = [1, 1, 1, 2, 2], @@ -733,6 +667,14 @@ end end macro linenums_macro(arg) + if arg isa Expr && arg.head == :block && length(arg.args) == 1 && arg.args[1] isa LineNumberNode + esc(:([true])) + else + esc(:([false])) + end +end + +macro linenums_macro_byrow(arg) if arg isa Expr && arg.head == :block && length(arg.args) == 1 && arg.args[1] isa LineNumberNode esc(:(true)) else @@ -753,23 +695,23 @@ end @test d.y == [true] d = @transform df @byrow begin - y = @linenums_macro begin end + y = @linenums_macro_byrow begin end end @test d.y == [true] - d = @where(df, @linenums_macro begin end) + d = @subset(df, @linenums_macro begin end) @test nrow(d) == 1 - d = @where df begin - @byrow @linenums_macro begin end + d = @subset df begin + @byrow @linenums_macro_byrow begin end end @test nrow(d) == 1 - d = @where df @byrow begin - @linenums_macro begin end + d = @subset df @byrow begin + @linenums_macro_byrow begin end end @test nrow(d) == 1 diff --git a/test/deprecated.jl b/test/deprecated.jl index e0f68b35..0412d57a 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -77,4 +77,86 @@ const ≅ = isequal @test @based_on(gd, cols("new" * "_" * "column") = 2)."new_column" == [2, 2] end +@testset "where" begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + x = [2, 1, 0, 0] + + @test @where(df, :A .> 1) == df[(df.A .> 1) .=== true,:] + @test @where(df, :B .> 1) == df[df.B .> 1,:] + @test @where(df, :A .> x) == df[(df.A .> x) .=== 
true,:] + @test @where(df, :B .> x) ≅ df[df.B .> x,:] + @test @where(df, :A .> :B, :B .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @where(df, :A .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @where(df, :A .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @where(df, :A .> 1).A isa Vector{Union{Missing, Int}} + + @test @where(df, cols(:A) .> 1) == df[(df.A .> 1) .=== true,:] + @test @where(df, cols(:B) .> 1) == df[df.B .> 1,:] + @test @where(df, cols(:A) .> x) == df[(df.A .> x) .=== true,:] + @test @where(df, cols(:B) .> x) ≅ df[df.B .> x,:] + @test @where(df, cols(:A) .> :B, cols(:B) .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @where(df, cols(:A) .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @where(df, cols(:A) .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @where(df, :A .> 1, :A .<= 2) == DataFrame(A = 2, B = 1) + + subdf = @view df[df.B .== 2, :] + + @test @where(subdf, :A .== 3) == DataFrame(A = 3, B = 2) +end + +@testset "where with :block" begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + d = @where df begin + :A .> 1 + :B .> 1 + end + @test d ≅ @where(df, :A .> 1, :B .> 1) + + d = @where df begin + cols(:A) .> 1 + :B .> 1 + end + @test d ≅ @where(df, :A .> 1, :B .> 1) + + d = @where df begin + :A .> 1 + cols(:B) .> 1 + end + @test d ≅ @where(df, :A .> 1, :B .> 1) + + d = @where df begin + begin + :A .> 1 + end + :B .> 1 + end + @test d ≅ @where(df, :A .> 1, :B .> 1) + + d = @where df begin + :A .> 1 + @. 
:B > 1 + end + @test d ≅ @where(df, :A .> 1, :B .> 1) +end + +@testset "@where with a grouped data frame" begin + df = DataFrame( + g = [1, 1, 1, 2, 2], + i = 1:5, + t = ["a", "b", "c", "c", "e"], + y = [:v, :w, :x, :y, :z], + c = [:g, :quote, :body, :transform, missing] + ) + + gd = groupby(df, :g) + + @test @where(gd, :i .== first(:i)) ≅ df[[1, 4], :] + @test @where(gd, cols(:i) .> mean(cols(:i)), :t .== "c") ≅ df[[3], :] + @test @where(gd, :c .== :g) ≅ df[[], :] +end + end # module \ No newline at end of file diff --git a/test/grouping.jl b/test/grouping.jl index ca7957cb..c603c2a5 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -357,19 +357,4 @@ end @test @select(g, :a, @byrow t = :a ^ 2).t ≅ d.a .^ 2 end -@testset "@where with a grouped data frame" begin - df = DataFrame( - g = [1, 1, 1, 2, 2], - i = 1:5, - t = ["a", "b", "c", "c", "e"], - y = [:v, :w, :x, :y, :z], - c = [:g, :quote, :body, :transform, missing] - ) - - gd = groupby(df, :g) - - @test @where(gd, :i .== first(:i)) ≅ df[[1, 4], :] - @test @where(gd, cols(:i) .> mean(cols(:i)), :t .== "c") ≅ df[[3], :] - @test @where(gd, :c .== :g) ≅ df[[], :] -end end # module diff --git a/test/runtests.jl b/test/runtests.jl index 36e12417..3fb003b3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ quiet = false my_tests = ["dataframes.jl", "eachrow.jl", "grouping.jl", + "subset.jl", "function_compilation.jl", "chaining.jl", "linqmacro.jl", diff --git a/test/subset.jl b/test/subset.jl new file mode 100644 index 00000000..d501be18 --- /dev/null +++ b/test/subset.jl @@ -0,0 +1,176 @@ +module TestSubset + +using Test +using DataFrames +using DataFramesMeta +using Statistics + +const ≅ = isequal + +@testset "subset" begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + x = [2, 1, 0, 0] + + @test @subset(df, :A .> 1) == df[(df.A .> 1) .=== true,:] + @test @subset(df, :B .> 1) == df[df.B .> 1,:] + @test @subset(df, :A .> x) == df[(df.A .> x) .=== true,:] + @test @subset(df, :B 
.> x) ≅ df[df.B .> x,:] + @test @subset(df, :A .> :B, :B .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @subset(df, :A .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @subset(df, :A .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @subset(df, :A .> 1).A isa Vector{Union{Missing, Int}} + + @test @subset(df, cols(:A) .> 1) == df[(df.A .> 1) .=== true,:] + @test @subset(df, cols(:B) .> 1) == df[df.B .> 1,:] + @test @subset(df, cols(:A) .> x) == df[(df.A .> x) .=== true,:] + @test @subset(df, cols(:B) .> x) ≅ df[df.B .> x,:] + @test @subset(df, cols(:A) .> :B, cols(:B) .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @subset(df, cols(:A) .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @subset(df, cols(:A) .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @subset(df, :A .> 1, :A .<= 2) == DataFrame(A = 2, B = 1) + + subdf = @view df[df.B .== 2, :] + + @test @subset(subdf, :A .== 3) == DataFrame(A = 3, B = 2) +end + +@testset "subset with :block" begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + d = @subset df begin + :A .> 1 + :B .> 1 + end + @test d ≅ @subset(df, :A .> 1, :B .> 1) + + d = @subset df begin + cols(:A) .> 1 + :B .> 1 + end + @test d ≅ @subset(df, :A .> 1, :B .> 1) + + d = @subset df begin + :A .> 1 + cols(:B) .> 1 + end + @test d ≅ @subset(df, :A .> 1, :B .> 1) + + d = @subset df begin + begin + :A .> 1 + end + :B .> 1 + end + @test d ≅ @subset(df, :A .> 1, :B .> 1) + + d = @subset df begin + :A .> 1 + @. :B > 1 + end + @test d ≅ @subset(df, :A .> 1, :B .> 1) +end + + +@testset "subset!" 
begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + x = [2, 1, 0, 0] + + df2 = copy(df) + @test @subset!(df2, :A .> 1) === df2 + @test df2 == df[(df.A .> 1) .=== true,:] + + @test @subset!(copy(df), :B .> 1) == df[df.B .> 1,:] + @test @subset!(copy(df), :A .> x) == df[(df.A .> x) .=== true,:] + @test @subset!(copy(df), :B .> x) ≅ df[df.B .> x,:] + @test @subset!(copy(df), :A .> :B, :B .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @subset!(copy(df), :A .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @subset!(copy(df), :A .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @subset!(copy(df), :A .> 1).A isa Vector{Union{Missing, Int}} + + @test @subset!(copy(df), cols(:A) .> 1) == df[(df.A .> 1) .=== true,:] + @test @subset!(copy(df), cols(:B) .> 1) == df[df.B .> 1,:] + @test @subset!(copy(df), cols(:A) .> x) == df[(df.A .> x) .=== true,:] + @test @subset!(copy(df), cols(:B) .> x) ≅ df[df.B .> x,:] + @test @subset!(copy(df), cols(:A) .> :B, cols(:B) .> mean(:B)) == DataFrame(A = 3, B = 2) + @test @subset!(copy(df), cols(:A) .> 1, :B .> 1) == df[map(&, df.A .> 1, df.B .> 1),:] + @test @subset!(copy(df), cols(:A) .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:] + + @test @subset!(copy(df), :A .> 1, :A .<= 2) == DataFrame(A = 2, B = 1) + + subdf = @view df[df.B .== 2, :] + + @test @subset!(copy(subdf), :A .== 3) == DataFrame(A = 3, B = 2) +end + +@testset "subset! with :block" begin + df = DataFrame(A = [1, 2, 3, missing], B = [2, 1, 2, 1]) + + d = @subset! copy(df) begin + :A .> 1 + :B .> 1 + end + @test d ≅ @subset!(copy(df), :A .> 1, :B .> 1) + + d = @subset! copy(df) begin + cols(:A) .> 1 + :B .> 1 + end + @test d ≅ @subset!(copy(df), :A .> 1, :B .> 1) + + d = @subset! copy(df) begin + :A .> 1 + cols(:B) .> 1 + end + @test d ≅ @subset!(copy(df), :A .> 1, :B .> 1) + + d = @subset! 
copy(df) begin + begin + :A .> 1 + end + :B .> 1 + end + @test d ≅ @subset!(copy(df), :A .> 1, :B .> 1) + + d = @subset! copy(df) begin + :A .> 1 + @. :B > 1 + end + @test d ≅ @subset!(copy(df), :A .> 1, :B .> 1) +end + +@testset "@subset with a grouped data frame" begin + df = DataFrame( + g = [1, 1, 1, 2, 2], + i = 1:5, + t = ["a", "b", "c", "c", "e"], + y = [:v, :w, :x, :y, :z], + c = [:g, :quote, :body, :transform, missing] + ) + + gd = groupby(df, :g) + + @test @subset(gd, :i .== first(:i)) ≅ df[[1, 4], :] + @test @subset(gd, cols(:i) .> mean(cols(:i)), :t .== "c") ≅ df[[3], :] + @test @subset(gd, :c .== :g) ≅ df[[], :] +end + +@testset "@subset! with a grouped data frame" begin + df = DataFrame( + g = [1, 1, 1, 2, 2], + i = 1:5, + t = ["a", "b", "c", "c", "e"], + y = [:v, :w, :x, :y, :z], + c = [:g, :quote, :body, :transform, missing] + ) + + @test @subset!(groupby(copy(df), :g), :i .== first(:i)) ≅ df[[1, 4], :] + @test @subset!(groupby(copy(df), :g), cols(:i) .> mean(cols(:i)), :t .== "c") ≅ df[[3], :] + @test @subset!(groupby(copy(df), :g), :c .== :g) ≅ df[[], :] +end + +end # module