From 006c4373f2258184f85bc75fb4fa2226da57c910 Mon Sep 17 00:00:00 2001 From: Tamas Nagy Date: Sun, 27 May 2018 21:26:11 -0700 Subject: [PATCH] [WIP] prelim version of the density geometry revamp working on #1152. Note: This is a WIP and currently completely breaks `Geom.density` and `Geom.violin` has several regressions. --- src/aesthetics.jl | 71 ++++++++++++++ src/geom/density.jl | 70 +++++++++---- src/geom/line.jl | 3 - src/statistics.jl | 234 +++++++++++++++++++++++++++----------------- 4 files changed, 265 insertions(+), 113 deletions(-) diff --git a/src/aesthetics.jl b/src/aesthetics.jl index d4f6f4f77..b02a25699 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -1,3 +1,5 @@ +using IterTools + const NumericalOrCategoricalAesthetic = Union{(Void), Vector, DataArray, IndirectArray} @@ -413,3 +415,72 @@ function inherit!(a::Aesthetics, b::Aesthetics; end nothing end + +""" + Given aesthetics to group with, `by`, and an aesthetic to group `togroupvar` +this function constructs a dictionary that maps each given combination of the +`by` aesthetics to the positions which they apply to. Thus the output is a +dictionary of tuples of each unique combination of `by` mapped to a boolean +array of length `n` where `n` is the length of the aesthetics (they have to all +have the same length). If the provided aesthetics are missing, a placeholder +`nothing` is return instead of the unique value. + +## Examples + +```jldoctest +aes = Gadfly.Aesthetics() +aes.x = repeat([1, 2], inner=3) +aes.y = collect(1:6) + +groupby(aes, [:x, :color], :y) + +# output + +Dict((2, nothing)=>Bool[false, false, false, true, true, true],(1, nothing)=>Bool[true, true, true, false, false, false]) +``` + +```jldoctest +aes = Gadfly.Aesthetics() +aes.x = repeat([:a, :b], inner=2) +aes.y = collect(1:4) +aes.color = repeat([colorant"red", colorant"blue"], inner=2) + +groupby(aes, [:x, :color], :y) + +# output + +Dict((:a, RGB{N0f8}(1.0,0.0,0.0))=>Bool[true, true, false, false],(:b, RGB{N0f8}(0.0,0.0,1.0))=>Bool[false, false, true, true]) +``` + +""" +function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) + types = fill(Nothing, length(by)) + isconcrete = fill(false, length(by)) + for i in 1:length(by) + isconcrete[i] = getfield(aes, by[i]) != nothing + (!isconcrete[i]) && continue + types[i] = eltype(getfield(aes, by[i])) + @assert length(getfield(aes, togroupvar)) == length(getfield(aes, by[i])) "$togroupvar and $(by[i]) aesthetics must have same length" + end + + T = Tuple{types...} + grouped = Dict{T, Vector{Bool}}() + + # gather options for each `by` aesthetic + opt = [if isconcrete[i] unique(getfield(aes, by[i])) else [nothing] end for i in 1:length(by)] + + # The approach is to identify positions were multiple by aesthetics overlap + # and thus grouping the data positions. We first assume that all positions + # belong to a combination of aesthetics and then whittle it down + for combo in product(opt...) + belongs = fill(true, length(getfield(aes, togroupvar))) + for i in 1:length(combo) + (combo[i] == nothing) && continue + belongs .&= getfield(aes, by[i]) .== combo[i] + end + # for multiple by variables we need to check whether there is any overlap + # between this specific combo before adding it to the dict + (any(belongs)) && (grouped[combo] = belongs) + end + grouped +end diff --git a/src/geom/density.jl b/src/geom/density.jl index 65edd7829..4eff42190 100644 --- a/src/geom/density.jl +++ b/src/geom/density.jl @@ -1,36 +1,70 @@ +struct DensityGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + order::Int + tag::Symbol +end + +function DensityGeometry(; order=1, tag=empty_tag, kwargs...) + DensityGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), order, tag) +end + +DensityGeometry(stat; order=1, tag=empty_tag) = DensityGeometry(stat, order, tag) + +const density = DensityGeometry + +element_aesthetics(::DensityGeometry) = Symbol[] +default_statistic(geom::DensityGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) + struct ViolinGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + split::Bool order::Int tag::Symbol end -ViolinGeometry(; order=1, tag=empty_tag) = ViolinGeometry(order, tag) +function ViolinGeometry(; order=1, tag=empty_tag, split=false, kwargs...) + ViolinGeometry(Gadfly.Stat.DensityStatistic(; kwargs...), split, order, tag) +end const violin = ViolinGeometry element_aesthetics(::ViolinGeometry) = [:x, :y, :color] -default_statistic(::ViolinGeometry) = Gadfly.Stat.violin() +default_statistic(geom::ViolinGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) - # TODO: What should we do with the color aesthetic? Gadfly.assert_aesthetics_defined("Geom.violin", aes, :y, :width) Gadfly.assert_aesthetics_equal_length("Geom.violin", aes, :y, :width) - default_aes = Gadfly.Aesthetics() - default_aes.color = fill(theme.default_color, length(aes.y)) - aes = Gadfly.inherit(aes, default_aes) - - # Group y, width and color by x - ux = unique(aes.x) - grouped_color = Dict(x => first(aes.color[aes.x.==x]) for x in ux) - grouped_y = Dict(x => aes.y[aes.x.==x] for x in ux) - grouped_width = Dict(x => aes.width[aes.x.==x] for x in ux) - - kgy = keys(grouped_y) - violins = [vcat([(x - w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])], - reverse!([(x + w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])])) - for x in kgy] - colors = [grouped_color[x] for x in kgy] + grouped_data = Gadfly.groupby(aes, [:x, :color], :y) + violins = Array{NTuple{2, Float64}}[] + + colors = [] + (aes.color == nothing) && (aes.color = fill(theme.default_color, length(aes.x))) + color_opts = unique(aes.color) + if geom.split && length(color_opts) > 2 + error("Split violins require 2 colors, not more") + end + + for (keys, belongs) in grouped_data + x, color = keys + ys = aes.y[belongs] + ws = aes.width[belongs] + + if geom.split + pos = findfirst(color_opts, color) + if pos == 1 + push!(violins, [(x - w/2, y) for (y, w) in zip(ys, ws)]) + else + push!(violins, reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)])) + end + push!(colors, color) + else + push!(violins, vcat([(x - w/2, y) for (y, w) in zip(ys, ws)], + reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)]))) + push!(colors, color != nothing ? color : theme.default_color) + end + end ctx = context(order=geom.order) compose!(ctx, Compose.polygon(violins, geom.tag), fill(colors)) diff --git a/src/geom/line.jl b/src/geom/line.jl index 68c4ff4ea..23a1f534b 100644 --- a/src/geom/line.jl +++ b/src/geom/line.jl @@ -27,9 +27,6 @@ end # any of the others will work with `preserve_order=true` right now path() = LineGeometry(preserve_order=true) -density(; bandwidth::Real=-Inf) = - LineGeometry(Gadfly.Stat.density(bandwidth=bandwidth)) - density2d(; bandwidth::Tuple{Real,Real}=(-Inf,-Inf), levels=15) = LineGeometry(Gadfly.Stat.density2d(bandwidth=bandwidth, levels=levels); preserve_order=true) diff --git a/src/statistics.jl b/src/statistics.jl index 01a9028b4..09f0dcc96 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -17,7 +17,6 @@ using IndirectArrays import Gadfly: Scale, Coord, input_aesthetics, output_aesthetics, default_scales, isconcrete, setfield!, discretize_make_ia import KernelDensity -# import Distributions: Uniform, Distribution, qqbuild import IterTools: chain, distinct import Compat.Iterators: cycle, product @@ -477,60 +476,165 @@ end struct DensityStatistic <: Gadfly.StatisticElement - # Number of points sampled + """ + Number of points sampled for estimate. Powers of two yields better + performance. + """ n::Int - # Bandwidth used for the kernel density estimation + + """ + Bandwidth used for the kernel density estimation. This corresponds to the + standard deviation of the `kernel`. + """ bw::Real + + """ + Multiplicative adjustment of the computed optimal bandwidth. This is a + relative adjustment, see `bw` to enforce a specific numerical bandwidth. + """ + adjust::Float64 + + """ + Kernel used for density estimation, see `KernelDensity.jl` for more details. + Default is the Normal Distribution. + """ + kernel + + """ + If set to `true` (default), trim the tails of the estimate to fit the range + of the data. + """ + trim::Bool + + """ + Method for scaling across multiple estimates. If `:area` (default), all + density estimates will have the same area under the curve (prior to trimming + ). If `:count`, the areas are scaled proportionally to the total number of + observations for each density estimate. If `:peak`, then all densities will + have the same maximum peak height. + """ + scale::Symbol +end + +function DensityStatistic(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=true, + scale=:area, + ) + DensityStatistic(n, bandwidth, adjust, kernel, trim, scale) end -DensityStatistic(; n=256, bandwidth=-Inf) = DensityStatistic(n, bandwidth) -const density = DensityStatistic -input_aesthetics(stat::DensityStatistic) = [:x] -output_aesthetics(stat::DensityStatistic) = [:x, :y] +input_aesthetics(stat::DensityStatistic) = [:x, :y, :color] +output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] + +""" +Given a scale and aesthetic, figures out which variable will be fed into the KDE, +which aesthetics will be overridden with what and provides a nested dictionary +mapping category to dictionaries of color mapped to a boolean array. The array +corresponds to whether a certain datapoint belongs to a given (category, color) +group. The length of the boolean array is equal to `length(aes.x)`, etc. +""" +function determine_density_grouping(scales::Dict{Symbol, Gadfly.ScaleElement}, + aes::Gadfly.Aesthetics) + + # TODO: This general approach should work for any statistic that has a + # categorical component and color. So this should be generalized to bar and + # boxplot + defined = intersect(Set([:x, :y]), Gadfly.defined_aesthetics(aes)) + (length(defined) == 0) && error("DensityStatistic requires either the x or y aesthetics to be defined") + densityvar = :x # variable on which the KDE will be run + + # violin and density have different `element_aesthetics` functions and + # therefore both x and y will only be defined if `Geom.violin()` is used + if length(defined) == 1 && :y in defined + densityvar = :y + end + + # what aes variable to store the category info, points, and densities + outputs = (nothing, :x, :y) + + unqcat = [1] + # densities grouped by the categorical variable and color + catgroup = Dict(1 => fill(true, length(getfield(aes, densityvar)))) + + if :x in defined && :y in defined # is violin? + # We first need to establish whether this is a horizontal or vertical violin + xcat, ycat = Scale.iscategorical(scales, :x), Scale.iscategorical(scales, :y) + catvar = :x + if xcat && ycat + error("Either the x or y aesthetics must be Real for kernel density estimation") + elseif xcat + densityvar = :y + elseif ycat + catvar = :y + densityvar = :x + else # neither x or y is categorical so we'll assume x is meant to be categorical, see #968 + new_scale = Scale.x_discrete(order=sortperm(unique(aes.x))) + Scale.apply_scale(new_scale, [aes], Gadfly.Data(x=aes.x)) + scales[:x] = new_scale + densityvar = :y + warn( + """ + Both x and y aesthetics are continuous, violin plots require a + categorical variable. Transforming x to be categorical. + """) + end + + outputs = (catvar, densityvar, :width) + + end -default_scales(::DensityStatistic) = [Gadfly.Scale.y_continuous()] + densityvar, outputs, Gadfly.groupby(aes, [catvar, :color], densityvar) +end function apply_statistic(stat::DensityStatistic, scales::Dict{Symbol, Gadfly.ScaleElement}, coord::Gadfly.CoordinateElement, aes::Gadfly.Aesthetics) - Gadfly.assert_aesthetics_defined("DensityStatistic", aes, :x) - if aes.color === nothing - isa(aes.x[1], Real) || error("Kernel density estimation only works on Real types.") - x_f64 = collect(Float64, aes.x) + densityvar, outputs, grouped_data = determine_density_grouping(scales, aes) - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(x_f64) : stat.bw - f = KernelDensity.kde(x_f64, bandwidth=window, npoints=stat.n) - aes.x = collect(Float64, f.x) - aes.y = f.density - else - groups = Dict() - for (x, c) in zip(aes.x, cycle(aes.color)) - if !haskey(groups, c) - groups[c] = Float64[x] - else - push!(groups[c], x) - end - end + densityinput = getfield(aes, densityvar) - colors = Array{RGB{Float32}}(0) - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - for (c, xs) in groups - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(xs) : stat.bw - f = KernelDensity.kde(xs, bandwidth=window, npoints=stat.n) - append!(aes.x, f.x) - append!(aes.y, f.density) - for _ in 1:length(f.x) - push!(colors, c) - end + aes.x = Array{Float64}(0) + aes.y = Array{Float64}(0) + aes.width = Array{Float64}(0) + colors = eltype(aes.color)[] + + for ((cat, color), belongs) in grouped_data + input = densityinput[belongs] + window = stat.n > 1 ? KernelDensity.default_bandwidth(input)*stat.adjust : 0.1 + if stat.trim + f = KernelDensity.kde(input, kernel=stat.kernel, + boundary=extrema(input), + bandwidth=window, + npoints=stat.n) + else + f = KernelDensity.kde(input, kernel=stat.kernel, bandwidth=window, npoints=stat.n) end - aes.color = discretize_make_ia(colors) + # only store category information if this is a violin plot and we need it + (outputs[1] != nothing) && append!(getfield(aes, outputs[1]), fill(cat, length(f.density))) + append!(getfield(aes, outputs[2]), f.x) + + if stat.scale == :area + append!(getfield(aes, outputs[3]), f.density) + elseif stat.scale == :count + append!(getfield(aes, outputs[3]), f.density.*sum(input)) + else + append!(getfield(aes, outputs[3]), f.density ./ maximum(f.density)) + end + append!(colors, fill(color, length(f.density))) end - aes.y_label = Gadfly.Scale.identity_formatter + (aes.color != nothing) && (aes.color = colors) + + pad = 0.1 + maxwidth = maximum(aes.width) + broadcast!(*, aes.width, aes.width, 1 - pad) + broadcast!(/, aes.width, aes.width, maxwidth) end @@ -1507,60 +1611,6 @@ function apply_statistic(stat::QQStatistic, end end - -struct ViolinStatistic <: Gadfly.StatisticElement - # Number of points sampled - n::Int -end -ViolinStatistic() = ViolinStatistic(300) - -input_aesthetics(::ViolinStatistic) = [:x, :y, :width] - Gadfly.default_scales(stat::ViolinStatistic) = [Gadfly.Scale.x_discrete(), Gadfly.Scale.y_continuous()] - - -const violin = ViolinStatistic - -function apply_statistic(stat::ViolinStatistic, - scales::Dict{Symbol, Gadfly.ScaleElement}, - coord::Gadfly.CoordinateElement, - aes::Gadfly.Aesthetics) - - isa(aes.y[1], Real) || error("Kernel density estimation only works on Real types.") - - grouped_y = Dict(1=>aes.y) - grouped_color = Dict{Int, Gadfly.ColorOrNothing}(1=>nothing) - ux = unique(aes.x) - uxflag = length(ux) < length(aes.x) - colorflag = aes.color != nothing - - uxflag && (grouped_y = Dict(x=>aes.y[aes.x.==x] for x in ux)) - - grouped_color = (colorflag ? Dict(x=>first(aes.color[aes.x.==x]) for x in ux) : - uxflag && Dict(x=>nothing for x in ux) ) - - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - aes.width = Array{Float64}(0) - colors = eltype(aes.color)[] - - for (x, ys) in grouped_y - window = stat.n > 1 ? KernelDensity.default_bandwidth(ys) : 0.1 - f = KernelDensity.kde(ys, bandwidth=window, npoints=stat.n) - append!(aes.x, fill(x, length(f.x))) - append!(aes.y, f.x) - append!(aes.width, f.density) - append!(colors, fill(grouped_color[x], length(f.x))) - end - - colorflag && (aes.color = colors) - - pad = 0.1 - maxwidth = maximum(aes.width) - broadcast!(*, aes.width, aes.width, 1 - pad) - broadcast!(/, aes.width, aes.width, maxwidth) -end - - struct JitterStatistic <: Gadfly.StatisticElement vars::Vector{Symbol} range::Float64