diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cfbd23..8f584d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## Upcoming + +### Changed + +- Slightly optimized compilation time +- Minor updates of core formulas + +### Added + +- CLI parameter `no-ncv-estimation` to disable estimation of NCVs + ## [0.5.0] — 2021-06-03 ### Changed diff --git a/Dockerfile b/Dockerfile index 5a57005..7adc810 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,28 +11,6 @@ RUN pip3 install jupyterlab numpy scipy matplotlib seaborn pandas sklearn scikit RUN pip3 install -Iv six==1.12.0 -RUN \ - jupyter nbextensions_configurator enable && \ - jupyter contrib nbextension install - -RUN \ - jupyter nbextension enable codefolding/main && \ - jupyter nbextension enable codefolding/edit && \ - jupyter nbextension enable collapsible_headings/main && \ - jupyter nbextension enable notify/notify && \ - jupyter nbextension enable ruler/main && \ - jupyter nbextension enable table_beautifier/main && \ - jupyter nbextension enable comment-uncomment/main && \ - jupyter nbextension enable hide_header/main && \ - jupyter nbextension enable livemdpreview/livemdpreview && \ - jupyter nbextension enable spellchecker/main && \ - jupyter nbextension enable execution_dependencies/execution_dependencies && \ - jupyter nbextension enable freeze/main && \ - jupyter nbextension enable tree-filter/index && \ - jupyter nbextension enable init_cell/main && \ - jupyter nbextension enable move_selected_cells/main && \ - jupyter nbextension enable toc2/main - RUN julia -e 'using Pkg; Pkg.add("IJulia"); Pkg.build(); using IJulia;' ### jupyter notebook --no-browser --port=8989 --ip=0.0.0.0 --allow-root ./ @@ -40,13 +18,13 @@ RUN julia -e 'using Pkg; Pkg.add("IJulia"); Pkg.build(); using IJulia;' ## Julia Baysor envitonment RUN julia -e 'using Pkg; Pkg.add("PackageCompiler")' -RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build(); using Baysor;' +RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build();' -# See https://discourse.julialang.org/t/sysimage-incompatible-between-intel-and-amd-x86-64/42716/2 for cpu_target flags -RUN julia -e 'using PackageCompiler; import Baysor; create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/build.jl", replace_default=true, cpu_target="generic")' +RUN julia -e 'import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); Pkg.instantiate();' +RUN julia -e 'using PackageCompiler; import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/precompile.jl", replace_default=true, cpu_target="generic")' RUN \ - echo "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \ + printf "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \ chmod +x /bin/baysor ENTRYPOINT ["/bin/bash"] diff --git a/bin/build.jl b/bin/build.jl index fcb074c..12e8520 100755 --- a/bin/build.jl +++ b/bin/build.jl @@ -1,9 +1,10 @@ #! /usr/bin/env julia using PackageCompiler +import Pkg out_path = length(ARGS) > 1 ? ARGS[1] : "Baysor"; baysor_path = length(ARGS) > 2 ? ARGS[2] : dirname(@__DIR__); Base.LOAD_PATH .= baysor_path -create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl") \ No newline at end of file +create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl") diff --git a/src/bmm_algorithm/bmm_algorithm.jl b/src/bmm_algorithm/bmm_algorithm.jl index 44da9a2..6bd0735 100644 --- a/src/bmm_algorithm/bmm_algorithm.jl +++ b/src/bmm_algorithm/bmm_algorithm.jl @@ -45,14 +45,14 @@ end empty!(component_weights) empty!(comp_weights) empty!(comp_ids) - zero_comp_weight = 0.0 + bg_comp_weight = 0.0 @inbounds @simd for i in 1:length(adjacent_weights) c_point = adjacent_points[i] c_id = assignment[c_point] cw = adjacent_weights[i] if c_id == 0 - zero_comp_weight += cw + bg_comp_weight += cw else component_weights[c_id] = get(component_weights, c_id, 0.0) + cw end @@ -63,13 +63,13 @@ end push!(comp_weights, v) end - return zero_comp_weight + return bg_comp_weight end @inline function fill_adjacent_component_weights!(adj_classes::Vector{Int}, adj_weights::Vector{Float64}, data::BmmData, mol_id::Int; component_weights::Dict{Int, Float64}, adj_classes_global::Dict{Int, Vector{Int}}) # Looks like it's impossible to optimize further, even with vectorization. It means that creating vectorized version of expect_dirichlet_spatial makes few sense - zero_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment, + bg_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment, data.adjacent_points[mol_id], data.adjacent_weights[mol_id]) if mol_id in keys(adj_classes_global) @@ -78,7 +78,7 @@ end append!(adj_weights, ones(length(adj_classes) - n1) .* data.real_edge_weight) end - return zero_comp_weight + return bg_comp_weight end function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segment_id::Int, largest_cell_id::Int, @@ -105,7 +105,7 @@ function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segmen end function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N}, mol_id::Int; - zero_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N + bg_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N x = SVector{N}(position_data(data)[:,mol_id]...) gene::Union{Int, Missing} = composition_data(data)[mol_id] confidence::Float64 = data.confidence[mol_id] @@ -143,7 +143,7 @@ function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N}, end if confidence < 1.0 - push!(denses, (1 - confidence) * exp(data.mrf_strength * fmax(data.real_edge_weight, zero_comp_weight)) * data.noise_density) + push!(denses, (1 - confidence) * exp(data.mrf_strength * bg_comp_weight) * data.noise_density) push!(adj_classes, 0) end @@ -159,10 +159,10 @@ function expect_dirichlet_spatial!(data::BmmData; stochastic::Bool=true) adj_classes_global = get_global_adjacent_classes(data) for i in 1:size(data.x, 1) - zero_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i; + bg_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i; component_weights=component_weights, adj_classes_global=adj_classes_global) - expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, zero_comp_weight=zero_comp_weight) + expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, bg_comp_weight=bg_comp_weight) assign_molecule!(data, i; denses=denses, adj_classes=adj_classes, stochastic=stochastic) end diff --git a/src/cli/main.jl b/src/cli/main.jl index 81d8a29..20b8f4a 100644 --- a/src/cli/main.jl +++ b/src/cli/main.jl @@ -65,6 +65,11 @@ function parse_commandline(args::Union{Nothing, Array{String, 1}}=nothing) # TOD arg_type = Float64 default = 0.2 + "--no-ncv-estimation" + dest_name = "estimate-ncvs" + help = "Turns off neighborhood composition vectors estimation" + action = :store_false + "coordinates" help = "CSV file with coordinates of molecules and gene type" required = true @@ -389,9 +394,11 @@ function save_segmentation_results(bm_data::BmmData, gene_names::Vector{String}, segmentated_df = get_segmentation_df(bm_data, gene_names) cell_stat_df = get_cell_stat_df(bm_data, segmentated_df; add_qc=true) - @info "Estimating local colors" - gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"]) - segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors) + if args["estimate-ncvs"] + @info "Estimating local colors" + gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"]) + segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors) + end @info "Saving results to $(args["output"])" CSV.write(args["output"], segmentated_df[sortperm(segmentated_df.molecule_id), :]); @@ -403,14 +410,16 @@ function save_segmentation_results(bm_data::BmmData, gene_names::Vector{String}, if args["plot"] plot_diagnostics_panel(segmentated_df, segmentated_df.cell, bm_data.tracer, args; clust_res=mol_clusts, comp_segs=comp_segs) - polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons, - gene_colors=gene_colors) - - if args["save-polygons"] !== nothing - if lowercase(args["save-polygons"]) == "geojson" - save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json")) - else - @warn "Unknown polygon format: $(args["save-polygons"])" + if args["estimate-ncvs"] + polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons, + gene_colors=gene_colors) + + if args["save-polygons"] !== nothing + if lowercase(args["save-polygons"]) == "geojson" + save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json")) + else + @warn "Unknown polygon format: $(args["save-polygons"])" + end end end end diff --git a/src/data_processing/diagnostic_plots.jl b/src/data_processing/diagnostic_plots.jl index 168ea5b..bb43e0d 100644 --- a/src/data_processing/diagnostic_plots.jl +++ b/src/data_processing/diagnostic_plots.jl @@ -6,12 +6,12 @@ function plot_noise_estimation_diagnostics(edge_lengths::Vector{Float64}, confid x_vals = range(0, x_max, length=1000) n1 = sum(confidences); n2 = length(confidences) - n1; - + p_df = estimate_hist(edge_lengths[edge_lengths .< x_max], normalize=true, nbins=bins) p_df[!, :intra] = n1 / (n1 + n2) .* pdf.(d1, p_df.s) p_df[!, :bg] = n2 / (n1 + n2) .* pdf.(d2, p_df.s) - - return p_df |> + + return p_df |> @vlplot(x={:s, title="Distance to $(confidence_nn_id)'th nearest neighbor"}, title=title, width=400, height=300) + @vlplot(:bar, x2=:e, y={:h, title="Density"}, color={datum="Observed", scale={scheme="category10"}, legend={title="Distribution"}}) + @vlplot({:line, size=linewidth}, y=:bg, color={datum="Background"}) + @@ -44,10 +44,10 @@ function plot_gene_structure(df_spatial::DataFrame, gene_names::Vector, confiden return p_df |> @vlplot( - x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"}, - y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"}, - size={:size, scale={range=[5, 10]}, legend=false}, - tooltip={:gene, type="nominal"}, + x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"}, + y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"}, + size={:size, scale={range=[5, 10]}, legend=false}, + tooltip={:gene, type="nominal"}, width=600, height=600, title="Gene local structure", config={axis={grid=false, ticks=false, ticklabels=false, labels=false}} ) + diff --git a/src/data_processing/initialization.jl b/src/data_processing/initialization.jl index 123d334..3fb2e96 100644 --- a/src/data_processing/initialization.jl +++ b/src/data_processing/initialization.jl @@ -62,12 +62,12 @@ end # Initialize cell positions -function position_data_by_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer}) +function position_data_by_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int}) filt_ids = findall(assignment .> 0) return [pos_data[:, ids] for ids in split(filt_ids, assignment[filt_ids])] end -function covs_from_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer}; min_size::Int=1)::Array{<:CovMat, 1} +function covs_from_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int}; min_size::Int=1)::Array{<:CovMat, 1} pos_data_by_assignment = position_data_by_assignment(pos_data, assignment) CM, MV = (size(pos_data, 1) == 2) ? (CovMat{2}, MeanVec{2}) : (CovMat{3}, MeanVec{3}) @@ -84,8 +84,8 @@ end cell_centers_uniformly(spatial_df::DataFrame, args...; kwargs...) = cell_centers_uniformly(position_data(spatial_df), args..., (:confidence in propertynames(spatial_df)) ? spatial_df.confidence : nothing; kwargs...) -function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_clusters::Int, - confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{<:Real, Nothing}) +function cell_centers_uniformly(pos_data::Matrix{Float64}, n_clusters::Int, + confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{Float64, Nothing}) n_clusters = min(n_clusters, size(pos_data, 2)) cluster_centers = pos_data[:, select_ids_uniformly(pos_data', confidences; n=n_clusters, confidence_threshold=0.25)] @@ -95,10 +95,10 @@ function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_ if scale === nothing covs = covs_from_assignment(pos_data, cluster_labels) elseif size(pos_data, 1) == 2 - scale = Float64(scale) ^ 2 + scale = scale ^ 2 covs = [CovMat{2}(diagm(0 => (ones(2) .* scale))) for i in 1:size(cluster_centers, 2)] elseif size(pos_data, 1) == 3 - scale = Float64(scale) ^ 2 + scale = scale ^ 2 covs = [CovMat{3}(diagm(0 => (ones(3) .* scale))) for i in 1:size(cluster_centers, 2)] else error("Unexpected number of dimensions: $(size(pos_data, 1))") diff --git a/src/data_processing/neighborhood_composition.jl b/src/data_processing/neighborhood_composition.jl index c3e2c3f..fccb5a1 100644 --- a/src/data_processing/neighborhood_composition.jl +++ b/src/data_processing/neighborhood_composition.jl @@ -1,9 +1,9 @@ using Base.Threads -neighborhood_count_matrix(data::Union{BmmData, T} where T <: AbstractDataFrame, k::Int; kwargs...) = +neighborhood_count_matrix(data::Union{BmmData, DataFrame}, k::Int; kwargs...) = neighborhood_count_matrix(position_data(data), composition_data(data), k; kwargs...) -function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing, +function neighborhood_count_matrix(pos_data::Matrix{Float64}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing, n_genes::Int=maximum(genes), normalize_by_dist::Bool=true, normalize::Bool=true) if k < 3 @warn "Too small value of k: $k. Setting it to 3." @@ -18,7 +18,6 @@ function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int}, @warn "`confidences` are not supported with `normalize=true`. Ignoring them." confidences = nothing end - k = min(k, size(pos_data, 2)) @@ -117,9 +116,9 @@ function gene_composition_colors!(embedding::Matrix{Float64}; lrange::Tuple{<:Re return vec(mapslices(col -> Colors.Lab(col...), embedding, dims=1)) end -function gene_composition_colors(df_spatial::T where T <: AbstractDataFrame, k::Int; kwargs...) +function gene_composition_colors(df_spatial::DataFrame, k::Int; kwargs...) neighb_cm = neighborhood_count_matrix(df_spatial, k); confidence = (:confidence in propertynames(df_spatial)) ? df_spatial.confidence : ones(size(df_spatial, 1)) transformation = gene_composition_transformation(neighb_cm, confidence) return gene_composition_colors(neighb_cm, transformation; kwargs...) -end \ No newline at end of file +end diff --git a/src/data_processing/noise_estimation.jl b/src/data_processing/noise_estimation.jl index 6be9eef..e536249 100644 --- a/src/data_processing/noise_estimation.jl +++ b/src/data_processing/noise_estimation.jl @@ -1,8 +1,8 @@ -@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::T2 where T2 <: AbstractArray{Int} = 1:length(edge_lengths)) = +@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::Vector{Int}) = (Normal(wmean_std(edge_lengths, assignment_probs[:,i], non_zero_ids=updating_ids)...) for i in 1:2) function expect_noise_probabilities!(assignment_probs::Matrix{Float64}, d1::Normal, d2::Normal, edge_lengths::Vector{Float64}, - adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::T where T <: AbstractVector{Int} = 1:length(edge_lengths), + adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::Vector{Int}, min_confidence::Union{Vector{Float64}, Nothing}=nothing) norm_denses = (pdf.(d1, edge_lengths), pdf.(d2, edge_lengths)); n1 = sum(assignment_probs[:, 1]) diff --git a/src/data_processing/triangulation.jl b/src/data_processing/triangulation.jl index 6676268..0ce6616 100644 --- a/src/data_processing/triangulation.jl +++ b/src/data_processing/triangulation.jl @@ -24,7 +24,7 @@ getx(p::IndexedPoint2D) = p._x gety(p::IndexedPoint2D) = p._y geti(p::IndexedPoint2D) = p._index -function adjacency_list(points::AbstractArray{T, 2} where T <: Real; filter::Bool=true, n_mads::T2 where T2 <: Real =2, k_adj::Int=5, +function adjacency_list(points::Matrix{<:Real}; filter::Bool=true, n_mads::Float64=2.0, k_adj::Int=5, adjacency_type::Symbol=:triangulation, distance::TD where TD <: Distances.SemiMetric = Euclidean()) if size(points, 1) == 3 (adjacency_type == :knn) || @warn "Only k-nn random field is supported for 3D data" @@ -81,7 +81,7 @@ end adjacency_list(spatial_df::DataFrame; kwargs...) = adjacency_list(position_data(spatial_df); kwargs...) -function connected_components(adjacent_points::Array{Array{Int, 1}, 1}) +function connected_components(adjacent_points::Array{Vector{Int}, 1}) g = LightGraphs.SimpleGraph(length(adjacent_points)); for (v1, vs) in enumerate(adjacent_points) for v2 in vs diff --git a/src/data_processing/umap_wrappers.jl b/src/data_processing/umap_wrappers.jl index c5d4ba3..429ff4b 100644 --- a/src/data_processing/umap_wrappers.jl +++ b/src/data_processing/umap_wrappers.jl @@ -23,15 +23,15 @@ function fit(::Type{UmapFit}, x::Array{Float64, 2}, pca::MultivariateStats.PCA; return UmapFit(embedding, nn_tree, pca, nn_interpolate) end -fit(::Type{UmapFit}, x::Array{Float64, 2}; n_pcs::Int=15, kwargs...)::UmapFit = +fit(::Type{UmapFit}, x::Matrix{Float64}; n_pcs::Int=15, kwargs...)::UmapFit = fit(UmapFit, x, fit(MultivariateStats.PCA, x, maxoutdim=n_pcs); kwargs...) -function transform(transformation::UmapFit, x::Array{Float64, 2}; dist_offset::Float64=1e-10)::Array{Float64, 2} +function transform(transformation::UmapFit, x::Matrix{Float64}; dist_offset::Float64=1e-10)::Matrix{Float64} x_pc = transform(transformation.pca_transform, x) # This transformation is much faster than the implementation from UMAP.jl, even for input dimensionality = 50. # Probably, because of the slow NearestNeighborDescent package. indices, distances = knn(transformation.nn_tree, x_pc, transformation.nn_interpolate) - res = [mapslices(x -> wmean(x, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)] + res = [mapslices(v -> wmean(v, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)] return hcat(res...) end \ No newline at end of file diff --git a/src/distributions/MvNormal.jl b/src/distributions/MvNormal.jl index ad1db2b..e053b60 100644 --- a/src/distributions/MvNormal.jl +++ b/src/distributions/MvNormal.jl @@ -19,7 +19,7 @@ struct MvNormalF{L} return MvNormalF(MeanVec{3}(μ)) end - + function MvNormalF(μ::T1 where T1 <: AbstractVector{Float64}, Σ::T2 where T2 <: AbstractMatrix{Float64}) if length(μ) == 2 return MvNormalF(MeanVec{2}(μ), CovMat{2}(Σ)) @@ -68,7 +68,7 @@ end pdf(d::MvNormalF, x::SVector{N, Float64} where N) = exp(log_pdf(d, x)) shape(d::MvNormalF) = Matrix(d.Σ) -function estimate_sample_cov!(Σ::CovMat{N}, x::T where T <: AbstractMatrix{Float64}, weights::Nothing=nothing; μ::MeanVec{N}) where N +function estimate_sample_cov!(Σ::CovMat{N}, x::T where T <: AbstractMatrix{Float64}; μ::MeanVec{N}) where N # https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices#Intrinsic_expectation Σ .= 0.0 for i in 1:size(x, 2) @@ -83,39 +83,21 @@ function estimate_sample_cov!(Σ::CovMat{N}, x::T where T <: AbstractMatrix{Floa return Σ end -function estimate_sample_cov!(Σ::CovMat{N}, x::T where T <: AbstractMatrix{Float64}, weights::T2 where T2 <: AbstractVector{<:Real}; μ::MeanVec{N}) where N - # https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices#Intrinsic_expectation - Σ .= 0.0 - sum_w = 0.0 - for i in 1:size(x, 2) - w = fmax(weights[i], 0.01) - for c in 1:size(Σ, 1) - for r in 1:size(Σ, 1) - Σ[r, c] += w * (x[c, i] - μ[c]) * (x[r, i] - μ[r]) - end - end - sum_w += w - end - - Σ ./= sum_w - return Σ -end - -function maximize!(dist::MvNormalF, x::T, confidences::T2 where T2 <: Union{AbstractVector{<:Real}, Nothing}=nothing)::MvNormalF where T <: AbstractMatrix{Float64} +function maximize!(dist::MvNormalF, x::T; center_probs::T2 where T2 <: Union{AbstractVector{<:Real}, Nothing}=nothing)::MvNormalF where T <: AbstractMatrix{Float64} if size(x, 2) == 0 return dist end - if confidences === nothing + if center_probs === nothing mean!(dist.μ, x) else - wmean!(dist.μ, x, confidences) + wmean!(dist.μ, x, center_probs) end if size(x, 2) <= length(dist.μ) return dist end - estimate_sample_cov!(dist.Σ, x, confidences; μ=dist.μ) + estimate_sample_cov!(dist.Σ, x; μ=dist.μ) return dist end diff --git a/src/models/BmmData.jl b/src/models/BmmData.jl index c8b9cb1..36a2f62 100644 --- a/src/models/BmmData.jl +++ b/src/models/BmmData.jl @@ -63,9 +63,9 @@ mutable struct BmmData{L} - `distribution_sampler::Component`: - `assignment::Array{Int, 1}`: """ - function BmmData(components::Array{Component{N}, 1}, x::DataFrame, adjacent_points::Array{Vector{Int}, 1}, adjacent_weights::Array{Vector{Float64}, 1}, - assignment::Vector{Int}, distribution_sampler::Component{N}; real_edge_weight::Float64=1.0, k_neighbors::Int=20, - cluster_penalty_mult::Float64=0.25, use_gene_smoothing::Bool=true, prior_seg_confidence::Float64=0.5, + function BmmData(components::Array{Component{N}, 1}, x::DataFrame, adjacent_points::Array{Vector{Int}, 1}, adjacent_weights::Array{Vector{Float64}, 1}, + assignment::Vector{Int}, distribution_sampler::Component{N}; real_edge_weight::Float64=1.0, k_neighbors::Int=20, + cluster_penalty_mult::Float64=0.25, use_gene_smoothing::Bool=true, prior_seg_confidence::Float64=0.5, min_nuclei_frac::Float64=0.1, mrf_strength::Float64=0.1, na_genes::Vector{Int}=Int[]) where N @assert maximum(assignment) <= length(components) @assert minimum(assignment) >= 0 @@ -93,11 +93,11 @@ mutable struct BmmData{L} cluster_per_molecule = :cluster in propertynames(x) ? x.cluster : Int[] self = new{N}( x, p_data, comp_data, confidence(x), cluster_per_molecule, Int[], nuclei_probs, - adjacent_points, adjacent_weights, real_edge_weight, position_knn_tree, knn_neighbors, - components, deepcopy(distribution_sampler), assignment, length(components), 0.0, + adjacent_points, adjacent_weights, real_edge_weight, position_knn_tree, knn_neighbors, + components, deepcopy(distribution_sampler), assignment, length(components), 0.0, Int[], Int[], Int[], # prior segmentation info - Dict{Symbol, Any}(), Dict{Symbol, Any}(), Int[], + Dict{Symbol, Any}(), Dict{Symbol, Any}(), Int[], prior_seg_confidence, cluster_penalty_mult, use_gene_smoothing, min_nuclei_frac, mrf_strength ) @@ -139,7 +139,7 @@ function position_data(df::AbstractDataFrame)::Matrix{Float64} if (:z in propertynames(df)) return copy(Matrix{Float64}(df[:, [:x, :y, :z]])') end - + return copy(Matrix{Float64}(df[:, [:x, :y]])') end position_data(data::BmmData)::Matrix{Float64} = data.position_data @@ -217,7 +217,7 @@ function get_cell_qc_df(segmented_df::DataFrame, cell_assignment::Vector{Int}=se if dapi_arr !== nothing # TODO: forward DAPI from CLI brightness_per_mol = staining_value_per_transcript(segmented_df, dapi_arr); - df[!, :mean_dapi_brightness] = trim_mean.(split(brightness_per_mol, cell_assignment .+ 1)[2:end]) + df[!, :mean_dapi_brightness] = mean.(trim.(split(brightness_per_mol, cell_assignment .+ 1)[2:end]; prop=0.2)) end return df diff --git a/src/models/Component.jl b/src/models/Component.jl index a2f8dbb..6b5c869 100644 --- a/src/models/Component.jl +++ b/src/models/Component.jl @@ -52,42 +52,14 @@ mutable struct Component{N} new{L}(position_params, composition_params, n_samples, 1.0, confidence, shape_prior, Dict{Int, Int}(), guid) end -# # This function would be useful when processing QC score as another kind of confidences -# function maximize!(c::Component{N} where N, pos_data::T1 where T1 <: AbstractMatrix{Float64}, comp_data::T2 where T2 <: Union{AbstractVector{Int}, AbstractVector{Union{Int, Missing}}}, -# conf_data::T3 where T3 <: AbstractVector{Float64}; nuclei_probs::Union{<:AbstractVector{Float64}, Nothing}=nothing, min_nuclei_frac::Float64=0.1) -# c.n_samples = size(pos_data, 2) # TODO: need to replace it with confidences, but for that I need to re-write all prior segmentation code to work with confidences as well. Also may need to adjust some other parts -# maximize!(c.composition_params, comp_data, conf_data); -# if nuclei_probs === nothing -# maximize!(c.position_params, pos_data, conf_data); -# else -# maximize!(c.position_params, pos_data, conf_data .* nuclei_probs); -# if length(nuclei_probs) > 1 -# c.confidence = quantile(nuclei_probs, 1 - min_nuclei_frac) -# end -# end - -# if c.shape_prior !== nothing -# try -# adjust_cov_by_prior!(c.position_params.Σ, c.shape_prior; n_samples=sum(conf_data)) -# catch -# @show c.position_params.Σ -# @show c.shape_prior -# @show sum(conf_data) -# rethrow() -# end -# end - -# return c -# end - -function maximize!(c::Component{N} where N, pos_data::T1 where T1 <: AbstractMatrix{Float64}, comp_data::T2 where T2 <: Union{AbstractVector{Int}, AbstractVector{Union{Int, Missing}}}; +function maximize!(c::Component{N} where N, pos_data::T1 where T1 <: AbstractMatrix{Float64}, comp_data::T2 where T2 <: Union{AbstractVector{Int}, AbstractVector{Union{Int, Missing}}}; nuclei_probs::Union{<:AbstractVector{Float64}, Nothing}=nothing, min_nuclei_frac::Float64=0.1) c.n_samples = size(pos_data, 2) - maximize!(c.composition_params, comp_data); + maximize!(c.composition_params, comp_data) if nuclei_probs === nothing - maximize!(c.position_params, pos_data); + maximize!(c.position_params, pos_data) else - maximize!(c.position_params, pos_data, nuclei_probs); + maximize!(c.position_params, pos_data; center_probs=nuclei_probs) if length(nuclei_probs) > 1 c.confidence = quantile(nuclei_probs, 1 - min_nuclei_frac) end diff --git a/src/models/InitialParams.jl b/src/models/InitialParams.jl index 5bcc526..54d0ec4 100644 --- a/src/models/InitialParams.jl +++ b/src/models/InitialParams.jl @@ -8,6 +8,6 @@ struct InitialParams{L} InitialParams(centers::Matrix{<:Real}, covs::Float64, assignment::Vector{Int64}) where N = new{N}(deepcopy(centers), [CovMat{N}(diagm(0 => (ones(N) .* covs))) for i in 1:size(centers, 1)], deepcopy(assignment)) - InitialParams(centers::Array{T, 2} where T <: Real, covs::Array{<:CovMat{N}, 1}, assignment::Array{Int64,1}) where N = + InitialParams(centers::Matrix{<:Real}, covs::Array{<:CovMat{N}, 1}, assignment::Array{Int64,1}) where N = new{N}(deepcopy(centers), deepcopy(covs), deepcopy(assignment)) end \ No newline at end of file diff --git a/src/utils/convex_hull.jl b/src/utils/convex_hull.jl index bc4b063..dda508f 100644 --- a/src/utils/convex_hull.jl +++ b/src/utils/convex_hull.jl @@ -1,5 +1,5 @@ -convex_hull(a::Array{T, 2} where T<:Real) = convex_hull(collect.(eachcol(a))) -function convex_hull(a::Array{Array{T,1},1} where T<:Real) +convex_hull(a::Matrix{<: Real}) = convex_hull(collect.(eachcol(a))) +function convex_hull(a::Array{Vector{T},1} where T<: Real) cw(a, b, c) = (a[1]*(b[2]-c[2])+b[1]*(c[2]-a[2])+c[1]*(a[2]-b[2]) < 0); ccw(a, b, c) = (a[1]*(b[2]-c[2])+b[1]*(c[2]-a[2])+c[1]*(a[2]-b[2]) > 0); @@ -42,4 +42,4 @@ function area(polygon::Matrix{<:Real}) end return abs(area) / 2; -end \ No newline at end of file +end diff --git a/src/utils/logging.jl b/src/utils/logging.jl index 68887b0..ad8be5a 100644 --- a/src/utils/logging.jl +++ b/src/utils/logging.jl @@ -10,7 +10,7 @@ struct DoubleLogger <: AbstractLogger message_limits::Dict{Any,Int} force_flush::Bool end -DoubleLogger(cli_stream::IO=stderr, file_stream::Union{IO, Nothing}=nothing, level=Info; force_flush::Bool=false) = +DoubleLogger(cli_stream::IO=stderr, file_stream::Union{IO, Nothing}=nothing, level=Info; force_flush::Bool=false) = DoubleLogger(cli_stream, file_stream, level, Dict{Any,Int}(), force_flush) shouldlog(logger::DoubleLogger, level, _module, group, id) = get(logger.message_limits, id, 1) > 0 @@ -46,7 +46,7 @@ function log_message(stream::IO, level, message, _module, filepath, line; force_ msglines[2:end] = (" " ^ (length(prefix) - 1)) .* msglines[2:end] if level !== Info - source_address = " " * something("$_module", "nothing") * " " * + source_address = " " * something("$_module", "nothing") * " " * something("$filepath", "nothing") * ":" * something("$line", "nothing") push!(msglines, source_address) end diff --git a/src/utils/utils.jl b/src/utils/utils.jl index f88e428..b81623c 100644 --- a/src/utils/utils.jl +++ b/src/utils/utils.jl @@ -7,7 +7,7 @@ update_args(args::Union{Dict, NamedTuple}, update::Nothing) = args update_args(args::Union{Dict, NamedTuple}, update::Union{Dict, NamedTuple}) = merge([Dict{Symbol, Any}(zip(keys(d), values(d))) for d in (args, update)]...) -estimate_density_kde(coords::Array{Float64, 2}, points::Array{Float64, 2}, bandwidth::T where T <: Real)::Vector{Float64} = +estimate_density_kde(coords::Matrix{Float64}, points::Matrix{Float64}, bandwidth::T where T <: Real)::Vector{Float64} = InterpKDE(kde((coords[1,:], coords[2,:]), bandwidth=(Float64(bandwidth), Float64(bandwidth)))).itp.(points[1,:], points[2,:]) function val_range(arr::AT where AT <: AbstractArray{<:Real}) @@ -102,8 +102,6 @@ function split(vector::T where T <: AbstractVector; n_parts::Int) return [vector[n:min(n + offset - 1, length(vector))] for n in 1:offset:length(vector)] end -trim_mean(x::Array{T, 1} where T <: Real; prop::Float64=0.2) = mean(trim(x; prop=prop)) - function split(array::T where T <: AbstractVector{TV}, factor::T2 where T2 <: AbstractVector{<:Integer}; max_factor::Union{Int, Nothing}=nothing, drop_zero::Bool=false)::Array{Vector{TV}, 1} where TV @assert length(array) == length(factor) if max_factor === nothing @@ -127,88 +125,7 @@ split(array::UnitRange{Int64}, factor::Array{Int64,1}; kwargs...) = split(collec split_ids(factor::Array{Int, 1}; kwargs...) = split(1:length(factor), factor; kwargs...) split(df::DataFrame, factor::Symbol; kwargs...) = split(df, Array(df[!, factor]); kwargs...) -split(df::DataFrame, factor::Array{Int, 1}; kwargs...) = [df[ids, :] for ids in split(1:size(df, 1), factor; kwargs...)] - -function interpolate_linear(x::T, x_start::T, x_end::T; y_start::T=1.0, y_end::T=0.0)::Float64 where T<:Real - if x < x_start - return y_start - elseif x > x_end - return y_end - end - - return y_start + (x - x_start) / (x_end - x_start) * (y_end - y_start) -end - -function is_point_in_polygon(point::Union{Vector{T1}, Tuple{T1, T1}} where T1 <: Real, poly::Array{Vector{T2}, 1} where T2 <: Real, - borders::Union{Array{Vector{T3},1}, Nothing} where T3 = nothing) - if borders !== nothing && (borders[1][1] > point[1] || borders[1][2] > point[2] || borders[2][1] < point[1] || borders[2][2] < point[2]) - return false - end - - j = length(poly) - c = false - for i in 1:length(poly) - if ((poly[i][2] > point[2]) != (poly[j][2] > point[2])) && - (point[1] < poly[i][1] + (poly[j][1] - poly[i][1]) * (point[2] - poly[i][2]) / (poly[j][2] - poly[i][2])) - c = !c - end - - j = i - end - - return c -end - -"""Golden section search -to find the minimum of f on [a,b] -opt_func: a strictly unimodal function on [a,b] - -Returns: tuple with the optimal parameter and the optimal function value -""" -function linsearch_gs(opt_func::Function, a::T, b::T; tol=1e-3) where T<: Real - gr = (sqrt(5) + 1) / 2 - - c = b - (b - a) / gr - d = a + (b - a) / gr - while abs(c - d) > tol - if opt_func(c) < opt_func(d) - b = d - else - a = c - end - # We recompute both c and d here to avoid loss of precision which may lead to incorrect results or infinite loop - c = b - (b - a) / gr - d = a + (b - a) / gr - end - - return (b + a) / 2, opt_func((b + a) / 2) -end - -function trace_values_along_line(arr::Matrix{T}, start_x::Int, start_y::Int, end_x::Int, end_y::Int)::Vector{T} where T <: Real - @assert all([end_y, end_x] .<= size(arr)) - @assert all([start_y, start_x] .<= size(arr)) - @assert all([start_y, start_x] .> 0) - @assert all([end_y, end_x] .> 0) - - a = (end_y - start_y) / (end_x - start_x) - b = end_y - a * end_x - - dx = sign(end_x - start_x) - dy = sign(end_y - start_y) - - x, y = start_x, start_y - - vals = [arr[start_y, start_x]] - while (x != end_x) | (y != end_y) - if (abs((a * x + b) - (y + dy)) < abs((a * (x + dx) + b) - y)) || dx == 0 - y += dy - else - x += dx - end - push!(vals, arr[y, x]) - end - return vals -end +split(df::DataFrame, factor::Vector{Int}; kwargs...) = [df[ids, :] for ids in split(1:size(df, 1), factor; kwargs...)] @inline @fastmath function fmax(v1::T, v2::T) where T <: Real v1 > v2 ? v1 : v2 @@ -245,25 +162,6 @@ function estimate_difference_l0(m1::Matrix{Float64}, m2::Matrix{Float64}; col_we return max_diff, n_changed / size(m1, 2) end -function pairwise_jaccard(values::Array{Vector{Int}, 1}, min_dist::Float64=0.0001)::Matrix{Float64} - dist_mat = zeros(length(values), length(values)) - for i1 in 1:length(values) - s1 = values[i1] - for i2 in (i1+1):length(values) - s2 = values[i2] - inter_len = 0 - for v in s1 - if v in s2 - inter_len += 1 - end - end - dist_mat[i1, i2] = dist_mat[i2, i1] = fmax(1.0 - length(inter_len) / (length(s1) + length(s2) - inter_len), min_dist) - end - end - - return dist_mat -end - ### Statistics @inline function fsample(w::Vector{Float64})::Int @@ -284,7 +182,7 @@ end @inline @inbounds fsample(arr::Vector{Int}, w::Vector{Float64})::Int = arr[fsample(w)] -function wmean(values::Vector{T} where T <: Real, weights::T1 where T1 <: AbstractVector{Float64}; non_zero_ids::T2 where T2 <: AbstractArray{Int} = 1:length(values)) +function wmean(values::Vector{Float64}, weights::T where T <: AbstractVector{Float64}; non_zero_ids::Union{UnitRange{Int}, Vector{Int}}=1:length(values)) s, ws = 0.0, 0.0 for i in non_zero_ids s += values[i] * weights[i] @@ -294,7 +192,7 @@ function wmean(values::Vector{T} where T <: Real, weights::T1 where T1 <: Abstra return s / ws end -function wmean_std(values::Vector{T0} where T0 <: Real, weights::T1 where T1 <: AbstractVector{Float64}; non_zero_ids::T2 where T2 <: AbstractArray{Int} = 1:length(values)) +function wmean_std(values::Vector{Float64}, weights::T where T <: AbstractVector{Float64}; non_zero_ids::Union{UnitRange{Int}, Vector{Int}}=1:length(values)) m = wmean(values, weights; non_zero_ids=non_zero_ids) s, ws = 0.0, 0.0 for i in non_zero_ids @@ -306,26 +204,7 @@ function wmean_std(values::Vector{T0} where T0 <: Real, weights::T1 where T1 <: return m, sqrt(s / ws) end -function upscale(image::T where T <: AbstractMatrix{TR}, ratio::Int64) where TR <: Real - res_img = zeros(TR, size(image) .* ratio) - ct = 1 - for c in 1:size(image, 2) - for m1 in 1:ratio - rt = 1 - for r in 1:size(image, 1) - for m2 in 1:ratio - res_img[rt, ct] = image[r, c] - rt += 1 - end - end - ct += 1 - end - end - - return res_img -end - -function estimate_hist(vec::Vector{<:Real}, weights=FrequencyWeights(ones(length(vec))); +function estimate_hist(vec::Vector{<:Real}, weights=FrequencyWeights(ones(length(vec))); ext_cols::NamedTuple=NamedTuple(), rel_width::Float64=0.9, normalize::Union{Bool, Symbol}=false, center=true, bins=nothing, kwargs...) hf = (bins === nothing) ? fit(Histogram, vec, weights; kwargs...) : fit(Histogram, vec, weights, bins; kwargs...) diffs = rel_width * diff(hf.edges[1])[1] diff --git a/test/runtests.jl b/test/runtests.jl index de3625f..8ec0f32 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -40,10 +40,6 @@ end @test all(chull .== [0 0 2 2 0; 0 2 2 0 0]) @test B.area(chull) ≈ 4.0 end - - @testset "utils" begin - @test all([B.interpolate_linear(x, 0.0, 1.0; y_start=0.0, y_end=1.0) ≈ x for x in range(0, 1.0, length=50)]) - end end @testset "distributions" begin @@ -86,7 +82,7 @@ end @test !any(i in bm_data.adjacent_points[i] for i in 1:length(bm_data.adjacent_points)) for i in 5:10:55 - init_params = B.cell_centers_uniformly(df, i; scale=10) + init_params = B.cell_centers_uniformly(df, i; scale=10.0) @test size(init_params.centers, 1) == i @test size(init_params.centers, 2) == 2 @test length(init_params.covs) == i @@ -162,7 +158,7 @@ end is_not_nan = map(1:10000) do i n_samps = rand(0:100) d = B.MvNormalF([0., 0.], diagm(0 => ones(2))); - B.maximize!(d, rand(2, n_samps), rand(n_samps) .* rand(Binomial(1, 0.5), n_samps)) + B.maximize!(d, rand(2, n_samps); center_probs=rand(n_samps) .* rand(Binomial(1, 0.5), n_samps)) !isnan(d.Σ[1]) end @@ -171,7 +167,7 @@ end @testset "synthetic_run" begin seed!(42) - for i in 1:6 + for it in 1:6 n_components = 10; n_genes = 20 scale = 0.2; @@ -191,7 +187,11 @@ end df_spatial = vcat(df_spatial, DataFrame(:x => rand(noise_size) * frame_size[1], :y => rand(noise_size) * frame_size[2], :gene => rand(1:n_genes, noise_size), :cell => 0)); - if i > 4 + if it % 2 == 0 + df_spatial[!, :z] = rand(1:3, size(df_spatial, 1)) .* scale ./ 10 + end + + if it > 4 df_spatial[!, :cluster] = df_spatial.cell .+ 1 end bm_data_arr = B.initialize_bmm_data(df_spatial; scale=scale, scale_std="5%", min_molecules_per_cell=10);