Commit

Merge branch 'develop'
VPetukhov committed Dec 1, 2021
2 parents bc5e50d + 61a0f51 commit 076d264
Showing 19 changed files with 107 additions and 276 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,14 @@
## Upcoming

### Changed

- Slightly optimized compilation time
- Minor updates of core formulas

### Added

- CLI parameter `no-ncv-estimation` to disable estimation of NCVs

## [0.5.0] — 2021-06-03

### Changed
30 changes: 4 additions & 26 deletions Dockerfile
@@ -11,42 +11,20 @@ RUN pip3 install jupyterlab numpy scipy matplotlib seaborn pandas sklearn scikit

RUN pip3 install -Iv six==1.12.0

RUN \
jupyter nbextensions_configurator enable && \
jupyter contrib nbextension install

RUN \
jupyter nbextension enable codefolding/main && \
jupyter nbextension enable codefolding/edit && \
jupyter nbextension enable collapsible_headings/main && \
jupyter nbextension enable notify/notify && \
jupyter nbextension enable ruler/main && \
jupyter nbextension enable table_beautifier/main && \
jupyter nbextension enable comment-uncomment/main && \
jupyter nbextension enable hide_header/main && \
jupyter nbextension enable livemdpreview/livemdpreview && \
jupyter nbextension enable spellchecker/main && \
jupyter nbextension enable execution_dependencies/execution_dependencies && \
jupyter nbextension enable freeze/main && \
jupyter nbextension enable tree-filter/index && \
jupyter nbextension enable init_cell/main && \
jupyter nbextension enable move_selected_cells/main && \
jupyter nbextension enable toc2/main

RUN julia -e 'using Pkg; Pkg.add("IJulia"); Pkg.build(); using IJulia;'

### jupyter notebook --no-browser --port=8989 --ip=0.0.0.0 --allow-root ./

## Julia Baysor environment

RUN julia -e 'using Pkg; Pkg.add("PackageCompiler")'
RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build(); using Baysor;'
RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build();'

# See https://discourse.julialang.org/t/sysimage-incompatible-between-intel-and-amd-x86-64/42716/2 for cpu_target flags
RUN julia -e 'using PackageCompiler; import Baysor; create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/build.jl", replace_default=true, cpu_target="generic")'
RUN julia -e 'import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); Pkg.instantiate();'
RUN julia -e 'using PackageCompiler; import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/precompile.jl", replace_default=true, cpu_target="generic")'

RUN \
echo "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \
printf "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \
chmod +x /bin/baysor

ENTRYPOINT ["/bin/bash"]
3 changes: 2 additions & 1 deletion bin/build.jl
@@ -1,9 +1,10 @@
#! /usr/bin/env julia

using PackageCompiler
import Pkg

out_path = length(ARGS) > 1 ? ARGS[1] : "Baysor";
baysor_path = length(ARGS) > 2 ? ARGS[2] : dirname(@__DIR__);

Base.LOAD_PATH .= baysor_path
create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl")
create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl")
18 changes: 9 additions & 9 deletions src/bmm_algorithm/bmm_algorithm.jl
@@ -45,14 +45,14 @@ end
empty!(component_weights)
empty!(comp_weights)
empty!(comp_ids)
zero_comp_weight = 0.0
bg_comp_weight = 0.0

@inbounds @simd for i in 1:length(adjacent_weights)
c_point = adjacent_points[i]
c_id = assignment[c_point]
cw = adjacent_weights[i]
if c_id == 0
zero_comp_weight += cw
bg_comp_weight += cw
else
component_weights[c_id] = get(component_weights, c_id, 0.0) + cw
end
@@ -63,13 +63,13 @@ end
push!(comp_weights, v)
end

return zero_comp_weight
return bg_comp_weight
end

@inline function fill_adjacent_component_weights!(adj_classes::Vector{Int}, adj_weights::Vector{Float64}, data::BmmData, mol_id::Int;
component_weights::Dict{Int, Float64}, adj_classes_global::Dict{Int, Vector{Int}})
# Looks like it's impossible to optimize this further, even with vectorization. It means that creating a vectorized version of expect_dirichlet_spatial makes little sense
zero_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment,
bg_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment,
data.adjacent_points[mol_id], data.adjacent_weights[mol_id])

if mol_id in keys(adj_classes_global)
@@ -78,7 +78,7 @@ end
append!(adj_weights, ones(length(adj_classes) - n1) .* data.real_edge_weight)
end

return zero_comp_weight
return bg_comp_weight
end

function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segment_id::Int, largest_cell_id::Int,
@@ -105,7 +105,7 @@ function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segmen
end

function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N}, mol_id::Int;
zero_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N
bg_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N
x = SVector{N}(position_data(data)[:,mol_id]...)
gene::Union{Int, Missing} = composition_data(data)[mol_id]
confidence::Float64 = data.confidence[mol_id]
@@ -143,7 +143,7 @@ function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N},
end

if confidence < 1.0
push!(denses, (1 - confidence) * exp(data.mrf_strength * fmax(data.real_edge_weight, zero_comp_weight)) * data.noise_density)
push!(denses, (1 - confidence) * exp(data.mrf_strength * bg_comp_weight) * data.noise_density)
push!(adj_classes, 0)
end

@@ -159,10 +159,10 @@ function expect_dirichlet_spatial!(data::BmmData; stochastic::Bool=true)
adj_classes_global = get_global_adjacent_classes(data)

for i in 1:size(data.x, 1)
zero_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i;
bg_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i;
component_weights=component_weights, adj_classes_global=adj_classes_global)

expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, zero_comp_weight=zero_comp_weight)
expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, bg_comp_weight=bg_comp_weight)

assign_molecule!(data, i; denses=denses, adj_classes=adj_classes, stochastic=stochastic)
end
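
For context on the rename above: `bg_comp_weight` (formerly `zero_comp_weight`) is the total adjacency weight of a molecule's neighbors that are currently assigned to the background, i.e. component id 0. A minimal, self-contained sketch of that aggregation, simplified from `aggregate_adjacent_component_weights!` (the real function reuses preallocated buffers and also returns per-component weights through them; the name below is illustrative):

# Illustrative sketch only: accumulate adjacency weights per neighboring
# component, treating assignment 0 as background noise.
function aggregate_weights_sketch(assignment::Vector{Int}, adjacent_points::Vector{Int},
                                  adjacent_weights::Vector{Float64})
    component_weights = Dict{Int, Float64}()
    bg_comp_weight = 0.0  # total weight of edges to background ("noise") molecules
    for (p, w) in zip(adjacent_points, adjacent_weights)
        c_id = assignment[p]
        if c_id == 0
            bg_comp_weight += w
        else
            component_weights[c_id] = get(component_weights, c_id, 0.0) + w
        end
    end
    return component_weights, bg_comp_weight
end
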
31 changes: 20 additions & 11 deletions src/cli/main.jl
@@ -65,6 +65,11 @@ function parse_commandline(args::Union{Nothing, Array{String, 1}}=nothing) # TOD
arg_type = Float64
default = 0.2

"--no-ncv-estimation"
dest_name = "estimate-ncvs"
help = "Turns off neighborhood composition vectors estimation"
action = :store_false

"coordinates"
help = "CSV file with coordinates of molecules and gene type"
required = true
@@ -389,9 +394,11 @@ function save_segmentation_results(bm_data::BmmData, gene_names::Vector{String},
segmentated_df = get_segmentation_df(bm_data, gene_names)
cell_stat_df = get_cell_stat_df(bm_data, segmentated_df; add_qc=true)

@info "Estimating local colors"
gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"])
segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors)
if args["estimate-ncvs"]
@info "Estimating local colors"
gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"])
segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors)
end

@info "Saving results to $(args["output"])"
CSV.write(args["output"], segmentated_df[sortperm(segmentated_df.molecule_id), :]);
@@ -403,14 +410,16 @@

if args["plot"]
plot_diagnostics_panel(segmentated_df, segmentated_df.cell, bm_data.tracer, args; clust_res=mol_clusts, comp_segs=comp_segs)
polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons,
gene_colors=gene_colors)

if args["save-polygons"] !== nothing
if lowercase(args["save-polygons"]) == "geojson"
save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json"))
else
@warn "Unknown polygon format: $(args["save-polygons"])"
if args["estimate-ncvs"]
polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons,
gene_colors=gene_colors)

if args["save-polygons"] !== nothing
if lowercase(args["save-polygons"]) == "geojson"
save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json"))
else
@warn "Unknown polygon format: $(args["save-polygons"])"
end
end
end
end
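
The new `--no-ncv-estimation` flag uses ArgParse.jl's `:store_false` action together with `dest_name`: passing the flag writes `false` into the `estimate-ncvs` key, which otherwise defaults to `true` and gates the NCV color estimation and polygon plotting above. A standalone sketch of that mechanism (illustrative argument table and file name, not the full Baysor CLI; assumes a recent ArgParse.jl where the macro is `@add_arg_table!`):

using ArgParse

function parse_example(args::Vector{String})
    s = ArgParseSettings()
    @add_arg_table! s begin
        "--no-ncv-estimation"
            dest_name = "estimate-ncvs"
            help = "Turns off neighborhood composition vectors estimation"
            action = :store_false
        "coordinates"
            help = "CSV file with coordinates of molecules and gene type"
            required = true
    end
    return parse_args(args, s)
end

parse_example(["molecules.csv"])["estimate-ncvs"]                        # true (hypothetical input file)
parse_example(["--no-ncv-estimation", "molecules.csv"])["estimate-ncvs"] # false
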
14 changes: 7 additions & 7 deletions src/data_processing/diagnostic_plots.jl
@@ -6,12 +6,12 @@ function plot_noise_estimation_diagnostics(edge_lengths::Vector{Float64}, confid
x_vals = range(0, x_max, length=1000)
n1 = sum(confidences);
n2 = length(confidences) - n1;

p_df = estimate_hist(edge_lengths[edge_lengths .< x_max], normalize=true, nbins=bins)
p_df[!, :intra] = n1 / (n1 + n2) .* pdf.(d1, p_df.s)
p_df[!, :bg] = n2 / (n1 + n2) .* pdf.(d2, p_df.s)
return p_df |>

return p_df |>
@vlplot(x={:s, title="Distance to $(confidence_nn_id)'th nearest neighbor"}, title=title, width=400, height=300) +
@vlplot(:bar, x2=:e, y={:h, title="Density"}, color={datum="Observed", scale={scheme="category10"}, legend={title="Distribution"}}) +
@vlplot({:line, size=linewidth}, y=:bg, color={datum="Background"}) +
@@ -44,10 +44,10 @@ function plot_gene_structure(df_spatial::DataFrame, gene_names::Vector, confiden

return p_df |>
@vlplot(
x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"},
y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"},
size={:size, scale={range=[5, 10]}, legend=false},
tooltip={:gene, type="nominal"},
x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"},
y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"},
size={:size, scale={range=[5, 10]}, legend=false},
tooltip={:gene, type="nominal"},
width=600, height=600, title="Gene local structure",
config={axis={grid=false, ticks=false, ticklabels=false, labels=false}}
) +
12 changes: 6 additions & 6 deletions src/data_processing/initialization.jl
@@ -62,12 +62,12 @@ end

# Initialize cell positions

function position_data_by_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer})
function position_data_by_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int})
filt_ids = findall(assignment .> 0)
return [pos_data[:, ids] for ids in split(filt_ids, assignment[filt_ids])]
end

function covs_from_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer}; min_size::Int=1)::Array{<:CovMat, 1}
function covs_from_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int}; min_size::Int=1)::Array{<:CovMat, 1}
pos_data_by_assignment = position_data_by_assignment(pos_data, assignment)
CM, MV = (size(pos_data, 1) == 2) ? (CovMat{2}, MeanVec{2}) : (CovMat{3}, MeanVec{3})

@@ -84,8 +84,8 @@ end
cell_centers_uniformly(spatial_df::DataFrame, args...; kwargs...) =
cell_centers_uniformly(position_data(spatial_df), args..., (:confidence in propertynames(spatial_df)) ? spatial_df.confidence : nothing; kwargs...)

function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_clusters::Int,
confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{<:Real, Nothing})
function cell_centers_uniformly(pos_data::Matrix{Float64}, n_clusters::Int,
confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{Float64, Nothing})
n_clusters = min(n_clusters, size(pos_data, 2))

cluster_centers = pos_data[:, select_ids_uniformly(pos_data', confidences; n=n_clusters, confidence_threshold=0.25)]
@@ -95,10 +95,10 @@ function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_
if scale === nothing
covs = covs_from_assignment(pos_data, cluster_labels)
elseif size(pos_data, 1) == 2
scale = Float64(scale) ^ 2
scale = scale ^ 2
covs = [CovMat{2}(diagm(0 => (ones(2) .* scale))) for i in 1:size(cluster_centers, 2)]
elseif size(pos_data, 1) == 3
scale = Float64(scale) ^ 2
scale = scale ^ 2
covs = [CovMat{3}(diagm(0 => (ones(3) .* scale))) for i in 1:size(cluster_centers, 2)]
else
error("Unexpected number of dimensions: $(size(pos_data, 1))")
9 changes: 4 additions & 5 deletions src/data_processing/neighborhood_composition.jl
@@ -1,9 +1,9 @@
using Base.Threads

neighborhood_count_matrix(data::Union{BmmData, T} where T <: AbstractDataFrame, k::Int; kwargs...) =
neighborhood_count_matrix(data::Union{BmmData, DataFrame}, k::Int; kwargs...) =
neighborhood_count_matrix(position_data(data), composition_data(data), k; kwargs...)

function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing,
function neighborhood_count_matrix(pos_data::Matrix{Float64}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing,
n_genes::Int=maximum(genes), normalize_by_dist::Bool=true, normalize::Bool=true)
if k < 3
@warn "Too small value of k: $k. Setting it to 3."
@@ -18,7 +18,6 @@ function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int},
@warn "`confidences` are not supported with `normalize=true`. Ignoring them."
confidences = nothing
end


k = min(k, size(pos_data, 2))

@@ -117,9 +116,9 @@ function gene_composition_colors!(embedding::Matrix{Float64}; lrange::Tuple{<:Re
return vec(mapslices(col -> Colors.Lab(col...), embedding, dims=1))
end

function gene_composition_colors(df_spatial::T where T <: AbstractDataFrame, k::Int; kwargs...)
function gene_composition_colors(df_spatial::DataFrame, k::Int; kwargs...)
neighb_cm = neighborhood_count_matrix(df_spatial, k);
confidence = (:confidence in propertynames(df_spatial)) ? df_spatial.confidence : ones(size(df_spatial, 1))
transformation = gene_composition_transformation(neighb_cm, confidence)
return gene_composition_colors(neighb_cm, transformation; kwargs...)
end
end
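
The functions above build neighborhood composition vectors (NCVs): for each molecule, the gene composition of its k spatial nearest neighbors, which is then embedded with UMAP and mapped to colors. A toy version of the counting step, ignoring the distance weighting and confidence handling in `neighborhood_count_matrix` (the function name below is illustrative; assumes NearestNeighbors.jl):

using NearestNeighbors

function ncv_matrix_sketch(pos_data::Matrix{Float64}, genes::Vector{Int}, k::Int;
                           n_genes::Int=maximum(genes))
    tree = KDTree(pos_data)                    # pos_data is n_dims × n_molecules
    indices, _ = knn(tree, pos_data, k)        # k nearest neighbors of every molecule
    mat = zeros(n_genes, length(genes))
    for (i, ids) in enumerate(indices)
        for j in ids
            mat[genes[j], i] += 1.0            # count genes among the neighbors
        end
        mat[:, i] ./= k                        # normalize counts to frequencies
    end
    return mat                                 # n_genes × n_molecules composition matrix
end
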
4 changes: 2 additions & 2 deletions src/data_processing/noise_estimation.jl
@@ -1,8 +1,8 @@
@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::T2 where T2 <: AbstractArray{Int} = 1:length(edge_lengths)) =
@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::Vector{Int}) =
(Normal(wmean_std(edge_lengths, assignment_probs[:,i], non_zero_ids=updating_ids)...) for i in 1:2)

function expect_noise_probabilities!(assignment_probs::Matrix{Float64}, d1::Normal, d2::Normal, edge_lengths::Vector{Float64},
adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::T where T <: AbstractVector{Int} = 1:length(edge_lengths),
adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::Vector{Int},
min_confidence::Union{Vector{Float64}, Nothing}=nothing)
norm_denses = (pdf.(d1, edge_lengths), pdf.(d2, edge_lengths));
n1 = sum(assignment_probs[:, 1])
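
`maximize_noise_distributions` is the M-step of a two-component mixture over edge lengths: each Normal (intracellular vs. background, as plotted in the diagnostics above) is refit from a probability-weighted mean and standard deviation. A toy equivalent of that weighted fit, ignoring the `updating_ids` subsetting (`wmean_std` is assumed to compute exactly this):

using Distributions

function weighted_normal_sketch(edge_lengths::Vector{Float64}, w::Vector{Float64})
    μ = sum(w .* edge_lengths) / sum(w)                      # weighted mean
    σ = sqrt(sum(w .* (edge_lengths .- μ).^2) / sum(w))      # weighted standard deviation
    return Normal(μ, σ)
end
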
4 changes: 2 additions & 2 deletions src/data_processing/triangulation.jl
@@ -24,7 +24,7 @@ getx(p::IndexedPoint2D) = p._x
gety(p::IndexedPoint2D) = p._y
geti(p::IndexedPoint2D) = p._index

function adjacency_list(points::AbstractArray{T, 2} where T <: Real; filter::Bool=true, n_mads::T2 where T2 <: Real =2, k_adj::Int=5,
function adjacency_list(points::Matrix{<:Real}; filter::Bool=true, n_mads::Float64=2.0, k_adj::Int=5,
adjacency_type::Symbol=:triangulation, distance::TD where TD <: Distances.SemiMetric = Euclidean())
if size(points, 1) == 3
(adjacency_type == :knn) || @warn "Only k-nn random field is supported for 3D data"
@@ -81,7 +81,7 @@ end

adjacency_list(spatial_df::DataFrame; kwargs...) = adjacency_list(position_data(spatial_df); kwargs...)

function connected_components(adjacent_points::Array{Array{Int, 1}, 1})
function connected_components(adjacent_points::Array{Vector{Int}, 1})
g = LightGraphs.SimpleGraph(length(adjacent_points));
for (v1, vs) in enumerate(adjacent_points)
for v2 in vs
6 changes: 3 additions & 3 deletions src/data_processing/umap_wrappers.jl
@@ -23,15 +23,15 @@ function fit(::Type{UmapFit}, x::Array{Float64, 2}, pca::MultivariateStats.PCA;
return UmapFit(embedding, nn_tree, pca, nn_interpolate)
end

fit(::Type{UmapFit}, x::Array{Float64, 2}; n_pcs::Int=15, kwargs...)::UmapFit =
fit(::Type{UmapFit}, x::Matrix{Float64}; n_pcs::Int=15, kwargs...)::UmapFit =
fit(UmapFit, x, fit(MultivariateStats.PCA, x, maxoutdim=n_pcs); kwargs...)

function transform(transformation::UmapFit, x::Array{Float64, 2}; dist_offset::Float64=1e-10)::Array{Float64, 2}
function transform(transformation::UmapFit, x::Matrix{Float64}; dist_offset::Float64=1e-10)::Matrix{Float64}
x_pc = transform(transformation.pca_transform, x)

# This transformation is much faster than the implementation from UMAP.jl, even for input dimensionality = 50.
# Probably, because of the slow NearestNeighborDescent package.
indices, distances = knn(transformation.nn_tree, x_pc, transformation.nn_interpolate)
res = [mapslices(x -> wmean(x, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)]
res = [mapslices(v -> wmean(v, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)]
return hcat(res...)
end
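
`transform(::UmapFit, x)` above projects new points into an existing embedding by taking, for each point, an inverse-distance-weighted mean of the embedding coordinates of its nearest neighbors in PCA space. A self-contained sketch of that interpolation step (toy helper on hypothetical inputs, not the package API):

using NearestNeighbors

function interpolate_embedding_sketch(embedding::Matrix{Float64}, train_pc::Matrix{Float64},
                                      query_pc::Matrix{Float64}; k::Int=5, dist_offset::Float64=1e-10)
    tree = KDTree(train_pc)                              # training points in PCA space, d × n
    indices, distances = knn(tree, query_pc, k)
    res = map(zip(indices, distances)) do (ids, dists)
        w = 1 ./ (dists .+ dist_offset)                  # inverse-distance weights
        embedding[:, ids] * (w ./ sum(w))                # weighted mean of neighbor coordinates
    end
    return hcat(res...)                                  # embedding_dim × n_query matrix
end
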
