Commit

Merge branch 'develop'
VPetukhov committed Dec 1, 2021
2 parents bc5e50d + 61a0f51 commit 076d264
Showing 19 changed files with 107 additions and 276 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,14 @@
## Upcoming

### Changed

- Slightly optimized compilation time
- Minor updates of core formulas

### Added

- CLI parameter `no-ncv-estimation` to disable estimation of NCVs

## [0.5.0] — 2021-06-03

### Changed
30 changes: 4 additions & 26 deletions Dockerfile
@@ -11,42 +11,20 @@ RUN pip3 install jupyterlab numpy scipy matplotlib seaborn pandas sklearn scikit

RUN pip3 install -Iv six==1.12.0

RUN \
jupyter nbextensions_configurator enable && \
jupyter contrib nbextension install

RUN \
jupyter nbextension enable codefolding/main && \
jupyter nbextension enable codefolding/edit && \
jupyter nbextension enable collapsible_headings/main && \
jupyter nbextension enable notify/notify && \
jupyter nbextension enable ruler/main && \
jupyter nbextension enable table_beautifier/main && \
jupyter nbextension enable comment-uncomment/main && \
jupyter nbextension enable hide_header/main && \
jupyter nbextension enable livemdpreview/livemdpreview && \
jupyter nbextension enable spellchecker/main && \
jupyter nbextension enable execution_dependencies/execution_dependencies && \
jupyter nbextension enable freeze/main && \
jupyter nbextension enable tree-filter/index && \
jupyter nbextension enable init_cell/main && \
jupyter nbextension enable move_selected_cells/main && \
jupyter nbextension enable toc2/main

RUN julia -e 'using Pkg; Pkg.add("IJulia"); Pkg.build(); using IJulia;'

### jupyter notebook --no-browser --port=8989 --ip=0.0.0.0 --allow-root ./

## Julia Baysor environment

RUN julia -e 'using Pkg; Pkg.add("PackageCompiler")'
RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build(); using Baysor;'
RUN julia -e 'using Pkg; Pkg.add(PackageSpec(url="https://github.com/hms-dbmi/Baysor.git")); Pkg.build();'

# See https://discourse.julialang.org/t/sysimage-incompatible-between-intel-and-amd-x86-64/42716/2 for cpu_target flags
RUN julia -e 'using PackageCompiler; import Baysor; create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/build.jl", replace_default=true, cpu_target="generic")'
RUN julia -e 'import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); Pkg.instantiate();'
RUN julia -e 'using PackageCompiler; import Baysor, Pkg; Pkg.activate(dirname(dirname(pathof(Baysor)))); create_sysimage(:Baysor; precompile_execution_file="$(dirname(pathof(Baysor)))/../bin/precompile.jl", replace_default=true, cpu_target="generic")'

RUN \
echo "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \
printf "#!/usr/bin/env julia\n\nimport Baysor\nBaysor.run_cli()" >> /bin/baysor && \
chmod +x /bin/baysor

ENTRYPOINT ["/bin/bash"]
3 changes: 2 additions & 1 deletion bin/build.jl
@@ -1,9 +1,10 @@
#! /usr/bin/env julia

using PackageCompiler
import Pkg

out_path = length(ARGS) > 1 ? ARGS[1] : "Baysor";
baysor_path = length(ARGS) > 2 ? ARGS[2] : dirname(@__DIR__);

Base.LOAD_PATH .= baysor_path
create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl")
create_app(baysor_path, out_path; precompile_execution_file="$(baysor_path)/bin/precompile.jl")
18 changes: 9 additions & 9 deletions src/bmm_algorithm/bmm_algorithm.jl
@@ -45,14 +45,14 @@ end
empty!(component_weights)
empty!(comp_weights)
empty!(comp_ids)
zero_comp_weight = 0.0
bg_comp_weight = 0.0

@inbounds @simd for i in 1:length(adjacent_weights)
c_point = adjacent_points[i]
c_id = assignment[c_point]
cw = adjacent_weights[i]
if c_id == 0
zero_comp_weight += cw
bg_comp_weight += cw
else
component_weights[c_id] = get(component_weights, c_id, 0.0) + cw
end
@@ -63,13 +63,13 @@ end
push!(comp_weights, v)
end

return zero_comp_weight
return bg_comp_weight
end

@inline function fill_adjacent_component_weights!(adj_classes::Vector{Int}, adj_weights::Vector{Float64}, data::BmmData, mol_id::Int;
component_weights::Dict{Int, Float64}, adj_classes_global::Dict{Int, Vector{Int}})
# Looks like it's impossible to optimize this further, even with vectorization. It means that creating a vectorized version of expect_dirichlet_spatial makes little sense
zero_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment,
bg_comp_weight = aggregate_adjacent_component_weights!(adj_classes, adj_weights, component_weights, data.assignment,
data.adjacent_points[mol_id], data.adjacent_weights[mol_id])

if mol_id in keys(adj_classes_global)
@@ -78,7 +78,7 @@ end
append!(adj_weights, ones(length(adj_classes) - n1) .* data.real_edge_weight)
end

return zero_comp_weight
return bg_comp_weight
end

function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segment_id::Int, largest_cell_id::Int,
@@ -105,7 +105,7 @@ function adjust_densities_by_prior_segmentation!(denses::Vector{Float64}, segmen
end

function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N}, mol_id::Int;
zero_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N
bg_comp_weight::Float64, adj_classes::Vector{Int}, adj_weights::Vector{Float64}) where N
x = SVector{N}(position_data(data)[:,mol_id]...)
gene::Union{Int, Missing} = composition_data(data)[mol_id]
confidence::Float64 = data.confidence[mol_id]
@@ -143,7 +143,7 @@ function expect_density_for_molecule!(denses::Vector{Float64}, data::BmmData{N},
end

if confidence < 1.0
push!(denses, (1 - confidence) * exp(data.mrf_strength * fmax(data.real_edge_weight, zero_comp_weight)) * data.noise_density)
push!(denses, (1 - confidence) * exp(data.mrf_strength * bg_comp_weight) * data.noise_density)
push!(adj_classes, 0)
end

@@ -159,10 +159,10 @@ function expect_dirichlet_spatial!(data::BmmData; stochastic::Bool=true)
adj_classes_global = get_global_adjacent_classes(data)

for i in 1:size(data.x, 1)
zero_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i;
bg_comp_weight = fill_adjacent_component_weights!(adj_classes, adj_weights, data, i;
component_weights=component_weights, adj_classes_global=adj_classes_global)

expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, zero_comp_weight=zero_comp_weight)
expect_density_for_molecule!(denses, data, i; adj_classes=adj_classes, adj_weights=adj_weights, bg_comp_weight=bg_comp_weight)

assign_molecule!(data, i; denses=denses, adj_classes=adj_classes, stochastic=stochastic)
end
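
For context on the rename above: `bg_comp_weight` (formerly `zero_comp_weight`) is the total adjacency weight of a molecule's neighbors that are currently assigned to the background, i.e. component id 0. A minimal, self-contained sketch of that aggregation, simplified from `aggregate_adjacent_component_weights!` (the real function reuses preallocated buffers and also returns per-component weights through them; the name below is illustrative):

# Illustrative sketch only: accumulate adjacency weights per neighboring
# component, treating assignment 0 as background noise.
function aggregate_weights_sketch(assignment::Vector{Int}, adjacent_points::Vector{Int},
                                  adjacent_weights::Vector{Float64})
    component_weights = Dict{Int, Float64}()
    bg_comp_weight = 0.0  # total weight of edges to background ("noise") molecules
    for (p, w) in zip(adjacent_points, adjacent_weights)
        c_id = assignment[p]
        if c_id == 0
            bg_comp_weight += w
        else
            component_weights[c_id] = get(component_weights, c_id, 0.0) + w
        end
    end
    return component_weights, bg_comp_weight
end
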
31 changes: 20 additions & 11 deletions src/cli/main.jl
@@ -65,6 +65,11 @@ function parse_commandline(args::Union{Nothing, Array{String, 1}}=nothing) # TOD
arg_type = Float64
default = 0.2

"--no-ncv-estimation"
dest_name = "estimate-ncvs"
help = "Turns off neighborhood composition vectors estimation"
action = :store_false

"coordinates"
help = "CSV file with coordinates of molecules and gene type"
required = true
@@ -389,9 +394,11 @@ function save_segmentation_results(bm_data::BmmData, gene_names::Vector{String},
segmentated_df = get_segmentation_df(bm_data, gene_names)
cell_stat_df = get_cell_stat_df(bm_data, segmentated_df; add_qc=true)

@info "Estimating local colors"
gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"])
segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors)
if args["estimate-ncvs"]
@info "Estimating local colors"
gene_colors = gene_composition_colors(bm_data.x, args["gene-composition-neigborhood"])
segmentated_df[!, :ncv_color] = "#" .* Colors.hex.(gene_colors)
end

@info "Saving results to $(args["output"])"
CSV.write(args["output"], segmentated_df[sortperm(segmentated_df.molecule_id), :]);
@@ -403,14 +410,16 @@

if args["plot"]
plot_diagnostics_panel(segmentated_df, segmentated_df.cell, bm_data.tracer, args; clust_res=mol_clusts, comp_segs=comp_segs)
polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons,
gene_colors=gene_colors)

if args["save-polygons"] !== nothing
if lowercase(args["save-polygons"]) == "geojson"
save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json"))
else
@warn "Unknown polygon format: $(args["save-polygons"])"
if args["estimate-ncvs"]
polygons = plot_transcript_assignment_panel(bm_data.x, bm_data.assignment, args; clusters=bm_data.cluster_per_molecule, prior_polygons=prior_polygons,
gene_colors=gene_colors)

if args["save-polygons"] !== nothing
if lowercase(args["save-polygons"]) == "geojson"
save_polygons_to_geojson(polygons, append_suffix(args["output"], "polygons.json"))
else
@warn "Unknown polygon format: $(args["save-polygons"])"
end
end
end
end
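
The new `--no-ncv-estimation` flag uses ArgParse.jl's `:store_false` action together with `dest_name`: passing the flag writes `false` into the `estimate-ncvs` key, which otherwise defaults to `true` and gates the NCV color estimation and polygon plotting above. A standalone sketch of that mechanism (illustrative argument table and file name, not the full Baysor CLI; assumes a recent ArgParse.jl where the macro is `@add_arg_table!`):

using ArgParse

function parse_example(args::Vector{String})
    s = ArgParseSettings()
    @add_arg_table! s begin
        "--no-ncv-estimation"
            dest_name = "estimate-ncvs"
            help = "Turns off neighborhood composition vectors estimation"
            action = :store_false
        "coordinates"
            help = "CSV file with coordinates of molecules and gene type"
            required = true
    end
    return parse_args(args, s)
end

parse_example(["molecules.csv"])["estimate-ncvs"]                        # true (hypothetical input file)
parse_example(["--no-ncv-estimation", "molecules.csv"])["estimate-ncvs"] # false
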
14 changes: 7 additions & 7 deletions src/data_processing/diagnostic_plots.jl
@@ -6,12 +6,12 @@ function plot_noise_estimation_diagnostics(edge_lengths::Vector{Float64}, confid
x_vals = range(0, x_max, length=1000)
n1 = sum(confidences);
n2 = length(confidences) - n1;

p_df = estimate_hist(edge_lengths[edge_lengths .< x_max], normalize=true, nbins=bins)
p_df[!, :intra] = n1 / (n1 + n2) .* pdf.(d1, p_df.s)
p_df[!, :bg] = n2 / (n1 + n2) .* pdf.(d2, p_df.s)
return p_df |>

return p_df |>
@vlplot(x={:s, title="Distance to $(confidence_nn_id)'th nearest neighbor"}, title=title, width=400, height=300) +
@vlplot(:bar, x2=:e, y={:h, title="Density"}, color={datum="Observed", scale={scheme="category10"}, legend={title="Distribution"}}) +
@vlplot({:line, size=linewidth}, y=:bg, color={datum="Background"}) +
@@ -44,10 +44,10 @@ function plot_gene_structure(df_spatial::DataFrame, gene_names::Vector, confiden

return p_df |>
@vlplot(
x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"},
y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"},
size={:size, scale={range=[5, 10]}, legend=false},
tooltip={:gene, type="nominal"},
x={:x, scale={domain=val_range(p_df.x)}, title="UMAP-1"},
y={:y, scale={domain=val_range(p_df.y)}, title="UMAP-2"},
size={:size, scale={range=[5, 10]}, legend=false},
tooltip={:gene, type="nominal"},
width=600, height=600, title="Gene local structure",
config={axis={grid=false, ticks=false, ticklabels=false, labels=false}}
) +
12 changes: 6 additions & 6 deletions src/data_processing/initialization.jl
@@ -62,12 +62,12 @@ end

# Initialize cell positions

function position_data_by_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer})
function position_data_by_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int})
filt_ids = findall(assignment .> 0)
return [pos_data[:, ids] for ids in split(filt_ids, assignment[filt_ids])]
end

function covs_from_assignment(pos_data::T where T<: AbstractMatrix{<:Real}, assignment::Vector{<:Integer}; min_size::Int=1)::Array{<:CovMat, 1}
function covs_from_assignment(pos_data::Matrix{Float64}, assignment::Vector{Int}; min_size::Int=1)::Array{<:CovMat, 1}
pos_data_by_assignment = position_data_by_assignment(pos_data, assignment)
CM, MV = (size(pos_data, 1) == 2) ? (CovMat{2}, MeanVec{2}) : (CovMat{3}, MeanVec{3})

@@ -84,8 +84,8 @@ end
cell_centers_uniformly(spatial_df::DataFrame, args...; kwargs...) =
cell_centers_uniformly(position_data(spatial_df), args..., (:confidence in propertynames(spatial_df)) ? spatial_df.confidence : nothing; kwargs...)

function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_clusters::Int,
confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{<:Real, Nothing})
function cell_centers_uniformly(pos_data::Matrix{Float64}, n_clusters::Int,
confidences::Union{Vector{Float64}, Nothing}=nothing; scale::Union{Float64, Nothing})
n_clusters = min(n_clusters, size(pos_data, 2))

cluster_centers = pos_data[:, select_ids_uniformly(pos_data', confidences; n=n_clusters, confidence_threshold=0.25)]
@@ -95,10 +95,10 @@ function cell_centers_uniformly(pos_data::T where T<: AbstractMatrix{<:Real}, n_
if scale === nothing
covs = covs_from_assignment(pos_data, cluster_labels)
elseif size(pos_data, 1) == 2
scale = Float64(scale) ^ 2
scale = scale ^ 2
covs = [CovMat{2}(diagm(0 => (ones(2) .* scale))) for i in 1:size(cluster_centers, 2)]
elseif size(pos_data, 1) == 3
scale = Float64(scale) ^ 2
scale = scale ^ 2
covs = [CovMat{3}(diagm(0 => (ones(3) .* scale))) for i in 1:size(cluster_centers, 2)]
else
error("Unexpected number of dimensions: $(size(pos_data, 1))")
9 changes: 4 additions & 5 deletions src/data_processing/neighborhood_composition.jl
@@ -1,9 +1,9 @@
using Base.Threads

neighborhood_count_matrix(data::Union{BmmData, T} where T <: AbstractDataFrame, k::Int; kwargs...) =
neighborhood_count_matrix(data::Union{BmmData, DataFrame}, k::Int; kwargs...) =
neighborhood_count_matrix(position_data(data), composition_data(data), k; kwargs...)

function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing,
function neighborhood_count_matrix(pos_data::Matrix{Float64}, genes::Vector{Int}, k::Int; confidences::Union{Vector{Float64}, Nothing}=nothing,
n_genes::Int=maximum(genes), normalize_by_dist::Bool=true, normalize::Bool=true)
if k < 3
@warn "Too small value of k: $k. Setting it to 3."
@@ -18,7 +18,6 @@ function neighborhood_count_matrix(pos_data::Matrix{<:Real}, genes::Vector{Int},
@warn "`confidences` are not supported with `normalize=true`. Ignoring them."
confidences = nothing
end


k = min(k, size(pos_data, 2))

@@ -117,9 +116,9 @@ function gene_composition_colors!(embedding::Matrix{Float64}; lrange::Tuple{<:Re
return vec(mapslices(col -> Colors.Lab(col...), embedding, dims=1))
end

function gene_composition_colors(df_spatial::T where T <: AbstractDataFrame, k::Int; kwargs...)
function gene_composition_colors(df_spatial::DataFrame, k::Int; kwargs...)
neighb_cm = neighborhood_count_matrix(df_spatial, k);
confidence = (:confidence in propertynames(df_spatial)) ? df_spatial.confidence : ones(size(df_spatial, 1))
transformation = gene_composition_transformation(neighb_cm, confidence)
return gene_composition_colors(neighb_cm, transformation; kwargs...)
end
end
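
The functions above build neighborhood composition vectors (NCVs): for each molecule, the gene composition of its k spatial nearest neighbors, which is then embedded with UMAP and mapped to colors. A toy version of the counting step, ignoring the distance weighting and confidence handling in `neighborhood_count_matrix` (the function name below is illustrative; assumes NearestNeighbors.jl):

using NearestNeighbors

function ncv_matrix_sketch(pos_data::Matrix{Float64}, genes::Vector{Int}, k::Int;
                           n_genes::Int=maximum(genes))
    tree = KDTree(pos_data)                    # pos_data is n_dims × n_molecules
    indices, _ = knn(tree, pos_data, k)        # k nearest neighbors of every molecule
    mat = zeros(n_genes, length(genes))
    for (i, ids) in enumerate(indices)
        for j in ids
            mat[genes[j], i] += 1.0            # count genes among the neighbors
        end
        mat[:, i] ./= k                        # normalize counts to frequencies
    end
    return mat                                 # n_genes × n_molecules composition matrix
end
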
4 changes: 2 additions & 2 deletions src/data_processing/noise_estimation.jl
@@ -1,8 +1,8 @@
@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::T2 where T2 <: AbstractArray{Int} = 1:length(edge_lengths)) =
@views maximize_noise_distributions(edge_lengths::Vector{Float64}, assignment_probs::Matrix{Float64}; updating_ids::Vector{Int}) =
(Normal(wmean_std(edge_lengths, assignment_probs[:,i], non_zero_ids=updating_ids)...) for i in 1:2)

function expect_noise_probabilities!(assignment_probs::Matrix{Float64}, d1::Normal, d2::Normal, edge_lengths::Vector{Float64},
adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::T where T <: AbstractVector{Int} = 1:length(edge_lengths),
adjacent_points::Vector{Vector{Int}}, adjacent_weights::Vector{Vector{Float64}}; updating_ids::Vector{Int},
min_confidence::Union{Vector{Float64}, Nothing}=nothing)
norm_denses = (pdf.(d1, edge_lengths), pdf.(d2, edge_lengths));
n1 = sum(assignment_probs[:, 1])
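
`maximize_noise_distributions` is the M-step of a two-component mixture over edge lengths: each Normal (intracellular vs. background, as plotted in the diagnostics above) is refit from a probability-weighted mean and standard deviation. A toy equivalent of that weighted fit, ignoring the `updating_ids` subsetting (`wmean_std` is assumed to compute exactly this):

using Distributions

function weighted_normal_sketch(edge_lengths::Vector{Float64}, w::Vector{Float64})
    μ = sum(w .* edge_lengths) / sum(w)                      # weighted mean
    σ = sqrt(sum(w .* (edge_lengths .- μ).^2) / sum(w))      # weighted standard deviation
    return Normal(μ, σ)
end
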
4 changes: 2 additions & 2 deletions src/data_processing/triangulation.jl
@@ -24,7 +24,7 @@ getx(p::IndexedPoint2D) = p._x
gety(p::IndexedPoint2D) = p._y
geti(p::IndexedPoint2D) = p._index

function adjacency_list(points::AbstractArray{T, 2} where T <: Real; filter::Bool=true, n_mads::T2 where T2 <: Real =2, k_adj::Int=5,
function adjacency_list(points::Matrix{<:Real}; filter::Bool=true, n_mads::Float64=2.0, k_adj::Int=5,
adjacency_type::Symbol=:triangulation, distance::TD where TD <: Distances.SemiMetric = Euclidean())
if size(points, 1) == 3
(adjacency_type == :knn) || @warn "Only k-nn random field is supported for 3D data"
@@ -81,7 +81,7 @@ end

adjacency_list(spatial_df::DataFrame; kwargs...) = adjacency_list(position_data(spatial_df); kwargs...)

function connected_components(adjacent_points::Array{Array{Int, 1}, 1})
function connected_components(adjacent_points::Array{Vector{Int}, 1})
g = LightGraphs.SimpleGraph(length(adjacent_points));
for (v1, vs) in enumerate(adjacent_points)
for v2 in vs
6 changes: 3 additions & 3 deletions src/data_processing/umap_wrappers.jl
@@ -23,15 +23,15 @@ function fit(::Type{UmapFit}, x::Array{Float64, 2}, pca::MultivariateStats.PCA;
return UmapFit(embedding, nn_tree, pca, nn_interpolate)
end

fit(::Type{UmapFit}, x::Array{Float64, 2}; n_pcs::Int=15, kwargs...)::UmapFit =
fit(::Type{UmapFit}, x::Matrix{Float64}; n_pcs::Int=15, kwargs...)::UmapFit =
fit(UmapFit, x, fit(MultivariateStats.PCA, x, maxoutdim=n_pcs); kwargs...)

function transform(transformation::UmapFit, x::Array{Float64, 2}; dist_offset::Float64=1e-10)::Array{Float64, 2}
function transform(transformation::UmapFit, x::Matrix{Float64}; dist_offset::Float64=1e-10)::Matrix{Float64}
x_pc = transform(transformation.pca_transform, x)

# This transformation is much faster than the implementation from UMAP.jl, even for input dimensionality = 50.
# Probably, because of the slow NearestNeighborDescent package.
indices, distances = knn(transformation.nn_tree, x_pc, transformation.nn_interpolate)
res = [mapslices(x -> wmean(x, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)]
res = [mapslices(v -> wmean(v, 1 ./ (dists .+ dist_offset)), transformation.embedding[:, ids], dims=2) for (ids, dists) in zip(indices, distances)]
return hcat(res...)
end
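
`transform(::UmapFit, x)` above projects new points into an existing embedding by taking, for each point, an inverse-distance-weighted mean of the embedding coordinates of its nearest neighbors in PCA space. A self-contained sketch of that interpolation step (toy helper on hypothetical inputs, not the package API):

using NearestNeighbors

function interpolate_embedding_sketch(embedding::Matrix{Float64}, train_pc::Matrix{Float64},
                                      query_pc::Matrix{Float64}; k::Int=5, dist_offset::Float64=1e-10)
    tree = KDTree(train_pc)                              # training points in PCA space, d × n
    indices, distances = knn(tree, query_pc, k)
    res = map(zip(indices, distances)) do (ids, dists)
        w = 1 ./ (dists .+ dist_offset)                  # inverse-distance weights
        embedding[:, ids] * (w ./ sum(w))                # weighted mean of neighbor coordinates
    end
    return hcat(res...)                                  # embedding_dim × n_query matrix
end
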
