Skip to content

Commit

Permalink
Progress bar in neighborhood_count_matrix and knn_parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
Viktor Petukhov committed Dec 10, 2024
1 parent 386e312 commit 25d4c97
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 15 deletions.
30 changes: 20 additions & 10 deletions src/processing/data_processing/neighborhood_composition.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ neighborhood_count_matrix(data::Union{BmmData, DataFrame}, k::Int; kwargs...) =
neighborhood_count_matrix(position_data(data), composition_data(data), k; kwargs...)

function neighborhood_count_matrix(
pos_data::Matrix{Float64}, genes::Vector{<:Union{Int, Missing}}, k::Int;
pos_data::Matrix{<:Real}, genes::Vector{<:Union{Int, Missing}}, k::Int;
confidences::Union{Vector{Float64}, Nothing}=nothing,
n_genes::Int=maximum(skipmissing(genes)), normalize_by_dist::Bool=true, normalize::Bool=true
n_genes::Int=maximum(skipmissing(genes)), normalize_by_dist::Bool=true, normalize::Bool=true,
progress::Bool=false
)
# TODO: disable `normalize_by_dist` as memory allocations are terrible for huge datasets
@assert size(pos_data, 1) in (2, 3) "Position data must have exactly 2 or 3 rows"
if k < 3
@warn "Too small value of k: $k. Setting it to 3."
k = 3
Expand All @@ -25,25 +28,32 @@ function neighborhood_count_matrix(

k = min(k, size(pos_data, 2))

neighbors, dists = knn_parallel(KDTree(pos_data), pos_data, k; sorted=true);
neighbors, dists = knn_parallel(KDTree(pos_data), pos_data, k; sorted=true, progress);

s_vecs = Vector{SparseArrays.SparseVector{Float32, Int64}}(undef, length(neighbors))

if normalize_by_dist
# account for problems caused by points with duplicate coordinates
med_closest_dist = median(d[findfirst(d .> 1e-15)] for d in dists if any(d .> 1e-15));

return sphstack([ # Not sure if making it parallel will have large effect, as we have a lot of allocations here
count_array_sparse(Float32, genes[nns], 1 ./ max.(ds, med_closest_dist); total=n_genes, normalize=normalize)
for (nns,ds) in zip(neighbors, dists)
]);
# Not sure whether making this parallel would have a large effect, as there are a lot of allocations here
@showprogress enabled=progress for i in eachindex(neighbors)
@views s_vecs[i] = count_array_sparse(
Float32, genes[neighbors[i]], 1 ./ max.(dists[i], med_closest_dist);
total=n_genes, normalize=normalize
)
end

return sphstack(s_vecs)
end

s_vecs = Vector{SparseArrays.SparseVector{Float32, Int64}}(undef, length(neighbors))

if !normalize || (confidences === nothing)
@threads for i in eachindex(neighbors)
@showprogress enabled=progress @threads for i in eachindex(neighbors)
s_vecs[i] = count_array_sparse(Float32, view(genes, neighbors[i]); total=n_genes, normalize=normalize)
end
else
@threads for i in eachindex(neighbors)
@showprogress enabled=progress @threads for i in eachindex(neighbors)
s_vecs[i] = count_array_sparse(Float32, view(genes, neighbors[i]), view(confidences, neighbors[i]); total=n_genes, normalize=normalize)
end
end
Expand Down
10 changes: 5 additions & 5 deletions src/processing/utils/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ using SparseArrays
using Base.Threads
using NearestNeighbors: KDTree, knn

function knn_parallel(nn_tree::KDTree, x::AbstractMatrix{<:Real}, nn_interpolate::Int; sorted::Bool=false)
function knn_parallel(nn_tree::KDTree, x::AbstractMatrix{<:Real}, nn_interpolate::Int; sorted::Bool=false, progress::Bool=false)
indices = Vector{Vector{Int}}(undef, size(x, 2))
distances = Vector{Vector{eltype(eltype(nn_tree.data))}}(undef, size(x, 2))

@threads for i in axes(x, 2)
@showprogress enabled=progress @threads for i in axes(x, 2)
indices[i], distances[i] = knn(nn_tree, x[:, i], nn_interpolate, sorted)
end

Expand Down Expand Up @@ -49,11 +49,11 @@ count_array_sparse(T::DataType, values::AbstractVector{Union{Missing, Int}}, arg
count_array_sparse(T, collect(skipmissing(values)), args...; kwargs...)

count_array_sparse(values::AbstractVector{Int}, ::Nothing=nothing; kwargs...) = count_array_sparse(Int, values; kwargs...)
count_array_sparse(values::AbstractVector{Int}, weights::AbstractVector{Float64}; kwargs...) =
count_array_sparse(Float64, values, weights; kwargs...)
count_array_sparse(values::AbstractVector{Int}, weights::AbstractVector{<:Real}; kwargs...) =
count_array_sparse(Float32, values, weights; kwargs...)

function count_array_sparse(
T::DataType, values::AbstractVector{Int}, weights::Union{AbstractVector{Float64}, PseudoWeight}=PseudoWeight();
T::DataType, values::AbstractVector{Int}, weights::Union{AbstractVector{<:Real}, PseudoWeight}=PseudoWeight();
total::Int=0, min_val::Float64=1e-5, normalize::Bool=false
)
!isempty(values) || return spzeros(T, total)
Expand Down

0 comments on commit 25d4c97

Please sign in to comment.