diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml index 82fcb819d..5cf2b5b6e 100644 --- a/KomaMRICore/Project.toml +++ b/KomaMRICore/Project.toml @@ -4,6 +4,7 @@ authors = ["Carlos Castillo Passi "] version = "0.9.0" [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" @@ -26,6 +27,7 @@ KomaoneAPIExt = "oneAPI" [compat] AMDGPU = "0.9, 1" +AcceleratedKernels = "0.2.2" Adapt = "3, 4" CUDA = "3, 4, 5" Functors = "0.4, 0.5" diff --git a/KomaMRICore/src/KomaMRICore.jl b/KomaMRICore/src/KomaMRICore.jl index 3486329ed..481218596 100644 --- a/KomaMRICore/src/KomaMRICore.jl +++ b/KomaMRICore/src/KomaMRICore.jl @@ -3,6 +3,7 @@ module KomaMRICore # General import Base.*, Base.abs import KernelAbstractions as KA +import AcceleratedKernels as AK using Reexport using ThreadsX # Printing diff --git a/KomaMRICore/src/simulation/GPUFunctions.jl b/KomaMRICore/src/simulation/GPUFunctions.jl index 75e63dbb1..bffbae9be 100644 --- a/KomaMRICore/src/simulation/GPUFunctions.jl +++ b/KomaMRICore/src/simulation/GPUFunctions.jl @@ -1,5 +1,7 @@ const LOADED_BACKENDS = Ref{Vector{KA.GPU}}([]) const BACKEND = Ref{Union{KA.Backend,Nothing}}(nothing) +const DEFAULT_PRECESSION_GROUPSIZE = 32 +const DEFAULT_EXCITATION_GROUPSIZE = 256 device_name(backend) = @error "device_name called with invalid backend type $(typeof(backend))" isfunctional(::KA.CPU) = true @@ -10,9 +12,12 @@ name(::KA.CPU) = "CPU" set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'" set_device!(val) = set_device!(get_backend(true), val) -#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so -#for now we use a custom function for each backend to implement -function _cis end +#Copied from AcceleratedKernels.jl: https://github.com/JuliaGPU/AcceleratedKernels.jl/blob/00a9c0e7a36d6e02a51dbd13032a9165caab7909/src/utils.jl#L34 +struct TypeWrap{T} end +TypeWrap(T) = TypeWrap{T}() +Base.:*(x::Number, ::TypeWrap{T}) where T = T(x) + +const u16 = TypeWrap(UInt16) """ get_backend(use_gpu) diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl deleted file mode 100644 index e3d10ab27..000000000 --- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl +++ /dev/null @@ -1,206 +0,0 @@ -include("KernelFunctions.jl") - -"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster -to calculate them on the CPU before the simulation is run.""" -struct SeqBlockProperties{T<:Real} - length::Int64 - nADC::Int64 - first_ADC::Bool - ADC_indices::AbstractVector{Int64} - tp_ADC::AbstractVector{T} - dur::T -end - -@functor SeqBlockProperties - -"""Stores information added to the prealloc struct for use in run_spin_precession! -and run_spin_excitation!. This information is calculated on the CPU before the -simulation is run.""" -struct BlochGPUPrecalc{T} <: PrecalcResult{T} - seq_properties::AbstractVector{SeqBlockProperties{T}} -end - -@functor BlochGPUPrecalc - -"""Precalculates sequence properties for use in run_spin_precession!""" -function precalculate( - sim_method::Bloch, - backend::KA.GPU, - seq::DiscreteSequence{T}, - parts::Vector{UnitRange{S}}, - excitation_bool::Vector{Bool} -) where {T<:Real,S<:Integer} - seq_properties = SeqBlockProperties{T}[] - for (block, p) in enumerate(parts) - seq_block = @view seq[p] - if excitation_bool[block] - push!(seq_properties, SeqBlockProperties( - length(seq_block.t), - count(seq_block.ADC), - false, - Int64[], - T[], - zero(T) - )) - else - ADC_indices = findall(seq_block.ADC) .- 1 - if seq_block.ADC[1] - deleteat!(ADC_indices, 1) - end - tp = cumsum(seq_block.Δt) - push!(seq_properties, SeqBlockProperties( - length(seq_block.t), - count(seq_block.ADC), - seq_block.ADC[1], - ADC_indices, - tp[ADC_indices], - last(tp) - )) - end - end - - return BlochGPUPrecalc(seq_properties) -end - -"""Stores preallocated structs for use in Bloch GPU run_spin_precession function.""" -struct BlochGPUPrealloc{T} <: PreallocResult{T} - seq_properties::AbstractVector{SeqBlockProperties{T}} - Bz::AbstractMatrix{T} - B::AbstractMatrix{T} - φ::AbstractMatrix{T} - ΔT1::AbstractMatrix{T} - ΔT2::AbstractMatrix{T} - Δϕ::AbstractMatrix{T} - ϕ::AbstractArray{T} - Mxy::AbstractMatrix{Complex{T}} - ΔBz::AbstractVector{T} -end - -Base.view(p::BlochGPUPrealloc{T}, i::UnitRange) where {T<:Real} = p -function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real} - seq_block = p.seq_properties[i] - - return BlochGPUPrealloc( - [seq_block], - view(p.Bz,:,1:seq_block.length), - view(p.B,:,1:seq_block.length), - view(p.φ,:,1:seq_block.length-1), - view(p.ΔT1,:,1:seq_block.length-1), - view(p.ΔT2,:,1:seq_block.length-1), - view(p.Δϕ,:,1:seq_block.length-1), - seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1), - view(p.Mxy,:,1:seq_block.nADC), - p.ΔBz - ) -end - -"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!.""" -function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real} - if !(precalc isa BlochGPUPrecalc) @error """Sim method Bloch() does not support calling run_sim_time_iter directly. Use method BlochSimple() instead.""" end - - return BlochGPUPrealloc( - precalc.seq_properties, - KA.zeros(backend, T, (size(obj.x, 1), max_block_length)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)), - KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)), - KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)), - obj.Δw ./ T(2π .* γ) - ) -end - -@inline function calculate_precession!(Δϕ::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real} - Δϕ .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ) -end -@inline function apply_precession!(ϕ::AbstractVector{T}, Δϕ::AbstractArray{T}) where {T<:Real} - ϕ .= sum(Δϕ, dims=2) -end -function apply_precession!(ϕ::AbstractArray{T}, Δϕ::AbstractArray{T}) where {T<:Real} - cumsum!(ϕ, Δϕ, dims=2) -end - -function run_spin_precession!( - p::Phantom{T}, - seq::DiscreteSequence{T}, - sig::AbstractArray{Complex{T}}, - M::Mag{T}, - sim_method::Bloch, - backend::KA.GPU, - pre::BlochGPUPrealloc -) where {T<:Real} - #Simulation - #Motion - x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t') - #Sequence block info - seq_block = pre.seq_properties[1] - - #Effective field - pre.Bz .= x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ pre.ΔBz - - #Rotation - calculate_precession!(pre.Δϕ, seq.Δt', pre.Bz) - # Reduces Δϕ Nspins x Nt to ϕ Nspins x Nt, if Nadc = 0, to Nspins x 1 - apply_precession!(pre.ϕ, pre.Δϕ) - - #Acquired signal - if seq_block.nADC > 0 - ϕ_ADC = @view pre.ϕ[:,seq_block.ADC_indices] - if seq_block.first_ADC - pre.Mxy[:,1] .= M.xy - pre.Mxy[:,2:end] .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* cis.(ϕ_ADC) - #Reset Spin-State (Magnetization). Only for FlowPath - outflow_spin_reset!(pre.Mxy, seq_block.tp_ADC', p.motion; seq_t=seq.t, add_t0=true) - else - pre.Mxy .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* cis.(ϕ_ADC) - #Reset Spin-State (Magnetization). Only for FlowPath - outflow_spin_reset!(pre.Mxy, seq_block.tp_ADC', p.motion; seq_t=seq.t) - end - - sig .= transpose(sum(pre.Mxy; dims=1)) - end - - #Mxy precession and relaxation, and Mz relaxation - M.z .= M.z .* exp.(-seq_block.dur ./ p.T1) .+ p.ρ .* (T(1) .- exp.(-seq_block.dur ./ p.T1)) - M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* cis.(pre.ϕ[:,end]) - - #Reset Spin-State (Magnetization). Only for FlowPath - outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ) - - return nothing -end - -function run_spin_excitation!( - p::Phantom{T}, - seq::DiscreteSequence{T}, - sig::AbstractArray{Complex{T}}, - M::Mag{T}, - sim_method::Bloch, - backend::KA.GPU, - pre::BlochGPUPrealloc -) where {T<:Real} - #Motion - x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t') - - #Effective Field - pre.Bz .= (x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz') .+ pre.ΔBz .- seq.Δf' ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z) - pre.B .= sqrt.(abs.(seq.B1') .^ 2 .+ abs.(pre.Bz) .^ 2) - - #Spinor Rotation - pre.φ .= T(-π .* γ) .* (pre.B[:,1:end-1] .* seq.Δt') # TODO: Use trapezoidal integration here (?), this is just Forward Euler - - #Relaxation - pre.ΔT1 .= exp.(-seq.Δt' ./ p.T1) - pre.ΔT2 .= exp.(-seq.Δt' ./ p.T2) - - #Excitation - apply_excitation!(backend, 512)(M.xy, M.z, pre.φ, seq.B1, pre.Bz, pre.B, pre.ΔT1, pre.ΔT2, p.ρ, ndrange=size(M.xy,1)) - KA.synchronize(backend) - - #Reset Spin-State (Magnetization). Only for FlowPath - outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ) # TODO: reset state inside kernel - - return nothing -end \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl deleted file mode 100644 index a3e82e513..000000000 --- a/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl +++ /dev/null @@ -1,60 +0,0 @@ -using KernelAbstractions: @kernel, @Const, @index, @uniform, @groupsize, @localmem - -## COV_EXCL_START - -@kernel function apply_excitation!(Mxy, Mz, @Const(φ), @Const(B1), @Const(Bz), @Const(B), @Const(ΔT1), @Const(ΔT2), @Const(ρ)) - i_g = @index(Global) - i_l = @index(Local) - - @uniform T = eltype(φ) - @uniform N = @groupsize()[1] - @uniform N_Δt = size(φ, 2) - - s_α_r = @localmem T (N,) - s_α_i = @localmem T (N,) - s_β_i = @localmem T (N,) - s_β_r = @localmem T (N,) - s_Mxy_r = @localmem T (N,) - s_Mxy_i = @localmem T (N,) - s_Mxy_new_r = @localmem T (N,) - s_Mxy_new_i = @localmem T (N,) - s_Mz = @localmem T (N,) - s_Mz_new = @localmem T (N,) - s_ρ = @localmem T (N,) - - @inbounds s_Mxy_r[i_l] = real(Mxy[i_g]) - @inbounds s_Mxy_i[i_l] = imag(Mxy[i_g]) - @inbounds s_Mz[i_l] = Mz[i_g] - @inbounds s_ρ[i_l] = ρ[i_g] - - @inbounds for t = 1 : N_Δt - sin_φ, cos_φ = sincos(φ[i_g, t]) - s_α_r[i_l] = cos_φ - if (iszero(B[i_g, t])) - s_α_i[i_l] = -(Bz[i_g, t] / (B[i_g, t] + eps(T))) * sin_φ - s_β_r[i_l] = (imag(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ - s_β_i[i_l] = -(real(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ - else - s_α_i[i_l] = -(Bz[i_g, t] / B[i_g, t]) * sin_φ - s_β_r[i_l] = (imag(B1[t]) / B[i_g, t]) * sin_φ - s_β_i[i_l] = -(real(B1[t]) / B[i_g, t]) * sin_φ - end - s_Mxy_new_r[i_l] = 2 * (s_Mxy_i[i_l] * (s_α_r[i_l] * s_α_i[i_l] - s_β_r[i_l] * s_β_i[i_l]) + - s_Mz[i_l] * (s_α_i[i_l] * s_β_i[i_l] + s_α_r[i_l] * s_β_r[i_l])) + - s_Mxy_r[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 - s_β_r[i_l]^2 + s_β_i[i_l]^2) - s_Mxy_new_i[i_l] = -2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_α_i[i_l] + s_β_r[i_l] * s_β_i[i_l]) - - s_Mz[i_l] * (s_α_r[i_l] * s_β_i[i_l] - s_α_i[i_l] * s_β_r[i_l])) + - s_Mxy_i[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 + s_β_r[i_l]^2 - s_β_i[i_l]^2) - s_Mz_new[i_l] = s_Mz[i_l] * (s_α_r[i_l]^2 + s_α_i[i_l]^2 - s_β_r[i_l]^2 - s_β_i[i_l]^2) - - 2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_β_r[i_l] - s_α_i[i_l] * s_β_i[i_l]) + - s_Mxy_i[i_l] * (s_α_r[i_l] * s_β_i[i_l] + s_α_i[i_l] * s_β_r[i_l])) - s_Mxy_r[i_l] = s_Mxy_new_r[i_l] * ΔT2[i_g, t] - s_Mxy_i[i_l] = s_Mxy_new_i[i_l] * ΔT2[i_g, t] - s_Mz[i_l] = s_Mz_new[i_l] * ΔT1[i_g, t] + s_ρ[i_l] * (1 - ΔT1[i_g, t]) - end - - @inbounds Mxy[i_g] = s_Mxy_r[i_l] + 1im * s_Mxy_i[i_l] - @inbounds Mz[i_g] = s_Mz[i_l] -end - -## COV_EXCL_STOP \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl similarity index 98% rename from KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl rename to KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl index ddd74ae7a..6527f9e2a 100644 --- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl +++ b/KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl @@ -20,7 +20,7 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin end """Preallocates arrays for use in run_spin_precession! and run_spin_excitation!.""" -function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real} +function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real} return BlochCPUPrealloc( Mag( similar(M.xy), @@ -51,6 +51,7 @@ function run_spin_precession!( sig::AbstractArray{Complex{T}}, M::Mag{T}, sim_method::Bloch, + groupsize, backend::KA.CPU, prealloc::BlochCPUPrealloc ) where {T<:Real} @@ -121,6 +122,7 @@ function run_spin_excitation!( sig::AbstractArray{Complex{T}}, M::Mag{T}, sim_method::Bloch, + groupsize, backend::KA.CPU, prealloc::BlochCPUPrealloc ) where {T<:Real} diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl new file mode 100644 index 000000000..a1f2800e5 --- /dev/null +++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl @@ -0,0 +1,78 @@ +include("KernelFunctions.jl") +include("PrecessionKernel.jl") +include("ExcitationKernel.jl") + +"""Stores preallocated arrays for use in Bloch GPU run_spin_precession! and run_spin_excitation! functions.""" +struct BlochGPUPrealloc{T} <: PreallocResult{T} + sig_output::AbstractMatrix{Complex{T}} + sig_output_final::AbstractMatrix{Complex{T}} + ΔBz::AbstractVector{T} +end + +"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!.""" +function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real} + return BlochGPUPrealloc( + KA.zeros(backend, Complex{T}, (cld(size(obj.x, 1), groupsize), max_block_length)), + KA.zeros(backend, Complex{T}, 1, max_block_length), + obj.Δw ./ T(2π .* γ) + ) +end + +function run_spin_precession!( + p::Phantom{T}, + seq::DiscreteSequence{T}, + sig::AbstractArray{Complex{T}}, + M::Mag{T}, + sim_method::Bloch, + groupsize::Integer, + backend::KA.Backend, + pre::BlochGPUPrealloc +) where {T<:Real} + #Motion + x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t') + + precession_kernel!(backend, groupsize)( + pre.sig_output, + M.xy, M.z, + x, y, z, pre.ΔBz, p.T1, p.T2, p.ρ, + seq.Gx, seq.Gy, seq.Gz, seq.Δt, seq.ADC, + Val(!(p.motion isa NoMotion)), Val(length(M.xy)), Val(groupsize), Val(next_least_power_of_two(groupsize)), Val(groupsize - next_least_power_of_two(groupsize)), + ndrange=(cld(length(M.xy), groupsize) * groupsize) + ) + + AK.reduce(+, view(pre.sig_output,:,1:length(sig)); init=zero(Complex{T}), dims=1, temp=view(pre.sig_output_final,:,1:length(sig))) + sig .= transpose(view(pre.sig_output_final,:,1:length(sig))) + + #Reset Spin-State (Magnetization). Only for FlowPath + outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ) + + return nothing +end + +function run_spin_excitation!( + p::Phantom{T}, + seq::DiscreteSequence{T}, + sig::AbstractArray{Complex{T}}, + M::Mag{T}, + sim_method::Bloch, + groupsize::Integer, + backend::KA.Backend, + pre::BlochGPUPrealloc +) where {T<:Real} + #Motion + x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t') + + #Excitation + excitation_kernel!(backend, groupsize)( + M.xy, M.z, + x, y, z, pre.ΔBz, p.T1, p.T2, p.ρ, + seq.Gx, seq.Gy, seq.Gz, seq.Δt, seq.Δf, seq.B1, + Val(!(p.motion isa NoMotion)), + ndrange=size(M.xy,1) + ) + + #Reset Spin-State (Magnetization). Only for FlowPath + outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ) # TODO: reset state inside kernel + + return nothing +end \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl new file mode 100644 index 000000000..009b29491 --- /dev/null +++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl @@ -0,0 +1,63 @@ +@kernel inbounds=true function excitation_kernel!( + M_xy::AbstractVector{Complex{T}}, M_z, + @Const(p_x), @Const(p_y), @Const(p_z), @Const(p_ΔBz), @Const(p_T1), @Const(p_T2), @Const(p_ρ), + @Const(s_Gx), @Const(s_Gy), @Const(s_Gz), @Const(s_Δt), @Const(s_Δf), @Const(s_B1), + ::Val{MOTION} +) where {T, MOTION} + + i = @index(Global) + + @uniform N_Δt = length(s_Δt) + + x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, 1) + ΔBz = p_ΔBz[i] + Mxy_r, Mxy_i = reim(M_xy[i]) + Mz = M_z[i] + ρ = p_ρ[i] + T1 = p_T1[i] + T2 = p_T2[i] + + for t = 1:N_Δt + if MOTION + x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, t) + end + + Bz = (x * s_Gx[t] + y * s_Gy[t] + z * s_Gz[t]) + ΔBz - s_Δf[t] / T(γ) + B1_r, B1_i = reim(s_B1[t]) + B = sqrt(B1_r^2 + B1_i^2 + Bz^2) + Δt = s_Δt[t] + φ = T(-π * γ) * B * Δt + sin_φ, cos_φ = sincos(φ) + α_r = cos_φ + if iszero(B) + α_i = -(Bz / (B + eps(T))) * sin_φ + β_r = (B1_i / (B + eps(T))) * sin_φ + β_i = -(B1_r / (B + eps(T))) * sin_φ + else + α_i = -(Bz / B) * sin_φ + β_r = (B1_i / B) * sin_φ + β_i = -(B1_r / B) * sin_φ + end + + Mxy_new_r = 2 * (Mxy_i * (α_r * α_i - β_r * β_i) + + Mz * (α_i * β_i + α_r * β_r)) + + Mxy_r * (α_r^2 - α_i^2 - β_r^2 + β_i^2) + + Mxy_new_i = -2 * (Mxy_r * (α_r * α_i + β_r * β_i) - + Mz * (α_r * β_i - α_i * β_r)) + + Mxy_i * (α_r^2 - α_i^2 + β_r^2 - β_i^2) + + Mz_new = Mz * (α_r^2 + α_i^2 - β_r^2 - β_i^2) - + 2 * (Mxy_r * (α_r * β_r - α_i * β_i) + + Mxy_i * (α_r * β_i + α_i * β_r)) + + ΔT1 = exp(-Δt / T1) + ΔT2 = exp(-Δt / T2) + Mxy_r = Mxy_new_r * ΔT2 + Mxy_i = Mxy_new_i * ΔT2 + Mz = Mz_new * ΔT1 + ρ * (1 - ΔT1) + end + + M_xy[i] = complex(Mxy_r, Mxy_i) + M_z[i] = Mz +end \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl new file mode 100644 index 000000000..5939e85ba --- /dev/null +++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl @@ -0,0 +1,105 @@ +using KernelAbstractions: @kernel, @Const, @index, @uniform, @localmem, @synchronize, @groupsize + +#Used for getting spin coordinates inside precession and excitation kernels +@inline function get_spin_coordinates(x::AbstractVector{T}, y::AbstractVector{T}, z::AbstractVector{T}, i::Integer, t::Integer) where {T<:Real} + (x[i], y[i], z[i]) +end +@inline function get_spin_coordinates(x::AbstractMatrix{T}, y::AbstractMatrix{T}, z::AbstractMatrix{T}, i::Integer, t::Integer) where {T<:Real} + (x[i, t], y[i, t], z[i, t]) +end + +# Returns the next least power of two starting from n, used to calculate remaining indexes in block-level reduction +@inline function next_least_power_of_two(n) + return n < 2 ? 1 : + n < 4 ? 2 : + n < 8 ? 4 : + n < 16 ? 8 : + n < 32 ? 16 : + n < 64 ? 32 : + n < 128 ? 64 : + n < 256 ? 128 : + n < 512 ? 256 : + n < 1024 ? 512 : + 1024 +end + +@inline function reduce_block!(s_data1, s_data2, i, N, N_closest, R) + if N == 1024u16 + if i <= 512u16 + s_data1[i] = s_data1[i] + s_data1[i + 512u16] + s_data2[i] = s_data2[i] + s_data2[i + 512u16] + end + elseif N != N_closest + #If workgroup size is not a power of two, need to take care of + #remaining indices before entering next if statement + if i <= R + s_data1[i] = s_data1[i] + s_data1[i + N_closest] + s_data2[i] = s_data2[i] + s_data2[i + N_closest] + end + end + @synchronize() + if N >= 512u16 + if i <= 256u16 + s_data1[i] = s_data1[i] + s_data1[i + 256u16] + s_data2[i] = s_data2[i] + s_data2[i + 256u16] + end + @synchronize() + end + if N >= 256u16 + if i <= 128u16 + s_data1[i] = s_data1[i] + s_data1[i + 128u16] + s_data2[i] = s_data2[i] + s_data2[i + 128u16] + end + @synchronize() + end + if N >= 128u16 + if i <= 64u16 + s_data1[i] = s_data1[i] + s_data1[i + 64u16] + s_data2[i] = s_data2[i] + s_data2[i + 64u16] + end + @synchronize() + end + + if N >= 64u16 + if i <= 32u16 + s_data1[i] = s_data1[i] + s_data1[i + 32u16] + s_data2[i] = s_data2[i] + s_data2[i + 32u16] + end + @synchronize() + end + if N >= 32u16 + if i <= 16u16 + s_data1[i] = s_data1[i] + s_data1[i + 16u16] + s_data2[i] = s_data2[i] + s_data2[i + 16u16] + end + @synchronize() + end + if N >= 16u16 + if i <= 8u16 + s_data1[i] = s_data1[i] + s_data1[i + 8u16] + s_data2[i] = s_data2[i] + s_data2[i + 8u16] + end + @synchronize() + end + if N >= 8u16 + if i <= 4u16 + s_data1[i] = s_data1[i] + s_data1[i + 4u16] + s_data2[i] = s_data2[i] + s_data2[i + 4u16] + end + @synchronize() + end + if N >= 4u16 + if i <= 2u16 + s_data1[i] = s_data1[i] + s_data1[i + 2u16] + s_data2[i] = s_data2[i] + s_data2[i + 2u16] + end + @synchronize() + end + if N >= 2u16 + if i <= 1u16 + s_data1[i] = s_data1[i] + s_data1[i + 1u16] + s_data2[i] = s_data2[i] + s_data2[i + 1u16] + end + @synchronize() + end +end \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl new file mode 100644 index 000000000..d0cc2430a --- /dev/null +++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl @@ -0,0 +1,91 @@ +@kernel inbounds=true function precession_kernel!( + sig_output::AbstractMatrix{Complex{T}}, + M_xy, M_z, + @Const(p_x), @Const(p_y), @Const(p_z), @Const(p_ΔBz), @Const(p_T1), @Const(p_T2), @Const(p_ρ), + @Const(s_Gx), @Const(s_Gy), @Const(s_Gz), @Const(s_Δt), @Const(s_ADC), + ::Val{MOTION}, ::Val{N_SPINS}, ::Val{GROUPSIZE}, ::Val{GROUPSIZE_CLOSEST_POWER_OF_TWO}, ::Val{GROUPSIZE_REMAINDER} +) where {T, MOTION, N_SPINS, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER} + + i = @index(Global, Linear) + i_l = @index(Local, Linear) + i_g = @index(Group, Linear) + + @uniform s_length = length(s_Δt)+1 + + sig_group_r = @localmem T GROUPSIZE + sig_group_i = @localmem T GROUPSIZE + sig_group_r[i_l] = zero(T) + sig_group_i[i_l] = zero(T) + + Mxy_r = zero(T) + Mxy_i = zero(T) + t = zero(T) + ϕ = zero(T) + ΔBz = zero(T) + T2 = zero(T) + x = zero(T) + y = zero(T) + z = zero(T) + Bz_prev = zero(T) + Bz_next = zero(T) + + if i <= N_SPINS + Mxy_r, Mxy_i = reim(M_xy[i]) + sig_group_r[i_l] = Mxy_r + sig_group_i[i_l] = Mxy_i + ΔBz = p_ΔBz[i] + T2 = p_T2[i] + x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, 1) + Bz_prev = x * s_Gx[1] + y * s_Gy[1] + z * s_Gz[1] + ΔBz + end + + ADC_idx = 1 + if s_ADC[1] + @synchronize() + reduce_block!(sig_group_r, sig_group_i, i_l, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER) + if i_l == 1u16 + sig_output[i_g, 1] = complex(sig_group_r[1u16], sig_group_i[1u16]) + end + ADC_idx += 1 + end + + for s_idx = 2:s_length + if i <= N_SPINS + if MOTION + x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, s_idx) + end + + Δt = s_Δt[s_idx-1] + t += Δt + Bz_next = x * s_Gx[s_idx] + y * s_Gy[s_idx] + z * s_Gz[s_idx] + ΔBz + ϕ += (Bz_prev + Bz_next) * T(-π * γ) * Δt + end + + if ADC_idx < s_length && s_ADC[s_idx] + if i <= N_SPINS + ΔT2 = exp(-t / T2) + cis_ϕ_i, cis_ϕ_r = sincos(ϕ) + sig_group_r[i_l] = ΔT2 * (Mxy_r * cis_ϕ_r - Mxy_i * cis_ϕ_i) + sig_group_i[i_l] = ΔT2 * (Mxy_r * cis_ϕ_i + Mxy_i * cis_ϕ_r) + end + @synchronize() + reduce_block!(sig_group_r, sig_group_i, i_l, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER) + if i_l == 1u16 + sig_output[i_g, ADC_idx] = complex(sig_group_r[1u16], sig_group_i[1u16]) + end + ADC_idx += 1 + end + + if i <= N_SPINS + Bz_prev = Bz_next + end + end + + if i <= N_SPINS + ΔT2 = exp(-t / T2) + cis_ϕ_i, cis_ϕ_r = sincos(ϕ) + M_xy[i] = complex(ΔT2 * (Mxy_r * cis_ϕ_r - Mxy_i * cis_ϕ_i), ΔT2 * (Mxy_r * cis_ϕ_i + Mxy_i * cis_ϕ_r)) + ΔT1 = exp(-t / p_T1[i]) + M_z[i] = M_z[i] * ΔT1 + p_ρ[i] * (1 - ΔT1) + end +end \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl index 2c0668d48..85366deb4 100644 --- a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl +++ b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl @@ -35,6 +35,7 @@ function run_spin_precession!( sig::AbstractArray{Complex{T}}, M::Mag{T}, sim_method::BlochDict, + groupsize, backend::KA.Backend, prealloc::PreallocResult ) where {T<:Real} diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl index 465d5b8aa..7a15fb1c3 100644 --- a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl +++ b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl @@ -27,6 +27,7 @@ function run_spin_precession!( sig::AbstractArray{Complex{T}}, M::Mag{T}, sim_method::SimulationMethod, + groupsize, backend::KA.Backend, prealloc::PreallocResult ) where {T<:Real} @@ -76,6 +77,7 @@ function run_spin_excitation!( sig::AbstractArray{Complex{T}}, M::Mag{T}, sim_method::SimulationMethod, + groupsize, backend::KA.Backend, prealloc::PreallocResult ) where {T<:Real} diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl index 8326d0a85..5414b5dd7 100644 --- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl +++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl @@ -27,25 +27,15 @@ end """Stores pre-allocated arrays for use in run_spin_precession! and run_spin_excitation!""" abstract type PreallocResult{T<:Real} end -"""Stores information precalculated before the simulation objects are moved to the GPU.""" -abstract type PrecalcResult{T<:Real} end - """Default preallocation struct, stores nothing.""" struct DefaultPrealloc{T} <: PreallocResult{T} end Base.view(p::PreallocResult, i::UnitRange) = p -prealloc_block(p::PreallocResult, i::Integer) = p - -"""Default precalculation struct, stores nothing.""" -struct DefaultPrecalc{T} <: PrecalcResult{T} end """Default preallocation function.""" -prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real} = DefaultPrealloc{T}() - -"""Default precalc function.""" -precalculate(sim_method::SimulationMethod, backend::KA.Backend, seq::DiscreteSequence{T}, parts::Vector{UnitRange{S}}, excitation_bool::Vector{Bool}) where {T<:Real,S<:Integer} = DefaultPrecalc{T}() +prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real} = DefaultPrealloc{T}() include("BlochSimple/BlochSimple.jl") -include("Bloch/BlochCPU.jl") -include("Bloch/BlochGPU.jl") +include("Bloch/cpu/BlochCPU.jl") +include("Bloch/gpu/BlochGPU.jl") include("BlochDict/BlochDict.jl") \ No newline at end of file diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl index aff8a56f7..964662356 100644 --- a/KomaMRICore/src/simulation/SimulatorCore.jl +++ b/KomaMRICore/src/simulation/SimulatorCore.jl @@ -46,6 +46,8 @@ function default_sim_params(sim_params=Dict{String,Any}()) sampling_params = KomaMRIBase.default_sampling_params() get!(sim_params, "gpu", true) get!(sim_params, "gpu_device", nothing) + get!(sim_params, "gpu_groupsize_precession", DEFAULT_PRECESSION_GROUPSIZE) + get!(sim_params, "gpu_groupsize_excitation", DEFAULT_EXCITATION_GROUPSIZE) get!(sim_params, "Nthreads", Threads.nthreads()) get!(sim_params, "Nblocks", 20) get!(sim_params, "Δt", sampling_params["Δt"]) @@ -82,6 +84,7 @@ function run_spin_precession_parallel!( sig::AbstractArray{Complex{T}}, Xt::SpinStateRepresentation{T}, sim_method::SimulationMethod, + groupsize::Integer, backend::KA.Backend, prealloc::PreallocResult; Nthreads=Threads.nthreads(), @@ -91,7 +94,7 @@ function run_spin_precession_parallel!( ThreadsX.foreach(enumerate(parts)) do (i, p) run_spin_precession!( - @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p]) + @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, groupsize, backend, @view(prealloc[p]) ) end @@ -123,6 +126,7 @@ function run_spin_excitation_parallel!( sig::AbstractArray{Complex{T}}, Xt::SpinStateRepresentation{T}, sim_method::SimulationMethod, + groupsize::Integer, backend::KA.Backend, prealloc::PreallocResult; Nthreads=Threads.nthreads(), @@ -132,7 +136,7 @@ function run_spin_excitation_parallel!( ThreadsX.foreach(enumerate(parts)) do (i, p) run_spin_excitation!( - @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p]) + @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, groupsize, backend, @view(prealloc[p]) ) end @@ -173,8 +177,9 @@ function run_sim_time_iter!( backend::KA.Backend; Nblocks=1, Nthreads=Threads.nthreads(), + precession_groupsize=DEFAULT_PRECESSION_GROUPSIZE, + excitation_groupsize=DEFAULT_EXCITATION_GROUPSIZE, parts=[1:length(seq)], - precalc=nothing, excitation_bool=ones(Bool, size(parts)), w=nothing, ) where {T<:Real} @@ -182,7 +187,7 @@ function run_sim_time_iter!( rfs = 0 samples = 1 progress_bar = Progress(Nblocks; desc="Running simulation...") - prealloc_result = prealloc(sim_method, backend, obj, Xt, maximum(length.(parts))+1, precalc) + prealloc_result = prealloc(sim_method, backend, obj, Xt, maximum(length.(parts))+1, precession_groupsize) for (block, p) in enumerate(parts) seq_block = @view seq[p] @@ -194,16 +199,17 @@ function run_sim_time_iter!( # Simulation wrappers if excitation_bool[block] run_spin_excitation_parallel!( - obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_block(prealloc_result, block); Nthreads + obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, excitation_groupsize, backend, prealloc_result; Nthreads ) rfs += 1 else run_spin_precession_parallel!( - obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_block(prealloc_result, block); Nthreads + obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, precession_groupsize, backend, prealloc_result; Nthreads ) end samples += Nadc #Update progress + KA.synchronize(backend) next!( progress_bar; showvalues=[ @@ -369,7 +375,6 @@ function simulate( Xt = Xt |> f32 #SpinStateRepresentation sig = sig |> f32 #Signal end - precalc = precalculate(sim_params["sim_method"], backend, seqd, parts, excitation_bool) # Objects to GPU if backend isa KA.GPU isnothing(sim_params["gpu_device"]) || set_device!(backend, sim_params["gpu_device"]) @@ -378,7 +383,6 @@ function simulate( seqd = seqd |> gpu #DiscreteSequence Xt = Xt |> gpu #SpinStateRepresentation sig = sig |> gpu #Signal - precalc = precalc |> gpu #Info calculated prior to simulation end # Simulation @@ -395,9 +399,10 @@ function simulate( backend; Nblocks=length(parts), Nthreads=sim_params["Nthreads"], + precession_groupsize=sim_params["gpu_groupsize_precession"], + excitation_groupsize=sim_params["gpu_groupsize_excitation"], parts, excitation_bool, - precalc, w, ) # Result to CPU, if already in the CPU it does nothing