diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml
index 82fcb819d..5cf2b5b6e 100644
--- a/KomaMRICore/Project.toml
+++ b/KomaMRICore/Project.toml
@@ -4,6 +4,7 @@ authors = ["Carlos Castillo Passi <cncastillo@uc.cl>"]
 version = "0.9.0"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -26,6 +27,7 @@ KomaoneAPIExt = "oneAPI"
 
 [compat]
 AMDGPU = "0.9, 1"
+AcceleratedKernels = "0.2.2"
 Adapt = "3, 4"
 CUDA = "3, 4, 5"
 Functors = "0.4, 0.5"
diff --git a/KomaMRICore/src/KomaMRICore.jl b/KomaMRICore/src/KomaMRICore.jl
index 3486329ed..481218596 100644
--- a/KomaMRICore/src/KomaMRICore.jl
+++ b/KomaMRICore/src/KomaMRICore.jl
@@ -3,6 +3,7 @@ module KomaMRICore
 # General
 import Base.*, Base.abs
 import KernelAbstractions as KA
+import AcceleratedKernels as AK
 using Reexport
 using ThreadsX
 # Printing
diff --git a/KomaMRICore/src/simulation/GPUFunctions.jl b/KomaMRICore/src/simulation/GPUFunctions.jl
index 75e63dbb1..bffbae9be 100644
--- a/KomaMRICore/src/simulation/GPUFunctions.jl
+++ b/KomaMRICore/src/simulation/GPUFunctions.jl
@@ -1,5 +1,7 @@
 const LOADED_BACKENDS = Ref{Vector{KA.GPU}}([])
 const BACKEND = Ref{Union{KA.Backend,Nothing}}(nothing)
+const DEFAULT_PRECESSION_GROUPSIZE = 32
+const DEFAULT_EXCITATION_GROUPSIZE = 256
 
 device_name(backend) = @error "device_name called with invalid backend type $(typeof(backend))"
 isfunctional(::KA.CPU) = true
@@ -10,9 +12,12 @@ name(::KA.CPU) = "CPU"
 set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'" 
 set_device!(val) = set_device!(get_backend(true), val)
 
-#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
-#for now we use a custom function for each backend to implement
-function _cis end
+#Copied from AcceleratedKernels.jl: https://github.com/JuliaGPU/AcceleratedKernels.jl/blob/00a9c0e7a36d6e02a51dbd13032a9165caab7909/src/utils.jl#L34
+struct TypeWrap{T} end
+TypeWrap(T) = TypeWrap{T}()
+Base.:*(x::Number, ::TypeWrap{T}) where T = T(x)
+
+const u16 = TypeWrap(UInt16)
 
 """
     get_backend(use_gpu)
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
deleted file mode 100644
index e3d10ab27..000000000
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
+++ /dev/null
@@ -1,206 +0,0 @@
-include("KernelFunctions.jl")
-
-"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
-to calculate them on the CPU before the simulation is run."""
-struct SeqBlockProperties{T<:Real}
-    length::Int64
-    nADC::Int64
-    first_ADC::Bool
-    ADC_indices::AbstractVector{Int64}
-    tp_ADC::AbstractVector{T}
-    dur::T
-end
-
-@functor SeqBlockProperties
-
-"""Stores information added to the prealloc struct for use in run_spin_precession!
-and run_spin_excitation!. This information is calculated on the CPU before the
-simulation is run."""
-struct BlochGPUPrecalc{T} <: PrecalcResult{T}
-    seq_properties::AbstractVector{SeqBlockProperties{T}}
-end
-
-@functor BlochGPUPrecalc
-
-"""Precalculates sequence properties for use in run_spin_precession!"""
-function precalculate(
-    sim_method::Bloch, 
-    backend::KA.GPU,
-    seq::DiscreteSequence{T},
-    parts::Vector{UnitRange{S}}, 
-    excitation_bool::Vector{Bool}
-) where {T<:Real,S<:Integer}
-    seq_properties = SeqBlockProperties{T}[]
-    for (block, p) in enumerate(parts)
-        seq_block = @view seq[p]
-        if excitation_bool[block]
-            push!(seq_properties, SeqBlockProperties(
-                length(seq_block.t),
-                count(seq_block.ADC),
-                false,
-                Int64[],
-                T[],
-                zero(T)
-            ))
-        else
-            ADC_indices = findall(seq_block.ADC) .- 1
-            if seq_block.ADC[1]
-                deleteat!(ADC_indices, 1)
-            end
-            tp = cumsum(seq_block.Δt)
-            push!(seq_properties, SeqBlockProperties(
-                length(seq_block.t),
-                count(seq_block.ADC),
-                seq_block.ADC[1],
-                ADC_indices,
-                tp[ADC_indices],
-                last(tp)
-            ))
-        end
-    end
-
-    return BlochGPUPrecalc(seq_properties)
-end
-
-"""Stores preallocated structs for use in Bloch GPU run_spin_precession function."""
-struct BlochGPUPrealloc{T} <: PreallocResult{T}
-    seq_properties::AbstractVector{SeqBlockProperties{T}}
-    Bz::AbstractMatrix{T}
-    B::AbstractMatrix{T}
-    φ::AbstractMatrix{T}
-    ΔT1::AbstractMatrix{T}
-    ΔT2::AbstractMatrix{T}
-    Δϕ::AbstractMatrix{T}
-    ϕ::AbstractArray{T}
-    Mxy::AbstractMatrix{Complex{T}}
-    ΔBz::AbstractVector{T}
-end
-
-Base.view(p::BlochGPUPrealloc{T}, i::UnitRange) where {T<:Real} = p
-function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
-    seq_block = p.seq_properties[i]
-
-    return BlochGPUPrealloc(
-        [seq_block],
-        view(p.Bz,:,1:seq_block.length),
-        view(p.B,:,1:seq_block.length),
-        view(p.φ,:,1:seq_block.length-1),
-        view(p.ΔT1,:,1:seq_block.length-1),
-        view(p.ΔT2,:,1:seq_block.length-1),
-        view(p.Δϕ,:,1:seq_block.length-1),
-        seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
-        view(p.Mxy,:,1:seq_block.nADC),
-        p.ΔBz
-    )
-end
-
-"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
-function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
-    if !(precalc isa BlochGPUPrecalc) @error """Sim method Bloch() does not support calling run_sim_time_iter directly. Use method BlochSimple() instead.""" end
-
-    return BlochGPUPrealloc(
-        precalc.seq_properties,
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
-        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
-        KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
-        obj.Δw ./ T(2π .* γ)
-    )
-end
-
-@inline function calculate_precession!(Δϕ::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real}
-    Δϕ .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ)
-end
-@inline function apply_precession!(ϕ::AbstractVector{T}, Δϕ::AbstractArray{T}) where {T<:Real}
-    ϕ .= sum(Δϕ, dims=2)
-end
-function apply_precession!(ϕ::AbstractArray{T}, Δϕ::AbstractArray{T}) where {T<:Real}
-    cumsum!(ϕ, Δϕ, dims=2)
-end
-
-function run_spin_precession!(
-    p::Phantom{T},
-    seq::DiscreteSequence{T},
-    sig::AbstractArray{Complex{T}},
-    M::Mag{T},
-    sim_method::Bloch,
-    backend::KA.GPU,
-    pre::BlochGPUPrealloc
-) where {T<:Real}
-    #Simulation
-    #Motion
-    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    #Sequence block info
-    seq_block = pre.seq_properties[1]
-    
-    #Effective field
-    pre.Bz .= x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ pre.ΔBz
-
-    #Rotation
-    calculate_precession!(pre.Δϕ, seq.Δt', pre.Bz)
-    # Reduces Δϕ Nspins x Nt to ϕ Nspins x Nt, if Nadc = 0, to Nspins x 1 
-    apply_precession!(pre.ϕ, pre.Δϕ)
-
-    #Acquired signal
-    if seq_block.nADC > 0
-        ϕ_ADC = @view pre.ϕ[:,seq_block.ADC_indices]
-        if seq_block.first_ADC
-            pre.Mxy[:,1] .= M.xy
-            pre.Mxy[:,2:end] .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* cis.(ϕ_ADC)
-            #Reset Spin-State (Magnetization). Only for FlowPath
-            outflow_spin_reset!(pre.Mxy, seq_block.tp_ADC', p.motion; seq_t=seq.t, add_t0=true)
-        else
-            pre.Mxy .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* cis.(ϕ_ADC)
-            #Reset Spin-State (Magnetization). Only for FlowPath
-            outflow_spin_reset!(pre.Mxy, seq_block.tp_ADC', p.motion; seq_t=seq.t)
-        end
-
-        sig .= transpose(sum(pre.Mxy; dims=1))
-    end
-    
-    #Mxy precession and relaxation, and Mz relaxation
-    M.z  .= M.z .* exp.(-seq_block.dur ./ p.T1) .+ p.ρ .* (T(1) .- exp.(-seq_block.dur ./ p.T1))
-    M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* cis.(pre.ϕ[:,end])
-
-    #Reset Spin-State (Magnetization). Only for FlowPath
-    outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ)
-
-    return nothing
-end
-
-function run_spin_excitation!(
-    p::Phantom{T},
-    seq::DiscreteSequence{T},
-    sig::AbstractArray{Complex{T}},
-    M::Mag{T},
-    sim_method::Bloch,
-    backend::KA.GPU,
-    pre::BlochGPUPrealloc
-) where {T<:Real}
-    #Motion
-    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-
-    #Effective Field
-    pre.Bz .= (x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz') .+ pre.ΔBz .- seq.Δf' ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
-    pre.B .= sqrt.(abs.(seq.B1') .^ 2 .+ abs.(pre.Bz) .^ 2)
-    
-    #Spinor Rotation
-    pre.φ .= T(-π .* γ) .* (pre.B[:,1:end-1] .* seq.Δt') # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
-    
-    #Relaxation
-    pre.ΔT1 .= exp.(-seq.Δt' ./ p.T1)
-    pre.ΔT2 .= exp.(-seq.Δt' ./ p.T2)
-
-    #Excitation
-    apply_excitation!(backend, 512)(M.xy, M.z, pre.φ, seq.B1, pre.Bz, pre.B, pre.ΔT1, pre.ΔT2, p.ρ, ndrange=size(M.xy,1))
-    KA.synchronize(backend)
-
-    #Reset Spin-State (Magnetization). Only for FlowPath
-    outflow_spin_reset!(M,  seq.t', p.motion; replace_by=p.ρ) # TODO: reset state inside kernel
-
-    return nothing
-end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl
deleted file mode 100644
index a3e82e513..000000000
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl
+++ /dev/null
@@ -1,60 +0,0 @@
-using KernelAbstractions: @kernel, @Const, @index, @uniform, @groupsize, @localmem
-
-## COV_EXCL_START
-
-@kernel function apply_excitation!(Mxy, Mz, @Const(φ), @Const(B1), @Const(Bz), @Const(B), @Const(ΔT1), @Const(ΔT2), @Const(ρ))
-    i_g = @index(Global)
-    i_l = @index(Local)
-
-    @uniform T = eltype(φ)
-    @uniform N = @groupsize()[1]
-    @uniform N_Δt = size(φ, 2)
-
-    s_α_r = @localmem T (N,)
-    s_α_i = @localmem T (N,)
-    s_β_i = @localmem T (N,)
-    s_β_r = @localmem T (N,)
-    s_Mxy_r = @localmem T (N,)
-    s_Mxy_i = @localmem T (N,)
-    s_Mxy_new_r = @localmem T (N,)
-    s_Mxy_new_i = @localmem T (N,)
-    s_Mz = @localmem T (N,)
-    s_Mz_new = @localmem T (N,)
-    s_ρ = @localmem T (N,)
-
-    @inbounds s_Mxy_r[i_l] = real(Mxy[i_g])
-    @inbounds s_Mxy_i[i_l]  = imag(Mxy[i_g])
-    @inbounds s_Mz[i_l] = Mz[i_g]
-    @inbounds s_ρ[i_l] = ρ[i_g]
-
-    @inbounds for t = 1 : N_Δt
-        sin_φ, cos_φ = sincos(φ[i_g, t])
-        s_α_r[i_l] = cos_φ
-        if (iszero(B[i_g, t]))
-            s_α_i[i_l] = -(Bz[i_g, t] / (B[i_g, t] + eps(T))) * sin_φ
-            s_β_r[i_l] = (imag(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
-            s_β_i[i_l] = -(real(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
-        else
-            s_α_i[i_l] = -(Bz[i_g, t] / B[i_g, t]) * sin_φ
-            s_β_r[i_l] = (imag(B1[t]) / B[i_g, t]) * sin_φ
-            s_β_i[i_l] = -(real(B1[t]) / B[i_g, t]) * sin_φ
-        end
-            s_Mxy_new_r[i_l] = 2 * (s_Mxy_i[i_l] * (s_α_r[i_l] * s_α_i[i_l] - s_β_r[i_l] * s_β_i[i_l]) +
-                                s_Mz[i_l] * (s_α_i[i_l] * s_β_i[i_l] + s_α_r[i_l] * s_β_r[i_l])) +
-                                s_Mxy_r[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 - s_β_r[i_l]^2 + s_β_i[i_l]^2)
-            s_Mxy_new_i[i_l] = -2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_α_i[i_l] + s_β_r[i_l] * s_β_i[i_l]) -
-                                s_Mz[i_l] * (s_α_r[i_l] * s_β_i[i_l] - s_α_i[i_l] * s_β_r[i_l])) +
-                                s_Mxy_i[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 + s_β_r[i_l]^2 - s_β_i[i_l]^2)
-            s_Mz_new[i_l] = s_Mz[i_l] * (s_α_r[i_l]^2 + s_α_i[i_l]^2 - s_β_r[i_l]^2 - s_β_i[i_l]^2) -
-                            2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_β_r[i_l] - s_α_i[i_l] * s_β_i[i_l]) +
-                            s_Mxy_i[i_l] * (s_α_r[i_l] * s_β_i[i_l] + s_α_i[i_l] * s_β_r[i_l]))
-        s_Mxy_r[i_l] = s_Mxy_new_r[i_l] * ΔT2[i_g, t]
-        s_Mxy_i[i_l] = s_Mxy_new_i[i_l] * ΔT2[i_g, t]
-        s_Mz[i_l] = s_Mz_new[i_l] * ΔT1[i_g, t] + s_ρ[i_l] * (1 - ΔT1[i_g, t])
-    end
-
-    @inbounds Mxy[i_g] = s_Mxy_r[i_l] + 1im * s_Mxy_i[i_l]
-    @inbounds Mz[i_g] = s_Mz[i_l]
-end
-
-## COV_EXCL_STOP
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl
similarity index 98%
rename from KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
rename to KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl
index ddd74ae7a..6527f9e2a 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/cpu/BlochCPU.jl
@@ -20,7 +20,7 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
 end
 
 """Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
-function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
+function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real}
     return BlochCPUPrealloc(
         Mag(
             similar(M.xy),
@@ -51,6 +51,7 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::Bloch,
+    groupsize,
     backend::KA.CPU,
     prealloc::BlochCPUPrealloc
 ) where {T<:Real}
@@ -121,6 +122,7 @@ function run_spin_excitation!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::Bloch,
+    groupsize,
     backend::KA.CPU,
     prealloc::BlochCPUPrealloc
 ) where {T<:Real}
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl
new file mode 100644
index 000000000..a1f2800e5
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/BlochGPU.jl
@@ -0,0 +1,78 @@
+include("KernelFunctions.jl")
+include("PrecessionKernel.jl")
+include("ExcitationKernel.jl")
+
+"""Stores preallocated arrays for use in Bloch GPU run_spin_precession! and run_spin_excitation! functions."""
+struct BlochGPUPrealloc{T} <: PreallocResult{T}
+    sig_output::AbstractMatrix{Complex{T}}
+    sig_output_final::AbstractMatrix{Complex{T}}
+    ΔBz::AbstractVector{T}
+end
+
+"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
+function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real}
+    return BlochGPUPrealloc(
+        KA.zeros(backend, Complex{T}, (cld(size(obj.x, 1), groupsize), max_block_length)),
+        KA.zeros(backend, Complex{T}, 1, max_block_length),
+        obj.Δw ./ T(2π .* γ)
+    )
+end
+
+function run_spin_precession!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    groupsize::Integer,
+    backend::KA.Backend,
+    pre::BlochGPUPrealloc
+) where {T<:Real}
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+
+    precession_kernel!(backend, groupsize)(
+        pre.sig_output, 
+        M.xy, M.z, 
+        x, y, z, pre.ΔBz, p.T1, p.T2, p.ρ, 
+        seq.Gx, seq.Gy, seq.Gz, seq.Δt, seq.ADC, 
+        Val(!(p.motion isa NoMotion)), Val(length(M.xy)), Val(groupsize), Val(next_least_power_of_two(groupsize)), Val(groupsize - next_least_power_of_two(groupsize)), 
+        ndrange=(cld(length(M.xy), groupsize) * groupsize)
+    )
+
+    AK.reduce(+, view(pre.sig_output,:,1:length(sig)); init=zero(Complex{T}), dims=1, temp=view(pre.sig_output_final,:,1:length(sig)))
+    sig .= transpose(view(pre.sig_output_final,:,1:length(sig)))
+
+    #Reset Spin-State (Magnetization). Only for FlowPath
+    outflow_spin_reset!(M, seq.t', p.motion; replace_by=p.ρ)
+
+    return nothing
+end
+
+function run_spin_excitation!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    groupsize::Integer,
+    backend::KA.Backend,
+    pre::BlochGPUPrealloc
+) where {T<:Real}
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+
+    #Excitation
+    excitation_kernel!(backend, groupsize)(
+        M.xy, M.z, 
+        x, y, z, pre.ΔBz, p.T1, p.T2, p.ρ, 
+        seq.Gx, seq.Gy, seq.Gz, seq.Δt, seq.Δf, seq.B1, 
+        Val(!(p.motion isa NoMotion)), 
+        ndrange=size(M.xy,1)
+    )
+
+    #Reset Spin-State (Magnetization). Only for FlowPath
+    outflow_spin_reset!(M,  seq.t', p.motion; replace_by=p.ρ) # TODO: reset state inside kernel
+
+    return nothing
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl
new file mode 100644
index 000000000..009b29491
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/ExcitationKernel.jl
@@ -0,0 +1,63 @@
+@kernel inbounds=true function excitation_kernel!(
+    M_xy::AbstractVector{Complex{T}}, M_z, 
+    @Const(p_x), @Const(p_y), @Const(p_z), @Const(p_ΔBz), @Const(p_T1), @Const(p_T2), @Const(p_ρ), 
+    @Const(s_Gx), @Const(s_Gy), @Const(s_Gz), @Const(s_Δt), @Const(s_Δf), @Const(s_B1), 
+    ::Val{MOTION}
+) where {T, MOTION}
+    
+    i = @index(Global)
+    
+    @uniform N_Δt = length(s_Δt)
+
+    x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, 1)
+    ΔBz = p_ΔBz[i]
+    Mxy_r, Mxy_i = reim(M_xy[i])
+    Mz = M_z[i]
+    ρ = p_ρ[i]
+    T1 = p_T1[i]
+    T2 = p_T2[i]
+
+    for t = 1:N_Δt
+        if MOTION
+            x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, t)
+        end
+        
+        Bz = (x * s_Gx[t] + y * s_Gy[t] + z * s_Gz[t]) + ΔBz - s_Δf[t] / T(γ)
+        B1_r, B1_i = reim(s_B1[t])
+        B = sqrt(B1_r^2 + B1_i^2 + Bz^2)
+        Δt = s_Δt[t]
+        φ = T(-π * γ) * B * Δt
+        sin_φ, cos_φ = sincos(φ)
+        α_r = cos_φ
+        if iszero(B)
+            α_i = -(Bz / (B + eps(T))) * sin_φ
+            β_r = (B1_i / (B + eps(T))) * sin_φ
+            β_i = -(B1_r / (B + eps(T))) * sin_φ
+        else
+            α_i = -(Bz / B) * sin_φ
+            β_r = (B1_i / B) * sin_φ
+            β_i = -(B1_r / B) * sin_φ
+        end
+
+        Mxy_new_r = 2 * (Mxy_i * (α_r * α_i - β_r * β_i) +
+                    Mz * (α_i * β_i + α_r * β_r)) +
+                    Mxy_r * (α_r^2 - α_i^2 - β_r^2 + β_i^2)
+        
+        Mxy_new_i = -2 * (Mxy_r * (α_r * α_i + β_r * β_i) -
+                    Mz * (α_r * β_i - α_i * β_r)) +
+                    Mxy_i * (α_r^2 - α_i^2 + β_r^2 - β_i^2)
+        
+        Mz_new =    Mz * (α_r^2 + α_i^2 - β_r^2 - β_i^2) -
+                    2 * (Mxy_r * (α_r * β_r - α_i * β_i) +
+                    Mxy_i * (α_r * β_i + α_i * β_r))
+        
+        ΔT1 = exp(-Δt / T1)
+        ΔT2 = exp(-Δt / T2)
+        Mxy_r = Mxy_new_r * ΔT2
+        Mxy_i = Mxy_new_i * ΔT2
+        Mz = Mz_new * ΔT1 + ρ * (1 - ΔT1)
+    end
+
+    M_xy[i] = complex(Mxy_r, Mxy_i)
+    M_z[i] = Mz
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl
new file mode 100644
index 000000000..5939e85ba
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/KernelFunctions.jl
@@ -0,0 +1,105 @@
+using KernelAbstractions: @kernel, @Const, @index, @uniform, @localmem, @synchronize, @groupsize
+
+#Used for getting spin coordinates inside precession and excitation kernels
+@inline function get_spin_coordinates(x::AbstractVector{T}, y::AbstractVector{T}, z::AbstractVector{T}, i::Integer, t::Integer) where {T<:Real} 
+    (x[i], y[i], z[i]) 
+end
+@inline function get_spin_coordinates(x::AbstractMatrix{T}, y::AbstractMatrix{T}, z::AbstractMatrix{T}, i::Integer, t::Integer) where {T<:Real} 
+    (x[i, t], y[i, t], z[i, t]) 
+end
+
+# Returns the next least power of two starting from n, used to calculate remaining indexes in block-level reduction
+@inline function next_least_power_of_two(n)
+    return  n < 2 ? 1 :
+            n < 4 ? 2 :
+            n < 8 ? 4 :
+            n < 16 ? 8 :
+            n < 32 ? 16 :
+            n < 64 ? 32 :
+            n < 128 ? 64 :
+            n < 256 ? 128 :
+            n < 512 ? 256 :
+            n < 1024 ? 512 :
+            1024
+end
+
+@inline function reduce_block!(s_data1, s_data2, i, N, N_closest, R)
+    if N == 1024u16
+        if i <= 512u16
+            s_data1[i] = s_data1[i] + s_data1[i + 512u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 512u16]
+        end
+    elseif N != N_closest
+        #If workgroup size is not a power of two, need to take care of
+        #remaining indices before entering next if statement
+        if i <= R
+            s_data1[i] = s_data1[i] + s_data1[i + N_closest]
+            s_data2[i] = s_data2[i] + s_data2[i + N_closest]
+        end
+    end
+    @synchronize()
+    if N >= 512u16
+        if i <= 256u16
+            s_data1[i] = s_data1[i] + s_data1[i + 256u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 256u16]
+        end
+        @synchronize()
+    end
+    if N >= 256u16
+        if i <= 128u16
+            s_data1[i] = s_data1[i] + s_data1[i + 128u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 128u16]
+        end
+        @synchronize()
+    end
+    if N >= 128u16
+        if i <= 64u16
+            s_data1[i] = s_data1[i] + s_data1[i + 64u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 64u16]
+        end
+        @synchronize()
+    end
+
+    if N >= 64u16
+        if i <= 32u16
+            s_data1[i] = s_data1[i] + s_data1[i + 32u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 32u16]
+        end
+        @synchronize()
+    end
+    if N >= 32u16
+        if i <= 16u16
+            s_data1[i] = s_data1[i] + s_data1[i + 16u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 16u16]
+        end
+        @synchronize()
+    end
+    if N >= 16u16
+        if i <= 8u16
+            s_data1[i] = s_data1[i] + s_data1[i + 8u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 8u16]
+        end
+        @synchronize()
+    end
+    if N >= 8u16
+        if i <= 4u16
+            s_data1[i] = s_data1[i] + s_data1[i + 4u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 4u16]
+        end
+        @synchronize()
+    end
+    if N >= 4u16
+        if i <= 2u16
+            s_data1[i] = s_data1[i] + s_data1[i + 2u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 2u16]
+        end
+        @synchronize()
+    end
+    if N >= 2u16
+        if i <= 1u16
+            s_data1[i] = s_data1[i] + s_data1[i + 1u16]
+            s_data2[i] = s_data2[i] + s_data2[i + 1u16]
+        end
+        @synchronize()
+    end
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl
new file mode 100644
index 000000000..d0cc2430a
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/gpu/PrecessionKernel.jl
@@ -0,0 +1,91 @@
+@kernel inbounds=true function precession_kernel!(
+    sig_output::AbstractMatrix{Complex{T}}, 
+    M_xy, M_z, 
+    @Const(p_x), @Const(p_y), @Const(p_z), @Const(p_ΔBz), @Const(p_T1), @Const(p_T2), @Const(p_ρ), 
+    @Const(s_Gx), @Const(s_Gy), @Const(s_Gz), @Const(s_Δt), @Const(s_ADC), 
+    ::Val{MOTION}, ::Val{N_SPINS}, ::Val{GROUPSIZE}, ::Val{GROUPSIZE_CLOSEST_POWER_OF_TWO}, ::Val{GROUPSIZE_REMAINDER}
+) where {T, MOTION, N_SPINS, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER}
+
+    i = @index(Global, Linear)
+    i_l = @index(Local, Linear)
+    i_g = @index(Group, Linear)
+
+    @uniform s_length = length(s_Δt)+1
+
+    sig_group_r = @localmem T GROUPSIZE
+    sig_group_i = @localmem T GROUPSIZE
+    sig_group_r[i_l] = zero(T)
+    sig_group_i[i_l] = zero(T)
+    
+    Mxy_r = zero(T)
+    Mxy_i = zero(T)
+    t = zero(T)
+    ϕ = zero(T)
+    ΔBz = zero(T)
+    T2 = zero(T)
+    x = zero(T)
+    y = zero(T)
+    z = zero(T)
+    Bz_prev = zero(T)
+    Bz_next = zero(T)
+
+    if i <= N_SPINS
+        Mxy_r, Mxy_i = reim(M_xy[i])
+        sig_group_r[i_l] = Mxy_r
+        sig_group_i[i_l] = Mxy_i
+        ΔBz = p_ΔBz[i]
+        T2 = p_T2[i]
+        x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, 1)
+        Bz_prev = x * s_Gx[1] + y * s_Gy[1] + z * s_Gz[1] + ΔBz
+    end
+
+    ADC_idx = 1 
+    if s_ADC[1]
+        @synchronize()
+        reduce_block!(sig_group_r, sig_group_i, i_l, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER)
+        if i_l == 1u16
+            sig_output[i_g, 1] = complex(sig_group_r[1u16], sig_group_i[1u16])
+        end
+        ADC_idx += 1
+    end
+
+    for s_idx = 2:s_length
+        if i <= N_SPINS
+            if MOTION
+                x, y, z = get_spin_coordinates(p_x, p_y, p_z, i, s_idx)
+            end
+
+            Δt = s_Δt[s_idx-1]
+            t += Δt
+            Bz_next = x * s_Gx[s_idx] + y * s_Gy[s_idx] + z * s_Gz[s_idx] + ΔBz
+            ϕ += (Bz_prev + Bz_next) * T(-π * γ) * Δt
+        end
+
+        if ADC_idx < s_length && s_ADC[s_idx]
+            if i <= N_SPINS
+                ΔT2 = exp(-t / T2)
+                cis_ϕ_i, cis_ϕ_r = sincos(ϕ)
+                sig_group_r[i_l] = ΔT2 * (Mxy_r * cis_ϕ_r - Mxy_i * cis_ϕ_i)
+                sig_group_i[i_l] = ΔT2 * (Mxy_r * cis_ϕ_i + Mxy_i * cis_ϕ_r)
+            end
+            @synchronize()
+            reduce_block!(sig_group_r, sig_group_i, i_l, GROUPSIZE, GROUPSIZE_CLOSEST_POWER_OF_TWO, GROUPSIZE_REMAINDER)
+            if i_l == 1u16
+                sig_output[i_g, ADC_idx] = complex(sig_group_r[1u16], sig_group_i[1u16]) 
+            end
+            ADC_idx += 1
+        end
+
+        if i <= N_SPINS
+            Bz_prev = Bz_next
+        end
+    end
+
+    if i <= N_SPINS
+        ΔT2 = exp(-t / T2)
+        cis_ϕ_i, cis_ϕ_r = sincos(ϕ)
+        M_xy[i] = complex(ΔT2 * (Mxy_r * cis_ϕ_r - Mxy_i * cis_ϕ_i), ΔT2 * (Mxy_r * cis_ϕ_i + Mxy_i * cis_ϕ_r))
+        ΔT1 = exp(-t / p_T1[i])
+        M_z[i] = M_z[i] * ΔT1 + p_ρ[i] * (1 - ΔT1)
+    end
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
index 2c0668d48..85366deb4 100644
--- a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
+++ b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
@@ -35,6 +35,7 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::BlochDict,
+    groupsize,
     backend::KA.Backend,
     prealloc::PreallocResult
 ) where {T<:Real}
diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
index 465d5b8aa..7a15fb1c3 100644
--- a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
+++ b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
@@ -27,6 +27,7 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::SimulationMethod,
+    groupsize,
     backend::KA.Backend,
     prealloc::PreallocResult
 ) where {T<:Real}
@@ -76,6 +77,7 @@ function run_spin_excitation!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::SimulationMethod,
+    groupsize,
     backend::KA.Backend,
     prealloc::PreallocResult
 ) where {T<:Real}
diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index 8326d0a85..5414b5dd7 100644
--- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -27,25 +27,15 @@ end
 """Stores pre-allocated arrays for use in run_spin_precession! and run_spin_excitation!"""
 abstract type PreallocResult{T<:Real} end
 
-"""Stores information precalculated before the simulation objects are moved to the GPU."""
-abstract type PrecalcResult{T<:Real} end
-
 """Default preallocation struct, stores nothing."""
 struct DefaultPrealloc{T} <: PreallocResult{T} end
 
 Base.view(p::PreallocResult, i::UnitRange) = p
-prealloc_block(p::PreallocResult, i::Integer) = p
-
-"""Default precalculation struct, stores nothing."""
-struct DefaultPrecalc{T} <: PrecalcResult{T} end
 
 """Default preallocation function."""
-prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real} = DefaultPrealloc{T}()
-
-"""Default precalc function."""
-precalculate(sim_method::SimulationMethod, backend::KA.Backend, seq::DiscreteSequence{T}, parts::Vector{UnitRange{S}}, excitation_bool::Vector{Bool}) where {T<:Real,S<:Integer} = DefaultPrecalc{T}()
+prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, groupsize) where {T<:Real} = DefaultPrealloc{T}()
 
 include("BlochSimple/BlochSimple.jl")
-include("Bloch/BlochCPU.jl")
-include("Bloch/BlochGPU.jl")
+include("Bloch/cpu/BlochCPU.jl")
+include("Bloch/gpu/BlochGPU.jl")
 include("BlochDict/BlochDict.jl")
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index aff8a56f7..964662356 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -46,6 +46,8 @@ function default_sim_params(sim_params=Dict{String,Any}())
     sampling_params = KomaMRIBase.default_sampling_params()
     get!(sim_params, "gpu", true)
     get!(sim_params, "gpu_device", nothing)
+    get!(sim_params, "gpu_groupsize_precession", DEFAULT_PRECESSION_GROUPSIZE)
+    get!(sim_params, "gpu_groupsize_excitation", DEFAULT_EXCITATION_GROUPSIZE)
     get!(sim_params, "Nthreads", Threads.nthreads())
     get!(sim_params, "Nblocks", 20)
     get!(sim_params, "Δt", sampling_params["Δt"])
@@ -82,6 +84,7 @@ function run_spin_precession_parallel!(
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
     sim_method::SimulationMethod,
+    groupsize::Integer,
     backend::KA.Backend,
     prealloc::PreallocResult;
     Nthreads=Threads.nthreads(),
@@ -91,7 +94,7 @@ function run_spin_precession_parallel!(
 
     ThreadsX.foreach(enumerate(parts)) do (i, p)
         run_spin_precession!(
-            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p])
+            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, groupsize, backend, @view(prealloc[p])
         )
     end
 
@@ -123,6 +126,7 @@ function run_spin_excitation_parallel!(
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
     sim_method::SimulationMethod,
+    groupsize::Integer,
     backend::KA.Backend,
     prealloc::PreallocResult;
     Nthreads=Threads.nthreads(),
@@ -132,7 +136,7 @@ function run_spin_excitation_parallel!(
 
     ThreadsX.foreach(enumerate(parts)) do (i, p)
         run_spin_excitation!(
-            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p])
+            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, groupsize, backend, @view(prealloc[p])
         )
     end
 
@@ -173,8 +177,9 @@ function run_sim_time_iter!(
     backend::KA.Backend;
     Nblocks=1,
     Nthreads=Threads.nthreads(),
+    precession_groupsize=DEFAULT_PRECESSION_GROUPSIZE,
+    excitation_groupsize=DEFAULT_EXCITATION_GROUPSIZE,
     parts=[1:length(seq)],
-    precalc=nothing,
     excitation_bool=ones(Bool, size(parts)),
     w=nothing,
 ) where {T<:Real}
@@ -182,7 +187,7 @@ function run_sim_time_iter!(
     rfs = 0
     samples = 1
     progress_bar = Progress(Nblocks; desc="Running simulation...")
-    prealloc_result = prealloc(sim_method, backend, obj, Xt, maximum(length.(parts))+1, precalc)
+    prealloc_result = prealloc(sim_method, backend, obj, Xt, maximum(length.(parts))+1, precession_groupsize)
 
     for (block, p) in enumerate(parts)
         seq_block = @view seq[p]
@@ -194,16 +199,17 @@ function run_sim_time_iter!(
         # Simulation wrappers
         if excitation_bool[block]
             run_spin_excitation_parallel!(
-                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_block(prealloc_result, block); Nthreads
+                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, excitation_groupsize, backend, prealloc_result; Nthreads
             )
             rfs += 1
         else
             run_spin_precession_parallel!(
-                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_block(prealloc_result, block); Nthreads
+                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, precession_groupsize, backend, prealloc_result; Nthreads
             )
         end
         samples += Nadc
         #Update progress
+        KA.synchronize(backend)
         next!(
             progress_bar;
             showvalues=[
@@ -369,7 +375,6 @@ function simulate(
         Xt   = Xt |> f32 #SpinStateRepresentation
         sig  = sig |> f32 #Signal
     end
-    precalc = precalculate(sim_params["sim_method"], backend, seqd, parts, excitation_bool)
     # Objects to GPU
     if backend isa KA.GPU
         isnothing(sim_params["gpu_device"]) || set_device!(backend, sim_params["gpu_device"])
@@ -378,7 +383,6 @@ function simulate(
         seqd = seqd |> gpu #DiscreteSequence
         Xt = Xt |> gpu #SpinStateRepresentation
         sig = sig |> gpu #Signal
-        precalc = precalc |> gpu #Info calculated prior to simulation
     end
 
     # Simulation
@@ -395,9 +399,10 @@ function simulate(
         backend;
         Nblocks=length(parts),
         Nthreads=sim_params["Nthreads"],
+        precession_groupsize=sim_params["gpu_groupsize_precession"],
+        excitation_groupsize=sim_params["gpu_groupsize_excitation"],
         parts,
         excitation_bool,
-        precalc,
         w,
     )
     # Result to CPU, if already in the CPU it does nothing