Commit 17b1a3a: scaling

luraess committed Oct 4, 2023
1 parent 65c5bfa
Showing 3 changed files with 25 additions and 57 deletions.
68 changes: 21 additions & 47 deletions scripts_future_API/bench3d.jl
@@ -31,44 +31,18 @@ macro d2_zi(A) esc(:( $A[ix+1, iy+1, iz+2] - $A[ix+1, iy+1, iz+1] - $A[ix+1, iy+1, iz+1] + $A[ix+1, iy+1, iz] )) end
 # end
 end
 
-function lapl!(A_new, A, h, _dx, _dy, _dz)
-    ix = (workgroupIdx().x - UInt32(1)) * workgroupDim().x + workitemIdx().x
-    iy = (workgroupIdx().y - UInt32(1)) * workgroupDim().y + workitemIdx().y
-    iz = (workgroupIdx().z - UInt32(1)) * workgroupDim().z + workitemIdx().z
-    # if ix ∈ axes(A_new, 1)[2:end-1] && iy ∈ axes(A_new, 2)[2:end-1] && iz ∈ axes(A_new, 3)[2:end-1]
-    #     @inbounds A_new[ix, iy, iz] = A[ix, iy, iz] + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    # end
-    if (ix < size(A, 1) - 2 && iy < size(A, 2) - 2 && iz < size(A, 3) - 2)
-        @inbounds @inn(A_new) = @inn(A) + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    end
-    return
-end
-
-function compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
+function compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
     (me==0) && print("Starting the time loop 🚀...")
+    MPI.Barrier(comm)
     tic = time_ns()
     for _ = 1:iters
-        # copyto!(A, A_new)
-        # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
-        # A, A_new = A_new, A
-
-        diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
-        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A) .- 2)
-        # diffusion_kernel!(backend, 256, (size(A) .- 2))(A_new, A, h, _dx, _dy, _dz, (1, 1, 1))
-        AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # A, A_new = A_new, A
+        hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
+        A, A_new = A_new, A
     end
     wtime = (time_ns() - tic) * 1e-9
     (me==0) && println("done")
     return wtime
 end
 
-function compute_roc(A_new, A, h, _dx, _dy, _dz, iters, nblocks, nthreads, me)
-    (me==0) && print("Starting the time loop 🚀...")
-    tic = time_ns()
-    for _ = 1:iters
-        AMDGPU.@sync @roc gridsize=nblocks groupsize=nthreads lapl!(A_new, A, h, _dx, _dy, _dz)
-        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
-        # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # A, A_new = A_new, A
-    end
-    wtime = (time_ns() - tic) * 1e-9
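The timed loop in compute_ka now takes the communication-hiding path unconditionally, with an MPI.Barrier(comm) before the timer so every rank enters the measured loop together. The hide_comm helper itself lives in mpi_utils2.jl; the following is only a rough sketch of the pattern it implements, with all names, ranges, and buffers being illustrative assumptions rather than the repository's API: post the non-blocking halo exchange, update the halo-independent interior while messages are in flight, then update the boundary slabs once the halos have arrived.

using MPI, KernelAbstractions

# Illustrative sketch only: kernel! is an instantiated KernelAbstractions
# kernel (e.g. diffusion_kernel!(backend, 256)); inner and bnd_ranges are
# tuples of index ranges; send_bufs are assumed to be packed already.
function hide_comm_sketch(kernel!, backend, comm, neighbors, inner, bnd_ranges,
                          send_bufs, recv_bufs, args...)
    reqs = MPI.Request[]
    for (nbr, sb, rb) in zip(neighbors, send_bufs, recv_bufs)
        nbr == MPI.PROC_NULL && continue
        push!(reqs, MPI.Irecv!(rb, comm; source=nbr))  # post halo receives
        push!(reqs, MPI.Isend(sb, comm; dest=nbr))     # and the matching sends
    end
    # interior points need no halo data: overlap them with the messages
    kernel!(args..., first.(inner); ndrange=length.(inner))
    MPI.Waitall(reqs)                                  # halos are now in place
    for bnd in bnd_ranges                              # finish the boundary slabs
        kernel!(args..., first.(bnd); ndrange=length.(bnd))
    end
    KernelAbstractions.synchronize(backend)
end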
@@ -83,8 +57,6 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     iters, warmup = 35, 5
     nx, ny, nz = 1024, 1024, 1024
     b_width = (128, 8, 4)
-    nthreads = (256, 1, 1)
-    nblocks = cld.((nx, ny, nz), nthreads)
     dims, comm, me, neighbors, coords, device = init_distributed(dims; init_MPI=true)
     dx, dy, dz = l ./ (nx, ny, nz)
     _dx, _dy, _dz = 1.0 ./ (dx, dy, dz)
@@ -137,22 +109,24 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     # GC.gc()
     # GC.enable(false)
 
-    compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
-    wtime = compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
+    compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
 
-    # compute_roc(A_new, A, h, _dx, _dy, _dz, warmup, nblocks, nthreads, me)
-    # wtime = compute_roc(A_new, A, h, _dx, _dy, _dz, (iters - warmup), nblocks, nthreads, me)
-
-    # GC.enable(true)
-    # GC.gc()
-
-    # perf
-    A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
-    wtime_it = wtime / (iters - warmup)
-    T_eff = A_eff / wtime_it
-    # (me==0) && @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3))
-    @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+    for _ in 1:10
+        wtime = compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
+
+        # GC.enable(true)
+        # GC.gc()
+
+        MPI.Barrier(comm)
+        wtime_min = MPI.Allreduce(wtime, MPI.MIN, comm)
+        wtime_max = MPI.Allreduce(wtime, MPI.MAX, comm)
+        # perf
+        A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
+        wtime_it = (wtime_min, wtime_max) ./ (iters - warmup)
+        T_eff = A_eff ./ wtime_it
+        (me==0) && @printf("Executed %d steps in = %1.3e sec @ T_eff = %1.2f GB/s (max %1.2f) \n", (iters - warmup), wtime_max, round(T_eff[2], sigdigits=3), round(T_eff[1], sigdigits=3))
+        # @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+    end
 
     finalize_distributed(; finalize_MPI=true)
     return
 end
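For the record, the performance model used above counts one read and one write of the interior points, so on the default 1024^3 local grid A_eff = 2 * 1022^3 * 8 B / 2^30 ≈ 15.9 GiB per iteration. The MIN/MAX Allreduce turns the per-rank timings into a spread: the headline T_eff is computed from the slowest rank (the number that matters for scaling), with the fastest rank's value printed as "max". A quick check of the arithmetic, using a hypothetical per-iteration timing:

# Worked example of the perf model above (the wtime_it value is hypothetical).
nx = ny = nz = 1024
A_eff = 2 / 2^30 * (nx - 2) * (ny - 2) * (nz - 2) * sizeof(Float64)
# A_eff ≈ 15.9 GiB: one read plus one write of the (nx-2)^3 interior
wtime_it = 0.016                 # hypothetical seconds per iteration
T_eff = A_eff / wtime_it         # ≈ 994 GB/s effective memory throughput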
6 changes: 0 additions & 6 deletions scripts_future_API/mpi_utils2.jl
@@ -24,11 +24,6 @@ function finalize_distributed(; finalize_MPI=true)
     return
 end
 
-@kernel function my_copy!(halo, recv_buf)
-    ix, iy = @index(Global, NTuple)
-    halo[ix, iy] = recv_buf[ix, iy]
-end
-
 # exchanger
 mutable struct Exchanger
     @atomic done::Bool
@@ -75,7 +70,6 @@ mutable struct Exchanger
                 test_send = MPI.Test(send)
                 if test_recv && !flag
                     copyto!(halo, recv_buf)
-                    # my_copy!(backend, 256, size(recv_buf))(halo, recv_buf)
                     flag = true
                 end
                 if test_recv && test_send break end
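The removed my_copy! kernel is redundant because copyto! performs the device-side halo update directly once the receive completes. As a standalone illustration of the polling pattern in the Exchanger above, assuming MPI.jl's MPI.Test, which returns true once a request has completed:

using MPI

# Sketch of the progress loop: poll both non-blocking requests, copy the
# received buffer into the halo exactly once, return when both completed.
function progress!(halo, recv_buf, send::MPI.Request, recv::MPI.Request)
    copied = false
    while true
        test_recv = MPI.Test(recv)
        test_send = MPI.Test(send)
        if test_recv && !copied
            copyto!(halo, recv_buf)  # device copy; replaces the my_copy! kernel
            copied = true
        end
        (test_recv && test_send) && break
        yield()  # let other per-neighbor tasks make progress while polling
    end
    return
end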
8 changes: 4 additions & 4 deletions scripts_future_API/sbatch.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-#SBATCH --job-name=scaling_16
+#SBATCH --job-name=scaling_2048
 #SBATCH --account=project_465000557
-#SBATCH --time=00:02:00
-#SBATCH --nodes=4
-#SBATCH --ntasks=16
+#SBATCH --time=00:05:00
+#SBATCH --nodes=512
+#SBATCH --ntasks=2048
 #SBATCH --gpus-per-node=8
 #SBATCH --partition=standard-g
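Both configurations keep 4 MPI ranks per node (16 ranks on 4 nodes before, 2048 ranks on 512 nodes after) on the 8-GPU standard-g partition, so this commit scales the run out by a factor of 128 in ranks and nodes while raising the wall-time limit from 2 to 5 minutes.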
