Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: task switching in AMDGPU complex batched_matmul (#178)
Browse files Browse the repository at this point in the history
* ci(buildkite): add downstream testing for NeuralOperators

* perf: restore old batched_mul

* fix: disable threading for certain devices

* revert: "perf: restore old batched_mul"

This reverts commit a8c0f3b.
  • Loading branch information
avik-pal authored Oct 25, 2024
1 parent 98a2d7a commit 877ef96
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 8 deletions.
5 changes: 2 additions & 3 deletions .buildkite/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ steps:
- src
- ext
env:
RETESTITEMS_NWORKERS: 2
BACKEND_GROUP: "AMDGPU"
agents:
queue: "juliagpu"
Expand Down Expand Up @@ -126,6 +125,7 @@ steps:
repo:
- "Boltz"
- "Lux"
- "NeuralOperators"

- group: ":telescope: Downstream AMD GPU"
steps:
Expand All @@ -143,15 +143,14 @@ steps:
queue: "juliagpu"
rocm: "*"
rocmgpu: "*"
env:
RETESTITEMS_NWORKERS: 2
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip downstream\]/ && build.message !~ /\[skip ci\]/ && build.branch != "main"
timeout_in_minutes: 240
matrix:
setup:
repo:
- "Boltz"
- "Lux"
- "NeuralOperators"

env:
JULIA_PKG_SERVER: ""
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.4"
version = "1.3.5"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
41 changes: 37 additions & 4 deletions src/impl/batched_mul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ end
function batched_matmul_loopvec_impl! end

function fallback_batched_matmul(
dev, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
opmode, x::AbstractArray{xT, 3}, y::AbstractArray{yT, 3}) where {xT, yT}
z = similar(x, promote_type(eltype(x), eltype(y)), size(x, 1),
size(y, 2), max(size(x, 3), size(y, 3)))
fallback_batched_matmul!(z, dev, x, y)
fallback_batched_matmul!(z, opmode, x, y)
return z
end

function fallback_batched_matmul!(
z::AbstractArray{zT, 3}, dev, x::AbstractArray{xT, 3},
z::AbstractArray{zT, 3}, opmode, x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}) where {zT, xT, yT}
# XXX: bring back once the enzyme segfault is fixed
# @warn "Using fallback Batched Matrix Multiply routine for $(dev) with A: size = \
Expand All @@ -90,6 +90,36 @@ function fallback_batched_matmul!(
throw(DimensionMismatch(lazy"size(x) = $(size(x)), size(y) = $(size(y)) inconsistent for batched_matmul."))
end

if use_threaded_batched_matmul(get_device_type(x))
unsafe_fallback_threaded_batched_matmul!(z, x, y)
else
unsafe_fallback_serial_batched_matmul!(z, x, y)
end

return
end

"""
    unsafe_fallback_serial_batched_matmul!(z, x, y)

Fill `z` batch by batch with the matrix products of the matching `batchview` slices of
`x` and `y`, using `mul!` in a plain sequential loop over `axes(z, 3)`.

When `x` and `y` have the same size along dimension 3 the slices are paired up. When
only one of them has size 1 along dimension 3, its single slice is reused for every
batch of the other operand.

No size validation happens here; the caller is expected to have checked that the batch
sizes are compatible before calling.
"""
function unsafe_fallback_serial_batched_matmul!(
        z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
        y::AbstractArray{yT, 3}) where {zT, xT, yT}
    nx, ny = size(x, 3), size(y, 3)
    batches = axes(z, 3)

    # Equal batch counts: slice k of `x` meets slice k of `y`.
    if nx == ny
        for k in batches
            mul!(batchview(z, k), batchview(x, k), batchview(y, k))
        end
        return
    end

    # `x` carries a single batch: reuse it against every slice of `y`.
    if nx == 1
        for k in batches
            mul!(batchview(z, k), batchview(x, 1), batchview(y, k))
        end
        return
    end

    # Remaining case: `y` must be the operand with a single batch.
    for k in batches
        mul!(batchview(z, k), batchview(x, k), batchview(y, 1))
    end
    return
end

function unsafe_fallback_threaded_batched_matmul!(
z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}) where {zT, xT, yT}
old_threads = maybe_reduce_BLAS_threads(z)

if size(x, 3) == size(y, 3)
Expand All @@ -107,10 +137,13 @@ function fallback_batched_matmul!(
end

reset_BLAS_threads(old_threads)

return
end

# Trait consulted by `fallback_batched_matmul!` to pick between the threaded and the
# serial fallback loop. Any device type is serial unless it opts in with a method below.
function use_threaded_batched_matmul(::Type)
    return false
end

function use_threaded_batched_matmul(::Type{CUDADevice})
    return true
end

function use_threaded_batched_matmul(::Type{CPUDevice})
    return true
end

function CRC.rrule(::typeof(batched_matmul), x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}) where {xT, yT}
∇batched_matmul = @closure Δ_ -> begin
Expand Down

3 comments on commit 877ef96

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/118080

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.3.5 -m "<description of version>" 877ef96bbcea3abf76fa3627a613d3e2f44a9c2a
git push origin v1.3.5

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 877ef96 Previous: 98a2d7a Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5000 ns 6417 ns 0.78
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5125 ns 6041 ns 0.85
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7375 ns 7167 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4833 ns 5292 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 108327 ns 103542 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 704958 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 452318 ns 637131 ns 0.71
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10000 ns 10166.5 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9917 ns 9958 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10229.5 ns 10291.5 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9729.5 ns 9979.5 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 538089 ns 494284 ns 1.09
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2390625 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 709441 ns 719725 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1792 ns 1583 ns 1.13
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1792 ns 1542 ns 1.16
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 2000.5 ns 1666 ns 1.20
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1584 ns 1500 ns 1.06
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 19729 ns 20684 ns 0.95
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 439229 ns
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 33851 ns 33302 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4375 ns 3812.5 ns 1.15
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3833.5 ns 4125 ns 0.93
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4250 ns 4250 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3520.5 ns 4334 ns 0.81
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 134838 ns 134278.5 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 2235354 ns
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 143632.5 ns 143062.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56375 ns 58000 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46875 ns 46417 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46750 ns 46875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 78375 ns 83750 ns 0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36801 ns 37449 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1444229 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 84285 ns 70883 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2037375.5 ns 2037500 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2083500 ns 2083416.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090334 ns 2090916.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1999916 ns 1996979.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 215168.5 ns 220080 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5415625 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1280705 ns 1213928 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 148666.5 ns 173708 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 145833 ns 146625 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 152417 ns 165062.5 ns 0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 160792 ns 172000 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167254 ns 167869.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1500250 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 172909 ns 196051.5 ns 0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1133479.5 ns 1113854.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1112750 ns 1110541 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1115292 ns 1118667 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1109687.5 ns 1124479.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 623047 ns 644177 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10180459 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1022168 ns 899376 ns 1.14
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4771 ns 5333 ns 0.89
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4708 ns 4875 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6666 ns 6750 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4167 ns 4416 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 80121.5 ns 83066 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 1222709 ns
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 56392.5 ns 64020 ns 0.88
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8521 ns 8584 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8750 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9375 ns 8875 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8584 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 547974 ns 552192.5 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 7799104.5 ns
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 384758 ns 372446 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18062.5 ns 17229.5 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16875 ns 17250 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21625 ns 21542 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17666.5 ns 17208.5 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 62259 ns 63166 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1327729 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76443 ns 79573.5 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212542 ns 220583 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 217708 ns 218875 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222604.5 ns 223125 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 235416.5 ns 219625 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 326680 ns 329089 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5672875 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 468011 ns 423777 ns 1.10
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 583 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 625 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 959 ns 833 ns 1.15
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 792 ns 834 ns 0.95
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 18885 ns 19066 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 446167 ns
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 31881 ns 27311 ns 1.17
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1417 ns 1417 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1375 ns 1417 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1667 ns 1583 ns 1.05
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1375 ns 1375 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 117120.5 ns 116071.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 2151437.5 ns
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 135835 ns 118732 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7375 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 6083 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10166 ns 10334 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23630 ns 24482 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 838084 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48897 ns 52122 ns 0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220042 ns 229541.5 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 234750 ns 268417 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 270833.5 ns 241500 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 253000.5 ns 251250 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 188891 ns 189293 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8581771 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 612944.5 ns 588480 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 4042 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23120 ns 23660.5 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 433416 ns
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 47491 ns 43502 ns 1.09
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16542 ns 16833 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17041 ns 16834 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17167 ns 16959 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16875 ns 16666 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 186342.5 ns 188039 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 2081000 ns
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 174571.5 ns 166010.5 ns 1.05
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 919250 ns 929291 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 828041 ns 838708 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 838917 ns 841584 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 1258333 ns 1269208 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113235.5 ns 113941 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 452875 ns
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 243040 ns 396441 ns 0.61
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2556167 ns 2610729.5 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2320333.5 ns 2330541.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2328916.5 ns 2324458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3549104.5 ns 3478334 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 229235 ns 232093 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 2156125 ns
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 739658 ns 630643.5 ns 1.17
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6084 ns 6000 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5520.5 ns 7042 ns 0.78
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8354 ns 7333.5 ns 1.14
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5834 ns 6584 ns 0.89
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 83528.5 ns 82915 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 1131521 ns
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 58842 ns 62131.5 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11729.5 ns 11875 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11583 ns 11417 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11479.5 ns 12417 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10999.5 ns 9813 ns 1.12
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 596279 ns 585345.5 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 7505021 ns
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 402564 ns 388046 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23594 ns 23179.5 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 436875 ns
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 48301 ns 41949 ns 1.15
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2084 ns 2083 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2083 ns 2250 ns 0.93
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2167 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2083 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 224089.5 ns 226220 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 2406437.5 ns
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 182056 ns 166171 ns 1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8916 ns 8583 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8292 ns 8542 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11209 ns 10709 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8375 ns 8833 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 101414 ns 100758 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 1214500 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 73272.5 ns 72575 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18625 ns 17228.5 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17208.5 ns 18583 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18667 ns 18500 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16771 ns 17750 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 555190.5 ns 582511 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5531208.5 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 379272 ns 371318.5 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 458 ns 459 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34468 ns 34079 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 654854 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 45552 ns 44423 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9854 ns 9479 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 9750 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 10333 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8562.5 ns 9562.5 ns 0.90
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 257386.5 ns 262881 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5553750 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 366942 ns 351422 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396542 ns 396583 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288042 ns 288042 ns 1
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287541 ns 287666 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756167 ns 756167 ns 1
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112104 ns 112987 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 519187.5 ns
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 76352 ns 77780.5 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1409875 ns 1455709 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1132584 ns 1130291 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1126791.5 ns 1133250 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2436813 ns 2358000 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 199625 ns 202802 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1712834 ns
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 322335 ns 268682 ns 1.20
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7083 ns 7354.5 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6874.5 ns 8000 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8458 ns 8687.5 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6938 ns 7750 ns 0.90
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 134438.5 ns 137305 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 1132749.5 ns
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 59441 ns 64461 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16563 ns 12812.5 ns 1.29
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13917 ns 15041.5 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16167 ns 15353.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15187.5 ns 12333.5 ns 1.23
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 880177 ns 906003 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 7959042 ns
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 418702.5 ns 413373 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24146 ns 26000 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23791.5 ns 27562.5 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28250 ns 27042 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24896 ns 26021 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 185908.5 ns 186382.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1653167 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114524 ns 146484 ns 0.78
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 152041 ns 146500 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 105395.5 ns 157750 ns 0.67
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 113125 ns 129416 ns 0.87
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 104979 ns 155812.5 ns 0.67
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1011252 ns 1016426 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8155875 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 577332 ns 551090 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 79000 ns 84667 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76417 ns 80167 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76833 ns 78063 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 80250 ns 80521 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 190543 ns 190829 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1268166 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 125494 ns 124858.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 301375.5 ns 219479 ns 1.37
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 295750 ns 281750 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231208 ns 278146 ns 0.83
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 209499.5 ns 320791.5 ns 0.65
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1046615 ns 1021778 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9187687.5 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 689189 ns 643542 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13333 ns 13125 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13334 ns 13666.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 15062.5 ns 14041.5 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12750 ns 13459 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 137754.5 ns 136741.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 1170125 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 233927 ns 226473 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 28270.5 ns 27083.5 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26542 ns 26125 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27166.5 ns 27833.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26062 ns 26604.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 912323.5 ns 919419 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 7923459 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 689579 ns 633979.5 ns 1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 15042 ns 14000 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 14625 ns 14708.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17292 ns 17583.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13834 ns 14792 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 119657.5 ns 119245 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 1225791.5 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 239157 ns 233827 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26375 ns 26875 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26208 ns 25958.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26375 ns 26583 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26375 ns 26541 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 665016.5 ns 676576 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5755000 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 674067.5 ns 589361.5 ns 1.14
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183750 ns 182375 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 181645.5 ns 183208 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 187833 ns 185583 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183666 ns 183459 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 101191 ns 102955 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1353021 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 235596.5 ns 232900.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 636291 ns 583500 ns 1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 594625 ns 595083 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 592062.5 ns 597520.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 613458 ns 624167 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 491587 ns 493717.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6127021 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 708249 ns 657463 ns 1.08
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7375 ns 6750 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8333 ns 7645.5 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9417 ns 8167 ns 1.15
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7229.5 ns 7542 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 137783 ns 135360 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 1110021 ns
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 57461 ns 62767 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14812.5 ns 15375 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14791 ns 14917 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14875 ns 16187.5 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 12896 ns 15292 ns 0.84
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 881205 ns 885601 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 7653313 ns
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399470 ns 392428 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6156708 ns 6153416.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6375958.5 ns 6381624.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6373937.5 ns 6371521 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11907750 ns 11926500 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 347134 ns 346494 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/Metal 1596208 ns
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 300417.5 ns 392843 ns 0.76
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19072062.5 ns 19117208.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19937292 ns 19977084 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19969000 ns 19957021 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36484084 ns 36558729 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1007983 ns 1005649 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/Metal 7924354 ns
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1163329 ns 1105996 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1750 ns 1750 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1792 ns 1834 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23636 ns 23503 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 431667 ns
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 208896 ns 197739 ns 1.06
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4792 ns 4834 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4875 ns 4958 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4959 ns 4917 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4833 ns 4916 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 270525.5 ns 276337.5 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2513333 ns
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 618686 ns 502208 ns 1.23
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 9416.5 ns 8062.5 ns 1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7917 ns 8416 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9625 ns 9459 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7271 ns 8145.5 ns 0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 116370.5 ns 115989 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 1185875 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 68072 ns 71584 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11937.5 ns 11562.5 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10958 ns 12438 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12417 ns 12541 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11083.5 ns 12875 ns 0.86
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 603718 ns 604320 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5647937.5 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 355648 ns 353160 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22877 ns 22648 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 443875 ns
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 46351 ns 43592 ns 1.06
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2916 ns 2917 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 2917 ns 1.06
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3250 ns 3041 ns 1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2958 ns 3000 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 196283.5 ns 197848 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 2099292 ns
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 160444 ns 146363.5 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 14208.5 ns 14604 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 14375 ns 15458.5 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 17521 ns 15896 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14729 ns 15000.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 116923.5 ns 117481 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 1146125 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 237206 ns 236802 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25666 ns 26500 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25500 ns 25625 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25875 ns 26041.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25791 ns 25958 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 551650 ns 561217 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5245875 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 650325 ns 566814 ns 1.15
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4291 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4209 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4167 ns 4375 ns 0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24277 ns 24363 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 445125 ns
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 48561 ns 44754 ns 1.09
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 15917 ns 16250 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16208 ns 16125 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16250 ns 16292 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16125 ns 16416 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 320460 ns 321227 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 2478875 ns
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 206705 ns 190786 ns 1.08
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 5916 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5834 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5750 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35140 ns 34700.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 657000 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 205735 ns 200434 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20708 ns 22292 ns 0.93
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21146 ns 21292 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22208 ns 21792 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21750 ns 22208 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 281377 ns 283315.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5995542 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 679901 ns 598489 ns 1.14
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 58583 ns 59729 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 65083 ns 64229 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66334 ns 66833 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51645.5 ns 50958 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66570 ns 66908 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/Metal 14881125 ns
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 95562 ns 115781 ns 0.83
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 181791.5 ns 198937.5 ns 0.91
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 125000 ns 144625 ns 0.86
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 149958.5 ns 167291.5 ns 0.90
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 310334 ns 303249.5 ns 1.02
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 209829 ns 208882.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/Metal 46762875 ns
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 579958 ns 529218 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82625 ns 84291 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 80750 ns 83875 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86292 ns 88125 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82500 ns 81562.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192479 ns 193291 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1995437.5 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 168164 ns 182771 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1923792 ns 1875250 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1884271 ns 1914792 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1888583 ns 1928375 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1917291 ns 1916625 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 508617 ns 505449 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8813959 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 923511 ns 857542 ns 1.08
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 291 ns 333 ns 0.87
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21906 ns 21535 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 450667 ns
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 41861 ns 36788 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1916 ns 1834 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 246989 ns 243998 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 2172458.5 ns
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 186805 ns 166221 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9979 ns 11229 ns 0.89
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8562.5 ns 9791.5 ns 0.87
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11458 ns 11125 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8666.5 ns 10479.5 ns 0.83
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 114779 ns 114440.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 1098750 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 238165 ns 233386 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9771 ns 10458 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10000 ns 10250 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10291 ns 9917 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9604.5 ns 10145.5 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 492318 ns 491014 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5055604 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 634834 ns 561274 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56541 ns 58375 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46708 ns 46917 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46792 ns 46625 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 77500 ns 83708 ns 0.93
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38130.5 ns 38960 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1203084 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79889 ns 72876 ns 1.10
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1937792 ns 1897625 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1980021 ns 1964750 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1936541.5 ns 1985854 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1886999.5 ns 1899833 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 211665 ns 212091 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11204125 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1008110 ns 994598 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267979 ns 266354 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 266375 ns 269729 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 271000 ns 271041.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 268291.5 ns 268271 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 193827.5 ns 193629.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1446458.5 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 282897 ns 271156 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 675542 ns 693917 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 673792 ns 692541 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 589042 ns 687708 ns 0.86
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 681292 ns 593833 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 994673.5 ns 991006 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8996396 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 898667.5 ns 863163 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2161437 ns 2180687.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2211833 ns 2214917 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2212042 ns 2212041 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2215687.5 ns 2208479 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 154115 ns 154859 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1427083.5 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 406627 ns 451844.5 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5581500 ns 5453666 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5501104 ns 5518208 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5517083.5 ns 5522375 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5264333.5 ns 5522209 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 937351 ns 930442 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10010417 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1552019 ns 1495900 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 986917 ns 999875 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 898250 ns 913333 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 898500 ns 912895.5 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 1324292 ns 1334562.5 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46763 ns 46425 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 458458.5 ns
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 243438 ns 399125 ns 0.61
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2547916.5 ns 2620166 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2324625 ns 2328541 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2333583 ns 2329395.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3548709 ns 3468667 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 256534 ns 247327 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2463833 ns
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 770755 ns 658089 ns 1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56084 ns 58083 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46250 ns 46625 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46542 ns 46542 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81750 ns 84000 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 27782 ns 29007 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1193583 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 72909 ns 73392 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2048500 ns 2036000 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2090917 ns 2096916 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2061417 ns 2092208 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1996958.5 ns 1992542 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 223774 ns 225482 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11058874.5 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1035585 ns 1028937.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56458 ns 58417 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46709 ns 47208 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47084 ns 47375 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 78584 ns 83541 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48280 ns 48550 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1315916.5 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 71380 ns 71593.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1903125 ns 1926354.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1963666.5 ns 1987291 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1961854 ns 1972375 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1850771 ns 1890375 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 231382 ns 231977 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9466667 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 913772 ns 931260 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34209 ns 33752 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 630896 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 48489 ns 44343 ns 1.09
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6625 ns 6542 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 7187.5 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7208 ns 7625 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6500 ns 6209 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 205122.5 ns 203191.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5599333 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 366869 ns 350064 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32165 ns 32755 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 385250 ns
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 40300 ns 36558 ns 1.10
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2875 ns 3375 ns 0.85
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 3333 ns 0.92
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2959 ns 3000 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3000 ns 3208 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 183941 ns 185298.5 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 1836854.5 ns
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 164169.5 ns 144480 ns 1.14
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1427166.5 ns 1465479.5 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1449750 ns 1410667 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1417625 ns 1427770.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1441604 ns 1410417 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134383 ns 136084 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2843875 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 355189 ns 354201 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4996833 ns 5012687.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5015708 ns 5023959 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5020625 ns 5034167 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4981250 ns 5021667 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 673084.5 ns 673868 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10662292 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1463829 ns 1145811 ns 1.28
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49772312.5 ns 49876625 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35522417 ns 35509791 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35489333 ns 35514916 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96946583 ns 97103375 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1601690 ns 1608361 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/Metal 10627562.5 ns
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1042214.5 ns 1576726 ns 0.66
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154216458 ns 154443875 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112301604.5 ns 112320833.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112218667 ns 112445042 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294869708.5 ns 296071750 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6475752.5 ns 6483041.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/Metal 70117375 ns
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5557063.5 ns 6222525 ns 0.89
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 48417 ns 48042 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47916 ns 47667 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 48021 ns 47916 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47541 ns 47583 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19924.5 ns 19626 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 496041 ns
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 25680 ns 28463 ns 0.90
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 49792 ns 50583.5 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50708.5 ns 50167 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 51209 ns 51000 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 51458 ns 50667 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 245262 ns 245482 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 2146500 ns
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 146160 ns 140773 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 10209 ns 8667 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8959 ns 8750 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10750 ns 11167 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9000 ns 9666.5 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 118313 ns 118847 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 1163542 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 237350.5 ns 237489 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10708 ns 10791 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10417 ns 10458 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10833 ns 10333 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10208 ns 10709 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 582997 ns 584310 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5755625 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 653411 ns 572469 ns 1.14
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8417 ns 9125 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 8979 ns 9896 ns 0.91
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11208 ns 10667 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9875 ns 9292 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 115767 ns 115727.5 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 1146625 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 72681 ns 73908 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 14833 ns 13874.5 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 14584 ns 13750 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 14979.5 ns 14333 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 14125 ns 14375.5 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 554958.5 ns 559680.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5137041 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 345660.5 ns 337060 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 958 ns 959 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 958 ns 1083 ns 0.88
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34204.5 ns 33675 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 638979.5 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 207831 ns 206546 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8291 ns 8917 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8541 ns 8437.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9292 ns 8791 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 9250 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 223363.5 ns 225862.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5901875 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 657971.5 ns 576667 ns 1.14
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23500 ns 23667 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23542 ns 23292 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23834 ns 23813 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23125 ns 23666 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 20050 ns 20529 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 448583.5 ns
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 188301 ns 187811 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 53770.5 ns 53583.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 53042 ns 52145.5 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 54042 ns 53584 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 55020.5 ns 53667 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 258832 ns 260507 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 2415625 ns
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 588042 ns 549086 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1448437.5 ns 1444541.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1438125 ns 1445459 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1405125 ns 1414666.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1396021 ns 1401396 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194395.5 ns 195236 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2058625 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 346302 ns 321861 ns 1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5024812.5 ns 5007208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5026125 ns 5006958 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5011083 ns 5015812.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5006958 ns 5020500 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510089 ns 510108 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9178458 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1198365 ns 1117899 ns 1.07
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 779661000 ns 828285625 ns 0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 541756209 ns 541921375 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 545828709 ns 542359625 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1513614750 ns 1558200021 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22673094 ns 22535776.5 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/Metal 107171459 ns
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14686436 ns 12173703 ns 1.21
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2975273958 ns 3903695416 ns 0.76
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 2889890291 ns 1771980416 ns 1.63
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1793050500 ns 1773568584 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4711214375 ns 5228367459 ns 0.90
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118916960 ns 119027931 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/Metal 2622707250 ns
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87900974 ns 68450588 ns 1.28
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76541 ns 75916.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 79375 ns 87437.5 ns 0.91
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79167 ns 84417 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 85583 ns 81083 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 191949 ns 192111.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1500104 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 105890.5 ns 126607 ns 0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 261583.5 ns 282646 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232562.5 ns 283042 ns 0.82
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 196625 ns 236875 ns 0.83
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 192687.5 ns 276458 ns 0.70
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 996248 ns 995625 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8743333 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 628158 ns 612404 ns 1.03
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 198984604 ns 199947208.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139204167 ns 139420500 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139144125 ns 138954958 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 393236834 ns 389188834 ns 1.01
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5825572 ns 5832800 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/Metal 33344937.5 ns
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3611135.5 ns 2958637.5 ns 1.22
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 617564646 ns 618298396 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 440013042 ns 439277916 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 438881145.5 ns 439303895.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1193608916 ns 1200068000 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26745549.5 ns 26614249.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/Metal 110179542 ns
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21869093 ns 16011697.5 ns 1.37
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7083 ns 7417 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6208 ns 6125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 6125 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9833 ns 10125 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26360.5 ns 26885 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 873478.5 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46220 ns 54341 ns 0.85
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213416.5 ns 214083 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232437.5 ns 232833 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222375 ns 230000 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219250 ns 207709 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 215332 ns 215596 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8943333 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 524234 ns 546726.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8083 ns 7417 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8291 ns 8875.5 ns 0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10709 ns 10750 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8500 ns 10459 ns 0.81
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 113094.5 ns 111291 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 1123895.5 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 70651 ns 72956 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8917 ns 7792 ns 1.14
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8958 ns 7833.5 ns 1.14
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8584 ns 8125 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8208 ns 8375 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 492563 ns 492517.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5073167 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 317437.5 ns 322723 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 459 ns 417 ns 1.10
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 541 ns 500 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 459 ns 1.18
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 25048 ns 25272 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 713958 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 46561 ns 45194 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10666.5 ns 9646 ns 1.11
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 11479 ns 9541 ns 1.20
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11583 ns 11104 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10354 ns 10333 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 244034 ns 247083 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 6283709 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 383588 ns 383457 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 353416 ns 351000 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 353792 ns 354459 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352021 ns 352250 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 350958 ns 351625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 22877.5 ns 23168 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 312208 ns
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 188432 ns 198701 ns 0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 793000 ns 826000 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 807333.5 ns 820458 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 777437 ns 822083.5 ns 0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 830979 ns 827750 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 218580 ns 214195.5 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2766209 ns
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 604914.5 ns 578901 ns 1.04
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5521 ns 5229.5 ns 1.06
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5479 ns 5875 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7396 ns 6958.5 ns 1.06
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4166 ns 4667 ns 0.89
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17982 ns 17091 ns 1.05
batchedmm(16, Bsize=32)/forward/GPU/Metal 1438291.5 ns
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 71380 ns 74219 ns 0.96
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12520.5 ns 13458.5 ns 0.93
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11521 ns 10625 ns 1.08
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11521 ns 13041 ns 0.88
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 18042 ns 18542 ns 0.97
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 207562.5 ns 202239.5 ns 1.03
batchedmm(16, Bsize=32)/zygote/GPU/Metal 5079708 ns
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 368113 ns 330217 ns 1.11
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 38125 ns 39833.5 ns 0.96
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51291.5 ns 51209 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52584 ns 52458.5 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13500 ns 13459 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/CUDA 20289 ns 19993 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/Metal 4978875 ns
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 84681 ns 99666.5 ns 0.85
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 36896 ns 38229.5 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 31458 ns 35125 ns 0.90
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 31958 ns 34187.5 ns 0.93
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 66000 ns 59417 ns 1.11
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 184469 ns 178995.5 ns 1.03
batchedmm(16, Bsize=128)/zygote/GPU/Metal 13432687 ns
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 412423 ns 362888 ns 1.14
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3583 ns 3500 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3666 ns 3667 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3958.5 ns 3833 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3500 ns 3709 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19634 ns 19015 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 458041 ns
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 28900 ns 29645 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4208 ns 4291 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4375 ns 4500 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4625 ns 4458 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4167 ns 4292 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 197467.5 ns 194611 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 2168666 ns
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 138551.5 ns 126757 ns 1.09
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5208 ns 5916 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4792 ns 5062.5 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7250 ns 6375 ns 1.14
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3792 ns 4625 ns 0.82
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 142334.5 ns 138395 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 1171167 ns
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 58781 ns 65944 ns 0.89
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9125 ns 9625 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8833 ns 8500 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9125 ns 9333 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 10666 ns 0.77
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 822603 ns 807046.5 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 7665708 ns
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 387763.5 ns 378457 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204042 ns 207583 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 212000 ns 209042 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210875 ns 213208 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200958 ns 204125 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36985.5 ns 35332 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 853417 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 205912 ns 203930.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 653187.5 ns 603500 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 665958 ns 623479.5 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 622770.5 ns 658604.5 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 585667 ns 586375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 260510 ns 254148 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8195083 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 799653 ns 767213 ns 1.04
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3369291 ns 3324167 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2332125 ns 2328667 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2329166 ns 2334417 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6307167 ns 6324542 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 205325 ns 206559 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/Metal 6066541 ns
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 212943 ns 377105 ns 0.56
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11648041 ns 11496208.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8330687.5 ns 8303562.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8348104 ns 8348416.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21116042 ns 21193020.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 734131.5 ns 736080.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/Metal 26082375 ns
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1069061 ns 2044820.5 ns 0.52
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4521 ns 3917 ns 1.15
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5208 ns 5292 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7583 ns 6292 ns 1.21
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5500 ns 7125 ns 0.77
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 132826.5 ns 129442 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 1175375 ns
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 55421 ns 57067 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9292 ns 8500 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8334 ns 7375 ns 1.13
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9562.5 ns 7833 ns 1.22
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8604.5 ns 8291.5 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 716825.5 ns 711410 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 7184437.5 ns
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 369984 ns 364581 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 98313 ns 117312.5 ns 0.84
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 125521 ns 101437.5 ns 1.24
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 100541 ns 102687.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 103500 ns 98458.5 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 149399 ns 149616 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2228333.5 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 182342 ns 210473 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2046104.5 ns 2008250 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2031250 ns 2022459 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1985791.5 ns 2039937.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2021416.5 ns 2036625 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 674153.5 ns 661994.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10587167 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1250004 ns 963831 ns 1.30
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 34188 ns 33416 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36000 ns 35459 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 35021 ns 34709 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 833 ns 750 ns 1.11
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15860 ns 15265 ns 1.04
batchedmm(2, Bsize=4)/forward/GPU/Metal 553417 ns
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 75761 ns 78737 ns 0.96
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3083.5 ns 3959 ns 0.78
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3541 ns 2917 ns 1.21
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3625 ns 4708 ns 0.77
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 3375 ns 3666 ns 0.92
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 140010.5 ns 136137.5 ns 1.03
batchedmm(2, Bsize=4)/zygote/GPU/Metal 1942729.5 ns
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 353624 ns 321796.5 ns 1.10
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7000 ns 7250 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6041 ns 6042 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5958 ns 6083 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9958 ns 10042 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35885 ns 34970 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 854042 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50330 ns 56516 ns 0.89
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223104 ns 221584 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 234125 ns 220959 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221250 ns 234583 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215667 ns 207333 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 243422 ns 237194 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8021021 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 512516 ns 540189 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3833 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3958 ns 0.94
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22271.5 ns 21681 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 468292 ns
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 43460 ns 39383 ns 1.10
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14167 ns 14458 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14541 ns 14458 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14583 ns 14541 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14500 ns 14625 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 303531 ns 297631.5 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 2253708.5 ns
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 200012.5 ns 190215 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 99083 ns 129834 ns 0.76
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 128333.5 ns 118271 ns 1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 103812 ns 106750 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 103958.5 ns 101666.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 150020 ns 150106 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2875583 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 195772 ns 241781 ns 0.81
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1887875.5 ns 1921708.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1929042 ns 1924583 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1884833 ns 1932000 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1894729 ns 1922750 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 670688 ns 653385 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10463500 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1065452 ns 928325 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18959 ns 18875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17354.5 ns 17292 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22208 ns 20937 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17541.5 ns 18459 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104525.5 ns 104073.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1362312.5 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79351 ns 91301 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 252250 ns 239083.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 260833 ns 224791 ns 1.16
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219458 ns 224958.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 257937 ns 218500 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 495429 ns 493640.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6195583 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 462125 ns 439080 ns 1.05
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24958.5 ns 26166 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 32604.5 ns 29167 ns 1.12
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 27500 ns 28958 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1208 ns 1416 ns 0.85
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16021 ns 15781 ns 1.02
batchedmm(16, Bsize=4)/forward/GPU/Metal 533959 ns
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 80071 ns 72756 ns 1.10
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 5250 ns 6208 ns 0.85
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5854.5 ns 5041 ns 1.16
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5792 ns 6875 ns 0.84
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 6125 ns 6417 ns 0.95
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 201439.5 ns 199155.5 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/Metal 2014541.5 ns
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 376235 ns 324216 ns 1.16
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221583 ns 221875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222541.5 ns 223375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 226291 ns 225375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221875 ns 223542 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 219232.5 ns 216803 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1686583 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 271454 ns 267771 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 559604 ns 508542 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 548354 ns 511042 ns 1.07
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 500083.5 ns 509500 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 498250 ns 557354 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1034159 ns 1017707.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8587229 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 850955.5 ns 811461 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19625 ns 19104 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19313 ns 19584 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23208 ns 22063 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20583 ns 19792 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111518.5 ns 111072 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1475625 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 80186 ns 90009 ns 0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215020.5 ns 221854 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 250333 ns 220250 ns 1.14
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214500 ns 218166.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221729.5 ns 220146 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 708936 ns 700847.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7292833 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 539977 ns 494855 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6166 ns 6292 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6479 ns 7000 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8042 ns 7375 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6417 ns 6834 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 133623 ns 130925 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 1170916 ns
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 66921 ns 63498 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12250 ns 11041.5 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11729.5 ns 9959 ns 1.18
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13334 ns 10895.5 ns 1.22
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11645.5 ns 10459 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 771416.5 ns 770540.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 7239334 ns
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 391255 ns 375452 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4500 ns 4104 ns 1.10
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5041.5 ns 7041 ns 0.72
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7042 ns 7166 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5500 ns 6166 ns 0.89
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 134989.5 ns 131485.5 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 1146875 ns
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 58260 ns 62607 ns 0.93
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7750 ns 7416.5 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7750 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 8125 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7709 ns 8083 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 738275 ns 737449 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 7536771 ns
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 386245 ns 380902 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14664541 ns 14481917 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10093041 ns 10107542 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10106791 ns 10094750 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27704625 ns 27859959 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 529053 ns 533975 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/Metal 22466021 ns
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 401266 ns 867906.5 ns 0.46
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46793583 ns 46387667 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33459958.5 ns 33363354 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33523667 ns 33478875 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85429125 ns 85752792 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2854223 ns 2651799 ns 1.08
batchedmm(128, Bsize=512)/zygote/GPU/Metal 89341312.5 ns
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3309294 ns 5191497.5 ns 0.64
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 188000 ns 185208.5 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 186250 ns 185916 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 188667 ns 188604 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 185938 ns 187271 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 101713 ns 117719.5 ns 0.86
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1484500 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 235268 ns 236051 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 641812.5 ns 634875 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 636958 ns 627937.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 589208 ns 601166 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 591771 ns 587625 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 704450.5 ns 694993 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7517417 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 785986 ns 698169.5 ns 1.13
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 750 ns 625 ns 1.20
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 750 ns 584 ns 1.28
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 667 ns 584 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32067 ns 31826 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 651375 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47241 ns 48104.5 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9979 ns 9541 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11521 ns 9687.5 ns 1.19
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10188 ns 10542 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9500 ns 10938 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 276358.5 ns 276120 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5875459 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 374075 ns 371078 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26291 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26291 ns 26333 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26500 ns 26583 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26209 ns 26458 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23479 ns 22942 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 437083 ns
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 210433 ns 206526 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67042 ns 67125 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 68833 ns 67333 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 68917 ns 68792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67583 ns 66875 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 274089 ns 273858 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 2210459 ns
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 606899 ns 554115 ns 1.10
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204500 ns 207166 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210417 ns 211667 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 211125 ns 211167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200125 ns 202875 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27585 ns 27563 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 861208 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 205157.5 ns 206546 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 652542 ns 609937.5 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671541 ns 669750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 624208 ns 664812.5 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 580625 ns 609042 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 236486 ns 233231.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9239500 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 837472 ns 798562 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 650083 ns 664875 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 650625 ns 636687.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 550709 ns 648791.5 ns 0.85
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 652708 ns 629792 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 186884 ns 185894.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1405750 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 234974 ns 349393 ns 0.67
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2244125 ns 2244229 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2249625 ns 2225354 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2253687.5 ns 2256708 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2232292 ns 2271792 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 908141 ns 900927 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9610291 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1356860 ns 1235829 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19479 ns 19333 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20020.5 ns 21166.5 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22000 ns 22375 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20500 ns 19958 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 107405.5 ns 106770.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1497959 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 82031 ns 89387 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 259687.5 ns 227250 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 234896 ns 262312.5 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223354.5 ns 231250 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222104 ns 222770.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 701938 ns 700957 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7694083.5 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 552123 ns 516550 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 750 ns 584 ns 1.28
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 750 ns 584 ns 1.28
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 667 ns 584 ns 1.14
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22889 ns 22928 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 713250.5 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 47681 ns 44243 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10833 ns 9583 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 11458 ns 9958.5 ns 1.15
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10958 ns 13229.5 ns 0.83
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 11333 ns 10875 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 258094.5 ns 258192 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6601250 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 398396 ns 395479 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8021 ns 8062.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7916.5 ns 9208 ns 0.86
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10479 ns 10459 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7771 ns 8333 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 114650.5 ns 112863.5 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 1128833 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 67611 ns 72315 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 7500 ns 1.15
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9459 ns 7750 ns 1.22
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9334 ns 14875 ns 0.63
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10083 ns 8917 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 474110.5 ns 472419 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4853125 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 322085 ns 321811 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2104.5 ns 1979.5 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2375 ns 2500 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2667 ns 2542 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2125 ns 2416 ns 0.88
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19503 ns 19845 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 435896 ns
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 189822 ns 191508 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 7666.5 ns 6666 ns 1.15
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 7083 ns 6459 ns 1.10
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 7771 ns 7292 ns 1.07
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 8417 ns 7292 ns 1.15
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 209638.5 ns 208409 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 2304438 ns
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 579508 ns 543621 ns 1.07
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749167 ns 754167 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 749833.5 ns 751000 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 747292 ns 749375 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 748521 ns 747104 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 22733 ns 22303 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 312604 ns
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 37375.5 ns 47829 ns 0.78
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 778000 ns 792250 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 807229 ns 811750 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 774167 ns 789500 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 776625 ns 794229.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 207826 ns 206590.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2597208 ns
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 220633 ns 233541 ns 0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7209 ns 7250 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5917 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 6000 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10209 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32931 ns 32976 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 855708.5 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50540 ns 57267 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 262833 ns 228458.5 ns 1.15
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 263396 ns 269270.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229333 ns 235021 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212854 ns 213146 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 255573 ns 254662 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8358834 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 524047.5 ns 552652 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12083 ns 12417 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11959 ns 13250 ns 0.90
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13583 ns 14458 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12771 ns 13000 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 132456 ns 131273.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 1189125 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 233113 ns 231363 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25021 ns 24854.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25500 ns 24916 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25458 ns 25542 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24792 ns 24458 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 815326 ns 813324 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 7701292 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 681611 ns 634495 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9562.5 ns 8875 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9833 ns 9958 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12000 ns 11167 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9541.5 ns 9542 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 118599 ns 116553 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 1229416 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 74341 ns 74930 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14375 ns 13770.5 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 20917 ns 14917 ns 1.40
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17250 ns 15916 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15562.5 ns 16437.5 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 626256 ns 621843 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5717062 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 368145 ns 356836 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9270.5 ns 9145.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9208 ns 9354 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11042 ns 10750 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9145.5 ns 10125 ns 0.90
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 117653 ns 116468 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 1158958 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 73341 ns 74383.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 14062.5 ns 12916 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 15125 ns 12959 ns 1.17
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15125 ns 20541 ns 0.74
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 15146 ns 14500 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 518369.5 ns 515709 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5051833 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 340775 ns 328534 ns 1.04
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 27708 ns 31062 ns 0.89
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 33875 ns 33146 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 31792 ns 30750 ns 1.03
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2229.5 ns 1833 ns 1.22
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16522 ns 16169 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/Metal 4854041.5 ns
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 78412 ns 77564 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5583 ns 5562.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5917 ns 5312.5 ns 1.11
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 6084 ns 7208 ns 0.84
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 7770.5 ns 7834 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 136257 ns 134922 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/Metal 13273333 ns
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 379326 ns 340125 ns 1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24751 ns 24307 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 682541.5 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 48791 ns 45845 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7520.5 ns 6166.5 ns 1.22
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8583 ns 6708 ns 1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8625 ns 8167 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7458.5 ns 7083 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 181857 ns 179926.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 6285375 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389326 ns 372385.5 ns 1.05
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5708 ns 5834 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6208 ns 5833 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6000 ns 5875 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5958 ns 5958 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25394 ns 25187 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 714417 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 207474 ns 201636 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26375 ns 21041 ns 1.25
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 23250 ns 21709 ns 1.07
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21459 ns 23458 ns 0.91
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 20250 ns 26125 ns 0.78
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 262619.5 ns 262884 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6644125 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 695681 ns 615780.5 ns 1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145625 ns 192083.5 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 178292 ns 158917 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150417 ns 154416.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 153812.5 ns 146417 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 188204 ns 184640 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1588584 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 190633 ns 215472.5 ns 0.88
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1345771 ns 1319792 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1331542 ns 1328249.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1322333.5 ns 1347250 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1167354 ns 1337000 ns 0.87
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 856737 ns 844907 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9165250 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 997975 ns 1041340 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24250 ns 24292 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24458.5 ns 24916 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27084 ns 28000 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24417 ns 24833.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 225455 ns 224694.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1705354 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115742 ns 130334 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 127500 ns 117583 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 174187 ns 131375 ns 1.33
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 119042 ns 160499.5 ns 0.74
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 130375 ns 164750 ns 0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 984493 ns 967206 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8679292 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 591319 ns 585053 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22641 ns 22932 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 689208 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 47290 ns 47870 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7083.5 ns 6292 ns 1.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8083 ns 6833 ns 1.18
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6958 ns 9416 ns 0.74
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6500 ns 7500 ns 0.87
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 197931.5 ns 196587.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6549187.5 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 395326.5 ns 380031 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6333.5 ns 5875 ns 1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5708 ns 6292 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7541 ns 7187.5 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6000 ns 6562 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 137058.5 ns 134586 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 1181916.5 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 232733 ns 230170 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10833.5 ns 9833 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10583 ns 10000 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10416 ns 11187.5 ns 0.93
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9792 ns 11083 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 841858 ns 840176 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 8090729 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 672580 ns 631290 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1542 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22927 ns 22272 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 429250 ns
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 208003 ns 204933 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5917 ns 5750 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6375 ns 6125 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6125 ns 6417 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5750 ns 5875 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 217549 ns 216977 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 2167125 ns
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 581914.5 ns 491814.5 ns 1.18
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8562 ns 8250 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8458 ns 8562.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10291.5 ns 9895.5 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8229.5 ns 9209 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 116906 ns 115063 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 1209583 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 77271.5 ns 73999 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9104.5 ns 8167 ns 1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 15417 ns 9250 ns 1.67
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8792 ns 9833.5 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8084 ns 10333 ns 0.78
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 557267.5 ns 548589 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5634417 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 344656 ns 340367 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 125125 ns 127271 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 130729 ns 128750 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130250 ns 131062 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 181042 ns 181979.5 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46296.5 ns 46303.5 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/Metal 364354 ns
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 100232 ns 102121 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 309333 ns 338125 ns 0.91
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 342125 ns 339792 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 313833 ns 346083 ns 0.91
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 570709 ns 595417 ns 0.96
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 185266 ns 181951 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/Metal 1373875 ns
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 506148 ns 410627.5 ns 1.23
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396437.5 ns 397708 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 289000 ns 288375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288375 ns 287937.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756250 ns 756708 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43482.5 ns 43092 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 434458 ns
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 79761 ns 85671 ns 0.93
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1408916.5 ns 1456291.5 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136979 ns 1133125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1132062 ns 1127937.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2443000.5 ns 2360208 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 248184 ns 248595.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1965375 ns
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 349476 ns 266317 ns 1.31
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 645500 ns 643479.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 650562.5 ns 654166 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 546541.5 ns 652750 ns 0.84
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 545645.5 ns 650625 ns 0.84
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 173484 ns 172424.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1350375 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 242424 ns 315089 ns 0.77
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2520666.5 ns 2449417 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2473750 ns 2455020.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2447792 ns 2465625 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2452584 ns 2469208.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 937381.5 ns 922065 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10132041 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1450713 ns 1363193.5 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 30500 ns 32917 ns 0.93
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 36187.5 ns 35374.5 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34146 ns 34417 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 958 ns 1000 ns 0.96
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15458 ns 15534 ns 1.00
batchedmm(2, Bsize=32)/forward/GPU/Metal 1293854 ns
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 71001 ns 78366 ns 0.91
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3084 ns 2937.5 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3958 ns 3375 ns 1.17
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3333 ns 5208 ns 0.64
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3042 ns 4625 ns 0.66
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 135380 ns 133935.5 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/Metal 5260562.5 ns
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 340585.5 ns 318886 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1460666 ns 1464209 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1503375 ns 1500333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1503000 ns 1501333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1441729 ns 1442563 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41871 ns 41738 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1242250 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 239254 ns 318625 ns 0.75
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5151979 ns 5128625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5296833.5 ns 5291041 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5285437.5 ns 5297084 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4980042 ns 4998791.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 230225 ns 230499.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11359208.5 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1233400 ns 1198280 ns 1.03
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3709 ns 3709 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3750 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3750 ns 3916 ns 0.96
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33654 ns 33583 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 352750 ns
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 39741 ns 36778.5 ns 1.08
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15041 ns 15417 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15709 ns 15500 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15500 ns 15791 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15375 ns 16000 ns 0.96
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 251748 ns 252278 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 1635667 ns
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 165632 ns 161662 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 401812.5 ns 404625 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 296666 ns 296000 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295167 ns 295916 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760709 ns 760625 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113125 ns 113161.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 574187 ns
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 87471 ns 95859 ns 0.91
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1429500 ns 1479249.5 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1159833 ns 1158584 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1157541 ns 1160500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2466395.5 ns 2383354 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 235512 ns 228888 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1507125 ns
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 353405 ns 265922 ns 1.33
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 959 ns 958 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1042 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 958 ns 1083 ns 0.88
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24950 ns 24404 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 692770.5 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 208254 ns 207859 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 7917 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9916 ns 8542 ns 1.16
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 9917 ns 0.87
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8042 ns 12895.5 ns 0.62
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 202658.5 ns 202191 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6448187.5 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 697032 ns 620871 ns 1.12
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 831021 ns 835834 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 619667 ns 615542 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 618250 ns 617791.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1541417 ns 1549375 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/CUDA 131643 ns 130350.5 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/Metal 1716917 ns
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 166023 ns 215532 ns 0.77
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2699312.5 ns 2690375 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1995500 ns 2000479.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1985791 ns 2007416.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4946958 ns 4941104 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 234057 ns 232712 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/Metal 6761458 ns
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 852834 ns 872871.5 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 291 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32746 ns 31625 ns 1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 642249.5 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 47461 ns 47950 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6208 ns 6084 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9334 ns 6708 ns 1.39
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6708 ns 7666 ns 0.88
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6229 ns 8083 ns 0.77
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 223155 ns 221856.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 6000375 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 361916 ns 352319 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1731292 ns 1741791.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1754791 ns 1752167 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1728874.5 ns 1739042 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1745562.5 ns 1719916 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190073 ns 183055.5 ns 1.04
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1502437.5 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 353886 ns 415606.5 ns 0.85
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4404625 ns 4361125 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4422041 ns 4365916.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4362625 ns 4399333 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4346521 ns 4394333 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 855907 ns 827645.5 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9512792 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1246280 ns 1239667.5 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6875 ns 7083 ns 0.97
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 17395.5 ns 7395.5 ns 2.35
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7250 ns 7041 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6834 ns 6854.5 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 22751 ns 22223.5 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 272959 ns
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 37041 ns 47178 ns 0.79
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 33000 ns 45292 ns 0.73
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 68979.5 ns 51167 ns 1.35
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33333 ns 49250 ns 0.68
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 45500 ns 49437 ns 0.92
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 212527.5 ns 204846 ns 1.04
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2608042 ns
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 221728.5 ns 235841 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 23417 ns 22125 ns 1.06
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25542 ns 25125 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 23312.5 ns 24833 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5625 ns 5458.5 ns 1.03
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18456 ns 17859 ns 1.03
batchedmm(2, Bsize=512)/forward/GPU/Metal 14791020.5 ns
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 89826.5 ns 82154 ns 1.09
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11917 ns 11792 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 11125 ns 10750 ns 1.03
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10625 ns 12583 ns 0.84
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17958 ns 19708.5 ns 0.91
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 223372.5 ns 216235 ns 1.03
batchedmm(2, Bsize=512)/zygote/GPU/Metal 45999500 ns
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 382947 ns 331099 ns 1.16
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 403917 ns 406250 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297500 ns 297333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 297375 ns 296833.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762334 ns 762833 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47041 ns 46303.5 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 533542 ns
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89431 ns 97252 ns 0.92
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1426250 ns 1477458 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1164625 ns 1164395.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1163125 ns 1164416 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2468250 ns 2386333 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 281846 ns 268961 ns 1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2244750 ns
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 378111.5 ns 282959 ns 1.34
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1487625 ns 1488416 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1529979.5 ns 1526958 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1529729.5 ns 1529250 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1464667 ns 1466395.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54740 ns 52650 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1143667 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 235424 ns 326982 ns 0.72
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5146979 ns 5119459 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5286395.5 ns 5285084 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5251625 ns 5297709 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4982541.5 ns 4955208 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 258236.5 ns 250192 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10236958 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1218755 ns 1186136 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28375 ns 28292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28125 ns 28292 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28250 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28333 ns 28417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24960 ns 23514.5 ns 1.06
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 430583 ns
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 212483 ns 207227 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66375 ns 66542 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66542 ns 66750 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67000 ns 66500 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66584 ns 66208 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 344216.5 ns 333506.5 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 2732875 ns
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 652061 ns 576948.5 ns 1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 84500 ns 124875 ns 0.68
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 93000 ns 81875 ns 1.14
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85541 ns 89166 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81042 ns 86750 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190669 ns 191648 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2029208 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 183273 ns 233116 ns 0.79
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2023313 ns 2025145.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2010958 ns 2021978.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1979291.5 ns 2030542 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1995645.5 ns 1995125 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 520209.5 ns 506195 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9143521 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1082408 ns 881973 ns 1.23

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.