Skip to content

Commit

Permalink
fix: gracefully handle OneHotArrays (#1064)
Browse files Browse the repository at this point in the history
* fix: gracefully handle onehotarrays

* chore: apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* test: test for onehotarrays for reactant

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
avik-pal and github-actions[bot] authored Nov 10, 2024
1 parent ed0d75c commit 22cb59e
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 3 deletions.
2 changes: 1 addition & 1 deletion examples/HyperNet/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Lux = "1"
LuxCUDA = "0.3"
MLDatasets = "0.7"
MLUtils = "0.4"
OneHotArrays = "0.2"
OneHotArrays = "0.2.5"
Optimisers = "0.3.3, 0.4"
Setfield = "1"
Statistics = "1"
Expand Down
2 changes: 1 addition & 1 deletion examples/NeuralODE/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Lux = "1"
LuxCUDA = "0.3"
MLDatasets = "0.7"
MLUtils = "0.4"
OneHotArrays = "0.2"
OneHotArrays = "0.2.5"
Optimisers = "0.3.3, 0.4"
OrdinaryDiffEqTsit5 = "1"
SciMLSensitivity = "7.63"
Expand Down
5 changes: 4 additions & 1 deletion lib/MLDataDevices/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MLDataDevices"
uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.5.0"
version = "1.5.1"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Expand All @@ -18,6 +18,7 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Expand All @@ -35,6 +36,7 @@ MLDataDevicesFillArraysExt = "FillArrays"
MLDataDevicesGPUArraysExt = "GPUArrays"
MLDataDevicesMLUtilsExt = "MLUtils"
MLDataDevicesMetalExt = ["GPUArrays", "Metal"]
MLDataDevicesOneHotArraysExt = "OneHotArrays"
MLDataDevicesReactantExt = "Reactant"
MLDataDevicesRecursiveArrayToolsExt = "RecursiveArrayTools"
MLDataDevicesReverseDiffExt = "ReverseDiff"
Expand All @@ -55,6 +57,7 @@ Functors = "0.4.8"
GPUArrays = "10, 11"
MLUtils = "0.4.4"
Metal = "1"
OneHotArrays = "0.2.5"
Preferences = "1.4"
Random = "1.10"
Reactant = "0.2.4"
Expand Down
17 changes: 17 additions & 0 deletions lib/MLDataDevices/ext/MLDataDevicesOneHotArraysExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module MLDataDevicesOneHotArraysExt

using Adapt: Adapt
using MLDataDevices: MLDataDevices, Internal, ReactantDevice, CPUDevice
using OneHotArrays: OneHotArray

# Device queries on a `OneHotArray` are answered by querying its `indices` field.
Internal.get_device(x::OneHotArray) = Internal.get_device(x.indices)
Internal.get_device_type(x::OneHotArray) = Internal.get_device_type(x.indices)

# Reactant doesn't play very nicely with OneHotArrays at the moment, so the array is
# first adapted to the CPU, converted to a plain `Array`, and only then handed to `dev`.
function Adapt.adapt_structure(dev::ReactantDevice, x::OneHotArray)
    dense = convert(Array, Adapt.adapt_structure(CPUDevice(), x))
    return Adapt.adapt_storage(dev, dense)
end

end
2 changes: 2 additions & 0 deletions lib/MLDataDevices/test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
Expand All @@ -30,6 +31,7 @@ FillArrays = "1"
ForwardDiff = "0.10.36"
Functors = "0.4.8"
MLUtils = "0.4"
OneHotArrays = "0.2.5"
Pkg = "1.10"
Random = "1.10"
RecursiveArrayTools = "3.8"
Expand Down
22 changes: 22 additions & 0 deletions lib/MLDataDevices/test/misc_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ using ReverseDiff, Tracker, ForwardDiff
using SparseArrays, FillArrays, Zygote, RecursiveArrayTools
using Functors: Functors

# Backend group under test: the `BACKEND_GROUP` environment variable, lowercased
# ("none" when the variable is unset).
const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "none"))

@testset "Issues Patches" begin
@testset "#10 patch" begin
dev = CPUDevice()
Expand Down Expand Up @@ -219,3 +221,23 @@ end

@test only(Zygote.gradient(x -> sum(abs2, gdev(x)), x')) isa Matrix{Float64}
end

# One-hot arrays must report the correct device on the CPU, after being moved with
# `gpu_device()`, and (when the Reactant backend is selected) after being moved with
# `reactant_device()`.
@testset "OneHotArrays" begin
    using OneHotArrays

    onehot = onehotbatch("abracadabra", 'a':'e', 'e')
    @test get_device(onehot) isa CPUDevice

    accel = gpu_device()
    onehot_accel = accel(onehot)
    @test get_device(onehot_accel) isa parameterless_type(typeof(accel))

    if BACKEND_GROUP in ("none", "reactant")
        using Reactant

        # On a Reactant device the one-hot array is expected to become a Bool matrix.
        reactant_dev = reactant_device()
        onehot_reactant = reactant_dev(onehot)
        @test get_device(onehot_reactant) isa ReactantDevice
        @test onehot_reactant isa Reactant.ConcreteRArray{Bool, 2}
    end
end

3 comments on commit 22cb59e

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/MLDataDevices

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/119084

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a MLDataDevices-v1.5.1 -m "<description of version>" 22cb59e55103cb47e31ce2082ec9d115db5c0046
git push origin MLDataDevices-v1.5.1

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 22cb59e Previous: ed0d75c Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4584 ns 4375 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4917 ns 4583 ns 1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5666 ns 8042 ns 0.70
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4042 ns 4125 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60487 ns 60754 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10167 ns 10083 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11000 ns 10208 ns 1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10542 ns 11292 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10542 ns 10625 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 424703 ns 429099 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1125 ns 1083 ns 1.04
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1166 ns 1083 ns 1.08
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1292 ns 1333 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1125 ns 3583 ns 0.31
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18464 ns 18440 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4000 ns 4166 ns 0.96
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4000 ns 4000 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4208 ns 4334 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4083 ns 3958 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 109915.5 ns 112468 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57375 ns 57792 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38250 ns 46125 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46375 ns 46167 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81584 ns 81125 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37506 ns 38404 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2012792 ns 2028916 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2093417 ns 2086083 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2086646 ns 2090541.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2000208 ns 1986270.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 197705 ns 199754 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 147000 ns 144083 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143145.5 ns 146458 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149666 ns 147167 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 147229.5 ns 145750 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168379 ns 166912 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1012208 ns 1116416 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1152209 ns 1112187 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1110709 ns 1123395.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1119500 ns 1104416 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 522581.5 ns 523848 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4834 ns 3500 ns 1.38
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3792 ns 3541.5 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4667 ns 4437.5 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3958 ns 3125 ns 1.27
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 65957 ns 67435 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 9083 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9292 ns 9500 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9459 ns 10000 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8500 ns 9250 ns 0.92
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 469308.5 ns 492306 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18167 ns 16646 ns 1.09
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15625 ns 14937.5 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18917 ns 18292 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16583 ns 15000 ns 1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 52878 ns 54321 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 252312.5 ns 213937.5 ns 1.18
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215959 ns 212729.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214625 ns 214000 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214583 ns 213500 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 267130 ns 273554 ns 0.98
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 584 ns 500 ns 1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 583 ns 500 ns 1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 708 ns 750 ns 0.94
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 666 ns 0.75
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17462 ns 17546 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1500 ns 1375 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1459 ns 1500 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1750 ns 1875 ns 0.93
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1417 ns 1625 ns 0.87
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 100800 ns 104313 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 6792 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5125 ns 5875 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5875 ns 5875 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9833 ns 9916 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23225 ns 24032 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 259792 ns 221667 ns 1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232458.5 ns 228854.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229520.5 ns 229292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221875 ns 213000 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 166055.5 ns 170365 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3834 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3875 ns 3916 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3833 ns 3875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3833 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23597 ns 23655 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 17125 ns 16666 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16541 ns 16917 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16833 ns 17041 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16667 ns 16750 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 160583 ns 163843.5 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 576583 ns 577166 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 581541 ns 578250 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 573687.5 ns 598583 ns 0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 575750 ns 577750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113170 ns 113312 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1423250 ns 1420667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1430791.5 ns 1422916 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1431000 ns 1452041 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1421792 ns 1416667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 207811 ns 214659 ns 0.97
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1075667 ns 1068354.5 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 948313 ns 970500 ns 0.98
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1346646 ns 1344437.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1310750 ns 1302166 ns 1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA 270367.5 ns 275365.5 ns 0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5995500.5 ns 5882458 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4593750 ns 4543542 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4976208.5 ns 4907291 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5505395.5 ns 5670583 ns 0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1090295.5 ns 1092293.5 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23458 ns 23682 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2167 ns 2083 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2167 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2208 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 172926 ns 173257 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6458 ns 5625 ns 1.15
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5125 ns 5834 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7208 ns 7166 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4333 ns 5500 ns 0.79
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 64432 ns 65459 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11458 ns 11625 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11583 ns 11645.5 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11958 ns 12187.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10833 ns 11167 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 442914.5 ns 447697 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7792 ns 7625 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7041 ns 6875 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8458 ns 8500 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6375 ns 6750 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 51253.5 ns 52309 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17833.5 ns 17292 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18000 ns 18333 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18542 ns 18792 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16875 ns 17708 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 298470 ns 302468 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32349 ns 33300 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8875 ns 8541.5 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9333 ns 9083 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9291 ns 9625 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8875 ns 8750 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 157321.5 ns 161219 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64792 ns 64333 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64667 ns 64687.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64750 ns 64375 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64375 ns 63917 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111151 ns 112623.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 275916 ns 288354.5 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 293917 ns 281500 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 291666 ns 277333.5 ns 1.05
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 274417 ns 282958.5 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 183162.5 ns 187466 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3323375 ns 3321750 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 2861812 ns 3035042 ns 0.94
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3049625 ns 3019750 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3939000 ns 3935396 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 580012.5 ns 580332 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7623333 ns 7600458.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7263625 ns 7434083 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7327354 ns 7457646 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8196041 ns 8171125 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1311084.5 ns 1357218 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18847291 ns 18821167 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19137541 ns 19108667 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19205875 ns 19164083 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15425792 ns 15675625 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23654958 ns 23766000 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43401291.5 ns 33980750 ns 1.28
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37089791.5 ns 36953187.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34880750 ns 34917125 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1841996 ns 1857952 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 188777125 ns 188752208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 178489062.5 ns 164370333 ns 1.09
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 152827958 ns 153098083 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 438354958 ns 437817875 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13884864 ns 13921239 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 289730542 ns 289969375 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 273653750 ns 340002146 ns 0.80
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 300146084 ns 299659604.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 363130458 ns 336790458 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24959 ns 23709 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23166 ns 23958 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26250 ns 25375 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21541 ns 23666 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 93319 ns 95315 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 104333 ns 103625 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 104208 ns 103250 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104041 ns 106729 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103292 ns 103291 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 494914.5 ns 500314.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7375 ns 7167 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7062.5 ns 6667 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8083 ns 7667 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6959 ns 6500 ns 1.07
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 66496.5 ns 67818 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15333 ns 15042 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16334 ns 16208 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15958 ns 16750 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14750 ns 15125 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 467266 ns 474547 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3009270.5 ns 2918125 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2083125 ns 2093292 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2291250 ns 2262166.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4920209 ns 4794666.5 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 585803 ns 587541 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23529584 ns 23488583 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18299083 ns 18018687.5 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17952042 ns 18014917 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35984709 ns 35776875 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3109259 ns 3117851 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33275020.5 ns 33272812.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28041667 ns 27626875 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27515834 ns 27533833 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41779084 ns 41814417 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75459 ns 74625 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 81146 ns 74292 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76416.5 ns 75666 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72291 ns 74417 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 100380 ns 100380.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 285041.5 ns 292896 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 311542 ns 222583 ns 1.40
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 292833 ns 210416.5 ns 1.39
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 315375 ns 205792 ns 1.53
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 544347 ns 540203.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12667 ns 12583 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12771 ns 12333 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13750 ns 13291 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12083 ns 12791 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 70337.5 ns 70496 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27042 ns 26917 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27625 ns 27125 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27708 ns 28125 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26875 ns 26625 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 473629 ns 470516 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13083 ns 12917 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13250 ns 13084 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14833 ns 13666 ns 1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13125 ns 12667 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 52795 ns 51912 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26250 ns 25667 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26750 ns 25875 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28792 ns 26625 ns 1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26167 ns 26167 ns 1
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 304928.5 ns 301936 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181791 ns 180583 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 181750 ns 181729 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184875 ns 183250 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 181833 ns 179708.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 56540.5 ns 55883 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 615187.5 ns 593354.5 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 620771.5 ns 590687.5 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 583541 ns 591291.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 595499.5 ns 584292 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 285956 ns 283120 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6958 ns 7208 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7083 ns 7042 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8041 ns 7875 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6375 ns 6875 ns 0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 70068.5 ns 69418.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14375 ns 14208 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15333 ns 15042 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15333 ns 15584 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14500 ns 14042 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 463652.5 ns 454550 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1234312.5 ns 1165604 ns 1.06
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1279667 ns 1224917 ns 1.04
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1269833.5 ns 1272500 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1312458 ns 1318479 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301465 ns 300980.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4127187.5 ns 4116000 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4510874.5 ns 4366375 ns 1.03
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4533354 ns 4511145.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4443687.5 ns 4453083 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1047444 ns 1040994 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1834 ns 1750 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23871 ns 23357 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4917 ns 4833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4917 ns 5083 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4959 ns 4959 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 190792.5 ns 187819.5 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7041.5 ns 6833 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6292 ns 6520.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9208 ns 6958 ns 1.32
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7166 ns 7292 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 56472 ns 53959.5 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11750 ns 11437.5 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11584 ns 11708 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11812.5 ns 12125 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10792 ns 10875 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 335267 ns 325142 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 333 ns 250 ns 1.33
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23092 ns 22864 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2958 ns 2750 ns 1.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2667 ns 2959 ns 0.90
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2667 ns 3042 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2708 ns 2750 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 161307 ns 158390 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 14395.5 ns 13292 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12333 ns 12333 ns 1
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14917 ns 15167 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 13145.5 ns 13542 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 56807.5 ns 55595.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25375 ns 24354.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25333 ns 24917 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24958 ns 25625 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25333 ns 24500 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 292514 ns 291342 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4125 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4166 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4125 ns 4167 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25065 ns 24702 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16333 ns 16208 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16000 ns 16292 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16250 ns 16375 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16167 ns 16084 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 198557.5 ns 196319.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5791 ns 5625 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5833 ns 5708 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5792 ns 5833 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 5667 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33912.5 ns 34095 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21041 ns 20542 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21250 ns 21000 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21395.5 ns 21375 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21042 ns 20292 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 176321 ns 175766 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 408208 ns 399250 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 363583.5 ns 379792 ns 0.96
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 492667 ns 489500 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 523542 ns 532604.5 ns 0.98
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67347 ns 66554 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 978667 ns 963624.5 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 891000.5 ns 856312.5 ns 1.04
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1242958 ns 1230417 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1420417 ns 1311562 ns 1.08
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 190609 ns 191675 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82666 ns 82792 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82709 ns 80875 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85834 ns 84083.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 133542 ns 82521 ns 1.62
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193457 ns 192735.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1923750 ns 1915250 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1936250.5 ns 1909021 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1914520.5 ns 1928396.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1920083 ns 1912916.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 399634.5 ns 406192 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22639 ns 22043 ns 1.03
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1917 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 174147.5 ns 170121.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8542 ns 6083 ns 1.40
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7292 ns 6875 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9083 ns 9000 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6541 ns 8375 ns 0.78
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60578 ns 60029.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9542 ns 8875 ns 1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9479.5 ns 9333 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9542 ns 9667 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9541 ns 9333 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 313158.5 ns 304727 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120031270.5 ns 121958916.5 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181860604 ns 173853917 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147859583 ns 147607125 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 107036271 ns 103815750 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5506155 ns 5473827 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 615708666.5 ns 616983729 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 581207833 ns 554296083 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 450770312.5 ns 450713625 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 758274833.5 ns 754890083 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34927722 ns 38208156 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 650246750 ns 651694958 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 685688396 ns 668848604.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 577502729 ns 588116250 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 743657333 ns 750086792 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59167 ns 59458 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39333 ns 47459 ns 0.83
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47625 ns 47791 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83542 ns 83958 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38483 ns 37381 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1924917 ns 1929916 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1972334 ns 1974333 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1976458 ns 1985833.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1895208 ns 1859375 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 176241.5 ns 173072.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 270958 ns 269020.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 269042 ns 268292 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 270875 ns 270292 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 267958 ns 267000 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 128472 ns 127268.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 682312.5 ns 587354 ns 1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 684021 ns 693917 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 678333 ns 589417 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 683083 ns 581937.5 ns 1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 712823 ns 661051 ns 1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2110062.5 ns 2096834 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2217708.5 ns 2094750 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2221875 ns 2206250 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2230541 ns 2187125 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134372 ns 133017 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5507000 ns 5491854 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5539625 ns 5493249.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5512958 ns 5515395.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5509604 ns 5510750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 755964 ns 703282.5 ns 1.07
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 638125 ns 645833 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 651667 ns 648583 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 638459 ns 647708 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 647208 ns 650958 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47881 ns 46809.5 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1826416 ns 1821000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1675750 ns 1727666.5 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1720875 ns 1746292 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2104000 ns 2100333 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 224321 ns 221775 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58208 ns 58333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38792 ns 47000 ns 0.83
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46584 ns 46333 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83542 ns 83625 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29060 ns 28634 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031958 ns 2035667 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2100291.5 ns 2083749.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2085291 ns 2091208 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2007250 ns 1962146 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 191693.5 ns 189516.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13371646.5 ns 13367687 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12465792 ns 12447041.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12501042 ns 12572666 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15188916 ns 15028854.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 510743.5 ns 514202 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47270208 ns 47344292 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 42049416.5 ns 41848500.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41051834 ns 40966542 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58110084 ns 58373500 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3204565.5 ns 3196704 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 96634583 ns 73579562.5 ns 1.31
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 91624583 ns 91406041.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90630541 ns 90565250 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 98906458.5 ns 76782875 ns 1.29
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58500 ns 59000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38709 ns 46917 ns 0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47125 ns 47333 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83541 ns 82792 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47960 ns 46869 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920000 ns 1927021 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1969792 ns 1714541 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1972500 ns 1977833.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1889834 ns 1884375 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 192720 ns 191330 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 417 ns 416 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 31940 ns 32378 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6750 ns 6209 ns 1.09
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6625 ns 6541 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6583 ns 6959 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6250 ns 6333 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 171690.5 ns 168990.5 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31426 ns 31794 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2833 ns 2625 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2792 ns 2958 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2834 ns 2916 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 160271 ns 156010.5 ns 1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 287478708.5 ns 287628499.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 347117687.5 ns 340509375 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 313742875 ns 315088770.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 271337417 ns 267551959 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7120485.5 ns 7063426 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 999672583 ns 1001322625 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 962585125 ns 944816500 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 847863396 ns 856957937.5 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1159606875 ns 1159027042 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34018012.5 ns 34074066 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1668327625 ns 1313285104.5 ns 1.27
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1694566583 ns 1697633000 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1646047208 ns 1638900292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1665789292 ns 1318281104 ns 1.26
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1415313 ns 1410625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1417167 ns 1407792 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1417459 ns 1412125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1412583 ns 1407708 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128511 ns 127251.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5021792 ns 5023916.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5044792 ns 5011417 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5021250 ns 5023520.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5024292 ns 5000791 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 495850 ns 530171 ns 0.94
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 169190166 ns 168946208 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 179239187.5 ns 132469479 ns 1.35
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 128995104.5 ns 121413791.5 ns 1.06
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 162929271 ns 161625979.5 ns 1.01
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4883493 ns 4880231 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 671536958 ns 827387542 ns 0.81
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 604481292 ns 641003042 ns 0.94
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 531751292 ns 530713834 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 681136250 ns 675863750 ns 1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16104554 ns 16337541 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8980854 ns 9000792 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8853334 ns 8791292 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7886771 ns 7890062.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10140625 ns 10164708.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1602269.5 ns 1595476 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36048625 ns 36017917 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37859417 ns 36663604 ns 1.03
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33187042 ns 33249104.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 39063937.5 ns 38766083.5 ns 1.01
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 8827671 ns 6515962 ns 1.35
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47666 ns 47250 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47667 ns 47500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47625 ns 47625 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47542 ns 47416.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18332 ns 18983.5 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50416 ns 50250 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50500 ns 50625 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50541 ns 50625 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 53000 ns 50333 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 183394 ns 163825 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7833 ns 8084 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7500 ns 7125 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9375 ns 9667 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6979.5 ns 8458 ns 0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 85722.5 ns 77103.5 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 9375 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10500 ns 10042 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10625 ns 10583 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10167 ns 10000 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 484512.5 ns 461429.5 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9250 ns 7833 ns 1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6750 ns 6771 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9417 ns 9625 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7792 ns 8125 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 105586.5 ns 90885 ns 1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13083 ns 12979 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13250 ns 15208.5 ns 0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13458.5 ns 14041 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13417 ns 13791.5 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 467808.5 ns 419264.5 ns 1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 959 ns 1.13
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1042 ns 1042 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 31641 ns 32012 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8167 ns 7833 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8209 ns 8208 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8291 ns 8709 ns 0.95
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8125 ns 7958.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 195119.5 ns 189226.5 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 25167 ns 22875 ns 1.10
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23250 ns 23292 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23270.5 ns 23500 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23125 ns 23000 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18534 ns 18216 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 53062 ns 51917 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52375 ns 52792 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52500 ns 52875 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52708 ns 52416 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 252220 ns 223112 ns 1.13
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1400708 ns 1402916.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1409834 ns 1402812.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1399229.5 ns 1406208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1398458 ns 1403604.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194493.5 ns 195501 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5016000 ns 5020125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5040334 ns 5010437.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4993708.5 ns 5024792 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4643770.5 ns 5012270.5 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 597903.5 ns 554008 ns 1.08
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3046458 ns 2999416 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2118792 ns 2081166.5 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2287146 ns 2289708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4859250 ns 4780250 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 581676 ns 584316.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24338167 ns 24335979 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19105334 ns 18882500 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18916917 ns 18838542 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36315667 ns 36465395.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3195442 ns 3208033 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33985645.5 ns 34044458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28693250 ns 28308645.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27979104.5 ns 28011708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41435375 ns 41401312.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144577667 ns 144926583 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 142667333 ns 142567458 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 124796041.5 ns 123989708 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 174395646 ns 174263458.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22784954 ns 22544725 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 908417417 ns 1382510875 ns 0.66
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 866595875 ns 1114547875 ns 0.78
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 690147541 ns 1238644000 ns 0.56
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 679371625 ns 669081959 ns 1.02
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118837225 ns 118397054.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76312 ns 73750 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76708.5 ns 74062.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78062.5 ns 76666 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74292 ns 72500 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 239745 ns 212207.5 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 279187.5 ns 282500 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 297958 ns 285000 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 283125 ns 279062.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 265791.5 ns 244104.5 ns 1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1232585 ns 1165691.5 ns 1.06
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35449875 ns 35470229.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35824917 ns 35610729 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32070395.5 ns 32380708 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40877625 ns 40906104 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5847896 ns 5839614 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 147901291 ns 147585625 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 155872291 ns 153161750 ns 1.02
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 133368083 ns 134802500 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 286886250 ns 286930833 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34880972 ns 34862634 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121105063 ns 122086979 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181834292 ns 174426667 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147760625 ns 148037375 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 101356500 ns 103434500 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5478431 ns 5431132 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 473677042 ns 469930000 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 485888583.5 ns 467117250 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 437646959 ns 440742583 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 740881667 ns 740751583 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32245879 ns 35158004 ns 0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 707376812.5 ns 646279541 ns 1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 667253771 ns 655203979 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 576063750 ns 572927666 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 852206792 ns 850524375 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1266333 ns 1278041 ns 0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 788917 ns 973833 ns 0.81
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 969500 ns 990208 ns 0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2069208.5 ns 1941459 ns 1.07
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 586368.5 ns 582790.5 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2969541 ns 2969750 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2523083 ns 2465416 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2620708 ns 2613271 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3700583 ns 3708542 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1794949 ns 1712702 ns 1.05
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6640437.5 ns 6655667 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6484958 ns 6494792 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6451083 ns 6503416.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4447979 ns 4455333.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7292 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5334 ns 6000 ns 0.89
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6041 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 9917 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25270 ns 25550 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212167 ns 212437.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221000 ns 220625 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221125 ns 225708.5 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207458 ns 207291.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 252957 ns 255206 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 313894603.5 ns 315688312.5 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 280731020.5 ns 223172541 ns 1.26
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 185850791.5 ns 190206958.5 ns 0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 312245084 ns 311635083 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7682659 ns 7671660.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1079816500.5 ns 1080721667 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 989067125 ns 916406083 ns 1.08
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 810903834 ns 811802750 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1155211625 ns 1154276750 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26590890 ns 26458418 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7416.5 ns 5625 ns 1.32
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6209 ns 6333 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6917 ns 8125 ns 0.85
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5729.5 ns 5458 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 151351 ns 163515 ns 0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7604.5 ns 7042 ns 1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7542 ns 7833 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541 ns 7625 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7542 ns 7625 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 598449 ns 635993.5 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 541 ns 417 ns 1.30
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 541 ns 583 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 459 ns 458 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24254 ns 23629 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9416 ns 9000 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9292 ns 9583 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 10000 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9250 ns 8562.5 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 214013.5 ns 232549 ns 0.92
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352917 ns 350979 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 355083.5 ns 351250 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 350833 ns 351479.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 353583 ns 351771 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21515 ns 21199 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 828979 ns 821375 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 787167 ns 774000 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 774312.5 ns 814249.5 ns 0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 823875 ns 823854 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 271369.5 ns 305212.5 ns 0.89
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 338958.5 ns 333979 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 320167 ns 338146 ns 0.95
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 453291 ns 448750 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 331895.5 ns 336917 ns 0.99
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18690 ns 17454 ns 1.07
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 696291 ns 691709 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 744854.5 ns 748375 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1036229 ns 1025583 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 686042 ns 685896 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 234671 ns 284244 ns 0.83
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 361375 ns 353083.5 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 336417 ns 350875 ns 0.96
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 425792 ns 433770.5 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 377584 ns 379520.5 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22985 ns 22107 ns 1.04
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 760187 ns 754208 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 753000 ns 751666 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1084125 ns 1064541.5 ns 1.02
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 812791.5 ns 821250 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 215024 ns 223088.5 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3625 ns 3333 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3708 ns 3500 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3625 ns 3791 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3541 ns 3583 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 18002 ns 17921 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4291 ns 4208 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4583 ns 4541 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4500 ns 4375 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4541 ns 4458 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 239767 ns 285329 ns 0.84
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5687.5 ns 4458.5 ns 1.28
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4125 ns 4000 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4959 ns 6125 ns 0.81
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3875 ns 3333 ns 1.16
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 180564 ns 222015.5 ns 0.81
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8708 ns 8125 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8625 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8667 ns 9250 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8541 ns 8500 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1101874 ns 1221788 ns 0.90
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 208292 ns 202834 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209250 ns 210459 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209166.5 ns 209708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200375 ns 200375 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34680 ns 34798 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 649916 ns 628000 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 632250 ns 624875.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 621979 ns 633791 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632208 ns 628750 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 306075 ns 343726 ns 0.89
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 975416.5 ns 959833 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 936645.5 ns 938500 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 954895.5 ns 948167 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1290104.5 ns 1293584 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 206706.5 ns 207285 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4495416.5 ns 4498417 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4624208 ns 4485958.5 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4293833.5 ns 4301916.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6306792 ns 6237771 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 924556 ns 977432 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4667 ns 3541.5 ns 1.32
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4333 ns 4000 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5208 ns 6520.5 ns 0.80
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3125 ns 3270.5 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 201570 ns 218695 ns 0.92
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7625 ns 6916 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7958 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7333 ns 7770.5 ns 0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7125 ns 7084 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 964645.5 ns 1002409 ns 0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1660208.5 ns 1587875 ns 1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1158208 ns 1157417 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1364146 ns 1362500 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2354187 ns 2449833.5 ns 0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213379 ns 212742.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12376417 ns 12321000 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9587708.5 ns 9541250 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9262687 ns 9282937.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 17957375 ns 17977708 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1953093.5 ns 1958143 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17363667 ns 17266875 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14466208 ns 14360083 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14361333 ns 14299667 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21148875 ns 21040375.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 136479 ns 92000 ns 1.48
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90541.5 ns 88708 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 91959 ns 93500 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 88917 ns 90750 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126286 ns 126341 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2029396 ns 2026042 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2020021 ns 1932750 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2021541.5 ns 2038687.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2009791 ns 2023042 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 970059 ns 1038826 ns 0.93
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 348458 ns 342396 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 336521 ns 348521 ns 0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 399187.5 ns 398542 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 313500 ns 315291.5 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15421 ns 15350 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 709188 ns 702708 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 737750 ns 735833 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1023375 ns 1022333 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 643583 ns 642041 ns 1.00
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 185776.5 ns 195993 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7250 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 6000 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 5875 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9959 ns 10000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33229 ns 34698 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223750 ns 212229.5 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228166.5 ns 221334 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220459 ns 223542 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215250 ns 214291 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 289979.5 ns 330238.5 ns 0.88
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3667 ns 3625 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3667 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22473 ns 23068 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14375 ns 14459 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14250 ns 14416 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14417 ns 14375 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14500 ns 14416 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 454491 ns 475565.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 93937.5 ns 95667 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 96000 ns 93084 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 95750 ns 98228.5 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 94229 ns 142542 ns 0.66
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125724.5 ns 125649 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921562.5 ns 1928708.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1938875 ns 1918770.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1920667 ns 1920229.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1918854.5 ns 1922417 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 949972 ns 986877 ns 0.96
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 886792 ns 870500 ns 1.02
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 812958 ns 822542 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1228020.5 ns 1224667 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 961021 ns 959708.5 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 266393 ns 280652 ns 0.95
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2837791.5 ns 2768000 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2523625 ns 2463583 ns 1.02
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3323459 ns 3332229 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3391708 ns 3408208 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1589685.5 ns 1607675.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17625 ns 17166.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15833 ns 15250 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18750 ns 20000 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15833 ns 17125 ns 0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 140920 ns 144011 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216604.5 ns 259584 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 223875 ns 223937.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216062.5 ns 217584 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 257042 ns 256291.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 635870.5 ns 639995.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 227209 ns 220833 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 220833 ns 220916 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 223271 ns 223833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 219541 ns 222000 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 267876.5 ns 270421 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 523334 ns 506875 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 557334 ns 560584 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 498187.5 ns 503459 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 540416 ns 524208 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1349491 ns 1352767 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 334459 ns 328125 ns 1.02
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 317417 ns 340334 ns 0.93
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 364250 ns 381292 ns 0.96
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 320791 ns 328083.5 ns 0.98
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16596.5 ns 16321 ns 1.02
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 715750.5 ns 715937.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 735750 ns 730917 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1025729.5 ns 1019083 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 657937.5 ns 654375 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 193892 ns 194891 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17667 ns 17625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17417 ns 17833 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20583.5 ns 20625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16833 ns 18500 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144720.5 ns 144691 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212749.5 ns 211958 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212500 ns 211770.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213583 ns 213792 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 223229 ns 224209 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 930226.5 ns 927744 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7458 ns 6083 ns 1.23
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5000 ns 5333 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7458 ns 7333 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6000 ns 6375 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 229973.5 ns 195900.5 ns 1.17
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10750 ns 10125 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10604 ns 10959 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11000 ns 11167 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10458 ns 10250 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1052874 ns 1052511.5 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4167 ns 3084 ns 1.35
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4145.5 ns 3500 ns 1.18
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5250 ns 6208 ns 0.85
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 2834 ns 2833 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 236091.5 ns 237907.5 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7625 ns 7125 ns 1.07
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7875 ns 7833 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7750 ns 8000 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7209 ns 7375 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1061672 ns 1089693 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23478292 ns 23765688 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43131583 ns 34116708 ns 1.26
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37763437.5 ns 37548791 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34891125.5 ns 34871603.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1856489 ns 1848733 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184985667 ns 184574541 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 171828500 ns 158637292 ns 1.08
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146459896 ns 146492249.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 412533125 ns 412825666 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16498145 ns 16516963.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 426401458 ns 428413125 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 257893209 ns 245145812.5 ns 1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 231907209 ns 232582250 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 482223334 ns 482166083 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183271 ns 182208 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183354.5 ns 182167 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 186750 ns 187187.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182250 ns 184125 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 202451.5 ns 228276.5 ns 0.89
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 589375 ns 596834 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 596958.5 ns 635292 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 589000 ns 596083.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632167 ns 631333.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1041439 ns 1083975 ns 0.96
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3849562 ns 3833209 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3881896 ns 3802458.5 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3464521 ns 3469041.5 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5356333 ns 5353229 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 536569.5 ns 538332.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17412625 ns 17399875 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17756875 ns 17212083 ns 1.03
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16608479 ns 16570812.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 22042750 ns 22200875 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2637828 ns 2641650 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 417 ns 1.40
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32430 ns 32777 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9875 ns 8792 ns 1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9500 ns 9437.5 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9750 ns 9875 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9145.5 ns 8750 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 267467.5 ns 268383.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 504434042 ns 505323209 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 458633542 ns 430592979 ns 1.07
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 381209021 ns 372723313 ns 1.02
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 671200875.5 ns 593451250 ns 1.13
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12484248 ns 12484712 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2048273395.5 ns 2052799291.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1661422833 ns 1634023584 ns 1.02
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1499198563 ns 1488799646 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2207989770.5 ns 2213028333.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49043755 ns 49227788.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1648062.5 ns 1616500 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1192292 ns 1177979 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1392792 ns 1381750 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2475542 ns 2503313 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 218335.5 ns 216079 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12753208 ns 12726146.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9970145.5 ns 9927958 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9709187 ns 9644812 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18405562.5 ns 18434917 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2007331 ns 2044196 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17672750 ns 17630500.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14774167 ns 14697728.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14626875 ns 14551916 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21434167 ns 21455334 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26208 ns 26208 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26209 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26209 ns 26375 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26208 ns 26167 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 24803 ns 24313 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66917 ns 67125 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66833 ns 66958 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67875 ns 68417 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66750 ns 66833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 397350.5 ns 408721.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204750 ns 203333 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209167 ns 209334 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209917 ns 210375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200042 ns 198959 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26341 ns 26930 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 612792 ns 606708 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 669042 ns 670042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 665479.5 ns 666042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 633646 ns 630875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 340366 ns 354960.5 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 656958 ns 604375 ns 1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 628166 ns 540500 ns 1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 637292 ns 645667 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 658854 ns 638000 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131658 ns 132024 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2236438 ns 2256750 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2302291.5 ns 2054209 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2233208.5 ns 2228916 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2244083.5 ns 2248437.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1141510 ns 1184553 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17708.5 ns 18000 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17875 ns 16917 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22791.5 ns 23250 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17812.5 ns 16917 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 143266.5 ns 146620 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 231271 ns 230417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 262583 ns 230104.5 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 262520.5 ns 260042 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 262167 ns 262271 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 974956 ns 1058952 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 583 ns 459 ns 1.27
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23116 ns 23956 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10167 ns 9250 ns 1.10
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9666 ns 9750 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10125 ns 10292 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10084 ns 9792 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 255373.5 ns 261926 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7125 ns 5625 ns 1.27
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6209 ns 6333 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7354.5 ns 8083 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5792 ns 7792 ns 0.74
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 224318.5 ns 235109 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7708 ns 7042 ns 1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7625 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541 ns 7792 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7209 ns 7459 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 798172.5 ns 809933 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2208.5 ns 1916 ns 1.15
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2291 ns 2208 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2209 ns 2416.5 ns 0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2167 ns 2125 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17921 ns 17938 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6875 ns 6542 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6500 ns 6625 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6750 ns 6708 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6708 ns 6709 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 329206 ns 333610.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749437.5 ns 746541 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 748917 ns 748729 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749541 ns 749667 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 751833.5 ns 747333.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21135 ns 21921 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 795541 ns 791604.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 788459 ns 776416.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 792916 ns 791708 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 791791.5 ns 790958 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 292229.5 ns 297538.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7209 ns 7250 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5833 ns 0.91
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5958 ns 3875 ns 1.54
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10166 ns 10042 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32459 ns 34255 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229666.5 ns 228167 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 239729.5 ns 227333 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 264354.5 ns 269083 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 255083.5 ns 254291.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 359407.5 ns 364412.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12770.5 ns 12334 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11125 ns 10417 ns 1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12792 ns 13041 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10541 ns 10416 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 243081 ns 249045.5 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25208 ns 24667 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24916 ns 24583 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25208 ns 26458.5 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24625 ns 24750 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1117079 ns 1131953 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106480583 ns 106536125 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 125655584 ns 117252875.5 ns 1.07
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120834166 ns 120784042 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117491666 ns 117622625 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2637704 ns 2632972 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393188541 ns 394299334 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 380341000 ns 367861709 ns 1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 357677834 ns 421890563 ns 0.85
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 481091583 ns 488934791 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15233085 ns 15275033 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 937085875 ns 759409875 ns 1.23
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 774220083 ns 757340667 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 745186000 ns 744212312.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 945237625.5 ns 765863541.5 ns 1.23
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8625 ns 7666 ns 1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7500 ns 6959 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8875 ns 8708 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7833 ns 7250 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 237576 ns 239312.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14250 ns 13542 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14375 ns 13875 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13916 ns 14875 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14083 ns 14000 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1078858 ns 1090703 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9125 ns 7625 ns 1.20
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7041 ns 7937.5 ns 0.89
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9083 ns 9583 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7042 ns 8459 ns 0.83
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 235440 ns 237956.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12916.5 ns 12125 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13208 ns 12666 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12792 ns 13375 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12708 ns 12417 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 787408.5 ns 799124.5 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 353104 ns 342375 ns 1.03
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 328604 ns 347229.5 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 398083 ns 395334 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 314250 ns 320000 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16719 ns 16875 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 711500 ns 702437.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 737000 ns 736167 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1029562.5 ns 1020125 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 649000 ns 650833.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 196298 ns 201562.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 291 ns 1.29
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23316 ns 23767 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6584 ns 6334 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6750 ns 6625 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6625 ns 6917 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6292 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 238133 ns 243851 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5708 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5916 ns 5792 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5792 ns 5708 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 23849 ns 25012 ns 0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21583 ns 23750 ns 0.91
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21542 ns 21416.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22395.5 ns 21875 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21250 ns 21395.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 259774.5 ns 265929 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 148459 ns 147104.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146500 ns 144042 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151167 ns 150708 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 149209 ns 145416.5 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168521.5 ns 167550.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1306312.5 ns 1335979.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1335292 ns 1317041.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1326333 ns 1303209 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1329459 ns 1321541 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1332341.5 ns 1351455.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25520.5 ns 24500 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22687.5 ns 23792 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25917 ns 25791 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23417 ns 24396 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 283013 ns 352723.5 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 176479.5 ns 181833.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 119334 ns 179333 ns 0.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 131395.5 ns 128333 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 178542 ns 130104.5 ns 1.37
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1446515 ns 1448170.5 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 416 ns 250 ns 1.66
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22447 ns 22864 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7000 ns 6208 ns 1.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6792 ns 6604.5 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6833 ns 6875 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6604.5 ns 6417 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 254907.5 ns 254855.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5791.5 ns 4833 ns 1.20
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5041.5 ns 4979 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7375 ns 7000 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5583.5 ns 4583 ns 1.22
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 252117.5 ns 253659 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10250 ns 9958 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10292 ns 10125 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10208 ns 10520.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10292 ns 10125 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1346292 ns 1348895 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1584 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1584 ns 1583 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1583 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23009 ns 22776 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5958 ns 5708 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5625 ns 5958 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5667 ns 6042 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5791 ns 5625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 270989.5 ns 272052 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6824625 ns 6789167 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6348145.5 ns 6396375 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6519020.5 ns 6536083 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7697209 ns 7542208 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213576.5 ns 212440 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24071458 ns 24091354.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21312916.5 ns 21305541 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21105208.5 ns 21007521 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29655708 ns 29773354.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2112366 ns 2122649 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 48607583 ns 37337124.5 ns 1.30
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45891875 ns 45672500.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45733979.5 ns 45753542 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49303792 ns 38138500 ns 1.29
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7292 ns 7375 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6916 ns 6729.5 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7667 ns 8520.5 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6812.5 ns 7250 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236251.5 ns 238000 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8833 ns 8042 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9084 ns 8334 ns 1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9125 ns 9250 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8542 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1057827.5 ns 1071201 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1557041.5 ns 1501083 ns 1.04
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1245708 ns 1262125 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1634792 ns 1631083 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2151354 ns 2165291.5 ns 0.99
lenet(28, 28, 1, 128)/forward/GPU/CUDA 269564 ns 282584 ns 0.95
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7905354 ns 7848208 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6660125 ns 6228916.5 ns 1.07
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7215708 ns 7164000 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10061000 ns 10495396 ns 0.96
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1851007 ns 1889865 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 347583.5 ns 336187.5 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 330250 ns 351479 ns 0.94
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 398666.5 ns 397354.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 347854.5 ns 346166.5 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46483.5 ns 42753.5 ns 1.09
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 750667 ns 746334 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 791375 ns 790062.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1087833 ns 1079583 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 760750 ns 740292 ns 1.03
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 231907 ns 308349.5 ns 0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397542 ns 397125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 213292 ns 288791 ns 0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288208 ns 288208 ns 1
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750375 ns 749667 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43637 ns 44616.5 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 666667 ns 672395.5 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 472875 ns 529625 ns 0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 532542 ns 529667 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 973709 ns 987916 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 187534.5 ns 193722.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 596583 ns 595958 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 643625 ns 642042 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 658187.5 ns 649708 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 659375 ns 646709 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131892 ns 132676 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2455000 ns 2466250.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2514542 ns 2439562 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2453792 ns 2451645.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2461334 ns 2457083 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1187757 ns 1304894 ns 0.91
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 352771 ns 341042 ns 1.03
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 330916.5 ns 352125 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 399291 ns 397209 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 312854.5 ns 319750 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15466 ns 16162 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 710875 ns 704500 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 734791 ns 730916 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1025771 ns 1018500 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 642208 ns 639666.5 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 195407.5 ns 200487.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1465375 ns 1458125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1498459 ns 1500666 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1502875 ns 1500792 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1442833 ns 1439500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40141 ns 41288 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5101625 ns 5129292 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5303750 ns 5286854 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5295812.5 ns 5283687.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4993584 ns 4974916.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196609 ns 199307.5 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3667 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3667 ns 3667 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3666 ns 3667 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33049 ns 34015 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15292 ns 15083 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15167 ns 15333 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15291 ns 15416 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15167 ns 15083 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 375124.5 ns 381336.5 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71334 ns 71417 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71333 ns 70459 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71250 ns 71125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71041 ns 70958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113867.5 ns 114243 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 318833 ns 318042 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 321959 ns 319333 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 317750 ns 318708 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 317541 ns 318583 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 192238.5 ns 197759 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1084 ns 958 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 959 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23138 ns 24130 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8583 ns 7916 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8417 ns 8395.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8459 ns 8750 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7958 ns 8000 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 258287 ns 263324.5 ns 0.98
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 475687.5 ns 462583 ns 1.03
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 463395.5 ns 475584 ns 0.97
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 562708 ns 552146 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 552729.5 ns 551937.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130132 ns 130186.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1400250 ns 1394875 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1394771 ns 1380000 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1643270.5 ns 1617687 ns 1.02
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1597458 ns 1583875 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 277863 ns 276893 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 417 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31425 ns 32430 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6750 ns 5958 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6833 ns 6666 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6834 ns 6750 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6208 ns 6042 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 261831.5 ns 267754.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1726416.5 ns 1725917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1745625 ns 1722625 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1724625 ns 1730791.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1725854 ns 1725417 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169678 ns 168976.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4357021 ns 4353875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3978291.5 ns 4352396 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4384375 ns 4382625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4359458.5 ns 4352021 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1215814 ns 1246090 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6750 ns 6458 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6875 ns 6709 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7312.5 ns 7020.5 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6792 ns 6792 ns 1
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20951 ns 20586 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 48417 ns 32667 ns 1.48
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 33583 ns 32542 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 73208.5 ns 51937.5 ns 1.41
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 70500 ns 34416.5 ns 2.05
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 288573 ns 294547 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 360125 ns 350167 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 330312.5 ns 349083 ns 0.95
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 410854.5 ns 434792 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 324312.5 ns 328937 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18716 ns 18162 ns 1.03
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 717250 ns 723145.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 741709 ns 743896 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1036125.5 ns 1028021 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 667292 ns 667083 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 340218.5 ns 332754 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75417 ns 75334 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75208 ns 75375 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75292 ns 74833 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75333 ns 75125 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46771 ns 47376 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 325792 ns 340667 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 333167 ns 325625 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 325417 ns 324500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324333 ns 325042 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 208628 ns 212363.5 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1487791 ns 1484667 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1523333 ns 1526708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1526708 ns 1526750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1466375 ns 1463167 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51173 ns 52318 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5109167 ns 5112645.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5274250 ns 5287000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5289270.5 ns 5288062 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4981458.5 ns 4975542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 201765 ns 205581 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28250 ns 28166 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28167 ns 28208 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28250 ns 28209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24387 ns 24403 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66625 ns 66500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66250 ns 66542 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66375 ns 66625 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66792 ns 66584 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 518482.5 ns 542042 ns 0.96
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1471916.5 ns 1376375 ns 1.07
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 936458 ns 1069750 ns 0.88
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1142000 ns 1150250 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2245542 ns 2253791.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 593805 ns 588018 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3051000 ns 3106500 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2625979.5 ns 2733166.5 ns 0.96
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2744916 ns 2745500 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3827125 ns 3801667 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2034429 ns 2007531 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8759417 ns 8875333 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8720687.5 ns 8801916.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8789874.5 ns 8770875 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6417375 ns 6358146 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 83687.5 ns 83750 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82438 ns 80520.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 83416.5 ns 85625 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82771 ns 82542 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194015.5 ns 193821 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2015417 ns 2013771 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2036291 ns 2020312.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2016500 ns 2023875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2009667 ns 2016896 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 802404 ns 796366 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.