-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: gracefully handle OneHotArrays (#1064)
* fix: gracefully handle onehotarrays
* chore: apply suggestions from code review
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
* test: test for onehotarrays for reactant
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
- Loading branch information
1 parent ed0d75c, commit 22cb59e
Showing 6 changed files with 47 additions and 3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "MLDataDevices" | ||
uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.5.0" | ||
version = "1.5.1" | ||
|
||
[deps] | ||
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" | ||
|
@@ -18,6 +18,7 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" | |
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" | ||
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" | ||
Metal = "dde4c033-4e86-420c-a63e-0dd931031962" | ||
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" | ||
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853" | ||
RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" | ||
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" | ||
|
@@ -35,6 +36,7 @@ MLDataDevicesFillArraysExt = "FillArrays" | |
MLDataDevicesGPUArraysExt = "GPUArrays" | ||
MLDataDevicesMLUtilsExt = "MLUtils" | ||
MLDataDevicesMetalExt = ["GPUArrays", "Metal"] | ||
MLDataDevicesOneHotArraysExt = "OneHotArrays" | ||
MLDataDevicesReactantExt = "Reactant" | ||
MLDataDevicesRecursiveArrayToolsExt = "RecursiveArrayTools" | ||
MLDataDevicesReverseDiffExt = "ReverseDiff" | ||
|
@@ -55,6 +57,7 @@ Functors = "0.4.8" | |
GPUArrays = "10, 11" | ||
MLUtils = "0.4.4" | ||
Metal = "1" | ||
OneHotArrays = "0.2.5" | ||
Preferences = "1.4" | ||
Random = "1.10" | ||
Reactant = "0.2.4" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
module MLDataDevicesOneHotArraysExt | ||
|
||
using Adapt: Adapt | ||
using MLDataDevices: MLDataDevices, Internal, ReactantDevice, CPUDevice | ||
using OneHotArrays: OneHotArray | ||
|
||
for op in (:get_device, :get_device_type) | ||
@eval Internal.$(op)(x::OneHotArray) = Internal.$(op)(x.indices) | ||
end | ||
|
||
# Reactant doesn't play very nicely with OneHotArrays at the moment | ||
function Adapt.adapt_structure(dev::ReactantDevice, x::OneHotArray) | ||
x_cpu = Adapt.adapt_structure(CPUDevice(), x) | ||
return Adapt.adapt_storage(dev, convert(Array, x_cpu)) | ||
end | ||
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
22cb59e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register subdir=lib/MLDataDevices
22cb59e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/119084
Tip: Release Notes
Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:

    @JuliaRegistrator register subdir=lib/MLDataDevices

    Release notes:

    ## Breaking changes

    - blah
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:
22cb59e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4584
ns4375
ns1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4917
ns4583
ns1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5666
ns8042
ns0.70
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4042
ns4125
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
60487
ns60754
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10167
ns10083
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
11000
ns10208
ns1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10542
ns11292
ns0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10542
ns10625
ns0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
424703
ns429099
ns0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1125
ns1083
ns1.04
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1166
ns1083
ns1.08
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1292
ns1333
ns0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1125
ns3583
ns0.31
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
18464
ns18440
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4000
ns4166
ns0.96
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4000
ns4000
ns1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4208
ns4334
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
4083
ns3958
ns1.03
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
109915.5
ns112468
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57375
ns57792
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38250
ns46125
ns0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46375
ns46167
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81584
ns81125
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37506
ns38404
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2012792
ns2028916
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2093417
ns2086083
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2086646
ns2090541.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2000208
ns1986270.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
197705
ns199754
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
147000
ns144083
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
143145.5
ns146458
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
149666
ns147167
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
147229.5
ns145750
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
168379
ns166912
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1012208
ns1116416
ns0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1152209
ns1112187
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1110709
ns1123395.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1119500
ns1104416
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
522581.5
ns523848
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4834
ns3500
ns1.38
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3792
ns3541.5
ns1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4667
ns4437.5
ns1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3958
ns3125
ns1.27
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
65957
ns67435
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9000
ns9083
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9292
ns9500
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9459
ns10000
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8500
ns9250
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
469308.5
ns492306
ns0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18167
ns16646
ns1.09
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15625
ns14937.5
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18917
ns18292
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16583
ns15000
ns1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
52878
ns54321
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
252312.5
ns213937.5
ns1.18
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215959
ns212729.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
214625
ns214000
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214583
ns213500
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
267130
ns273554
ns0.98
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
584
ns500
ns1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
583
ns500
ns1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
708
ns750
ns0.94
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns666
ns0.75
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17462
ns17546
ns1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1500
ns1375
ns1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1459
ns1500
ns0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1750
ns1875
ns0.93
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1417
ns1625
ns0.87
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
100800
ns104313
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7167
ns6792
ns1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5125
ns5875
ns0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5875
ns5875
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9833
ns9916
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23225
ns24032
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
259792
ns221667
ns1.17
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
232458.5
ns228854.5
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229520.5
ns229292
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
221875
ns213000
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
166055.5
ns170365
ns0.97
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3875
ns3834
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3875
ns3916
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3833
ns3875
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3875
ns3833
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23597
ns23655
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
17125
ns16666
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16541
ns16917
ns0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16833
ns17041
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16667
ns16750
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
160583
ns163843.5
ns0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
576583
ns577166
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
581541
ns578250
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
573687.5
ns598583
ns0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
575750
ns577750
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113170
ns113312
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1423250
ns1420667
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1430791.5
ns1422916
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1431000
ns1452041
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1421792
ns1416667
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
207811
ns214659
ns0.97
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1075667
ns1068354.5
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
948313
ns970500
ns0.98
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1346646
ns1344437.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1310750
ns1302166
ns1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA
270367.5
ns275365.5
ns0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
5995500.5
ns5882458
ns1.02
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4593750
ns4543542
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4976208.5
ns4907291
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5505395.5
ns5670583
ns0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1090295.5
ns1092293.5
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23458
ns23682
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2167
ns2083
ns1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2167
ns2167
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2208
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2083
ns2084
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
172926
ns173257
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6458
ns5625
ns1.15
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5125
ns5834
ns0.88
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7208
ns7166
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4333
ns5500
ns0.79
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
64432
ns65459
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11458
ns11625
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11583
ns11645.5
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11958
ns12187.5
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10833
ns11167
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
442914.5
ns447697
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7792
ns7625
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7041
ns6875
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8458
ns8500
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6375
ns6750
ns0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
51253.5
ns52309
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17833.5
ns17292
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
18000
ns18333
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18542
ns18792
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
16875
ns17708
ns0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
298470
ns302468
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
542
ns500
ns1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
583
ns583
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
583
ns584
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
32349
ns33300
ns0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8875
ns8541.5
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9333
ns9083
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9291
ns9625
ns0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8875
ns8750
ns1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
157321.5
ns161219
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64792
ns64333
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64667
ns64687.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64750
ns64375
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64375
ns63917
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111151
ns112623.5
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
275916
ns288354.5
ns0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
293917
ns281500
ns1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
291666
ns277333.5
ns1.05
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
274417
ns282958.5
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
183162.5
ns187466
ns0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3323375
ns3321750
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
2861812
ns3035042
ns0.94
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3049625
ns3019750
ns1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
3939000
ns3935396
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
580012.5
ns580332
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7623333
ns7600458.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7263625
ns7434083
ns0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7327354
ns7457646
ns0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8196041
ns8171125
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1311084.5
ns1357218
ns0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
18847291
ns18821167
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
19137541
ns19108667
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
19205875
ns19164083
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
15425792
ns15675625
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23654958
ns23766000
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43401291.5
ns33980750
ns1.28
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37089791.5
ns36953187.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34880750
ns34917125
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1841996
ns1857952
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
188777125
ns188752208
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
178489062.5
ns164370333
ns1.09
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
152827958
ns153098083
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
438354958
ns437817875
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13884864
ns13921239
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
289730542
ns289969375
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
273653750
ns340002146
ns0.80
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
300146084
ns299659604.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
363130458
ns336790458
ns1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24959
ns23709
ns1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
23166
ns23958
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
26250
ns25375
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21541
ns23666
ns0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
93319
ns95315
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
104333
ns103625
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
104208
ns103250
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
104041
ns106729
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
103292
ns103291
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
494914.5
ns500314.5
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7375
ns7167
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7062.5
ns6667
ns1.06
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8083
ns7667
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6959
ns6500
ns1.07
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
66496.5
ns67818
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15333
ns15042
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16334
ns16208
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15958
ns16750
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14750
ns15125
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
467266
ns474547
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3009270.5
ns2918125
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2083125
ns2093292
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2291250
ns2262166.5
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4920209
ns4794666.5
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
585803
ns587541
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23529584
ns23488583
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18299083
ns18018687.5
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17952042
ns18014917
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35984709
ns35776875
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3109259
ns3117851
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33275020.5
ns33272812.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28041667
ns27626875
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27515834
ns27533833
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41779084
ns41814417
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
75459
ns74625
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
81146
ns74292
ns1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
76416.5
ns75666
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
72291
ns74417
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
100380
ns100380.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
285041.5
ns292896
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
311542
ns222583
ns1.40
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
292833
ns210416.5
ns1.39
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
315375
ns205792
ns1.53
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
544347
ns540203.5
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12667
ns12583
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12771
ns12333
ns1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13750
ns13291
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12083
ns12791
ns0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
70337.5
ns70496
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27042
ns26917
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
27625
ns27125
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27708
ns28125
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26875
ns26625
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
473629
ns470516
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
13083
ns12917
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
13250
ns13084
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14833
ns13666
ns1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
13125
ns12667
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
52795
ns51912
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26250
ns25667
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26750
ns25875
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
28792
ns26625
ns1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26167
ns26167
ns1
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
304928.5
ns301936
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
181791
ns180583
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
181750
ns181729
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
184875
ns183250
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
181833
ns179708.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
56540.5
ns55883
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
615187.5
ns593354.5
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
620771.5
ns590687.5
ns1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
583541
ns591291.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
595499.5
ns584292
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
285956
ns283120
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6958
ns7208
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7083
ns7042
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8041
ns7875
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6375
ns6875
ns0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
70068.5
ns69418.5
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14375
ns14208
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15333
ns15042
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15333
ns15584
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14500
ns14042
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
463652.5
ns454550
ns1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1234312.5
ns1165604
ns1.06
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1279667
ns1224917
ns1.04
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1269833.5
ns1272500
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1312458
ns1318479
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
301465
ns300980.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4127187.5
ns4116000
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4510874.5
ns4366375
ns1.03
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4533354
ns4511145.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
4443687.5
ns4453083
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1047444
ns1040994
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1834
ns1750
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1833
ns1875
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1833
ns1833
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1833
ns1833
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23871
ns23357
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4917
ns4833
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4917
ns5083
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4959
ns4959
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
190792.5
ns187819.5
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7041.5
ns6833
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6292
ns6520.5
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9208
ns6958
ns1.32
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7166
ns7292
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
56472
ns53959.5
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11750
ns11437.5
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11584
ns11708
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11812.5
ns12125
ns0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10792
ns10875
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
335267
ns325142
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
333
ns250
ns1.33
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
292
ns375
ns0.78
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
23092
ns22864
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2958
ns2750
ns1.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2667
ns2959
ns0.90
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2667
ns3042
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2708
ns2750
ns0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
161307
ns158390
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
14395.5
ns13292
ns1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12333
ns12333
ns1
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14917
ns15167
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
13145.5
ns13542
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
56807.5
ns55595.5
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25375
ns24354.5
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25333
ns24917
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24958
ns25625
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
25333
ns24500
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
292514
ns291342
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4125
ns4125
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4167
ns4166
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4167
ns4208
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4125
ns4167
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
25065
ns24702
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16333
ns16208
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16000
ns16292
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16250
ns16375
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16167
ns16084
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
198557.5
ns196319.5
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5791
ns5625
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5833
ns5708
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5792
ns5833
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5750
ns5667
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
33912.5
ns34095
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
21041
ns20542
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21250
ns21000
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21395.5
ns21375
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
21042
ns20292
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
176321
ns175766
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
408208
ns399250
ns1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
363583.5
ns379792
ns0.96
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
492667
ns489500
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
523542
ns532604.5
ns0.98
batchedmm(16, Bsize=512)/forward/GPU/CUDA
67347
ns66554
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
978667
ns963624.5
ns1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
891000.5
ns856312.5
ns1.04
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1242958
ns1230417
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
1420417
ns1311562
ns1.08
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
190609
ns191675
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
82666
ns82792
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
82709
ns80875
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
85834
ns84083.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
133542
ns82521
ns1.62
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193457
ns192735.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1923750
ns1915250
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1936250.5
ns1909021
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1914520.5
ns1928396.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1920083
ns1912916.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
399634.5
ns406192
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
22639
ns22043
ns1.03
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1834
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1834
ns1875
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1875
ns1917
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
174147.5
ns170121.5
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8542
ns6083
ns1.40
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
7292
ns6875
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9083
ns9000
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6541
ns8375
ns0.78
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
60578
ns60029.5
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9542
ns8875
ns1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9479.5
ns9333
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9542
ns9667
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9541
ns9333
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
313158.5
ns304727
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120031270.5
ns121958916.5
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
181860604
ns173853917
ns1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147859583
ns147607125
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
107036271
ns103815750
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5506155
ns5473827
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
615708666.5
ns616983729
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
581207833
ns554296083
ns1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
450770312.5
ns450713625
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
758274833.5
ns754890083
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34927722
ns38208156
ns0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
650246750
ns651694958
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
685688396
ns668848604.5
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
577502729
ns588116250
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
743657333
ns750086792
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
59167
ns59458
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
39333
ns47459
ns0.83
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47625
ns47791
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83542
ns83958
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38483
ns37381
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1924917
ns1929916
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1972334
ns1974333
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1976458
ns1985833.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1895208
ns1859375
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
176241.5
ns173072.5
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
270958
ns269020.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
269042
ns268292
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
270875
ns270292
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267958
ns267000
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
128472
ns127268.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
682312.5
ns587354
ns1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
684021
ns693917
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
678333
ns589417
ns1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
683083
ns581937.5
ns1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
712823
ns661051
ns1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2110062.5
ns2096834
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2217708.5
ns2094750
ns1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2221875
ns2206250
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2230541
ns2187125
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
134372
ns133017
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5507000
ns5491854
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5539625
ns5493249.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5512958
ns5515395.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5509604
ns5510750
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
755964
ns703282.5
ns1.07
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
638125
ns645833
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
651667
ns648583
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
638459
ns647708
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
647208
ns650958
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
47881
ns46809.5
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1826416
ns1821000
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1675750
ns1727666.5
ns0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1720875
ns1746292
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2104000
ns2100333
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
224321
ns221775
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58208
ns58333
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38792
ns47000
ns0.83
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46584
ns46333
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83542
ns83625
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
29060
ns28634
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2031958
ns2035667
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2100291.5
ns2083749.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2085291
ns2091208
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2007250
ns1962146
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
191693.5
ns189516.5
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13371646.5
ns13367687
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12465792
ns12447041.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12501042
ns12572666
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
15188916
ns15028854.5
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
510743.5
ns514202
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47270208
ns47344292
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
42049416.5
ns41848500.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
41051834
ns40966542
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58110084
ns58373500
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3204565.5
ns3196704
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
96634583
ns73579562.5
ns1.31
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
91624583
ns91406041.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90630541
ns90565250
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
98906458.5
ns76782875
ns1.29
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58500
ns59000
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38709
ns46917
ns0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47125
ns47333
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83541
ns82792
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
47960
ns46869
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1920000
ns1927021
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1969792
ns1714541
ns1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1972500
ns1977833.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1889834
ns1884375
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
192720
ns191330
ns1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
416
ns375
ns1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
417
ns416
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
31940
ns32378
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6750
ns6209
ns1.09
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6625
ns6541
ns1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6583
ns6959
ns0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6250
ns6333
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
171690.5
ns168990.5
ns1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
31426
ns31794
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2833
ns2625
ns1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2792
ns2958
ns0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2834
ns2916
ns0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2625
ns2625
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
160271
ns156010.5
ns1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
287478708.5
ns287628499.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
347117687.5
ns340509375
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
313742875
ns315088770.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
271337417
ns267551959
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7120485.5
ns7063426
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
999672583
ns1001322625
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
962585125
ns944816500
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
847863396
ns856957937.5
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1159606875
ns1159027042
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34018012.5
ns34074066
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1668327625
ns1313285104.5
ns1.27
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1694566583
ns1697633000
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1646047208
ns1638900292
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1665789292
ns1318281104
ns1.26
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1415313
ns1410625
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1417167
ns1407792
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1417459
ns1412125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1412583
ns1407708
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
128511
ns127251.5
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5021792
ns5023916.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5044792
ns5011417
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5021250
ns5023520.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5024292
ns5000791
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
495850
ns530171
ns0.94
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
169190166
ns168946208
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
179239187.5
ns132469479
ns1.35
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
128995104.5
ns121413791.5
ns1.06
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
162929271
ns161625979.5
ns1.01
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4883493
ns4880231
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
671536958
ns827387542
ns0.81
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
604481292
ns641003042
ns0.94
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
531751292
ns530713834
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
681136250
ns675863750
ns1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
16104554
ns16337541
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
8980854
ns9000792
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8853334
ns8791292
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7886771
ns7890062.5
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
10140625
ns10164708.5
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1602269.5
ns1595476
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
36048625
ns36017917
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
37859417
ns36663604
ns1.03
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33187042
ns33249104.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
39063937.5
ns38766083.5
ns1.01
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
8827671
ns6515962
ns1.35
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47666
ns47250
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47667
ns47500
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47625
ns47625
ns1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47542
ns47416.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
18332
ns18983.5
ns0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50416
ns50250
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50500
ns50625
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50541
ns50625
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
53000
ns50333
ns1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
183394
ns163825
ns1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7833
ns8084
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7500
ns7125
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9375
ns9667
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6979.5
ns8458
ns0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
85722.5
ns77103.5
ns1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10500
ns9375
ns1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10500
ns10042
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10625
ns10583
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10167
ns10000
ns1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
484512.5
ns461429.5
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9250
ns7833
ns1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6750
ns6771
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
9417
ns9625
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
7792
ns8125
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
105586.5
ns90885
ns1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13083
ns12979
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13250
ns15208.5
ns0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13458.5
ns14041
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13417
ns13791.5
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
467808.5
ns419264.5
ns1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1083
ns959
ns1.13
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1042
ns1042
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
31641
ns32012
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8167
ns7833
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8209
ns8208
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8291
ns8709
ns0.95
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8125
ns7958.5
ns1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
195119.5
ns189226.5
ns1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
25167
ns22875
ns1.10
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23250
ns23292
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23270.5
ns23500
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23125
ns23000
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18534
ns18216
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
53062
ns51917
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52375
ns52792
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
52500
ns52875
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52708
ns52416
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
252220
ns223112
ns1.13
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1400708
ns1402916.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1409834
ns1402812.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1399229.5
ns1406208
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1398458
ns1403604.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194493.5
ns195501
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5016000
ns5020125
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5040334
ns5010437.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4993708.5
ns5024792
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4643770.5
ns5012270.5
ns0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
597903.5
ns554008
ns1.08
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3046458
ns2999416
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2118792
ns2081166.5
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2287146
ns2289708
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4859250
ns4780250
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
581676
ns584316.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24338167
ns24335979
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
19105334
ns18882500
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
18916917
ns18838542
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36315667
ns36465395.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3195442
ns3208033
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33985645.5
ns34044458
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28693250
ns28308645.5
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27979104.5
ns28011708
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41435375
ns41401312.5
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
144577667
ns144926583
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
142667333
ns142567458
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
124796041.5
ns123989708
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
174395646
ns174263458.5
ns1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22784954
ns22544725
ns1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
908417417
ns1382510875
ns0.66
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
866595875
ns1114547875
ns0.78
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
690147541
ns1238644000
ns0.56
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
679371625
ns669081959
ns1.02
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
118837225
ns118397054.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76312
ns73750
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
76708.5
ns74062.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
78062.5
ns76666
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74292
ns72500
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
239745
ns212207.5
ns1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
279187.5
ns282500
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
297958
ns285000
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
283125
ns279062.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
265791.5
ns244104.5
ns1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1232585
ns1165691.5
ns1.06
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35449875
ns35470229.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
35824917
ns35610729
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32070395.5
ns32380708
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40877625
ns40906104
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5847896
ns5839614
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
147901291
ns147585625
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
155872291
ns153161750
ns1.02
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
133368083
ns134802500
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
286886250
ns286930833
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34880972
ns34862634
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
121105063
ns122086979
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
181834292
ns174426667
ns1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147760625
ns148037375
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
101356500
ns103434500
ns0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5478431
ns5431132
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
473677042
ns469930000
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
485888583.5
ns467117250
ns1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
437646959
ns440742583
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
740881667
ns740751583
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
32245879
ns35158004
ns0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
707376812.5
ns646279541
ns1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
667253771
ns655203979
ns1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
576063750
ns572927666
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
852206792
ns850524375
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1266333
ns1278041
ns0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
788917
ns973833
ns0.81
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
969500
ns990208
ns0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2069208.5
ns1941459
ns1.07
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
586368.5
ns582790.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2969541
ns2969750
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2523083
ns2465416
ns1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2620708
ns2613271
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3700583
ns3708542
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1794949
ns1712702
ns1.05
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
6640437.5
ns6655667
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
6484958
ns6494792
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
6451083
ns6503416.5
ns0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
4447979
ns4455333.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7500
ns7292
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5334
ns6000
ns0.89
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6125
ns6041
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9917
ns9917
ns1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25270
ns25550
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212167
ns212437.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
221000
ns220625
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221125
ns225708.5
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207458
ns207291.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
252957
ns255206
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
313894603.5
ns315688312.5
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
280731020.5
ns223172541
ns1.26
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
185850791.5
ns190206958.5
ns0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
312245084
ns311635083
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7682659
ns7671660.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1079816500.5
ns1080721667
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
989067125
ns916406083
ns1.08
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
810903834
ns811802750
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1155211625
ns1154276750
ns1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26590890
ns26458418
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7416.5
ns5625
ns1.32
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6209
ns6333
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6917
ns8125
ns0.85
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5729.5
ns5458
ns1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
151351
ns163515
ns0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7604.5
ns7042
ns1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7542
ns7833
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7541
ns7625
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7542
ns7625
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
598449
ns635993.5
ns0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
541
ns417
ns1.30
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
541
ns583
ns0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
625
ns584
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
459
ns458
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24254
ns23629
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9416
ns9000
ns1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9292
ns9583
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9458
ns10000
ns0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9250
ns8562.5
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
214013.5
ns232549
ns0.92
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
352917
ns350979
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
355083.5
ns351250
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
350833
ns351479.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
353583
ns351771
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21515
ns21199
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
828979
ns821375
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
787167
ns774000
ns1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
774312.5
ns814249.5
ns0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
823875
ns823854
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
271369.5
ns305212.5
ns0.89
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
338958.5
ns333979
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
320167
ns338146
ns0.95
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
453291
ns448750
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
331895.5
ns336917
ns0.99
batchedmm(16, Bsize=32)/forward/GPU/CUDA
18690
ns17454
ns1.07
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
696291
ns691709
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
744854.5
ns748375
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1036229
ns1025583
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
686042
ns685896
ns1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
234671
ns284244
ns0.83
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
361375
ns353083.5
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
336417
ns350875
ns0.96
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
425792
ns433770.5
ns0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
377584
ns379520.5
ns0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA
22985
ns22107
ns1.04
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
760187
ns754208
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
753000
ns751666
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1084125
ns1064541.5
ns1.02
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
812791.5
ns821250
ns0.99
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
215024
ns223088.5
ns0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3625
ns3333
ns1.09
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3708
ns3500
ns1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3625
ns3791
ns0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3541
ns3583
ns0.99
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
18002
ns17921
ns1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4291
ns4208
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4583
ns4541
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4500
ns4375
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4541
ns4458
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
239767
ns285329
ns0.84
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5687.5
ns4458.5
ns1.28
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4125
ns4000
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4959
ns6125
ns0.81
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3875
ns3333
ns1.16
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
180564
ns222015.5
ns0.81
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8708
ns8125
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8500
ns8625
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8667
ns9250
ns0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8541
ns8500
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1101874
ns1221788
ns0.90
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
208292
ns202834
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209250
ns210459
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209166.5
ns209708
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
200375
ns200375
ns1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34680
ns34798
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
649916
ns628000
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
632250
ns624875.5
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
621979
ns633791
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
632208
ns628750
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
306075
ns343726
ns0.89
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
975416.5
ns959833
ns1.02
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
936645.5
ns938500
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
954895.5
ns948167
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
1290104.5
ns1293584
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
206706.5
ns207285
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4495416.5
ns4498417
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4624208
ns4485958.5
ns1.03
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4293833.5
ns4301916.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
6306792
ns6237771
ns1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
924556
ns977432
ns0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4667
ns3541.5
ns1.32
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4333
ns4000
ns1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5208
ns6520.5
ns0.80
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3125
ns3270.5
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
201570
ns218695
ns0.92
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7625
ns6916
ns1.10
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7958
ns0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7333
ns7770.5
ns0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7125
ns7084
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
964645.5
ns1002409
ns0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1660208.5
ns1587875
ns1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1158208
ns1157417
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1364146
ns1362500
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2354187
ns2449833.5
ns0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213379
ns212742.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12376417
ns12321000
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9587708.5
ns9541250
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9262687
ns9282937.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
17957375
ns17977708
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1953093.5
ns1958143
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17363667
ns17266875
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14466208
ns14360083
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14361333
ns14299667
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21148875
ns21040375.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
136479
ns92000
ns1.48
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
90541.5
ns88708
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
91959
ns93500
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
88917
ns90750
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
126286
ns126341
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2029396
ns2026042
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2020021
ns1932750
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2021541.5
ns2038687.5
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2009791
ns2023042
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
970059
ns1038826
ns0.93
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
348458
ns342396
ns1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
336521
ns348521
ns0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
399187.5
ns398542
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
313500
ns315291.5
ns0.99
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15421
ns15350
ns1.00
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
709188
ns702708
ns1.01
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
737750
ns735833
ns1.00
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
1023375
ns1022333
ns1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
643583
ns642041
ns1.00
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
185776.5
ns195993
ns0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7334
ns7250
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5292
ns6000
ns0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6042
ns5875
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9959
ns10000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33229
ns34698
ns0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
223750
ns212229.5
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228166.5
ns221334
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220459
ns223542
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215250
ns214291
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
289979.5
ns330238.5
ns0.88
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3750
ns3708
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3709
ns3708
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3667
ns3625
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3667
ns3667
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22473
ns23068
ns0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14375
ns14459
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14250
ns14416
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14417
ns14375
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14500
ns14416
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
454491
ns475565.5
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
93937.5
ns95667
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
96000
ns93084
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
95750
ns98228.5
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
94229
ns142542
ns0.66
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125724.5
ns125649
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1921562.5
ns1928708.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1938875
ns1918770.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1920667
ns1920229.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1918854.5
ns1922417
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
949972
ns986877
ns0.96
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
886792
ns870500
ns1.02
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
812958
ns822542
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1228020.5
ns1224667
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
961021
ns959708.5
ns1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA
266393
ns280652
ns0.95
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2837791.5
ns2768000
ns1.03
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2523625
ns2463583
ns1.02
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3323459
ns3332229
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3391708
ns3408208
ns1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1589685.5
ns1607675.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17625
ns17166.5
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15833
ns15250
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18750
ns20000
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
15833
ns17125
ns0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
140920
ns144011
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
216604.5
ns259584
ns0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
223875
ns223937.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
216062.5
ns217584
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
257042
ns256291.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
635870.5
ns639995.5
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
227209
ns220833
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
220833
ns220916
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
223271
ns223833
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
219541
ns222000
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
267876.5
ns270421
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
523334
ns506875
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
557334
ns560584
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
498187.5
ns503459
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
540416
ns524208
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1349491
ns1352767
ns1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
334459
ns328125
ns1.02
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
317417
ns340334
ns0.93
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
364250
ns381292
ns0.96
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
320791
ns328083.5
ns0.98
batchedmm(16, Bsize=4)/forward/GPU/CUDA
16596.5
ns16321
ns1.02
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
715750.5
ns715937.5
ns1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
735750
ns730917
ns1.01
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
1025729.5
ns1019083
ns1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
657937.5
ns654375
ns1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
193892
ns194891
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17667
ns17625
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17417
ns17833
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
20583.5
ns20625
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16833
ns18500
ns0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
144720.5
ns144691
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212749.5
ns211958
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212500
ns211770.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213583
ns213792
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
223229
ns224209
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
930226.5
ns927744
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
7458
ns6083
ns1.23
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5000
ns5333
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7458
ns7333
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6000
ns6375
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
229973.5
ns195900.5
ns1.17
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10750
ns10125
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10604
ns10959
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11000
ns11167
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10458
ns10250
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
1052874
ns1052511.5
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4167
ns3084
ns1.35
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4145.5
ns3500
ns1.18
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5250
ns6208
ns0.85
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
2834
ns2833
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
236091.5
ns237907.5
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7625
ns7125
ns1.07
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7875
ns7833
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7750
ns8000
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7209
ns7375
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1061672
ns1089693
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23478292
ns23765688
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43131583
ns34116708
ns1.26
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37763437.5
ns37548791
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34891125.5
ns34871603.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1856489
ns1848733
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
184985667
ns184574541
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
171828500
ns158637292
ns1.08
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
146459896
ns146492249.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
412533125
ns412825666
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16498145
ns16516963.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
426401458
ns428413125
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
257893209
ns245145812.5
ns1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
231907209
ns232582250
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
482223334
ns482166083
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
183271
ns182208
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
183354.5
ns182167
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
186750
ns187187.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
182250
ns184125
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
202451.5
ns228276.5
ns0.89
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
589375
ns596834
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
596958.5
ns635292
ns0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
589000
ns596083.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
632167
ns631333.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1041439
ns1083975
ns0.96
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3849562
ns3833209
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3881896
ns3802458.5
ns1.02
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3464521
ns3469041.5
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
5356333
ns5353229
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
536569.5
ns538332.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17412625
ns17399875
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17756875
ns17212083
ns1.03
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16608479
ns16570812.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
22042750
ns22200875
ns0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2637828
ns2641650
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
583
ns417
ns1.40
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
584
ns583
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32430
ns32777
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9875
ns8792
ns1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9500
ns9437.5
ns1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9750
ns9875
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9145.5
ns8750
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
267467.5
ns268383.5
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
504434042
ns505323209
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
458633542
ns430592979
ns1.07
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
381209021
ns372723313
ns1.02
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
671200875.5
ns593451250
ns1.13
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12484248
ns12484712
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
2048273395.5
ns2052799291.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1661422833
ns1634023584
ns1.02
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1499198563
ns1488799646
ns1.01
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2207989770.5
ns2213028333.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49043755
ns49227788.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1648062.5
ns1616500
ns1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1192292
ns1177979
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1392792
ns1381750
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2475542
ns2503313
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
218335.5
ns216079
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12753208
ns12726146.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9970145.5
ns9927958
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9709187
ns9644812
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18405562.5
ns18434917
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2007331
ns2044196
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17672750
ns17630500.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14774167
ns14697728.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14626875
ns14551916
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21434167
ns21455334
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26208
ns26208
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26250
ns26209
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26209
ns26375
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26208
ns26167
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
24803
ns24313
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66917
ns67125
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66833
ns66958
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
67875
ns68417
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66750
ns66833
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
397350.5
ns408721.5
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204750
ns203333
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209167
ns209334
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209917
ns210375
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
200042
ns198959
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26341
ns26930
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
612792
ns606708
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
669042
ns670042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
665479.5
ns666042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
633646
ns630875
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
340366
ns354960.5
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
656958
ns604375
ns1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
628166
ns540500
ns1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
637292
ns645667
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
658854
ns638000
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
131658
ns132024
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2236438
ns2256750
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2302291.5
ns2054209
ns1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2233208.5
ns2228916
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2244083.5
ns2248437.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1141510
ns1184553
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17708.5
ns18000
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17875
ns16917
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22791.5
ns23250
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17812.5
ns16917
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
143266.5
ns146620
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
231271
ns230417
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
262583
ns230104.5
ns1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
262520.5
ns260042
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
262167
ns262271
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
974956
ns1058952
ns0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
583
ns459
ns1.27
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
583
ns584
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23116
ns23956
ns0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
10167
ns9250
ns1.10
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9666
ns9750
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10125
ns10292
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
10084
ns9792
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
255373.5
ns261926
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7125
ns5625
ns1.27
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6209
ns6333
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7354.5
ns8083
ns0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5792
ns7792
ns0.74
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
224318.5
ns235109
ns0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7708
ns7042
ns1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7625
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7541
ns7792
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7209
ns7459
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
798172.5
ns809933
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2208.5
ns1916
ns1.15
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2291
ns2208
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2209
ns2416.5
ns0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2167
ns2125
ns1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17921
ns17938
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6875
ns6542
ns1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6500
ns6625
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6750
ns6708
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6708
ns6709
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
329206
ns333610.5
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
749437.5
ns746541
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
748917
ns748729
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
749541
ns749667
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
751833.5
ns747333.5
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
21135
ns21921
ns0.96
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
795541
ns791604.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
788459
ns776416.5
ns1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
792916
ns791708
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
791791.5
ns790958
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
292229.5
ns297538.5
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7209
ns7250
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5333
ns5833
ns0.91
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5958
ns3875
ns1.54
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10166
ns10042
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32459
ns34255
ns0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
229666.5
ns228167
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
239729.5
ns227333
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
264354.5
ns269083
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
255083.5
ns254291.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
359407.5
ns364412.5
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12770.5
ns12334
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11125
ns10417
ns1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
12792
ns13041
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10541
ns10416
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
243081
ns249045.5
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25208
ns24667
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24916
ns24583
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25208
ns26458.5
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24625
ns24750
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1117079
ns1131953
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106480583
ns106536125
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
125655584
ns117252875.5
ns1.07
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120834166
ns120784042
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117491666
ns117622625
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2637704
ns2632972
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
393188541
ns394299334
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
380341000
ns367861709
ns1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
357677834
ns421890563
ns0.85
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
481091583
ns488934791
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15233085
ns15275033
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
937085875
ns759409875
ns1.23
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
774220083
ns757340667
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
745186000
ns744212312.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
945237625.5
ns765863541.5
ns1.23
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8625
ns7666
ns1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7500
ns6959
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8875
ns8708
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7833
ns7250
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
237576
ns239312.5
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14250
ns13542
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14375
ns13875
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
13916
ns14875
ns0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14083
ns14000
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1078858
ns1090703
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9125
ns7625
ns1.20
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7041
ns7937.5
ns0.89
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
9083
ns9583
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
7042
ns8459
ns0.83
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
235440
ns237956.5
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12916.5
ns12125
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13208
ns12666
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12792
ns13375
ns0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12708
ns12417
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
787408.5
ns799124.5
ns0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
353104
ns342375
ns1.03
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
328604
ns347229.5
ns0.95
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
398083
ns395334
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
314250
ns320000
ns0.98
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16719
ns16875
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
711500
ns702437.5
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
737000
ns736167
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
1029562.5
ns1020125
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
649000
ns650833.5
ns1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
196298
ns201562.5
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns291
ns1.29
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
333
ns292
ns1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23316
ns23767
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6584
ns6334
ns1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6750
ns6625
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6625
ns6917
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6375
ns6292
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
238133
ns243851
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5875
ns5708
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5916
ns5792
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5875
ns5875
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5792
ns5708
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
23849
ns25012
ns0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21583
ns23750
ns0.91
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21542
ns21416.5
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
22395.5
ns21875
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
21250
ns21395.5
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
259774.5
ns265929
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
148459
ns147104.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
146500
ns144042
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
151167
ns150708
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
149209
ns145416.5
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
168521.5
ns167550.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1306312.5
ns1335979.5
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1335292
ns1317041.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1326333
ns1303209
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1329459
ns1321541
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1332341.5
ns1351455.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
25520.5
ns24500
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
22687.5
ns23792
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
25917
ns25791
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
23417
ns24396
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
283013
ns352723.5
ns0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
176479.5
ns181833.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
119334
ns179333
ns0.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
131395.5
ns128333
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
178542
ns130104.5
ns1.37
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1446515
ns1448170.5
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
416
ns250
ns1.66
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
334
ns375
ns0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns416
ns0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
22447
ns22864
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7000
ns6208
ns1.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6792
ns6604.5
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6833
ns6875
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6604.5
ns6417
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
254907.5
ns254855.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5791.5
ns4833
ns1.20
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5041.5
ns4979
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7375
ns7000
ns1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5583.5
ns4583
ns1.22
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
252117.5
ns253659
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10250
ns9958
ns1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10292
ns10125
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10208
ns10520.5
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10292
ns10125
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1346292
ns1348895
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1625
ns1584
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1584
ns1625
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1584
ns1583
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1583
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23009
ns22776
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5958
ns5708
ns1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5625
ns5958
ns0.94
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5667
ns6042
ns0.94
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5791
ns5625
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
270989.5
ns272052
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6824625
ns6789167
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6348145.5
ns6396375
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6519020.5
ns6536083
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7697209
ns7542208
ns1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213576.5
ns212440
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24071458
ns24091354.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21312916.5
ns21305541
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21105208.5
ns21007521
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29655708
ns29773354.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2112366
ns2122649
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
48607583
ns37337124.5
ns1.30
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45891875
ns45672500.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45733979.5
ns45753542
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
49303792
ns38138500
ns1.29
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7292
ns7375
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6916
ns6729.5
ns1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7667
ns8520.5
ns0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6812.5
ns7250
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
236251.5
ns238000
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8833
ns8042
ns1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9084
ns8334
ns1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9125
ns9250
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8375
ns8542
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1057827.5
ns1071201
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1557041.5
ns1501083
ns1.04
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1245708
ns1262125
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1634792
ns1631083
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2151354
ns2165291.5
ns0.99
lenet(28, 28, 1, 128)/forward/GPU/CUDA
269564
ns282584
ns0.95
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7905354
ns7848208
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6660125
ns6228916.5
ns1.07
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7215708
ns7164000
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10061000
ns10495396
ns0.96
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1851007
ns1889865
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
347583.5
ns336187.5
ns1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
330250
ns351479
ns0.94
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
398666.5
ns397354.5
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
347854.5
ns346166.5
ns1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46483.5
ns42753.5
ns1.09
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
750667
ns746334
ns1.01
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
791375
ns790062.5
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1087833
ns1079583
ns1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
760750
ns740292
ns1.03
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
231907
ns308349.5
ns0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397542
ns397125
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
213292
ns288791
ns0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288208
ns288208
ns1
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
750375
ns749667
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43637
ns44616.5
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
666667
ns672395.5
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
472875
ns529625
ns0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
532542
ns529667
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
973709
ns987916
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
187534.5
ns193722.5
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
596583
ns595958
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
643625
ns642042
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
658187.5
ns649708
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
659375
ns646709
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
131892
ns132676
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2455000
ns2466250.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2514542
ns2439562
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2453792
ns2451645.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2461334
ns2457083
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1187757
ns1304894
ns0.91
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
352771
ns341042
ns1.03
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
330916.5
ns352125
ns0.94
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
399291
ns397209
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
312854.5
ns319750
ns0.98
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15466
ns16162
ns0.96
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
710875
ns704500
ns1.01
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
734791
ns730916
ns1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
1025771
ns1018500
ns1.01
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
642208
ns639666.5
ns1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
195407.5
ns200487.5
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1465375
ns1458125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1498459
ns1500666
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1502875
ns1500792
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1442833
ns1439500
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
40141
ns41288
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5101625
ns5129292
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5303750
ns5286854
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5295812.5
ns5283687.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4993584
ns4974916.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
196609
ns199307.5
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3667
ns3667
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3708
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3667
ns3667
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3666
ns3667
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
33049
ns34015
ns0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15292
ns15083
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15167
ns15333
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15291
ns15416
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15167
ns15083
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
375124.5
ns381336.5
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71334
ns71417
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71333
ns70459
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71250
ns71125
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
71041
ns70958
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113867.5
ns114243
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
318833
ns318042
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
321959
ns319333
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
317750
ns318708
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
317541
ns318583
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
192238.5
ns197759
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1084
ns958
ns1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
23138
ns24130
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8583
ns7916
ns1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8417
ns8395.5
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8459
ns8750
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7958
ns8000
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
258287
ns263324.5
ns0.98
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
475687.5
ns462583
ns1.03
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
463395.5
ns475584
ns0.97
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
562708
ns552146
ns1.02
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
552729.5
ns551937.5
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA
130132
ns130186.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1400250
ns1394875
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1394771
ns1380000
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1643270.5
ns1617687
ns1.02
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
1597458
ns1583875
ns1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
277863
ns276893
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
417
ns375
ns1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31425
ns32430
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6750
ns5958
ns1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6833
ns6666
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6834
ns6750
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6208
ns6042
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
261831.5
ns267754.5
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1726416.5
ns1725917
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1745625
ns1722625
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1724625
ns1730791.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1725854
ns1725417
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
169678
ns168976.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4357021
ns4353875
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3978291.5
ns4352396
ns0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4384375
ns4382625
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4359458.5
ns4352021
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1215814
ns1246090
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6750
ns6458
ns1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6875
ns6709
ns1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7312.5
ns7020.5
ns1.04
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6792
ns6792
ns1
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
20951
ns20586
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
48417
ns32667
ns1.48
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
33583
ns32542
ns1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
73208.5
ns51937.5
ns1.41
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
70500
ns34416.5
ns2.05
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
288573
ns294547
ns0.98
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
360125
ns350167
ns1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
330312.5
ns349083
ns0.95
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
410854.5
ns434792
ns0.94
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
324312.5
ns328937
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18716
ns18162
ns1.03
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
717250
ns723145.5
ns0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
741709
ns743896
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
1036125.5
ns1028021
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
667292
ns667083
ns1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
340218.5
ns332754
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75417
ns75334
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75208
ns75375
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75292
ns74833
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75333
ns75125
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46771
ns47376
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
325792
ns340667
ns0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
333167
ns325625
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
325417
ns324500
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
324333
ns325042
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
208628
ns212363.5
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1487791
ns1484667
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1523333
ns1526708
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1526708
ns1526750
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1466375
ns1463167
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
51173
ns52318
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5109167
ns5112645.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5274250
ns5287000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5289270.5
ns5288062
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4981458.5
ns4975542
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
201765
ns205581
ns0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28250
ns28166
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28167
ns28208
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28250
ns28209
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28208
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24387
ns24403
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66625
ns66500
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66250
ns66542
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66375
ns66625
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66792
ns66584
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
518482.5
ns542042
ns0.96
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1471916.5
ns1376375
ns1.07
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
936458
ns1069750
ns0.88
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1142000
ns1150250
ns0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2245542
ns2253791.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
593805
ns588018
ns1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3051000
ns3106500
ns0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2625979.5
ns2733166.5
ns0.96
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2744916
ns2745500
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3827125
ns3801667
ns1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
2034429
ns2007531
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
8759417
ns8875333
ns0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
8720687.5
ns8801916.5
ns0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
8789874.5
ns8770875
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
6417375
ns6358146
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
83687.5
ns83750
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
82438
ns80520.5
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
83416.5
ns85625
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82771
ns82542
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194015.5
ns193821
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2015417
ns2013771
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2036291
ns2020312.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2016500
ns2023875
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2009667
ns2016896
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
802404
ns796366
ns1.01
This comment was automatically generated by workflow using github-action-benchmark.