diff --git a/Project.toml b/Project.toml index b33b90564..14cee5b4f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NNlib" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.6" +version = "0.9.7" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -9,41 +9,54 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] NNlibAMDGPUExt = "AMDGPU" -NNlibCUDAExt = "CUDA" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibCUDAExt = "CUDA" +NNlibEnzymeCoreExt = "EnzymeCore" +NNlibLoopVectorizationExt = "LoopVectorization" [compat] AMDGPU = "0.5, 0.6" Adapt = "3.2" Atomix = "0.1" -ChainRulesCore = "1.13" CUDA = "4, 5" -cuDNN = "1" +ChainRulesCore = "1.13" +EnzymeCore = "0.5, 0.6" GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" +cuDNN = "1" julia = "1.9" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +EnzymeTestUtils = "12d8515a-0907-448a-8884-5fe00fdf1c5a" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -52,6 +65,5 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "ReverseDiff", - "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN", + "Enzyme", "EnzymeCore", "EnzymeTestUtils"] diff --git a/bench_torch.py b/bench_torch.py new file mode 100644 index 000000000..1a97dc673 --- /dev/null +++ b/bench_torch.py @@ -0,0 +1,32 @@ +import torch +import torchvision.models as models +from torch.profiler import profile, record_function, ProfilerActivity +import time + +model = models.efficientnet_v2_m() +model.eval() + +b_size = 1 +img = torch.rand(b_size, 3, 224, 224) + +with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: + with record_function("model_inference"): + pred = model(img) + """ + with record_function("model_backward"): + loss = torch.sum(pred - 0.5) # dummy loss + loss.backward() + """ + +print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) +# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) + +start1 = time.perf_counter() +pred = model(img) +start2 = time.perf_counter() +loss = torch.sum(pred - 0.5) # dummy loss +loss.backward() +end = time.perf_counter() +print(f"Time used inference: {start2 - start1} seconds") +print(f"Time used backward: {end - start2} seconds") +print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file diff --git a/benchmark.jl b/benchmark.jl new file mode 100644 index 000000000..072144d51 --- /dev/null +++ b/benchmark.jl @@ -0,0 +1,82 @@ +using NNlib, Flux, Metalhead +using BenchmarkTools, Statistics +using DataFrames, CSV + +forward(model, input) = model(input) + +dummy_loss(output) = sum(output .- 1) + +function train_step(model, input) + ∇model, ∇input = gradient(model, input) do m, x + dummy_loss(m(x)) + end + return ∇model, ∇input +end + +function benchmark(models, dtype, batch_sizes, channels, spatial_size) + model_names = sort(collect(keys(models))) # make sure the models are always in the same order + forward_times = zeros(length(model_names), length(batch_sizes)) + train_step_times = zeros(length(model_names), length(batch_sizes)) + + for (i, model_name) in enumerate(model_names) + println("Benchmarking $model_name...") + for (j, batch_size) in enumerate(batch_sizes) + + input = rand(dtype, spatial_size..., channels, batch_size) + model = models[model_name] + + forward(model, input) # compilation + train_step(model, input) # compilation + + # using @belapsed (minimum time) + #= + forward_times[i, j] = @belapsed forward($model, $input) + train_step_times[i, j] = @belapsed train_step($model, $input) + =# + # using median time + forward_times[i, j] = median(@benchmark forward($model, $input)).time / 10^9 + train_step_times[i, j] = median(@benchmark train_step($model, $input)).time / 10^9 + + end + end + + return forward_times, train_step_times +end + +# models which should be benchmarked +models = Dict( + "ResNet18" => ResNet(18), + "WideResNet50" => WideResNet(50), + "DenseNet121" => DenseNet(121), + "EfficientNet" => EfficientNet(:b0), + "EfficientNetv2" => EfficientNetv2(:small), + "MobileNetv3" => MobileNetv3(:small), + # "GoogLeNet" => GoogLeNet(), + "ConvNeXt" => ConvNeXt(:tiny), +) + +# the data type and batch sizes which should be benchmarked +dtype = Float32 +batch_sizes = (1, 32) +# size information (e.g. ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # WH + +forward_times1, train_step_times1 = benchmark(models, dtype, batch_sizes, channels, spatial_size) +using LoopVectorization # load LoopVectorization here to load the lv-extension +forward_times2, train_step_times2 = benchmark(models, dtype, batch_sizes, channels, spatial_size) + +df = DataFrame() +df[!, "model_names"] = sort(collect(keys(models))) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes) + df[!, "acceleration inference, batch_size: $batch_size"] = forward_times1[:, i] ./ forward_times2[:, i] + df[!, "acceleration train, batch_size: $batch_size"] = train_step_times1[:, i] ./ train_step_times2[:, i] + + df[!, "im2col, inference, batch_size: $batch_size"] = forward_times1[:, i] + df[!, "lv-ext, inference, batch_size: $batch_size"] = forward_times2[:, i] + df[!, "im2col, train, batch_size: $batch_size"] = train_step_times1[:, i] + df[!, "lv-ext, train, batch_size: $batch_size"] = train_step_times2[:, i] +end + +CSV.write("benchmark_result_julia.csv", df) \ No newline at end of file diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv new file mode 100644 index 000000000..6ca132f8e --- /dev/null +++ b/benchmark_result_julia.csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,3.0131868428607564,1.3994729097036838,0.4240265,0.1407236,1.43802405,1.0275469,1.1620754379865017,1.0632876498150545,4.6366846,3.9900031,14.7683656,13.8893418 +DenseNet121,2.7011062104755816,1.575624841888005,0.1855009,0.0686759,0.7069096,0.4486535,1.2534693725910626,1.036169100624124,2.6923755,2.1479388,12.6862194,12.2433871 +EfficientNet,6.669989485006747,2.4963127103581892,0.49731575,0.0745602,1.33507035,0.53481695,1.121879889261884,1.1298641673853496,2.5537233,2.2762894,8.1940817,7.2522715 +EfficientNetv2,16.28186773870062,7.202334907846903,2.5620854,0.1573582,12.053267,1.67352215,1.4558721609174592,1.203183521905458,6.1329556,4.21256465,21.0444893,17.4906728 +MobileNetv3,12.105678302652656,1.5684538123069776,0.1103481,0.0091154,0.31291775,0.19950715,1.2884028351358188,1.1391237206595466,0.43895395,0.3406962,2.0458146,1.7959547 +ResNet18,1.321074637025202,1.0621579200481972,0.0558948,0.0423101,0.2110332,0.19868345,1.0855325609238786,0.8862720054297211,0.8071219,0.7435262,2.98925695,3.3728437 +WideResNet50,0.6797203960326701,0.7846926795922912,0.1863516,0.2741592,0.6916193,0.88138875,0.8693605563082452,0.7977827693085691,3.68315245,4.23662245,13.5918181,17.0369913 diff --git a/benchmark_result_julia_BLAS.set_num_threads(1).csv b/benchmark_result_julia_BLAS.set_num_threads(1).csv new file mode 100644 index 000000000..8f84f3c6d --- /dev/null +++ b/benchmark_result_julia_BLAS.set_num_threads(1).csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,2.3560260677286187,1.177474414020855,0.4724393,0.2005238,1.5661168,1.3300644,1.1190087278994194,1.0605556431775283,6.1761583,5.519312,20.2704419,19.1130395 +DenseNet121,3.122060107720349,1.6430011190729519,0.2130831,0.0682508,0.7872384,0.4791466,1.0415097744527333,0.9663276503256204,2.2208416,2.1323291,13.0980777,13.5544892 +EfficientNet,7.620280694206955,2.614653451310472,0.55973705,0.0734536,1.3205672,0.50506395,1.0814317166163767,1.4227466786836107,2.3782218,2.1991419,7.8866889,5.543284 +EfficientNetv2,19.58985619614861,6.567161064233096,3.0016087,0.1532226,12.1968172,1.8572435,1.393368850971046,1.4053620350129883,5.7712793,4.1419609,21.7360648,15.4665234 +MobileNetv3,11.880463635205707,1.4356279307443949,0.1165919,0.00981375,0.31690295,0.2207417,1.2244903507960017,1.0962253667265058,0.4233406,0.345728,1.9161035,1.7479102 +ResNet18,1.8553921481455138,1.1963831276674246,0.0782507,0.04217475,0.26524125,0.2217026,0.7610563817484197,0.8134254105937594,0.5683152,0.7467452,3.1617936,3.8870111 +WideResNet50,1.244074585655623,1.0739855756563101,0.33919395,0.2726476,1.0936016,1.0182647,0.625895474130196,0.8091050354680169,2.59841495,4.1515158,16.8332114,20.8047295 diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv new file mode 100644 index 000000000..153e03686 --- /dev/null +++ b/benchmark_result_pytorch.csv @@ -0,0 +1,8 @@ +,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32" +0,ConvNeXt,0.0838077,0.3047408,1.6291157,4.6741033 +1,DenseNet121,0.108188,0.2309508,1.5926049,4.6524887 +2,EfficientNet,0.0661599,0.1544869,0.9558551,2.707274 +3,EfficientNetv2,0.129968,0.2956851,1.830179,5.1837134 +4,MobileNetv3,0.022812,0.0398323,0.2456348,0.5598582 +5,ResNet18,0.0305176,0.074905,0.4218851,1.192504 +6,WideResNet50,0.1575364,0.4871242,2.2386081,6.8065705 diff --git a/benchmark_torch.py b/benchmark_torch.py new file mode 100644 index 000000000..55705ad8b --- /dev/null +++ b/benchmark_torch.py @@ -0,0 +1,64 @@ +import torch +import torchvision.models as visionmodels +import time +import pandas as pd + +def dummy_loss(output): + return torch.sum(output - 1) + +def train_step(model, input_to_model): + output = model(input_to_model) + loss = dummy_loss(output) + loss.backward() + +def benchmark(models, batch_sizes, channels, spatial_size): + model_names = sorted(list(models.keys())) # make sure the models are always in the same order + forward_times = torch.zeros(len(model_names), len(batch_sizes)) + train_step_times = torch.zeros(len(model_names), len(batch_sizes)) + + for i, model_name in enumerate(model_names): + print(f"Benchmarking {model_name}...") + for j, batch_size in enumerate(batch_sizes): + + input_to_model = torch.rand(batch_size, channels, spatial_size[0], spatial_size[1]) + model = models[model_name] + + time_start = time.perf_counter() + model(input_to_model) + time_duration = time.perf_counter() - time_start + forward_times[i, j] = time_duration + + time_start = time.perf_counter() + train_step(model, input_to_model) + time_duration = time.perf_counter() - time_start + train_step_times[i, j] = time_duration + + return forward_times, train_step_times + +models = { + "ResNet18" : visionmodels.resnet18(), + "WideResNet50" : visionmodels.wide_resnet50_2(), + "DenseNet121" : visionmodels.densenet121(), + "EfficientNet" : visionmodels.efficientnet_b0(), + "EfficientNetv2" : visionmodels.efficientnet_v2_s(), + "MobileNetv3" : visionmodels.mobilenet_v3_small(), + # "GoogLeNet" : visionmodels.googlenet(), + "ConvNeXt" : visionmodels.convnext_tiny(), +} + +# the batch sizes which should be benchmarked +batch_sizes = (1, 32) +# size information (e.g. ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # HW + +forward_times, train_step_times = benchmark(models, batch_sizes, channels, spatial_size) + +df = pd.DataFrame() +df["model_names"] = sorted(list(models.keys())) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes): + df[f"inference, batch_size: {batch_size}"] = forward_times[:, i] + df[f"train, batch_size: {batch_size}"] = train_step_times[:, i] + +df.to_csv("benchmark_result_pytorch.csv") \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl new file mode 100644 index 000000000..6e4b8d8e0 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -0,0 +1,12 @@ +module NNlibLoopVectorizationExt + +using NNlib +using LoopVectorization +using Random, Statistics +using OffsetArrays, Static + +include("conv.jl") +include("pooling.jl") +include("activations.jl") + +end # module \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/activations.jl b/ext/NNlibLoopVectorizationExt/activations.jl new file mode 100644 index 000000000..8201e7299 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/activations.jl @@ -0,0 +1,26 @@ +_tanh(x) = tanh(x) +Base.broadcasted(::typeof(tanh), x::AbstractArray) = @turbo _tanh.(x) + +_softsign(x) = x / (1 + abs(x)) +Base.broadcasted(::typeof(NNlib.softsign), x::AbstractArray) = @turbo _softsign.(x) + +_softplus(x) = log1p(exp(-abs(x))) +Base.broadcasted(::typeof(NNlib.softplus), x::AbstractArray) = (@turbo _softplus.(x)) .+ NNlib.relu.(x) + +function _sigmoid(x) + t = exp(-abs(x)) + ifelse(x ≥ 0, inv(1 + t), t / (1 + t)) +end +Base.broadcasted(::typeof(NNlib.sigmoid), x::AbstractArray) = @turbo _sigmoid.(x) +Base.broadcasted(::typeof(NNlib.sigmoid_fast), x::AbstractArray) = @turbo _sigmoid.(x) # don't do the same for tanh_fast, it would be slower + +function _hardsigmoid(x) + clamp((x + 3) / 6, 0, 1) +end +Base.broadcasted(::typeof(NNlib.hardsigmoid), x::AbstractArray) = @turbo _hardsigmoid.(x) + +_logsigmoid(x) = -_softplus(-x) +Base.broadcasted(::typeof(NNlib.logsigmoid), x::AbstractArray) = @turbo _logsigmoid.(x) + +_swish(x) = x * _sigmoid(x) +Base.broadcasted(::typeof(NNlib.swish), x::AbstractArray) = @turbo _swish.(x) \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl new file mode 100644 index 000000000..0ca75f466 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -0,0 +1,191 @@ +#= +Accelerated convolution for 2d-images using the power of LoopVectorization. +The acceleration is usually greatest when the inputs have a large spatial size and few channels. +Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. + +Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl + +Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): +https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl +=# + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + # padding is done naively at the moment + if cdims.padding != (0, 0, 0, 0) + input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batch_size = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance + # println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # third specialized case for better performance + # println("forward: third specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batch_size + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_weight + (group - 1) * in_channels_weight, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + return output +end + +function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + ∇conv_data!( + NNlib.insert_singleton_spatial_dimension(input_gradient, 1), + NNlib.insert_singleton_spatial_dimension(output_gradient, 1), + NNlib.insert_singleton_spatial_dimension(weight, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return input_gradient +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("backward: very specialized case for maximum performance") + + # storing all the necessary shapes + output_width, output_height, out_channels, batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded (padding is done naively at the moment) + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end + + # store the size of input after padding + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) # size after padding + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0)) + input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0)) + weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0)) + + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + @tturbo for index_batch in 0:batch_size-1 + for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 + + value = zero(T) + for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + # for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 + + is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) + is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) + output_gradient_value = (is_in_bound_x & is_in_bound_y) ? output_gradient[x_in - x_w, y_in - y_w, out_channel, index_batch] : zero(T) + value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value + + end + input_gradient_padded[x_in, y_in, in_channel, index_batch] = value + + end + end + + input_gradient_padded = input_gradient_padded.parent + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + else # general case for any convolution + input_gradient = ∇conv_data_im2col_grouped!(input_gradient, output_gradient, weight, cdims) + end + + return input_gradient +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl new file mode 100644 index 000000000..5af194408 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -0,0 +1,110 @@ +#= +Accelerated mean pooling for 2d-images using the power of LoopVectorization. +The speed up is usually lower compared to conv but can be approximately up to 2x. + +Since the current state of LoopVectorization ∇meanpool! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation inspired from here (Jonas Steinebach, MIT license): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl +=# + +function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + NNlib.check_dims(size(input), size(output), pdims) + + # storing all the necessary shapes + input_width, input_height, channels, batch_size = size(input) + output_width, output_height, channels, batch_size = size(output) + kernel_width, kernel_height = pdims.kernel_size + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. + _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + # println("specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + else # general case for any meanpooling + # println("general case for any meanpooling") + + @tturbo for index_batch in 1:batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + kernel_sum += input[x_in, y_in, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + end + + # Next, the padded regions + if pdims.padding != (0, 0, 0, 0) + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:batch_size, channel in 1:channels + for z_out in d_region # for skipping the d_regions + for y_out in h_region + m = project(y_out, y_stride, y_pad1) + for x_out in w_region + n = project(x_out, x_stride, x_pad1) + kernel_sum = zero(T) + + for y_w in 1:kernel_height + y_in = m + (y_w - 1) * y_dilation + if y_in <= 0 || y_in > input_height + continue + end + + for x_w in 1:kernel_width + x_in = n + (x_w - 1) * x_dilation + if x_in <= 0 || x_in > input_width + continue + end + + kernel_sum += input[x_in, y_in, channel, index_batch] + end + end + + output[x_out, y_out, channel, index_batch] = _alpha * kernel_sum + end + end + end + end + end + end + + return output +end \ No newline at end of file diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl new file mode 100644 index 000000000..c081a8fb1 --- /dev/null +++ b/test/ext_loopvectorization/minimal_test.jl @@ -0,0 +1,89 @@ +using NNlib, LoopVectorization, CpuId + +function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + + for index_batch in 1:current_batch_size # Threads.@threads + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + + for index_batch in 1:current_batch_size # NO @threads here + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here! + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +println(cpuinfo()) + +dtype = Float32 # Float64 +batch_size = 5 +input = rand(dtype, 50, 50, 3, batch_size) +weight = rand(dtype, 5, 5, 3, 9) +cdims = NNlib.DenseConvDims(size(input), size(weight), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1) +output_gradient = rand(dtype, NNlib.output_size(cdims)..., 9, batch_size) + +input_gradient_noavx = zeros(dtype, size(input)...) +input_gradient_noavx = ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) +input_gradient_noavx = @time ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) + +input_gradient_avx = zeros(dtype, size(input)...) +input_gradient_avx = ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) +input_gradient_avx = @time ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) + +@info isapprox(input_gradient_noavx, input_gradient_avx) +@testset "conv bwd minimal" begin + @test isapprox(input_gradient_noavx, input_gradient_avx) +end +@show sum(input_gradient_noavx) +@show sum(input_gradient_avx) \ No newline at end of file diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl new file mode 100644 index 000000000..f3cc771c8 --- /dev/null +++ b/test/ext_loopvectorization/runtests.jl @@ -0,0 +1,87 @@ +using NNlib, Test, BenchmarkTools + +function compute_conv_outputs(settings::Vector{<:NNlib.ConvDims}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}, conv_output_grads::Vector{Array{T,4}}) where {T<:Real} + conv_outs = Vector{Array{T, 4}}(undef, length(settings)) + conv_grads = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + if setting.groupcount > 1 + weight = weight_grouped + else + weight = weight_ungrouped + end + + out = @btime NNlib.conv($input, $weight, $setting) + output_gradient = conv_output_grads[i] + + conv_grads[i] = @btime NNlib.∇conv_data($output_gradient, $weight, $setting) + conv_outs[i] = out + end + + return conv_outs, conv_grads +end + +function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T,4}) where {T<:Real} + pool_outs = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation) + pool_outs[i] = @btime NNlib.meanpool($input, $pdims) + end + + return pool_outs +end + +@testset "Convolution & Pooling & Activations" begin + + dtype = Float32 # Float64 + batch_size = 64 # 1 # 64 # 32 + input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool + weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv + weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv + + conv_settings_list = [ + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' + ] + + conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in conv_settings_list] + + pool_settings_list = [ + NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' + ] + + modified_activations = [tanh, softsign, softplus, sigmoid, sigmoid_fast, hardsigmoid, logsigmoid, swish] + + # compute outputs before loading LoopVectorization + + println("without LoopVectorization") + conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) + pool_outs_std = compute_pool_outputs(pool_settings_list, input) + act_outs_std = [@btime $act.($input) for act in modified_activations] + + using LoopVectorization # now load the NNlibLoopVectorizationExt + + println("with LoopVectorization") + conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) + pool_outs_lv = compute_pool_outputs(pool_settings_list, input) + act_outs_lv = [@btime $act.($input) for act in modified_activations] + + # validate conv + @testset "Convolution" begin + @test all(isapprox.(conv_outs_std, conv_outs_lv)) + @test all(isapprox.(conv_grads_std, conv_grads_lv)) + end + # validate pool + @testset "Pooling" begin + @test all(isapprox.(pool_outs_std, pool_outs_lv)) + end + # validate activations + @testset "Activations" begin + @test all(isapprox.(act_outs_std, act_outs_lv)) + end + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 03602a40d..c78c74bcd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,8 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests -# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") @@ -155,4 +156,16 @@ end else @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." end + + if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" + @testset "LoopVectorization" begin + # Don't import LoopVectorization here! + # Because the LV-impls are simply tested against NNlib's standard CPU impls, + # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! + include("ext_loopvectorization/runtests.jl") + end + else + @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." + end + end