Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Possible way to implement a LoopVectorization extension for conv2d & meanpool2d & activations #540

Open
wants to merge 37 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
6d32073
Add files via upload
jonas208 Sep 26, 2023
2a3cf3e
Delete ext/NNlibLoopVectorizationExt/conv_old.jl
jonas208 Sep 26, 2023
28a027a
Delete ext/NNlibLoopVectorizationExt/pooling_old.jl
jonas208 Sep 26, 2023
5339aaf
Add files via upload
jonas208 Sep 26, 2023
9e0dc6d
Add files via upload
jonas208 Sep 26, 2023
dd0f0ed
Add files via upload
jonas208 Sep 26, 2023
b341d1c
Add files via upload
jonas208 Sep 26, 2023
94f7964
Update runtests.jl
jonas208 Sep 26, 2023
6cc2e75
Add files via upload
jonas208 Sep 27, 2023
c5c79ee
Add files via upload
jonas208 Sep 27, 2023
132e35c
Add files via upload
jonas208 Sep 27, 2023
aa019e9
Add files via upload
jonas208 Sep 27, 2023
52e2a78
Add files via upload
jonas208 Sep 27, 2023
d63f8a5
Add files via upload
jonas208 Sep 28, 2023
ae86d13
Add files via upload
jonas208 Sep 28, 2023
776835d
Add files via upload
jonas208 Sep 28, 2023
13205da
Add files via upload
jonas208 Sep 28, 2023
5850341
Add files via upload
jonas208 Sep 28, 2023
00b28f2
Add files via upload
jonas208 Sep 28, 2023
af04cc6
Delete runtests.jl
jonas208 Sep 28, 2023
990a34c
Delete Project.toml
jonas208 Sep 28, 2023
db0ad66
Add files via upload
jonas208 Sep 28, 2023
a4e18e6
Add files via upload
jonas208 Sep 29, 2023
f584377
Add files via upload
jonas208 Sep 30, 2023
274db10
Add files via upload
jonas208 Sep 30, 2023
6c33d5c
Add files via upload
jonas208 Sep 30, 2023
7affd46
Add files via upload
jonas208 Oct 3, 2023
3130f8a
Add files via upload
jonas208 Oct 3, 2023
d87f909
Add files via upload
jonas208 Oct 7, 2023
82abca8
Add files via upload
jonas208 Oct 7, 2023
c5ec713
Delete bench_torch.py
jonas208 Oct 8, 2023
3f1c6dc
Add files via upload
jonas208 Oct 8, 2023
8dde5f9
Add files via upload
jonas208 Oct 8, 2023
5505157
Add files via upload
jonas208 Oct 8, 2023
0aa3a3f
Add files via upload
jonas208 Oct 8, 2023
35f2b77
Add files via upload
jonas208 Oct 8, 2023
07943d7
Add files via upload
jonas208 Oct 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"

[extensions]
NNlibAMDGPUExt = "AMDGPU"
NNlibCUDAExt = "CUDA"
NNlibCUDACUDNNExt = ["CUDA", "cuDNN"]
NNlibLoopVectorizationExt = "LoopVectorization"

[compat]
AMDGPU = "0.5, 0.6"
Expand All @@ -35,15 +37,18 @@ GPUArraysCore = "0.1"
KernelAbstractions = "0.9.2"
Requires = "1.0"
julia = "1.9"
LoopVectorization = "=0.12.146"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand All @@ -52,6 +57,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[targets]
test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter",
"FiniteDifferences", "ForwardDiff", "Logging", "ReverseDiff",
"StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"]
test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "Documenter",
"FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization",
"ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"]
10 changes: 10 additions & 0 deletions ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module NNlibLoopVectorizationExt

using NNlib
using LoopVectorization
using Random, Statistics

include("conv.jl")
include("pooling.jl")

end # module
181 changes: 181 additions & 0 deletions ext/NNlibLoopVectorizationExt/conv.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#=
Accelerated convolution for 2d-images using the power of LoopVectorization.
The acceleration is usually greatest when the inputs have a large spatial size and few channels.
Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit.

Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the
original implementation in some situations, it is left out for the moment.

Implementation copied from here (Jonas Steinebach, MIT):
https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl
=#

function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real}

# fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise)
size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4])
cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel)
NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims)

# padding is done naively at the moment
if cdims.padding != (0, 0, 0, 0)
input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2))
end

output_width, output_height, _ = size(output)
input_width, input_height, in_channels, batches = size(input)
weight_width, weight_height, in_channels_weight, out_channels = size(weight)

# it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
if !NNlib.flipkernel(cdims)
weight = reverse(weight, dims=(1, 2))
end

groups = cdims.groupcount
x_stride, y_stride = cdims.stride
x_dilation, y_dilation = cdims.dilation
out_channels_per_group = out_channels ÷ groups

if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance
# println("forward: very specialized case for maximum performance")

@tturbo for index_batch in 1:batches
for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
value = zero(T)
for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel]
end
output[x_out, y_out, out_channel, index_batch] = value
end
end

elseif groups == 1 # second specialized case for better performance
# println("forward: second specialized case for better performance")

@tturbo for index_batch in 1:batches
for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
m = y_out + (y_stride - 1) * (y_out - 1)
n = x_out + (x_stride - 1) * (x_out - 1)
value = zero(T)
for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
y_in = m + (y_w - 1) * y_dilation
x_in = n + (x_w - 1) * x_dilation

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if it makes a difference, but LV's handling of indices written within an index expression might be better.
It has separate code for building indices by parsing index expressions vs parsing the loop body.
I'd have to look at the details to see the degree to which they're different. In theory, they should do the same thing.

LV would definitely benefit from the special case of x_dilation being 1.
Might be worth branching over (also, special case -1 or some other some other common small factors).

value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel]
end
output[x_out, y_out, out_channel, index_batch] = value
end
end

else # general case for any convolution
# println("forward: general case for any convolution")

@tturbo for index_batch in 1:batches
for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width
m = y_out + (y_stride - 1) * (y_out - 1)
n = x_out + (x_stride - 1) * (x_out - 1)
out_channel = (group * out_channels_per_group + 1) - out_channel_per_group
value = zero(T)
for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width
y_in = m + (y_w - 1) * y_dilation
x_in = n + (x_w - 1) * x_dilation
in_channel_input = in_channel_weight + (group - 1) * in_channels_weight
value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel]
end
output[x_out, y_out, out_channel, index_batch] = value
end
end

end

return output
end

function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real}

# fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise)
size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4])
cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel)
NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims)

# storing all the necessary shapes
output_width, output_height, out_channels, current_batch_size = size(output_gradient)
weight_width, weight_height, in_channels_weight, out_channels = size(weight)

# because in the actual computation section, values are added, it's saver to reset the given input_gradient first
input_gradient .= zero(T)
# check if input_gradient must be padded (padding is done naively at the moment)
if cdims.padding != (0, 0, 0, 0)
input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2))
else
input_gradient_padded = input_gradient
end

# store the size of input after padding
input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding

# it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
if !NNlib.flipkernel(cdims)
weight = reverse(weight, dims=(1, 2))
end

groups = cdims.groupcount
x_stride, y_stride = cdims.stride
x_dilation, y_dilation = cdims.dilation
out_channels_per_group = out_channels ÷ groups

# actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results)
if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance
# println("backward: very specialized case for maximum performance")

Threads.@threads for index_batch in 1:current_batch_size
@turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch]
end
end
end

elseif groups == 1 # second specialized case for better performance
# println("backward: second specialized case for better performance")

Threads.@threads for index_batch in 1:current_batch_size
@turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
m = y_out + (y_stride - 1) * (y_out - 1)
n = x_out + (x_stride - 1) * (x_out - 1)
for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
y_in = m + (y_w - 1) * y_dilation
x_in = n + (x_w - 1) * x_dilation
input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch]
end
end
end

else # general case for any convolution
# println("backward: general case for any convolution")

Threads.@threads for index_batch in 1:current_batch_size
for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results
@turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width
m = y_out + (y_stride - 1) * (y_out - 1)
n = x_out + (x_stride - 1) * (x_out - 1)
out_channel = (group * out_channels_per_group + 1) - out_channel_per_group
for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width
y_in = m + (y_w - 1) * y_dilation
x_in = n + (x_w - 1) * x_dilation
in_channel_input = in_channel_weight + (group - 1) * in_channels_weight
input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch]
end
end
end
end

end

# depad
if cdims.padding != (0, 0, 0, 0)
x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding
input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :]
end

return input_gradient
end
109 changes: 109 additions & 0 deletions ext/NNlibLoopVectorizationExt/pooling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#=
Accelerated mean pooling for 2d-images using the power of LoopVectorization.
The speed up is usually lower compared to conv but can be approximately up to 2x.

Since the current state of LoopVectorization ∇meanpool! isn't really faster than the
original implementation in some situations, it is left out for the moment.

Implementation inspired from here (Jonas Steinebach, MIT):
https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl
=#

function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real}
NNlib.check_dims(size(input), size(output), pdims)

# storing all the necessary shapes
input_width, input_height, channels, current_batch_size = size(input)
output_width, output_height, channels, current_batch_size = size(output)
kernel_width, kernel_height = pdims.kernel_size

x_stride, y_stride = pdims.stride
x_dilation, y_dilation = pdims.dilation
x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding

# A helper function to project from output (w, h) to input (input_w, input_h)
@inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1

# We use calc_padding_regions to split outselves up into separate regions that may or
# may not need to worry about padding:
pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1))
# println(pdims_3d.padding)
padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d)

# We represent division by kernel size by rolling it
# into the `alpha` multiplier.
_alpha = T(1 / prod(pdims.kernel_size))

# Start with the central region
w_region, h_region, _ = central_region

if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance
# println("specialized case for better performance")

@tturbo for index_batch in 1:current_batch_size
# compute pooling for each channel separatly
for channel in 1:channels, y_out in h_region, x_out in w_region
kernel_sum = zero(T)
for y_w in 1:kernel_height, x_w in 1:kernel_width
# kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch]
kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch]
end
output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha
end
end

else # general case for any meanpooling
# println("general case for any meanpooling")

@tturbo for index_batch in 1:current_batch_size
# compute pooling for each channel separatly
for channel in 1:channels, y_out in h_region, x_out in w_region
m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1
n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1
kernel_sum = zero(T)
for y_w in 1:kernel_height, x_w in 1:kernel_width
y_in = m + (y_w - 1) * y_dilation # - y_pad1
x_in = n + (x_w - 1) * x_dilation # - x_pad1
kernel_sum += input[x_in, y_in, channel, index_batch]
end
output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha
end
end

end

# Next, the padded regions
@inbounds for (w_region, h_region, d_region) in padded_regions
for index_batch in 1:current_batch_size, channel in 1:channels
for d in d_region # for skipping the d_regions
for h in h_region
ph = project(h, y_stride, y_pad1)
for w in w_region
pw = project(w, x_stride, x_pad1)
m = zero(T)

for kh in 1:kernel_height
input_kh = ph + (kh - 1) * y_dilation
if input_kh <= 0 || input_kh > input_height
continue
end

for kw in 1:kernel_width
input_kw = pw + (kw - 1) * x_dilation
if input_kw <= 0 || input_kw > input_width
continue
end

m += input[input_kw, input_kh, channel, index_batch]
end
end

output[w, h, channel, index_batch] = _alpha * m
end
end
end
end
end

return output
end
Loading