diff --git a/Project.toml b/Project.toml
index b33b90564..14cee5b4f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "NNlib"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.9.6"
+version = "0.9.7"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -9,41 +9,54 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 
 [extensions]
 NNlibAMDGPUExt = "AMDGPU"
-NNlibCUDAExt = "CUDA"
 NNlibCUDACUDNNExt = ["CUDA", "cuDNN"]
+NNlibCUDAExt = "CUDA"
+NNlibEnzymeCoreExt = "EnzymeCore"
+NNlibLoopVectorizationExt = "LoopVectorization"
 
 [compat]
 AMDGPU = "0.5, 0.6"
 Adapt = "3.2"
 Atomix = "0.1"
-ChainRulesCore = "1.13"
 CUDA = "4, 5"
-cuDNN = "1"
+ChainRulesCore = "1.13"
+EnzymeCore = "0.5, 0.6"
 GPUArraysCore = "0.1"
 KernelAbstractions = "0.9.2"
 Requires = "1.0"
+cuDNN = "1"
 julia = "1.9"
 
 [extras]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
+CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+EnzymeTestUtils = "12d8515a-0907-448a-8884-5fe00fdf1c5a"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -52,6 +65,5 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", 
-        "FiniteDifferences", "ForwardDiff", "Logging", "ReverseDiff", 
-        "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"]
+test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN",
+		"Enzyme", "EnzymeCore", "EnzymeTestUtils"]
diff --git a/bench_torch.py b/bench_torch.py
new file mode 100644
index 000000000..1a97dc673
--- /dev/null
+++ b/bench_torch.py
@@ -0,0 +1,32 @@
+import torch
+import torchvision.models as models
+from torch.profiler import profile, record_function, ProfilerActivity
+import time
+
+model = models.efficientnet_v2_m()
+model.eval()
+
+b_size = 1
+img = torch.rand(b_size, 3, 224, 224)
+
+with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof:
+    with record_function("model_inference"):
+        pred = model(img)
+    """
+    with record_function("model_backward"):
+        loss = torch.sum(pred - 0.5) # dummy loss
+        loss.backward()
+    """
+
+print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))
+# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1))
+
+start1 = time.perf_counter()
+pred = model(img)
+start2 = time.perf_counter()
+loss = torch.sum(pred - 0.5) # dummy loss
+loss.backward()
+end = time.perf_counter()
+print(f"Time used inference: {start2 - start1} seconds")
+print(f"Time used backward: {end - start2} seconds")
+print(f"Time used inference and backward: {end - start1} seconds")
\ No newline at end of file
diff --git a/benchmark.jl b/benchmark.jl
new file mode 100644
index 000000000..072144d51
--- /dev/null
+++ b/benchmark.jl
@@ -0,0 +1,82 @@
+using NNlib, Flux, Metalhead
+using BenchmarkTools, Statistics
+using DataFrames, CSV
+
+forward(model, input) = model(input)
+
+dummy_loss(output) = sum(output .- 1)
+
+function train_step(model, input)
+    ∇model, ∇input = gradient(model, input) do m, x
+        dummy_loss(m(x))
+    end
+    return ∇model, ∇input
+end
+
+function benchmark(models, dtype, batch_sizes, channels, spatial_size)
+    model_names = sort(collect(keys(models))) # make sure the models are always in the same order
+    forward_times = zeros(length(model_names), length(batch_sizes))
+    train_step_times = zeros(length(model_names), length(batch_sizes))
+
+    for (i, model_name) in enumerate(model_names)
+        println("Benchmarking $model_name...")
+        for (j, batch_size) in enumerate(batch_sizes)
+    
+            input = rand(dtype, spatial_size..., channels, batch_size)
+            model = models[model_name]
+    
+            forward(model, input) # compilation
+            train_step(model, input) # compilation
+
+            # using @belapsed (minimum time)
+            #=
+            forward_times[i, j] = @belapsed forward($model, $input)
+            train_step_times[i, j] = @belapsed train_step($model, $input)
+            =#
+            # using median time
+            forward_times[i, j] = median(@benchmark forward($model, $input)).time / 10^9
+            train_step_times[i, j] = median(@benchmark train_step($model, $input)).time / 10^9
+
+        end
+    end
+
+    return forward_times, train_step_times
+end
+
+# models which should be benchmarked
+models = Dict(
+    "ResNet18" => ResNet(18),
+    "WideResNet50" => WideResNet(50),
+    "DenseNet121" => DenseNet(121),
+    "EfficientNet" => EfficientNet(:b0),
+    "EfficientNetv2" => EfficientNetv2(:small),
+    "MobileNetv3" => MobileNetv3(:small),
+    # "GoogLeNet" => GoogLeNet(),
+    "ConvNeXt" => ConvNeXt(:tiny),
+)
+
+# the data type and batch sizes which should be benchmarked
+dtype = Float32
+batch_sizes = (1, 32)
+# size information (e.g. ImageNet-like images)
+channels = 3
+spatial_size = (224, 224) # WH
+
+forward_times1, train_step_times1 = benchmark(models, dtype, batch_sizes, channels, spatial_size)
+using LoopVectorization # load LoopVectorization here to load the lv-extension
+forward_times2, train_step_times2 = benchmark(models, dtype, batch_sizes, channels, spatial_size)
+
+df = DataFrame()
+df[!, "model_names"] = sort(collect(keys(models))) # make sure the models are always in the same order
+
+for (i, batch_size) in enumerate(batch_sizes)
+    df[!, "acceleration inference, batch_size: $batch_size"] = forward_times1[:, i] ./ forward_times2[:, i]
+    df[!, "acceleration train, batch_size: $batch_size"] = train_step_times1[:, i] ./ train_step_times2[:, i]
+
+    df[!, "im2col, inference, batch_size: $batch_size"] = forward_times1[:, i]
+    df[!, "lv-ext, inference, batch_size: $batch_size"] = forward_times2[:, i]
+    df[!, "im2col, train, batch_size: $batch_size"] = train_step_times1[:, i]
+    df[!, "lv-ext, train, batch_size: $batch_size"] = train_step_times2[:, i]
+end
+
+CSV.write("benchmark_result_julia.csv", df)
\ No newline at end of file
diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv
new file mode 100644
index 000000000..6ca132f8e
--- /dev/null
+++ b/benchmark_result_julia.csv
@@ -0,0 +1,8 @@
+model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32"
+ConvNeXt,3.0131868428607564,1.3994729097036838,0.4240265,0.1407236,1.43802405,1.0275469,1.1620754379865017,1.0632876498150545,4.6366846,3.9900031,14.7683656,13.8893418
+DenseNet121,2.7011062104755816,1.575624841888005,0.1855009,0.0686759,0.7069096,0.4486535,1.2534693725910626,1.036169100624124,2.6923755,2.1479388,12.6862194,12.2433871
+EfficientNet,6.669989485006747,2.4963127103581892,0.49731575,0.0745602,1.33507035,0.53481695,1.121879889261884,1.1298641673853496,2.5537233,2.2762894,8.1940817,7.2522715
+EfficientNetv2,16.28186773870062,7.202334907846903,2.5620854,0.1573582,12.053267,1.67352215,1.4558721609174592,1.203183521905458,6.1329556,4.21256465,21.0444893,17.4906728
+MobileNetv3,12.105678302652656,1.5684538123069776,0.1103481,0.0091154,0.31291775,0.19950715,1.2884028351358188,1.1391237206595466,0.43895395,0.3406962,2.0458146,1.7959547
+ResNet18,1.321074637025202,1.0621579200481972,0.0558948,0.0423101,0.2110332,0.19868345,1.0855325609238786,0.8862720054297211,0.8071219,0.7435262,2.98925695,3.3728437
+WideResNet50,0.6797203960326701,0.7846926795922912,0.1863516,0.2741592,0.6916193,0.88138875,0.8693605563082452,0.7977827693085691,3.68315245,4.23662245,13.5918181,17.0369913
diff --git a/benchmark_result_julia_BLAS.set_num_threads(1).csv b/benchmark_result_julia_BLAS.set_num_threads(1).csv
new file mode 100644
index 000000000..8f84f3c6d
--- /dev/null
+++ b/benchmark_result_julia_BLAS.set_num_threads(1).csv
@@ -0,0 +1,8 @@
+model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32"
+ConvNeXt,2.3560260677286187,1.177474414020855,0.4724393,0.2005238,1.5661168,1.3300644,1.1190087278994194,1.0605556431775283,6.1761583,5.519312,20.2704419,19.1130395
+DenseNet121,3.122060107720349,1.6430011190729519,0.2130831,0.0682508,0.7872384,0.4791466,1.0415097744527333,0.9663276503256204,2.2208416,2.1323291,13.0980777,13.5544892
+EfficientNet,7.620280694206955,2.614653451310472,0.55973705,0.0734536,1.3205672,0.50506395,1.0814317166163767,1.4227466786836107,2.3782218,2.1991419,7.8866889,5.543284
+EfficientNetv2,19.58985619614861,6.567161064233096,3.0016087,0.1532226,12.1968172,1.8572435,1.393368850971046,1.4053620350129883,5.7712793,4.1419609,21.7360648,15.4665234
+MobileNetv3,11.880463635205707,1.4356279307443949,0.1165919,0.00981375,0.31690295,0.2207417,1.2244903507960017,1.0962253667265058,0.4233406,0.345728,1.9161035,1.7479102
+ResNet18,1.8553921481455138,1.1963831276674246,0.0782507,0.04217475,0.26524125,0.2217026,0.7610563817484197,0.8134254105937594,0.5683152,0.7467452,3.1617936,3.8870111
+WideResNet50,1.244074585655623,1.0739855756563101,0.33919395,0.2726476,1.0936016,1.0182647,0.625895474130196,0.8091050354680169,2.59841495,4.1515158,16.8332114,20.8047295
diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv
new file mode 100644
index 000000000..153e03686
--- /dev/null
+++ b/benchmark_result_pytorch.csv
@@ -0,0 +1,8 @@
+,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32"
+0,ConvNeXt,0.0838077,0.3047408,1.6291157,4.6741033
+1,DenseNet121,0.108188,0.2309508,1.5926049,4.6524887
+2,EfficientNet,0.0661599,0.1544869,0.9558551,2.707274
+3,EfficientNetv2,0.129968,0.2956851,1.830179,5.1837134
+4,MobileNetv3,0.022812,0.0398323,0.2456348,0.5598582
+5,ResNet18,0.0305176,0.074905,0.4218851,1.192504
+6,WideResNet50,0.1575364,0.4871242,2.2386081,6.8065705
diff --git a/benchmark_torch.py b/benchmark_torch.py
new file mode 100644
index 000000000..55705ad8b
--- /dev/null
+++ b/benchmark_torch.py
@@ -0,0 +1,64 @@
+import torch
+import torchvision.models as visionmodels
+import time
+import pandas as pd
+
+def dummy_loss(output):
+    return torch.sum(output - 1)
+
+def train_step(model, input_to_model):
+    output = model(input_to_model)
+    loss = dummy_loss(output)
+    loss.backward()
+
+def benchmark(models, batch_sizes, channels, spatial_size):
+    model_names = sorted(list(models.keys())) # make sure the models are always in the same order
+    forward_times = torch.zeros(len(model_names), len(batch_sizes))
+    train_step_times = torch.zeros(len(model_names), len(batch_sizes))
+
+    for i, model_name in enumerate(model_names):
+        print(f"Benchmarking {model_name}...")
+        for j, batch_size in enumerate(batch_sizes):
+    
+            input_to_model = torch.rand(batch_size, channels, spatial_size[0], spatial_size[1])
+            model = models[model_name]
+    
+            time_start = time.perf_counter()
+            model(input_to_model)
+            time_duration = time.perf_counter() - time_start
+            forward_times[i, j] = time_duration
+
+            time_start = time.perf_counter()
+            train_step(model, input_to_model)
+            time_duration = time.perf_counter() - time_start
+            train_step_times[i, j] = time_duration
+
+    return forward_times, train_step_times
+
+models = {
+    "ResNet18" : visionmodels.resnet18(),
+    "WideResNet50" : visionmodels.wide_resnet50_2(),
+    "DenseNet121" : visionmodels.densenet121(),
+    "EfficientNet" : visionmodels.efficientnet_b0(),
+    "EfficientNetv2" : visionmodels.efficientnet_v2_s(),
+    "MobileNetv3" : visionmodels.mobilenet_v3_small(),
+    # "GoogLeNet" : visionmodels.googlenet(),
+    "ConvNeXt" : visionmodels.convnext_tiny(),
+}
+
+# the batch sizes which should be benchmarked
+batch_sizes = (1, 32)
+# size information (e.g. ImageNet-like images)
+channels = 3
+spatial_size = (224, 224) # HW
+
+forward_times, train_step_times = benchmark(models, batch_sizes, channels, spatial_size)
+
+df = pd.DataFrame()
+df["model_names"] = sorted(list(models.keys())) # make sure the models are always in the same order
+
+for (i, batch_size) in enumerate(batch_sizes):
+    df[f"inference, batch_size: {batch_size}"] = forward_times[:, i]
+    df[f"train, batch_size: {batch_size}"] = train_step_times[:, i]
+
+df.to_csv("benchmark_result_pytorch.csv")
\ No newline at end of file
diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl
new file mode 100644
index 000000000..6e4b8d8e0
--- /dev/null
+++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl
@@ -0,0 +1,12 @@
+module NNlibLoopVectorizationExt
+
+using NNlib
+using LoopVectorization
+using Random, Statistics
+using OffsetArrays, Static
+
+include("conv.jl")
+include("pooling.jl")
+include("activations.jl")
+
+end # module
\ No newline at end of file
diff --git a/ext/NNlibLoopVectorizationExt/activations.jl b/ext/NNlibLoopVectorizationExt/activations.jl
new file mode 100644
index 000000000..8201e7299
--- /dev/null
+++ b/ext/NNlibLoopVectorizationExt/activations.jl
@@ -0,0 +1,26 @@
+_tanh(x) = tanh(x)
+Base.broadcasted(::typeof(tanh), x::AbstractArray) = @turbo _tanh.(x)
+
+_softsign(x) = x / (1 + abs(x))
+Base.broadcasted(::typeof(NNlib.softsign), x::AbstractArray) = @turbo _softsign.(x)
+
+_softplus(x) = log1p(exp(-abs(x)))
+Base.broadcasted(::typeof(NNlib.softplus), x::AbstractArray) = (@turbo _softplus.(x)) .+ NNlib.relu.(x)
+
+function _sigmoid(x)
+    t = exp(-abs(x))
+    ifelse(x ≥ 0, inv(1 + t), t / (1 + t))
+end
+Base.broadcasted(::typeof(NNlib.sigmoid), x::AbstractArray) = @turbo _sigmoid.(x)
+Base.broadcasted(::typeof(NNlib.sigmoid_fast), x::AbstractArray) = @turbo _sigmoid.(x) # don't do the same for tanh_fast, it would be slower
+
+function _hardsigmoid(x)
+    clamp((x + 3) / 6, 0, 1)
+end
+Base.broadcasted(::typeof(NNlib.hardsigmoid), x::AbstractArray) = @turbo _hardsigmoid.(x)
+
+_logsigmoid(x) = -_softplus(-x)
+Base.broadcasted(::typeof(NNlib.logsigmoid), x::AbstractArray) = @turbo _logsigmoid.(x)
+
+_swish(x) = x * _sigmoid(x)
+Base.broadcasted(::typeof(NNlib.swish), x::AbstractArray) = @turbo _swish.(x)
\ No newline at end of file
diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl
new file mode 100644
index 000000000..0ca75f466
--- /dev/null
+++ b/ext/NNlibLoopVectorizationExt/conv.jl
@@ -0,0 +1,191 @@
+#=
+Accelerated convolution for 2d-images using the power of LoopVectorization.
+The acceleration is usually greatest when the inputs have a large spatial size and few channels. 
+Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit.
+
+Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the 
+original implementation in some situations, it is left out for the moment.
+
+Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license):
+https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl
+
+Implementation for backward pass mostly copied from here (Chris Elrod, MIT license):
+https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl
+=#
+
+function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real}
+
+    # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise)
+    size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4])
+    cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel)
+    NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims)
+
+    # padding is done naively at the moment
+    if cdims.padding != (0, 0, 0, 0)
+        input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2))
+    end
+
+    output_width, output_height, _ = size(output)
+    input_width, input_height, in_channels, batch_size = size(input)
+    weight_width, weight_height, in_channels_weight, out_channels = size(weight)
+
+    # it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
+    if !NNlib.flipkernel(cdims)
+        weight = reverse(weight, dims=(1, 2))
+    end
+
+    groups = cdims.groupcount
+    x_stride, y_stride = cdims.stride
+    x_dilation, y_dilation = cdims.dilation
+    out_channels_per_group = out_channels ÷ groups
+
+    if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance
+        # println("forward: very specialized case for maximum performance")
+
+        @tturbo for index_batch in 1:batch_size
+            for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
+                value = zero(T)
+                for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width)
+                # for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
+                    value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel]
+                end
+                output[x_out, y_out, out_channel, index_batch] = value
+            end
+        end
+
+    elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance
+        # println("forward: second specialized case for better performance")
+
+        @tturbo for index_batch in 1:batch_size
+            for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
+                m = y_out + (y_stride - 1) * (y_out - 1)
+                n = x_out + (x_stride - 1) * (x_out - 1)
+                value = zero(T)
+                # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width)
+                for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
+                    value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel]
+                end
+                output[x_out, y_out, out_channel, index_batch] = value
+            end
+        end
+
+    elseif groups == 1 # third specialized case for better performance
+        # println("forward: third specialized case for better performance")
+
+        @tturbo for index_batch in 1:batch_size
+            for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
+                m = y_out + (y_stride - 1) * (y_out - 1)
+                n = x_out + (x_stride - 1) * (x_out - 1)
+                value = zero(T)
+                # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width)
+                for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
+                    value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel]
+                end
+                output[x_out, y_out, out_channel, index_batch] = value
+            end
+        end
+
+    else # general case for any convolution
+        # println("forward: general case for any convolution")
+
+        @tturbo for index_batch in 1:batch_size
+            for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width
+                m = y_out + (y_stride - 1) * (y_out - 1)
+                n = x_out + (x_stride - 1) * (x_out - 1)
+                out_channel = (group * out_channels_per_group + 1) - out_channel_per_group
+                value = zero(T)
+                # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width)
+                for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width
+                    value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_weight + (group - 1) * in_channels_weight, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel]
+                end
+                output[x_out, y_out, out_channel, index_batch] = value
+            end
+        end
+
+    end
+
+    return output
+end
+
+function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real}
+
+    ∇conv_data!(
+        NNlib.insert_singleton_spatial_dimension(input_gradient, 1),
+        NNlib.insert_singleton_spatial_dimension(output_gradient, 1),
+        NNlib.insert_singleton_spatial_dimension(weight, 1),
+        NNlib.insert_singleton_spatial_dimension(cdims, 1)
+    )
+
+    return input_gradient
+end
+
+function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real}
+
+    # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise)
+    size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4])
+    cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel)
+    NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims)
+
+    if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance
+        # println("backward: very specialized case for maximum performance")
+
+        # storing all the necessary shapes
+        output_width, output_height, out_channels, batch_size = size(output_gradient)
+        weight_width, weight_height, in_channels_weight, out_channels = size(weight)
+
+        # because in the actual computation section, values are added, it's saver to reset the given input_gradient first
+        input_gradient .= zero(T)
+        # check if input_gradient must be padded (padding is done naively at the moment)
+        if cdims.padding != (0, 0, 0, 0)
+            input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2))
+        else
+            input_gradient_padded = input_gradient
+        end
+
+        # store the size of input after padding 
+        input_width, input_height, in_channels, batch_size = size(input_gradient_padded) # size after padding
+
+        # it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
+        if !NNlib.flipkernel(cdims)
+            weight = reverse(weight, dims=(1, 2))
+        end
+
+        output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0))
+        input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0))
+        weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0))
+
+        input_width, input_height, in_channels, batch_size = size(input_gradient_padded)
+        weight_width, weight_height, in_channels_weight, out_channels = size(weight)
+
+        @tturbo for index_batch in 0:batch_size-1
+            for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1
+
+                value = zero(T)
+                for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1)
+                # for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1
+
+                    is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width)
+                    is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height)
+                    output_gradient_value = (is_in_bound_x & is_in_bound_y) ? output_gradient[x_in - x_w, y_in - y_w, out_channel, index_batch] : zero(T)
+                    value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value
+                    
+                end
+                input_gradient_padded[x_in, y_in, in_channel, index_batch] = value
+
+            end
+        end
+
+        input_gradient_padded = input_gradient_padded.parent
+
+        # depad 
+        if cdims.padding != (0, 0, 0, 0)
+            x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding
+            input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :]
+        end
+
+    else # general case for any convolution 
+        input_gradient = ∇conv_data_im2col_grouped!(input_gradient, output_gradient, weight, cdims)
+    end        
+
+    return input_gradient
+end
\ No newline at end of file
diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl
new file mode 100644
index 000000000..5af194408
--- /dev/null
+++ b/ext/NNlibLoopVectorizationExt/pooling.jl
@@ -0,0 +1,110 @@
+#=
+Accelerated mean pooling for 2d-images using the power of LoopVectorization.
+The speed up is usually lower compared to conv but can be approximately up to 2x.
+
+Since the current state of LoopVectorization ∇meanpool! isn't really faster than the 
+original implementation in some situations, it is left out for the moment.
+
+Implementation inspired from here (Jonas Steinebach, MIT license):
+https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl
+=#
+
+function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real}
+    NNlib.check_dims(size(input), size(output), pdims)
+    
+    # storing all the necessary shapes
+    input_width, input_height, channels, batch_size = size(input)
+    output_width, output_height, channels, batch_size = size(output)
+    kernel_width, kernel_height = pdims.kernel_size
+
+    x_stride, y_stride = pdims.stride
+    x_dilation, y_dilation = pdims.dilation
+    x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding
+
+    # A helper function to project from output (w, h) to input (input_w, input_h)
+    @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1
+
+    # We use calc_padding_regions to split outselves up into separate regions that may or
+    # may not need to worry about padding:
+    pdims_3d = PoolDims((input_width, input_height, 1, channels, batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1))
+    # println(pdims_3d.padding)
+    padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d)
+
+    # We represent division by kernel size by rolling it
+    # into the `alpha` multiplier. 
+    _alpha = T(1 / prod(pdims.kernel_size))
+
+    # Start with the central region
+    w_region, h_region, _ = central_region
+
+    if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance
+        # println("specialized case for better performance")
+
+        @tturbo for index_batch in 1:batch_size
+            # compute pooling for each channel separatly
+            for channel in 1:channels, y_out in h_region, x_out in w_region
+                kernel_sum = zero(T)
+                for y_w in 1:kernel_height, x_w in 1:kernel_width
+                    kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch]
+                end
+                output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha
+            end
+        end
+
+    else # general case for any meanpooling
+        # println("general case for any meanpooling")
+
+        @tturbo for index_batch in 1:batch_size
+            # compute pooling for each channel separatly
+            for channel in 1:channels, y_out in h_region, x_out in w_region
+                m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1
+                n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1
+                kernel_sum = zero(T)
+                for y_w in 1:kernel_height, x_w in 1:kernel_width
+                    y_in = m + (y_w - 1) * y_dilation
+                    x_in = n + (x_w - 1) * x_dilation
+                    kernel_sum += input[x_in, y_in, channel, index_batch]
+                end
+                output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha
+            end
+        end
+
+    end
+
+    # Next, the padded regions
+    if pdims.padding != (0, 0, 0, 0)
+        @inbounds for (w_region, h_region, d_region) in padded_regions
+            for index_batch in 1:batch_size, channel in 1:channels
+                for z_out in d_region # for skipping the d_regions
+                for y_out in h_region
+                m = project(y_out, y_stride, y_pad1)
+                for x_out in w_region
+                n = project(x_out, x_stride, x_pad1)
+                kernel_sum = zero(T)
+
+                    for y_w in 1:kernel_height
+                        y_in = m + (y_w - 1) * y_dilation
+                        if y_in <= 0 || y_in > input_height
+                            continue
+                        end
+
+                        for x_w in 1:kernel_width
+                            x_in = n + (x_w - 1) * x_dilation
+                            if x_in <= 0 || x_in > input_width
+                                continue
+                            end
+
+                            kernel_sum += input[x_in, y_in, channel, index_batch]
+                        end
+                    end
+
+                output[x_out, y_out, channel, index_batch] = _alpha * kernel_sum
+                end
+                end
+                end
+            end
+        end
+    end
+
+    return output
+end
\ No newline at end of file
diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl
new file mode 100644
index 000000000..c081a8fb1
--- /dev/null
+++ b/test/ext_loopvectorization/minimal_test.jl
@@ -0,0 +1,89 @@
+using NNlib, LoopVectorization, CpuId
+
+function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real}
+
+    NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims)
+    
+    # storing all the necessary shapes
+    output_width, output_height, out_channels, current_batch_size = size(output_gradient)
+    weight_width, weight_height, in_channels_weight, out_channels = size(weight)
+    input_width, input_height, in_channels, current_batch_size = size(input_gradient)
+
+    if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1)
+        throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1"))
+    end
+
+    # it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
+    if !NNlib.flipkernel(cdims)
+        weight = reverse(weight, dims=(1, 2))
+    end
+
+    # because in the actual computation section, values are added, it's saver to reset the given input_gradient first
+    input_gradient .= zero(T)
+
+    for index_batch in 1:current_batch_size # Threads.@threads 
+        @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width
+            for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
+                input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch]
+            end
+        end
+    end
+
+    return input_gradient
+end
+
+function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real}
+
+    NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims)
+    
+    # storing all the necessary shapes
+    output_width, output_height, out_channels, current_batch_size = size(output_gradient)
+    weight_width, weight_height, in_channels_weight, out_channels = size(weight)
+    input_width, input_height, in_channels, current_batch_size = size(input_gradient)
+
+    if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1)
+        throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1"))
+    end
+
+    # it's necessary to flip the kernel if real convolution is performed (flipkernel=false)
+    if !NNlib.flipkernel(cdims)
+        weight = reverse(weight, dims=(1, 2))
+    end
+
+    # because in the actual computation section, values are added, it's saver to reset the given input_gradient first
+    input_gradient .= zero(T)
+
+    for index_batch in 1:current_batch_size # NO @threads here
+        for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here!
+            for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width
+                input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch]
+            end
+        end
+    end
+
+    return input_gradient
+end
+
+println(cpuinfo())
+
+dtype = Float32 # Float64
+batch_size = 5
+input = rand(dtype, 50, 50, 3, batch_size)
+weight = rand(dtype, 5, 5, 3, 9)
+cdims = NNlib.DenseConvDims(size(input), size(weight), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1)
+output_gradient = rand(dtype, NNlib.output_size(cdims)..., 9, batch_size)
+
+input_gradient_noavx = zeros(dtype, size(input)...)
+input_gradient_noavx = ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims)
+input_gradient_noavx = @time ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims)
+
+input_gradient_avx = zeros(dtype, size(input)...)
+input_gradient_avx = ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims)
+input_gradient_avx = @time ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims)
+
+@info isapprox(input_gradient_noavx, input_gradient_avx)
+@testset "conv bwd minimal" begin
+    @test isapprox(input_gradient_noavx, input_gradient_avx)
+end
+@show sum(input_gradient_noavx)
+@show sum(input_gradient_avx)
\ No newline at end of file
diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl
new file mode 100644
index 000000000..f3cc771c8
--- /dev/null
+++ b/test/ext_loopvectorization/runtests.jl
@@ -0,0 +1,87 @@
+using NNlib, Test, BenchmarkTools
+
+function compute_conv_outputs(settings::Vector{<:NNlib.ConvDims}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}, conv_output_grads::Vector{Array{T,4}}) where {T<:Real}
+    conv_outs = Vector{Array{T, 4}}(undef, length(settings))
+    conv_grads = Vector{Array{T, 4}}(undef, length(settings))
+
+    for (i, setting) in enumerate(settings)
+        if setting.groupcount > 1
+            weight = weight_grouped
+        else
+            weight = weight_ungrouped
+        end
+
+        out = @btime NNlib.conv($input, $weight, $setting)
+        output_gradient = conv_output_grads[i]
+
+        conv_grads[i] = @btime NNlib.∇conv_data($output_gradient, $weight, $setting)
+        conv_outs[i] = out
+    end
+
+    return conv_outs, conv_grads
+end
+
+function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T,4}) where {T<:Real}
+    pool_outs = Vector{Array{T, 4}}(undef, length(settings))
+
+    for (i, setting) in enumerate(settings)
+        pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation)
+        pool_outs[i] = @btime NNlib.meanpool($input, $pdims)
+    end
+
+    return pool_outs
+end
+
+@testset "Convolution & Pooling & Activations" begin
+    
+    dtype = Float32 # Float64
+    batch_size = 64 # 1 # 64 # 32
+    input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool
+    weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv
+    weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv
+    
+    conv_settings_list = [
+        NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case'
+        NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case'
+        NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case'
+        NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case'
+    ]
+
+    conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in conv_settings_list]
+
+    pool_settings_list = [
+        NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case'
+        NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case'
+    ]
+
+    modified_activations = [tanh, softsign, softplus, sigmoid, sigmoid_fast, hardsigmoid, logsigmoid, swish]
+
+    # compute outputs before loading LoopVectorization
+    
+    println("without LoopVectorization")
+    conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads)
+    pool_outs_std = compute_pool_outputs(pool_settings_list, input)
+    act_outs_std = [@btime $act.($input) for act in modified_activations]
+
+    using LoopVectorization # now load the NNlibLoopVectorizationExt
+
+    println("with LoopVectorization")
+    conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads)
+    pool_outs_lv = compute_pool_outputs(pool_settings_list, input)
+    act_outs_lv = [@btime $act.($input) for act in modified_activations]
+
+    # validate conv
+    @testset "Convolution" begin
+        @test all(isapprox.(conv_outs_std, conv_outs_lv))
+        @test all(isapprox.(conv_grads_std, conv_grads_lv))
+    end
+    # validate pool
+    @testset "Pooling" begin
+        @test all(isapprox.(pool_outs_std, pool_outs_lv))
+    end
+    # validate activations
+    @testset "Activations" begin
+        @test all(isapprox.(act_outs_std, act_outs_lv))
+    end
+
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 03602a40d..c78c74bcd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,7 +15,8 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv
 
 # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests
 # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests
-# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests
+# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests
+ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests
 
 const rng = StableRNG(123)
 include("test_utils.jl")
@@ -155,4 +156,16 @@ end
     else
         @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them."
     end
+
+    if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true"
+        @testset "LoopVectorization" begin
+            # Don't import LoopVectorization here! 
+            # Because the LV-impls are simply tested against NNlib's standard CPU impls,
+            # importing LoopVectorization here would load NNlibLoopVectorizationExt too early!
+            include("ext_loopvectorization/runtests.jl") 
+        end
+    else
+        @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them."
+    end
+
 end