diff --git a/README.md b/README.md
index 5a8fa61..09b5478 100644
--- a/README.md
+++ b/README.md
@@ -9,11 +9,11 @@ Ported from AviSynth plugin http://bengal.missouri.edu/~kes25c/
 Usage
 =====
 
-    tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes])
+    tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes=[0, 1, 2]])
 
 * clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported.
 
-* sigma: Standard deviation of gaussian blur. If a single sigma is specified, it will be used for all planes. The value used internally will be adjusted according to the subsampling of the second and third plane in each direction. If two sigma are given then the second value will be used for the third plane as well.
+* sigma: Standard deviation of gaussian blur. If a single `sigma` is specified, it will be used for all planes. If two `sigma` are given then the second value will be used for the third plane as well. The value used internally will be adjusted according to the subsampling of the second and third plane in each direction.
 
 * t_h: High gradient magnitude threshold for hysteresis.
 
@@ -30,7 +30,7 @@ Usage
   * 2 = the Sobel operator
   * 3 = the Scharr operator
 
-* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for mode=1.
+* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for `mode=1`.
 
 * opt: Sets which cpu optimizations to use.
   * 0 = auto detect
@@ -39,11 +39,11 @@ Usage
   * 3 = use avx
   * 4 = use avx2
 
-* planes: A list of the planes to process. By default all planes are processed.
+* planes: Sets which planes will be processed. Any unprocessed planes will be simply copied.
 
 ---
 
-    tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes])
+    tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes=[0, 1, 2]])
 
 * device: Sets target OpenCL device. Use `list_device` to get the index of the available devices. By default the default device is selected.
 
@@ -55,9 +55,10 @@ Usage
 Compilation
 ===========
 
-Requires `Boost` unless specify `-D opencl=false`.
+Requires `Boost` unless specify `-Dopencl=false`.
 
 ```
 meson build
 ninja -C build
+ninja -C build install
 ```
diff --git a/TCanny/TCanny.cl b/TCanny/TCanny.cl
index 81c168b..f1aadaf 100644
--- a/TCanny/TCanny.cl
+++ b/TCanny/TCanny.cl
@@ -2,27 +2,27 @@ static const char * source =
 "static __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;                                         \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void copyPlane_uint(__read_only image2d_t src, __write_only image2d_t blur, const float offset) {                                                           \n"
+"void copyPlane_uint(__read_only image2d_t src, __write_only image2d_t dst, const float offset) {                                                            \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
 "    const uint input = read_imageui(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const float output = input;                                                                                                                             \n"
-"    write_imagef(blur, (int2)(x, y), (float4)output);                                                                                                       \n"
+"    write_imagef(dst, (int2)(x, y), (float4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void copyPlane_float(__read_only image2d_t src, __write_only image2d_t blur, const float offset) {                                                          \n"
+"void copyPlane_float(__read_only image2d_t src, __write_only image2d_t dst, const float offset) {                                                           \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
 "    const float input = read_imagef(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const float output = input + offset;                                                                                                                    \n"
-"    write_imagef(blur, (int2)(x, y), (float4)output);                                                                                                       \n"
+"    write_imagef(dst, (int2)(x, y), (float4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void gaussianBlurH(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius) {                                   \n"
+"void gaussianBlurV_uint(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius, const float offset) {          \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
@@ -30,7 +30,7 @@ static const char * source =
 "    float sum = 0.f;                                                                                                                                        \n"
 "                                                                                                                                                            \n"
 "    for (int i = -radius; i <= radius; i++) {                                                                                                               \n"
-"        const float input = read_imagef(src, sampler, (int2)(x + i, y)).x;                                                                                  \n"
+"        const uint input = read_imageui(src, sampler, (int2)(x, y + i)).x;                                                                                  \n"
 "        sum += input * weights[i];                                                                                                                          \n"
 "    }                                                                                                                                                       \n"
 "                                                                                                                                                            \n"
@@ -38,7 +38,7 @@ static const char * source =
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void gaussianBlurV_uint(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius, const float offset) {          \n"
+"void gaussianBlurV_float(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius, const float offset) {         \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
@@ -46,15 +46,15 @@ static const char * source =
 "    float sum = 0.f;                                                                                                                                        \n"
 "                                                                                                                                                            \n"
 "    for (int i = -radius; i <= radius; i++) {                                                                                                               \n"
-"        const uint input = read_imageui(src, sampler, (int2)(x, y + i)).x;                                                                                  \n"
-"        sum += input * weights[i];                                                                                                                          \n"
+"        const float input = read_imagef(src, sampler, (int2)(x, y + i)).x;                                                                                  \n"
+"        sum += (input + offset) * weights[i];                                                                                                               \n"
 "    }                                                                                                                                                       \n"
 "                                                                                                                                                            \n"
 "    write_imagef(dst, (int2)(x, y), (float4)sum);                                                                                                           \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void gaussianBlurV_float(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius, const float offset) {         \n"
+"void gaussianBlurH(__read_only image2d_t src, __write_only image2d_t dst, __constant float * weights, const int radius) {                                   \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
@@ -62,8 +62,8 @@ static const char * source =
 "    float sum = 0.f;                                                                                                                                        \n"
 "                                                                                                                                                            \n"
 "    for (int i = -radius; i <= radius; i++) {                                                                                                               \n"
-"        const float input = read_imagef(src, sampler, (int2)(x, y + i)).x;                                                                                  \n"
-"        sum += (input + offset) * weights[i];                                                                                                               \n"
+"        const float input = read_imagef(src, sampler, (int2)(x + i, y)).x;                                                                                  \n"
+"        sum += input * weights[i];                                                                                                                          \n"
 "    }                                                                                                                                                       \n"
 "                                                                                                                                                            \n"
 "    write_imagef(dst, (int2)(x, y), (float4)sum);                                                                                                           \n"
@@ -115,14 +115,14 @@ static const char * source =
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void nonMaximumSuppression(__read_only image2d_t gradient, __read_only image2d_t direction, __global float * buffer) {                                      \n"
+"void nonMaximumSuppression(__read_only image2d_t direction, __read_only image2d_t gradient, __global float * buffer) {                                      \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
-"    const int pos = mad24(get_image_width(gradient), y, x);                                                                                                 \n"
+"    const int pos = mad24(get_image_width(direction), y, x);                                                                                                \n"
 "                                                                                                                                                            \n"
-"    const float input = read_imagef(gradient, sampler, (int2)(x, y)).x;                                                                                     \n"
 "    const uint bin = read_imageui(direction, sampler, (int2)(x, y)).x;                                                                                      \n"
+"    const float input = read_imagef(gradient, sampler, (int2)(x, y)).x;                                                                                     \n"
 "    float output;                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "    if (bin == 0) {                                                                                                                                         \n"
@@ -147,7 +147,7 @@ static const char * source =
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel __attribute__((reqd_work_group_size(8, 8, 1)))                                                                                                     \n"
-"void hysteresis(__global float * buffer, __global uchar * label, const int width, const int height) {                                                       \n"
+"void hysteresis(__global float * srcp, __global uchar * found, const int width, const int height) {                                                         \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "    const int localId = mad24(8, (int)get_local_id(1), (int)get_local_id(0));                                                                               \n"
@@ -157,7 +157,7 @@ static const char * source =
 "                                                                                                                                                            \n"
 "    const int pos = mad24(width, y, x);                                                                                                                     \n"
 "                                                                                                                                                            \n"
-"    __local int2 localCoordinates[256];                                                                                                                     \n"
+"    __local int2 localCoordinates[512];                                                                                                                     \n"
 "    __local int localCount;                                                                                                                                 \n"
 "                                                                                                                                                            \n"
 "    if (localId == 0)                                                                                                                                       \n"
@@ -165,16 +165,16 @@ static const char * source =
 "                                                                                                                                                            \n"
 "    barrier(CLK_LOCAL_MEM_FENCE);                                                                                                                           \n"
 "                                                                                                                                                            \n"
-"    if (!label[pos] && buffer[pos] >= T_H) {                                                                                                                \n"
-"        label[pos] = UCHAR_MAX;                                                                                                                             \n"
-"        buffer[pos] = FLT_MAX;                                                                                                                              \n"
+"    if (!found[pos] && srcp[pos] >= T_H) {                                                                                                                  \n"
+"        srcp[pos] = FLT_MAX;                                                                                                                                \n"
+"        found[pos] = UCHAR_MAX;                                                                                                                             \n"
 "                                                                                                                                                            \n"
 "        localCoordinates[atomic_inc(&localCount)] = (int2)(x, y);                                                                                           \n"
 "    }                                                                                                                                                       \n"
 "                                                                                                                                                            \n"
 "    barrier(CLK_LOCAL_MEM_FENCE);                                                                                                                           \n"
 "                                                                                                                                                            \n"
-"    int2 privateCoordinates[8];                                                                                                                             \n"
+"    int2 privateCoordinates[64];                                                                                                                            \n"
 "    int privateCount = 0;                                                                                                                                   \n"
 "                                                                                                                                                            \n"
 "    while (localCount != 0) {                                                                                                                               \n"
@@ -193,9 +193,9 @@ static const char * source =
 "                for (int xx = xxStart; xx <= xxStop; xx++) {                                                                                                \n"
 "                    const int pos3 = mad24(width, yy, xx);                                                                                                  \n"
 "                                                                                                                                                            \n"
-"                    if (!label[pos3] && buffer[pos3] >= T_L) {                                                                                              \n"
-"                        label[pos3] = UCHAR_MAX;                                                                                                            \n"
-"                        buffer[pos3] = FLT_MAX;                                                                                                             \n"
+"                    if (!found[pos3] && srcp[pos3] >= T_L) {                                                                                                \n"
+"                        srcp[pos3] = FLT_MAX;                                                                                                               \n"
+"                        found[pos3] = UCHAR_MAX;                                                                                                            \n"
 "                                                                                                                                                            \n"
 "                        privateCoordinates[privateCount++] = (int2)(xx, yy);                                                                                \n"
 "                    }                                                                                                                                       \n"
@@ -213,65 +213,65 @@ static const char * source =
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void outputGB_uint(__read_only image2d_t blur, __write_only image2d_t dst, const uint peak, const float offset) {                                           \n"
+"void outputGB_uint(__read_only image2d_t src, __write_only image2d_t dst, const uint peak, const float offset) {                                            \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
-"    const float input = read_imagef(blur, sampler, (int2)(x, y)).x;                                                                                         \n"
+"    const float input = read_imagef(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const uint output = min((uint)(input + 0.5f), peak);                                                                                                    \n"
 "    write_imageui(dst, (int2)(x, y), (uint4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void outputGB_float(__read_only image2d_t blur, __write_only image2d_t dst, const uint peak, const float offset) {                                          \n"
+"void outputGB_float(__read_only image2d_t src, __write_only image2d_t dst, const uint peak, const float offset) {                                           \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
-"    const float input = read_imagef(blur, sampler, (int2)(x, y)).x;                                                                                         \n"
+"    const float input = read_imagef(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const float output = input - offset;                                                                                                                    \n"
 "    write_imagef(dst, (int2)(x, y), (float4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void binarizeCE_uint(const __global float * buffer, __write_only image2d_t dst, const uint peak, const float lower, const float upper) {                    \n"
+"void binarizeCE_uint(const __global float * src, __write_only image2d_t dst, const uint peak, const float lower, const float upper) {                       \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
 "    const int pos = mad24(get_image_width(dst), y, x);                                                                                                      \n"
 "                                                                                                                                                            \n"
-"    const float input = buffer[pos];                                                                                                                        \n"
+"    const float input = src[pos];                                                                                                                           \n"
 "    const uint output = select(0U, peak, input == FLT_MAX);                                                                                                 \n"
 "    write_imageui(dst, (int2)(x, y), (uint4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void binarizeCE_float(const __global float * buffer, __write_only image2d_t dst, const uint peak, const float lower, const float upper) {                   \n"
+"void binarizeCE_float(const __global float * src, __write_only image2d_t dst, const uint peak, const float lower, const float upper) {                      \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
 "    const int pos = mad24(get_image_width(dst), y, x);                                                                                                      \n"
 "                                                                                                                                                            \n"
-"    const float input = buffer[pos];                                                                                                                        \n"
+"    const float input = src[pos];                                                                                                                           \n"
 "    const float output = select(lower, upper, input == FLT_MAX);                                                                                            \n"
 "    write_imagef(dst, (int2)(x, y), (float4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void discretizeGM_uint(__read_only image2d_t gradient, __write_only image2d_t dst, const uint peak, const float offset) {                                   \n"
+"void discretizeGM_uint(__read_only image2d_t src, __write_only image2d_t dst, const uint peak, const float offset) {                                        \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
-"    const float input = read_imagef(gradient, sampler, (int2)(x, y)).x;                                                                                     \n"
+"    const float input = read_imagef(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const uint output = min((uint)(input * MAGNITUDE + 0.5f), peak);                                                                                        \n"
 "    write_imageui(dst, (int2)(x, y), (uint4)output);                                                                                                        \n"
 "}                                                                                                                                                           \n"
 "                                                                                                                                                            \n"
 "__kernel                                                                                                                                                    \n"
-"void discretizeGM_float(__read_only image2d_t gradient, __write_only image2d_t dst, const uint peak, const float offset) {                                  \n"
+"void discretizeGM_float(__read_only image2d_t src, __write_only image2d_t dst, const uint peak, const float offset) {                                       \n"
 "    const int x = get_global_id(0);                                                                                                                         \n"
 "    const int y = get_global_id(1);                                                                                                                         \n"
 "                                                                                                                                                            \n"
-"    const float input = read_imagef(gradient, sampler, (int2)(x, y)).x;                                                                                     \n"
+"    const float input = read_imagef(src, sampler, (int2)(x, y)).x;                                                                                          \n"
 "    const float output = input * MAGNITUDE - offset;                                                                                                        \n"
 "    write_imagef(dst, (int2)(x, y), (float4)output);                                                                                                        \n"
 "}                                                                                                                                                             ";
\ No newline at end of file
diff --git a/TCanny/TCanny.cpp b/TCanny/TCanny.cpp
index b6bf278..73df4bc 100644
--- a/TCanny/TCanny.cpp
+++ b/TCanny/TCanny.cpp
@@ -20,121 +20,93 @@
 **   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
-#include <vector>
-
 #include "TCanny.hpp"
 
 #ifdef VS_TARGET_CPU_X86
-template<typename T> extern void copyPlane_sse2(const T *, float *, const int, const int, const int, const int, const float) noexcept;
-template<typename T> extern void copyPlane_avx(const T *, float *, const int, const int, const int, const int, const float) noexcept;
-template<typename T> extern void copyPlane_avx2(const T *, float *, const int, const int, const int, const int, const float) noexcept;
-
-extern void gaussianBlurH_sse2(float *, float *, const float *, const int, const int) noexcept;
-extern void gaussianBlurH_avx(float *, float *, const float *, const int, const int) noexcept;
-extern void gaussianBlurH_avx2(float *, float *, const float *, const int, const int) noexcept;
+template<typename T> extern void filter_sse2(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template<typename T> extern void filter_avx(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template<typename T> extern void filter_avx2(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+#endif
 
-template<typename T> extern void gaussianBlurV_sse2(const T *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template<typename T> extern void gaussianBlurV_avx(const T *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template<typename T> extern void gaussianBlurV_avx2(const T *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
+template<typename T>
+static void copyPlane(const T * srcp, float * VS_RESTRICT dstp, const int width, const int height, const int srcStride, const int dstStride, const float offset) noexcept {
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            if (std::is_integral<T>::value)
+                dstp[x] = srcp[x];
+            else
+                dstp[x] = srcp[x] + offset;
+        }
 
-extern void detectEdge_sse2(float *, float *, unsigned *, const int, const int, const int, const int, const int, const unsigned) noexcept;
-extern void detectEdge_avx(float *, float *, unsigned *, const int, const int, const int, const int, const int, const unsigned) noexcept;
-extern void detectEdge_avx2(float *, float *, unsigned *, const int, const int, const int, const int, const int, const unsigned) noexcept;
+        srcp += srcStride;
+        dstp += dstStride;
+    }
+}
 
-extern void nonMaximumSuppression_sse2(const unsigned *, float *, float *, const int, const int, const int, const int) noexcept;
-extern void nonMaximumSuppression_avx(const unsigned *, float *, float *, const int, const int, const int, const int) noexcept;
-extern void nonMaximumSuppression_avx2(const unsigned *, float *, float *, const int, const int, const int, const int) noexcept;
+template<typename T>
+static void gaussianBlur(const T * _srcp, float * VS_RESTRICT temp, float * VS_RESTRICT dstp, const float * weightsH, const float * weightsV, const int width, const int height,
+                         const int srcStride, const int dstStride, const int radiusH, const int radiusV, const float offset) noexcept {
+    const int diameter = radiusV * 2 + 1;
+    const T ** srcp = new const T *[diameter];
 
-template<typename T> extern void outputGB_sse2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
-template<typename T> extern void outputGB_avx(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
-template<typename T> extern void outputGB_avx2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
+    srcp[radiusV] = _srcp;
+    for (int i = 1; i <= radiusV; i++) {
+        srcp[radiusV - i] = srcp[radiusV - 1 + i];
+        srcp[radiusV + i] = srcp[radiusV] + srcStride * i;
+    }
 
-template<typename T> extern void binarizeCE_sse2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
-template<typename T> extern void binarizeCE_avx(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
-template<typename T> extern void binarizeCE_avx2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
+    weightsH += radiusH;
 
-template<typename T> extern void discretizeGM_sse2(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
-template<typename T> extern void discretizeGM_avx(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
-template<typename T> extern void discretizeGM_avx2(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
-#endif
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            float sum = 0.f;
 
-template<typename T> static void (*copyPlane)(const T *, float *, const int, const int, const int, const int, const float) = nullptr;
-template<typename T> static void (*gaussianBlurV)(const T *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) = nullptr;
-static void (*detectEdge)(float *, float *, unsigned *, const int, const int, const int, const int, const int, const unsigned) = nullptr;
-static void (*nonMaximumSuppression)(const unsigned *, float *, float *, const int, const int, const int, const int) = nullptr;
-template<typename T> static void (*outputGB)(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) = nullptr;
-template<typename T> static void (*binarizeCE)(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) = nullptr;
-template<typename T> static void (*discretizeGM)(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) = nullptr;
-
-struct TCannyData {
-    VSNodeRef * node;
-    const VSVideoInfo * vi;
-    float t_h, t_l;
-    int mode, op;
-    bool process[3];
-    float * horizontalWeights[3], * verticalWeights[3];
-    int horizontalRadius[3], verticalRadius[3], radiusAlign;
-    float magnitude;
-    uint16_t peak;
-    float offset[3], lower[3], upper[3];
-    std::unordered_map<std::thread::id, float *> buffer, blur, gradient;
-    std::unordered_map<std::thread::id, unsigned *> direction;
-    std::unordered_map<std::thread::id, bool *> label;
-};
+            for (int i = 0; i < diameter; i++) {
+                if (std::is_integral<T>::value)
+                    sum += srcp[i][x] * weightsV[i];
+                else
+                    sum += (srcp[i][x] + offset) * weightsV[i];
+            }
 
-template<typename T>
-static void copyPlane_c(const T * srcp, float * VS_RESTRICT blur, const int width, const int height, const int stride, const int bgStride, const float offset) noexcept {
-    if (std::is_integral<T>::value) {
-        for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x++)
-                blur[x] = srcp[x];
-
-            srcp += stride;
-            blur += bgStride;
+            temp[x] = sum;
         }
-    } else {
-        if (offset) {
-            for (int y = 0; y < height; y++) {
-                for (int x = 0; x < width; x++)
-                    blur[x] = srcp[x] + offset;
 
-                srcp += stride;
-                blur += bgStride;
-            }
-        } else {
-            vs_bitblt(blur, bgStride * sizeof(float), srcp, stride * sizeof(float), width * sizeof(float), height);
+        for (int i = 1; i <= radiusH; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
         }
-    }
-}
-
-static inline void gaussianBlurH_c(float * VS_RESTRICT buffer, float * VS_RESTRICT blur, const float * weights, const int width, const int radius) noexcept {
-    weights += radius;
 
-    for (int i = 1; i <= radius; i++) {
-        buffer[-i] = buffer[-1 + i];
-        buffer[width - 1 + i] = buffer[width - i];
-    }
+        for (int x = 0; x < width; x++) {
+            float sum = 0.f;
 
-    for (int x = 0; x < width; x++) {
-        float sum = 0.f;
+            for (int i = -radiusH; i <= radiusH; i++)
+                sum += temp[x + i] * weightsH[i];
 
-        for (int i = -radius; i <= radius; i++)
-            sum += buffer[x + i] * weights[i];
+            dstp[x] = sum;
+        }
 
-        blur[x] = sum;
+        for (int i = 0; i < diameter - 1; i++)
+            srcp[i] = srcp[i + 1];
+        if (y < height - 1 - radiusV)
+            srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radiusV)
+            srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
+
+    delete[] srcp;
 }
 
 template<typename T>
-static void gaussianBlurV_c(const T * _srcp, float * VS_RESTRICT buffer, float * VS_RESTRICT blur, const float * horizontalWeights, const float * verticalWeights,
-                            const int width, const int height, const int stride, const int bgStride, const int horizontalRadius, const int verticalRadius, const float offset) noexcept {
-    const int diameter = verticalRadius * 2 + 1;
+static void gaussianBlurV(const T * _srcp, float * VS_RESTRICT dstp, const float * weights, const int width, const int height, const int srcStride, const int dstStride,
+                          const int radius, const float offset) noexcept {
+    const int diameter = radius * 2 + 1;
     const T ** srcp = new const T *[diameter];
 
-    srcp[verticalRadius] = _srcp;
-    for (int i = 1; i <= verticalRadius; i++) {
-        srcp[verticalRadius - i] = srcp[verticalRadius - 1 + i];
-        srcp[verticalRadius + i] = srcp[verticalRadius] + stride * i;
+    srcp[radius] = _srcp;
+    for (int i = 1; i <= radius; i++) {
+        srcp[radius - i] = srcp[radius - 1 + i];
+        srcp[radius + i] = srcp[radius] + srcStride * i;
     }
 
     for (int y = 0; y < height; y++) {
@@ -143,30 +115,60 @@ static void gaussianBlurV_c(const T * _srcp, float * VS_RESTRICT buffer, float *
 
             for (int i = 0; i < diameter; i++) {
                 if (std::is_integral<T>::value)
-                    sum += srcp[i][x] * verticalWeights[i];
+                    sum += srcp[i][x] * weights[i];
                 else
-                    sum += (srcp[i][x] + offset) * verticalWeights[i];
+                    sum += (srcp[i][x] + offset) * weights[i];
             }
 
-            buffer[x] = sum;
+            dstp[x] = sum;
         }
 
-        gaussianBlurH_c(buffer, blur, horizontalWeights, width, horizontalRadius);
-
         for (int i = 0; i < diameter - 1; i++)
             srcp[i] = srcp[i + 1];
-        if (y < height - 1 - verticalRadius)
-            srcp[diameter - 1] += stride;
-        else if (y > height - 1 - verticalRadius)
-            srcp[diameter - 1] -= stride;
-        blur += bgStride;
+        if (y < height - 1 - radius)
+            srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radius)
+            srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
 
     delete[] srcp;
 }
 
-static void detectEdge_c(float * blur, float * VS_RESTRICT gradient, unsigned * VS_RESTRICT direction, const int width, const int height, const int stride, const int bgStride,
-                         const int mode, const unsigned op) noexcept {
+template<typename T>
+static void gaussianBlurH(const T * srcp, float * VS_RESTRICT temp, float * VS_RESTRICT dstp, const float * weights, const int width, const int height,
+                          const int srcStride, const int dstStride, const int radius, const float offset) noexcept {
+    weights += radius;
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            if (std::is_integral<T>::value)
+                temp[x] = srcp[x];
+            else
+                temp[x] = srcp[x] + offset;
+        }
+
+        for (int i = 1; i <= radius; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x++) {
+            float sum = 0.f;
+
+            for (int i = -radius; i <= radius; i++)
+                sum += temp[x + i] * weights[i];
+
+            dstp[x] = sum;
+        }
+
+        srcp += srcStride;
+        dstp += dstStride;
+    }
+}
+
+static void detectEdge(float * blur, float * VS_RESTRICT gradient, unsigned * VS_RESTRICT direction, const int width, const int height, const int stride, const int bgStride,
+                       const int mode, const int op) noexcept {
     float * VS_RESTRICT srcpp = blur;
     float * VS_RESTRICT srcp = blur;
     float * VS_RESTRICT srcpn = blur + bgStride;
@@ -216,16 +218,16 @@ static void detectEdge_c(float * blur, float * VS_RESTRICT gradient, unsigned *
     }
 }
 
-static void nonMaximumSuppression_c(const unsigned * direction, float * VS_RESTRICT gradient, float * VS_RESTRICT blur, const int width, const int height,
-                                    const int stride, const int bgStride) noexcept {
+static void nonMaximumSuppression(const unsigned * direction, float * VS_RESTRICT gradient, float * VS_RESTRICT blur, const int width, const int height,
+                                  const int stride, const int bgStride, const int radiusAlign) noexcept {
     const int offsets[] = { 1, -bgStride + 1, -bgStride, -bgStride - 1 };
 
     gradient[-1] = gradient[0];
     gradient[-1 + bgStride * (height - 1)] = gradient[bgStride * (height - 1)];
     gradient[width] = gradient[width - 1];
     gradient[width + bgStride * (height - 1)] = gradient[width - 1 + bgStride * (height - 1)];
-    std::copy_n(gradient - 8, width + 16, gradient - 8 - bgStride);
-    std::copy_n(gradient - 8 + bgStride * (height - 1), width + 16, gradient - 8 + bgStride * height);
+    std::copy_n(gradient - radiusAlign, width + radiusAlign * 2, gradient - radiusAlign - bgStride);
+    std::copy_n(gradient - radiusAlign + bgStride * (height - 1), width + radiusAlign * 2, gradient - radiusAlign + bgStride * height);
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
@@ -239,236 +241,149 @@ static void nonMaximumSuppression_c(const unsigned * direction, float * VS_RESTR
     }
 }
 
-static void hysteresis(float * VS_RESTRICT blur, bool * VS_RESTRICT label, const int width, const int height, const int bgStride, const float t_h, const float t_l) noexcept {
-    std::fill_n(label, width * height, false);
-
-    std::vector<std::pair<int, int>> coordinates;
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            if (!label[width * y + x] && blur[bgStride * y + x] >= t_h) {
-                label[width * y + x] = true;
-                blur[bgStride * y + x] = fltMax;
-
-                coordinates.emplace_back(std::make_pair(x, y));
-
-                while (!coordinates.empty()) {
-                    const auto pos = coordinates.back();
-                    coordinates.pop_back();
-
-                    const int xxStart = std::max(pos.first - 1, 0);
-                    const int xxStop = std::min(pos.first + 1, width - 1);
-                    const int yyStart = std::max(pos.second - 1, 0);
-                    const int yyStop = std::min(pos.second + 1, height - 1);
-
-                    for (int yy = yyStart; yy <= yyStop; yy++) {
-                        for (int xx = xxStart; xx <= xxStop; xx++) {
-                            if (!label[width * yy + xx] && blur[bgStride * yy + xx] >= t_l) {
-                                label[width * yy + xx] = true;
-                                blur[bgStride * yy + xx] = fltMax;
-
-                                coordinates.emplace_back(std::make_pair(xx, yy));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 template<typename T>
-static void outputGB_c(const float * blur, T * VS_RESTRICT dstp, const int width, const int height, const int stride, const int bgStride,
-                       const uint16_t peak, const float offset) noexcept {
+static void outputGB(const float * srcp, T * VS_RESTRICT dstp, const int width, const int height, const int srcStride, const int dstStride,
+                     const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             if (std::is_integral<T>::value)
-                dstp[x] = std::min<unsigned>(blur[x] + 0.5f, peak);
+                dstp[x] = std::min<unsigned>(srcp[x] + 0.5f, peak);
             else
-                dstp[x] = blur[x] - offset;
+                dstp[x] = srcp[x] - offset;
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<typename T>
-static void binarizeCE_c(const float * blur, T * VS_RESTRICT dstp, const int width, const int height, const int stride, const int bgStride,
-                         const uint16_t peak, const float lower, const float upper) noexcept {
+static void binarizeCE(const float * srcp, T * VS_RESTRICT dstp, const int width, const int height, const int srcStride, const int dstStride,
+                       const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             if (std::is_integral<T>::value)
-                dstp[x] = (blur[x] == fltMax) ? peak : 0;
+                dstp[x] = (srcp[x] == fltMax) ? peak : 0;
             else
-                dstp[x] = (blur[x] == fltMax) ? upper : lower;
+                dstp[x] = (srcp[x] == fltMax) ? upper : lower;
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<typename T>
-static void discretizeGM_c(const float * gradient, T * VS_RESTRICT dstp, const int width, const int height, const int stride, const int bgStride,
-                           const float magnitude, const uint16_t peak, const float offset) noexcept {
+static void discretizeGM(const float * srcp, T * VS_RESTRICT dstp, const int width, const int height, const int srcStride, const int dstStride,
+                         const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             if (std::is_integral<T>::value)
-                dstp[x] = std::min<unsigned>(gradient[x] * magnitude + 0.5f, peak);
+                dstp[x] = std::min<unsigned>(srcp[x] * magnitude + 0.5f, peak);
             else
-                dstp[x] = gradient[x] * magnitude - offset;
+                dstp[x] = srcp[x] * magnitude - offset;
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<typename T>
-static void filter(const VSFrameRef * src, VSFrameRef * dst, const TCannyData * d, const VSAPI * vsapi) noexcept {
+static void filter_c(const VSFrameRef * src, VSFrameRef * dst, const TCannyData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
     for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
         if (d->process[plane]) {
             const int width = vsapi->getFrameWidth(src, plane);
             const int height = vsapi->getFrameHeight(src, plane);
             const int stride = vsapi->getStride(src, plane) / sizeof(T);
-            const int bgStride = stride + 16;
+            const int bgStride = stride + d->radiusAlign * 2;
             const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
             T * dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
 
             const auto threadId = std::this_thread::get_id();
-            float * buffer = d->buffer.at(threadId) + d->radiusAlign;
-            float * blur = d->blur.at(threadId) + 8;
-            float * gradient = d->gradient.at(threadId) + bgStride + 8;
+            float * blur = d->blur.at(threadId) + d->radiusAlign;
+            float * gradient = d->gradient.at(threadId) + bgStride + d->radiusAlign;
             unsigned * direction = d->direction.at(threadId);
-            bool * label = d->label.at(threadId);
-
-            if (d->horizontalRadius[plane])
-                gaussianBlurV<T>(srcp, buffer, blur, d->horizontalWeights[plane], d->verticalWeights[plane], width, height, stride, bgStride,
-                                 d->horizontalRadius[plane], d->verticalRadius[plane], d->offset[plane]);
+            bool * found = d->found.at(threadId);
+
+            if (d->radiusV[plane] && d->radiusH[plane])
+                gaussianBlur(srcp, gradient, blur, d->weightsH[plane], d->weightsV[plane], width, height, stride, bgStride, d->radiusH[plane], d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusV[plane])
+                gaussianBlurV(srcp, blur, d->weightsV[plane], width, height, stride, bgStride, d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusH[plane])
+                gaussianBlurH(srcp, gradient, blur, d->weightsH[plane], width, height, stride, bgStride, d->radiusH[plane], d->offset[plane]);
             else
-                copyPlane<T>(srcp, blur, width, height, stride, bgStride, d->offset[plane]);
+                copyPlane(srcp, blur, width, height, stride, bgStride, d->offset[plane]);
 
             if (d->mode != -1) {
                 detectEdge(blur, gradient, direction, width, height, stride, bgStride, d->mode, d->op);
 
                 if (d->mode == 0) {
-                    nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride);
-                    hysteresis(blur, label, width, height, bgStride, d->t_h, d->t_l);
+                    nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, d->radiusAlign);
+                    hysteresis(blur, found, width, height, bgStride, d->t_h, d->t_l);
                 }
             }
 
             if (d->mode == -1)
-                outputGB<T>(blur, dstp, width, height, stride, bgStride, d->peak, d->offset[plane]);
+                outputGB(blur, dstp, width, height, bgStride, stride, d->peak, d->offset[plane]);
             else if (d->mode == 0)
-                binarizeCE<T>(blur, dstp, width, height, stride, bgStride, d->peak, d->lower[plane], d->upper[plane]);
+                binarizeCE(blur, dstp, width, height, bgStride, stride, d->peak, d->lower[plane], d->upper[plane]);
             else
-                discretizeGM<T>(gradient, dstp, width, height, stride, bgStride, d->magnitude, d->peak, d->offset[plane]);
+                discretizeGM(gradient, dstp, width, height, bgStride, stride, d->magnitude, d->peak, d->offset[plane]);
         }
     }
 }
 
-static void selectFunctions(const unsigned opt) noexcept {
-    copyPlane<uint8_t> = copyPlane_c;
-    copyPlane<uint16_t> = copyPlane_c;
-    copyPlane<float> = copyPlane_c;
-
-    gaussianBlurV<uint8_t> = gaussianBlurV_c;
-    gaussianBlurV<uint16_t> = gaussianBlurV_c;
-    gaussianBlurV<float> = gaussianBlurV_c;
-
-    detectEdge = detectEdge_c;
-
-    nonMaximumSuppression = nonMaximumSuppression_c;
-
-    outputGB<uint8_t> = outputGB_c;
-    outputGB<uint16_t> = outputGB_c;
-    outputGB<float> = outputGB_c;
-
-    binarizeCE<uint8_t> = binarizeCE_c;
-    binarizeCE<uint16_t> = binarizeCE_c;
-    binarizeCE<float> = binarizeCE_c;
-
-    discretizeGM<uint8_t> = discretizeGM_c;
-    discretizeGM<uint16_t> = discretizeGM_c;
-    discretizeGM<float> = discretizeGM_c;
+static void selectFunctions(const unsigned opt, TCannyData * d) noexcept {
+    d->vectorSize = 1;
+    d->alignment = 4;
 
 #ifdef VS_TARGET_CPU_X86
     const int iset = instrset_detect();
-    if ((opt == 0 && iset >= 8) || opt == 4) {
-        copyPlane<uint8_t> = copyPlane_avx2;
-        copyPlane<uint16_t> = copyPlane_avx2;
-        copyPlane<float> = copyPlane_avx2;
-
-        gaussianBlurV<uint8_t> = gaussianBlurV_avx2;
-        gaussianBlurV<uint16_t> = gaussianBlurV_avx2;
-        gaussianBlurV<float> = gaussianBlurV_avx2;
-
-        detectEdge = detectEdge_avx2;
-
-        nonMaximumSuppression = nonMaximumSuppression_avx2;
-
-        outputGB<uint8_t> = outputGB_avx2;
-        outputGB<uint16_t> = outputGB_avx2;
-        outputGB<float> = outputGB_avx2;
-
-        binarizeCE<uint8_t> = binarizeCE_avx2;
-        binarizeCE<uint16_t> = binarizeCE_avx2;
-        binarizeCE<float> = binarizeCE_avx2;
-
-        discretizeGM<uint8_t> = discretizeGM_avx2;
-        discretizeGM<uint16_t> = discretizeGM_avx2;
-        discretizeGM<float> = discretizeGM_avx2;
-    } else if ((opt == 0 && iset == 7) || opt == 3) {
-        copyPlane<uint8_t> = copyPlane_avx;
-        copyPlane<uint16_t> = copyPlane_avx;
-        copyPlane<float> = copyPlane_avx;
-
-        gaussianBlurV<uint8_t> = gaussianBlurV_avx;
-        gaussianBlurV<uint16_t> = gaussianBlurV_avx;
-        gaussianBlurV<float> = gaussianBlurV_avx;
-
-        detectEdge = detectEdge_avx;
 
-        nonMaximumSuppression = nonMaximumSuppression_avx;
-
-        outputGB<uint8_t> = outputGB_avx;
-        outputGB<uint16_t> = outputGB_avx;
-        outputGB<float> = outputGB_avx;
-
-        binarizeCE<uint8_t> = binarizeCE_avx;
-        binarizeCE<uint16_t> = binarizeCE_avx;
-        binarizeCE<float> = binarizeCE_avx;
-
-        discretizeGM<uint8_t> = discretizeGM_avx;
-        discretizeGM<uint16_t> = discretizeGM_avx;
-        discretizeGM<float> = discretizeGM_avx;
+    if ((opt == 0 && iset >= 7) || opt >= 3) {
+        d->vectorSize = 8;
+        d->alignment = 32;
     } else if ((opt == 0 && iset >= 2) || opt == 2) {
-        copyPlane<uint8_t> = copyPlane_sse2;
-        copyPlane<uint16_t> = copyPlane_sse2;
-        copyPlane<float> = copyPlane_sse2;
-
-        gaussianBlurV<uint8_t> = gaussianBlurV_sse2;
-        gaussianBlurV<uint16_t> = gaussianBlurV_sse2;
-        gaussianBlurV<float> = gaussianBlurV_sse2;
-
-        detectEdge = detectEdge_sse2;
+        d->vectorSize = 4;
+        d->alignment = 16;
+    }
+#endif
 
-        nonMaximumSuppression = nonMaximumSuppression_sse2;
+    if (d->vi->format->bytesPerSample == 1) {
+        d->filter = filter_c<uint8_t>;
 
-        outputGB<uint8_t> = outputGB_sse2;
-        outputGB<uint16_t> = outputGB_sse2;
-        outputGB<float> = outputGB_sse2;
+#ifdef VS_TARGET_CPU_X86
+        if ((opt == 0 && iset >= 8) || opt == 4)
+            d->filter = filter_avx2<uint8_t>;
+        else if ((opt == 0 && iset == 7) || opt == 3)
+            d->filter = filter_avx<uint8_t>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
+            d->filter = filter_sse2<uint8_t>;
+#endif
+    } else if (d->vi->format->bytesPerSample == 2) {
+        d->filter = filter_c<uint16_t>;
 
-        binarizeCE<uint8_t> = binarizeCE_sse2;
-        binarizeCE<uint16_t> = binarizeCE_sse2;
-        binarizeCE<float> = binarizeCE_sse2;
+#ifdef VS_TARGET_CPU_X86
+        if ((opt == 0 && iset >= 8) || opt == 4)
+            d->filter = filter_avx2<uint16_t>;
+        else if ((opt == 0 && iset == 7) || opt == 3)
+            d->filter = filter_avx<uint16_t>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
+            d->filter = filter_sse2<uint16_t>;
+#endif
+    } else {
+        d->filter = filter_c<float>;
 
-        discretizeGM<uint8_t> = discretizeGM_sse2;
-        discretizeGM<uint16_t> = discretizeGM_sse2;
-        discretizeGM<float> = discretizeGM_sse2;
-    }
+#ifdef VS_TARGET_CPU_X86
+        if ((opt == 0 && iset >= 8) || opt == 4)
+            d->filter = filter_avx2<float>;
+        else if ((opt == 0 && iset == 7) || opt == 3)
+            d->filter = filter_avx<float>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
+            d->filter = filter_sse2<float>;
 #endif
+    }
 }
 
 static void VS_CC tcannyInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
@@ -494,39 +409,30 @@ static const VSFrameRef *VS_CC tcannyGetFrame(int n, int activationReason, void
         try {
             auto threadId = std::this_thread::get_id();
 
-            if (!d->buffer.count(threadId)) {
-                float * buffer = vs_aligned_malloc<float>((d->vi->width + d->radiusAlign * 2) * sizeof(float), 32);
-                if (!buffer)
-                    throw std::string{ "malloc failure (buffer)" };
-                d->buffer.emplace(threadId, buffer);
-
-                float * blur = vs_aligned_malloc<float>((vsapi->getStride(src, 0) / d->vi->format->bytesPerSample + 16) * d->vi->height * sizeof(float), 32);
+            if (!d->blur.count(threadId)) {
+                float * blur = vs_aligned_malloc<float>((vsapi->getStride(src, 0) / d->vi->format->bytesPerSample + d->radiusAlign * 2) * d->vi->height * sizeof(float), d->alignment);
                 if (!blur)
                     throw std::string{ "malloc failure (blur)" };
                 d->blur.emplace(threadId, blur);
 
-                if (d->mode != -1) {
-                    float * gradient = vs_aligned_malloc<float>((vsapi->getStride(src, 0) / d->vi->format->bytesPerSample + 16) * (d->vi->height + 2) * sizeof(float), 32);
-                    if (!gradient)
-                        throw std::string{ "malloc failure (gradient)" };
-                    d->gradient.emplace(threadId, gradient);
-                } else {
-                    d->gradient.emplace(threadId, nullptr);
-                }
+                float * gradient = vs_aligned_malloc<float>((vsapi->getStride(src, 0) / d->vi->format->bytesPerSample + d->radiusAlign * 2) * (d->vi->height + 2) * sizeof(float), d->alignment);
+                if (!gradient)
+                    throw std::string{ "malloc failure (gradient)" };
+                d->gradient.emplace(threadId, gradient);
 
                 if (d->mode == 0) {
-                    unsigned * direction = vs_aligned_malloc<unsigned>(vsapi->getStride(src, 0) / d->vi->format->bytesPerSample * d->vi->height * sizeof(unsigned), 32);
+                    unsigned * direction = vs_aligned_malloc<unsigned>(vsapi->getStride(src, 0) / d->vi->format->bytesPerSample * d->vi->height * sizeof(unsigned), d->alignment);
                     if (!direction)
                         throw std::string{ "malloc failure (direction)" };
                     d->direction.emplace(threadId, direction);
 
-                    bool * label = new (std::nothrow) bool[d->vi->width * d->vi->height];
-                    if (!label)
-                        throw std::string{ "malloc failure (label)" };
-                    d->label.emplace(threadId, label);
+                    bool * found = new (std::nothrow) bool[d->vi->width * d->vi->height];
+                    if (!found)
+                        throw std::string{ "malloc failure (found)" };
+                    d->found.emplace(threadId, found);
                 } else {
                     d->direction.emplace(threadId, nullptr);
-                    d->label.emplace(threadId, nullptr);
+                    d->found.emplace(threadId, nullptr);
                 }
             }
         } catch (const std::string & error) {
@@ -536,12 +442,7 @@ static const VSFrameRef *VS_CC tcannyGetFrame(int n, int activationReason, void
             return nullptr;
         }
 
-        if (d->vi->format->bytesPerSample == 1)
-            filter<uint8_t>(src, dst, d, vsapi);
-        else if (d->vi->format->bytesPerSample == 2)
-            filter<uint16_t>(src, dst, d, vsapi);
-        else
-            filter<float>(src, dst, d, vsapi);
+        d->filter(src, dst, d, vsapi);
 
         vsapi->freeFrame(src);
         return dst;
@@ -556,13 +457,10 @@ static void VS_CC tcannyFree(void *instanceData, VSCore *core, const VSAPI *vsap
     vsapi->freeNode(d->node);
 
     for (int i = 0; i < 3; i++) {
-        delete[] d->horizontalWeights[i];
-        delete[] d->verticalWeights[i];
+        delete[] d->weightsH[i];
+        delete[] d->weightsV[i];
     }
 
-    for (auto & iter : d->buffer)
-        vs_aligned_free(iter.second);
-
     for (auto & iter : d->blur)
         vs_aligned_free(iter.second);
 
@@ -572,7 +470,7 @@ static void VS_CC tcannyFree(void *instanceData, VSCore *core, const VSAPI *vsap
     for (auto & iter : d->direction)
         vs_aligned_free(iter.second);
 
-    for (auto & iter : d->label)
+    for (auto & iter : d->found)
         delete[] iter.second;
 
     delete d;
@@ -586,7 +484,8 @@ static void VS_CC tcannyCreate(const VSMap *in, VSMap *out, void *userData, VSCo
     d->vi = vsapi->getVideoInfo(d->node);
 
     try {
-        if (!isConstantFormat(d->vi) || (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
+        if (!isConstantFormat(d->vi) ||
+            (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
             (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
             throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" };
 
@@ -597,19 +496,19 @@ static void VS_CC tcannyCreate(const VSMap *in, VSMap *out, void *userData, VSCo
         if (numSigma > d->vi->format->numPlanes)
             throw std::string{ "more sigma given than the number of planes" };
 
-        float horizontalSigma[3], verticalSigma[3];
+        float sigmaH[3], sigmaV[3];
 
         for (int i = 0; i < 3; i++) {
             if (i < numSigma) {
-                horizontalSigma[i] = verticalSigma[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
+                sigmaH[i] = sigmaV[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
             } else if (i == 0) {
-                horizontalSigma[0] = verticalSigma[0] = 1.5f;
+                sigmaH[0] = sigmaV[0] = 1.5f;
             } else if (i == 1) {
-                horizontalSigma[1] = horizontalSigma[0] / (1 << d->vi->format->subSamplingW);
-                verticalSigma[1] = verticalSigma[0] / (1 << d->vi->format->subSamplingH);
+                sigmaH[1] = sigmaH[0] / (1 << d->vi->format->subSamplingW);
+                sigmaV[1] = sigmaV[0] / (1 << d->vi->format->subSamplingH);
             } else {
-                horizontalSigma[2] = horizontalSigma[1];
-                verticalSigma[2] = verticalSigma[1];
+                sigmaH[2] = sigmaH[1];
+                sigmaV[2] = sigmaV[1];
             }
         }
 
@@ -633,8 +532,25 @@ static void VS_CC tcannyCreate(const VSMap *in, VSMap *out, void *userData, VSCo
 
         const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err));
 
+        const int m = vsapi->propNumElements(in, "planes");
+
+        for (int i = 0; i < 3; i++)
+            d->process[i] = (m <= 0);
+
+        for (int i = 0; i < m; i++) {
+            const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
+
+            if (n < 0 || n >= d->vi->format->numPlanes)
+                throw std::string{ "plane index out of range" };
+
+            if (d->process[n])
+                throw std::string{ "plane specified twice" };
+
+            d->process[n] = true;
+        }
+
         for (int i = 0; i < 3; i++) {
-            if (horizontalSigma[i] < 0.f)
+            if (sigmaH[i] < 0.f)
                 throw std::string{ "sigma must be greater than or equal to 0.0" };
         }
 
@@ -653,31 +569,13 @@ static void VS_CC tcannyCreate(const VSMap *in, VSMap *out, void *userData, VSCo
         if (opt < 0 || opt > 4)
             throw std::string{ "opt must be 0, 1, 2, 3, or 4" };
 
-        const int m = vsapi->propNumElements(in, "planes");
-
-        for (int i = 0; i < 3; i++)
-            d->process[i] = (m <= 0);
-
-        for (int i = 0; i < m; i++) {
-            const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
-
-            if (n < 0 || n >= d->vi->format->numPlanes)
-                throw std::string{ "plane index out of range" };
-
-            if (d->process[n])
-                throw std::string{ "plane specified twice" };
-
-            d->process[n] = true;
-        }
-
         const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
-        d->buffer.reserve(numThreads);
         d->blur.reserve(numThreads);
         d->gradient.reserve(numThreads);
         d->direction.reserve(numThreads);
-        d->label.reserve(numThreads);
+        d->found.reserve(numThreads);
 
-        selectFunctions(opt);
+        selectFunctions(opt, d.get());
 
         if (d->vi->format->sampleType == stInteger) {
             d->peak = (1 << d->vi->format->bitsPerSample) - 1;
@@ -702,25 +600,25 @@ static void VS_CC tcannyCreate(const VSMap *in, VSMap *out, void *userData, VSCo
         }
 
         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
-            if (d->process[plane] && horizontalSigma[plane]) {
-                d->horizontalWeights[plane] = gaussianWeights(horizontalSigma[plane], &d->horizontalRadius[plane]);
-                d->verticalWeights[plane] = gaussianWeights(verticalSigma[plane], &d->verticalRadius[plane]);
-                if (!d->horizontalWeights[plane] || !d->verticalWeights[plane])
+            if (d->process[plane] && sigmaH[plane]) {
+                d->weightsH[plane] = gaussianWeights(sigmaH[plane], d->radiusH[plane]);
+                d->weightsV[plane] = gaussianWeights(sigmaV[plane], d->radiusV[plane]);
+                if (!d->weightsH[plane] || !d->weightsV[plane])
                     throw std::string{ "malloc failure (weights)" };
 
                 const int width = d->vi->width >> (plane ? d->vi->format->subSamplingW : 0);
                 const int height = d->vi->height >> (plane ? d->vi->format->subSamplingH : 0);
                 const std::string planeOrder{ plane == 0 ? "first" : (plane == 1 ? "second" : "third") };
 
-                if (width < d->horizontalRadius[plane] + 1)
-                    throw std::string{ "the " + planeOrder + " plane's width must be greater than or equal to " + std::to_string(d->horizontalRadius[plane] + 1) + " for specified sigma" };
+                if (width < d->radiusH[plane] + 1)
+                    throw std::string{ "the " + planeOrder + " plane's width must be greater than or equal to " + std::to_string(d->radiusH[plane] + 1) + " for specified sigma" };
 
-                if (height < d->verticalRadius[plane] + 1)
-                    throw std::string{ "the " + planeOrder + " plane's height must be greater than or equal to " + std::to_string(d->verticalRadius[plane] + 1) + " for specified sigma" };
+                if (height < d->radiusV[plane] + 1)
+                    throw std::string{ "the " + planeOrder + " plane's height must be greater than or equal to " + std::to_string(d->radiusV[plane] + 1) + " for specified sigma" };
             }
         }
 
-        d->radiusAlign = (std::max({ d->horizontalRadius[0], d->horizontalRadius[1], d->horizontalRadius[2] }) + 7) & -8;
+        d->radiusAlign = (std::max({ d->radiusH[0], d->radiusH[1], d->radiusH[2], 1 }) + d->vectorSize - 1) & -d->vectorSize;
 
         d->magnitude = 255.f / gmmax;
     } catch (const std::string & error) {
diff --git a/TCanny/TCanny.hpp b/TCanny/TCanny.hpp
index f351242..195ed59 100644
--- a/TCanny/TCanny.hpp
+++ b/TCanny/TCanny.hpp
@@ -1,16 +1,11 @@
 #pragma once
 
-#include <algorithm>
 #include <cmath>
 #include <limits>
-#include <memory>
-#include <string>
-#include <thread>
 #include <type_traits>
-#include <unordered_map>
+#include <vector>
 
-#include <VapourSynth.h>
-#include <VSHelper.h>
+#include "shared.hpp"
 
 #ifdef VS_TARGET_CPU_X86
 #include "vectorclass/vectormath_trig.h"
@@ -21,24 +16,54 @@ static constexpr float M_1_PIF = 0.318309886183790671538f;
 static constexpr float fltMax = std::numeric_limits<float>::max();
 static constexpr float fltLowest = std::numeric_limits<float>::lowest();
 
-static inline float * gaussianWeights(const float sigma, int * radius) noexcept {
-    const int diameter = std::max<int>(sigma * 3.f + 0.5f, 1) * 2 + 1;
-    *radius = diameter / 2;
+struct TCannyData {
+    VSNodeRef * node;
+    const VSVideoInfo * vi;
+    float t_h, t_l;
+    int mode, op;
+    bool process[3];
+    float * weightsH[3], * weightsV[3], magnitude, offset[3], lower[3], upper[3];
+    int vectorSize, alignment, radiusH[3], radiusV[3], radiusAlign;
+    uint16_t peak;
+    std::unordered_map<std::thread::id, float *> blur, gradient;
+    std::unordered_map<std::thread::id, unsigned *> direction;
+    std::unordered_map<std::thread::id, bool *> found;
+    void (*filter)(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *);
+};
 
-    float * VS_RESTRICT weights = new (std::nothrow) float[diameter];
-    if (!weights)
-        return nullptr;
+static void hysteresis(float * VS_RESTRICT srcp, bool * VS_RESTRICT found, const int width, const int height, const int stride, const float t_h, const float t_l) noexcept {
+    std::fill_n(found, width * height, false);
+    std::vector<std::pair<int, int>> coordinates;
 
-    float sum = 0.f;
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            if (!found[width * y + x] && srcp[stride * y + x] >= t_h) {
+                srcp[stride * y + x] = fltMax;
+                found[width * y + x] = true;
 
-    for (int k = -(*radius); k <= *radius; k++) {
-        const float w = std::exp(-(k * k) / (2.f * sigma * sigma));
-        weights[k + *radius] = w;
-        sum += w;
-    }
+                coordinates.emplace_back(std::make_pair(x, y));
+
+                while (!coordinates.empty()) {
+                    const auto pos = coordinates.back();
+                    coordinates.pop_back();
 
-    for (int k = 0; k < diameter; k++)
-        weights[k] /= sum;
+                    const int xxStart = std::max(pos.first - 1, 0);
+                    const int xxStop = std::min(pos.first + 1, width - 1);
+                    const int yyStart = std::max(pos.second - 1, 0);
+                    const int yyStop = std::min(pos.second + 1, height - 1);
 
-    return weights;
+                    for (int yy = yyStart; yy <= yyStop; yy++) {
+                        for (int xx = xxStart; xx <= xxStop; xx++) {
+                            if (!found[width * yy + xx] && srcp[stride * yy + xx] >= t_l) {
+                                srcp[stride * yy + xx] = fltMax;
+                                found[width * yy + xx] = true;
+
+                                coordinates.emplace_back(std::make_pair(xx, yy));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
 }
diff --git a/TCanny/TCanny.vcxproj b/TCanny/TCanny.vcxproj
index 2de44a4..ded8a89 100644
--- a/TCanny/TCanny.vcxproj
+++ b/TCanny/TCanny.vcxproj
@@ -31,8 +31,8 @@
   </ImportGroup>
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <IncludePath>C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;C:\boost_1_68_0;$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
-    <LibraryPath>$(CUDA_PATH)\lib\$(Platform);C:\boost_1_68_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
+    <IncludePath>$(CUDA_PATH)\include;C:\boost_1_69_0;C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;$(IncludePath)</IncludePath>
+    <LibraryPath>$(CUDA_PATH)\lib\x64;C:\boost_1_69_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
@@ -41,7 +41,6 @@
       <DiagnosticsFormat>Column</DiagnosticsFormat>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
       <BufferSecurityCheck>false</BufferSecurityCheck>
-      <FloatingPointModel>Fast</FloatingPointModel>
       <ConformanceMode>true</ConformanceMode>
       <EnforceTypeConversionRules>true</EnforceTypeConversionRules>
       <DisableSpecificWarnings>4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
@@ -50,7 +49,7 @@
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <SubSystem>Windows</SubSystem>
-      <AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_68.lib;libboost_system-vc141-mt-x64-1_68.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_69.lib;libboost_system-vc141-mt-x64-1_69.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
@@ -66,6 +65,7 @@
     <ClCompile Include="vectorclass\instrset_detect.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="shared.hpp" />
     <ClInclude Include="TCanny.hpp" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/TCanny/TCanny.vcxproj.filters b/TCanny/TCanny.vcxproj.filters
index 8215734..ce8f3ff 100644
--- a/TCanny/TCanny.vcxproj.filters
+++ b/TCanny/TCanny.vcxproj.filters
@@ -7,7 +7,7 @@
     </Filter>
     <Filter Include="Header Files">
       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
-      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;ipp;xsd</Extensions>
     </Filter>
     <Filter Include="Resource Files">
       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
@@ -38,5 +38,8 @@
     <ClInclude Include="TCanny.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="shared.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/TCanny/TCannyCL.cpp b/TCanny/TCannyCL.cpp
index 7554509..75244a2 100644
--- a/TCanny/TCannyCL.cpp
+++ b/TCanny/TCannyCL.cpp
@@ -24,7 +24,7 @@
 #include <clocale>
 #include <cstdio>
 
-#include "TCanny.hpp"
+#include "shared.hpp"
 #include "TCanny.cl"
 
 #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION
@@ -39,18 +39,18 @@ struct TCannyCLData {
     const VSVideoInfo * vi;
     int mode;
     bool process[3];
-    int horizontalRadius[3], verticalRadius[3];
+    int radiusH[3], radiusV[3];
     unsigned peak;
     float offset[3], lower[3], upper[3];
     compute::device gpu;
     compute::context ctx;
     compute::program program;
-    compute::buffer horizontalWeights[3], verticalWeights[3];
+    compute::buffer weightsH[3], weightsV[3];
     cl_image_format clImageFormat;
     std::unordered_map<std::thread::id, compute::command_queue> queue;
     std::unordered_map<std::thread::id, compute::kernel> copyPlane, gaussianBlurH, gaussianBlurV, detectEdge, nonMaximumSuppression, hysteresis, outputGB, binarizeCE, discretizeGM;
     std::unordered_map<std::thread::id, compute::image2d> src[3], dst[3], blur[3], gradient[3], direction[3];
-    std::unordered_map<std::thread::id, compute::buffer> buffer, label;
+    std::unordered_map<std::thread::id, compute::buffer> buffer, found;
 };
 
 static void VS_CC tcannyclInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
@@ -124,13 +124,13 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
                 }
 
                 d->buffer.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_float), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
-                d->label.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
+                d->found.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
             }
 
             auto queue = d->queue.at(threadId);
             auto copyPlane = d->copyPlane.at(threadId);
-            auto gaussianBlurH = d->gaussianBlurH.at(threadId);
             auto gaussianBlurV = d->gaussianBlurV.at(threadId);
+            auto gaussianBlurH = d->gaussianBlurH.at(threadId);
             auto detectEdge = d->detectEdge.at(threadId);
             auto nonMaximumSuppression = d->nonMaximumSuppression.at(threadId);
             auto hysteresis = d->hysteresis.at(threadId);
@@ -138,7 +138,7 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
             auto binarizeCE = d->binarizeCE.at(threadId);
             auto discretizeGM = d->discretizeGM.at(threadId);
             auto buffer = d->buffer.at(threadId);
-            auto label = d->label.at(threadId);
+            auto found = d->found.at(threadId);
 
             for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
                 if (d->process[plane]) {
@@ -160,32 +160,34 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
 
                     queue.enqueue_write_image(srcImage, origin, region, srcp, stride);
 
-                    if (d->horizontalRadius[plane]) {
-                        gaussianBlurV.set_args(srcImage, gradientImage, d->verticalWeights[plane], d->verticalRadius[plane], d->offset[plane]);
+                    if (d->radiusV[plane]) {
+                        gaussianBlurV.set_args(srcImage, gradientImage, d->weightsV[plane], d->radiusV[plane], d->offset[plane]);
                         queue.enqueue_nd_range_kernel(gaussianBlurV, 2, nullptr, globalWorkSize, nullptr);
-
-                        gaussianBlurH.set_args(gradientImage, blurImage, d->horizontalWeights[plane], d->horizontalRadius[plane]);
-                        queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
                     } else {
-                        copyPlane.set_args(srcImage, blurImage, d->offset[plane]);
+                        copyPlane.set_args(srcImage, d->radiusH[plane] ? gradientImage : blurImage, d->offset[plane]);
                         queue.enqueue_nd_range_kernel(copyPlane, 2, nullptr, globalWorkSize, nullptr);
                     }
 
+                    if (d->radiusH[plane]) {
+                        gaussianBlurH.set_args(gradientImage, blurImage, d->weightsH[plane], d->radiusH[plane]);
+                        queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
+                    }
+
                     if (d->mode != -1) {
                         detectEdge.set_args(blurImage, gradientImage, directionImage);
                         queue.enqueue_nd_range_kernel(detectEdge, 2, nullptr, globalWorkSize, nullptr);
 
                         if (d->mode == 0) {
-                            nonMaximumSuppression.set_args(gradientImage, directionImage, buffer);
+                            nonMaximumSuppression.set_args(directionImage, gradientImage, buffer);
                             queue.enqueue_nd_range_kernel(nonMaximumSuppression, 2, nullptr, globalWorkSize, nullptr);
 
                             constexpr cl_uchar pattern = 0;
-                            queue.enqueue_fill_buffer(label, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));
+                            queue.enqueue_fill_buffer(found, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));
 
                             const size_t paddedGlobalWorkSize[] = { (width + 7) & -8, (height + 7) & -8 };
                             const size_t localWorkSize[] = { 8, 8 };
 
-                            hysteresis.set_args(buffer, label, static_cast<int>(width), static_cast<int>(height));
+                            hysteresis.set_args(buffer, found, static_cast<int>(width), static_cast<int>(height));
                             queue.enqueue_nd_range_kernel(hysteresis, 2, nullptr, paddedGlobalWorkSize, localWorkSize);
                         }
                     }
@@ -232,7 +234,8 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
     d->vi = vsapi->getVideoInfo(d->node);
 
     try {
-        if (!isConstantFormat(d->vi) || (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
+        if (!isConstantFormat(d->vi) ||
+            (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
             (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
             throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" };
 
@@ -240,19 +243,19 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         if (numSigma > d->vi->format->numPlanes)
             throw std::string{ "more sigma given than the number of planes" };
 
-        float horizontalSigma[3], verticalSigma[3];
+        float sigmaH[3], sigmaV[3];
 
         for (int i = 0; i < 3; i++) {
             if (i < numSigma) {
-                horizontalSigma[i] = verticalSigma[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
+                sigmaH[i] = sigmaV[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
             } else if (i == 0) {
-                horizontalSigma[0] = verticalSigma[0] = 1.5f;
+                sigmaH[0] = sigmaV[0] = 1.5f;
             } else if (i == 1) {
-                horizontalSigma[1] = horizontalSigma[0] / (1 << d->vi->format->subSamplingW);
-                verticalSigma[1] = verticalSigma[0] / (1 << d->vi->format->subSamplingH);
+                sigmaH[1] = sigmaH[0] / (1 << d->vi->format->subSamplingW);
+                sigmaV[1] = sigmaV[0] / (1 << d->vi->format->subSamplingH);
             } else {
-                horizontalSigma[2] = horizontalSigma[1];
-                verticalSigma[2] = verticalSigma[1];
+                sigmaH[2] = sigmaH[1];
+                sigmaV[2] = sigmaV[1];
             }
         }
 
@@ -296,7 +299,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         }
 
         for (int i = 0; i < 3; i++) {
-            if (horizontalSigma[i] < 0.f)
+            if (sigmaH[i] < 0.f)
                 throw std::string{ "sigma must be greater than or equal to 0.0" };
         }
 
@@ -343,9 +346,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
             return;
         }
 
-        d->gpu = compute::system::default_device();
-        if (device > -1)
-            d->gpu = compute::system::devices().at(device);
+        d->gpu = (device < 0) ? compute::system::default_device() : compute::system::devices().at(device);
         d->ctx = compute::context{ d->gpu };
 
         if (!!vsapi->propGetInt(in, "info", 0, &err)) {
@@ -391,8 +392,8 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
         d->queue.reserve(numThreads);
         d->copyPlane.reserve(numThreads);
-        d->gaussianBlurH.reserve(numThreads);
         d->gaussianBlurV.reserve(numThreads);
+        d->gaussianBlurH.reserve(numThreads);
         d->detectEdge.reserve(numThreads);
         d->nonMaximumSuppression.reserve(numThreads);
         d->hysteresis.reserve(numThreads);
@@ -400,7 +401,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         d->binarizeCE.reserve(numThreads);
         d->discretizeGM.reserve(numThreads);
         d->buffer.reserve(numThreads);
-        d->label.reserve(numThreads);
+        d->found.reserve(numThreads);
         for (int i = 0; i < 3; i++) {
             d->src[i].reserve(numThreads);
             d->dst[i].reserve(numThreads);
@@ -432,17 +433,17 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         }
 
         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
-            if (d->process[plane] && horizontalSigma[plane]) {
-                float * horizontalWeights = gaussianWeights(horizontalSigma[plane], &d->horizontalRadius[plane]);
-                float * verticalWeights = gaussianWeights(verticalSigma[plane], &d->verticalRadius[plane]);
-                if (!horizontalWeights || !verticalWeights)
+            if (d->process[plane] && sigmaH[plane]) {
+                float * weightsH = gaussianWeights(sigmaH[plane], d->radiusH[plane]);
+                float * weightsV = gaussianWeights(sigmaV[plane], d->radiusV[plane]);
+                if (!weightsH || !weightsV)
                     throw std::string{ "malloc failure (weights)" };
 
-                d->horizontalWeights[plane] = compute::buffer{ d->ctx, (d->horizontalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, horizontalWeights };
-                d->verticalWeights[plane] = compute::buffer{ d->ctx, (d->verticalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, verticalWeights };
+                d->weightsH[plane] = compute::buffer{ d->ctx, (d->radiusH[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsH };
+                d->weightsV[plane] = compute::buffer{ d->ctx, (d->radiusV[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsV };
 
-                delete[] horizontalWeights;
-                delete[] verticalWeights;
+                delete[] weightsH;
+                delete[] weightsV;
             }
         }
 
diff --git a/TCanny/TCanny_AVX.cpp b/TCanny/TCanny_AVX.cpp
index dddc560..6cc7758 100644
--- a/TCanny/TCanny_AVX.cpp
+++ b/TCanny/TCanny_AVX.cpp
@@ -6,56 +6,94 @@
 #include "TCanny.hpp"
 
 template<typename T>
-void copyPlane_avx(const T * srcp, float * blur, const int width, const int height, const int stride, const int bgStride, const float offset) noexcept {
+static void copyPlane(const T * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
             if (std::is_same<T, uint8_t>::value)
-                to_float(Vec8i().load_8uc(srcp + x)).stream(blur + x);
+                to_float(Vec8i().load_8uc(srcp + x)).stream(dstp + x);
             else if (std::is_same<T, uint16_t>::value)
-                to_float(Vec8i().load_8us(srcp + x)).stream(blur + x);
+                to_float(Vec8i().load_8us(srcp + x)).stream(dstp + x);
             else
-                (Vec8f().load_a(srcp + x) + offset).stream(blur + x);
+                (Vec8f().load_a(srcp + x) + offset).stream(dstp + x);
         }
 
-        srcp += stride;
-        blur += bgStride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template void copyPlane_avx(const uint8_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_avx(const uint16_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_avx(const float *, float *, const int, const int, const int, const int, const float) noexcept;
-
-static inline void gaussianBlurH_avx(float * buffer, float * blur, const float * weights, const int width, const int radius) noexcept {
-    weights += radius;
+template<typename T>
+static void gaussianBlur(const T * __srcp, float * temp, float * dstp, const float * weightsH, const float * weightsV, const int width, const int height,
+                         const int srcStride, const int dstStride, const int radiusH, const int radiusV, const float offset) noexcept {
+    const int diameter = radiusV * 2 + 1;
+    const T ** _srcp = new const T *[diameter];
 
-    for (int i = 1; i <= radius; i++) {
-        buffer[-i] = buffer[-1 + i];
-        buffer[width - 1 + i] = buffer[width - i];
+    _srcp[radiusV] = __srcp;
+    for (int i = 1; i <= radiusV; i++) {
+        _srcp[radiusV - i] = _srcp[radiusV - 1 + i];
+        _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i;
     }
 
-    for (int x = 0; x < width; x += 8) {
-        Vec8f sum = zero_8f();
+    weightsH += radiusH;
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
+
+            for (int i = 0; i < diameter; i++) {
+                if (std::is_same<T, uint8_t>::value) {
+                    const Vec8f srcp = to_float(Vec8i().load_8uc(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else if (std::is_same<T, uint16_t>::value) {
+                    const Vec8f srcp = to_float(Vec8i().load_8us(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else {
+                    const Vec8f srcp = Vec8f().load_a(_srcp[i] + x);
+                    sum = mul_add(srcp + offset, weightsV[i], sum);
+                }
+            }
+
+            sum.store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radiusH; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
 
-        for (int i = -radius; i <= radius; i++) {
-            const Vec8f srcp = Vec8f().load(buffer + x + i);
-            sum = mul_add(srcp, weights[i], sum);
+            for (int i = -radiusH; i <= radiusH; i++) {
+                const Vec8f srcp = Vec8f().load(temp + x + i);
+                sum = mul_add(srcp, weightsH[i], sum);
+            }
+
+            sum.stream(dstp + x);
         }
 
-        sum.stream(blur + x);
+        for (int i = 0; i < diameter - 1; i++)
+            _srcp[i] = _srcp[i + 1];
+        if (y < height - 1 - radiusV)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radiusV)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
+
+    delete[] _srcp;
 }
 
 template<typename T>
-void gaussianBlurV_avx(const T * __srcp, float * buffer, float * blur, const float * horizontalWeights, const float * verticalWeights,
-                       const int width, const int height, const int stride, const int bgStride, const int horizontalRadius, const int verticalRadius, const float offset) noexcept {
-    const int diameter = verticalRadius * 2 + 1;
+static void gaussianBlurV(const T * __srcp, float * dstp, const float * weights, const int width, const int height, const int srcStride, const int dstStride,
+                          const int radius, const float offset) noexcept {
+    const int diameter = radius * 2 + 1;
     const T ** _srcp = new const T *[diameter];
 
-    _srcp[verticalRadius] = __srcp;
-    for (int i = 1; i <= verticalRadius; i++) {
-        _srcp[verticalRadius - i] = _srcp[verticalRadius - 1 + i];
-        _srcp[verticalRadius + i] = _srcp[verticalRadius] + stride * i;
+    _srcp[radius] = __srcp;
+    for (int i = 1; i <= radius; i++) {
+        _srcp[radius - i] = _srcp[radius - 1 + i];
+        _srcp[radius + i] = _srcp[radius] + srcStride * i;
     }
 
     for (int y = 0; y < height; y++) {
@@ -65,39 +103,69 @@ void gaussianBlurV_avx(const T * __srcp, float * buffer, float * blur, const flo
             for (int i = 0; i < diameter; i++) {
                 if (std::is_same<T, uint8_t>::value) {
                     const Vec8f srcp = to_float(Vec8i().load_8uc(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else if (std::is_same<T, uint16_t>::value) {
                     const Vec8f srcp = to_float(Vec8i().load_8us(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else {
                     const Vec8f srcp = Vec8f().load_a(_srcp[i] + x);
-                    sum = mul_add(srcp + offset, verticalWeights[i], sum);
+                    sum = mul_add(srcp + offset, weights[i], sum);
                 }
             }
 
-            sum.store_a(buffer + x);
+            sum.stream(dstp + x);
         }
 
-        gaussianBlurH_avx(buffer, blur, horizontalWeights, width, horizontalRadius);
-
         for (int i = 0; i < diameter - 1; i++)
             _srcp[i] = _srcp[i + 1];
-        if (y < height - 1 - verticalRadius)
-            _srcp[diameter - 1] += stride;
-        else if (y > height - 1 - verticalRadius)
-            _srcp[diameter - 1] -= stride;
-        blur += bgStride;
+        if (y < height - 1 - radius)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radius)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
 
     delete[] _srcp;
 }
 
-template void gaussianBlurV_avx(const uint8_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_avx(const uint16_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_avx(const float *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
+template<typename T>
+static void gaussianBlurH(const T * _srcp, float * temp, float * dstp, const float * weights, const int width, const int height,
+                          const int srcStride, const int dstStride, const int radius, const float offset) noexcept {
+    weights += radius;
 
-void detectEdge_avx(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
-                    const int mode, const unsigned op) noexcept {
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 8) {
+            if (std::is_same<T, uint8_t>::value)
+                to_float(Vec8i().load_8uc(_srcp + x)).store_a(temp + x);
+            else if (std::is_same<T, uint16_t>::value)
+                to_float(Vec8i().load_8us(_srcp + x)).store_a(temp + x);
+            else
+                (Vec8f().load_a(_srcp + x) + offset).store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radius; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
+
+            for (int i = -radius; i <= radius; i++) {
+                const Vec8f srcp = Vec8f().load(temp + x + i);
+                sum = mul_add(srcp, weights[i], sum);
+            }
+
+            sum.stream(dstp + x);
+        }
+
+        _srcp += srcStride;
+        dstp += dstStride;
+    }
+}
+
+static void detectEdge(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
+                       const int mode, const int op) noexcept {
     float * srcpp = blur;
     float * srcp = blur;
     float * srcpn = blur + bgStride;
@@ -155,13 +223,14 @@ void detectEdge_avx(float * blur, float * gradient, unsigned * direction, const
     }
 }
 
-void nonMaximumSuppression_avx(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height, const int stride, const int bgStride) noexcept {
+static void nonMaximumSuppression(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height,
+                                  const int stride, const int bgStride, const int radiusAlign) noexcept {
     _gradient[-1] = _gradient[0];
     _gradient[-1 + bgStride * (height - 1)] = _gradient[bgStride * (height - 1)];
     _gradient[width] = _gradient[width - 1];
     _gradient[width + bgStride * (height - 1)] = _gradient[width - 1 + bgStride * (height - 1)];
-    std::copy_n(_gradient - 8, width + 16, _gradient - 8 - bgStride);
-    std::copy_n(_gradient - 8 + bgStride * (height - 1), width + 16, _gradient - 8 + bgStride * height);
+    std::copy_n(_gradient - radiusAlign, width + radiusAlign * 2, _gradient - radiusAlign - bgStride);
+    std::copy_n(_gradient - radiusAlign + bgStride * (height - 1), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * height);
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
@@ -193,110 +262,110 @@ void nonMaximumSuppression_avx(const unsigned * _direction, float * _gradient, f
     }
 }
 
-template<typename T> void outputGB_avx(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
+template<typename T> static void outputGB(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
 
 template<>
-void outputGB_avx(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(blur + x) + 0.5f);
-            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(blur + x + 8) + 0.5f);
+            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(_srcp + x) + 0.5f);
+            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(_srcp + x + 8) + 0.5f);
             const Vec8s srcp_8s_0 = compress_saturated(srcp_8i_0.get_low(), srcp_8i_0.get_high());
             const Vec8s srcp_8s_1 = compress_saturated(srcp_8i_1.get_low(), srcp_8i_1.get_high());
             const Vec16uc srcp = compress_saturated_s2u(srcp_8s_0, srcp_8s_1);
             srcp.stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_avx(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8i srcp_8i = truncate_to_int(Vec8f().load_a(blur + x) + 0.5f);
+            const Vec8i srcp_8i = truncate_to_int(Vec8f().load_a(_srcp + x) + 0.5f);
             const Vec8us srcp = compress_saturated_s2u(srcp_8i.get_low(), srcp_8i.get_high());
             min(srcp, peak).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_avx(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8f srcp = Vec8f().load_a(blur + x);
+            const Vec8f srcp = Vec8f().load_a(_srcp + x);
             (srcp - offset).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void binarizeCE_avx(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
+template<typename T> static void binarizeCE(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
 
 template<>
-void binarizeCE_avx(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                    const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(blur + x) == fltMax);
-            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(blur + x + 8) == fltMax);
+            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(srcp + x) == fltMax);
+            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(srcp + x + 8) == fltMax);
             const Vec8sb mask_8sb_0 = Vec8sb(compress_saturated(mask_8ib_0.get_low(), mask_8ib_0.get_high()));
             const Vec8sb mask_8sb_1 = Vec8sb(compress_saturated(mask_8ib_1.get_low(), mask_8ib_1.get_high()));
             const Vec16cb mask = Vec16cb(compress_saturated(mask_8sb_0, mask_8sb_1));
             select(mask, Vec16uc(255), zero_128b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_avx(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                    const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8ib mask_8ib = Vec8ib(Vec8f().load_a(blur + x) == fltMax);
+            const Vec8ib mask_8ib = Vec8ib(Vec8f().load_a(srcp + x) == fltMax);
             const Vec8sb mask = Vec8sb(compress_saturated(mask_8ib.get_low(), mask_8ib.get_high()));
             select(mask, Vec8us(peak), zero_128b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_avx(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                    const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8fb mask = Vec8f().load_a(blur + x) == fltMax;
+            const Vec8fb mask = (Vec8f().load_a(srcp + x) == fltMax);
             select(mask, Vec8f(upper), Vec8f(lower)).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void discretizeGM_avx(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
+template<typename T> static void discretizeGM(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
 
 template<>
-void discretizeGM_avx(const float * gradient, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                      const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8f srcp_8f_0 = Vec8f().load_a(gradient + x);
-            const Vec8f srcp_8f_1 = Vec8f().load_a(gradient + x + 8);
+            const Vec8f srcp_8f_0 = Vec8f().load_a(_srcp + x);
+            const Vec8f srcp_8f_1 = Vec8f().load_a(_srcp + x + 8);
             const Vec8i srcp_8i_0 = truncate_to_int(mul_add(srcp_8f_0, magnitude, 0.5f));
             const Vec8i srcp_8i_1 = truncate_to_int(mul_add(srcp_8f_1, magnitude, 0.5f));
             const Vec8s srcp_8s_0 = compress_saturated(srcp_8i_0.get_low(), srcp_8i_0.get_high());
@@ -305,38 +374,87 @@ void discretizeGM_avx(const float * gradient, uint8_t * dstp, const int width, c
             srcp.stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_avx(const float * gradient, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                      const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8f srcp_8f = Vec8f().load_a(gradient + x);
+            const Vec8f srcp_8f = Vec8f().load_a(_srcp + x);
             const Vec8i srcp_8i = truncate_to_int(mul_add(srcp_8f, magnitude, 0.5f));
             const Vec8us srcp = compress_saturated_s2u(srcp_8i.get_low(), srcp_8i.get_high());
             min(srcp, peak).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_avx(const float * gradient, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                      const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8f srcp = Vec8f().load_a(gradient + x);
+            const Vec8f srcp = Vec8f().load_a(_srcp + x);
             mul_sub(srcp, magnitude, offset).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
+
+template<typename T>
+void filter_avx(const VSFrameRef * src, VSFrameRef * dst, const TCannyData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+    for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
+        if (d->process[plane]) {
+            const int width = vsapi->getFrameWidth(src, plane);
+            const int height = vsapi->getFrameHeight(src, plane);
+            const int stride = vsapi->getStride(src, plane) / sizeof(T);
+            const int bgStride = stride + d->radiusAlign * 2;
+            const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
+            T * dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
+
+            const auto threadId = std::this_thread::get_id();
+            float * blur = d->blur.at(threadId) + d->radiusAlign;
+            float * gradient = d->gradient.at(threadId) + bgStride + d->radiusAlign;
+            unsigned * direction = d->direction.at(threadId);
+            bool * found = d->found.at(threadId);
+
+            if (d->radiusV[plane] && d->radiusH[plane])
+                gaussianBlur(srcp, gradient, blur, d->weightsH[plane], d->weightsV[plane], width, height, stride, bgStride, d->radiusH[plane], d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusV[plane])
+                gaussianBlurV(srcp, blur, d->weightsV[plane], width, height, stride, bgStride, d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusH[plane])
+                gaussianBlurH(srcp, gradient, blur, d->weightsH[plane], width, height, stride, bgStride, d->radiusH[plane], d->offset[plane]);
+            else
+                copyPlane(srcp, blur, width, height, stride, bgStride, d->offset[plane]);
+
+            if (d->mode != -1) {
+                detectEdge(blur, gradient, direction, width, height, stride, bgStride, d->mode, d->op);
+
+                if (d->mode == 0) {
+                    nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, d->radiusAlign);
+                    hysteresis(blur, found, width, height, bgStride, d->t_h, d->t_l);
+                }
+            }
+
+            if (d->mode == -1)
+                outputGB(blur, dstp, width, height, bgStride, stride, d->peak, d->offset[plane]);
+            else if (d->mode == 0)
+                binarizeCE(blur, dstp, width, height, bgStride, stride, d->peak, d->lower[plane], d->upper[plane]);
+            else
+                discretizeGM(gradient, dstp, width, height, bgStride, stride, d->magnitude, d->peak, d->offset[plane]);
+        }
+    }
+}
+
+template void filter_avx<uint8_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_avx<uint16_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_avx<float>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
 #endif
diff --git a/TCanny/TCanny_AVX2.cpp b/TCanny/TCanny_AVX2.cpp
index 8f21265..846e01e 100644
--- a/TCanny/TCanny_AVX2.cpp
+++ b/TCanny/TCanny_AVX2.cpp
@@ -6,56 +6,94 @@
 #include "TCanny.hpp"
 
 template<typename T>
-void copyPlane_avx2(const T * srcp, float * blur, const int width, const int height, const int stride, const int bgStride, const float offset) noexcept {
+static void copyPlane(const T * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
             if (std::is_same<T, uint8_t>::value)
-                to_float(Vec8i().load_8uc(srcp + x)).stream(blur + x);
+                to_float(Vec8i().load_8uc(srcp + x)).stream(dstp + x);
             else if (std::is_same<T, uint16_t>::value)
-                to_float(Vec8i().load_8us(srcp + x)).stream(blur + x);
+                to_float(Vec8i().load_8us(srcp + x)).stream(dstp + x);
             else
-                (Vec8f().load_a(srcp + x) + offset).stream(blur + x);
+                (Vec8f().load_a(srcp + x) + offset).stream(dstp + x);
         }
 
-        srcp += stride;
-        blur += bgStride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template void copyPlane_avx2(const uint8_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_avx2(const uint16_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_avx2(const float *, float *, const int, const int, const int, const int, const float) noexcept;
-
-static inline void gaussianBlurH_avx2(float * buffer, float * blur, const float * weights, const int width, const int radius) noexcept {
-    weights += radius;
+template<typename T>
+static void gaussianBlur(const T * __srcp, float * temp, float * dstp, const float * weightsH, const float * weightsV, const int width, const int height,
+                         const int srcStride, const int dstStride, const int radiusH, const int radiusV, const float offset) noexcept {
+    const int diameter = radiusV * 2 + 1;
+    const T ** _srcp = new const T *[diameter];
 
-    for (int i = 1; i <= radius; i++) {
-        buffer[-i] = buffer[-1 + i];
-        buffer[width - 1 + i] = buffer[width - i];
+    _srcp[radiusV] = __srcp;
+    for (int i = 1; i <= radiusV; i++) {
+        _srcp[radiusV - i] = _srcp[radiusV - 1 + i];
+        _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i;
     }
 
-    for (int x = 0; x < width; x += 8) {
-        Vec8f sum = zero_8f();
+    weightsH += radiusH;
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
+
+            for (int i = 0; i < diameter; i++) {
+                if (std::is_same<T, uint8_t>::value) {
+                    const Vec8f srcp = to_float(Vec8i().load_8uc(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else if (std::is_same<T, uint16_t>::value) {
+                    const Vec8f srcp = to_float(Vec8i().load_8us(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else {
+                    const Vec8f srcp = Vec8f().load_a(_srcp[i] + x);
+                    sum = mul_add(srcp + offset, weightsV[i], sum);
+                }
+            }
+
+            sum.store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radiusH; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
 
-        for (int i = -radius; i <= radius; i++) {
-            const Vec8f srcp = Vec8f().load(buffer + x + i);
-            sum = mul_add(srcp, weights[i], sum);
+            for (int i = -radiusH; i <= radiusH; i++) {
+                const Vec8f srcp = Vec8f().load(temp + x + i);
+                sum = mul_add(srcp, weightsH[i], sum);
+            }
+
+            sum.stream(dstp + x);
         }
 
-        sum.stream(blur + x);
+        for (int i = 0; i < diameter - 1; i++)
+            _srcp[i] = _srcp[i + 1];
+        if (y < height - 1 - radiusV)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radiusV)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
+
+    delete[] _srcp;
 }
 
 template<typename T>
-void gaussianBlurV_avx2(const T * __srcp, float * buffer, float * blur, const float * horizontalWeights, const float * verticalWeights,
-                        const int width, const int height, const int stride, const int bgStride, const int horizontalRadius, const int verticalRadius, const float offset) noexcept {
-    const int diameter = verticalRadius * 2 + 1;
+static void gaussianBlurV(const T * __srcp, float * dstp, const float * weights, const int width, const int height, const int srcStride, const int dstStride,
+                          const int radius, const float offset) noexcept {
+    const int diameter = radius * 2 + 1;
     const T ** _srcp = new const T *[diameter];
 
-    _srcp[verticalRadius] = __srcp;
-    for (int i = 1; i <= verticalRadius; i++) {
-        _srcp[verticalRadius - i] = _srcp[verticalRadius - 1 + i];
-        _srcp[verticalRadius + i] = _srcp[verticalRadius] + stride * i;
+    _srcp[radius] = __srcp;
+    for (int i = 1; i <= radius; i++) {
+        _srcp[radius - i] = _srcp[radius - 1 + i];
+        _srcp[radius + i] = _srcp[radius] + srcStride * i;
     }
 
     for (int y = 0; y < height; y++) {
@@ -65,39 +103,69 @@ void gaussianBlurV_avx2(const T * __srcp, float * buffer, float * blur, const fl
             for (int i = 0; i < diameter; i++) {
                 if (std::is_same<T, uint8_t>::value) {
                     const Vec8f srcp = to_float(Vec8i().load_8uc(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else if (std::is_same<T, uint16_t>::value) {
                     const Vec8f srcp = to_float(Vec8i().load_8us(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else {
                     const Vec8f srcp = Vec8f().load_a(_srcp[i] + x);
-                    sum = mul_add(srcp + offset, verticalWeights[i], sum);
+                    sum = mul_add(srcp + offset, weights[i], sum);
                 }
             }
 
-            sum.store_a(buffer + x);
+            sum.stream(dstp + x);
         }
 
-        gaussianBlurH_avx2(buffer, blur, horizontalWeights, width, horizontalRadius);
-
         for (int i = 0; i < diameter - 1; i++)
             _srcp[i] = _srcp[i + 1];
-        if (y < height - 1 - verticalRadius)
-            _srcp[diameter - 1] += stride;
-        else if (y > height - 1 - verticalRadius)
-            _srcp[diameter - 1] -= stride;
-        blur += bgStride;
+        if (y < height - 1 - radius)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radius)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
 
     delete[] _srcp;
 }
 
-template void gaussianBlurV_avx2(const uint8_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_avx2(const uint16_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_avx2(const float *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
+template<typename T>
+static void gaussianBlurH(const T * _srcp, float * temp, float * dstp, const float * weights, const int width, const int height,
+                          const int srcStride, const int dstStride, const int radius, const float offset) noexcept {
+    weights += radius;
 
-void detectEdge_avx2(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
-                     const int mode, const unsigned op) noexcept {
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 8) {
+            if (std::is_same<T, uint8_t>::value)
+                to_float(Vec8i().load_8uc(_srcp + x)).store_a(temp + x);
+            else if (std::is_same<T, uint16_t>::value)
+                to_float(Vec8i().load_8us(_srcp + x)).store_a(temp + x);
+            else
+                (Vec8f().load_a(_srcp + x) + offset).store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radius; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 8) {
+            Vec8f sum = zero_8f();
+
+            for (int i = -radius; i <= radius; i++) {
+                const Vec8f srcp = Vec8f().load(temp + x + i);
+                sum = mul_add(srcp, weights[i], sum);
+            }
+
+            sum.stream(dstp + x);
+        }
+
+        _srcp += srcStride;
+        dstp += dstStride;
+    }
+}
+
+static void detectEdge(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
+                       const int mode, const int op) noexcept {
     float * srcpp = blur;
     float * srcp = blur;
     float * srcpn = blur + bgStride;
@@ -155,13 +223,14 @@ void detectEdge_avx2(float * blur, float * gradient, unsigned * direction, const
     }
 }
 
-void nonMaximumSuppression_avx2(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height, const int stride, const int bgStride) noexcept {
+static void nonMaximumSuppression(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height,
+                                  const int stride, const int bgStride, const int radiusAlign) noexcept {
     _gradient[-1] = _gradient[0];
     _gradient[-1 + bgStride * (height - 1)] = _gradient[bgStride * (height - 1)];
     _gradient[width] = _gradient[width - 1];
     _gradient[width + bgStride * (height - 1)] = _gradient[width - 1 + bgStride * (height - 1)];
-    std::copy_n(_gradient - 8, width + 16, _gradient - 8 - bgStride);
-    std::copy_n(_gradient - 8 + bgStride * (height - 1), width + 16, _gradient - 8 + bgStride * height);
+    std::copy_n(_gradient - radiusAlign, width + radiusAlign * 2, _gradient - radiusAlign - bgStride);
+    std::copy_n(_gradient - radiusAlign + bgStride * (height - 1), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * height);
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
@@ -193,118 +262,118 @@ void nonMaximumSuppression_avx2(const unsigned * _direction, float * _gradient,
     }
 }
 
-template<typename T> void outputGB_avx2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
+template<typename T> static void outputGB(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
 
 template<>
-void outputGB_avx2(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 32) {
-            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(blur + x) + 0.5f);
-            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(blur + x + 8) + 0.5f);
-            const Vec8i srcp_8i_2 = truncate_to_int(Vec8f().load_a(blur + x + 16) + 0.5f);
-            const Vec8i srcp_8i_3 = truncate_to_int(Vec8f().load_a(blur + x + 24) + 0.5f);
+            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(_srcp + x) + 0.5f);
+            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(_srcp + x + 8) + 0.5f);
+            const Vec8i srcp_8i_2 = truncate_to_int(Vec8f().load_a(_srcp + x + 16) + 0.5f);
+            const Vec8i srcp_8i_3 = truncate_to_int(Vec8f().load_a(_srcp + x + 24) + 0.5f);
             const Vec16s srcp_16s_0 = compress_saturated(srcp_8i_0, srcp_8i_1);
             const Vec16s srcp_16s_1 = compress_saturated(srcp_8i_2, srcp_8i_3);
             const Vec32uc srcp = compress_saturated_s2u(srcp_16s_0, srcp_16s_1);
             srcp.stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_avx2(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(blur + x) + 0.5f);
-            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(blur + x + 8) + 0.5f);
+            const Vec8i srcp_8i_0 = truncate_to_int(Vec8f().load_a(_srcp + x) + 0.5f);
+            const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load_a(_srcp + x + 8) + 0.5f);
             const Vec16us srcp = compress_saturated_s2u(srcp_8i_0, srcp_8i_1);
             min(srcp, peak).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_avx2(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8f srcp = Vec8f().load_a(blur + x);
+            const Vec8f srcp = Vec8f().load_a(_srcp + x);
             (srcp - offset).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void binarizeCE_avx2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
+template<typename T> static void binarizeCE(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
 
 template<>
-void binarizeCE_avx2(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 32) {
-            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(blur + x) == fltMax);
-            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(blur + x + 8) == fltMax);
-            const Vec8ib mask_8ib_2 = Vec8ib(Vec8f().load_a(blur + x + 16) == fltMax);
-            const Vec8ib mask_8ib_3 = Vec8ib(Vec8f().load_a(blur + x + 24) == fltMax);
+            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(srcp + x) == fltMax);
+            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(srcp + x + 8) == fltMax);
+            const Vec8ib mask_8ib_2 = Vec8ib(Vec8f().load_a(srcp + x + 16) == fltMax);
+            const Vec8ib mask_8ib_3 = Vec8ib(Vec8f().load_a(srcp + x + 24) == fltMax);
             const Vec16sb mask_16sb_0 = Vec16sb(compress_saturated(mask_8ib_0, mask_8ib_1));
             const Vec16sb mask_16sb_1 = Vec16sb(compress_saturated(mask_8ib_2, mask_8ib_3));
             const Vec32cb mask = Vec32cb(compress_saturated(mask_16sb_0, mask_16sb_1));
             select(mask, Vec32uc(255), zero_256b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_avx2(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(blur + x) == fltMax);
-            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(blur + x + 8) == fltMax);
+            const Vec8ib mask_8ib_0 = Vec8ib(Vec8f().load_a(srcp + x) == fltMax);
+            const Vec8ib mask_8ib_1 = Vec8ib(Vec8f().load_a(srcp + x + 8) == fltMax);
             const Vec16sb mask = Vec16sb(compress_saturated(mask_8ib_0, mask_8ib_1));
             select(mask, Vec16us(peak), zero_256b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_avx2(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8fb mask = Vec8f().load_a(blur + x) == fltMax;
+            const Vec8fb mask = (Vec8f().load_a(srcp + x) == fltMax);
             select(mask, Vec8f(upper), Vec8f(lower)).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void discretizeGM_avx2(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
+template<typename T> static void discretizeGM(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
 
 template<>
-void discretizeGM_avx2(const float * gradient, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 32) {
-            const Vec8f srcp_8f_0 = Vec8f().load_a(gradient + x);
-            const Vec8f srcp_8f_1 = Vec8f().load_a(gradient + x + 8);
-            const Vec8f srcp_8f_2 = Vec8f().load_a(gradient + x + 16);
-            const Vec8f srcp_8f_3 = Vec8f().load_a(gradient + x + 24);
+            const Vec8f srcp_8f_0 = Vec8f().load_a(_srcp + x);
+            const Vec8f srcp_8f_1 = Vec8f().load_a(_srcp + x + 8);
+            const Vec8f srcp_8f_2 = Vec8f().load_a(_srcp + x + 16);
+            const Vec8f srcp_8f_3 = Vec8f().load_a(_srcp + x + 24);
             const Vec8i srcp_8i_0 = truncate_to_int(mul_add(srcp_8f_0, magnitude, 0.5f));
             const Vec8i srcp_8i_1 = truncate_to_int(mul_add(srcp_8f_1, magnitude, 0.5f));
             const Vec8i srcp_8i_2 = truncate_to_int(mul_add(srcp_8f_2, magnitude, 0.5f));
@@ -315,40 +384,89 @@ void discretizeGM_avx2(const float * gradient, uint8_t * dstp, const int width,
             srcp.stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_avx2(const float * gradient, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec8f srcp_8f_0 = Vec8f().load_a(gradient + x);
-            const Vec8f srcp_8f_1 = Vec8f().load_a(gradient + x + 8);
+            const Vec8f srcp_8f_0 = Vec8f().load_a(_srcp + x);
+            const Vec8f srcp_8f_1 = Vec8f().load_a(_srcp + x + 8);
             const Vec8i srcp_8i_0 = truncate_to_int(mul_add(srcp_8f_0, magnitude, 0.5f));
             const Vec8i srcp_8i_1 = truncate_to_int(mul_add(srcp_8f_1, magnitude, 0.5f));
             const Vec16us srcp = compress_saturated_s2u(srcp_8i_0, srcp_8i_1);
             min(srcp, peak).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_avx2(const float * gradient, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec8f srcp = Vec8f().load_a(gradient + x);
+            const Vec8f srcp = Vec8f().load_a(_srcp + x);
             mul_sub(srcp, magnitude, offset).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
+
+template<typename T>
+void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const TCannyData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+    for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
+        if (d->process[plane]) {
+            const int width = vsapi->getFrameWidth(src, plane);
+            const int height = vsapi->getFrameHeight(src, plane);
+            const int stride = vsapi->getStride(src, plane) / sizeof(T);
+            const int bgStride = stride + d->radiusAlign * 2;
+            const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
+            T * dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
+
+            const auto threadId = std::this_thread::get_id();
+            float * blur = d->blur.at(threadId) + d->radiusAlign;
+            float * gradient = d->gradient.at(threadId) + bgStride + d->radiusAlign;
+            unsigned * direction = d->direction.at(threadId);
+            bool * found = d->found.at(threadId);
+
+            if (d->radiusV[plane] && d->radiusH[plane])
+                gaussianBlur(srcp, gradient, blur, d->weightsH[plane], d->weightsV[plane], width, height, stride, bgStride, d->radiusH[plane], d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusV[plane])
+                gaussianBlurV(srcp, blur, d->weightsV[plane], width, height, stride, bgStride, d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusH[plane])
+                gaussianBlurH(srcp, gradient, blur, d->weightsH[plane], width, height, stride, bgStride, d->radiusH[plane], d->offset[plane]);
+            else
+                copyPlane(srcp, blur, width, height, stride, bgStride, d->offset[plane]);
+
+            if (d->mode != -1) {
+                detectEdge(blur, gradient, direction, width, height, stride, bgStride, d->mode, d->op);
+
+                if (d->mode == 0) {
+                    nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, d->radiusAlign);
+                    hysteresis(blur, found, width, height, bgStride, d->t_h, d->t_l);
+                }
+            }
+
+            if (d->mode == -1)
+                outputGB(blur, dstp, width, height, bgStride, stride, d->peak, d->offset[plane]);
+            else if (d->mode == 0)
+                binarizeCE(blur, dstp, width, height, bgStride, stride, d->peak, d->lower[plane], d->upper[plane]);
+            else
+                discretizeGM(gradient, dstp, width, height, bgStride, stride, d->magnitude, d->peak, d->offset[plane]);
+        }
+    }
+}
+
+template void filter_avx2<uint8_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_avx2<uint16_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_avx2<float>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
 #endif
diff --git a/TCanny/TCanny_SSE2.cpp b/TCanny/TCanny_SSE2.cpp
index 474089b..536b172 100644
--- a/TCanny/TCanny_SSE2.cpp
+++ b/TCanny/TCanny_SSE2.cpp
@@ -2,56 +2,94 @@
 #include "TCanny.hpp"
 
 template<typename T>
-void copyPlane_sse2(const T * srcp, float * blur, const int width, const int height, const int stride, const int bgStride, const float offset) noexcept {
+static void copyPlane(const T * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 4) {
             if (std::is_same<T, uint8_t>::value)
-                to_float(Vec4i().load_4uc(srcp + x)).stream(blur + x);
+                to_float(Vec4i().load_4uc(srcp + x)).stream(dstp + x);
             else if (std::is_same<T, uint16_t>::value)
-                to_float(Vec4i().load_4us(srcp + x)).stream(blur + x);
+                to_float(Vec4i().load_4us(srcp + x)).stream(dstp + x);
             else
-                (Vec4f().load_a(srcp + x) + offset).stream(blur + x);
+                (Vec4f().load_a(srcp + x) + offset).stream(dstp + x);
         }
 
-        srcp += stride;
-        blur += bgStride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template void copyPlane_sse2(const uint8_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_sse2(const uint16_t *, float *, const int, const int, const int, const int, const float) noexcept;
-template void copyPlane_sse2(const float *, float *, const int, const int, const int, const int, const float) noexcept;
-
-static inline void gaussianBlurH_sse2(float * buffer, float * blur, const float * weights, const int width, const int radius) noexcept {
-    weights += radius;
+template<typename T>
+static void gaussianBlur(const T * __srcp, float * temp, float * dstp, const float * weightsH, const float * weightsV, const int width, const int height,
+                         const int srcStride, const int dstStride, const int radiusH, const int radiusV, const float offset) noexcept {
+    const int diameter = radiusV * 2 + 1;
+    const T ** _srcp = new const T *[diameter];
 
-    for (int i = 1; i <= radius; i++) {
-        buffer[-i] = buffer[-1 + i];
-        buffer[width - 1 + i] = buffer[width - i];
+    _srcp[radiusV] = __srcp;
+    for (int i = 1; i <= radiusV; i++) {
+        _srcp[radiusV - i] = _srcp[radiusV - 1 + i];
+        _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i;
     }
 
-    for (int x = 0; x < width; x += 4) {
-        Vec4f sum = zero_4f();
+    weightsH += radiusH;
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 4) {
+            Vec4f sum = zero_4f();
+
+            for (int i = 0; i < diameter; i++) {
+                if (std::is_same<T, uint8_t>::value) {
+                    const Vec4f srcp = to_float(Vec4i().load_4uc(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else if (std::is_same<T, uint16_t>::value) {
+                    const Vec4f srcp = to_float(Vec4i().load_4us(_srcp[i] + x));
+                    sum = mul_add(srcp, weightsV[i], sum);
+                } else {
+                    const Vec4f srcp = Vec4f().load_a(_srcp[i] + x);
+                    sum = mul_add(srcp + offset, weightsV[i], sum);
+                }
+            }
+
+            sum.store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radiusH; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 4) {
+            Vec4f sum = zero_4f();
 
-        for (int i = -radius; i <= radius; i++) {
-            const Vec4f srcp = Vec4f().load(buffer + x + i);
-            sum = mul_add(srcp, weights[i], sum);
+            for (int i = -radiusH; i <= radiusH; i++) {
+                const Vec4f srcp = Vec4f().load(temp + x + i);
+                sum = mul_add(srcp, weightsH[i], sum);
+            }
+
+            sum.stream(dstp + x);
         }
 
-        sum.stream(blur + x);
+        for (int i = 0; i < diameter - 1; i++)
+            _srcp[i] = _srcp[i + 1];
+        if (y < height - 1 - radiusV)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radiusV)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
+
+    delete[] _srcp;
 }
 
 template<typename T>
-void gaussianBlurV_sse2(const T * __srcp, float * buffer, float * blur, const float * horizontalWeights, const float * verticalWeights,
-                        const int width, const int height, const int stride, const int bgStride, const int horizontalRadius, const int verticalRadius, const float offset) noexcept {
-    const int diameter = verticalRadius * 2 + 1;
+static void gaussianBlurV(const T * __srcp, float * dstp, const float * weights, const int width, const int height, const int srcStride, const int dstStride,
+                          const int radius, const float offset) noexcept {
+    const int diameter = radius * 2 + 1;
     const T ** _srcp = new const T *[diameter];
 
-    _srcp[verticalRadius] = __srcp;
-    for (int i = 1; i <= verticalRadius; i++) {
-        _srcp[verticalRadius - i] = _srcp[verticalRadius - 1 + i];
-        _srcp[verticalRadius + i] = _srcp[verticalRadius] + stride * i;
+    _srcp[radius] = __srcp;
+    for (int i = 1; i <= radius; i++) {
+        _srcp[radius - i] = _srcp[radius - 1 + i];
+        _srcp[radius + i] = _srcp[radius] + srcStride * i;
     }
 
     for (int y = 0; y < height; y++) {
@@ -61,39 +99,69 @@ void gaussianBlurV_sse2(const T * __srcp, float * buffer, float * blur, const fl
             for (int i = 0; i < diameter; i++) {
                 if (std::is_same<T, uint8_t>::value) {
                     const Vec4f srcp = to_float(Vec4i().load_4uc(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else if (std::is_same<T, uint16_t>::value) {
                     const Vec4f srcp = to_float(Vec4i().load_4us(_srcp[i] + x));
-                    sum = mul_add(srcp, verticalWeights[i], sum);
+                    sum = mul_add(srcp, weights[i], sum);
                 } else {
                     const Vec4f srcp = Vec4f().load_a(_srcp[i] + x);
-                    sum = mul_add(srcp + offset, verticalWeights[i], sum);
+                    sum = mul_add(srcp + offset, weights[i], sum);
                 }
             }
 
-            sum.store_a(buffer + x);
+            sum.stream(dstp + x);
         }
 
-        gaussianBlurH_sse2(buffer, blur, horizontalWeights, width, horizontalRadius);
-
         for (int i = 0; i < diameter - 1; i++)
             _srcp[i] = _srcp[i + 1];
-        if (y < height - 1 - verticalRadius)
-            _srcp[diameter - 1] += stride;
-        else if (y > height - 1 - verticalRadius)
-            _srcp[diameter - 1] -= stride;
-        blur += bgStride;
+        if (y < height - 1 - radius)
+            _srcp[diameter - 1] += srcStride;
+        else if (y > height - 1 - radius)
+            _srcp[diameter - 1] -= srcStride;
+        dstp += dstStride;
     }
 
     delete[] _srcp;
 }
 
-template void gaussianBlurV_sse2(const uint8_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_sse2(const uint16_t *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
-template void gaussianBlurV_sse2(const float *, float *, float *, const float *, const float *, const int, const int, const int, const int, const int, const int, const float) noexcept;
+template<typename T>
+static void gaussianBlurH(const T * _srcp, float * temp, float * dstp, const float * weights, const int width, const int height,
+                          const int srcStride, const int dstStride, const int radius, const float offset) noexcept {
+    weights += radius;
 
-void detectEdge_sse2(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
-                     const int mode, const unsigned op) noexcept {
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += 4) {
+            if (std::is_same<T, uint8_t>::value)
+                to_float(Vec4i().load_4uc(_srcp + x)).store_a(temp + x);
+            else if (std::is_same<T, uint16_t>::value)
+                to_float(Vec4i().load_4us(_srcp + x)).store_a(temp + x);
+            else
+                (Vec4f().load_a(_srcp + x) + offset).store_a(temp + x);
+        }
+
+        for (int i = 1; i <= radius; i++) {
+            temp[-i] = temp[-1 + i];
+            temp[width - 1 + i] = temp[width - i];
+        }
+
+        for (int x = 0; x < width; x += 4) {
+            Vec4f sum = zero_4f();
+
+            for (int i = -radius; i <= radius; i++) {
+                const Vec4f srcp = Vec4f().load(temp + x + i);
+                sum = mul_add(srcp, weights[i], sum);
+            }
+
+            sum.stream(dstp + x);
+        }
+
+        _srcp += srcStride;
+        dstp += dstStride;
+    }
+}
+
+static void detectEdge(float * blur, float * gradient, unsigned * direction, const int width, const int height, const int stride, const int bgStride,
+                       const int mode, const int op) noexcept {
     float * srcpp = blur;
     float * srcp = blur;
     float * srcpn = blur + bgStride;
@@ -151,13 +219,14 @@ void detectEdge_sse2(float * blur, float * gradient, unsigned * direction, const
     }
 }
 
-void nonMaximumSuppression_sse2(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height, const int stride, const int bgStride) noexcept {
+static void nonMaximumSuppression(const unsigned * _direction, float * _gradient, float * blur, const int width, const int height,
+                                  const int stride, const int bgStride, const int radiusAlign) noexcept {
     _gradient[-1] = _gradient[0];
     _gradient[-1 + bgStride * (height - 1)] = _gradient[bgStride * (height - 1)];
     _gradient[width] = _gradient[width - 1];
     _gradient[width + bgStride * (height - 1)] = _gradient[width - 1 + bgStride * (height - 1)];
-    std::copy_n(_gradient - 8, width + 16, _gradient - 8 - bgStride);
-    std::copy_n(_gradient - 8 + bgStride * (height - 1), width + 16, _gradient - 8 + bgStride * height);
+    std::copy_n(_gradient - radiusAlign, width + radiusAlign * 2, _gradient - radiusAlign - bgStride);
+    std::copy_n(_gradient - radiusAlign + bgStride * (height - 1), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * height);
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 4) {
@@ -189,118 +258,118 @@ void nonMaximumSuppression_sse2(const unsigned * _direction, float * _gradient,
     }
 }
 
-template<typename T> void outputGB_sse2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
+template<typename T> static void outputGB(const float *, T *, const int, const int, const int, const int, const uint16_t, const float) noexcept;
 
 template<>
-void outputGB_sse2(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec4i srcp_4i_0 = truncate_to_int(Vec4f().load_a(blur + x) + 0.5f);
-            const Vec4i srcp_4i_1 = truncate_to_int(Vec4f().load_a(blur + x + 4) + 0.5f);
-            const Vec4i srcp_4i_2 = truncate_to_int(Vec4f().load_a(blur + x + 8) + 0.5f);
-            const Vec4i srcp_4i_3 = truncate_to_int(Vec4f().load_a(blur + x + 12) + 0.5f);
+            const Vec4i srcp_4i_0 = truncate_to_int(Vec4f().load_a(_srcp + x) + 0.5f);
+            const Vec4i srcp_4i_1 = truncate_to_int(Vec4f().load_a(_srcp + x + 4) + 0.5f);
+            const Vec4i srcp_4i_2 = truncate_to_int(Vec4f().load_a(_srcp + x + 8) + 0.5f);
+            const Vec4i srcp_4i_3 = truncate_to_int(Vec4f().load_a(_srcp + x + 12) + 0.5f);
             const Vec8s srcp_8s_0 = compress_saturated(srcp_4i_0, srcp_4i_1);
             const Vec8s srcp_8s_1 = compress_saturated(srcp_4i_2, srcp_4i_3);
             const Vec16uc srcp = compress_saturated_s2u(srcp_8s_0, srcp_8s_1);
             srcp.stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_sse2(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec4i srcp_4i_0 = truncate_to_int(Vec4f().load_a(blur + x) + 0.5f);
-            const Vec4i srcp_4i_1 = truncate_to_int(Vec4f().load_a(blur + x + 4) + 0.5f);
+            const Vec4i srcp_4i_0 = truncate_to_int(Vec4f().load_a(_srcp + x) + 0.5f);
+            const Vec4i srcp_4i_1 = truncate_to_int(Vec4f().load_a(_srcp + x + 4) + 0.5f);
             const Vec8us srcp = compress_saturated_s2u(srcp_4i_0, srcp_4i_1);
             min(srcp, peak).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void outputGB_sse2(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride, const uint16_t peak, const float offset) noexcept {
+void outputGB(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 4) {
-            const Vec4f srcp = Vec4f().load_a(blur + x);
+            const Vec4f srcp = Vec4f().load_a(_srcp + x);
             (srcp - offset).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void binarizeCE_sse2(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
+template<typename T> static void binarizeCE(const float *, T *, const int, const int, const int, const int, const uint16_t, const float, const float) noexcept;
 
 template<>
-void binarizeCE_sse2(const float * blur, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(blur + x) == fltMax);
-            const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(blur + x + 4) == fltMax);
-            const Vec4ib mask_4ib_2 = Vec4ib(Vec4f().load_a(blur + x + 8) == fltMax);
-            const Vec4ib mask_4ib_3 = Vec4ib(Vec4f().load_a(blur + x + 12) == fltMax);
+            const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(srcp + x) == fltMax);
+            const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(srcp + x + 4) == fltMax);
+            const Vec4ib mask_4ib_2 = Vec4ib(Vec4f().load_a(srcp + x + 8) == fltMax);
+            const Vec4ib mask_4ib_3 = Vec4ib(Vec4f().load_a(srcp + x + 12) == fltMax);
             const Vec8sb mask_8sb_0 = Vec8sb(compress_saturated(mask_4ib_0, mask_4ib_1));
             const Vec8sb mask_8sb_1 = Vec8sb(compress_saturated(mask_4ib_2, mask_4ib_3));
             const Vec16cb mask = Vec16cb(compress_saturated(mask_8sb_0, mask_8sb_1));
             select(mask, Vec16uc(255), zero_128b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_sse2(const float * blur, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(blur + x) == fltMax);
-            const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(blur + x + 4) == fltMax);
+            const Vec4ib mask_4ib_0 = Vec4ib(Vec4f().load_a(srcp + x) == fltMax);
+            const Vec4ib mask_4ib_1 = Vec4ib(Vec4f().load_a(srcp + x + 4) == fltMax);
             const Vec8sb mask = Vec8sb(compress_saturated(mask_4ib_0, mask_4ib_1));
             select(mask, Vec8us(peak), zero_128b()).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void binarizeCE_sse2(const float * blur, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                     const uint16_t peak, const float lower, const float upper) noexcept {
+void binarizeCE(const float * srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                const uint16_t peak, const float lower, const float upper) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 4) {
-            const Vec4fb mask = Vec4f().load_a(blur + x) == fltMax;
+            const Vec4fb mask = (Vec4f().load_a(srcp + x) == fltMax);
             select(mask, Vec4f(upper), Vec4f(lower)).stream(dstp + x);
         }
 
-        blur += bgStride;
-        dstp += stride;
+        srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
-template<typename T> void discretizeGM_sse2(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
+template<typename T> static void discretizeGM(const float *, T *, const int, const int, const int, const int, const float, const uint16_t, const float) noexcept;
 
 template<>
-void discretizeGM_sse2(const float * gradient, uint8_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint8_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 16) {
-            const Vec4f srcp_4f_0 = Vec4f().load_a(gradient + x);
-            const Vec4f srcp_4f_1 = Vec4f().load_a(gradient + x + 4);
-            const Vec4f srcp_4f_2 = Vec4f().load_a(gradient + x + 8);
-            const Vec4f srcp_4f_3 = Vec4f().load_a(gradient + x + 12);
+            const Vec4f srcp_4f_0 = Vec4f().load_a(_srcp + x);
+            const Vec4f srcp_4f_1 = Vec4f().load_a(_srcp + x + 4);
+            const Vec4f srcp_4f_2 = Vec4f().load_a(_srcp + x + 8);
+            const Vec4f srcp_4f_3 = Vec4f().load_a(_srcp + x + 12);
             const Vec4i srcp_4i_0 = truncate_to_int(mul_add(srcp_4f_0, magnitude, 0.5f));
             const Vec4i srcp_4i_1 = truncate_to_int(mul_add(srcp_4f_1, magnitude, 0.5f));
             const Vec4i srcp_4i_2 = truncate_to_int(mul_add(srcp_4f_2, magnitude, 0.5f));
@@ -311,40 +380,89 @@ void discretizeGM_sse2(const float * gradient, uint8_t * dstp, const int width,
             srcp.stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_sse2(const float * gradient, uint16_t * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, uint16_t * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 8) {
-            const Vec4f srcp_4f_0 = Vec4f().load_a(gradient + x);
-            const Vec4f srcp_4f_1 = Vec4f().load_a(gradient + x + 4);
+            const Vec4f srcp_4f_0 = Vec4f().load_a(_srcp + x);
+            const Vec4f srcp_4f_1 = Vec4f().load_a(_srcp + x + 4);
             const Vec4i srcp_4i_0 = truncate_to_int(mul_add(srcp_4f_0, magnitude, 0.5f));
             const Vec4i srcp_4i_1 = truncate_to_int(mul_add(srcp_4f_1, magnitude, 0.5f));
             const Vec8us srcp = compress_saturated_s2u(srcp_4i_0, srcp_4i_1);
             min(srcp, peak).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
 
 template<>
-void discretizeGM_sse2(const float * gradient, float * dstp, const int width, const int height, const int stride, const int bgStride,
-                       const float magnitude, const uint16_t peak, const float offset) noexcept {
+void discretizeGM(const float * _srcp, float * dstp, const int width, const int height, const int srcStride, const int dstStride,
+                  const float magnitude, const uint16_t peak, const float offset) noexcept {
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x += 4) {
-            const Vec4f srcp = Vec4f().load_a(gradient + x);
+            const Vec4f srcp = Vec4f().load_a(_srcp + x);
             mul_sub(srcp, magnitude, offset).stream(dstp + x);
         }
 
-        gradient += bgStride;
-        dstp += stride;
+        _srcp += srcStride;
+        dstp += dstStride;
     }
 }
+
+template<typename T>
+void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const TCannyData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+    for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
+        if (d->process[plane]) {
+            const int width = vsapi->getFrameWidth(src, plane);
+            const int height = vsapi->getFrameHeight(src, plane);
+            const int stride = vsapi->getStride(src, plane) / sizeof(T);
+            const int bgStride = stride + d->radiusAlign * 2;
+            const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
+            T * dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
+
+            const auto threadId = std::this_thread::get_id();
+            float * blur = d->blur.at(threadId) + d->radiusAlign;
+            float * gradient = d->gradient.at(threadId) + bgStride + d->radiusAlign;
+            unsigned * direction = d->direction.at(threadId);
+            bool * found = d->found.at(threadId);
+
+            if (d->radiusV[plane] && d->radiusH[plane])
+                gaussianBlur(srcp, gradient, blur, d->weightsH[plane], d->weightsV[plane], width, height, stride, bgStride, d->radiusH[plane], d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusV[plane])
+                gaussianBlurV(srcp, blur, d->weightsV[plane], width, height, stride, bgStride, d->radiusV[plane], d->offset[plane]);
+            else if (d->radiusH[plane])
+                gaussianBlurH(srcp, gradient, blur, d->weightsH[plane], width, height, stride, bgStride, d->radiusH[plane], d->offset[plane]);
+            else
+                copyPlane(srcp, blur, width, height, stride, bgStride, d->offset[plane]);
+
+            if (d->mode != -1) {
+                detectEdge(blur, gradient, direction, width, height, stride, bgStride, d->mode, d->op);
+
+                if (d->mode == 0) {
+                    nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, d->radiusAlign);
+                    hysteresis(blur, found, width, height, bgStride, d->t_h, d->t_l);
+                }
+            }
+
+            if (d->mode == -1)
+                outputGB(blur, dstp, width, height, bgStride, stride, d->peak, d->offset[plane]);
+            else if (d->mode == 0)
+                binarizeCE(blur, dstp, width, height, bgStride, stride, d->peak, d->lower[plane], d->upper[plane]);
+            else
+                discretizeGM(gradient, dstp, width, height, bgStride, stride, d->magnitude, d->peak, d->offset[plane]);
+        }
+    }
+}
+
+template void filter_sse2<uint8_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_sse2<uint16_t>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
+template void filter_sse2<float>(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *) noexcept;
 #endif
diff --git a/TCanny/shared.hpp b/TCanny/shared.hpp
new file mode 100644
index 0000000..c702a24
--- /dev/null
+++ b/TCanny/shared.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+#include <VapourSynth.h>
+#include <VSHelper.h>
+
+static inline float * gaussianWeights(const float sigma, int & radius) noexcept {
+    const int diameter = std::max<int>(sigma * 3.f + 0.5f, 1) * 2 + 1;
+    radius = diameter / 2;
+
+    float * VS_RESTRICT weights = new (std::nothrow) float[diameter];
+    if (!weights)
+        return nullptr;
+
+    float sum = 0.f;
+
+    for (int k = -radius; k <= radius; k++) {
+        const float w = std::exp(-(k * k) / (2.f * sigma * sigma));
+        weights[k + radius] = w;
+        sum += w;
+    }
+
+    for (int k = 0; k < diameter; k++)
+        weights[k] /= sum;
+
+    return weights;
+}
diff --git a/meson.build b/meson.build
index a178859..9a97629 100644
--- a/meson.build
+++ b/meson.build
@@ -1,12 +1,13 @@
 project('TCanny', 'cpp',
   default_options : ['buildtype=release', 'b_ndebug=if-release', 'cpp_std=c++14'],
   meson_version : '>=0.48.0',
-  version : '11'
+  version : '12'
 )
 
 add_project_arguments('-ffast-math', language : 'cpp')
 
 sources = [
+  'TCanny/shared.hpp',
   'TCanny/TCanny.cpp',
   'TCanny/TCanny.hpp',
   'TCanny/vectorclass/instrset.h',
@@ -53,13 +54,13 @@ if host_machine.cpu_family().startswith('x86')
   endif
 
   libs += static_library('avx', 'TCanny/TCanny_AVX.cpp',
-    dependencies : [vapoursynth_dep],
+    dependencies : vapoursynth_dep,
     cpp_args : ['-mavx'],
     gnu_symbol_visibility : 'hidden'
   )
 
   libs += static_library('avx2', 'TCanny/TCanny_AVX2.cpp',
-    dependencies : [vapoursynth_dep],
+    dependencies : vapoursynth_dep,
     cpp_args : ['-mavx2', '-mfma'],
     gnu_symbol_visibility : 'hidden'
   )
diff --git a/meson_options.txt b/meson_options.txt
index 27fc587..5b12728 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1 +1,5 @@
-option('opencl', type : 'boolean', value : true, description : 'Enable TCannyCL filter')
+option('opencl',
+  type : 'boolean',
+  value : true,
+  description : 'Enable TCannyCL filter'
+)