Refactor

HomeOfVapourSynthEvolution · Feb 4, 2019 · b48f29b · b48f29b
1 parent 88c9623
commit b48f29b
Show file tree

Hide file tree

Showing 13 changed files with 1,074 additions and 755 deletions.
diff --git a/README.md b/README.md
@@ -9,11 +9,11 @@ Ported from AviSynth plugin http://bengal.missouri.edu/~kes25c/
 Usage
 =====
 
-    tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes])
+    tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes=[0, 1, 2]])
 
 * clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported.
 
-* sigma: Standard deviation of gaussian blur. If a single sigma is specified, it will be used for all planes. The value used internally will be adjusted according to the subsampling of the second and third plane in each direction. If two sigma are given then the second value will be used for the third plane as well.
+* sigma: Standard deviation of the gaussian blur. If a single `sigma` is specified, it will be used for all planes. If two `sigma` values are given, the second value will be used for the third plane as well. The value used internally will be adjusted according to the subsampling of the second and third planes in each direction.
 
 * t_h: High gradient magnitude threshold for hysteresis.
 
@@ -30,7 +30,7 @@ Usage
   * 2 = the Sobel operator
   * 3 = the Scharr operator
 
-* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for mode=1.
+* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for `mode=1`.
 
 * opt: Sets which cpu optimizations to use.
   * 0 = auto detect
@@ -39,11 +39,11 @@ Usage
   * 3 = use avx
   * 4 = use avx2
 
-* planes: A list of the planes to process. By default all planes are processed.
+* planes: Sets which planes will be processed. Any unprocessed planes will simply be copied.
 
 ---
 
-    tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes])
+    tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes=[0, 1, 2]])
 
 * device: Sets target OpenCL device. Use `list_device` to get the index of the available devices. By default the default device is selected.
 
@@ -55,9 +55,10 @@ Usage
 Compilation
 ===========
 
-Requires `Boost` unless specify `-D opencl=false`.
+Requires `Boost` unless you specify `-Dopencl=false`.
 
 ```
 meson build
 ninja -C build
+ninja -C build install
 ```
diff --git a/TCanny/TCanny.cl b/TCanny/TCanny.cl
diff --git a/TCanny/TCanny.cpp b/TCanny/TCanny.cpp
diff --git a/TCanny/TCanny.hpp b/TCanny/TCanny.hpp
@@ -1,16 +1,11 @@
 #pragma once
 
-#include <algorithm>
 #include <cmath>
 #include <limits>
-#include <memory>
-#include <string>
-#include <thread>
 #include <type_traits>
-#include <unordered_map>
+#include <vector>
 
-#include <VapourSynth.h>
-#include <VSHelper.h>
+#include "shared.hpp"
 
 #ifdef VS_TARGET_CPU_X86
 #include "vectorclass/vectormath_trig.h"
@@ -21,24 +16,54 @@ static constexpr float M_1_PIF = 0.318309886183790671538f;
 static constexpr float fltMax = std::numeric_limits<float>::max();
 static constexpr float fltLowest = std::numeric_limits<float>::lowest();
 
-static inline float * gaussianWeights(const float sigma, int * radius) noexcept {
-    const int diameter = std::max<int>(sigma * 3.f + 0.5f, 1) * 2 + 1;
-    *radius = diameter / 2;
+struct TCannyData { // Filter instance data; the 3-element arrays are presumably indexed by plane, as in TCannyCLData
+    VSNodeRef * node; // presumably the input clip node — confirm against the create function
+    const VSVideoInfo * vi; // video info of the clip (TCannyCL fills its counterpart via getVideoInfo(node))
+    float t_h, t_l; // high / low gradient magnitude thresholds for hysteresis
+    int mode, op; // the `mode` and `op` options described in the README
+    bool process[3]; // presumably true for each plane selected via `planes`
+    float * weightsH[3], * weightsV[3], magnitude, offset[3], lower[3], upper[3]; // weightsH/weightsV: presumably horizontal/vertical gaussian weights; roles of the other fields are not visible in this hunk
+    int vectorSize, alignment, radiusH[3], radiusV[3], radiusAlign; // radiusH/radiusV: presumably the gaussian radii matching weightsH/weightsV
+    uint16_t peak; // NOTE(review): presumably the maximum sample value for the bit depth — confirm
+    std::unordered_map<std::thread::id, float *> blur, gradient; // float buffers keyed by thread id
+    std::unordered_map<std::thread::id, unsigned *> direction; // unsigned buffers keyed by thread id
+    std::unordered_map<std::thread::id, bool *> found; // bool buffers keyed by thread id; same pointer type as the `found` parameter of hysteresis()
+    void (*filter)(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *); // processing routine; arguments are presumably (source frame, destination frame, this data, API) — confirm
+};
 
-    float * VS_RESTRICT weights = new (std::nothrow) float[diameter];
-    if (!weights)
-        return nullptr;
+static void hysteresis(float * VS_RESTRICT srcp, bool * VS_RESTRICT found, const int width, const int height, const int stride, const float t_h, const float t_l) noexcept {
+    std::fill_n(found, width * height, false);
+    std::vector<std::pair<int, int>> coordinates;
 
-    float sum = 0.f;
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            if (!found[width * y + x] && srcp[stride * y + x] >= t_h) {
+                srcp[stride * y + x] = fltMax;
+                found[width * y + x] = true;
 
-    for (int k = -(*radius); k <= *radius; k++) {
-        const float w = std::exp(-(k * k) / (2.f * sigma * sigma));
-        weights[k + *radius] = w;
-        sum += w;
-    }
+                coordinates.emplace_back(std::make_pair(x, y));
+
+                while (!coordinates.empty()) {
+                    const auto pos = coordinates.back();
+                    coordinates.pop_back();
 
-    for (int k = 0; k < diameter; k++)
-        weights[k] /= sum;
+                    const int xxStart = std::max(pos.first - 1, 0);
+                    const int xxStop = std::min(pos.first + 1, width - 1);
+                    const int yyStart = std::max(pos.second - 1, 0);
+                    const int yyStop = std::min(pos.second + 1, height - 1);
 
-    return weights;
+                    for (int yy = yyStart; yy <= yyStop; yy++) {
+                        for (int xx = xxStart; xx <= xxStop; xx++) {
+                            if (!found[width * yy + xx] && srcp[stride * yy + xx] >= t_l) {
+                                srcp[stride * yy + xx] = fltMax;
+                                found[width * yy + xx] = true;
+
+                                coordinates.emplace_back(std::make_pair(xx, yy));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
 }
diff --git a/TCanny/TCanny.vcxproj b/TCanny/TCanny.vcxproj
@@ -31,8 +31,8 @@
   </ImportGroup>
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <IncludePath>C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;C:\boost_1_68_0;$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
-    <LibraryPath>$(CUDA_PATH)\lib\$(Platform);C:\boost_1_68_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
+    <IncludePath>$(CUDA_PATH)\include;C:\boost_1_69_0;C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;$(IncludePath)</IncludePath>
+    <LibraryPath>$(CUDA_PATH)\lib\x64;C:\boost_1_69_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
@@ -41,7 +41,6 @@
       <DiagnosticsFormat>Column</DiagnosticsFormat>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
       <BufferSecurityCheck>false</BufferSecurityCheck>
-      <FloatingPointModel>Fast</FloatingPointModel>
       <ConformanceMode>true</ConformanceMode>
       <EnforceTypeConversionRules>true</EnforceTypeConversionRules>
       <DisableSpecificWarnings>4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
@@ -50,7 +49,7 @@
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <SubSystem>Windows</SubSystem>
-      <AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_68.lib;libboost_system-vc141-mt-x64-1_68.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_69.lib;libboost_system-vc141-mt-x64-1_69.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
@@ -66,6 +65,7 @@
     <ClCompile Include="vectorclass\instrset_detect.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="shared.hpp" />
     <ClInclude Include="TCanny.hpp" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

diff --git a/TCanny/TCanny.vcxproj.filters b/TCanny/TCanny.vcxproj.filters
@@ -7,7 +7,7 @@
     </Filter>
     <Filter Include="Header Files">
       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
-      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+      <Extensions>h;hh;hpp;hxx;hm;inl;inc;ipp;xsd</Extensions>
     </Filter>
     <Filter Include="Resource Files">
       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
@@ -38,5 +38,8 @@
     <ClInclude Include="TCanny.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="shared.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
diff --git a/TCanny/TCannyCL.cpp b/TCanny/TCannyCL.cpp
@@ -24,7 +24,7 @@
 #include <clocale>
 #include <cstdio>
 
-#include "TCanny.hpp"
+#include "shared.hpp"
 #include "TCanny.cl"
 
 #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION
@@ -39,18 +39,18 @@ struct TCannyCLData {
     const VSVideoInfo * vi;
     int mode;
     bool process[3];
-    int horizontalRadius[3], verticalRadius[3];
+    int radiusH[3], radiusV[3];
     unsigned peak;
     float offset[3], lower[3], upper[3];
     compute::device gpu;
     compute::context ctx;
     compute::program program;
-    compute::buffer horizontalWeights[3], verticalWeights[3];
+    compute::buffer weightsH[3], weightsV[3];
     cl_image_format clImageFormat;
     std::unordered_map<std::thread::id, compute::command_queue> queue;
     std::unordered_map<std::thread::id, compute::kernel> copyPlane, gaussianBlurH, gaussianBlurV, detectEdge, nonMaximumSuppression, hysteresis, outputGB, binarizeCE, discretizeGM;
     std::unordered_map<std::thread::id, compute::image2d> src[3], dst[3], blur[3], gradient[3], direction[3];
-    std::unordered_map<std::thread::id, compute::buffer> buffer, label;
+    std::unordered_map<std::thread::id, compute::buffer> buffer, found;
 };
 
 static void VS_CC tcannyclInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
@@ -124,21 +124,21 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
                 }
 
                 d->buffer.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_float), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
-                d->label.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
+                d->found.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
             }
 
             auto queue = d->queue.at(threadId);
             auto copyPlane = d->copyPlane.at(threadId);
-            auto gaussianBlurH = d->gaussianBlurH.at(threadId);
             auto gaussianBlurV = d->gaussianBlurV.at(threadId);
+            auto gaussianBlurH = d->gaussianBlurH.at(threadId);
             auto detectEdge = d->detectEdge.at(threadId);
             auto nonMaximumSuppression = d->nonMaximumSuppression.at(threadId);
             auto hysteresis = d->hysteresis.at(threadId);
             auto outputGB = d->outputGB.at(threadId);
             auto binarizeCE = d->binarizeCE.at(threadId);
             auto discretizeGM = d->discretizeGM.at(threadId);
             auto buffer = d->buffer.at(threadId);
-            auto label = d->label.at(threadId);
+            auto found = d->found.at(threadId);
 
             for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
                 if (d->process[plane]) {
@@ -160,32 +160,34 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
 
                     queue.enqueue_write_image(srcImage, origin, region, srcp, stride);
 
-                    if (d->horizontalRadius[plane]) {
-                        gaussianBlurV.set_args(srcImage, gradientImage, d->verticalWeights[plane], d->verticalRadius[plane], d->offset[plane]);
+                    if (d->radiusV[plane]) {
+                        gaussianBlurV.set_args(srcImage, gradientImage, d->weightsV[plane], d->radiusV[plane], d->offset[plane]);
                         queue.enqueue_nd_range_kernel(gaussianBlurV, 2, nullptr, globalWorkSize, nullptr);
-
-                        gaussianBlurH.set_args(gradientImage, blurImage, d->horizontalWeights[plane], d->horizontalRadius[plane]);
-                        queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
                     } else {
-                        copyPlane.set_args(srcImage, blurImage, d->offset[plane]);
+                        copyPlane.set_args(srcImage, d->radiusH[plane] ? gradientImage : blurImage, d->offset[plane]);
                         queue.enqueue_nd_range_kernel(copyPlane, 2, nullptr, globalWorkSize, nullptr);
                     }
 
+                    if (d->radiusH[plane]) {
+                        gaussianBlurH.set_args(gradientImage, blurImage, d->weightsH[plane], d->radiusH[plane]);
+                        queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
+                    }
+
                     if (d->mode != -1) {
                         detectEdge.set_args(blurImage, gradientImage, directionImage);
                         queue.enqueue_nd_range_kernel(detectEdge, 2, nullptr, globalWorkSize, nullptr);
 
                         if (d->mode == 0) {
-                            nonMaximumSuppression.set_args(gradientImage, directionImage, buffer);
+                            nonMaximumSuppression.set_args(directionImage, gradientImage, buffer);
                             queue.enqueue_nd_range_kernel(nonMaximumSuppression, 2, nullptr, globalWorkSize, nullptr);
 
                             constexpr cl_uchar pattern = 0;
-                            queue.enqueue_fill_buffer(label, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));
+                            queue.enqueue_fill_buffer(found, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));
 
                             const size_t paddedGlobalWorkSize[] = { (width + 7) & -8, (height + 7) & -8 };
                             const size_t localWorkSize[] = { 8, 8 };
 
-                            hysteresis.set_args(buffer, label, static_cast<int>(width), static_cast<int>(height));
+                            hysteresis.set_args(buffer, found, static_cast<int>(width), static_cast<int>(height));
                             queue.enqueue_nd_range_kernel(hysteresis, 2, nullptr, paddedGlobalWorkSize, localWorkSize);
                         }
                     }
@@ -232,27 +234,28 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
     d->vi = vsapi->getVideoInfo(d->node);
 
     try {
-        if (!isConstantFormat(d->vi) || (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
+        if (!isConstantFormat(d->vi) ||
+            (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
             (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
             throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" };
 
         const int numSigma = vsapi->propNumElements(in, "sigma");
         if (numSigma > d->vi->format->numPlanes)
             throw std::string{ "more sigma given than the number of planes" };
 
-        float horizontalSigma[3], verticalSigma[3];
+        float sigmaH[3], sigmaV[3];
 
         for (int i = 0; i < 3; i++) {
             if (i < numSigma) {
-                horizontalSigma[i] = verticalSigma[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
+                sigmaH[i] = sigmaV[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
             } else if (i == 0) {
-                horizontalSigma[0] = verticalSigma[0] = 1.5f;
+                sigmaH[0] = sigmaV[0] = 1.5f;
             } else if (i == 1) {
-                horizontalSigma[1] = horizontalSigma[0] / (1 << d->vi->format->subSamplingW);
-                verticalSigma[1] = verticalSigma[0] / (1 << d->vi->format->subSamplingH);
+                sigmaH[1] = sigmaH[0] / (1 << d->vi->format->subSamplingW);
+                sigmaV[1] = sigmaV[0] / (1 << d->vi->format->subSamplingH);
             } else {
-                horizontalSigma[2] = horizontalSigma[1];
-                verticalSigma[2] = verticalSigma[1];
+                sigmaH[2] = sigmaH[1];
+                sigmaV[2] = sigmaV[1];
             }
         }
 
@@ -296,7 +299,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         }
 
         for (int i = 0; i < 3; i++) {
-            if (horizontalSigma[i] < 0.f)
+            if (sigmaH[i] < 0.f)
                 throw std::string{ "sigma must be greater than or equal to 0.0" };
         }
 
@@ -343,9 +346,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
             return;
         }
 
-        d->gpu = compute::system::default_device();
-        if (device > -1)
-            d->gpu = compute::system::devices().at(device);
+        d->gpu = (device < 0) ? compute::system::default_device() : compute::system::devices().at(device);
         d->ctx = compute::context{ d->gpu };
 
         if (!!vsapi->propGetInt(in, "info", 0, &err)) {
@@ -391,16 +392,16 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
         d->queue.reserve(numThreads);
         d->copyPlane.reserve(numThreads);
-        d->gaussianBlurH.reserve(numThreads);
         d->gaussianBlurV.reserve(numThreads);
+        d->gaussianBlurH.reserve(numThreads);
         d->detectEdge.reserve(numThreads);
         d->nonMaximumSuppression.reserve(numThreads);
         d->hysteresis.reserve(numThreads);
         d->outputGB.reserve(numThreads);
         d->binarizeCE.reserve(numThreads);
         d->discretizeGM.reserve(numThreads);
         d->buffer.reserve(numThreads);
-        d->label.reserve(numThreads);
+        d->found.reserve(numThreads);
         for (int i = 0; i < 3; i++) {
             d->src[i].reserve(numThreads);
             d->dst[i].reserve(numThreads);
@@ -432,17 +433,17 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
         }
 
         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
-            if (d->process[plane] && horizontalSigma[plane]) {
-                float * horizontalWeights = gaussianWeights(horizontalSigma[plane], &d->horizontalRadius[plane]);
-                float * verticalWeights = gaussianWeights(verticalSigma[plane], &d->verticalRadius[plane]);
-                if (!horizontalWeights || !verticalWeights)
+            if (d->process[plane] && sigmaH[plane]) {
+                float * weightsH = gaussianWeights(sigmaH[plane], d->radiusH[plane]);
+                float * weightsV = gaussianWeights(sigmaV[plane], d->radiusV[plane]);
+                if (!weightsH || !weightsV)
                     throw std::string{ "malloc failure (weights)" };
 
-                d->horizontalWeights[plane] = compute::buffer{ d->ctx, (d->horizontalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, horizontalWeights };
-                d->verticalWeights[plane] = compute::buffer{ d->ctx, (d->verticalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, verticalWeights };
+                d->weightsH[plane] = compute::buffer{ d->ctx, (d->radiusH[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsH };
+                d->weightsV[plane] = compute::buffer{ d->ctx, (d->radiusV[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsV };
 
-                delete[] horizontalWeights;
-                delete[] verticalWeights;
+                delete[] weightsH;
+                delete[] weightsV;
             }
         }