Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
HolyWu committed Feb 4, 2019
1 parent 88c9623 commit b48f29b
Show file tree
Hide file tree
Showing 13 changed files with 1,074 additions and 755 deletions.
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ Ported from AviSynth plugin http://bengal.missouri.edu/~kes25c/
Usage
=====

tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes])
tcanny.TCanny(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int opt=0, int[] planes=[0, 1, 2]])

* clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported.

* sigma: Standard deviation of gaussian blur. If a single sigma is specified, it will be used for all planes. The value used internally will be adjusted according to the subsampling of the second and third plane in each direction. If two sigma are given then the second value will be used for the third plane as well.
* sigma: Standard deviation of Gaussian blur. If a single `sigma` is specified, it will be used for all planes. If two `sigma` values are given, the second value will also be used for the third plane. The value used internally will be adjusted according to the subsampling of the second and third planes in each direction.

* t_h: High gradient magnitude threshold for hysteresis.

Expand All @@ -30,7 +30,7 @@ Usage
* 2 = the Sobel operator
* 3 = the Scharr operator

* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for mode=1.
* gmmax: Used for scaling gradient magnitude into [0, 2^bitdepth-1] for `mode=1`.

* opt: Sets which cpu optimizations to use.
* 0 = auto detect
Expand All @@ -39,11 +39,11 @@ Usage
* 3 = use avx
* 4 = use avx2

* planes: A list of the planes to process. By default all planes are processed.
* planes: Sets which planes will be processed. Any unprocessed planes will simply be copied.

---

tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes])
tcanny.TCannyCL(clip clip[, float[] sigma=1.5, float t_h=8.0, float t_l=1.0, int mode=0, int op=1, float gmmax=50.0, int device=-1, bint list_device=False, bint info=False, int[] planes=[0, 1, 2]])

* device: Sets the target OpenCL device. Use `list_device` to get the indices of the available devices. By default, the system's default device is selected.

Expand All @@ -55,9 +55,10 @@ Usage
Compilation
===========

Requires `Boost` unless specify `-D opencl=false`.
Requires `Boost` unless `-Dopencl=false` is specified.

```
meson build
ninja -C build
ninja -C build install
```
72 changes: 36 additions & 36 deletions TCanny/TCanny.cl

Large diffs are not rendered by default.

580 changes: 239 additions & 341 deletions TCanny/TCanny.cpp

Large diffs are not rendered by default.

69 changes: 47 additions & 22 deletions TCanny/TCanny.hpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
#pragma once

#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <string>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <VapourSynth.h>
#include <VSHelper.h>
#include "shared.hpp"

#ifdef VS_TARGET_CPU_X86
#include "vectorclass/vectormath_trig.h"
Expand All @@ -21,24 +16,54 @@ static constexpr float M_1_PIF = 0.318309886183790671538f;
static constexpr float fltMax = std::numeric_limits<float>::max();
static constexpr float fltLowest = std::numeric_limits<float>::lowest();

static inline float * gaussianWeights(const float sigma, int * radius) noexcept {
const int diameter = std::max<int>(sigma * 3.f + 0.5f, 1) * 2 + 1;
*radius = diameter / 2;
// Per-instance state for the CPU TCanny filter.
// NOTE(review): the code that fills and reads these fields lives in TCanny.cpp, which is not
// visible here; the field notes below are inferred from types, names and the README — confirm there.
struct TCannyData {
// Clip node handle and its video info (presumably the input clip — TODO confirm).
VSNodeRef * node;
const VSVideoInfo * vi;
// Presumably the high / low gradient magnitude thresholds for hysteresis (README `t_h`, `t_l`).
float t_h, t_l;
// Presumably the README `mode` and `op` parameters (output mode and gradient operator).
int mode, op;
// One flag per plane; presumably true when that plane is to be processed rather than copied.
bool process[3];
// Arrays of size 3 presumably hold one entry per plane. weightsH / weightsV are raw pointers;
// ownership and release are not shown here — verify against the free callback.
float * weightsH[3], * weightsV[3], magnitude, offset[3], lower[3], upper[3];
int vectorSize, alignment, radiusH[3], radiusV[3], radiusAlign;
uint16_t peak;
// Buffer pointers keyed by thread id, so each thread looks up its own entry.
// NOTE(review): the maps themselves are not synchronized by anything in this struct — confirm
// how insertion from multiple threads is handled by the caller.
std::unordered_map<std::thread::id, float *> blur, gradient;
std::unordered_map<std::thread::id, unsigned *> direction;
std::unordered_map<std::thread::id, bool *> found;
// Pointer to the frame-processing routine; it receives a const frame, a mutable frame,
// this data struct and the VSAPI (presumably source frame and destination frame, in that order).
void (*filter)(const VSFrameRef *, VSFrameRef *, const TCannyData * const VS_RESTRICT, const VSAPI *);
};

float * VS_RESTRICT weights = new (std::nothrow) float[diameter];
if (!weights)
return nullptr;
static void hysteresis(float * VS_RESTRICT srcp, bool * VS_RESTRICT found, const int width, const int height, const int stride, const float t_h, const float t_l) noexcept {
std::fill_n(found, width * height, false);
std::vector<std::pair<int, int>> coordinates;

float sum = 0.f;
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
if (!found[width * y + x] && srcp[stride * y + x] >= t_h) {
srcp[stride * y + x] = fltMax;
found[width * y + x] = true;

for (int k = -(*radius); k <= *radius; k++) {
const float w = std::exp(-(k * k) / (2.f * sigma * sigma));
weights[k + *radius] = w;
sum += w;
}
coordinates.emplace_back(std::make_pair(x, y));

while (!coordinates.empty()) {
const auto pos = coordinates.back();
coordinates.pop_back();

for (int k = 0; k < diameter; k++)
weights[k] /= sum;
const int xxStart = std::max(pos.first - 1, 0);
const int xxStop = std::min(pos.first + 1, width - 1);
const int yyStart = std::max(pos.second - 1, 0);
const int yyStop = std::min(pos.second + 1, height - 1);

return weights;
for (int yy = yyStart; yy <= yyStop; yy++) {
for (int xx = xxStart; xx <= xxStop; xx++) {
if (!found[width * yy + xx] && srcp[stride * yy + xx] >= t_l) {
srcp[stride * yy + xx] = fltMax;
found[width * yy + xx] = true;

coordinates.emplace_back(std::make_pair(xx, yy));
}
}
}
}
}
}
}
}
8 changes: 4 additions & 4 deletions TCanny/TCanny.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<IncludePath>C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;C:\boost_1_68_0;$(CUDA_PATH)\include;$(IncludePath)</IncludePath>
<LibraryPath>$(CUDA_PATH)\lib\$(Platform);C:\boost_1_68_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
<IncludePath>$(CUDA_PATH)\include;C:\boost_1_69_0;C:\Program Files %28x86%29\VapourSynth\sdk\include\vapoursynth;$(IncludePath)</IncludePath>
<LibraryPath>$(CUDA_PATH)\lib\x64;C:\boost_1_69_0\lib64-msvc-14.1;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
Expand All @@ -41,7 +41,6 @@
<DiagnosticsFormat>Column</DiagnosticsFormat>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<BufferSecurityCheck>false</BufferSecurityCheck>
<FloatingPointModel>Fast</FloatingPointModel>
<ConformanceMode>true</ConformanceMode>
<EnforceTypeConversionRules>true</EnforceTypeConversionRules>
<DisableSpecificWarnings>4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
Expand All @@ -50,7 +49,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<SubSystem>Windows</SubSystem>
<AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_68.lib;libboost_system-vc141-mt-x64-1_68.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>OpenCL.lib;libboost_filesystem-vc141-mt-x64-1_69.lib;libboost_system-vc141-mt-x64-1_69.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
Expand All @@ -66,6 +65,7 @@
<ClCompile Include="vectorclass\instrset_detect.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="shared.hpp" />
<ClInclude Include="TCanny.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
Expand Down
5 changes: 4 additions & 1 deletion TCanny/TCanny.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;ipp;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
Expand Down Expand Up @@ -38,5 +38,8 @@
<ClInclude Include="TCanny.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="shared.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>
77 changes: 39 additions & 38 deletions TCanny/TCannyCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include <clocale>
#include <cstdio>

#include "TCanny.hpp"
#include "shared.hpp"
#include "TCanny.cl"

#define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION
Expand All @@ -39,18 +39,18 @@ struct TCannyCLData {
const VSVideoInfo * vi;
int mode;
bool process[3];
int horizontalRadius[3], verticalRadius[3];
int radiusH[3], radiusV[3];
unsigned peak;
float offset[3], lower[3], upper[3];
compute::device gpu;
compute::context ctx;
compute::program program;
compute::buffer horizontalWeights[3], verticalWeights[3];
compute::buffer weightsH[3], weightsV[3];
cl_image_format clImageFormat;
std::unordered_map<std::thread::id, compute::command_queue> queue;
std::unordered_map<std::thread::id, compute::kernel> copyPlane, gaussianBlurH, gaussianBlurV, detectEdge, nonMaximumSuppression, hysteresis, outputGB, binarizeCE, discretizeGM;
std::unordered_map<std::thread::id, compute::image2d> src[3], dst[3], blur[3], gradient[3], direction[3];
std::unordered_map<std::thread::id, compute::buffer> buffer, label;
std::unordered_map<std::thread::id, compute::buffer> buffer, found;
};

static void VS_CC tcannyclInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
Expand Down Expand Up @@ -124,21 +124,21 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi
}

d->buffer.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_float), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
d->label.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
d->found.emplace(threadId, compute::buffer{ d->ctx, d->vi->width * d->vi->height * sizeof(cl_uchar), CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
}

auto queue = d->queue.at(threadId);
auto copyPlane = d->copyPlane.at(threadId);
auto gaussianBlurH = d->gaussianBlurH.at(threadId);
auto gaussianBlurV = d->gaussianBlurV.at(threadId);
auto gaussianBlurH = d->gaussianBlurH.at(threadId);
auto detectEdge = d->detectEdge.at(threadId);
auto nonMaximumSuppression = d->nonMaximumSuppression.at(threadId);
auto hysteresis = d->hysteresis.at(threadId);
auto outputGB = d->outputGB.at(threadId);
auto binarizeCE = d->binarizeCE.at(threadId);
auto discretizeGM = d->discretizeGM.at(threadId);
auto buffer = d->buffer.at(threadId);
auto label = d->label.at(threadId);
auto found = d->found.at(threadId);

for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane]) {
Expand All @@ -160,32 +160,34 @@ static const VSFrameRef *VS_CC tcannyclGetFrame(int n, int activationReason, voi

queue.enqueue_write_image(srcImage, origin, region, srcp, stride);

if (d->horizontalRadius[plane]) {
gaussianBlurV.set_args(srcImage, gradientImage, d->verticalWeights[plane], d->verticalRadius[plane], d->offset[plane]);
if (d->radiusV[plane]) {
gaussianBlurV.set_args(srcImage, gradientImage, d->weightsV[plane], d->radiusV[plane], d->offset[plane]);
queue.enqueue_nd_range_kernel(gaussianBlurV, 2, nullptr, globalWorkSize, nullptr);

gaussianBlurH.set_args(gradientImage, blurImage, d->horizontalWeights[plane], d->horizontalRadius[plane]);
queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
} else {
copyPlane.set_args(srcImage, blurImage, d->offset[plane]);
copyPlane.set_args(srcImage, d->radiusH[plane] ? gradientImage : blurImage, d->offset[plane]);
queue.enqueue_nd_range_kernel(copyPlane, 2, nullptr, globalWorkSize, nullptr);
}

if (d->radiusH[plane]) {
gaussianBlurH.set_args(gradientImage, blurImage, d->weightsH[plane], d->radiusH[plane]);
queue.enqueue_nd_range_kernel(gaussianBlurH, 2, nullptr, globalWorkSize, nullptr);
}

if (d->mode != -1) {
detectEdge.set_args(blurImage, gradientImage, directionImage);
queue.enqueue_nd_range_kernel(detectEdge, 2, nullptr, globalWorkSize, nullptr);

if (d->mode == 0) {
nonMaximumSuppression.set_args(gradientImage, directionImage, buffer);
nonMaximumSuppression.set_args(directionImage, gradientImage, buffer);
queue.enqueue_nd_range_kernel(nonMaximumSuppression, 2, nullptr, globalWorkSize, nullptr);

constexpr cl_uchar pattern = 0;
queue.enqueue_fill_buffer(label, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));
queue.enqueue_fill_buffer(found, &pattern, sizeof(cl_uchar), 0, width * height * sizeof(cl_uchar));

const size_t paddedGlobalWorkSize[] = { (width + 7) & -8, (height + 7) & -8 };
const size_t localWorkSize[] = { 8, 8 };

hysteresis.set_args(buffer, label, static_cast<int>(width), static_cast<int>(height));
hysteresis.set_args(buffer, found, static_cast<int>(width), static_cast<int>(height));
queue.enqueue_nd_range_kernel(hysteresis, 2, nullptr, paddedGlobalWorkSize, localWorkSize);
}
}
Expand Down Expand Up @@ -232,27 +234,28 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
d->vi = vsapi->getVideoInfo(d->node);

try {
if (!isConstantFormat(d->vi) || (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
if (!isConstantFormat(d->vi) ||
(d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
(d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" };

const int numSigma = vsapi->propNumElements(in, "sigma");
if (numSigma > d->vi->format->numPlanes)
throw std::string{ "more sigma given than the number of planes" };

float horizontalSigma[3], verticalSigma[3];
float sigmaH[3], sigmaV[3];

for (int i = 0; i < 3; i++) {
if (i < numSigma) {
horizontalSigma[i] = verticalSigma[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
sigmaH[i] = sigmaV[i] = static_cast<float>(vsapi->propGetFloat(in, "sigma", i, nullptr));
} else if (i == 0) {
horizontalSigma[0] = verticalSigma[0] = 1.5f;
sigmaH[0] = sigmaV[0] = 1.5f;
} else if (i == 1) {
horizontalSigma[1] = horizontalSigma[0] / (1 << d->vi->format->subSamplingW);
verticalSigma[1] = verticalSigma[0] / (1 << d->vi->format->subSamplingH);
sigmaH[1] = sigmaH[0] / (1 << d->vi->format->subSamplingW);
sigmaV[1] = sigmaV[0] / (1 << d->vi->format->subSamplingH);
} else {
horizontalSigma[2] = horizontalSigma[1];
verticalSigma[2] = verticalSigma[1];
sigmaH[2] = sigmaH[1];
sigmaV[2] = sigmaV[1];
}
}

Expand Down Expand Up @@ -296,7 +299,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
}

for (int i = 0; i < 3; i++) {
if (horizontalSigma[i] < 0.f)
if (sigmaH[i] < 0.f)
throw std::string{ "sigma must be greater than or equal to 0.0" };
}

Expand Down Expand Up @@ -343,9 +346,7 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
return;
}

d->gpu = compute::system::default_device();
if (device > -1)
d->gpu = compute::system::devices().at(device);
d->gpu = (device < 0) ? compute::system::default_device() : compute::system::devices().at(device);
d->ctx = compute::context{ d->gpu };

if (!!vsapi->propGetInt(in, "info", 0, &err)) {
Expand Down Expand Up @@ -391,16 +392,16 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
d->queue.reserve(numThreads);
d->copyPlane.reserve(numThreads);
d->gaussianBlurH.reserve(numThreads);
d->gaussianBlurV.reserve(numThreads);
d->gaussianBlurH.reserve(numThreads);
d->detectEdge.reserve(numThreads);
d->nonMaximumSuppression.reserve(numThreads);
d->hysteresis.reserve(numThreads);
d->outputGB.reserve(numThreads);
d->binarizeCE.reserve(numThreads);
d->discretizeGM.reserve(numThreads);
d->buffer.reserve(numThreads);
d->label.reserve(numThreads);
d->found.reserve(numThreads);
for (int i = 0; i < 3; i++) {
d->src[i].reserve(numThreads);
d->dst[i].reserve(numThreads);
Expand Down Expand Up @@ -432,17 +433,17 @@ void VS_CC tcannyclCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
}

for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane] && horizontalSigma[plane]) {
float * horizontalWeights = gaussianWeights(horizontalSigma[plane], &d->horizontalRadius[plane]);
float * verticalWeights = gaussianWeights(verticalSigma[plane], &d->verticalRadius[plane]);
if (!horizontalWeights || !verticalWeights)
if (d->process[plane] && sigmaH[plane]) {
float * weightsH = gaussianWeights(sigmaH[plane], d->radiusH[plane]);
float * weightsV = gaussianWeights(sigmaV[plane], d->radiusV[plane]);
if (!weightsH || !weightsV)
throw std::string{ "malloc failure (weights)" };

d->horizontalWeights[plane] = compute::buffer{ d->ctx, (d->horizontalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, horizontalWeights };
d->verticalWeights[plane] = compute::buffer{ d->ctx, (d->verticalRadius[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, verticalWeights };
d->weightsH[plane] = compute::buffer{ d->ctx, (d->radiusH[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsH };
d->weightsV[plane] = compute::buffer{ d->ctx, (d->radiusV[plane] * 2 + 1) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weightsV };

delete[] horizontalWeights;
delete[] verticalWeights;
delete[] weightsH;
delete[] weightsV;
}
}

Expand Down
Loading

0 comments on commit b48f29b

Please sign in to comment.