Skip to content

Commit

Permalink
Add SSE2, AVX2 and AVX512 optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
HolyWu committed Jun 19, 2020
1 parent a91c3b9 commit 75782b6
Show file tree
Hide file tree
Showing 31 changed files with 46,597 additions and 42 deletions.
108 changes: 70 additions & 38 deletions AddGrain/AddGrain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,14 @@
#include <algorithm>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include <VapourSynth.h>
#include <VSHelper.h>

// max # of noise planes
static constexpr int MAXP = 2;

// offset in pixels of the fake plane MAXP relative to plane MAXP-1
static constexpr int OFFSET_FAKEPLANE = 32;

// Per-instance state of the AddGrain filter, shared by the create, get-frame and free callbacks.
struct AddGrainData final {
    VSNodeRef * node;           // input clip; released through vsapi->freeNode when the filter is freed
    const VSVideoInfo * vi;     // video info; format and dimensions are read through it

    // Filter parameters.
    float var;
    float uvar;                 // forced to 0 for gray input and to var for RGB input
    float hcorr;                // must lie in [0.0, 1.0]
    float vcorr;                // must lie in [0.0, 1.0]
    bool constant;

    bool process[3];            // NOTE(review): presumably one enable flag per plane - confirm
    int storedFrames;
    int peak;                   // (1 << bitsPerSample) - 1 for integer formats
    std::vector<uint8_t> pNoiseSeeds;
    long idum;                  // random generator state handed to gaussianRand

    // Noise planes: geometry and storage, indexed by noise plane.
    int nStride[MAXP];          // padded row length, in noise elements
    int nHeight[MAXP];          // number of rows
    int nSize[MAXP];            // total number of elements (row length times rows)
    void * pN[MAXP];            // noise buffers, allocated in generateNoise
};

#include "AddGrain.h"

#ifdef ADDGRAIN_X86
// SIMD counterparts of updateFrame_c, sharing its signature so any of them can be stored in AddGrainData::updateFrame.
// Only declared here: the definitions live in AddGrain_SSE2.cpp, AddGrain_AVX2.cpp and AddGrain_AVX512.cpp, each of which
// explicitly instantiates <uint8_t, int8_t>, <uint16_t, int16_t> and <float, float>. Other template arguments will not link.
template<typename pixel_t, typename noise_t> extern void updateFrame_sse2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template<typename pixel_t, typename noise_t> extern void updateFrame_avx2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template<typename pixel_t, typename noise_t> extern void updateFrame_avx512(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
#endif

template<typename T>
static T getArg(const VSAPI * vsapi, const VSMap * map, const char * key, const T defaultValue) noexcept {
Expand Down Expand Up @@ -137,7 +121,7 @@ static void generateNoise(const int planesNoise, const float scale, AddGrainData
d->nSize[plane] = d->nStride[plane] * h;

// allocate space for noise
d->pN[plane] = vs_aligned_malloc(d->nSize[plane] * sizeof(noise_t), 16);
d->pN[plane] = malloc(d->nSize[plane] * sizeof(noise_t));

for (int x = 0; x < d->nStride[plane]; x++)
lastLine[x] = gaussianRand(mean, pvar[plane], iset, gset, d->idum); // things to vertically smooth against
Expand Down Expand Up @@ -198,8 +182,10 @@ static void setRand(int & plane, int & noiseOffs, const int frameNumber, AddGrai
}

template<typename pixel_t, typename noise_t>
static void updateFrame(const pixel_t * srcp, pixel_t * VS_RESTRICT dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs,
const AddGrainData * const VS_RESTRICT d) noexcept {
static void updateFrame_c(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs,
const AddGrainData * const VS_RESTRICT d) noexcept {
const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp);
pixel_t * VS_RESTRICT dstp = reinterpret_cast<pixel_t *>(_dstp);
const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs;

for (int y = 0; y < height; y++) {
Expand Down Expand Up @@ -243,13 +229,7 @@ static const VSFrameRef * VS_CC addgrainGetFrame(int n, int activationReason, vo
int noisePlane = (d->vi->format->colorFamily == cmRGB) ? 0 : plane;
int noiseOffs = 0;
setRand(noisePlane, noiseOffs, n, d); // seed randomness w/ plane & frame

if (d->vi->format->bytesPerSample == 1)
updateFrame<uint8_t, int8_t>(srcp, dstp, width, height, stride, noisePlane, noiseOffs, d);
else if (d->vi->format->bytesPerSample == 2)
updateFrame<uint16_t, int16_t>(reinterpret_cast<const uint16_t *>(srcp), reinterpret_cast<uint16_t *>(dstp), width, height, stride, noisePlane, noiseOffs, d);
else
updateFrame<float, float>(reinterpret_cast<const float *>(srcp), reinterpret_cast<float *>(dstp), width, height, stride, noisePlane, noiseOffs, d);
d->updateFrame(srcp, dstp, width, height, stride, noisePlane, noiseOffs, d);
}
}

Expand All @@ -266,7 +246,7 @@ static void VS_CC addgrainFree(void * instanceData, VSCore * core, const VSAPI *
vsapi->freeNode(d->node);

for (int i = 0; i < MAXP; i++)
vs_aligned_free(d->pN[i]);
free(d->pN[i]);

delete d;
}
Expand All @@ -291,10 +271,61 @@ static void VS_CC addgrainCreate(const VSMap * in, VSMap * out, void * userData,
d->vcorr = getArg(vsapi, in, "vcorr", 0.0f);
long seed = getArg(vsapi, in, "seed", -1);
d->constant = getArg(vsapi, in, "constant", false);
const int opt = getArg(vsapi, in, "opt", 0);

if (d->hcorr < 0.0f || d->hcorr > 1.0f || d->vcorr < 0.0f || d->vcorr > 1.0f)
throw "hcorr and vcorr must be between 0.0 and 1.0 (inclusive)"sv;

if (opt < 0 || opt > 4)
throw "opt must be 0, 1, 2, 3, or 4"sv;

{
if (d->vi->format->bytesPerSample == 1)
d->updateFrame = updateFrame_c<uint8_t, int8_t>;
else if (d->vi->format->bytesPerSample == 2)
d->updateFrame = updateFrame_c<uint16_t, int16_t>;
else
d->updateFrame = updateFrame_c<float, float>;

#ifdef ADDGRAIN_X86
const int iset = instrset_detect();
if ((opt == 0 && iset >= 10) || opt == 4) {
if (d->vi->format->bytesPerSample == 1) {
d->updateFrame = updateFrame_avx512<uint8_t, int8_t>;
d->step = 64;
} else if (d->vi->format->bytesPerSample == 2) {
d->updateFrame = updateFrame_avx512<uint16_t, int16_t>;
d->step = 32;
} else {
d->updateFrame = updateFrame_avx512<float, float>;
d->step = 16;
}
} else if ((opt == 0 && iset >= 8) || opt == 3) {
if (d->vi->format->bytesPerSample == 1) {
d->updateFrame = updateFrame_avx2<uint8_t, int8_t>;
d->step = 32;
} else if (d->vi->format->bytesPerSample == 2) {
d->updateFrame = updateFrame_avx2<uint16_t, int16_t>;
d->step = 16;
} else {
d->updateFrame = updateFrame_avx2<float, float>;
d->step = 8;
}
} else if ((opt == 0 && iset >= 2) || opt == 2) {
if (d->vi->format->bytesPerSample == 1) {
d->updateFrame = updateFrame_sse2<uint8_t, int8_t>;
d->step = 16;
} else if (d->vi->format->bytesPerSample == 2) {
d->updateFrame = updateFrame_sse2<uint16_t, int16_t>;
d->step = 8;
} else {
d->updateFrame = updateFrame_sse2<float, float>;
d->step = 4;
}
}
#endif
}

float scale;
if (d->vi->format->sampleType == stInteger) {
d->peak = (1 << d->vi->format->bitsPerSample) - 1;
Expand All @@ -304,15 +335,15 @@ static void VS_CC addgrainCreate(const VSMap * in, VSMap * out, void * userData,
}

int planesNoise = 1;
d->nStride[0] = (d->vi->width + 15) & ~15; // first plane
d->nStride[0] = (d->vi->width + 63) & ~63; // first plane
d->nHeight[0] = d->vi->height;
if (d->vi->format->colorFamily == cmGray) {
d->uvar = 0.0f;
} else if (d->vi->format->colorFamily == cmRGB) {
d->uvar = d->var;
} else {
planesNoise = 2;
d->nStride[1] = ((d->vi->width >> d->vi->format->subSamplingW) + 15) & ~15; // second and third plane
d->nStride[1] = ((d->vi->width >> d->vi->format->subSamplingW) + 63) & ~63; // second and third plane
d->nHeight[1] = d->vi->height >> d->vi->format->subSamplingH;
}

Expand Down Expand Up @@ -359,6 +390,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegiste
"hcorr:float:opt;"
"vcorr:float:opt;"
"seed:int:opt;"
"constant:int:opt;",
"constant:int:opt;"
"opt:int:opt;",
addgrainCreate, nullptr, plugin);
}
31 changes: 31 additions & 0 deletions AddGrain/AddGrain.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#pragma once

#include <type_traits>
#include <vector>

#include <VapourSynth.h>
#include <VSHelper.h>

#ifdef ADDGRAIN_X86
#include "VCL2/vectorclass.h"
#endif

// max # of noise planes
static constexpr int MAXP = 2;

// offset in pixels of the fake plane MAXP relative to plane MAXP-1
static constexpr int OFFSET_FAKEPLANE = 32;

// Per-instance state of the AddGrain filter, shared by the create, get-frame and free callbacks
// and by the C / SSE2 / AVX2 / AVX-512 frame kernels.
struct AddGrainData final {
    VSNodeRef * node;           // input clip; released through vsapi->freeNode when the filter is freed
    const VSVideoInfo * vi;     // video info; format and dimensions are read through it

    // Filter parameters.
    float var;
    float uvar;                 // forced to 0 for gray input and to var for RGB input
    float hcorr;                // must lie in [0.0, 1.0]
    float vcorr;                // must lie in [0.0, 1.0]
    bool constant;

    bool process[3];            // NOTE(review): presumably one enable flag per plane - confirm
    int storedFrames;
    int step;                   // samples per SIMD vector; assigned only when a SIMD kernel is selected
    int peak;                   // (1 << bitsPerSample) - 1 for integer formats
    std::vector<uint8_t> pNoiseSeeds;
    long idum;                  // random generator state handed to gaussianRand

    // Noise planes: geometry and storage, indexed by noise plane.
    int nStride[MAXP];          // padded row length, in noise elements
    int nHeight[MAXP];          // number of rows
    int nSize[MAXP];            // total number of elements (row length times rows)
    void * pN[MAXP];            // noise buffers, allocated in generateNoise and released in addgrainFree

    // Kernel that adds noise to one plane; chosen once in addgrainCreate from the sample size and the requested/detected instruction set.
    void (*updateFrame)(const void * _srcp, void * _dstp, const int width, const int height, const int stride,
                        const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
};
14 changes: 13 additions & 1 deletion AddGrain/AddGrain.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>ADDGRAIN_X86;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<WarningLevel>Level3</WarningLevel>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<BufferSecurityCheck>false</BufferSecurityCheck>
<ConformanceMode>true</ConformanceMode>
<LanguageStandard>stdcpp17</LanguageStandard>
Expand All @@ -49,6 +50,17 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="AddGrain.cpp" />
<ClCompile Include="AddGrain_AVX2.cpp">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<ClCompile Include="AddGrain_AVX512.cpp">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
</ClCompile>
<ClCompile Include="AddGrain_SSE2.cpp" />
<ClCompile Include="VCL2\instrset_detect.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="AddGrain.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
Expand Down
17 changes: 17 additions & 0 deletions AddGrain/AddGrain.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,22 @@
<ClCompile Include="AddGrain.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="AddGrain_SSE2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="AddGrain_AVX2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="AddGrain_AVX512.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="VCL2\instrset_detect.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="AddGrain.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>
45 changes: 45 additions & 0 deletions AddGrain/AddGrain_AVX2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifdef ADDGRAIN_X86
#include "AddGrain.h"

template<typename pixel_t, typename noise_t>
void updateFrame_avx2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs,
const AddGrainData * const VS_RESTRICT d) noexcept {
const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp);
pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp);
const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs;

for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x += d->step) {
if constexpr (std::is_same_v<pixel_t, uint8_t>) {
const Vec32c sign = 0x80;
Vec32c src = Vec32c().load_a(srcp + x);
const Vec32c noise = Vec32c().load(pNW + x);
src ^= sign;
src = add_saturated(src, noise);
src ^= sign;
src.store_nt(dstp + x);
} else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
const Vec16s sign = 0x8000;
Vec16s src = Vec16s().load_a(srcp + x);
const Vec16s noise = Vec16s().load(pNW + x);
src ^= sign;
src = add_saturated(src, noise);
src ^= sign;
min(Vec16us(src), d->peak).store_nt(dstp + x);
} else {
Vec8f src = Vec8f().load_a(srcp + x);
const Vec8f noise = Vec8f().load(pNW + x);
(src + noise).store_nt(dstp + x);
}
}

srcp += stride;
dstp += stride;
pNW += d->nStride[noisePlane];
}
}

template void updateFrame_avx2<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_avx2<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_avx2<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
#endif
45 changes: 45 additions & 0 deletions AddGrain/AddGrain_AVX512.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifdef ADDGRAIN_X86
#include "AddGrain.h"

/// Adds pre-generated noise to one plane using 512-bit (AVX-512) vectors.
///
/// Every row is processed in whole vectors, so when `width` is not a multiple of the vector size the last iteration
/// reads and writes samples past `width`; rows of the source, destination and noise plane must be padded accordingly.
/// Source rows are read with VCL's aligned load (`load_a`) and destination rows are written with its non-temporal
/// store (`store_nt`), so both must start on a 64-byte boundary. Noise is read with an unaligned load because
/// `noiseOffs` is arbitrary.
///
/// @tparam pixel_t    sample type of the plane: uint8_t, uint16_t or float
/// @tparam noise_t    element type of the noise buffer; must have the same size as pixel_t
/// @param _srcp       first sample of the source plane
/// @param _dstp       first sample of the destination plane
/// @param width       number of samples to cover in each row
/// @param height      number of rows
/// @param stride      distance between consecutive rows of the source/destination, in pixel_t elements
/// @param noisePlane  index into d->pN and d->nStride
/// @param noiseOffs   offset, in noise elements, of the first noise sample to use
/// @param d           filter data; pN, nStride and (for 16-bit samples) peak are read
template<typename pixel_t, typename noise_t>
void updateFrame_avx512(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs,
                        const AddGrainData * const VS_RESTRICT d) noexcept {
    static_assert(sizeof(pixel_t) == sizeof(noise_t), "pixel and noise elements must have the same size");

    // Samples per 512-bit (64-byte) vector. Derived from the sample type at compile time so it can never disagree
    // with the Vec64c / Vec32s / Vec16f widths used below (64, 32 and 16 lanes respectively).
    constexpr int step = 64 / static_cast<int>(sizeof(pixel_t));

    const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp);
    pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp);
    const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs;
    const int noiseStride = d->nStride[noisePlane]; // loop-invariant

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += step) {
            if constexpr (std::is_same_v<pixel_t, uint8_t>) {
                // Flipping the top bit maps the unsigned range onto the signed one, so the signed saturating add
                // clamps exactly at the unsigned limits; flipping it again restores the unsigned value.
                const Vec64c sign = 0x80;
                Vec64c src = Vec64c().load_a(srcp + x);
                const Vec64c noise = Vec64c().load(pNW + x);
                src ^= sign;
                src = add_saturated(src, noise);
                src ^= sign;
                src.store_nt(dstp + x);
            } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
                // Same sign-flip trick as above; the final min() clamps to the format's peak value.
                const Vec32s sign = 0x8000;
                Vec32s src = Vec32s().load_a(srcp + x);
                const Vec32s noise = Vec32s().load(pNW + x);
                src ^= sign;
                src = add_saturated(src, noise);
                src ^= sign;
                min(Vec32us(src), d->peak).store_nt(dstp + x);
            } else {
                // Floating point: plain addition, no clamping.
                Vec16f src = Vec16f().load_a(srcp + x);
                const Vec16f noise = Vec16f().load(pNW + x);
                (src + noise).store_nt(dstp + x);
            }
        }

        srcp += stride;
        dstp += stride;
        pNW += noiseStride;
    }
}

// Explicit instantiations: these are the only specialisations available to other translation units.
template void updateFrame_avx512<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_avx512<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_avx512<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
#endif
45 changes: 45 additions & 0 deletions AddGrain/AddGrain_SSE2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifdef ADDGRAIN_X86
#include "AddGrain.h"

/// Adds pre-generated noise to one plane using 128-bit (SSE2) vectors.
///
/// Every row is processed in whole vectors, so when `width` is not a multiple of the vector size the last iteration
/// reads and writes samples past `width`; rows of the source, destination and noise plane must be padded accordingly.
/// Source rows are read with VCL's aligned load (`load_a`) and destination rows are written with its non-temporal
/// store (`store_nt`), so both must start on a 16-byte boundary. Noise is read with an unaligned load because
/// `noiseOffs` is arbitrary.
///
/// @tparam pixel_t    sample type of the plane: uint8_t, uint16_t or float
/// @tparam noise_t    element type of the noise buffer; must have the same size as pixel_t
/// @param _srcp       first sample of the source plane
/// @param _dstp       first sample of the destination plane
/// @param width       number of samples to cover in each row
/// @param height      number of rows
/// @param stride      distance between consecutive rows of the source/destination, in pixel_t elements
/// @param noisePlane  index into d->pN and d->nStride
/// @param noiseOffs   offset, in noise elements, of the first noise sample to use
/// @param d           filter data; pN, nStride and (for 16-bit samples) peak are read
template<typename pixel_t, typename noise_t>
void updateFrame_sse2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs,
                      const AddGrainData * const VS_RESTRICT d) noexcept {
    static_assert(sizeof(pixel_t) == sizeof(noise_t), "pixel and noise elements must have the same size");

    // Samples per 128-bit (16-byte) vector. Derived from the sample type at compile time so it can never disagree
    // with the Vec16c / Vec8s / Vec4f widths used below (16, 8 and 4 lanes respectively).
    constexpr int step = 16 / static_cast<int>(sizeof(pixel_t));

    const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp);
    pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp);
    const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs;
    const int noiseStride = d->nStride[noisePlane]; // loop-invariant

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += step) {
            if constexpr (std::is_same_v<pixel_t, uint8_t>) {
                // Flipping the top bit maps the unsigned range onto the signed one, so the signed saturating add
                // clamps exactly at the unsigned limits; flipping it again restores the unsigned value.
                const Vec16c sign = 0x80;
                Vec16c src = Vec16c().load_a(srcp + x);
                const Vec16c noise = Vec16c().load(pNW + x);
                src ^= sign;
                src = add_saturated(src, noise);
                src ^= sign;
                src.store_nt(dstp + x);
            } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
                // Same sign-flip trick as above; the final min() clamps to the format's peak value.
                const Vec8s sign = 0x8000;
                Vec8s src = Vec8s().load_a(srcp + x);
                const Vec8s noise = Vec8s().load(pNW + x);
                src ^= sign;
                src = add_saturated(src, noise);
                src ^= sign;
                min(Vec8us(src), d->peak).store_nt(dstp + x);
            } else {
                // Floating point: plain addition, no clamping.
                Vec4f src = Vec4f().load_a(srcp + x);
                const Vec4f noise = Vec4f().load(pNW + x);
                (src + noise).store_nt(dstp + x);
            }
        }

        srcp += stride;
        dstp += stride;
        pNW += noiseStride;
    }
}

// Explicit instantiations: these are the only specialisations available to other translation units.
template void updateFrame_sse2<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_sse2<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
template void updateFrame_sse2<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept;
#endif
Loading

0 comments on commit 75782b6

Please sign in to comment.