-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SSE2, AVX2 and AVX512 optimizations
- Loading branch information
Showing
31 changed files
with
46,597 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#pragma once | ||
|
||
#include <type_traits> | ||
#include <vector> | ||
|
||
#include <VapourSynth.h> | ||
#include <VSHelper.h> | ||
|
||
#ifdef ADDGRAIN_X86 | ||
#include "VCL2/vectorclass.h" | ||
#endif | ||
|
||
// max # of noise planes | ||
static constexpr int MAXP = 2; | ||
|
||
// offset in pixels of the fake plane MAXP relative to plane MAXP-1 | ||
static constexpr int OFFSET_FAKEPLANE = 32; | ||
|
||
struct AddGrainData final { | ||
VSNodeRef * node; | ||
const VSVideoInfo * vi; | ||
float var, uvar, hcorr, vcorr; | ||
bool constant; | ||
bool process[3]; | ||
int storedFrames, step, peak; | ||
std::vector<uint8_t> pNoiseSeeds; | ||
long idum; | ||
int nStride[MAXP], nHeight[MAXP], nSize[MAXP]; | ||
void * pN[MAXP]; | ||
void (*updateFrame)(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#ifdef ADDGRAIN_X86 | ||
#include "AddGrain.h" | ||
|
||
template<typename pixel_t, typename noise_t> | ||
void updateFrame_avx2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, | ||
const AddGrainData * const VS_RESTRICT d) noexcept { | ||
const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp); | ||
pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp); | ||
const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs; | ||
|
||
for (int y = 0; y < height; y++) { | ||
for (int x = 0; x < width; x += d->step) { | ||
if constexpr (std::is_same_v<pixel_t, uint8_t>) { | ||
const Vec32c sign = 0x80; | ||
Vec32c src = Vec32c().load_a(srcp + x); | ||
const Vec32c noise = Vec32c().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
src.store_nt(dstp + x); | ||
} else if constexpr (std::is_same_v<pixel_t, uint16_t>) { | ||
const Vec16s sign = 0x8000; | ||
Vec16s src = Vec16s().load_a(srcp + x); | ||
const Vec16s noise = Vec16s().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
min(Vec16us(src), d->peak).store_nt(dstp + x); | ||
} else { | ||
Vec8f src = Vec8f().load_a(srcp + x); | ||
const Vec8f noise = Vec8f().load(pNW + x); | ||
(src + noise).store_nt(dstp + x); | ||
} | ||
} | ||
|
||
srcp += stride; | ||
dstp += stride; | ||
pNW += d->nStride[noisePlane]; | ||
} | ||
} | ||
|
||
template void updateFrame_avx2<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_avx2<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_avx2<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#ifdef ADDGRAIN_X86 | ||
#include "AddGrain.h" | ||
|
||
template<typename pixel_t, typename noise_t> | ||
void updateFrame_avx512(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, | ||
const AddGrainData * const VS_RESTRICT d) noexcept { | ||
const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp); | ||
pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp); | ||
const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs; | ||
|
||
for (int y = 0; y < height; y++) { | ||
for (int x = 0; x < width; x += d->step) { | ||
if constexpr (std::is_same_v<pixel_t, uint8_t>) { | ||
const Vec64c sign = 0x80; | ||
Vec64c src = Vec64c().load_a(srcp + x); | ||
const Vec64c noise = Vec64c().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
src.store_nt(dstp + x); | ||
} else if constexpr (std::is_same_v<pixel_t, uint16_t>) { | ||
const Vec32s sign = 0x8000; | ||
Vec32s src = Vec32s().load_a(srcp + x); | ||
const Vec32s noise = Vec32s().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
min(Vec32us(src), d->peak).store_nt(dstp + x); | ||
} else { | ||
Vec16f src = Vec16f().load_a(srcp + x); | ||
const Vec16f noise = Vec16f().load(pNW + x); | ||
(src + noise).store_nt(dstp + x); | ||
} | ||
} | ||
|
||
srcp += stride; | ||
dstp += stride; | ||
pNW += d->nStride[noisePlane]; | ||
} | ||
} | ||
|
||
template void updateFrame_avx512<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_avx512<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_avx512<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#ifdef ADDGRAIN_X86 | ||
#include "AddGrain.h" | ||
|
||
template<typename pixel_t, typename noise_t> | ||
void updateFrame_sse2(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, | ||
const AddGrainData * const VS_RESTRICT d) noexcept { | ||
const pixel_t * srcp = reinterpret_cast<const pixel_t *>(_srcp); | ||
pixel_t * dstp = reinterpret_cast<pixel_t *>(_dstp); | ||
const noise_t * pNW = reinterpret_cast<noise_t *>(d->pN[noisePlane]) + noiseOffs; | ||
|
||
for (int y = 0; y < height; y++) { | ||
for (int x = 0; x < width; x += d->step) { | ||
if constexpr (std::is_same_v<pixel_t, uint8_t>) { | ||
const Vec16c sign = 0x80; | ||
Vec16c src = Vec16c().load_a(srcp + x); | ||
const Vec16c noise = Vec16c().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
src.store_nt(dstp + x); | ||
} else if constexpr (std::is_same_v<pixel_t, uint16_t>) { | ||
const Vec8s sign = 0x8000; | ||
Vec8s src = Vec8s().load_a(srcp + x); | ||
const Vec8s noise = Vec8s().load(pNW + x); | ||
src ^= sign; | ||
src = add_saturated(src, noise); | ||
src ^= sign; | ||
min(Vec8us(src), d->peak).store_nt(dstp + x); | ||
} else { | ||
Vec4f src = Vec4f().load_a(srcp + x); | ||
const Vec4f noise = Vec4f().load(pNW + x); | ||
(src + noise).store_nt(dstp + x); | ||
} | ||
} | ||
|
||
srcp += stride; | ||
dstp += stride; | ||
pNW += d->nStride[noisePlane]; | ||
} | ||
} | ||
|
||
template void updateFrame_sse2<uint8_t, int8_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_sse2<uint16_t, int16_t>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
template void updateFrame_sse2<float, float>(const void * _srcp, void * _dstp, const int width, const int height, const int stride, const int noisePlane, const int noiseOffs, const AddGrainData * const VS_RESTRICT d) noexcept; | ||
#endif |
Oops, something went wrong.