Skip to content

Commit

Permalink
#22 implement Pooling1D
Browse files Browse the repository at this point in the history
  • Loading branch information
fthielke committed May 24, 2024
1 parent dffa5cc commit 85bd44b
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 6 deletions.
196 changes: 192 additions & 4 deletions Src/CompiledNN/CompiledNN/Operations/Pooling1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,159 @@ namespace NeuralNetwork
}
}

/**
 * Emits the code that pools one output cell (all channels) from the input
 * window that starts at the address in zsi and writes the result to the
 * address in zdi. Afterwards zdi is advanced by one cell (channels floats);
 * zsi is left untouched, advancing it is the caller's job.
 *
 * @param a The assembler the instructions are emitted to.
 * @param padding Number of kernel positions of this window that lie in the
 *                padding. Only the first (kernelSize - padding) positions
 *                are read from memory.
 * @param channels Number of channels (floats) per cell.
 * @param helperRegInitialized State shared between successive calls: set once
 *                the helper register has been claimed (all channels fit into
 *                a single register batch), so that later calls do not emit
 *                the setup again.
 */
void Pooling1DCompiler::pool(x86::Assembler& a, const unsigned int padding, const unsigned int channels, bool& helperRegInitialized) const
{
  const unsigned int xmmRegs = settings.xmmRegs();
  const bool aligned = channels % 4 == 0;
  const bool isPadded = padding > 0;
  const bool paddedMax = isPadded && p.method == PoolingMethod::max;
  const bool scaleAverage = p.method == PoolingMethod::average && p.kernelSize > 1;

  // Padded max pooling needs one register for the zero vector, the unaligned
  // path needs at least one scratch register for its movups loads.
  const unsigned int regsPerStep = aligned && !paddedMax ? xmmRegs : xmmRegs - 1;
  const unsigned int helperRegIndex = aligned ? xmmRegs - 1 : xmmRegs - 2;
  const x86::Xmm helperReg = x86::xmm(helperRegIndex);

  // The helper register is only claimed if all channels fit into one batch.
  const unsigned int totalSteps = (channels + 3) / 4;
  const bool fitsInOneStep = totalSteps < (aligned ? xmmRegs : xmmRegs - 1);

  // In the unaligned path the registers
  //   [totalSteps, totalSteps + min(totalSteps, xmmRegs - totalSteps))
  // are used as scratch for the loaded inputs. If that range reaches the
  // helper register, a value cached there (the averaging factor) would be
  // overwritten with input data, so it must not be kept in the register.
  // This depends only on channels and the register count, hence every call
  // for the same layer comes to the same decision.
  const bool helperRegPreserved = aligned
                                  || (fitsInOneStep && totalSteps + std::min(totalSteps, xmmRegs - totalSteps) <= helperRegIndex);

  if(!helperRegInitialized && fitsInOneStep)
  {
    if(paddedMax)
      a.xorps(helperReg, helperReg);
    else if(scaleAverage && helperRegPreserved)
      a.movaps(helperReg, x86::ptr(constants.back().label));

    helperRegInitialized = true;
  }

  for(unsigned int channelOffset = 0; channelOffset < channels; channelOffset += 4 * regsPerStep)
  {
    const unsigned int processedChannels = std::min(regsPerStep * 4, channels - channelOffset);
    const unsigned int stepSize = (processedChannels + 3) / 4;

    // Apply filter: the first kernel position initializes the accumulators
    // xmm0..xmm(stepSize - 1), every further one is combined into them.
    for(unsigned int filterIndex = 0; filterIndex < p.kernelSize - padding; filterIndex++)
    {
      unsigned int offset = (filterIndex * channels + channelOffset) * sizeof(float);
      if(filterIndex == 0)
      {
        for(unsigned int step = 0; step < stepSize; step++)
        {
          if(aligned)
            a.movaps(x86::xmm(step), a.ptr_zsi(offset));
          else
            a.movups(x86::xmm(step), a.ptr_zsi(offset));
          offset += 4 * sizeof(float);
        }
      }
      else if(aligned)
      {
        // Aligned data can be used directly as memory operand.
        for(unsigned int step = 0; step < stepSize; step++)
        {
          if(p.method == PoolingMethod::average)
            a.addps(x86::xmm(step), a.ptr_zsi(offset));
          else // p.method == PoolingMethod::max
            a.maxps(x86::xmm(step), a.ptr_zsi(offset));
          offset += 4 * sizeof(float);
        }
      }
      else
      {
        // Unaligned data has to be loaded with movups into the registers
        // behind the accumulators first. They are filled in groups of
        // helperCount and then combined into the matching accumulators.
        const unsigned int helperOffset = stepSize;
        const unsigned int helperCount = xmmRegs - stepSize;
        unsigned int helper = 0;
        for(unsigned int step = 0; step < stepSize;)
        {
          a.movups(x86::xmm(helperOffset + helper), a.ptr_zsi(offset));
          step++;
          offset += 4 * sizeof(float);
          helper++;

          if(helper == helperCount)
          {
            for(helper = 0; helper < helperCount; helper++)
            {
              if(p.method == PoolingMethod::average)
                a.addps(x86::xmm(step - helperCount + helper), x86::xmm(helperOffset + helper));
              else // p.method == PoolingMethod::max
                a.maxps(x86::xmm(step - helperCount + helper), x86::xmm(helperOffset + helper));
            }

            helper = 0;
          }
        }

        // Combine the remaining, partially filled group.
        for(unsigned int i = 0; i < helper; i++)
        {
          if(p.method == PoolingMethod::average)
            a.addps(x86::xmm(stepSize - (helper - i)), x86::xmm(helperOffset + i));
          else // p.method == PoolingMethod::max
            a.maxps(x86::xmm(stepSize - (helper - i)), x86::xmm(helperOffset + i));
        }
      }
    }

    if(paddedMax)
    {
      // Padded positions count as 0 for max pooling. The register is zeroed
      // again here because it may have been used as scratch register above.
      const x86::Xmm zeroReg = helperRegInitialized ? helperReg : x86::xmm(xmmRegs - 1);
      a.xorps(zeroReg, zeroReg);
      for(unsigned int step = 0; step < stepSize; step++)
        a.maxps(x86::xmm(step), zeroReg);
    }
    if(scaleAverage)
    {
      // Scale the sums with the layer constant (presumably 1 / kernelSize,
      // it is defined outside of this function). The cached copy in the
      // helper register is only used if it cannot have been overwritten.
      const bool factorInRegister = helperRegInitialized && helperRegPreserved;
      for(unsigned int step = 0; step < stepSize; step++)
      {
        if(factorInRegister)
          a.mulps(x86::xmm(step), helperReg);
        else
          a.mulps(x86::xmm(step), x86::ptr(constants.back().label));
      }
    }

    // Store results
    for(unsigned int step = 0; step < stepSize; step++)
    {
      if(aligned)
        a.movaps(a.ptr_zdi((channelOffset + step * 4) * sizeof(float)), x86::xmm(step));
      else
        a.movups(a.ptr_zdi((channelOffset + step * 4) * sizeof(float)), x86::xmm(step));
    }
  }

  // Advance the output pointer to the next cell.
  a.add(a.zdi(), imm(channels * sizeof(float)));
}

void Pooling1DCompiler::compile(x86::Assembler& a, ActivationFunctionHandler&, const TensorPointerXf& input, const TensorPointerXf& output) const
{
ASSERT(input.rank() == 2);
ASSERT(output.rank() == 2);
const unsigned int inputWidth = input.dims(0);
const unsigned int outputWidth = output.dims(0);
const unsigned int channels = input.dims(1);

if(p.kernelSize <= 1 && p.stride <= 1)
return;

// Calculate padding (cf. https://github.com/eigenteam/eigen-git-mirror/blob/master/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h#L262)
const bool validPadding = p.padding == PaddingType::valid;
//const unsigned int paddingLeft = validPadding ? 0 : ((output.dims(0) - 1) * p.stride + p.kernelSize - input.dims(0)) / 2;
const unsigned int paddingLeft = validPadding ? 0 : ((outputWidth - 1) * p.stride + p.kernelSize - inputWidth) / 2;
if(validPadding)
ASSERT(output.dims(0) == (input.dims(0) - p.kernelSize + p.stride) / p.stride);
ASSERT(outputWidth == (inputWidth - p.kernelSize + p.stride) / p.stride);
else
ASSERT(output.dims(0) == (input.dims(0) + p.stride - 1) / p.stride);
ASSERT(outputWidth == (inputWidth + p.stride - 1) / p.stride);

// Load input/output base addresses
a.mov(a.zsi(), imm(input.data()));
Expand All @@ -44,7 +182,57 @@ namespace NeuralNetwork
else
a.mov(a.zdi(), imm(output.data()));

FAIL("Not implemented");
bool helperRegInitialized = false;

// Pool left-padded cells
unsigned int inputCol = 0;
unsigned int outputCol = 0;
for(; inputCol < paddingLeft; inputCol += p.stride, outputCol++)
{
pool(a, paddingLeft - inputCol, channels, helperRegInitialized);
}
if(inputCol > paddingLeft)
{
a.add(a.zsi(), imm((inputCol - paddingLeft) * channels * sizeof(float)));
}

// Calculate number of non-padded cols
unsigned int nonPaddedCols = 0;
for(; inputCol < paddingLeft + inputWidth - p.kernelSize + 1; inputCol += p.stride, outputCol++, nonPaddedCols++);

if(nonPaddedCols)
{
// Begin loop over image cols
Label inputColLoop;
if(nonPaddedCols > 1)
{
a.mov(a.zcx(), imm(nonPaddedCols));
inputColLoop = a.newLabel();
a.bind(inputColLoop);
}

// Pool current cell
pool(a, 0, channels, helperRegInitialized);

// Set input offset to next column, respecting the stride
a.add(a.zsi(), imm(p.stride * channels * sizeof(float)));

// End loop over image cols
if(nonPaddedCols > 1)
{
a.dec(a.zcx());
a.jnz(inputColLoop);
}
}

// Pool right-padded cells
for(; outputCol < outputWidth; inputCol += p.stride, outputCol++)
{
pool(a, inputCol + p.kernelSize - (paddingLeft + inputWidth), channels, helperRegInitialized);

if(outputCol < outputWidth - 1)
a.add(a.zsi(), imm(p.stride * channels * sizeof(float)));
}
}
}
}
9 changes: 7 additions & 2 deletions Src/CompiledNN/CompiledNN/Operations/Pooling1D.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ namespace NeuralNetwork
/**
 * Calculates the dimensions of the output tensor for a given input.
 *
 * With valid padding the first dimension is first reduced by
 * (kernelSize - 1); in both padding modes it is then divided by the stride,
 * rounding up. The second dimension (channels) is passed through unchanged.
 *
 * @param inputDimensions The dimensions of the input, must have exactly two entries.
 * @return The two dimensions of the output.
 */
inline std::vector<unsigned int> calcOutputDimensions(const std::vector<unsigned int>& inputDimensions) const override
{
  ASSERT(inputDimensions.size() == 2);
  return {{
    (inputDimensions[0] - (p.padding == PaddingType::valid ? p.kernelSize - 1 : 0) + p.stride - 1) / p.stride,
    inputDimensions[1]
  }};
}

private:
void pool(x86::Assembler& a, const unsigned int padding, const unsigned int channels, bool& helperRegInitialized) const;
};
}
}

0 comments on commit 85bd44b

Please sign in to comment.