Skip to content

Commit

Permalink
#22 implement ZeroPadding1D
Browse files — browse the repository at this point in the history
  • Loading branch information
fthielke committed May 24, 2024
1 parent 79a51e6 commit dffa5cc
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Src/CompiledNN/CompiledNN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ namespace NeuralNetwork
ASSERT(kernelSize.size() == strides.size());

std::vector<unsigned int> padding(kernelSize.size());
for(std::size_t i=0;i<padding.size();i++)
for(std::size_t i = 0; i < padding.size(); i++)
{
const unsigned int remainder = node.inputDimensions[0][i] % strides[i];
padding[i] = std::max<int>(0, static_cast<int>(kernelSize[i]) - (remainder ? remainder : strides[i]));
Expand Down
154 changes: 150 additions & 4 deletions Src/CompiledNN/CompiledNN/Operations/ZeroPadding1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,167 @@ namespace NeuralNetwork
{
namespace CompiledNNImpl
{
void ZeroPadding1DCompiler::compile(x86::Assembler&, ActivationFunctionHandler&, const TensorPointerXf& input, const TensorPointerXf& output) const
int ZeroPadding1DCompiler::copyLoopPacked(x86::Assembler& a, const int size, const int numRegs, const bool inputAligned, const bool outputAligned) const
{
// Emits SSE code that copies floats in chunks of numRegs * 4, moving
// backwards through memory: zsi/zdi are expected to point one past the end
// of the source/destination regions and are decremented as the copy
// proceeds. Returns the number of floats (< numRegs * 4) that remain to be
// copied by other means; emits nothing if size is smaller than one chunk.
const int stepSize = numRegs * 4;
if(size < stepSize || numRegs < 1)
return size;

const int numIterations = size / stepSize;

// A counted loop (counter in zcx) is only emitted when more than one
// iteration is needed; a single iteration becomes straight-line code.
Label loop;
if(numIterations > 1)
{
loop = a.newLabel();
a.mov(a.zcx(), imm(numIterations));
a.bind(loop);
}

// All loads of a chunk are emitted before any store, so a chunk whose
// source and destination ranges overlap is still copied correctly.
// Offsets are negative because the pointers sit past the chunk's end.
for(int i = 0; i < numRegs; i++)
{
if(inputAligned)
a.movaps(x86::xmm(i), a.ptr_zsi(((i * 4) - stepSize) * sizeof(float)));
else
a.movups(x86::xmm(i), a.ptr_zsi(((i * 4) - stepSize) * sizeof(float)));
}
for(int i = 0; i < numRegs; i++)
{
if(outputAligned)
a.movaps(a.ptr_zdi(((i * 4) - stepSize) * sizeof(float)), x86::xmm(i));
else
a.movups(a.ptr_zdi(((i * 4) - stepSize) * sizeof(float)), x86::xmm(i));
}

// Step both pointers down past the chunk just copied.
a.sub(a.zsi(), imm(stepSize * sizeof(float)));
a.sub(a.zdi(), imm(stepSize * sizeof(float)));

if(numIterations > 1)
{
a.dec(a.zcx());
a.jnz(loop);
}

return size % stepSize;
}

void ZeroPadding1DCompiler::copyLoopSingle(x86::Assembler& a, const int size) const
{
  // Emits scalar moves for the last `size` floats below [zsi], writing them
  // to the corresponding slots below [zdi]. Every load is emitted before any
  // store, so overlapping source/destination regions are handled safely.
  int reg = 0;
  while(reg < size)
  {
    a.movss(x86::xmm(reg), a.ptr_zsi((reg - size) * sizeof(float)));
    ++reg;
  }
  reg = 0;
  while(reg < size)
  {
    a.movss(a.ptr_zdi((reg - size) * sizeof(float)), x86::xmm(reg));
    ++reg;
  }
}

int ZeroPadding1DCompiler::zeroLoopPacked(x86::Assembler& a, const int size, const int numRegs, const bool aligned, std::vector<bool>& xmmIsZero) const
{
// Emits SSE code that writes zeros forward from [zdi] in chunks of
// numRegs * 4 floats, advancing zdi past each chunk. xmmIsZero records which
// xmm registers already hold zero so redundant pxor instructions are
// skipped; the vector is updated for subsequent calls. Returns the number of
// floats (< numRegs * 4) still left to zero; emits nothing if size is
// smaller than one chunk.
const int stepSize = numRegs * 4;
if(size < stepSize || numRegs < 1)
return size;

const int numIterations = size / stepSize;

// Clear each register used as a zero source once, outside any loop.
for(int i = 0; i < numRegs; i++)
{
if(!xmmIsZero[i])
{
a.pxor(x86::xmm(i), x86::xmm(i));
xmmIsZero[i] = true;
}
}

// A counted loop (counter in zcx) is only emitted when more than one
// iteration is needed; a single iteration becomes straight-line code.
Label loop;
if(numIterations > 1)
{
loop = a.newLabel();
a.mov(a.zcx(), imm(numIterations));
a.bind(loop);
}

for(int i = 0; i < numRegs; i++)
{
if(aligned)
a.movaps(a.ptr_zdi((i * 4) * sizeof(float)), x86::xmm(i));
else
a.movups(a.ptr_zdi((i * 4) * sizeof(float)), x86::xmm(i));
}

// Advance the destination pointer past the chunk just written.
a.add(a.zdi(), imm(stepSize * sizeof(float)));

if(numIterations > 1)
{
a.dec(a.zcx());
a.jnz(loop);
}

return size % stepSize;
}

void ZeroPadding1DCompiler::zeroLoopSingle(x86::Assembler& a, const int size, std::vector<bool>& xmmIsZero) const
{
  // Emits scalar stores that write `size` zero floats starting at [zdi].
  // For each slot, an xmm register already known to hold zero is reused as
  // the source; otherwise xmm0 is cleared once up front and used instead.
  if(size <= 0)
    return;
  if(!xmmIsZero[0])
  {
    a.pxor(x86::xmm(0), x86::xmm(0));
    xmmIsZero[0] = true;
  }
  for(int slot = 0; slot < size; ++slot)
  {
    const int zeroReg = xmmIsZero[slot] ? slot : 0;
    a.movss(a.ptr_zdi(slot * sizeof(float)), x86::xmm(zeroReg));
  }
}

void ZeroPadding1DCompiler::compile(x86::Assembler& a, ActivationFunctionHandler&, const TensorPointerXf& input, const TensorPointerXf& output) const
{
  // Compiles in-place zero padding along the first (width) axis of a rank-2
  // (width, channels) tensor: the existing data is shifted right by the left
  // padding amount, then the left and right borders are filled with zeros.
  // Fixes vs. previous revision: removed the no-op statement `input.size();`
  // and the leftover `FAIL("Not implemented");`, which would have aborted
  // every compilation of this layer.
  ASSERT(input.rank() == 2);
  ASSERT(output.rank() == 2);
  ASSERT(input.dims(0) + p.padding[ZeroPadding1DLayer::LEFT] + p.padding[ZeroPadding1DLayer::RIGHT] == output.dims(0));
  ASSERT(input.dims(1) == output.dims(1));
  ASSERT(input.data() == output.data());

  // Tracks which xmm registers are known to hold zero across the border
  // loops, so pxor is emitted at most once per register.
  std::vector<bool> xmmIsZero(settings.xmmRegs(), false);

  // Number of floats in the left border; also the offset (in floats) by
  // which the existing data must be shifted.
  const unsigned int leftBorderSize = p.padding[ZeroPadding1DLayer::LEFT] * input.dims(1);

  if(p.padding[ZeroPadding1DLayer::LEFT] > 0)
  {
    // Shift the data to its final position. zsi/zdi point one past the end
    // of the source/destination regions because the copy loops run backwards
    // (required since the overlapping destination lies above the source).
    a.mov(a.zsi(), imm(input.data() + input.size()));
    a.mov(a.zdi(), imm(output.data() + (input.size() + leftBorderSize)));

    const bool inputAligned = (input.size() % 4) == 0;
    const bool outputAligned = ((input.size() + leftBorderSize) % 4) == 0;
    int remainingSize = copyLoopPacked(a, input.size(), settings.xmmRegs(), inputAligned, outputAligned);
    if(remainingSize > 0)
    {
      // Tail handling: one smaller packed pass, then scalar moves.
      remainingSize = copyLoopPacked(a, remainingSize, remainingSize / 4, inputAligned, outputAligned);
      if(remainingSize > 0)
        copyLoopSingle(a, remainingSize);
    }

    // Fill the vacated left border with zeros (output.data() is assumed to
    // be 16-byte aligned — TODO confirm against the tensor allocator).
    a.mov(a.zdi(), imm(output.data()));
    remainingSize = zeroLoopPacked(a, leftBorderSize, settings.xmmRegs(), true, xmmIsZero);
    if(remainingSize > 0)
    {
      remainingSize = zeroLoopPacked(a, remainingSize, remainingSize / 4, true, xmmIsZero);
      if(remainingSize > 0)
        zeroLoopSingle(a, remainingSize, xmmIsZero);
    }
  }

  if(p.padding[ZeroPadding1DLayer::RIGHT] > 0)
  {
    // Fill the right border with zeros, starting just after the shifted data.
    a.mov(a.zdi(), imm(output.data() + (input.size() + leftBorderSize)));
    const bool aligned = ((input.size() + leftBorderSize) % 4) == 0;
    int remainingSize = zeroLoopPacked(a, p.padding[ZeroPadding1DLayer::RIGHT] * input.dims(1), settings.xmmRegs(), aligned, xmmIsZero);
    if(remainingSize > 0)
    {
      remainingSize = zeroLoopPacked(a, remainingSize, remainingSize / 4, aligned, xmmIsZero);
      if(remainingSize > 0)
        zeroLoopSingle(a, remainingSize, xmmIsZero);
    }
  }
}
}
}
7 changes: 7 additions & 0 deletions Src/CompiledNN/CompiledNN/Operations/ZeroPadding1D.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include <vector>
#include "../CompiledNNImplBase.h"

namespace NeuralNetwork
Expand Down Expand Up @@ -37,6 +38,12 @@ namespace NeuralNetwork
inputDimensions[1]
}};
}

private:
int copyLoopPacked(x86::Assembler& a, const int size, const int numRegs, const bool inputAligned, const bool outputAligned) const;
void copyLoopSingle(x86::Assembler& a, const int size) const;
int zeroLoopPacked(x86::Assembler& a, const int size, const int numRegs, const bool aligned, std::vector<bool>& xmmIsZero) const;
void zeroLoopSingle(x86::Assembler& a, const int size, std::vector<bool>& xmmIsZero) const;
};
}
}
2 changes: 1 addition & 1 deletion Src/CompiledNN/Formats/KerasHDF5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ namespace NeuralNetwork
std::unique_ptr<Layer> parseZeroPadding1DLayer(const SimpleMap::Record* config, const KerasHDF5::GetWeights2FuncType&, unsigned long)
{
const SimpleMap::Array* padding = getRecordEntry<SimpleMap::Array>(config, "padding");
const std::string dataFormat = getLiteral<std::string>(getRecordEntry<SimpleMap::Literal>(config, "data_format"));
const std::string dataFormat = config->find("data_format") == config->end() ? "channels_last" : getLiteral<std::string>(getRecordEntry<SimpleMap::Literal>(config, "data_format"));

if(dataFormat != "channels_last")
FAIL("Data formats other than channels last are not supported.");
Expand Down
2 changes: 1 addition & 1 deletion Src/CompiledNN/SimpleNN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ namespace NeuralNetwork

std::vector<unsigned int> i(2);

for(i[0] = 0; i[0] < output.dims(1); i[0]++)
for(i[0] = 0; i[0] < output.dims(0); i[0]++)
{
if(i[0] < layer.padding[ZeroPadding1DLayer::LEFT] || output.dims(0) - i[0] <= layer.padding[ZeroPadding1DLayer::RIGHT])
{
Expand Down

0 comments on commit dffa5cc

Please sign in to comment.