Skip to content

Commit

Permalink
systolic-array: Fix bugs for small PE configurations. (#45)
Browse files Browse the repository at this point in the history
systolic-array: Fix bugs for small PE configurations.

To compute a large convolution, we tile the operation into two levels of
"folds". First, the kernels are tiled into (numKerns / peArrayCols) weight
folds (as each PE column will finish a kernel), and within each weight fold,
each output feature map is further tiled into (outputRows * outputCols /
peArrayRows) output folds (as each PE row is responsible for producing a output
channel). Each PE row has a commit unit that collects finished results from the
row and writes the data to the output scratchpad. The current write size is
always set to the line size of the scratchpad, which is not correct for a small
PE column size where the entire row doesn't form a line. This commit fixes this
by taking into account when peArrayCols is smaller than elemsInLine.

Also, this also fixes the incorrect initial tensor iterator place for the
commit units.
  • Loading branch information
yaoyuannnn authored Jan 21, 2022
1 parent 39f50e1 commit d4efbee
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 34 deletions.
72 changes: 40 additions & 32 deletions src/systolic_array/commit.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include <math.h>

#include "systolic_array.h"
#include "commit.h"
#include "activations.h"
Expand Down Expand Up @@ -34,7 +36,7 @@ void Commit::setParams() {
{ 0, 0, 0, 0 },
{ 1, accel.outputRows, accel.outputCols, accel.peArrayCols });
// Move the iterator to the correct starting place.
iter += { 0, 0, 0, accel.lineSize / accel.elemSize * id };
iter += { 0, 0, id, 0 };
// If the iterator reaches the end of the tensor, then this commit unit should
// be left idle through the whole execution.
if (iter.end())
Expand All @@ -56,31 +58,37 @@ void Commit::evaluate() {
return;

// Collect any finished output pixel from the output register of the PEs.
// Since the writeback granularity is a line, if we have collected every
// output pixel forming the line, create a commit request and queue it to the
// commit queue to be sent.
// Since the writeback granularity is a line (or the number of PE columns for
// a small configuration), if we have collected every output pixel in a
// writeback, create a commit request and queue it to the commit queue to be
// sent.
//
// There are two cases where the commit unit will never see some of output
// pixels ready: 1) The commit unit is not used at all, which means the whole
// PE row is left idle. 2) Some of the PE columns are left idle due to a lack
// of weights. In this case, we should do a writeback once all the "active"
// columns have produced outputs.
for (int i = 0; i < inputs.size() / elemsPerLine; i++) {
for (int j = 0; j < elemsPerLine; j++) {
int index = i * elemsPerLine + j;
if (inputs[index]->isWindowEnd()) {
assert(!outputBuffer[index].isWindowEnd() &&

for (int remainingElems = inputs.size(); remainingElems > 0;
remainingElems -= elemsPerLine) {
// Check if we have collected all the pixels for a writeback.
int elemsToWrite = std::min(elemsPerLine, remainingElems);
int start = inputs.size() - remainingElems;
if (isLineComplete(start, elemsToWrite))
queueCommitRequest(start, elemsToWrite);

// Collect any finished output pixels.
for (int i = 0; i < elemsToWrite; i++, start++) {
if (inputs[start]->isWindowEnd()) {
assert(!outputBuffer[start].isWindowEnd() &&
"A new output pixel finished while the previous one from the "
"same PE has not been written back.");
// Collect the output pixel and store it in the local buffer.
outputBuffer[index] = *inputs[index];
DPRINTF(
SystolicCommit, "Collected output data from column %d.\n", index);
outputBuffer[start] = *inputs[start];
DPRINTF(SystolicCommit, "Collected output data from column %d.\n",
start);
}
}
// Check if we have collected all the pixels for a writeback.
if (isLineComplete(i))
queueCommitRequest(i);
}

// Send requests from the commit queue if there are requests waiting
Expand Down Expand Up @@ -111,13 +119,12 @@ void Commit::evaluate() {
}
}

bool Commit::isLineComplete(int lineIndex) {
bool Commit::isLineComplete(int start, int elemsToWrite) {
// Check if every slot in the local output buffer has been filled with
// finished output. We also take the last weight fold into account, where some
// PE columns can be left idle, thus the corresponding slot in the local
// buffer will never see finished data.
for (int i = lineIndex * elemsPerLine; i < (lineIndex + 1) * elemsPerLine;
i++) {
for (int i = start; i < start + elemsToWrite; i++) {
// We have idle PE columns in the last weight fold if the number of weights
// is non-multiples of peArrayCols.
bool haveIdleColumns = remainingWeightFolds == 1 &&
Expand All @@ -138,6 +145,7 @@ void Commit::accumOutputs(float16* currOutputs, float16* prevOutputs) {
}

void Commit::localSpadCallback(PacketPtr pkt) {
assert(pkt->getSize() % accel.elemSize == 0);
DPRINTF(SystolicCommit, "Received response, addr %#x.\n", pkt->getAddr());
CommitSenderState* state = pkt->findNextSenderState<CommitSenderState>();
LineData* lineSlotPtr = state->getCommitQueueSlotPtr();
Expand All @@ -163,14 +171,14 @@ void Commit::localSpadCallback(PacketPtr pkt) {
// If the outputs are finished, do the activation function before we send
// the outputs back to the scratchpad.
activationFunc(lineSlotPtr->getDataPtr<uint8_t>(),
elemsPerLine,
pkt->getSize() / accel.elemSize,
accel.actType,
accel.actParams,
accel.dataType);
}
// Send the write request.
auto req = std::make_shared<Request>(
pkt->getAddr(), accel.lineSize, 0, localSpadMasterId);
pkt->getAddr(), pkt->getSize(), 0, localSpadMasterId);
req->setContext(accel.getContextId());
PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
pkt->dataDynamic(lineSlotPtr->getDataPtr<uint8_t>());
Expand All @@ -184,14 +192,15 @@ void Commit::localSpadCallback(PacketPtr pkt) {
}
}

void Commit::queueCommitRequest(int lineIndex) {
void Commit::queueCommitRequest(int start, int elemsToWrite) {
Addr addr = iter * accel.elemSize;
uint8_t* data = new uint8_t[accel.lineSize]();
int reqSize = elemsToWrite * accel.elemSize;
uint8_t* data = new uint8_t[reqSize];
// Copy data from the buffer for the collected data.
for (int i = 0; i < elemsPerLine; i++) {
if (!outputBuffer[lineIndex * elemsPerLine + i].isBubble()) {
for (int i = 0; i < elemsToWrite; i++) {
if (!outputBuffer[start + i].isBubble()) {
memcpy(&data[i * accel.elemSize],
outputBuffer[lineIndex * elemsPerLine + i].getDataPtr<uint8_t>(),
outputBuffer[start + i].getDataPtr<uint8_t>(),
accel.elemSize);
}
}
Expand All @@ -200,7 +209,7 @@ void Commit::queueCommitRequest(int lineIndex) {
if (accel.accumResults) {
// If we need to accumulate results, read the previous results first.
auto req =
std::make_shared<Request>(addr, accel.lineSize, 0, localSpadMasterId);
std::make_shared<Request>(addr, reqSize, 0, localSpadMasterId);
req->setContext(accel.getContextId());
pkt = new Packet(req, MemCmd::ReadReq);
pkt->allocate();
Expand All @@ -209,13 +218,13 @@ void Commit::queueCommitRequest(int lineIndex) {
if (accel.sendResults) {
// If the outputs are finished, do the activation function before we send
// the outputs back to the scratchpad.
activationFunc(
data, elemsPerLine, accel.actType, accel.actParams, accel.dataType);
activationFunc(data, elemsToWrite, accel.actType, accel.actParams,
accel.dataType);
}
// Directly write to the scratchpad if we don't need to accumulate the
// results.
auto req =
std::make_shared<Request>(addr, accel.lineSize, 0, localSpadMasterId);
std::make_shared<Request>(addr, reqSize, 0, localSpadMasterId);
req->setContext(accel.getContextId());
pkt = new Packet(req, MemCmd::WriteReq);
pkt->dataDynamic(data);
Expand All @@ -234,8 +243,7 @@ void Commit::queueCommitRequest(int lineIndex) {
pkt->pushSenderState(state);

// Clear the line in output buffer.
for (int i = lineIndex * elemsPerLine; i < (lineIndex + 1) * elemsPerLine;
i++) {
for (int i = start; i < start + elemsToWrite; i++) {
outputBuffer[i].clear();
}

Expand All @@ -251,7 +259,7 @@ void Commit::queueCommitRequest(int lineIndex) {
} else {
// Move the iterator to the correct starting place for the next weight
// fold.
iter += { 0, 0, 0, accel.lineSize / accel.elemSize * id };
iter += { 0, 0, id, 0 };
DPRINTF(SystolicCommit, "Advanced iterator to %s.\n", iter);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/systolic_array/commit.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ class Commit : public LocalSpadInterface {
void localSpadCallback(PacketPtr pkt) override;

// Check if we have collected all the output data in the specified line.
bool isLineComplete(int lineIndex);
bool isLineComplete(int start, int elemsToWrite);

// Create a writeback request and queue it to the commit queue.
void queueCommitRequest(int lineIndex);
void queueCommitRequest(int start, int elemsToWrite);

template <typename ElemType>
void accumOutputs(ElemType* currOutputs, ElemType* prevOutputs) {
Expand Down

0 comments on commit d4efbee

Please sign in to comment.