Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[aievec] Migrate aievec:UPDOp to aievec AIE1 dialect #1673

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.td
Original file line number Diff line number Diff line change
Expand Up @@ -329,4 +329,33 @@ def AIEVecAIE1_ExtOp:
}];
}

// AIE1 `upd` operation.
//
// Operands and attributes, as declared below:
//   - $source:  any shaped value, addressed through $indices.
//   - $indices: variadic list of `index` values.
//   - $offset:  32-bit integer attribute, defaults to 0.
//   - $index:   8-bit integer attribute constrained to the range [0, 1],
//               defaults to 0.
//   - $vector:  optional vector operand.
// The op produces a single vector result. Because it has both a variadic and
// an optional operand group, it carries the AttrSizedOperandSegments trait.
def AIEVecAIE1_UPDOp:
AIEVecAIE1_Op<"upd", [
Pure,
AttrSizedOperandSegments
]>,
Arguments<(ins AnyShaped:$source,
Variadic<Index>:$indices,
DefaultValuedAttr<AIEI32Attr, "0">:$offset,
DefaultValuedAttr<ConfinedAttr<AIEI8Attr,
[IntMinValue<0>, IntMaxValue<1>]>, "0">:$index,
Optional<AnyVector>:$vector)>,
Results<(outs AnyVector:$result)> {
let summary = "AIE upd";
let description = [{
AMD-specific update intrinsic. General upd intrinsic updates contiguous
lanes of the result vector from a smaller source vector. This form of
upd intrinsic combines the load of data from memory into a vector
register, and then updating the lanes of the result vector using it.
`$result = upd($source[$indices], $offset, $index)`
}];
// Convenience builder for an `upd` without the optional $vector operand: it
// forwards to the full builder and passes nullptr for $vector.
let builders = [
OpBuilder<(ins "mlir::Type":$resultType, "mlir::Value":$source,
"mlir::ValueRange":$indices,
"int32_t":$offset, "int8_t":$index),
[{build($_builder, $_state, resultType, source, indices,
offset, index, nullptr);}]>
];
}

#endif // AIEVEC_AIE1_OPS
128 changes: 0 additions & 128 deletions lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1128,133 +1128,6 @@ class SRSOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::SRSOp> {
}
};

// Lowers aievec::UPDOp to the LLVM dialect.
//
// Result vectors of at most 256 bits are lowered to a single LLVM load from
// the address computed from the source memref and indices. Wider result
// vectors load half of the result lanes and merge them into a destination
// vector through a call to an `llvm.aie.upd.*` intrinsic; the destination is
// either the op's optional `vector` operand or the result of a call to an
// `llvm.aie.<type>.undef` intrinsic.
class UPDOpConversion : public mlir::ConvertOpToLLVMPattern<aievec::UPDOp> {
public:
  using ConvertOpToLLVMPattern<aievec::UPDOp>::ConvertOpToLLVMPattern;

  // Builds the name of the upd intrinsic for `op`:
  //   llvm.aie.upd.<v|w|x>.<result vector type string>.<lo|hi>
  // where the size letter is 'v' for a 128-bit load, 'w' for a 256-bit load
  // and 'x' otherwise, and the suffix is "lo" when the op's index is 0 and
  // "hi" otherwise.
  static std::string getIntrinsicName(aievec::UPDOp op, int loadSize) {
    auto resultType = cast<VectorType>(op.getResult().getType());
    std::stringstream ss;
    ss << "llvm.aie.upd.";
    ss << (loadSize == 128 ? 'v' : loadSize == 256 ? 'w' : 'x') << ".";
    ss << getVectorTypeString(resultType) << ".";
    // The index affects which intrinsic to call
    ss << (op.getIndex() == 0 ? "lo" : "hi");
    return ss.str();
  }

  LogicalResult
  matchAndRewrite(aievec::UPDOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto module = op->getParentOfType<ModuleOp>();
    auto loc = op->getLoc();
    auto resultType = cast<VectorType>(op.getResult().getType());
    auto sourceType = cast<MemRefType>(op.getSource().getType());

    // A bit more complicated: load the vector, then update the result vector.
    // AIE1 is capable of 128-bit loads on one bank and 256-bit loads on
    // even-odd banks. Identify the size of the update.
    int vecSizeInBits = getVectorSizeInBits(resultType);

    // Look at the indices to calculate the address.
    auto ptr = this->getStridedElementPtr(loc, sourceType, adaptor.getSource(),
                                          adaptor.getIndices(), rewriter);

    // TODO: handle the offset field

    // Both lowering paths load through the same casted pointer, so build it
    // once up front.
    auto vectorPtrType = LLVM::LLVMPointerType::get(
        getContext(), sourceType.getMemorySpaceAsInt());
    auto castedPtr = rewriter.create<LLVM::BitcastOp>(loc, vectorPtrType, ptr);

    if (vecSizeInBits <= 256) {
      // Total <=256-bit updates are much simpler: we can do a direct load
      // into the vector register.
      rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, resultType, castedPtr, 1);
      return success();
    }

    // Total >256-bit updates will require upd ops to fill the whole vector;
    // each UPD op represents one of these loads and updates.

    // Determine the load size. Only sizes above 256 bits reach this point.
    // TODO: no examples of 1024-bit output vectors: doesn't feel right
    // to attempt a 512-bit load to do an update like this
    const int loadSize = vecSizeInBits == 512 ? 256 : 512;

    // Create a vector type for the load proper: half of the final result
    // vector.
    int lanes = getVectorLaneSize(resultType);
    auto loadType = VectorType::get({static_cast<int64_t>(lanes) / 2},
                                    resultType.getElementType());

    // Load the vector.
    auto loadValue =
        rewriter.create<LLVM::LoadOp>(loc, loadType, castedPtr, 1);

    // Look up the upd intrinsic declaration, creating it if necessary.
    Type updArgTypes[] = {resultType, loadType};
    auto updFunc =
        getOrInsertIntrinsic(module, rewriter, getIntrinsicName(op, loadSize),
                             resultType, updArgTypes);

    // Determine what the destination is: an existing destination vector if
    // the op has one, otherwise an undefined vector.
    Value destValue = adaptor.getVector();
    if (!destValue) {
      // TODO: determine if the undef intrinsic is needed or if an LLVM
      // undef suffices:
      //   destValue = rewriter.create<LLVM::UndefOp>(loc, resultType);
      std::stringstream ss;
      ss << "llvm.aie." << getVectorTypeString(resultType) << ".undef";
      auto undefFunc =
          getOrInsertIntrinsic(module, rewriter, ss.str(), resultType, {});
      destValue = rewriter.create<LLVM::CallOp>(loc, undefFunc, ValueRange{})
                      ->getOpResult(0);
    }

    // Create our call.
    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
        op, updFunc, ValueRange{destValue, loadValue});
    return success();
  }

private:
  // Returns the LLVM function named `name` from `module`. If no such symbol
  // exists, a declaration with signature `resultType(argTypes...)` is created
  // at the start of the module body; the rewriter's insertion point is
  // restored before returning.
  static LLVM::LLVMFuncOp
  getOrInsertIntrinsic(ModuleOp module, ConversionPatternRewriter &rewriter,
                       llvm::StringRef name, Type resultType,
                       llvm::ArrayRef<Type> argTypes) {
    auto func = module.lookupSymbol<LLVM::LLVMFuncOp>(
        StringAttr::get(rewriter.getContext(), name));
    if (func)
      return func;

    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(module.getBody());
    return rewriter.create<LLVM::LLVMFuncOp>(
        rewriter.getUnknownLoc(), name,
        LLVM::LLVMFunctionType::get(resultType, argTypes));
  }
};

class ConcatOpConversion
: public mlir::ConvertOpToLLVMPattern<aievec::ConcatOp> {
public:
Expand Down Expand Up @@ -2330,7 +2203,6 @@ void populateAIEVecToLLVMConversionPatterns(
MulOpConversion,
UPSOpConversion,
SRSOpConversion,
UPDOpConversion,
ConcatOpConversion,
ExtOpConversion,
SelectOpConversion,
Expand Down
98 changes: 98 additions & 0 deletions lib/Dialect/AIEVec/IR/AIE1/AIEVecAIE1Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,104 @@ ParseResult ExtOp::parse(OpAsmParser &parser, OperationState &result) {
return parser.addTypeToList(resultType, result.types);
}

//===----------------------------------------------------------------------===//
// UPDOp
//===----------------------------------------------------------------------===//

// Print UPD op in its custom form:
//   source `[` indices `]` (`,` vector)? attr-dict `:` source-type `,`
//   result-type
void UPDOp::print(OpAsmPrinter &p) {
  // Source memref followed by its bracketed index list.
  p << " " << getSource();
  p << "[" << getIndices() << "]";

  // Optional vector that links upd idx=1 with idx=0.
  if (auto linkedVector = getVector())
    p << ", " << linkedVector;

  // Attribute dictionary, leaving out the operand segment sizes.
  SmallVector<StringRef, 3> hiddenAttrs = {UPDOp::getOperandSegmentSizeAttr()};
  p.printOptionalAttrDict((*this)->getAttrs(), hiddenAttrs);

  // Trailing types: source first, then result.
  p << " : " << getSource().getType();
  p << ", " << getResult().getType();
}

// Verify UPD op: the source must be a memref, the result must be a vector, at
// least one index must be given, and a linked vector operand (if present)
// must have the same type as the result.
LogicalResult UPDOp::verify() {
  if (!llvm::isa<MemRefType>(getSource().getType()))
    return emitError("requires memref type");

  auto resultVecTy = llvm::dyn_cast<VectorType>(getResult().getType());
  if (!resultVecTy)
    return emitError("requires vector type");

  if (getIndices().empty())
    return emitError("upd source cannot come from scalar value");

  // When this UPD op is chained to another UPD op, the incoming vector and
  // the produced vector have to agree on type.
  if (auto linkedVector = getVector()) {
    auto linkedVecTy = llvm::dyn_cast<VectorType>(linkedVector.getType());
    if (linkedVecTy != resultVecTy)
      return emitError("result types of linked UPD ops do not match");
  }

  return success();
}

// Parse UPD op from its custom form:
//   source `[` indices `]` (`,` vector)? attr-dict `:` memref-type `,`
//   vector-type
// Exactly two attributes and exactly two types are required.
ParseResult UPDOp::parse(OpAsmParser &parser, OperationState &result) {
  OpAsmParser::UnresolvedOperand memOperand, linkedOperand;
  SmallVector<OpAsmParser::UnresolvedOperand, 8> indexOperands;

  // Source operand and its bracketed index list.
  if (parser.parseOperand(memOperand))
    return failure();
  if (parser.parseOperandList(indexOperands, OpAsmParser::Delimiter::Square))
    return failure();

  // A comma announces the optional linked vector operand.
  const bool hasLinkedVector = parser.parseOptionalComma().succeeded();
  if (hasLinkedVector && parser.parseOperand(linkedOperand))
    return failure();

  // Attribute dictionary, then the colon-prefixed type list. The location
  // just before the types is remembered for diagnostics.
  if (parser.parseOptionalAttrDict(result.attributes))
    return failure();
  llvm::SMLoc typesLoc = parser.getCurrentLocation();
  SmallVector<Type, 2> parsedTypes;
  if (parser.parseColonTypeList(parsedTypes))
    return failure();

  if (result.attributes.getAttrs().size() != 2)
    return parser.emitError(typesLoc, "requires two attributes");

  // There must be two types: memref source and vector result.
  if (parsedTypes.size() != 2)
    return parser.emitError(typesLoc, "requires two types");

  auto sourceTy = llvm::dyn_cast<MemRefType>(parsedTypes[0]);
  if (!sourceTy)
    return parser.emitError(typesLoc, "requires memref type");
  auto resultTy = llvm::dyn_cast<VectorType>(parsedTypes[1]);
  if (!resultTy)
    return parser.emitError(typesLoc, "requires vector type");

  // Resolve operands in declaration order: source, indices, optional vector.
  auto &builder = parser.getBuilder();
  if (parser.resolveOperand(memOperand, sourceTy, result.operands))
    return failure();
  if (parser.resolveOperands(indexOperands, builder.getIndexType(),
                             result.operands))
    return failure();
  if (hasLinkedVector &&
      parser.resolveOperand(linkedOperand, resultTy, result.operands))
    return failure();

  // Record how many operands belong to each operand group.
  auto segmentSizes = builder.getDenseI32ArrayAttr(
      {1, static_cast<int32_t>(indexOperands.size()),
       static_cast<int32_t>(hasLinkedVector)});
  result.addAttribute(UPDOp::getOperandSegmentSizeAttr(), segmentSizes);

  return parser.addTypeToList(resultTy, result.types);
}

} // namespace xilinx::aievec::aie1

// #define GET_ATTRDEF_CLASSES
Expand Down
33 changes: 18 additions & 15 deletions lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -966,11 +966,11 @@ static Operation *generateMulOp(T mulOp, AIEOpAttributes &opAttr,
// subsumed by the same interval. The updOps will have to be inserted at the
// head of region if the region has multiple blocks, or closer to the readOp
// otherwise.
static aievec::UPDOp
generateUPDOp(TransferReadOp readOp,
mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
std::pair<aievec::UPDOp, int8_t>> &memToUpdMap,
Region &region, VectState *state) {
static aievec::aie1::UPDOp generateUPDOp(
TransferReadOp readOp,
mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
std::pair<aievec::aie1::UPDOp, int8_t>> &memToUpdMap,
Region &region, VectState *state) {
// Get the read access extent and interval of this read operation
IntervalReuse *iv = state->getIntervalForOperation(readOp);
auto extent = iv->getAccessExtent(readOp);
Expand Down Expand Up @@ -1007,7 +1007,7 @@ generateUPDOp(TransferReadOp readOp,
: mid;

// Find if we have already created upd op idx=0/idx=1 for this interval
aievec::UPDOp updOp = nullptr;
aievec::aie1::UPDOp updOp = nullptr;
// initial value 0 of updIndices means neither upd op idx=0 nor idx=1 were
// created.
int8_t updIndices = 0;
Expand Down Expand Up @@ -1064,7 +1064,7 @@ generateUPDOp(TransferReadOp readOp,
if (lb <= start && ub >= end && (updIndices & idx) == 0) {
// Generate the upd instruction, and link it with a previous upd op
// corresponding to the same read.
updOp = state->builder.create<aievec::UPDOp>(
updOp = state->builder.create<aievec::aie1::UPDOp>(
readOp.getLoc(), updVecType, readOp.getSource(), indices,
start - offset, idx - 1,
updOp ? updOp.getResult() : TypedValue<VectorType>(nullptr));
Expand Down Expand Up @@ -2146,8 +2146,8 @@ static bool canFuseMulFMAOpsForInt16(Operation *Op) {
}

// Check 6. The def of two operands are upd operations
auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
auto rUpdOp = dyn_cast<aievec::UPDOp>(rhs.getDefiningOp());
auto lUpdOp = dyn_cast<aievec::aie1::UPDOp>(lhs.getDefiningOp());
auto rUpdOp = dyn_cast<aievec::aie1::UPDOp>(rhs.getDefiningOp());

if (!lUpdOp || !rUpdOp) {
return false;
Expand Down Expand Up @@ -2179,17 +2179,19 @@ static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) {
// lhs of current FMAOp should be an upd operation with 512-bit vector width.
// For AIE-ML, we can directly load 512 bits vectors. Thus, we can delete the
// upd operation with index 1.
auto lUpdOp = dyn_cast<aievec::UPDOp>(lhs.getDefiningOp());
auto lUpdOp = dyn_cast<aievec::aie1::UPDOp>(lhs.getDefiningOp());
if (lUpdOp.getIndex() == 1) {
auto lUpdOp0 = dyn_cast<aievec::UPDOp>(lUpdOp.getVector().getDefiningOp());
auto lUpdOp0 =
dyn_cast<aievec::aie1::UPDOp>(lUpdOp.getVector().getDefiningOp());
lUpdOp->replaceAllUsesWith(lUpdOp0);
lUpdOp->erase();
}

// 2. Deal with the rhs:
// Since vector size of current FMAOp rhs is 256 bits, we need to generate a
// concat op to make the vector size to 512 bits.
auto rUpdOp = dyn_cast<aievec::UPDOp>(curOp->getOperand(1).getDefiningOp());
auto rUpdOp =
dyn_cast<aievec::aie1::UPDOp>(curOp->getOperand(1).getDefiningOp());
state->builder.setInsertionPointAfter(rUpdOp);
AIEVecAttributes rstat = getOperandVecStats(curOp, state, 1);
assert(rstat.vecSizeInBits % 256 == 0);
Expand Down Expand Up @@ -2434,16 +2436,17 @@ static void insertUPDOpsInLoop(affine::AffineForOp forOp, VectState *state) {
// achieving that. The value also has an 8-bit field, whose first/second bit
// is set if upd op idx=0/idx=1 is already created for this interval.
mlir::DenseMap<std::tuple<IntervalReuse *, int32_t, int32_t>,
std::pair<aievec::UPDOp, int8_t>>
std::pair<aievec::aie1::UPDOp, int8_t>>
memToUpdMap;
// A map from a read operation to its corresponding UPD operation. The idea
// is that multiple read ops will derive from the same bigger vector
// register.
mlir::DenseMap<Operation *, aievec::UPDOp> readOpToUpdMap;
mlir::DenseMap<Operation *, aievec::aie1::UPDOp> readOpToUpdMap;
// Iterate over all the transfer_read ops within this loop
Region &region = forOp.getRegion();
for (TransferReadOp readOp : region.getOps<TransferReadOp>()) {
aievec::UPDOp updOp = generateUPDOp(readOp, memToUpdMap, region, state);
aievec::aie1::UPDOp updOp =
generateUPDOp(readOp, memToUpdMap, region, state);
readOpToUpdMap[readOp] = updOp;
}

Expand Down
Loading
Loading