Making execution region results queue-ordered allocas.
We don't currently insert deallocas and don't track live ranges, but that
can come in the future as we support more control flow. For now this
at least gets all of the common allocations within an invocation into
the queue-ordered bucket so that we can perform proper async execution and
use native queue-ordered functionality (e.g. stream-ordered allocations
in CUDA).

With this change the caching allocator is no longer needed for CUDA
in almost all cases.
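
For readers unfamiliar with the term: "stream-ordered allocations in CUDA" refers to the CUDA runtime's asynchronous allocator (cudaMallocAsync/cudaFreeAsync, available since CUDA 11.2), where allocation and free are enqueued on a stream just like kernels and copies. The queue-ordered stream.resource.alloca/dealloca ops this change emits lower to hal.device.queue.alloca/dealloca, which can map onto that kind of API. A minimal host-side C++ sketch of the CUDA pattern, illustrative only and not part of this change:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // The allocation is ordered on the stream: work enqueued after this call
  // on the same stream may use the buffer, with no device-wide sync.
  void *buffer = nullptr;
  if (cudaMallocAsync(&buffer, 1 << 20, stream) != cudaSuccess) {
    std::fprintf(stderr, "cudaMallocAsync failed\n");
    return 1;
  }

  // ... enqueue kernels/copies that use `buffer` on `stream` ...

  // The free is also queue-ordered; the pool can recycle the memory for
  // later allocations on the stream without host synchronization.
  cudaFreeAsync(buffer, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}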
benvanik authored and raikonenfnu committed Oct 4, 2023
1 parent cffe727 commit 8827906
Showing 21 changed files with 262 additions and 315 deletions.
@@ -332,26 +332,19 @@ struct ResourceAllocOpPattern
lookupAllocatorAndQueueAffinityFor(allocOp, rewriter);
auto bufferType = rewriter.getType<IREE::HAL::BufferType>();

SmallVector<Value> results;
for (auto [resourceResult, storageSize] :
llvm::zip_equal(allocOp.getResults(), allocOp.getStorageSizes())) {
auto resourceType =
llvm::cast<IREE::Stream::ResourceType>(resourceResult.getType());

auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None;
auto bufferUsage = IREE::HAL::BufferUsageBitfield::None;
if (failed(deriveAllowedResourceBufferBits(allocOp.getLoc(), resourceType,
memoryTypes, bufferUsage))) {
return failure();
}
auto resourceType =
cast<IREE::Stream::ResourceType>(allocOp.getResult().getType());

auto allocateOp = rewriter.create<IREE::HAL::AllocatorAllocateOp>(
allocOp.getLoc(), bufferType, allocator, queueAffinity, memoryTypes,
bufferUsage, storageSize);
results.push_back(allocateOp.getResult());
auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None;
auto bufferUsage = IREE::HAL::BufferUsageBitfield::None;
if (failed(deriveAllowedResourceBufferBits(allocOp.getLoc(), resourceType,
memoryTypes, bufferUsage))) {
return failure();
}

rewriter.replaceOp(allocOp, results);
rewriter.replaceOpWithNewOp<IREE::HAL::AllocatorAllocateOp>(
allocOp, bufferType, allocator, queueAffinity, memoryTypes, bufferUsage,
adaptor.getStorageSize());
return success();
}
};
@@ -367,16 +360,14 @@ struct ResourceAllocaOpPattern
lookupDeviceAndQueueAffinityFor(allocaOp, rewriter);
auto bufferType = rewriter.getType<IREE::HAL::BufferType>();

// Transient allocations are device-local. Copies are required to get their
// contents back on the host/another device.
auto memoryTypes = IREE::HAL::MemoryTypeBitfield::DeviceLocal;

// TODO(benvanik): refine usage.
// We know by construction that transient buffers are not host visible and
// as such can only be used for device commands. We should be able to more
// closely limit to just dispatch or transfer though.
auto bufferUsage = IREE::HAL::BufferUsageBitfield::Transfer |
IREE::HAL::BufferUsageBitfield::DispatchStorage;
auto resourceType =
cast<IREE::Stream::ResourceType>(allocaOp.getResult().getType());
auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None;
auto bufferUsage = IREE::HAL::BufferUsageBitfield::None;
if (failed(deriveAllowedResourceBufferBits(loc, resourceType, memoryTypes,
bufferUsage))) {
return failure();
}

// Gather wait/signal fence, which are optional.
Value waitFence =
@@ -1,25 +1,21 @@
// RUN: iree-opt --split-input-file --iree-hal-conversion %s | FileCheck %s

// CHECK-LABEL: @resourceAlloc
func.func @resourceAlloc(%arg0: index, %arg1: index) -> (!stream.resource<transient>, !stream.resource<transient>) {
func.func @resourceAlloc(%arg0: index) -> !stream.resource<transient> {
// CHECK: %[[RET0:.+]] = hal.allocator.allocate
// CHECK-SAME: type("DeviceVisible|DeviceLocal")
// CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
// CHECK-SAME: : !hal.buffer{%arg0}
// CHECK-NEXT: %[[RET1:.+]] = hal.allocator.allocate
// CHECK-SAME: type("DeviceVisible|DeviceLocal")
// CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
// CHECK-SAME: : !hal.buffer{%arg1}
%0:2 = stream.resource.alloc uninitialized : !stream.resource<transient>{%arg0}, !stream.resource<transient>{%arg1}
// CHECK: return %[[RET0]], %[[RET1]]
return %0#0, %0#1 : !stream.resource<transient>, !stream.resource<transient>
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%arg0}
// CHECK: return %[[RET0]]
return %0 : !stream.resource<transient>
}

// -----

// CHECK-LABEL: @resourceAlloca
// CHECK-SAME: (%[[SIZE:.+]]: index)
func.func @resourceAlloca(%size: index) -> (!stream.resource<staging>, !stream.timepoint) {
func.func @resourceAlloca(%size: index) -> (!stream.resource<transient>, !stream.timepoint) {
// CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence
// CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
// CHECK: %[[RET0:.+]] = hal.device.queue.alloca
@@ -30,16 +26,16 @@ func.func @resourceAlloca(%size: index) -> (!stream.resource<staging>, !stream.t
// CHECK-SAME: type("DeviceVisible|DeviceLocal")
// CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
// CHECK-SAME: : !hal.buffer{%[[SIZE]]}
%0:2 = stream.resource.alloca uninitialized : !stream.resource<staging>{%size} => !stream.timepoint
%0:2 = stream.resource.alloca uninitialized : !stream.resource<transient>{%size} => !stream.timepoint
// CHECK: return %[[RET0]], %[[SIGNAL_FENCE]]
return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
}

// -----

// CHECK-LABEL: @resourceAllocaAwait
// CHECK-SAME: (%[[SIZE:.+]]: index, %[[WAIT_FENCE:.+]]: !hal.fence)
func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<staging>, !stream.timepoint) {
func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<transient>, !stream.timepoint) {
// CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
// CHECK: %[[RET0:.+]] = hal.device.queue.alloca
// CHECK-SAME: affinity(%c-1
@@ -49,24 +45,24 @@ func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint
// CHECK-SAME: type("DeviceVisible|DeviceLocal")
// CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
// CHECK-SAME: : !hal.buffer{%[[SIZE]]}
%0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource<staging>{%size} => !stream.timepoint
%0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource<transient>{%size} => !stream.timepoint
// CHECK: return %[[RET0]], %[[SIGNAL_FENCE]]
return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
}

// -----

// CHECK-LABEL: @resourceDealloca
// CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer)
func.func @resourceDealloca(%size: index, %resource: !stream.resource<staging>) -> !stream.timepoint {
func.func @resourceDealloca(%size: index, %resource: !stream.resource<transient>) -> !stream.timepoint {
// CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence
// CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
// CHECK: hal.device.queue.dealloca
// CHECK-SAME: affinity(%c-1
// CHECK-SAME: wait(%[[WAIT_FENCE]])
// CHECK-SAME: signal(%[[SIGNAL_FENCE]])
// CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer)
%0 = stream.resource.dealloca %resource : !stream.resource<staging>{%size} => !stream.timepoint
%0 = stream.resource.dealloca %resource : !stream.resource<transient>{%size} => !stream.timepoint
// CHECK: return %[[SIGNAL_FENCE]]
return %0 : !stream.timepoint
}
@@ -77,14 +73,14 @@ func.func @resourceDealloca(%size: index, %resource: !stream.resource<staging>)

// CHECK-LABEL: @resourceDeallocaAwait
// CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer, %[[WAIT_FENCE:.+]]: !hal.fence)
func.func @resourceDeallocaAwait(%size: index, %resource: !stream.resource<staging>, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
func.func @resourceDeallocaAwait(%size: index, %resource: !stream.resource<transient>, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
// CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
// CHECK: hal.device.queue.dealloca
// CHECK-SAME: affinity(%c-1
// CHECK-SAME: wait(%[[WAIT_FENCE]])
// CHECK-SAME: signal(%[[SIGNAL_FENCE]])
// CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer)
%0 = stream.resource.dealloca await(%await_timepoint) => %resource : !stream.resource<staging>{%size} => !stream.timepoint
%0 = stream.resource.dealloca await(%await_timepoint) => %resource : !stream.resource<transient>{%size} => !stream.timepoint
// CHECK: return %[[SIGNAL_FENCE]]
return %0 : !stream.timepoint
}
@@ -161,11 +161,9 @@ class MaterializeDispatchInstrumentationPass
OpBuilder::atBlockBegin(initializerOp.addEntryBlock());
Value bufferSize =
initializerBuilder.create<arith::ConstantOp>(loc, bufferSizeAttr);
Value buffer = initializerBuilder
.create<IREE::Stream::ResourceAllocOp>(
loc, globalOp.getType(), bufferSize,
/*uninitialized=*/true, /*affinity=*/nullptr)
.getResult(0);
Value buffer = initializerBuilder.create<IREE::Stream::ResourceAllocOp>(
loc, globalOp.getType(), bufferSize,
/*uninitialized=*/true, /*affinity=*/nullptr);
initializerBuilder.create<IREE::Util::GlobalStoreOp>(loc, buffer,
globalOp);
initializerBuilder.create<IREE::Util::InitializerReturnOp>(loc);
126 changes: 114 additions & 12 deletions compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -615,22 +615,124 @@ static void printWorkgroupCountRegion(OpAsmPrinter &p, Operation *op,
// stream.resource.alloc
//===----------------------------------------------------------------------===//

LogicalResult ResourceAllocOp::verify() {
ResourceAllocOp op = *this;
if (failed(verifyOpValueSizes(op, op.getResults(), op.getStorageSizes()))) {
return failure();
// static
std::pair<IREE::Stream::ResourceAllocOp, SmallVector<Value>>
ResourceAllocOp::createSuballocations(
Type resourceType, ArrayRef<Location> locs, ValueRange storageSizes,
bool uninitialized, AffinityAttr affinityAttr, OpBuilder &builder) {
assert(locs.size() == storageSizes.size() &&
"expect locs and storageSizes to match");
if (locs.empty())
return {};
if (locs.size() == 1) {
auto allocOp = builder.create<IREE::Stream::ResourceAllocOp>(
locs.front(), resourceType, storageSizes.front(), uninitialized,
affinityAttr);
return {allocOp, {allocOp.getResult()}};
}
auto fusedLoc = builder.getFusedLoc(locs);

// All allocated resources must have the same lifetime.
auto anyType = op.getResults().front().getType();
for (auto type : op.getResultTypes()) {
if (type != anyType) {
return op.emitError()
<< "all allocated resources must have the same lifetime";
}
// NOTE: this is risky: we are assuming right now that all of the
// allocations will fit within the constraints of the system. This is not
// guaranteed: a very low maximum buffer range may lead to packed slabs
// that are not fully addressable. For now we are processing models with
// small enough workloads and our target devices are relatively lax on
// things so long as we stay under UINT32_MAX boundaries.

// All slices are 0-0 (overlapping).
size_t sliceCount = locs.size();
SmallVector<int64_t> lifetimeIntervals(sliceCount * 2, 0);

// Compute total size and the offsets of all suballocated resources via the
// pack op.
auto indexType = builder.getIndexType();
SmallVector<Type> packedOffsetTypes(sliceCount, indexType);
auto packOp = builder.create<IREE::Stream::ResourcePackOp>(
fusedLoc, indexType, packedOffsetTypes, /*offset=*/nullptr,
builder.getIndexArrayAttr(lifetimeIntervals), storageSizes, affinityAttr);

// Create the new alloca based on the total required size.
auto allocOp = builder.create<IREE::Stream::ResourceAllocOp>(
fusedLoc, resourceType, packOp.getTotalLength(), uninitialized,
affinityAttr);
auto slab = allocOp.getResult();
auto slabSize = packOp.getTotalLength();

// Create subviews for all of the suballocated resources.
SmallVector<Value> results;
for (auto [loc, subviewOffset, subviewLength] :
llvm::zip_equal(locs, packOp.getPackedOffsets(), storageSizes)) {
results.push_back(builder
.create<IREE::Stream::ResourceSubviewOp>(
loc, slab, slabSize, subviewOffset, subviewLength)
.getResult());
}
return {allocOp, results};
}

return success();
//===----------------------------------------------------------------------===//
// stream.resource.alloca
//===----------------------------------------------------------------------===//

// static
std::pair<IREE::Stream::ResourceAllocaOp, SmallVector<Value>>
ResourceAllocaOp::createSuballocations(Type timepointType, Type resourceType,
ArrayRef<Location> locs,
ValueRange storageSizes,
Value awaitTimepoint,
AffinityAttr affinityAttr,
OpBuilder &builder) {
assert(locs.size() == storageSizes.size() &&
"expect locs and storageSizes to match");
if (locs.empty())
return {};
if (locs.size() == 1) {
auto allocaOp = builder.create<IREE::Stream::ResourceAllocaOp>(
locs.front(), resourceType, timepointType, storageSizes.front(),
awaitTimepoint, affinityAttr);
return {allocaOp, {allocaOp.getResult()}};
}
auto fusedLoc = builder.getFusedLoc(locs);

// NOTE: this is risky: we are assuming right now that all of the
// allocations will fit within the constraints of the system. This is not
// guaranteed: a very low maximum buffer range may lead to packed slabs
// that are not fully addressable. For now we are processing models with
// small enough workloads and our target devices are relatively lax on
// things so long as we stay under UINT32_MAX boundaries. If a user starts
// hitting this the solution is to do in-place outputs such that we don't
// need to allocate them; when possible that's always going to be better than
// leaving them to the IREE compiled program to deal with.

// All slices are 0-0 (overlapping).
size_t sliceCount = locs.size();
SmallVector<int64_t> lifetimeIntervals(sliceCount * 2, 0);

// Compute total size and the offsets of all suballocated resources via the
// pack op.
auto indexType = builder.getIndexType();
SmallVector<Type> packedOffsetTypes(sliceCount, indexType);
auto packOp = builder.create<IREE::Stream::ResourcePackOp>(
fusedLoc, indexType, packedOffsetTypes, /*offset=*/nullptr,
builder.getIndexArrayAttr(lifetimeIntervals), storageSizes, affinityAttr);

// Create the new alloca based on the total required size.
auto allocaOp = builder.create<IREE::Stream::ResourceAllocaOp>(
fusedLoc, resourceType, timepointType, packOp.getTotalLength(),
awaitTimepoint, affinityAttr);
auto slab = allocaOp.getResult();
auto slabSize = packOp.getTotalLength();

// Create subviews for all of the suballocated resources.
SmallVector<Value> results;
for (auto [loc, subviewOffset, subviewLength] :
llvm::zip_equal(locs, packOp.getPackedOffsets(), storageSizes)) {
results.push_back(builder
.create<IREE::Stream::ResourceSubviewOp>(
loc, slab, slabSize, subviewOffset, subviewLength)
.getResult());
}
return {allocaOp, results};
}

//===----------------------------------------------------------------------===//
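
The createSuballocations helpers added above are the entry points callers use to collapse several per-result allocations into one packed, queue-ordered allocation plus per-result subviews. A rough usage sketch follows; only the helper's signature comes from this diff, while the wrapper function, namespace qualification, and parameter names are hypothetical:

// Hypothetical wrapper (not in this commit): allocate all results of an
// execution region with a single queue-ordered alloca and return one value
// per original result.
#include "iree/compiler/Dialect/Stream/IR/StreamOps.h"

using namespace mlir;
using namespace mlir::iree_compiler;

static SmallVector<Value> allocateRegionResults(
    Type timepointType, Type resourceType, ArrayRef<Location> locs,
    ValueRange storageSizes, Value awaitTimepoint,
    IREE::Stream::AffinityAttr affinityAttr, OpBuilder &builder) {
  // Single result: the alloca itself is returned. Multiple results: a
  // stream.resource.pack computes offsets, one alloca covers the packed total
  // size, and each returned value is a stream.resource.subview of that slab.
  auto [allocaOp, suballocations] =
      IREE::Stream::ResourceAllocaOp::createSuballocations(
          timepointType, resourceType, locs, storageSizes, awaitTimepoint,
          affinityAttr, builder);
  (void)allocaOp; // callers can chain later queue work on its signal timepoint
  return suballocations;
}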