Skip to content

Commit

Permalink
[VM] [Hexagon] Introduce 2D Discontiguous vtcm alloc tensor
Browse files Browse the repository at this point in the history
Adds a 2D discontiguous alloc-tensor Hexagon builtin to support 2D
allocations for Hexagon at the Relax level. This is needed when ops are
implemented to take advantage of 2D indirections, and it enables
memory-manager optimizations that try to utilize VTCM memory efficiently.

This patch also introduces the `R.vm.copy_tensor` op to support copies
between different tensors; it is specifically planned to be used when copying
tensors from one memory scope to another.

Co-authored-by: arangasa <[email protected]>
  • Loading branch information
quic-sanirudh and arangasa committed Feb 12, 2024
1 parent 2d9c6c4 commit 37683e4
Show file tree
Hide file tree
Showing 14 changed files with 657 additions and 24 deletions.
3 changes: 3 additions & 0 deletions include/tvm/runtime/memory/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class StorageObj : public Object {
/*! \brief The index into the VM function table. */
Buffer buffer;

/* \brief Common function to create an NDArray container with the provided offset, shape and dtype */
NDArray::Container* CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief Allocate an NDArray from a given piece of storage. */
NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

Expand Down
2 changes: 1 addition & 1 deletion python/tvm/relax/op/vm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
# under the License.
"""Relax vm primitives."""

from .vm import alloc_storage, alloc_tensor, call_tir_dyn, kill_object
from .vm import alloc_storage, alloc_tensor, call_tir_dyn, copy_tensor, kill_object
20 changes: 20 additions & 0 deletions python/tvm/relax/op/vm/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,23 @@ def call_tir_dyn(func: Expr, args: Tuple) -> Call:
args = Tuple(args)

return _ffi_api.call_tir_dyn(func, args) # type: ignore


@args_converter.auto
def copy_tensor(src: Expr, dst: Expr) -> Call:
    """Construct a call that copies the contents of one tensor into another.

    Parameters
    ----------
    src : Expr
        The tensor to read from.

    dst : Expr
        The tensor written to; after execution it holds a copy of ``src``.

    Returns
    -------
    result : Call
        A relax Call which performs the copy when executed.
    """
    return _ffi_api.copy_tensor(src, dst)  # type: ignore
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
Expand Down Expand Up @@ -361,6 +363,16 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg);
}

void EmitCopyTensor(const Call& call_node, RegName dst_reg) {
ICHECK_EQ(call_node->args.size(), 2);
std::vector<Instruction::Arg> args;
args.reserve(2);
for (Expr arg : call_node->args) {
args.push_back(this->VisitExpr(arg));
}
builder_->EmitCall("vm.builtin.copy_tensor", args, dst_reg);
}

RegName EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
Instruction::Arg arg = this->VisitExpr(call_node->args[0]);
Expand Down Expand Up @@ -422,6 +434,7 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_op_ = Op::Get("relax.vm.copy_tensor");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
Expand Down
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm_tir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
Expand Down Expand Up @@ -414,6 +416,16 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg);
}

// Lower a relax.vm.copy_tensor call into a "vm.builtin.copy_tensor" packed
// call in the TIR-based codegen.  The builtin expects (src, dst).
void EmitCopyTensor(const Call& call_node, int64_t dst_reg) {
  ICHECK_EQ(call_node->args.size(), 2);
  Array<PrimExpr> args;
  args.reserve(call_node->args.size());
  for (size_t i = 0; i < call_node->args.size(); ++i) {
    // VisitExpr returns an Optional; arguments of a lowered VM op are always
    // representable here, hence the unconditional .value().
    args.push_back(this->VisitExpr(call_node->args[i]).value());
  }
  this->EmitCallPacked("vm.builtin.copy_tensor", args, dst_reg);
}

int64_t EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
PrimExpr arg = this->VisitExpr(call_node->args[0]).value();
Expand Down Expand Up @@ -519,6 +531,7 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_op_ = Op::Get("relax.vm.copy_tensor");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
Expand Down
16 changes: 16 additions & 0 deletions src/relax/op/op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,22 @@ Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm d

TVM_REGISTER_GLOBAL("relax.op.vm.alloc_tensor").set_body_typed(MakeVMAllocTensor);

// vm copy_tensor

// Registered with TVM_REGISTER_OP for consistency with the neighboring VM op
// registrations (e.g. relax.vm.kill_object below); RELAY_REGISTER_OP is a
// legacy alias of the same macro.
TVM_REGISTER_OP("relax.vm.copy_tensor")
    .set_num_inputs(2)
    .add_argument("src", "Expr", "The tensor to copy from")
    .add_argument("dst", "Expr", "The tensor to copy to")
    .set_attr<FInferStructInfo>("FInferStructInfo", ReturnVoidStructInfo)
    // NOTE(review): FPurity(true) mirrors the original registration even though
    // the copy mutates dst; this relies on VM-level ops being past effect
    // analysis at this stage -- confirm against the other relax.vm.* ops.
    .set_attr<Bool>("FPurity", Bool(true));

/*!
 * \brief Construct a relax.vm.copy_tensor call copying src's contents into dst.
 * \param src The source tensor expression.
 * \param dst The destination tensor expression.
 * \return A Call expression that performs the copy and returns void.
 */
Expr MakeVMCopyTensor(Expr src, Expr dst) {
  static const Op& op = Op::Get("relax.vm.copy_tensor");
  return Call(op, {src, dst}, Attrs(), {});
}

TVM_REGISTER_GLOBAL("relax.op.vm.copy_tensor").set_body_typed(MakeVMCopyTensor);

// vm kill_object

TVM_REGISTER_OP("relax.vm.kill_object")
Expand Down
4 changes: 4 additions & 0 deletions src/runtime/hexagon/hexagon_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ struct BufferSet {
size_t region_size_bytes;
};

/*!
 * \brief Copy \p bytes_to_copy bytes from one (possibly multi-region) buffer
 *        set into another, spanning region boundaries on either side.
 * \param dest Destination region set.
 * \param src Source region set.
 * \param bytes_to_copy Total number of bytes to transfer.
 * \param src_is_hexbuff Whether \p src is backed by a HexagonBuffer
 *        (NOTE(review): presumed from the name -- confirm at the definition).
 * \param dest_is_hexbuff Whether \p dest is backed by a HexagonBuffer.
 */
void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src,
                                        size_t bytes_to_copy, bool src_is_hexbuff,
                                        bool dest_is_hexbuff);

} // namespace hexagon
} // namespace runtime
} // namespace tvm
Expand Down
79 changes: 58 additions & 21 deletions src/runtime/hexagon/hexagon_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
#include <cstring>

#include "../workspace_pool.h"
#include "hexagon_buffer.h"
#include "hexagon_common.h"
#include "qurt_memory.h"

namespace tvm {
namespace runtime {
Expand Down Expand Up @@ -91,23 +93,29 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

void* base_ptr;
PhysicalShape physical_shape;
if (ndim == 0) {
// Allocate storage for a single scalar value.
return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
physical_shape = {1, 1, typesize};
} else if (ndim == 1) {
// Allocate a single, contiguous memory region.
size_t nbytes = shape[0] * typesize;
return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
physical_shape = {1, 1, nbytes};
} else if (ndim == 2) {
// Allocate the region(s) needed for Hexagon's indirect-tensor format.
size_t nallocs = shape[0];
size_t nbytes = shape[1] * typesize;
return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment,
mem_scope);
base_ptr =
runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
physical_shape = {2, nallocs, nbytes};
} else {
return nullptr; // unreachable
}
SetPhysicalShape(base_ptr, physical_shape);
return base_ptr;
}

void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
Expand All @@ -121,7 +129,10 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";
return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
void* base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
PhysicalShape physical_shape = {1, 1, nbytes};
SetPhysicalShape(base_ptr, physical_shape);
return base_ptr;
}

void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
Expand All @@ -134,6 +145,7 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
// occur in the normal course of shutdown, log a message and continue.
DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr;
}
ndarray_physical_shape.erase(ptr);
}

// WorkSpace: runtime allocations for Hexagon
Expand All @@ -157,6 +169,10 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
}

// Return the raw base address of a tensor's payload.  Note byte_offset is NOT
// applied here; callers CHECK it to be zero.
void* get_data_start(DLTensor* tensor) {
  return static_cast<uint8_t*>(tensor->data);
}

void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
CHECK_EQ(from->byte_offset, 0);
CHECK_EQ(to->byte_offset, 0);
Expand All @@ -165,23 +181,44 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
<< "HexagonDeviceAPI::CopyDataFromTo before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* {
return runtime_hexbuffs->FindHexagonBuffer(ptr);
};
auto numBytes = GetDataSize(*from);

size_t FlatShape = 1;
for (auto i = 0; i < from->ndim; ++i) FlatShape *= from->shape[i];

PhysicalShape source_shape = {1, 1, FlatShape};
PhysicalShape dest_shape = {1, 1, FlatShape};
auto it1 = ndarray_physical_shape.find(from->data);
if (it1 != ndarray_physical_shape.end()) source_shape = it1->second;
size_t src_rank = source_shape.ndim;
void* src_start = get_data_start(from);
void* dst_start = get_data_start(to);
BufferSet src((src_rank == 1) ? &(src_start) : static_cast<void**>(src_start),
source_shape.nblocks, numBytes / source_shape.nblocks);
auto it2 = ndarray_physical_shape.find(to->data);
if (it2 != ndarray_physical_shape.end()) dest_shape = it2->second;
size_t dest_rank = dest_shape.ndim;
BufferSet dest((dest_rank == 1) ? &(dst_start) : static_cast<void**>(dst_start),
dest_shape.nblocks, numBytes / dest_shape.nblocks);
hexagon_buffer_copy_across_regions(dest, src, numBytes, (it1 != ndarray_physical_shape.end()),
(it2 != ndarray_physical_shape.end()));
return;
}

HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);
HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data);
/*!
 * \brief Record the physical (allocation-level) shape of a tensor's backing
 *        buffer, keyed by the tensor's base data pointer.
 * NOTE(review): shape[0] and shape[1] are read unconditionally, so this
 * overload assumes ndim >= 2 (the 2D indirect layout) -- confirm that callers
 * never pass a smaller ndim.
 */
void HexagonDeviceAPI::SetPhysicalShape(const DLTensor* tensor, const int64_t ndim,
                                        const int64_t* shape) {
  PhysicalShape physical_shape = {static_cast<size_t>(ndim), static_cast<size_t>(shape[0]),
                                  static_cast<size_t>(shape[1])};
  SetPhysicalShape(tensor->data, physical_shape);
}

if (hex_from_buf && hex_to_buf) {
hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from));
} else if (hex_to_buf) {
hex_to_buf->CopyFrom(from->data, GetDataSize(*from));
} else if (hex_from_buf) {
hex_from_buf->CopyTo(to->data, GetDataSize(*to));
} else {
CHECK(false) << "CopyDataFromTo requested between src and dst which are not managed by the "
"hexagon device api.";
}
/*!
 * \brief Insert or overwrite the physical-shape record for a buffer base
 *        address in the ndarray_physical_shape map.
 * \param data Base pointer of the allocation, used as the map key.
 * \param physical_shape Layout descriptor to associate with \p data.
 */
void HexagonDeviceAPI::SetPhysicalShape(const void* data, const PhysicalShape& physical_shape) {
  // operator[] already performs insert-or-assign semantics in a single lookup,
  // replacing the original find() followed by a second lookup via assignment
  // or insert().
  ndarray_physical_shape[const_cast<void*>(data)] = physical_shape;
}

void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to,
Expand Down
19 changes: 19 additions & 0 deletions src/runtime/hexagon/hexagon_device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ namespace tvm {
namespace runtime {
namespace hexagon {

/*!
 * \brief Physical layout descriptor for a Hexagon allocation.
 *
 * Describes how a logical tensor is laid out in device memory: as a single
 * contiguous region (ndim == 1) or as nblocks discontiguous blocks of
 * block_size bytes each (ndim == 2, the indirect-tensor format).
 * Members are zero-initialized so a default-constructed instance never
 * carries indeterminate values.
 */
struct PhysicalShape {
  size_t ndim{0};        //!< Physical rank: 1 = flat region, 2 = blocked layout.
  size_t nblocks{0};     //!< Number of physically separate blocks.
  size_t block_size{0};  //!< Size of each block in bytes.
};

/*!
* \brief Hexagon Device API that is compiled and run on Hexagon.
*/
Expand Down Expand Up @@ -148,6 +154,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
*/
void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;

/*!
* \brief set physical shape of tensor
*/
void SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, const int64_t* shape);

HexagonThreadManager* ThreadManager() {
CHECK(runtime_threads) << "runtime_threads has not been created";
return runtime_threads.get();
Expand Down Expand Up @@ -178,6 +189,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU);
}

/*!
* \brief set physical shape of tensor - private helper
*/
void SetPhysicalShape(const void* data, const PhysicalShape&);

//! \brief Manages runtime HexagonBuffer allocations
// runtime_hexbuffs is used for runtime allocations. It is created with a call to
// AcquireResources, and destroyed on ReleaseResources. The buffers in this manager are scoped
Expand All @@ -199,6 +215,9 @@ class HexagonDeviceAPI final : public DeviceAPI {

//! \brief Hexagon power manager
std::unique_ptr<HexagonPowerManager> runtime_power_manager;

//! \brief NDArray base -> Physical Shape map
std::map<void*, PhysicalShape> ndarray_physical_shape;
};
} // namespace hexagon
} // namespace runtime
Expand Down
9 changes: 7 additions & 2 deletions src/runtime/memory/memory_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
return align;
}

NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) {
NDArray::Container* StorageObj::CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype) {
VerifyDataType(dtype);

// crtical zone: allocate header, cannot throw
Expand All @@ -92,7 +92,6 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt
container->dl_tensor.byte_offset = offset;

container->SetDeleter(StorageObj::Deleter);
size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor);
this->IncRef();
// The manager context pointer must continue to point to the storage object
// which owns the backing memory, and keeps track of the reference count.
Expand All @@ -101,6 +100,12 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt
// reference count, then destroy the container, but leave the underlying
// buffer intact.
container->manager_ctx = reinterpret_cast<void*>(this);
return container;
}

NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) {
auto* container = CreateNDArrayContainer(offset, shape, dtype);
size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor);

if (this->buffer.device.device_type == kDLHexagon) {
// For Hexagon, non-zero offset support simply requires adjusting the
Expand Down
4 changes: 4 additions & 0 deletions src/runtime/relax_vm/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.reshape").set_body_typed([](NDArray data, ShapeT
return data.CreateView(new_shape, data->dtype);
});

// Runtime builtin backing relax.vm.copy_tensor: copy the contents of one
// NDArray into another (compatibility of shape/dtype is enforced by
// NDArray::CopyFrom itself).
TVM_REGISTER_GLOBAL("vm.builtin.copy_tensor")
    .set_body_typed([](NDArray src, NDArray dst) { dst.CopyFrom(src); });

TVM_REGISTER_GLOBAL("vm.builtin.null_value").set_body([](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.size(), 0);
*rv = nullptr;
Expand Down
Loading

0 comments on commit 37683e4

Please sign in to comment.