diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 8e4ed4875e631..d759f765b6c8d 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -139,6 +139,10 @@ class StorageObj : public Object { /*! \brief The index into the VM function table. */ Buffer buffer; + /* \brief Common function to create an NDArray container with the provided offset, shape and dtype + */ + NDArray::Container* CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype); + /*! \brief Allocate an NDArray from a given piece of storage. */ NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype); diff --git a/python/tvm/relax/op/vm/__init__.py b/python/tvm/relax/op/vm/__init__.py index e68ecbcebdf13..03821e46d30d4 100644 --- a/python/tvm/relax/op/vm/__init__.py +++ b/python/tvm/relax/op/vm/__init__.py @@ -16,4 +16,4 @@ # under the License. """Relax vm primitives.""" -from .vm import alloc_storage, alloc_tensor, call_tir_dyn, kill_object +from .vm import alloc_storage, alloc_tensor, call_tir_dyn, copy_tensor_from_to, kill_object diff --git a/python/tvm/relax/op/vm/vm.py b/python/tvm/relax/op/vm/vm.py index 3ed6b29648b4f..bdd0aca4fd4f7 100644 --- a/python/tvm/relax/op/vm/vm.py +++ b/python/tvm/relax/op/vm/vm.py @@ -131,3 +131,23 @@ def call_tir_dyn(func: Expr, args: Tuple) -> Call: args = Tuple(args) return _ffi_api.call_tir_dyn(func, args) # type: ignore + + +@args_converter.auto +def copy_tensor_from_to(src: Expr, dst: Expr) -> Call: + """Construct a call to copy one tensor to another. + + Parameters + ---------- + src : Expr + Source tensor for copy. + + dst : Expr + Destination tensor for copy. + + Returns + ------- + result : Call + A relax Call, which performs the copy. + """ + return _ffi_api.copy_tensor_from_to(src, dst) # type: ignore diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index 329da67e84ecd..5023505f5e360 100644 --- a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -169,6 +169,8 @@ class CodeGenVM : public ExprFunctor { EmitAllocStorage(call, dst_reg); } else if (call_node->op == alloc_tensor_op_) { EmitAllocTensor(call, dst_reg); + } else if (call_node->op == copy_tensor_from_to_op_) { + EmitCopyTensor(call, dst_reg); } else if (call_node->op == kill_object_op_) { dst_reg = EmitKillObject(call); } else { @@ -361,6 +363,16 @@ class CodeGenVM : public ExprFunctor { builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg); } + void EmitCopyTensor(const Call& call_node, RegName dst_reg) { + ICHECK_EQ(call_node->args.size(), 2); + std::vector args; + args.reserve(2); + for (Expr arg : call_node->args) { + args.push_back(this->VisitExpr(arg)); + } + builder_->EmitCall("vm.builtin.copy_tensor_from_to", args, dst_reg); + } + RegName EmitKillObject(const Call& call_node) { ICHECK_EQ(call_node->args.size(), 1); Instruction::Arg arg = this->VisitExpr(call_node->args[0]); @@ -430,6 +442,7 @@ class CodeGenVM : public ExprFunctor { /*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. 
*/ const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage"); const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor"); + const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to"); const Op& kill_object_op_ = Op::Get("relax.vm.kill_object"); const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx"); const Op& null_value_op_ = Op::Get("relax.null_value"); diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index ec1678e9e0f31..02c7aabe18cc3 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -238,6 +238,8 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { EmitAllocStorage(call, dst_reg); } else if (call_node->op == alloc_tensor_op_) { EmitAllocTensor(call, dst_reg); + } else if (call_node->op == copy_tensor_from_to_op_) { + EmitCopyTensor(call, dst_reg); } else if (call_node->op == kill_object_op_) { dst_reg = EmitKillObject(call); } else { @@ -414,6 +416,16 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg); } + void EmitCopyTensor(const Call& call_node, int64_t dst_reg) { + ICHECK_EQ(call_node->args.size(), 2); + Array args; + args.reserve(2); + for (Expr arg : call_node->args) { + args.push_back(this->VisitExpr(arg).value()); + } + this->EmitCallPacked("vm.builtin.copy_tensor_from_to", args, dst_reg); + } + int64_t EmitKillObject(const Call& call_node) { ICHECK_EQ(call_node->args.size(), 1); PrimExpr arg = this->VisitExpr(call_node->args[0]).value(); @@ -519,6 +531,7 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { /*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */ const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage"); const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor"); + const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to"); const Op& kill_object_op_ = Op::Get("relax.vm.kill_object"); const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx"); const Op& null_value_op_ = Op::Get("relax.null_value"); diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 489886e50f764..48399662fa0ce 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -1004,6 +1004,22 @@ Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm d TVM_REGISTER_GLOBAL("relax.op.vm.alloc_tensor").set_body_typed(MakeVMAllocTensor); +// vm copy_tensor_from_to + +RELAY_REGISTER_OP("relax.vm.copy_tensor_from_to") + .set_num_inputs(2) + .add_argument("src", "Expr", "The tensor to copy from") + .add_argument("dst", "Expr", "The tensor to copy to") + .set_attr("FInferStructInfo", ReturnVoidStructInfo) + .set_attr("FPurity", Bool(true)); + +Expr MakeVMCopyTensor(Expr src, Expr dst) { + static const Op& op = Op::Get("relax.vm.copy_tensor_from_to"); + return Call(op, {src, dst}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relax.op.vm.copy_tensor_from_to").set_body_typed(MakeVMCopyTensor); + // vm kill_object TVM_REGISTER_OP("relax.vm.kill_object") diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 48afa5770afd1..73afb7c005019 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -233,7 +233,7 @@ std::vector MemoryCopy::MergeAdjacent(std::vector micro_ return macro_copies; } -void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src, +void 
HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src, size_t bytes_to_copy, bool src_is_hexbuff, bool dest_is_hexbuff) { // First, determine all copies that do not cross boundaries in @@ -268,7 +268,7 @@ void HexagonBuffer::CopyTo(void* data, size_t nbytes) const { BufferSet src(allocations_.data(), allocations_.size(), nbytes_per_allocation_); BufferSet dest(&data, 1, nbytes); - hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */, + HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */, false /* dest_is_hexbuff */); } @@ -276,7 +276,7 @@ void HexagonBuffer::CopyFrom(void* data, size_t nbytes) { BufferSet src(&data, 1, nbytes); BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - hexagon_buffer_copy_across_regions(dest, src, nbytes, false /* src_is_hexbuff */, + HexagonBufferCopyAcrossRegions(dest, src, nbytes, false /* src_is_hexbuff */, true /* dest_is_hexbuff */); } @@ -284,7 +284,7 @@ void HexagonBuffer::CopyFrom(const HexagonBuffer& other, size_t nbytes) { BufferSet src(other.allocations_.data(), other.allocations_.size(), other.nbytes_per_allocation_); BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */, + HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */, true /* dest_is_hexbuff */); } diff --git a/src/runtime/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon_buffer.h index 8cb8a32095141..9d11a50bc60ad 100644 --- a/src/runtime/hexagon/hexagon_buffer.h +++ b/src/runtime/hexagon/hexagon_buffer.h @@ -195,6 +195,19 @@ struct BufferSet { size_t region_size_bytes; }; +/** + * @brief Single function to handle copying potentially discontiguous buffers efficiently + * + * @param The destination buffer + * @param The source buffer + * @param Number of bytes to copy. This should be less than both source and dest buffer size + * @param Boolean to specify whether the source is a hexagon buffer + * @param Boolean to specify whether the destination is a hexagon buffer + */ +void HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src, + size_t bytes_to_copy, bool src_is_hexbuff, + bool dest_is_hexbuff); + } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 65162e7cc6d0d..21944fb03d987 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -32,7 +32,9 @@ #include #include "../workspace_pool.h" +#include "hexagon_buffer.h" #include "hexagon_common.h" +#include "qurt_memory.h" namespace tvm { namespace runtime { @@ -91,23 +93,29 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with " << "HexagonDeviceAPI::AllocDataSpace before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - + void* base_ptr; + PhysicalShape physical_shape; if (ndim == 0) { // Allocate storage for a single scalar value. - return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); + base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); + physical_shape = {1, 1, typesize}; } else if (ndim == 1) { // Allocate a single, contiguous memory region. 
size_t nbytes = shape[0] * typesize; - return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); + base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); + physical_shape = {1, 1, nbytes}; } else if (ndim == 2) { // Allocate the region(s) needed for Hexagon's indirect-tensor format. size_t nallocs = shape[0]; size_t nbytes = shape[1] * typesize; - return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, - mem_scope); + base_ptr = + runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope); + physical_shape = {2, nallocs, nbytes}; } else { return nullptr; // unreachable } + SetPhysicalShape(base_ptr, physical_shape); + return base_ptr; } void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, @@ -121,7 +129,10 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with " << "HexagonDeviceAPI::AllocDataSpace before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global")); + void* base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global")); + PhysicalShape physical_shape = {1, 1, nbytes}; + SetPhysicalShape(base_ptr, physical_shape); + return base_ptr; } void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { @@ -134,6 +145,7 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { // occur in the normal course of shutdown, log a message and continue. DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr; } + ndarray_physical_shape.erase(ptr); } // WorkSpace: runtime allocations for Hexagon @@ -157,6 +169,8 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } +void* get_data_start(DLTensor* tensor) { return (reinterpret_cast(tensor->data)); } + void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { CHECK_EQ(from->byte_offset, 0); CHECK_EQ(to->byte_offset, 0); @@ -165,22 +179,44 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan << "HexagonDeviceAPI::CopyDataFromTo before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { - return runtime_hexbuffs->FindHexagonBuffer(ptr); - }; + auto numBytes = GetDataSize(*from); + + size_t FlatShape = 1; + for (auto i = 0; i < from->ndim; ++i) FlatShape *= from->shape[i]; + + PhysicalShape source_shape = {1, 1, FlatShape}; + PhysicalShape dest_shape = {1, 1, FlatShape}; + auto it1 = ndarray_physical_shape.find(from->data); + if (it1 != ndarray_physical_shape.end()) source_shape = it1->second; + size_t src_rank = source_shape.ndim; + void* src_start = get_data_start(from); + void* dst_start = get_data_start(to); + BufferSet src((src_rank == 1) ? &(src_start) : static_cast(src_start), + source_shape.nblocks, numBytes / source_shape.nblocks); + auto it2 = ndarray_physical_shape.find(to->data); + if (it2 != ndarray_physical_shape.end()) dest_shape = it2->second; + size_t dest_rank = dest_shape.ndim; + BufferSet dest((dest_rank == 1) ? 
&(dst_start) : static_cast(dst_start), + dest_shape.nblocks, numBytes / dest_shape.nblocks); + HexagonBufferCopyAcrossRegions(dest, src, numBytes, (it1 != ndarray_physical_shape.end()), + (it2 != ndarray_physical_shape.end())); + return; +} - HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data); - HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data); +void HexagonDeviceAPI::SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, + const int64_t* shape) { + PhysicalShape physical_shape = {static_cast(ndim), static_cast(shape[0]), + static_cast(shape[1])}; + SetPhysicalShape(tensor->data, physical_shape); +} - if (hex_from_buf && hex_to_buf) { - hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from)); - } else if (hex_to_buf) { - hex_to_buf->CopyFrom(from->data, GetDataSize(*from)); - } else if (hex_from_buf) { - hex_from_buf->CopyTo(to->data, GetDataSize(*to)); +void HexagonDeviceAPI::SetPhysicalShape(const void* data, const PhysicalShape& physical_shape) { + auto it = ndarray_physical_shape.find(const_cast(data)); + if (it != ndarray_physical_shape.end()) { + ndarray_physical_shape[const_cast(data)] = physical_shape; } else { - CHECK(false) << "CopyDataFromTo requested between src and dst which are not managed by the " - "hexagon device api."; + ndarray_physical_shape.insert( + std::pair(const_cast(data), physical_shape)); } } diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index c4e87a957ade9..0560e56534f4d 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -22,7 +22,6 @@ #include -#include #include #include #include @@ -40,6 +39,12 @@ namespace tvm { namespace runtime { namespace hexagon { +struct PhysicalShape { + size_t ndim; + size_t nblocks; + size_t block_size; +}; + /*! * \brief Hexagon Device API that is compiled and run on Hexagon. */ @@ -148,6 +153,11 @@ class HexagonDeviceAPI final : public DeviceAPI { */ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; + /*! + * \brief set physical shape of tensor + */ + void SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, const int64_t* shape); + HexagonThreadManager* ThreadManager() { CHECK(runtime_threads) << "runtime_threads has not been created"; return runtime_threads.get(); @@ -178,6 +188,11 @@ class HexagonDeviceAPI final : public DeviceAPI { return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU); } + /*! + * \brief set physical shape of tensor - private helper + */ + void SetPhysicalShape(const void* data, const PhysicalShape&); + //! \brief Manages runtime HexagonBuffer allocations // runtime_hexbuffs is used for runtime allocations. It is created with a call to // AcquireResources, and destroyed on ReleaseResources. The buffers in this manager are scoped @@ -199,6 +214,9 @@ class HexagonDeviceAPI final : public DeviceAPI { //! \brief Hexagon power manager std::unique_ptr runtime_power_manager; + + //! 
\brief NDArray base -> Physical Shape map + std::unordered_map ndarray_physical_shape; }; } // namespace hexagon } // namespace runtime diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 5e3c1ed9e6d4c..aadf227a027f5 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -83,7 +83,8 @@ inline size_t GetDataAlignment(const DLTensor& arr) { return align; } -NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { +NDArray::Container* StorageObj::CreateNDArrayContainer(int64_t offset, ShapeTuple shape, + DLDataType dtype) { VerifyDataType(dtype); // crtical zone: allocate header, cannot throw @@ -92,7 +93,6 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt container->dl_tensor.byte_offset = offset; container->SetDeleter(StorageObj::Deleter); - size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); this->IncRef(); // The manager context pointer must continue to point to the storage object // which owns the backing memory, and keeps track of the reference count. @@ -101,6 +101,12 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt // reference count, then destroy the container, but leave the underlying // buffer intact. container->manager_ctx = reinterpret_cast(this); + return container; +} + +NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { + auto* container = CreateNDArrayContainer(offset, shape, dtype); + size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); if (this->buffer.device.device_type == kDLHexagon) { // For Hexagon, non-zero offset support simply requires adjusting the diff --git a/src/runtime/relax_vm/builtin.cc b/src/runtime/relax_vm/builtin.cc index fb24a3699d87a..9334cbefa5106 100644 --- a/src/runtime/relax_vm/builtin.cc +++ b/src/runtime/relax_vm/builtin.cc @@ -410,6 +410,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.reshape").set_body_typed([](NDArray data, ShapeT return data.CreateView(new_shape, data->dtype); }); +TVM_REGISTER_GLOBAL("vm.builtin.copy_tensor_from_to").set_body_typed([](NDArray src, NDArray dst) { + dst.CopyFrom(src); +}); + TVM_REGISTER_GLOBAL("vm.builtin.null_value").set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_EQ(args.size(), 0); *rv = nullptr; diff --git a/src/runtime/relax_vm/hexagon/builtin.cc b/src/runtime/relax_vm/hexagon/builtin.cc index d18c434193be3..fb0ce4dc512b9 100644 --- a/src/runtime/relax_vm/hexagon/builtin.cc +++ b/src/runtime/relax_vm/hexagon/builtin.cc @@ -57,6 +57,44 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_wait") ICHECK(inflight_dma >= 0); tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id, inflight_dma); }); + +NDArray AllocNDArrayFromOffsets(TVMArgValue vm_ptr, Storage storage, uint64_t offset, + Storage data_storage, ShapeTuple data_storage_offsets, + ShapeTuple logical_shape, ShapeTuple shape_2d, DLDataType dtype) { + auto* storage_obj = storage.operator->(); + + auto* container = storage_obj->CreateNDArrayContainer(offset, logical_shape, dtype); + + size_t needed_size = sizeof(void*) * shape_2d[0]; + auto offset_ptr = + reinterpret_cast(storage_obj->buffer.data) + static_cast(offset); + auto cast_offset_ptr = reinterpret_cast(offset_ptr); + uint8_t* data_base = reinterpret_cast(data_storage->buffer.data); + size_t indx = 0; + for (auto elem : data_storage_offsets) { + cast_offset_ptr[indx] = 
&(data_base[static_cast(elem)]); + indx++; + } + + container->dl_tensor.data = reinterpret_cast(offset_ptr); + container->dl_tensor.byte_offset = 0; + + NDArray ret(GetObjectPtr(container)); + // RAII in effect, now run the check. + + ICHECK((offset + sizeof(void*) * shape_2d[0]) <= storage_obj->buffer.size) + << "storage allocation failure, attempted to allocate " << needed_size << " at offset " + << offset << " in region that is " << storage_obj->buffer.size << "bytes"; + + tvm::runtime::hexagon::HexagonDeviceAPI::Global()->SetPhysicalShape( + ret.operator->(), 2, const_cast(shape_2d.data())); + + return ret; +} + +TVM_REGISTER_GLOBAL("vm.builtin.hexagon.alloc_discontiguous_tensor") + .set_body_typed(AllocNDArrayFromOffsets); + } // namespace relax_vm } // namespace runtime } // namespace tvm diff --git a/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py b/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py new file mode 100644 index 0000000000000..167cb22d6fcc1 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py @@ -0,0 +1,425 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=unused-wildcard-import, invalid-name, missing-docstring,no-self-argument +"""Test Discontiguous allocation for hexagon""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +import tvm.script +import tvm.testing +from tvm import relax +from tvm.script.parser import ir as I +from tvm.script.parser import relax as R +from tvm.script.parser import tir as T + + +@I.ir_module +class Module: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm") + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm") + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm") + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] + B[v_ax0, v_ax1] + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm") + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm") + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm") + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] * B[v_ax0, v_ax1] + + @R.function + def main( + x: R.Tensor((4, 64), "int32"), + y: R.Tensor((4, 64), "int32"), + z: R.Tensor((4, 64), "int32"), + ) -> R.Tensor((4, 64), "int32"): + cls = Module + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + a: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=1024, shape=R.shape([4, 64]), dtype="int32" + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + c: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=2048, shape=R.shape([4, 64]), dtype="int32" + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + d: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + ___1: R.Tuple = R.vm.copy_tensor_from_to(z, d) + e: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=1024, shape=R.shape([4, 64]), dtype="int32" + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, e) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + f: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=2048, shape=R.shape([4, 64]), dtype="int32" + ) + _t13: R.Tuple = R.vm.copy_tensor_from_to(e, f) + _t14: R.Tuple = R.vm.kill_object(e) + ret_val: R.Tensor([4, 64], dtype="int32") = R.builtin.alloc_tensor( + R.shape([4, 64]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(f, ret_val) + _t15: R.Tuple = R.vm.kill_object(f) + _t3: R.Tuple = R.vm.kill_object(vtcm_obj) + lv: R.Tensor([4, 64], dtype="int32") = ret_val + return lv + + +@I.ir_module +class Module_2d: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", 
scope="global.vtcm", axis_separators=[1]) + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] + B[v_ax0, v_ax1] + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] * B[v_ax0, v_ax1] + + @R.function + def main( + x: R.Tensor((4, 64), "int32"), + y: R.Tensor((4, 64), "int32"), + z: R.Tensor((4, 64), "int32"), + ) -> R.Tensor((4, 64), "int32"): + cls = Module_2d + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + global_obj: R.Object = R.vm.alloc_storage( + R.shape([64]), runtime_device_index=0, dtype="uint8", storage_scope="global" + ) + a: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + + c: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([512, 0, 2048, 3840]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + + d: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___1: R.Tuple = R.vm.copy_tensor_from_to(z, d) + vtcm_2d_obj: R.Object = R.vm.alloc_storage( + R.shape([4, 64]), runtime_device_index=0, dtype="int32", storage_scope="global.vtcm" + ) + vtcm_2d_tensor: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_2d_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, vtcm_2d_tensor) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + + e: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _t21: R.Tuple = R.vm.copy_tensor_from_to(vtcm_2d_tensor, e) + 
_t22: R.Tuple = R.vm.kill_object(vtcm_2d_tensor) + _t23: R.Tuple = R.vm.kill_object(vtcm_2d_obj) + f: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _t13: R.Tuple = R.vm.copy_tensor_from_to(e, f) + _t14: R.Tuple = R.vm.kill_object(e) + ret_val: R.Tensor([4, 64], dtype="int32") = R.builtin.alloc_tensor( + R.shape([4, 64]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(f, ret_val) + _t15: R.Tuple = R.vm.kill_object(f) + _t16: R.Tuple = R.vm.kill_object(vtcm_obj) + _t17: R.Tuple = R.vm.kill_object(global_obj) + lv: R.Tensor([4, 64], dtype="int32") = ret_val + return lv + + +@I.ir_module +class NDLogicalShapesModule: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n, o, p = T.int32(), T.int32(), T.int32(), T.int32() + A = T.match_buffer(a, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + B = T.match_buffer(b, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + C = T.match_buffer(c, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + for ax0, ax1, ax2, ax3 in T.grid(m, n, o, p): + with T.block("T_add"): + v_ax0, v_ax1, v_ax2, v_ax3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(A[v_ax0, v_ax1, v_ax2, v_ax3], B[v_ax0, v_ax1, v_ax2, v_ax3]) + T.writes(C[v_ax0, v_ax1, v_ax2, v_ax3]) + C[v_ax0, v_ax1, v_ax2, v_ax3] = ( + A[v_ax0, v_ax1, v_ax2, v_ax3] + B[v_ax0, v_ax1, v_ax2, v_ax3] + ) + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n, o, p = T.int32(), T.int32(), T.int32(), T.int32() + A = T.match_buffer(a, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + B = T.match_buffer(b, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + C = T.match_buffer(c, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + for ax0, ax1, ax2, ax3 in T.grid(m, n, o, p): + with T.block("T_add"): + v_ax0, v_ax1, v_ax2, v_ax3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(A[v_ax0, v_ax1, v_ax2, v_ax3], B[v_ax0, v_ax1, v_ax2, v_ax3]) + T.writes(C[v_ax0, v_ax1, v_ax2, v_ax3]) + C[v_ax0, v_ax1, v_ax2, v_ax3] = ( + A[v_ax0, v_ax1, v_ax2, v_ax3] * B[v_ax0, v_ax1, v_ax2, v_ax3] + ) + + @R.function + def main( + x: R.Tensor((2, 2, 8, 8), "int32"), + y: R.Tensor((2, 2, 8, 8), "int32"), + z: R.Tensor((2, 2, 8, 8), "int32"), + ) -> R.Tensor((2, 2, 8, 8), "int32"): + cls = NDLogicalShapesModule + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + global_obj: R.Object = R.vm.alloc_storage( + R.shape([64]), runtime_device_index=0, dtype="uint8", storage_scope="global" + ) + a: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + c: 
R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([512, 0, 2048, 3840]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + d: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(z, d) + e: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, e) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + ret_val: R.Tensor([2, 2, 8, 8], dtype="int32") = R.builtin.alloc_tensor( + R.shape([2, 2, 8, 8]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(e, ret_val) + _t14: R.Tuple = R.vm.kill_object(e) + _t16: R.Tuple = R.vm.kill_object(vtcm_obj) + _t17: R.Tuple = R.vm.kill_object(global_obj) + lv: R.Tensor([2, 2, 8, 8], dtype="int32") = ret_val + return lv + + +class TestVTCMAlloc: + """Tests for VTCM Alloc, Compute and Copy""" + + mode = tvm.testing.parameter("bytecode", "compiled") + (module, in_shape) = tvm.testing.parameters( + (Module_2d, (4, 64)), + (Module, (4, 64)), + (NDLogicalShapesModule, (2, 2, 8, 8)), + ) + + @tvm.testing.requires_hexagon + def test_vtcm_alloc_compute(self, hexagon_launcher, mode, module, in_shape): + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + with tvm.transform.PassContext(opt_level=3, config=[], instruments=[]): + ex = relax.build(mod=module, target=target, exec_mode=mode) + + with hexagon_launcher.create_session() as session: + dev = session.device + input_arg0_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + input_arg1_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + input_arg2_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + output_data = np.multiply(np.add(input_arg0_data, input_arg1_data), input_arg2_data) + vm_mod = session.get_executor_from_factory(ex) + vm_rt = relax.VirtualMachine( + vm_mod, dev, "naive" + ) # Use naive allocator to exercise VTCM allocation in relax + data0 = tvm.nd.array(input_arg0_data, dev) + data1 = tvm.nd.array(input_arg1_data, dev) + data2 = tvm.nd.array(input_arg2_data, dev) + vm_rt.set_input("main", data0, data1, data2) + vm_rt.invoke_stateful("main") + hexagon_output = vm_rt.get_outputs("main").numpy() + tvm.testing.assert_allclose(output_data, hexagon_output) diff --git a/tests/python/relax/test_vm_codegen_only.py b/tests/python/relax/test_vm_codegen_only.py index 0d461f0713c21..5093f2bd29882 100644 --- a/tests/python/relax/test_vm_codegen_only.py +++ b/tests/python/relax/test_vm_codegen_only.py @@ -359,6 +359,42 @@ def main(x: R.Tensor((3, 4), "float32")): tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-7, atol=1e-7) +@pytest.mark.parametrize("exec_mode", EXEC_MODE) +def test_vm_builtin_copy_tensor_from_to(exec_mode): + @tvm.script.ir_module + class TestVMCopyTensor: + @R.function + 
def main(x: R.Tensor((3, 4), "float32")): + R.func_attr({"global_symbol": "main"}) + storage: R.Object = R.vm.alloc_storage( + R.shape( + [ + 48, + ] + ), + R.prim_value(0), + R.dtype("uint8"), + ) + ret_val: R.Tensor((3, 4), dtype="float32") = R.vm.alloc_tensor( + storage, R.prim_value(0), R.shape([3, 4]), R.dtype("float32") + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, ret_val) + lv: R.Tensor([3, 4], dtype="float32") = ret_val + return lv + + mod = TestVMCopyTensor + target = tvm.target.Target("llvm", host="llvm") + ex = codegen(mod, target, exec_mode) + dev = tvm.cpu() + vm = relax.VirtualMachine(ex, dev) + + input_np = np.random.rand(3, 4).astype("float32") + input = tvm.nd.array(input_np, dev) + res = vm["main"](input) + expected = input_np + tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-7, atol=1e-7) + + @pytest.mark.parametrize("exec_mode", EXEC_MODE) def test_vm_kill_object(exec_mode): @I.ir_module
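For reference, the packed function registered in src/runtime/relax_vm/builtin.cc above ("vm.builtin.copy_tensor_from_to") can also be exercised directly from Python, outside of compiled Relax code. The following is a minimal sketch, assuming a TVM build that includes this patch; it is illustrative only and not part of the change or its tests:

    import numpy as np
    import tvm

    # Look up the builtin registered by this patch; it simply calls dst.CopyFrom(src),
    # so the two NDArrays must have matching shape and dtype.
    copy_fn = tvm.get_global_func("vm.builtin.copy_tensor_from_to")

    src = tvm.nd.array(np.arange(12, dtype="float32").reshape(3, 4))
    dst = tvm.nd.empty((3, 4), "float32")

    copy_fn(src, dst)
    np.testing.assert_allclose(dst.numpy(), src.numpy())

Because the builtin goes through NDArray::CopyFrom and hence the device API's CopyDataFromTo, the same R.vm.copy_tensor_from_to op works for ordinary host tensors and, with the Hexagon changes above, for VTCM-backed and discontiguous tensors, whose copies are routed through HexagonBufferCopyAcrossRegions.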