Skip to content

Commit

Permalink
[VM] [Hexagon] Introduce 2D Discontiguous vtcm alloc tensor
Browse files Browse the repository at this point in the history
Adds a 2D discontiguous alloc-tensor Hexagon builtin to support 2D
allocations for Hexagon at the Relax level. This is needed when ops are
implemented to take advantage of 2D indirections, and it enables
memory-manager optimizations that try to utilize VTCM memory efficiently.

This patch also introduces the `R.vm.copy_tensor` op to support copies
between different tensors; it is specifically planned to be used when copying
tensors from one memory scope to another.

Co-authored-by: arangasa <[email protected]>
  • Loading branch information
quic-sanirudh and arangasa committed Feb 12, 2024
1 parent 2d9c6c4 commit 37683e4
Show file tree
Hide file tree
Showing 14 changed files with 657 additions and 24 deletions.
3 changes: 3 additions & 0 deletions include/tvm/runtime/memory/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class StorageObj : public Object {
/*! \brief The index into the VM function table. */
Buffer buffer;

/* \brief Common function to create an NDArray container with the provided offset, shape and dtype */
NDArray::Container* CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief Allocate an NDArray from a given piece of storage. */
NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

Expand Down
2 changes: 1 addition & 1 deletion python/tvm/relax/op/vm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
# under the License.
"""Relax vm primitives."""

from .vm import alloc_storage, alloc_tensor, call_tir_dyn, kill_object
from .vm import alloc_storage, alloc_tensor, call_tir_dyn, copy_tensor, kill_object
20 changes: 20 additions & 0 deletions python/tvm/relax/op/vm/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,23 @@ def call_tir_dyn(func: Expr, args: Tuple) -> Call:
args = Tuple(args)

return _ffi_api.call_tir_dyn(func, args) # type: ignore


@args_converter.auto
def copy_tensor(src: Expr, dst: Expr) -> Call:
    """Construct a call that copies the contents of one tensor into another.

    Parameters
    ----------
    src : Expr
        The tensor to read from.

    dst : Expr
        The tensor written to; after execution it holds a copy of ``src``.

    Returns
    -------
    result : Call
        A relax Call which performs the copy when executed.
    """
    return _ffi_api.copy_tensor(src, dst)  # type: ignore
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
Expand Down Expand Up @@ -361,6 +363,16 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg);
}

void EmitCopyTensor(const Call& call_node, RegName dst_reg) {
ICHECK_EQ(call_node->args.size(), 2);
std::vector<Instruction::Arg> args;
args.reserve(2);
for (Expr arg : call_node->args) {
args.push_back(this->VisitExpr(arg));
}
builder_->EmitCall("vm.builtin.copy_tensor", args, dst_reg);
}

RegName EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
Instruction::Arg arg = this->VisitExpr(call_node->args[0]);
Expand Down Expand Up @@ -422,6 +434,7 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_op_ = Op::Get("relax.vm.copy_tensor");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
Expand Down
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm_tir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
Expand Down Expand Up @@ -414,6 +416,16 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg);
}

// Lower a relax.vm.copy_tensor call into a "vm.builtin.copy_tensor" packed
// call in the TIR-based codegen.  The builtin expects (src, dst).
void EmitCopyTensor(const Call& call_node, int64_t dst_reg) {
  ICHECK_EQ(call_node->args.size(), 2);
  Array<PrimExpr> args;
  args.reserve(call_node->args.size());
  for (size_t i = 0; i < call_node->args.size(); ++i) {
    // VisitExpr returns an Optional; arguments of a lowered VM op are always
    // representable here, hence the unconditional .value().
    args.push_back(this->VisitExpr(call_node->args[i]).value());
  }
  this->EmitCallPacked("vm.builtin.copy_tensor", args, dst_reg);
}

int64_t EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
PrimExpr arg = this->VisitExpr(call_node->args[0]).value();
Expand Down Expand Up @@ -519,6 +531,7 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_op_ = Op::Get("relax.vm.copy_tensor");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
Expand Down
16 changes: 16 additions & 0 deletions src/relax/op/op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,22 @@ Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm d

TVM_REGISTER_GLOBAL("relax.op.vm.alloc_tensor").set_body_typed(MakeVMAllocTensor);

// vm copy_tensor

// Registered with TVM_REGISTER_OP for consistency with the neighboring VM op
// registrations (e.g. relax.vm.kill_object below); RELAY_REGISTER_OP is a
// legacy alias of the same macro.
TVM_REGISTER_OP("relax.vm.copy_tensor")
    .set_num_inputs(2)
    .add_argument("src", "Expr", "The tensor to copy from")
    .add_argument("dst", "Expr", "The tensor to copy to")
    .set_attr<FInferStructInfo>("FInferStructInfo", ReturnVoidStructInfo)
    // NOTE(review): FPurity(true) mirrors the original registration even though
    // the copy mutates dst; this relies on VM-level ops being past effect
    // analysis at this stage -- confirm against the other relax.vm.* ops.
    .set_attr<Bool>("FPurity", Bool(true));

/*!
 * \brief Construct a relax.vm.copy_tensor call copying src's contents into dst.
 * \param src The source tensor expression.
 * \param dst The destination tensor expression.
 * \return A Call expression that performs the copy and returns void.
 */
Expr MakeVMCopyTensor(Expr src, Expr dst) {
  static const Op& op = Op::Get("relax.vm.copy_tensor");
  return Call(op, {src, dst}, Attrs(), {});
}

TVM_REGISTER_GLOBAL("relax.op.vm.copy_tensor").set_body_typed(MakeVMCopyTensor);

// vm kill_object

TVM_REGISTER_OP("relax.vm.kill_object")
Expand Down
4 changes: 4 additions & 0 deletions src/runtime/hexagon/hexagon_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ struct BufferSet {
size_t region_size_bytes;
};

/*!
 * \brief Copy \p bytes_to_copy bytes from one (possibly multi-region) buffer
 *        set into another, spanning region boundaries on either side.
 * \param dest Destination region set.
 * \param src Source region set.
 * \param bytes_to_copy Total number of bytes to transfer.
 * \param src_is_hexbuff Whether \p src is backed by a HexagonBuffer
 *        (NOTE(review): presumed from the name -- confirm at the definition).
 * \param dest_is_hexbuff Whether \p dest is backed by a HexagonBuffer.
 */
void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src,
                                        size_t bytes_to_copy, bool src_is_hexbuff,
                                        bool dest_is_hexbuff);

} // namespace hexagon
} // namespace runtime
} // namespace tvm
Expand Down
79 changes: 58 additions & 21 deletions src/runtime/hexagon/hexagon_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
#include <cstring>

#include "../workspace_pool.h"
#include "hexagon_buffer.h"
#include "hexagon_common.h"
#include "qurt_memory.h"

namespace tvm {
namespace runtime {
Expand Down Expand Up @@ -91,23 +93,29 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

void* base_ptr;
PhysicalShape physical_shape;
if (ndim == 0) {
// Allocate storage for a single scalar value.
return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
physical_shape = {1, 1, typesize};
} else if (ndim == 1) {
// Allocate a single, contiguous memory region.
size_t nbytes = shape[0] * typesize;
return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
physical_shape = {1, 1, nbytes};
} else if (ndim == 2) {
// Allocate the region(s) needed for Hexagon's indirect-tensor format.
size_t nallocs = shape[0];
size_t nbytes = shape[1] * typesize;
return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment,
mem_scope);
base_ptr =
runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
physical_shape = {2, nallocs, nbytes};
} else {
return nullptr; // unreachable
}
SetPhysicalShape(base_ptr, physical_shape);
return base_ptr;
}

void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
Expand All @@ -121,7 +129,10 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";
return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
void* base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
PhysicalShape physical_shape = {1, 1, nbytes};
SetPhysicalShape(base_ptr, physical_shape);
return base_ptr;
}

void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
Expand All @@ -134,6 +145,7 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
// occur in the normal course of shutdown, log a message and continue.
DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr;
}
ndarray_physical_shape.erase(ptr);
}

// WorkSpace: runtime allocations for Hexagon
Expand All @@ -157,6 +169,10 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
}

// Return the raw base address of a tensor's payload.  Note byte_offset is NOT
// applied here; callers CHECK it to be zero.
void* get_data_start(DLTensor* tensor) {
  return static_cast<uint8_t*>(tensor->data);
}

void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
CHECK_EQ(from->byte_offset, 0);
CHECK_EQ(to->byte_offset, 0);
Expand All @@ -165,23 +181,44 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
<< "HexagonDeviceAPI::CopyDataFromTo before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* {
return runtime_hexbuffs->FindHexagonBuffer(ptr);
};
auto numBytes = GetDataSize(*from);

size_t FlatShape = 1;
for (auto i = 0; i < from->ndim; ++i) FlatShape *= from->shape[i];

PhysicalShape source_shape = {1, 1, FlatShape};
PhysicalShape dest_shape = {1, 1, FlatShape};
auto it1 = ndarray_physical_shape.find(from->data);
if (it1 != ndarray_physical_shape.end()) source_shape = it1->second;
size_t src_rank = source_shape.ndim;
void* src_start = get_data_start(from);
void* dst_start = get_data_start(to);
BufferSet src((src_rank == 1) ? &(src_start) : static_cast<void**>(src_start),
source_shape.nblocks, numBytes / source_shape.nblocks);
auto it2 = ndarray_physical_shape.find(to->data);
if (it2 != ndarray_physical_shape.end()) dest_shape = it2->second;
size_t dest_rank = dest_shape.ndim;
BufferSet dest((dest_rank == 1) ? &(dst_start) : static_cast<void**>(dst_start),
dest_shape.nblocks, numBytes / dest_shape.nblocks);
hexagon_buffer_copy_across_regions(dest, src, numBytes, (it1 != ndarray_physical_shape.end()),
(it2 != ndarray_physical_shape.end()));
return;
}

HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);
HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data);
/*!
 * \brief Record the physical (allocation-level) shape of a tensor's backing
 *        buffer, keyed by the tensor's base data pointer.
 * NOTE(review): shape[0] and shape[1] are read unconditionally, so this
 * overload assumes ndim >= 2 (the 2D indirect layout) -- confirm that callers
 * never pass a smaller ndim.
 */
void HexagonDeviceAPI::SetPhysicalShape(const DLTensor* tensor, const int64_t ndim,
                                        const int64_t* shape) {
  PhysicalShape physical_shape = {static_cast<size_t>(ndim), static_cast<size_t>(shape[0]),
                                  static_cast<size_t>(shape[1])};
  SetPhysicalShape(tensor->data, physical_shape);
}

if (hex_from_buf && hex_to_buf) {
hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from));
} else if (hex_to_buf) {
hex_to_buf->CopyFrom(from->data, GetDataSize(*from));
} else if (hex_from_buf) {
hex_from_buf->CopyTo(to->data, GetDataSize(*to));
} else {
CHECK(false) << "CopyDataFromTo requested between src and dst which are not managed by the "
"hexagon device api.";
}
/*!
 * \brief Insert or overwrite the physical-shape record for a buffer base
 *        address in the ndarray_physical_shape map.
 * \param data Base pointer of the allocation, used as the map key.
 * \param physical_shape Layout descriptor to associate with \p data.
 */
void HexagonDeviceAPI::SetPhysicalShape(const void* data, const PhysicalShape& physical_shape) {
  // operator[] already performs insert-or-assign semantics in a single lookup,
  // replacing the original find() followed by a second lookup via assignment
  // or insert().
  ndarray_physical_shape[const_cast<void*>(data)] = physical_shape;
}

void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to,
Expand Down
19 changes: 19 additions & 0 deletions src/runtime/hexagon/hexagon_device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ namespace tvm {
namespace runtime {
namespace hexagon {

/*!
 * \brief Physical layout descriptor for a Hexagon allocation.
 *
 * Describes how a logical tensor is laid out in device memory: as a single
 * contiguous region (ndim == 1) or as nblocks discontiguous blocks of
 * block_size bytes each (ndim == 2, the indirect-tensor format).
 * Members are zero-initialized so a default-constructed instance never
 * carries indeterminate values.
 */
struct PhysicalShape {
  size_t ndim{0};        //!< Physical rank: 1 = flat region, 2 = blocked layout.
  size_t nblocks{0};     //!< Number of physically separate blocks.
  size_t block_size{0};  //!< Size of each block in bytes.
};

/*!
* \brief Hexagon Device API that is compiled and run on Hexagon.
*/
Expand Down Expand Up @@ -148,6 +154,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
*/
void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;

/*!
* \brief set physical shape of tensor
*/
void SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, const int64_t* shape);

HexagonThreadManager* ThreadManager() {
CHECK(runtime_threads) << "runtime_threads has not been created";
return runtime_threads.get();
Expand Down Expand Up @@ -178,6 +189,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU);
}

/*!
* \brief set physical shape of tensor - private helper
*/
void SetPhysicalShape(const void* data, const PhysicalShape&);

//! \brief Manages runtime HexagonBuffer allocations
// runtime_hexbuffs is used for runtime allocations. It is created with a call to
// AcquireResources, and destroyed on ReleaseResources. The buffers in this manager are scoped
Expand All @@ -199,6 +215,9 @@ class HexagonDeviceAPI final : public DeviceAPI {

//! \brief Hexagon power manager
std::unique_ptr<HexagonPowerManager> runtime_power_manager;

//! \brief NDArray base -> Physical Shape map
std::map<void*, PhysicalShape> ndarray_physical_shape;
};
} // namespace hexagon
} // namespace runtime
Expand Down
9 changes: 7 additions & 2 deletions src/runtime/memory/memory_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
return align;
}

NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) {
NDArray::Container* StorageObj::CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype) {
VerifyDataType(dtype);

// crtical zone: allocate header, cannot throw
Expand All @@ -92,7 +92,6 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt
container->dl_tensor.byte_offset = offset;

container->SetDeleter(StorageObj::Deleter);
size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor);
this->IncRef();
// The manager context pointer must continue to point to the storage object
// which owns the backing memory, and keeps track of the reference count.
Expand All @@ -101,6 +100,12 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt
// reference count, then destroy the container, but leave the underlying
// buffer intact.
container->manager_ctx = reinterpret_cast<void*>(this);
return container;
}

NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) {
auto* container = CreateNDArrayContainer(offset, shape, dtype);
size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor);

if (this->buffer.device.device_type == kDLHexagon) {
// For Hexagon, non-zero offset support simply requires adjusting the
Expand Down
4 changes: 4 additions & 0 deletions src/runtime/relax_vm/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.reshape").set_body_typed([](NDArray data, ShapeT
return data.CreateView(new_shape, data->dtype);
});

// Runtime builtin backing relax.vm.copy_tensor: copy the contents of one
// NDArray into another (compatibility of shape/dtype is enforced by
// NDArray::CopyFrom itself).
TVM_REGISTER_GLOBAL("vm.builtin.copy_tensor")
    .set_body_typed([](NDArray src, NDArray dst) { dst.CopyFrom(src); });

TVM_REGISTER_GLOBAL("vm.builtin.null_value").set_body([](TVMArgs args, TVMRetValue* rv) {
CHECK_EQ(args.size(), 0);
*rv = nullptr;
Expand Down
Loading

0 comments on commit 37683e4

Please sign in to comment.