[VM] [Hexagon] Introduce 2D Discontiguous vtcm alloc tensor
Adds a 2D discontiguous alloc-tensor builtin for Hexagon to support 2D
allocations at the Relax level. This is needed when ops are implemented
to take advantage of 2D indirection, and it enables memory-manager
optimizations that use VTCM memory efficiently.

This patch also introduces the `R.vm.copy_tensor_from_to` op to support
copies between tensors, primarily intended for copying a tensor from one
memory scope to another.

Co-authored-by: arangasa <[email protected]>
quic-sanirudh and arangasa committed Feb 20, 2024
1 parent dd70941 commit 218fce3
Showing 15 changed files with 674 additions and 32 deletions.
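Conceptually, the 2D "discontiguous" allocation that this commit wires through the stack stores a tensor as `nallocs` separately allocated blocks of `nbytes` bytes each, addressed through a pointer table rather than one flat region. A minimal Python model of that layout (illustrative only, not TVM code; all names here are hypothetical):

```python
# Illustrative model of Hexagon's 2D discontiguous (indirect-tensor) layout:
# a pointer table into separately allocated fixed-size blocks.
class DiscontiguousBuffer:
    def __init__(self, nallocs: int, nbytes: int):
        # Each allocation is modeled as an independent bytearray block;
        # on hardware these would be separate VTCM regions.
        self.blocks = [bytearray(nbytes) for _ in range(nallocs)]
        self.nbytes_per_allocation = nbytes

    def write(self, offset: int, data: bytes) -> None:
        # A flat byte offset splits into (block index, offset within block),
        # mirroring the 2D indirection a kernel performs.
        for i, byte in enumerate(data):
            block, off = divmod(offset + i, self.nbytes_per_allocation)
            self.blocks[block][off] = byte

buf = DiscontiguousBuffer(nallocs=4, nbytes=128)
buf.write(200, b"\x01\x02")  # lands in block 1, offsets 72-73
```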
4 changes: 4 additions & 0 deletions include/tvm/runtime/memory/memory_manager.h
@@ -139,6 +139,10 @@ class StorageObj : public Object {
/*! \brief The underlying storage buffer. */
Buffer buffer;

/*! \brief Common function to create an NDArray container with the provided offset, shape and dtype.
 */
NDArray::Container* CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief Allocate an NDArray from a given piece of storage. */
NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

2 changes: 1 addition & 1 deletion python/tvm/relax/op/vm/__init__.py
@@ -16,4 +16,4 @@
# under the License.
"""Relax vm primitives."""

-from .vm import alloc_storage, alloc_tensor, call_tir_dyn, kill_object
+from .vm import alloc_storage, alloc_tensor, call_tir_dyn, copy_tensor_from_to, kill_object
20 changes: 20 additions & 0 deletions python/tvm/relax/op/vm/vm.py
@@ -131,3 +131,23 @@ def call_tir_dyn(func: Expr, args: Tuple) -> Call:
args = Tuple(args)

return _ffi_api.call_tir_dyn(func, args) # type: ignore


@args_converter.auto
def copy_tensor_from_to(src: Expr, dst: Expr) -> Call:
"""Construct a call to copy one tensor to another.
Parameters
----------
src : Expr
Source tensor for copy.
dst : Expr
Destination tensor for copy.
Returns
-------
result : Call
A relax Call, which performs the copy.
"""
return _ffi_api.copy_tensor_from_to(src, dst) # type: ignore
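As a usage sketch (based only on the signature above; the variables and shapes are placeholders, and this assumes a working TVM build):

```python
# Sketch: building a relax.vm.copy_tensor_from_to call from Python.
from tvm import relax
from tvm.relax.op.vm import copy_tensor_from_to

src = relax.Var("src", relax.TensorStructInfo((2, 128), "float16"))
dst = relax.Var("dst", relax.TensorStructInfo((2, 128), "float16"))
copy_call = copy_tensor_from_to(src, dst)  # a relax Call that returns void
```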
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm.cc
@@ -169,6 +169,8 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_from_to_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
@@ -361,6 +363,16 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg);
}

void EmitCopyTensor(const Call& call_node, RegName dst_reg) {
ICHECK_EQ(call_node->args.size(), 2);
std::vector<Instruction::Arg> args;
args.reserve(2);
for (Expr arg : call_node->args) {
args.push_back(this->VisitExpr(arg));
}
builder_->EmitCall("vm.builtin.copy_tensor_from_to", args, dst_reg);
}

RegName EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
Instruction::Arg arg = this->VisitExpr(call_node->args[0]);
@@ -430,6 +442,7 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
13 changes: 13 additions & 0 deletions src/relax/backend/vm/codegen_vm_tir.cc
@@ -238,6 +238,8 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
EmitAllocStorage(call, dst_reg);
} else if (call_node->op == alloc_tensor_op_) {
EmitAllocTensor(call, dst_reg);
} else if (call_node->op == copy_tensor_from_to_op_) {
EmitCopyTensor(call, dst_reg);
} else if (call_node->op == kill_object_op_) {
dst_reg = EmitKillObject(call);
} else {
@@ -414,6 +416,16 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg);
}

void EmitCopyTensor(const Call& call_node, int64_t dst_reg) {
ICHECK_EQ(call_node->args.size(), 2);
Array<PrimExpr> args;
args.reserve(2);
for (Expr arg : call_node->args) {
args.push_back(this->VisitExpr(arg).value());
}
this->EmitCallPacked("vm.builtin.copy_tensor_from_to", args, dst_reg);
}

int64_t EmitKillObject(const Call& call_node) {
ICHECK_EQ(call_node->args.size(), 1);
PrimExpr arg = this->VisitExpr(call_node->args[0]).value();
@@ -519,6 +531,7 @@ class CodeGenVMTIR : public ExprFunctor<Optional<PrimExpr>(const Expr&)> {
/*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */
const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage");
const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor");
const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to");
const Op& kill_object_op_ = Op::Get("relax.vm.kill_object");
const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx");
const Op& null_value_op_ = Op::Get("relax.null_value");
16 changes: 16 additions & 0 deletions src/relax/op/op.cc
@@ -1004,6 +1004,22 @@ Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm d

TVM_REGISTER_GLOBAL("relax.op.vm.alloc_tensor").set_body_typed(MakeVMAllocTensor);

// vm copy_tensor_from_to

RELAY_REGISTER_OP("relax.vm.copy_tensor_from_to")
.set_num_inputs(2)
.add_argument("src", "Expr", "The tensor to copy from")
.add_argument("dst", "Expr", "The tensor to copy to")
.set_attr<FInferStructInfo>("FInferStructInfo", ReturnVoidStructInfo)
.set_attr<Bool>("FPurity", Bool(true));

Expr MakeVMCopyTensor(Expr src, Expr dst) {
static const Op& op = Op::Get("relax.vm.copy_tensor_from_to");
return Call(op, {src, dst}, Attrs(), {});
}

TVM_REGISTER_GLOBAL("relax.op.vm.copy_tensor_from_to").set_body_typed(MakeVMCopyTensor);

// vm kill_object

TVM_REGISTER_OP("relax.vm.kill_object")
18 changes: 9 additions & 9 deletions src/runtime/hexagon/hexagon_buffer.cc
@@ -233,9 +233,9 @@ std::vector<MemoryCopy> MemoryCopy::MergeAdjacent(std::vector<MemoryCopy> micro_
return macro_copies;
}

-void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src,
-                                        size_t bytes_to_copy, bool src_is_hexbuff,
-                                        bool dest_is_hexbuff) {
+void HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src,
+                                    size_t bytes_to_copy, bool src_is_hexbuff,
+                                    bool dest_is_hexbuff) {
// First, determine all copies that do not cross boundaries in
// either source or destination region.
auto micro_copies = BufferSet::MemoryCopies(dest, src, bytes_to_copy);
@@ -268,24 +268,24 @@ void HexagonBuffer::CopyTo(void* data, size_t nbytes) const {
BufferSet src(allocations_.data(), allocations_.size(), nbytes_per_allocation_);
BufferSet dest(&data, 1, nbytes);

-  hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */,
-                                     false /* dest_is_hexbuff */);
+  HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */,
+                                 false /* dest_is_hexbuff */);
}

void HexagonBuffer::CopyFrom(void* data, size_t nbytes) {
BufferSet src(&data, 1, nbytes);
BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_);

-  hexagon_buffer_copy_across_regions(dest, src, nbytes, false /* src_is_hexbuff */,
-                                     true /* dest_is_hexbuff */);
+  HexagonBufferCopyAcrossRegions(dest, src, nbytes, false /* src_is_hexbuff */,
+                                 true /* dest_is_hexbuff */);
}

void HexagonBuffer::CopyFrom(const HexagonBuffer& other, size_t nbytes) {
BufferSet src(other.allocations_.data(), other.allocations_.size(), other.nbytes_per_allocation_);
BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_);

-  hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */,
-                                     true /* dest_is_hexbuff */);
+  HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */,
+                                 true /* dest_is_hexbuff */);
}

} // namespace hexagon
13 changes: 13 additions & 0 deletions src/runtime/hexagon/hexagon_buffer.h
@@ -195,6 +195,19 @@ struct BufferSet {
size_t region_size_bytes;
};

/*!
 * \brief Single function to handle copying potentially discontiguous buffers efficiently.
 *
 * \param dest The destination buffer set
 * \param src The source buffer set
 * \param bytes_to_copy Number of bytes to copy; must not exceed the size of either buffer set
 * \param src_is_hexbuff Whether the source is a HexagonBuffer
 * \param dest_is_hexbuff Whether the destination is a HexagonBuffer
 */
void HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src,
size_t bytes_to_copy, bool src_is_hexbuff,
bool dest_is_hexbuff);

} // namespace hexagon
} // namespace runtime
} // namespace tvm
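The copy path first enumerates "micro" copies that stay within a single block on both the source and destination side, then `MemoryCopy::MergeAdjacent` coalesces physically adjacent ones into larger "macro" copies. A rough Python model of the merge step (illustrative only, not the TVM implementation):

```python
# Illustrative model of merging adjacent micro-copies into macro-copies.
# Each copy is (dest_addr, src_addr, nbytes); addresses are plain ints here.
def merge_adjacent(micro_copies):
    macro_copies = []
    for dest, src, nbytes in sorted(micro_copies):
        if macro_copies:
            last_dest, last_src, last_n = macro_copies[-1]
            # Mergeable only when both the source and the destination are
            # contiguous with the previous copy.
            if last_dest + last_n == dest and last_src + last_n == src:
                macro_copies[-1] = (last_dest, last_src, last_n + nbytes)
                continue
        macro_copies.append((dest, src, nbytes))
    return macro_copies

# Two contiguous 64-byte copies collapse into one 128-byte copy.
print(merge_adjacent([(0, 4096, 64), (64, 4160, 64)]))  # [(0, 4096, 128)]
```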
74 changes: 55 additions & 19 deletions src/runtime/hexagon/hexagon_device_api.cc
@@ -32,7 +32,9 @@
#include <cstring>

#include "../workspace_pool.h"
#include "hexagon_buffer.h"
#include "hexagon_common.h"
#include "qurt_memory.h"

namespace tvm {
namespace runtime {
@@ -91,23 +93,29 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

+  void* base_ptr;
+  PhysicalShape physical_shape;
   if (ndim == 0) {
     // Allocate storage for a single scalar value.
-    return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
+    base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope);
+    physical_shape = {1, 1, typesize};
   } else if (ndim == 1) {
     // Allocate a single, contiguous memory region.
     size_t nbytes = shape[0] * typesize;
-    return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
+    base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope);
+    physical_shape = {1, 1, nbytes};
   } else if (ndim == 2) {
     // Allocate the region(s) needed for Hexagon's indirect-tensor format.
     size_t nallocs = shape[0];
     size_t nbytes = shape[1] * typesize;
-    return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment,
-                                                   mem_scope);
+    base_ptr =
+        runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope);
+    physical_shape = {2, nallocs, nbytes};
   } else {
     return nullptr;  // unreachable
   }
+  SetPhysicalShape(base_ptr, physical_shape);
+  return base_ptr;
 }

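The branch above fixes the ndim-to-PhysicalShape mapping: scalars and 1D tensors stay contiguous (one block), while a 2D tensor becomes `shape[0]` blocks of `shape[1] * typesize` bytes. A compact restatement (a sketch mirroring the C++ branch, not the implementation itself):

```python
# Sketch of the physical-shape bookkeeping in AllocDataSpace (ndim <= 2).
# PhysicalShape is (ndim, nblocks, block_size_bytes), matching the struct.
def physical_shape(shape, typesize):
    ndim = len(shape)
    if ndim == 0:          # scalar: one block holding a single element
        return (1, 1, typesize)
    if ndim == 1:          # contiguous: one block for the whole tensor
        return (1, 1, shape[0] * typesize)
    if ndim == 2:          # discontiguous: shape[0] blocks of row-size bytes
        return (2, shape[0], shape[1] * typesize)
    raise ValueError("Hexagon AllocDataSpace handles at most 2 dimensions")

assert physical_shape((64, 128), typesize=2) == (2, 64, 256)
```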
void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
@@ -121,7 +129,10 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with "
<< "HexagonDeviceAPI::AllocDataSpace before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";
-  return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
+  void* base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global"));
+  PhysicalShape physical_shape = {1, 1, nbytes};
+  SetPhysicalShape(base_ptr, physical_shape);
+  return base_ptr;
}

void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
@@ -134,6 +145,7 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
// occur in the normal course of shutdown, log a message and continue.
DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr;
}
ndarray_physical_shape.erase(ptr);
}

// WorkSpace: runtime allocations for Hexagon
@@ -157,6 +169,8 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
}

void* get_data_start(DLTensor* tensor) { return (reinterpret_cast<uint8_t*>(tensor->data)); }

void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
CHECK_EQ(from->byte_offset, 0);
CHECK_EQ(to->byte_offset, 0);
@@ -165,22 +179,44 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan
<< "HexagonDeviceAPI::CopyDataFromTo before initializing resources. "
<< "Please call HexagonDeviceAPI::AcquireResources";

-  auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* {
-    return runtime_hexbuffs->FindHexagonBuffer(ptr);
-  };
-
-  HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data);
-  HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data);
-
-  if (hex_from_buf && hex_to_buf) {
-    hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from));
-  } else if (hex_to_buf) {
-    hex_to_buf->CopyFrom(from->data, GetDataSize(*from));
-  } else if (hex_from_buf) {
-    hex_from_buf->CopyTo(to->data, GetDataSize(*to));
-  } else {
-    CHECK(false) << "CopyDataFromTo requested between src and dst which are not managed by the "
-                    "hexagon device api.";
-  }
-}
+  auto numBytes = GetDataSize(*from);
+
+  size_t FlatShape = 1;
+  for (auto i = 0; i < from->ndim; ++i) FlatShape *= from->shape[i];
+
+  PhysicalShape source_shape = {1, 1, FlatShape};
+  PhysicalShape dest_shape = {1, 1, FlatShape};
+  auto it1 = ndarray_physical_shape.find(from->data);
+  if (it1 != ndarray_physical_shape.end()) source_shape = it1->second;
+  size_t src_rank = source_shape.ndim;
+  void* src_start = get_data_start(from);
+  void* dst_start = get_data_start(to);
+  BufferSet src((src_rank == 1) ? &(src_start) : static_cast<void**>(src_start),
+                source_shape.nblocks, numBytes / source_shape.nblocks);
+  auto it2 = ndarray_physical_shape.find(to->data);
+  if (it2 != ndarray_physical_shape.end()) dest_shape = it2->second;
+  size_t dest_rank = dest_shape.ndim;
+  BufferSet dest((dest_rank == 1) ? &(dst_start) : static_cast<void**>(dst_start),
+                 dest_shape.nblocks, numBytes / dest_shape.nblocks);
+  HexagonBufferCopyAcrossRegions(dest, src, numBytes, (it1 != ndarray_physical_shape.end()),
+                                 (it2 != ndarray_physical_shape.end()));
+  return;
+}
+
+void HexagonDeviceAPI::SetPhysicalShape(const DLTensor* tensor, const int64_t ndim,
+                                        const int64_t* shape) {
+  PhysicalShape physical_shape = {static_cast<size_t>(ndim), static_cast<size_t>(shape[0]),
+                                  static_cast<size_t>(shape[1])};
+  SetPhysicalShape(tensor->data, physical_shape);
+}
+
+void HexagonDeviceAPI::SetPhysicalShape(const void* data, const PhysicalShape& physical_shape) {
+  auto it = ndarray_physical_shape.find(const_cast<void*>(data));
+  if (it != ndarray_physical_shape.end()) {
+    ndarray_physical_shape[const_cast<void*>(data)] = physical_shape;
+  } else {
+    ndarray_physical_shape.insert(
+        std::pair<void*, PhysicalShape>(const_cast<void*>(data), physical_shape));
+  }
+}

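With this change, `CopyDataFromTo` no longer requires both ends to be HexagonBuffers: a pointer that was never registered in `ndarray_physical_shape` (e.g. external host memory) falls back to a flat `(1, 1, total_bytes)` view, so flat-to-flat, flat-to-2D, and 2D-to-2D copies all funnel through `HexagonBufferCopyAcrossRegions`. A sketch of that lookup-with-fallback (illustrative names):

```python
# Sketch: resolving a tensor's physical shape with a flat fallback,
# as CopyDataFromTo does for unregistered (non-Hexagon) pointers.
def resolve_physical_shape(registry, data_ptr, total_bytes):
    # registry maps base pointer -> (ndim, nblocks, block_size)
    if data_ptr in registry:
        return registry[data_ptr], True   # True: a managed HexagonBuffer
    return (1, 1, total_bytes), False     # flat view of external memory

registry = {0x1000: (2, 4, 256)}
print(resolve_physical_shape(registry, 0x1000, 1024))  # ((2, 4, 256), True)
print(resolve_physical_shape(registry, 0x2000, 1024))  # ((1, 1, 1024), False)
```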
20 changes: 19 additions & 1 deletion src/runtime/hexagon/hexagon_device_api.h
@@ -22,7 +22,6 @@

#include <tvm/runtime/device_api.h>

-#include <map>
#include <memory>
#include <string>
#include <unordered_map>
@@ -40,6 +39,12 @@ namespace tvm {
namespace runtime {
namespace hexagon {

struct PhysicalShape {
size_t ndim;
size_t nblocks;
size_t block_size;
};

/*!
* \brief Hexagon Device API that is compiled and run on Hexagon.
*/
Expand Down Expand Up @@ -148,6 +153,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
*/
void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;

/*!
* \brief Set the physical shape of a tensor.
*/
void SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, const int64_t* shape);

HexagonThreadManager* ThreadManager() {
CHECK(runtime_threads) << "runtime_threads has not been created";
return runtime_threads.get();
Expand Down Expand Up @@ -178,6 +188,11 @@ class HexagonDeviceAPI final : public DeviceAPI {
return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU);
}

/*!
* \brief Set the physical shape of a tensor (private helper).
*/
void SetPhysicalShape(const void* data, const PhysicalShape&);

//! \brief Manages runtime HexagonBuffer allocations
// runtime_hexbuffs is used for runtime allocations. It is created with a call to
// AcquireResources, and destroyed on ReleaseResources. The buffers in this manager are scoped
Expand All @@ -199,6 +214,9 @@ class HexagonDeviceAPI final : public DeviceAPI {

//! \brief Hexagon power manager
std::unique_ptr<HexagonPowerManager> runtime_power_manager;

//! \brief NDArray base -> Physical Shape map
std::unordered_map<void*, PhysicalShape> ndarray_physical_shape;
};
} // namespace hexagon
} // namespace runtime