From 218fce3b129c16807e2daddeacd72ef89d1fe2ad Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Sun, 11 Feb 2024 11:45:06 +0530 Subject: [PATCH] [VM] [Hexagon] Introduce 2D Discontiguous vtcm alloc tensor Adds 2D Discontiguous alloc tensor hexagon builtin to support 2D allocations for hexagon at relax level. This is needed when the ops are implemented to take advantage of 2d indirections and enables memory manager optimizations to try utilize VTCM memory efficiently. This patch also introduces the `R.vm.copy_tensor` op to support copies between different tensors, specifically planned to be used when copying tensors from one memory scope to another Co-authored-by: arangasa <76030063+arangasa@users.noreply.github.com> --- include/tvm/runtime/memory/memory_manager.h | 4 + python/tvm/relax/op/vm/__init__.py | 2 +- python/tvm/relax/op/vm/vm.py | 20 + src/relax/backend/vm/codegen_vm.cc | 13 + src/relax/backend/vm/codegen_vm_tir.cc | 13 + src/relax/op/op.cc | 16 + src/runtime/hexagon/hexagon_buffer.cc | 18 +- src/runtime/hexagon/hexagon_buffer.h | 13 + src/runtime/hexagon/hexagon_device_api.cc | 74 ++- src/runtime/hexagon/hexagon_device_api.h | 20 +- src/runtime/memory/memory_manager.cc | 10 +- src/runtime/relax_vm/builtin.cc | 4 + src/runtime/relax_vm/hexagon/builtin.cc | 38 ++ .../test_vtcm_alloc_compute_use_offsets.py | 425 ++++++++++++++++++ tests/python/relax/test_vm_codegen_only.py | 36 ++ 15 files changed, 674 insertions(+), 32 deletions(-) create mode 100644 tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 8e4ed4875e631..d759f765b6c8d 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -139,6 +139,10 @@ class StorageObj : public Object { /*! \brief The index into the VM function table. */ Buffer buffer; + /* \brief Common function to create an NDArray container with the provided offset, shape and dtype + */ + NDArray::Container* CreateNDArrayContainer(int64_t offset, ShapeTuple shape, DLDataType dtype); + /*! \brief Allocate an NDArray from a given piece of storage. */ NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype); diff --git a/python/tvm/relax/op/vm/__init__.py b/python/tvm/relax/op/vm/__init__.py index e68ecbcebdf13..03821e46d30d4 100644 --- a/python/tvm/relax/op/vm/__init__.py +++ b/python/tvm/relax/op/vm/__init__.py @@ -16,4 +16,4 @@ # under the License. """Relax vm primitives.""" -from .vm import alloc_storage, alloc_tensor, call_tir_dyn, kill_object +from .vm import alloc_storage, alloc_tensor, call_tir_dyn, copy_tensor_from_to, kill_object diff --git a/python/tvm/relax/op/vm/vm.py b/python/tvm/relax/op/vm/vm.py index 3ed6b29648b4f..bdd0aca4fd4f7 100644 --- a/python/tvm/relax/op/vm/vm.py +++ b/python/tvm/relax/op/vm/vm.py @@ -131,3 +131,23 @@ def call_tir_dyn(func: Expr, args: Tuple) -> Call: args = Tuple(args) return _ffi_api.call_tir_dyn(func, args) # type: ignore + + +@args_converter.auto +def copy_tensor_from_to(src: Expr, dst: Expr) -> Call: + """Construct a call to copy one tensor to another. + + Parameters + ---------- + src : Expr + Source tensor for copy. + + dst : Expr + Destination tensor for copy. + + Returns + ------- + result : Call + A relax Call, which performs the copy. 
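+
+    Examples
+    --------
+    A minimal sketch; ``src`` and ``dst`` are illustrative names for two
+    already-allocated tensors of matching shape and dtype.  Inside a
+    TVMScript relax function the copy is written as::
+
+        _ = R.vm.copy_tensor_from_to(src, dst)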
+ """ + return _ffi_api.copy_tensor_from_to(src, dst) # type: ignore diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index 329da67e84ecd..5023505f5e360 100644 --- a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -169,6 +169,8 @@ class CodeGenVM : public ExprFunctor { EmitAllocStorage(call, dst_reg); } else if (call_node->op == alloc_tensor_op_) { EmitAllocTensor(call, dst_reg); + } else if (call_node->op == copy_tensor_from_to_op_) { + EmitCopyTensor(call, dst_reg); } else if (call_node->op == kill_object_op_) { dst_reg = EmitKillObject(call); } else { @@ -361,6 +363,16 @@ class CodeGenVM : public ExprFunctor { builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg); } + void EmitCopyTensor(const Call& call_node, RegName dst_reg) { + ICHECK_EQ(call_node->args.size(), 2); + std::vector args; + args.reserve(2); + for (Expr arg : call_node->args) { + args.push_back(this->VisitExpr(arg)); + } + builder_->EmitCall("vm.builtin.copy_tensor_from_to", args, dst_reg); + } + RegName EmitKillObject(const Call& call_node) { ICHECK_EQ(call_node->args.size(), 1); Instruction::Arg arg = this->VisitExpr(call_node->args[0]); @@ -430,6 +442,7 @@ class CodeGenVM : public ExprFunctor { /*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. */ const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage"); const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor"); + const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to"); const Op& kill_object_op_ = Op::Get("relax.vm.kill_object"); const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx"); const Op& null_value_op_ = Op::Get("relax.null_value"); diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index ec1678e9e0f31..02c7aabe18cc3 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -238,6 +238,8 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { EmitAllocStorage(call, dst_reg); } else if (call_node->op == alloc_tensor_op_) { EmitAllocTensor(call, dst_reg); + } else if (call_node->op == copy_tensor_from_to_op_) { + EmitCopyTensor(call, dst_reg); } else if (call_node->op == kill_object_op_) { dst_reg = EmitKillObject(call); } else { @@ -414,6 +416,16 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg); } + void EmitCopyTensor(const Call& call_node, int64_t dst_reg) { + ICHECK_EQ(call_node->args.size(), 2); + Array args; + args.reserve(2); + for (Expr arg : call_node->args) { + args.push_back(this->VisitExpr(arg).value()); + } + this->EmitCallPacked("vm.builtin.copy_tensor_from_to", args, dst_reg); + } + int64_t EmitKillObject(const Call& call_node) { ICHECK_EQ(call_node->args.size(), 1); PrimExpr arg = this->VisitExpr(call_node->args[0]).value(); @@ -519,6 +531,7 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { /*! \brief Cache ops that need to be frequently used later to reduce lookup overhead. 
*/ const Op& alloc_storage_op_ = Op::Get("relax.vm.alloc_storage"); const Op& alloc_tensor_op_ = Op::Get("relax.vm.alloc_tensor"); + const Op& copy_tensor_from_to_op_ = Op::Get("relax.vm.copy_tensor_from_to"); const Op& kill_object_op_ = Op::Get("relax.vm.kill_object"); const Op& call_builtin_with_ctx_op_ = Op::Get("relax.call_builtin_with_ctx"); const Op& null_value_op_ = Op::Get("relax.null_value"); diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 489886e50f764..48399662fa0ce 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -1004,6 +1004,22 @@ Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm d TVM_REGISTER_GLOBAL("relax.op.vm.alloc_tensor").set_body_typed(MakeVMAllocTensor); +// vm copy_tensor_from_to + +RELAY_REGISTER_OP("relax.vm.copy_tensor_from_to") + .set_num_inputs(2) + .add_argument("src", "Expr", "The tensor to copy from") + .add_argument("dst", "Expr", "The tensor to copy to") + .set_attr("FInferStructInfo", ReturnVoidStructInfo) + .set_attr("FPurity", Bool(true)); + +Expr MakeVMCopyTensor(Expr src, Expr dst) { + static const Op& op = Op::Get("relax.vm.copy_tensor_from_to"); + return Call(op, {src, dst}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relax.op.vm.copy_tensor_from_to").set_body_typed(MakeVMCopyTensor); + // vm kill_object TVM_REGISTER_OP("relax.vm.kill_object") diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 48afa5770afd1..068fc1854d1d9 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -233,9 +233,9 @@ std::vector MemoryCopy::MergeAdjacent(std::vector micro_ return macro_copies; } -void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src, - size_t bytes_to_copy, bool src_is_hexbuff, - bool dest_is_hexbuff) { +void HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src, + size_t bytes_to_copy, bool src_is_hexbuff, + bool dest_is_hexbuff) { // First, determine all copies that do not cross boundaries in // either source or destination region. 
auto micro_copies = BufferSet::MemoryCopies(dest, src, bytes_to_copy); @@ -268,24 +268,24 @@ void HexagonBuffer::CopyTo(void* data, size_t nbytes) const { BufferSet src(allocations_.data(), allocations_.size(), nbytes_per_allocation_); BufferSet dest(&data, 1, nbytes); - hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */, - false /* dest_is_hexbuff */); + HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */, + false /* dest_is_hexbuff */); } void HexagonBuffer::CopyFrom(void* data, size_t nbytes) { BufferSet src(&data, 1, nbytes); BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - hexagon_buffer_copy_across_regions(dest, src, nbytes, false /* src_is_hexbuff */, - true /* dest_is_hexbuff */); + HexagonBufferCopyAcrossRegions(dest, src, nbytes, false /* src_is_hexbuff */, + true /* dest_is_hexbuff */); } void HexagonBuffer::CopyFrom(const HexagonBuffer& other, size_t nbytes) { BufferSet src(other.allocations_.data(), other.allocations_.size(), other.nbytes_per_allocation_); BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */, - true /* dest_is_hexbuff */); + HexagonBufferCopyAcrossRegions(dest, src, nbytes, true /* src_is_hexbuff */, + true /* dest_is_hexbuff */); } } // namespace hexagon diff --git a/src/runtime/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon_buffer.h index 8cb8a32095141..9d11a50bc60ad 100644 --- a/src/runtime/hexagon/hexagon_buffer.h +++ b/src/runtime/hexagon/hexagon_buffer.h @@ -195,6 +195,19 @@ struct BufferSet { size_t region_size_bytes; }; +/** + * @brief Single function to handle copying potentially discontiguous buffers efficiently + * + * @param The destination buffer + * @param The source buffer + * @param Number of bytes to copy. This should be less than both source and dest buffer size + * @param Boolean to specify whether the source is a hexagon buffer + * @param Boolean to specify whether the destination is a hexagon buffer + */ +void HexagonBufferCopyAcrossRegions(const BufferSet& dest, const BufferSet& src, + size_t bytes_to_copy, bool src_is_hexbuff, + bool dest_is_hexbuff); + } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 65162e7cc6d0d..21944fb03d987 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -32,7 +32,9 @@ #include #include "../workspace_pool.h" +#include "hexagon_buffer.h" #include "hexagon_common.h" +#include "qurt_memory.h" namespace tvm { namespace runtime { @@ -91,23 +93,29 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with " << "HexagonDeviceAPI::AllocDataSpace before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - + void* base_ptr; + PhysicalShape physical_shape; if (ndim == 0) { // Allocate storage for a single scalar value. - return runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); + base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); + physical_shape = {1, 1, typesize}; } else if (ndim == 1) { // Allocate a single, contiguous memory region. 
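+    // Each branch of this function records the buffer's PhysicalShape
+    // (ndim, number of blocks, bytes per block) in physical_shape;
+    // SetPhysicalShape below stores it so that CopyDataFromTo can later
+    // copy to/from this buffer, whether it is a single contiguous block
+    // (as here) or a discontiguous 2-D allocation.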
size_t nbytes = shape[0] * typesize; - return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); + base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); + physical_shape = {1, 1, nbytes}; } else if (ndim == 2) { // Allocate the region(s) needed for Hexagon's indirect-tensor format. size_t nallocs = shape[0]; size_t nbytes = shape[1] * typesize; - return runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, - mem_scope); + base_ptr = + runtime_hexbuffs->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope); + physical_shape = {2, nallocs, nbytes}; } else { return nullptr; // unreachable } + SetPhysicalShape(base_ptr, physical_shape); + return base_ptr; } void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, @@ -121,7 +129,10 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme CHECK(runtime_hexbuffs) << "Attempted to allocate Hexagon data with " << "HexagonDeviceAPI::AllocDataSpace before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - return runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global")); + void* base_ptr = runtime_hexbuffs->AllocateHexagonBuffer(nbytes, alignment, String("global")); + PhysicalShape physical_shape = {1, 1, nbytes}; + SetPhysicalShape(base_ptr, physical_shape); + return base_ptr; } void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { @@ -134,6 +145,7 @@ void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { // occur in the normal course of shutdown, log a message and continue. DLOG(INFO) << "FreeDataSpace called outside a session for " << ptr; } + ndarray_physical_shape.erase(ptr); } // WorkSpace: runtime allocations for Hexagon @@ -157,6 +169,8 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } +void* get_data_start(DLTensor* tensor) { return (reinterpret_cast(tensor->data)); } + void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { CHECK_EQ(from->byte_offset, 0); CHECK_EQ(to->byte_offset, 0); @@ -165,22 +179,44 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan << "HexagonDeviceAPI::CopyDataFromTo before initializing resources. " << "Please call HexagonDeviceAPI::AcquireResources"; - auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { - return runtime_hexbuffs->FindHexagonBuffer(ptr); - }; + auto numBytes = GetDataSize(*from); + + size_t FlatShape = 1; + for (auto i = 0; i < from->ndim; ++i) FlatShape *= from->shape[i]; + + PhysicalShape source_shape = {1, 1, FlatShape}; + PhysicalShape dest_shape = {1, 1, FlatShape}; + auto it1 = ndarray_physical_shape.find(from->data); + if (it1 != ndarray_physical_shape.end()) source_shape = it1->second; + size_t src_rank = source_shape.ndim; + void* src_start = get_data_start(from); + void* dst_start = get_data_start(to); + BufferSet src((src_rank == 1) ? &(src_start) : static_cast(src_start), + source_shape.nblocks, numBytes / source_shape.nblocks); + auto it2 = ndarray_physical_shape.find(to->data); + if (it2 != ndarray_physical_shape.end()) dest_shape = it2->second; + size_t dest_rank = dest_shape.ndim; + BufferSet dest((dest_rank == 1) ? 
&(dst_start) : static_cast(dst_start), + dest_shape.nblocks, numBytes / dest_shape.nblocks); + HexagonBufferCopyAcrossRegions(dest, src, numBytes, (it1 != ndarray_physical_shape.end()), + (it2 != ndarray_physical_shape.end())); + return; +} - HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data); - HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data); +void HexagonDeviceAPI::SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, + const int64_t* shape) { + PhysicalShape physical_shape = {static_cast(ndim), static_cast(shape[0]), + static_cast(shape[1])}; + SetPhysicalShape(tensor->data, physical_shape); +} - if (hex_from_buf && hex_to_buf) { - hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from)); - } else if (hex_to_buf) { - hex_to_buf->CopyFrom(from->data, GetDataSize(*from)); - } else if (hex_from_buf) { - hex_from_buf->CopyTo(to->data, GetDataSize(*to)); +void HexagonDeviceAPI::SetPhysicalShape(const void* data, const PhysicalShape& physical_shape) { + auto it = ndarray_physical_shape.find(const_cast(data)); + if (it != ndarray_physical_shape.end()) { + ndarray_physical_shape[const_cast(data)] = physical_shape; } else { - CHECK(false) << "CopyDataFromTo requested between src and dst which are not managed by the " - "hexagon device api."; + ndarray_physical_shape.insert( + std::pair(const_cast(data), physical_shape)); } } diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index c4e87a957ade9..0560e56534f4d 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -22,7 +22,6 @@ #include -#include #include #include #include @@ -40,6 +39,12 @@ namespace tvm { namespace runtime { namespace hexagon { +struct PhysicalShape { + size_t ndim; + size_t nblocks; + size_t block_size; +}; + /*! * \brief Hexagon Device API that is compiled and run on Hexagon. */ @@ -148,6 +153,11 @@ class HexagonDeviceAPI final : public DeviceAPI { */ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; + /*! + * \brief set physical shape of tensor + */ + void SetPhysicalShape(const DLTensor* tensor, const int64_t ndim, const int64_t* shape); + HexagonThreadManager* ThreadManager() { CHECK(runtime_threads) << "runtime_threads has not been created"; return runtime_threads.get(); @@ -178,6 +188,11 @@ class HexagonDeviceAPI final : public DeviceAPI { return (dev.device_type == kDLHexagon) || (dev.device_type == kDLCPU); } + /*! + * \brief set physical shape of tensor - private helper + */ + void SetPhysicalShape(const void* data, const PhysicalShape&); + //! \brief Manages runtime HexagonBuffer allocations // runtime_hexbuffs is used for runtime allocations. It is created with a call to // AcquireResources, and destroyed on ReleaseResources. The buffers in this manager are scoped @@ -199,6 +214,9 @@ class HexagonDeviceAPI final : public DeviceAPI { //! \brief Hexagon power manager std::unique_ptr runtime_power_manager; + + //! 
\brief NDArray base -> Physical Shape map + std::unordered_map ndarray_physical_shape; }; } // namespace hexagon } // namespace runtime diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 5e3c1ed9e6d4c..aadf227a027f5 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -83,7 +83,8 @@ inline size_t GetDataAlignment(const DLTensor& arr) { return align; } -NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { +NDArray::Container* StorageObj::CreateNDArrayContainer(int64_t offset, ShapeTuple shape, + DLDataType dtype) { VerifyDataType(dtype); // crtical zone: allocate header, cannot throw @@ -92,7 +93,6 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt container->dl_tensor.byte_offset = offset; container->SetDeleter(StorageObj::Deleter); - size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); this->IncRef(); // The manager context pointer must continue to point to the storage object // which owns the backing memory, and keeps track of the reference count. @@ -101,6 +101,12 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt // reference count, then destroy the container, but leave the underlying // buffer intact. container->manager_ctx = reinterpret_cast(this); + return container; +} + +NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { + auto* container = CreateNDArrayContainer(offset, shape, dtype); + size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); if (this->buffer.device.device_type == kDLHexagon) { // For Hexagon, non-zero offset support simply requires adjusting the diff --git a/src/runtime/relax_vm/builtin.cc b/src/runtime/relax_vm/builtin.cc index fb24a3699d87a..9334cbefa5106 100644 --- a/src/runtime/relax_vm/builtin.cc +++ b/src/runtime/relax_vm/builtin.cc @@ -410,6 +410,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.reshape").set_body_typed([](NDArray data, ShapeT return data.CreateView(new_shape, data->dtype); }); +TVM_REGISTER_GLOBAL("vm.builtin.copy_tensor_from_to").set_body_typed([](NDArray src, NDArray dst) { + dst.CopyFrom(src); +}); + TVM_REGISTER_GLOBAL("vm.builtin.null_value").set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_EQ(args.size(), 0); *rv = nullptr; diff --git a/src/runtime/relax_vm/hexagon/builtin.cc b/src/runtime/relax_vm/hexagon/builtin.cc index d18c434193be3..fb0ce4dc512b9 100644 --- a/src/runtime/relax_vm/hexagon/builtin.cc +++ b/src/runtime/relax_vm/hexagon/builtin.cc @@ -57,6 +57,44 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_wait") ICHECK(inflight_dma >= 0); tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id, inflight_dma); }); + +NDArray AllocNDArrayFromOffsets(TVMArgValue vm_ptr, Storage storage, uint64_t offset, + Storage data_storage, ShapeTuple data_storage_offsets, + ShapeTuple logical_shape, ShapeTuple shape_2d, DLDataType dtype) { + auto* storage_obj = storage.operator->(); + + auto* container = storage_obj->CreateNDArrayContainer(offset, logical_shape, dtype); + + size_t needed_size = sizeof(void*) * shape_2d[0]; + auto offset_ptr = + reinterpret_cast(storage_obj->buffer.data) + static_cast(offset); + auto cast_offset_ptr = reinterpret_cast(offset_ptr); + uint8_t* data_base = reinterpret_cast(data_storage->buffer.data); + size_t indx = 0; + for (auto elem : data_storage_offsets) { + cast_offset_ptr[indx] = 
&(data_base[static_cast(elem)]); + indx++; + } + + container->dl_tensor.data = reinterpret_cast(offset_ptr); + container->dl_tensor.byte_offset = 0; + + NDArray ret(GetObjectPtr(container)); + // RAII in effect, now run the check. + + ICHECK((offset + sizeof(void*) * shape_2d[0]) <= storage_obj->buffer.size) + << "storage allocation failure, attempted to allocate " << needed_size << " at offset " + << offset << " in region that is " << storage_obj->buffer.size << "bytes"; + + tvm::runtime::hexagon::HexagonDeviceAPI::Global()->SetPhysicalShape( + ret.operator->(), 2, const_cast(shape_2d.data())); + + return ret; +} + +TVM_REGISTER_GLOBAL("vm.builtin.hexagon.alloc_discontiguous_tensor") + .set_body_typed(AllocNDArrayFromOffsets); + } // namespace relax_vm } // namespace runtime } // namespace tvm diff --git a/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py b/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py new file mode 100644 index 0000000000000..167cb22d6fcc1 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm_alloc_compute_use_offsets.py @@ -0,0 +1,425 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
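+
+# The modules below allocate discontiguous VTCM tensors through the
+# "vm.builtin.hexagon.alloc_discontiguous_tensor" builtin via
+# R.call_builtin_with_ctx (see AllocNDArrayFromOffsets in
+# src/runtime/relax_vm/hexagon/builtin.cc).  The VM context is passed
+# implicitly; the explicit arguments, as used in these tests, are:
+#   storage               -- storage holding the table of per-block pointers
+#   offset                -- byte offset of that pointer table within storage
+#   data_storage          -- VTCM storage backing the actual data blocks
+#   data_storage_offsets  -- byte offset of each data block within data_storage
+#   logical_shape         -- logical tensor shape seen at the relax level
+#   shape_2d              -- physical 2-D shape (number of blocks, elements per block)
+#   dtype                 -- element data type
+#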
+# pylint: disable=unused-wildcard-import, invalid-name, missing-docstring,no-self-argument +"""Test Discontiguous allocation for hexagon""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +import tvm.script +import tvm.testing +from tvm import relax +from tvm.script.parser import ir as I +from tvm.script.parser import relax as R +from tvm.script.parser import tir as T + + +@I.ir_module +class Module: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm") + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm") + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm") + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] + B[v_ax0, v_ax1] + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm") + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm") + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm") + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] * B[v_ax0, v_ax1] + + @R.function + def main( + x: R.Tensor((4, 64), "int32"), + y: R.Tensor((4, 64), "int32"), + z: R.Tensor((4, 64), "int32"), + ) -> R.Tensor((4, 64), "int32"): + cls = Module + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + a: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=1024, shape=R.shape([4, 64]), dtype="int32" + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + c: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=2048, shape=R.shape([4, 64]), dtype="int32" + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + d: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + ___1: R.Tuple = R.vm.copy_tensor_from_to(z, d) + e: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=1024, shape=R.shape([4, 64]), dtype="int32" + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, e) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + f: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_obj, offset=2048, shape=R.shape([4, 64]), dtype="int32" + ) + _t13: R.Tuple = R.vm.copy_tensor_from_to(e, f) + _t14: R.Tuple = R.vm.kill_object(e) + ret_val: R.Tensor([4, 64], dtype="int32") = R.builtin.alloc_tensor( + R.shape([4, 64]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(f, ret_val) + _t15: R.Tuple = R.vm.kill_object(f) + _t3: R.Tuple = R.vm.kill_object(vtcm_obj) + lv: R.Tensor([4, 64], dtype="int32") = ret_val + return lv + + +@I.ir_module +class Module_2d: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", 
scope="global.vtcm", axis_separators=[1]) + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] + B[v_ax0, v_ax1] + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n = T.int32(), T.int32() + A = T.match_buffer(a, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + B = T.match_buffer(b, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + C = T.match_buffer(c, (m, n), "int32", scope="global.vtcm", axis_separators=[1]) + for ax0, ax1 in T.grid(m, n): + with T.block("T_add"): + v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[v_ax0, v_ax1], B[v_ax0, v_ax1]) + T.writes(C[v_ax0, v_ax1]) + C[v_ax0, v_ax1] = A[v_ax0, v_ax1] * B[v_ax0, v_ax1] + + @R.function + def main( + x: R.Tensor((4, 64), "int32"), + y: R.Tensor((4, 64), "int32"), + z: R.Tensor((4, 64), "int32"), + ) -> R.Tensor((4, 64), "int32"): + cls = Module_2d + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + global_obj: R.Object = R.vm.alloc_storage( + R.shape([64]), runtime_device_index=0, dtype="uint8", storage_scope="global" + ) + a: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + + c: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([512, 0, 2048, 3840]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + + d: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___1: R.Tuple = R.vm.copy_tensor_from_to(z, d) + vtcm_2d_obj: R.Object = R.vm.alloc_storage( + R.shape([4, 64]), runtime_device_index=0, dtype="int32", storage_scope="global.vtcm" + ) + vtcm_2d_tensor: R.Tensor([4, 64], dtype="int32") = R.vm.alloc_tensor( + vtcm_2d_obj, offset=0, shape=R.shape([4, 64]), dtype="int32" + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, vtcm_2d_tensor) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + + e: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _t21: R.Tuple = R.vm.copy_tensor_from_to(vtcm_2d_tensor, e) + 
_t22: R.Tuple = R.vm.kill_object(vtcm_2d_tensor) + _t23: R.Tuple = R.vm.kill_object(vtcm_2d_obj) + f: R.Tensor([4, 64], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([4, 64]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _t13: R.Tuple = R.vm.copy_tensor_from_to(e, f) + _t14: R.Tuple = R.vm.kill_object(e) + ret_val: R.Tensor([4, 64], dtype="int32") = R.builtin.alloc_tensor( + R.shape([4, 64]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(f, ret_val) + _t15: R.Tuple = R.vm.kill_object(f) + _t16: R.Tuple = R.vm.kill_object(vtcm_obj) + _t17: R.Tuple = R.vm.kill_object(global_obj) + lv: R.Tensor([4, 64], dtype="int32") = ret_val + return lv + + +@I.ir_module +class NDLogicalShapesModule: + @T.prim_func + def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n, o, p = T.int32(), T.int32(), T.int32(), T.int32() + A = T.match_buffer(a, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + B = T.match_buffer(b, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + C = T.match_buffer(c, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + for ax0, ax1, ax2, ax3 in T.grid(m, n, o, p): + with T.block("T_add"): + v_ax0, v_ax1, v_ax2, v_ax3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(A[v_ax0, v_ax1, v_ax2, v_ax3], B[v_ax0, v_ax1, v_ax2, v_ax3]) + T.writes(C[v_ax0, v_ax1, v_ax2, v_ax3]) + C[v_ax0, v_ax1, v_ax2, v_ax3] = ( + A[v_ax0, v_ax1, v_ax2, v_ax3] + B[v_ax0, v_ax1, v_ax2, v_ax3] + ) + + @T.prim_func + def compute_mul_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None: + m, n, o, p = T.int32(), T.int32(), T.int32(), T.int32() + A = T.match_buffer(a, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + B = T.match_buffer(b, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + C = T.match_buffer(c, (m, n, o, p), "int32", scope="global.vtcm", axis_separators=[2]) + for ax0, ax1, ax2, ax3 in T.grid(m, n, o, p): + with T.block("T_add"): + v_ax0, v_ax1, v_ax2, v_ax3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(A[v_ax0, v_ax1, v_ax2, v_ax3], B[v_ax0, v_ax1, v_ax2, v_ax3]) + T.writes(C[v_ax0, v_ax1, v_ax2, v_ax3]) + C[v_ax0, v_ax1, v_ax2, v_ax3] = ( + A[v_ax0, v_ax1, v_ax2, v_ax3] * B[v_ax0, v_ax1, v_ax2, v_ax3] + ) + + @R.function + def main( + x: R.Tensor((2, 2, 8, 8), "int32"), + y: R.Tensor((2, 2, 8, 8), "int32"), + z: R.Tensor((2, 2, 8, 8), "int32"), + ) -> R.Tensor((2, 2, 8, 8), "int32"): + cls = NDLogicalShapesModule + vtcm_obj: R.Object = R.vm.alloc_storage( + R.shape([4096]), runtime_device_index=0, dtype="uint8", storage_scope="global.vtcm" + ) + global_obj: R.Object = R.vm.alloc_storage( + R.shape([64]), runtime_device_index=0, dtype="uint8", storage_scope="global" + ) + a: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, a) + b: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(y, b) + c: 
R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 32, + vtcm_obj, + R.shape([512, 0, 2048, 3840]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c) + _t1: R.Tuple = R.vm.kill_object(a) + _t2: R.Tuple = R.vm.kill_object(b) + d: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 0, + vtcm_obj, + R.shape([1536, 1280, 3328, 2560]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + _: R.Tuple = R.vm.copy_tensor_from_to(z, d) + e: R.Tensor([2, 2, 8, 8], dtype="int32") = R.call_builtin_with_ctx( + "vm.builtin.hexagon.alloc_discontiguous_tensor", + [ + global_obj, + 16, + vtcm_obj, + R.shape([768, 256, 2304, 3072]), + R.shape([2, 2, 8, 8]), + R.shape([4, 64]), + "int32", + ], + sinfo_args=[], + ) + ___2: R.Tuple = cls.compute_mul_in_vtcm(c, d, e) + _t2: R.Tuple = R.vm.kill_object(c) + _t12: R.Tuple = R.vm.kill_object(d) + ret_val: R.Tensor([2, 2, 8, 8], dtype="int32") = R.builtin.alloc_tensor( + R.shape([2, 2, 8, 8]), R.dtype("int32"), R.prim_value(0) + ) + _1: R.Tuple = R.vm.copy_tensor_from_to(e, ret_val) + _t14: R.Tuple = R.vm.kill_object(e) + _t16: R.Tuple = R.vm.kill_object(vtcm_obj) + _t17: R.Tuple = R.vm.kill_object(global_obj) + lv: R.Tensor([2, 2, 8, 8], dtype="int32") = ret_val + return lv + + +class TestVTCMAlloc: + """Tests for VTCM Alloc, Compute and Copy""" + + mode = tvm.testing.parameter("bytecode", "compiled") + (module, in_shape) = tvm.testing.parameters( + (Module_2d, (4, 64)), + (Module, (4, 64)), + (NDLogicalShapesModule, (2, 2, 8, 8)), + ) + + @tvm.testing.requires_hexagon + def test_vtcm_alloc_compute(self, hexagon_launcher, mode, module, in_shape): + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + with tvm.transform.PassContext(opt_level=3, config=[], instruments=[]): + ex = relax.build(mod=module, target=target, exec_mode=mode) + + with hexagon_launcher.create_session() as session: + dev = session.device + input_arg0_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + input_arg1_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + input_arg2_data = np.random.randint(0, 9, size=in_shape, dtype="int32") + output_data = np.multiply(np.add(input_arg0_data, input_arg1_data), input_arg2_data) + vm_mod = session.get_executor_from_factory(ex) + vm_rt = relax.VirtualMachine( + vm_mod, dev, "naive" + ) # Use naive allocator to exercise VTCM allocation in relax + data0 = tvm.nd.array(input_arg0_data, dev) + data1 = tvm.nd.array(input_arg1_data, dev) + data2 = tvm.nd.array(input_arg2_data, dev) + vm_rt.set_input("main", data0, data1, data2) + vm_rt.invoke_stateful("main") + hexagon_output = vm_rt.get_outputs("main").numpy() + tvm.testing.assert_allclose(output_data, hexagon_output) diff --git a/tests/python/relax/test_vm_codegen_only.py b/tests/python/relax/test_vm_codegen_only.py index 0d461f0713c21..5093f2bd29882 100644 --- a/tests/python/relax/test_vm_codegen_only.py +++ b/tests/python/relax/test_vm_codegen_only.py @@ -359,6 +359,42 @@ def main(x: R.Tensor((3, 4), "float32")): tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-7, atol=1e-7) +@pytest.mark.parametrize("exec_mode", EXEC_MODE) +def test_vm_builtin_copy_tensor_from_to(exec_mode): + @tvm.script.ir_module + class TestVMCopyTensor: + @R.function + 
def main(x: R.Tensor((3, 4), "float32")): + R.func_attr({"global_symbol": "main"}) + storage: R.Object = R.vm.alloc_storage( + R.shape( + [ + 48, + ] + ), + R.prim_value(0), + R.dtype("uint8"), + ) + ret_val: R.Tensor((3, 4), dtype="float32") = R.vm.alloc_tensor( + storage, R.prim_value(0), R.shape([3, 4]), R.dtype("float32") + ) + __: R.Tuple = R.vm.copy_tensor_from_to(x, ret_val) + lv: R.Tensor([3, 4], dtype="float32") = ret_val + return lv + + mod = TestVMCopyTensor + target = tvm.target.Target("llvm", host="llvm") + ex = codegen(mod, target, exec_mode) + dev = tvm.cpu() + vm = relax.VirtualMachine(ex, dev) + + input_np = np.random.rand(3, 4).astype("float32") + input = tvm.nd.array(input_np, dev) + res = vm["main"](input) + expected = input_np + tvm.testing.assert_allclose(res.numpy(), expected, rtol=1e-7, atol=1e-7) + + @pytest.mark.parametrize("exec_mode", EXEC_MODE) def test_vm_kill_object(exec_mode): @I.ir_module