From c17a89b8549fc9b547a72bd3c5a6d50f1dcb4da3 Mon Sep 17 00:00:00 2001 From: Krishna Bindumadhavan Date: Mon, 28 Oct 2024 05:38:00 +0000 Subject: [PATCH] [mrvl][runtime]: Support Marvell Hardware Runtime Signed-off-by: Krishna Bindumadhavan Change-Id: Id9052552fbb3f19a53462183967d30679f3aa286 --- cmake/modules/contrib/Mrvl.cmake | 1 + docs/how_to/deploy/mrvl.rst | 33 +- python/tvm/relay/op/contrib/mrvl.py | 17 + src/relay/backend/contrib/mrvl/codegen.cc | 26 +- src/runtime/contrib/mrvl/mrvl_hw_runtime.cc | 485 ++++++++++++++++++++ 5 files changed, 555 insertions(+), 7 deletions(-) create mode 100644 src/runtime/contrib/mrvl/mrvl_hw_runtime.cc diff --git a/cmake/modules/contrib/Mrvl.cmake b/cmake/modules/contrib/Mrvl.cmake index 8bf48e02ca21..52907011e632 100644 --- a/cmake/modules/contrib/Mrvl.cmake +++ b/cmake/modules/contrib/Mrvl.cmake @@ -20,6 +20,7 @@ if(USE_MRVL) message(STATUS "Build with Mrvl support") file(GLOB RUNTIME_MRVL_SRCS src/runtime/contrib/mrvl/mrvl_runtime.cc + src/runtime/contrib/mrvl/mrvl_hw_runtime.cc src/runtime/contrib/mrvl/mrvl_sw_runtime_lib.cc ) list(APPEND RUNTIME_SRCS ${RUNTIME_MRVL_SRCS}) diff --git a/docs/how_to/deploy/mrvl.rst b/docs/how_to/deploy/mrvl.rst index 7b41e2ee3a74..a0876fbe5aec 100644 --- a/docs/how_to/deploy/mrvl.rst +++ b/docs/how_to/deploy/mrvl.rst @@ -100,11 +100,11 @@ integrated MLIP cn10ka processor, using only 4 tiles in the block. python3 -m tvm.driver.tvmc compile --target="mrvl, llvm" \ --target-llvm-mtriple=aarch64-linux-gnu --target-llvm-mcpu=neoverse-n2 \ --target-mrvl-num_tiles=4 \ + --target-mrvl-mattr="hw -quantize=fp16 -wb_pin_ocm=1" \ --cross-compiler aarch64-linux-gnu-gcc \ --output model.tar \ mnist-12.onnx -The runtime support for hardware acceleration is a WIP, it will be added in future PR. 3.3. TVMC Compiler: mrvl specific Command Line Options ------------------------------------------------------ @@ -125,7 +125,7 @@ The runtime support for hardware acceleration is a WIP, it will be added in futu Maximum number of tiles that may be used, possible values = {1,2,4,8}, defaults to 8 * mattr: - Attributes for mrvl; possible values = {quantize, wb_pin_ocm} + Attributes for mrvl; possible values = {quantize, wb_pin_ocm, run_mode} mattr specifies the data type, code generation options and optimizations. @@ -141,8 +141,13 @@ The runtime support for hardware acceleration is a WIP, it will be added in futu Optimize runtime by preloading a model's weights and bias into the on chip memory. Possible values = {0, 1}. Default is 0 (no preload) -4. Compile ONNX model for Simulator + LLVM / x86_64 target ----------------------------------------------------------- + **3. run_mode** + + Specify whether to compile for the simulator or for the target hardware (Octeon). + Possible values = {sim, hw}. Default is sim (software simulator). + +4. Compile ONNX model using the TVMC flow +----------------------------------------- In the TVMC mrvl flow, the model is partitioned into Marvell and LLVM regions. Building each partitioned Marvell subgraph generates serialized nodes.json and @@ -150,6 +155,9 @@ const.json. Partitioned nodes.json is the representation of the model graph whic suitable for the Marvell compiler (mrvl-tmlc). The compiler compiles the model graph to generate the model binary with MLIP instructions. +4.1 Compile and Run ONNX model for Simulator + LLVM / x86_64 target +-------------------------------------------------------------------- + **Model Compilation for Simulator + LLVM / x86_64 target** .. code:: python @@ -165,6 +173,23 @@ Generated model binary is simulated using Marvell's MLIP Simulator(mrvl-mlsim). python3 -m tvm.driver.tvmc run --inputs infer.npz --outputs predict.npz model.tar --number=0 +4.2 Compile and Run ONNX model for Octeon target +---------------------------------------------------------- + +**Model Compilation for Octeon target** + +Please refer to section 3.2 for the example command line. + +**Run TVM models on the Octeon Target** + +The cross compiled binary can be run on the target hardware using the tvmc run command. +Alternatively, the RPC flow enables remote execution on the target device from your +local machine: https://tvm.apache.org/docs/how_to/tutorials/cross_compilation_and_rpc.html + +.. code:: python + + python3 -m tvm.driver.tvmc run --inputs infer.npz --outputs predict.npz model.tar + 5. Compiling a model using Python APIs -------------------------------------- diff --git a/python/tvm/relay/op/contrib/mrvl.py b/python/tvm/relay/op/contrib/mrvl.py index b13cf3d9533d..6100fcb991c0 100644 --- a/python/tvm/relay/op/contrib/mrvl.py +++ b/python/tvm/relay/op/contrib/mrvl.py @@ -272,6 +272,8 @@ def add_attributes(mod, annotate_target_str, **kwargs): mod : module with attributes """ working_dir = mrvl_contrib.get_working_dir() + sim_attr_found = False + hw_attr_found = False if "mattr" in kwargs: base_opts_str = kwargs.get("mattr") @@ -286,6 +288,14 @@ def add_attributes(mod, annotate_target_str, **kwargs): if "wb_pin_ocm" not in base_opts_str: base_opts_str = f"{base_opts_str} -wb_pin_ocm=0" + if "sim" in base_opts_str: + sim_attr_found = True + base_opts_str = base_opts_str.replace("sim", "") + + if "hw" in base_opts_str: + hw_attr_found = True + base_opts_str = base_opts_str.replace("hw", "") + else: base_opts_str = "-arch=mlip -quantize=fp16 -wb_pin_ocm=0" @@ -294,6 +304,12 @@ def add_attributes(mod, annotate_target_str, **kwargs): elif "num_tiles" not in base_opts_str: base_opts_str = f"{base_opts_str} -num_tiles=8" + mode_string = "sim" + if sim_attr_found: + mode_string = "sim" + elif hw_attr_found: + mode_string = "hw" + for var in mod.get_global_vars(): func_name = var.name_hint func = mod[func_name] @@ -301,6 +317,7 @@ def add_attributes(mod, annotate_target_str, **kwargs): if annotate_target_str in func_name: func = func.with_attr("working_dir", working_dir) func = func.with_attr("compiler_opts_string", base_opts_str) + func = func.with_attr("mode", mode_string) mod.update_func(var, func) return mod diff --git a/src/relay/backend/contrib/mrvl/codegen.cc b/src/relay/backend/contrib/mrvl/codegen.cc index 96121e4b4b69..7c410f565fcd 100644 --- a/src/relay/backend/contrib/mrvl/codegen.cc +++ b/src/relay/backend/contrib/mrvl/codegen.cc @@ -1467,6 +1467,7 @@ runtime::Module MrvlCompiler(const ObjectRef& ref) { Function func = Downcast(ref); std::string func_name = backend::GetExtSymbol(func); + const std::string mrvl_run_mode = func->GetAttr("mode").value(); runtime::Module runtime_lib; // Extract attributes from the frontend to be passed to the runtime @@ -1485,13 +1486,32 @@ runtime::Module MrvlCompiler(const ObjectRef& ref) { std::string modified_json = (*modifyConsts)(nodes_json_string, consts_json_string); auto json_vec = split(modified_json, '|'); + // Extract attributes from the nodes_json by key-value lookup using Python API + // These are passed to hardware runtime module for initialization + const tvm::runtime::PackedFunc* json_lookup; + json_lookup = runtime::Registry::Get("tvm.mrvl.find_value_in_KV_pair"); + const std::string string_inp = (*json_lookup)(nodes_json_string, "num_subgraph_inputs"); + const int num_inputs = std::stoi(string_inp); + const std::string string_out = (*json_lookup)(nodes_json_string, "num_subgraph_outputs"); + const int num_outputs = std::stoi(string_out); + const std::string string_bsize = (*json_lookup)(nodes_json_string, "batch_size"); + const int batch_size = std::stoi(string_bsize); + // Invoke Marvell Backend compiler to generate binary for sub graph const auto* compile = runtime::Registry::Get("tvm.mrvl.CompileModel"); std::string bin = (*compile)(func_name, json_vec[0], json_vec[1], compiler_opt); - const auto* pf = runtime::Registry::Get("runtime.mrvl_runtime_create"); - ICHECK(pf != nullptr) << "Cannot find software simulator runtime module to create"; - runtime_lib = (*pf)(func_name, json_vec[0], bin); + if (mrvl_run_mode == "sim") { + const auto* pf = runtime::Registry::Get("runtime.mrvl_runtime_create"); + ICHECK(pf != nullptr) << "Cannot find software simulator runtime module to create"; + runtime_lib = (*pf)(func_name, json_vec[0], bin); + } else if (mrvl_run_mode == "hw") { + const auto* pf = runtime::Registry::Get("runtime.mrvl_hw_runtime_create"); + ICHECK(pf != nullptr) << "Cannot find hardware runtime module to create"; + runtime_lib = (*pf)(func_name, json_vec[0], bin, num_inputs, num_outputs, batch_size); + } else { + ICHECK(0) << "Unrecognized Marvell Run Mode! " << mrvl_run_mode; + } return runtime_lib; } diff --git a/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc b/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc new file mode 100644 index 000000000000..84b178b3139f --- /dev/null +++ b/src/runtime/contrib/mrvl/mrvl_hw_runtime.cc @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/mrvl/mrvl_hw_runtime.cc + * \brief runtime implementation for Marvell Target. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../../src/support/base64.h" +#include "mrvl_base64.h" + +#define MRVL_LIBMLDPC_DEFAULT_PATH "/usr/lib/libmldpc.so" + +namespace tvm { +namespace runtime { +namespace contrib { + +enum buffer_type { input_quantize = 0, input_dequantize, output_quantize, output_dequantize }; +enum model_type { TVM = 0, MLIP }; + +struct run_args { + int model_id; + void* i_q_buf; + void* o_q_buf; + int num_batches; + void* device; + uint16_t layer_idx; +}; + +void* device_handle; +int model_id; + +/* Marvell DPDK Interface library callbacks for TVMC */ +extern "C" typedef int (*mrvl_tvmc_ml_init_ptr)(int argc, char* argv[]); +extern "C" typedef int (*mrvl_tvmc_ml_finish_ptr)(void); +extern "C" typedef int (*mrvl_tvmc_ml_model_load_ptr)(char* model_buffer, int model_size); +extern "C" typedef int (*mrvl_tvmc_ml_model_unload_ptr)(int model_id); +extern "C" typedef void* (*mrvl_tvmc_ml_io_alloc_ptr)(int model_id, enum buffer_type dt, + uint64_t* size); +extern "C" typedef void (*mrvl_tvmc_ml_io_free_ptr)(int model_id, enum buffer_type dt, void* addr); +extern "C" typedef int (*mrvl_tvmc_ml_model_quantize_ptr)(int model_id, void* dbuffer, + void* qbuffer); +extern "C" typedef int (*mrvl_tvmc_ml_model_dequantize_ptr)(int model_id, void* qbuffer, + void* dbuffer); +extern "C" typedef int (*mrvl_tvmc_ml_model_run_ptr)(int model_id, void* input_buffer, + void* output_buffer, int num_batches); + +struct ml_tvmc_cb { + void* handle; + mrvl_tvmc_ml_init_ptr mrvl_tvmc_ml_init; + mrvl_tvmc_ml_finish_ptr mrvl_tvmc_ml_finish; + mrvl_tvmc_ml_model_load_ptr mrvl_tvmc_ml_model_load; + mrvl_tvmc_ml_model_unload_ptr mrvl_tvmc_ml_model_unload; + mrvl_tvmc_ml_io_alloc_ptr mrvl_tvmc_ml_io_alloc; + mrvl_tvmc_ml_io_free_ptr mrvl_tvmc_ml_io_free; + mrvl_tvmc_ml_model_quantize_ptr mrvl_tvmc_ml_model_quantize; + mrvl_tvmc_ml_model_dequantize_ptr mrvl_tvmc_ml_model_dequantize; + mrvl_tvmc_ml_model_run_ptr mrvl_tvmc_ml_model_run; +}; + +/* DPDK callback functions */ +extern "C" typedef int (*mrvl_dpdk_glow_layer_load_cb)(void* device, uint16_t model_id, + const char* layer_name, uint8_t* buffer, + size_t size, uint16_t* index); +extern "C" typedef int (*mrvl_dpdk_glow_layer_unload_cb)(void* device, uint16_t model_id, + const char* layer_name); + +extern "C" typedef int (*mrvl_dpdk_io_alloc_cb)(void* device, uint16_t model_id, + const char* layer_name, uint64_t** input_qbuffer, + uint64_t** output_qbuffer); +extern "C" typedef int (*mrvl_dpdk_io_free_cb)(void* device, uint16_t model_id, + const char* layer_name); + +extern "C" typedef int (*mrvl_dpdk_malloc_cb)(const char* name, size_t size, uint32_t align, + void** addr); +extern "C" typedef int (*mrvl_dpdk_free_cb)(const char* name); + +extern "C" typedef int (*mrvl_dpdk_quantize_cb)(void* device, uint16_t model_id, + const char* layer_name, const DLTensor** deq_tensor, + void* qbuffer); +extern "C" typedef int (*mrvl_dpdk_dequantize_cb)(void* device, uint16_t model_id, + const char* layer_name, void* qbuffer, + const DLTensor** deq_tensor); +extern "C" typedef int (*mrvl_dpdk_inference_cb)(void* device, uint16_t index, void* input, + void* output, uint16_t nb_batches); + +/* Call back functions structure */ +struct ml_dpdk_cb { + mrvl_dpdk_glow_layer_load_cb mrvl_dpdk_glow_layer_load; + mrvl_dpdk_glow_layer_unload_cb mrvl_dpdk_glow_layer_unload; + mrvl_dpdk_io_alloc_cb mrvl_dpdk_io_alloc; + mrvl_dpdk_io_free_cb mrvl_dpdk_io_free; + mrvl_dpdk_malloc_cb mrvl_dpdk_malloc; + mrvl_dpdk_free_cb mrvl_dpdk_free; + mrvl_dpdk_quantize_cb mrvl_dpdk_quantize; + mrvl_dpdk_dequantize_cb mrvl_dpdk_dequantize; + mrvl_dpdk_inference_cb mrvl_dpdk_inference; +}; + +void get_tvmc_callbacks(const char* so_path, ml_tvmc_cb* obj) { + obj->handle = dlopen(so_path, RTLD_LAZY); + if (obj->handle == nullptr) + ICHECK(false) << "Marvell-Runtime-ERROR Loading shared library failed"; + + obj->mrvl_tvmc_ml_init = (mrvl_tvmc_ml_init_ptr)dlsym(obj->handle, "mrvl_ml_init"); + obj->mrvl_tvmc_ml_finish = (mrvl_tvmc_ml_finish_ptr)dlsym(obj->handle, "mrvl_ml_finish"); + obj->mrvl_tvmc_ml_model_load = + (mrvl_tvmc_ml_model_load_ptr)dlsym(obj->handle, "mrvl_ml_model_load"); + obj->mrvl_tvmc_ml_model_unload = + (mrvl_tvmc_ml_model_unload_ptr)dlsym(obj->handle, "mrvl_ml_model_unload"); + obj->mrvl_tvmc_ml_io_alloc = (mrvl_tvmc_ml_io_alloc_ptr)dlsym(obj->handle, "mrvl_ml_io_alloc"); + obj->mrvl_tvmc_ml_io_free = (mrvl_tvmc_ml_io_free_ptr)dlsym(obj->handle, "mrvl_ml_io_free"); + obj->mrvl_tvmc_ml_model_quantize = + (mrvl_tvmc_ml_model_quantize_ptr)dlsym(obj->handle, "mrvl_ml_model_quantize"); + obj->mrvl_tvmc_ml_model_dequantize = + (mrvl_tvmc_ml_model_dequantize_ptr)dlsym(obj->handle, "mrvl_ml_model_dequantize"); + obj->mrvl_tvmc_ml_model_run = (mrvl_tvmc_ml_model_run_ptr)dlsym(obj->handle, "mrvl_ml_model_run"); +} + +/*! + * \brief A json runtime that compiles the serialized JSON format to a binary for Marvell +hardware and then runs the generated binary on the target hardware. + * \param symbol_name The name of the subgraph / relay function + * \param nodes_json The serialized JSON representation of relay function + * \param bin_code The binary code generated by the Marvell backend compiler for the subgraph + * \param input_count Number of subgraph inputs + * \param output_count Number of subgraph outputs + * \param batch_size Batch count + * + */ + +class MarvellHardwareModuleNode : public ModuleNode { + public: + MarvellHardwareModuleNode(const std::string& symbol_name, const std::string& nodes_json, + const std::string& bin_code, const int input_count, + const int output_count, const int batch_size) + : symbol_name_(symbol_name), + nodes_json_(nodes_json), + bin_code_(bin_code), + num_inputs_(input_count), + num_outputs_(output_count) { + run_arg.num_batches = batch_size; + } + + ~MarvellHardwareModuleNode() { + if (use_dpdk_cb) { + int ret; + + // Deallocate input quantize and output quantize buffer + ret = dpdk_cb_.mrvl_dpdk_io_free(device_handle, run_arg.model_id, symbol_name_.c_str()); + + ICHECK(ret == 0) << "IO free failed, model_id =" << run_arg.model_id; + + // Unload model + ret = dpdk_cb_.mrvl_dpdk_glow_layer_unload(run_arg.device, run_arg.model_id, + symbol_name_.c_str()); + ICHECK(ret == 0) << "Model layer unload failed, model_id =" << run_arg.model_id; + num_loaded--; + } else { + // Clean Up + if (tvmc_cb_.handle != nullptr) { + // Deallocate input quantize and dequant buffer + tvmc_cb_.mrvl_tvmc_ml_io_free(run_arg.model_id, input_quantize, run_arg.i_q_buf); + tvmc_cb_.mrvl_tvmc_ml_io_free(run_arg.model_id, input_dequantize, i_d_buf); + // Deallocate output quantize and dequant buffer + tvmc_cb_.mrvl_tvmc_ml_io_free(run_arg.model_id, output_quantize, run_arg.o_q_buf); + tvmc_cb_.mrvl_tvmc_ml_io_free(run_arg.model_id, output_dequantize, o_d_buf); + // Unload model + tvmc_cb_.mrvl_tvmc_ml_model_unload(run_arg.model_id); + num_loaded--; + } + // All models unloaded; finish the session + if (tvmc_cb_.handle != nullptr && num_loaded == 0) tvmc_cb_.mrvl_tvmc_ml_finish(); + } + } + + const char* type_key() const { return "mrvl_hw"; } + + int GetPropertyMask() const final { + return ModulePropertyMask::kBinarySerializable | ModulePropertyMask::kRunnable; + } + + /*! + * \brief Get a packed function. + * \param name The name/symbol of the function. + * \param sptr_to_self The pointer to the module node. + * \return The packed function. + */ + virtual PackedFunc GetFunction(const String& name, const ObjectPtr& sptr_to_self) { + if (name == "get_symbol") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; }); + } else if (name == "register_cb") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + struct ml_dpdk_cb* a = static_cast(args[0].value().v_handle); + memcpy(&dpdk_cb_, a, sizeof(struct ml_dpdk_cb)); + device_handle = args[1].value().v_handle; + model_id = args[2]; + use_dpdk_cb = true; + }); + } else if (name == "get_const_vars") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = Array{}; }); + } else if (this->symbol_name_ == name) { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + RunInference(args); + *rv = 0; + }); + } else if ("__init_" + this->symbol_name_ == name) { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + run_arg.device = device_handle; + run_arg.model_id = model_id; + load_and_initialize_model(); + *rv = 0; + }); + } + return PackedFunc(nullptr); + } + + virtual void SaveToBinary(dmlc::Stream* stream) { + // Save the symbol name and other data and serialize them to + // binary format. + stream->Write(symbol_name_); + stream->Write(nodes_json_); + stream->Write(bin_code_); + stream->Write(num_inputs_); + stream->Write(num_outputs_); + stream->Write(run_arg.num_batches); + } + + static Module LoadFromBinary(void* strm) { + dmlc::Stream* stream = static_cast(strm); + std::string symbol_name; + std::string nodes_json; + std::string bin_code; + int num_inputs, num_outputs, batch_size; + + // Load the symbol_name and other data to construct the module + ICHECK(stream->Read(&symbol_name)) << "Loading symbol name failed"; + ICHECK(stream->Read(&nodes_json)) << "Loading nodes json failed"; + ICHECK(stream->Read(&bin_code)) << "Loading binary code failed"; + ICHECK(stream->Read(&num_inputs)) << "Loading num_inputs failed"; + ICHECK(stream->Read(&num_outputs)) << "Loading num_outputs failed"; + ICHECK(stream->Read(&batch_size)) << "Loading batch_size failed"; + auto n = make_object(symbol_name, nodes_json, bin_code, num_inputs, + num_outputs, batch_size); + return Module(n); + } + + /*! + * \brief Get the source generated by codegen. + * + * \param format the format to return. + * \return A string of JSON. + */ + String GetSource(const String& format = "json") override { return nodes_json_; } + + protected: + std::string symbol_name_; + std::string nodes_json_; + std::string bin_code_; + int num_inputs_; + int num_outputs_; + static ml_tvmc_cb tvmc_cb_; + static ml_dpdk_cb dpdk_cb_; + static bool initialized_model; + static int num_loaded; + void* i_d_buf = nullptr; + void* o_d_buf = nullptr; + struct run_args run_arg; + static bool use_dpdk_cb; + + void RunInference_TVMC(TVMArgs args) { + float* i_d_buf_float; + float* o_d_buf_float; + const DLTensor* tensor; + + i_d_buf_float = reinterpret_cast(i_d_buf); + for (int in = 0; in < num_inputs_; in++) { + if (args[in].IsObjectRef()) { + NDArray arr = args[in]; + tensor = arr.operator->(); + } else { + tensor = args[in].operator DLTensor*(); + } + + if (num_inputs_ == 1) { + // Perform Quantization + tvmc_cb_.mrvl_tvmc_ml_model_quantize( + run_arg.model_id, reinterpret_cast(tensor->data) + tensor->byte_offset, + run_arg.i_q_buf); + } else { + uint64_t in_tot_dim = 1; + + for (int i = 0; i < tensor->ndim; i++) { + in_tot_dim *= tensor->shape[i]; + } + + memcpy(i_d_buf_float, tensor->data, sizeof(float) * in_tot_dim); + i_d_buf_float += in_tot_dim; + } + } + + if (num_inputs_ > 1) { + // Perform Quantization + tvmc_cb_.mrvl_tvmc_ml_model_quantize(run_arg.model_id, i_d_buf, run_arg.i_q_buf); + } + + tvmc_cb_.mrvl_tvmc_ml_model_run(run_arg.model_id, run_arg.i_q_buf, run_arg.o_q_buf, + run_arg.num_batches); + + const DLTensor* outTensor; + int out = num_inputs_; + + if (num_outputs_ == 1) { + if (args[out].IsObjectRef()) { + NDArray arr = args[out]; + outTensor = arr.operator->(); + } else { + outTensor = args[out].operator DLTensor*(); + } + tvmc_cb_.mrvl_tvmc_ml_model_dequantize( + run_arg.model_id, run_arg.o_q_buf, + (reinterpret_cast(outTensor->data) + outTensor->byte_offset)); + + } else { + tvmc_cb_.mrvl_tvmc_ml_model_dequantize(run_arg.model_id, run_arg.o_q_buf, o_d_buf); + o_d_buf_float = reinterpret_cast(o_d_buf); + + for (out = num_inputs_; out < args.size(); out++) { + int out_tot_dim = 1; + if (args[out].IsObjectRef()) { + NDArray arr = args[out]; + outTensor = arr.operator->(); + } else { + outTensor = args[out].operator DLTensor*(); + } + + for (int i = 0; i < outTensor->ndim; i++) { + out_tot_dim *= outTensor->shape[i]; + } + + memcpy(outTensor->data, o_d_buf_float, sizeof(float) * out_tot_dim); + o_d_buf_float += out_tot_dim; + } + } + } + + void RunInference_DPDK(TVMArgs args) { + const DLTensor* tensor[64]; + + for (int in = 0; in < num_inputs_; in++) { + if (args[in].IsObjectRef()) { + NDArray arr = args[in]; + tensor[in] = arr.operator->(); + } else { + tensor[in] = args[in].operator DLTensor*(); + } + } + + dpdk_cb_.mrvl_dpdk_quantize(run_arg.device, run_arg.model_id, symbol_name_.c_str(), tensor, + run_arg.i_q_buf); + + dpdk_cb_.mrvl_dpdk_inference(run_arg.device, run_arg.layer_idx, run_arg.i_q_buf, + run_arg.o_q_buf, run_arg.num_batches); + + int i = 0; + for (int out = num_inputs_; out < args.size(); out++) { + if (args[out].IsObjectRef()) { + NDArray arr = args[out]; + tensor[i] = arr.operator->(); + } else { + tensor[i] = args[out].operator DLTensor*(); + } + i++; + } + + dpdk_cb_.mrvl_dpdk_dequantize(run_arg.device, run_arg.model_id, symbol_name_.c_str(), + run_arg.o_q_buf, tensor); + } + + void RunInference(TVMArgs args) { + if (use_dpdk_cb) + RunInference_DPDK(args); + else + RunInference_TVMC(args); + } + + void load_and_initialize_model() { + // Load dll and get the APIs from Library + if (!(use_dpdk_cb) && !(initialized_model)) { + char* libpath = getenv("MRVL_LIBMLDPC_PATH"); + if (libpath == nullptr) { + std::string str = MRVL_LIBMLDPC_DEFAULT_PATH; + libpath = new char[str.length() + 1]; + snprintf(libpath, str.length() + 1, "%s", str.c_str()); + } + std::cout << "MRVL_LIBMLDPC_PATH: " << libpath << std::endl; + get_tvmc_callbacks(const_cast(libpath), &tvmc_cb_); + int argc = 1; + char* argv[] = {const_cast("tvmc")}; + tvmc_cb_.mrvl_tvmc_ml_init(argc, argv); + initialized_model = true; + } + + // Create byte array to pass to the init function + int num_bytes = tvm::runtime::contrib::mrvl::b64strlen(bin_code_); + std::vector byte_array(num_bytes); + tvm::runtime::contrib::mrvl::b64decode(bin_code_, byte_array.data()); + + if (use_dpdk_cb) { + int ret; + ret = dpdk_cb_.mrvl_dpdk_glow_layer_load( + run_arg.device, run_arg.model_id, symbol_name_.c_str(), + reinterpret_cast(byte_array.data()), num_bytes, &run_arg.layer_idx); + ICHECK(ret == 0) << "Model layer load failed, model_id =" << run_arg.model_id; + num_loaded++; + + // Allocate input quantize and output quantize buffer + ret = dpdk_cb_.mrvl_dpdk_io_alloc(device_handle, run_arg.model_id, symbol_name_.c_str(), + reinterpret_cast(&run_arg.i_q_buf), + reinterpret_cast(&run_arg.o_q_buf)); + ICHECK(ret == 0) << "IO alloc failed, model_id =" << run_arg.model_id; + } else { + // Load the model + run_arg.model_id = + tvmc_cb_.mrvl_tvmc_ml_model_load(reinterpret_cast(byte_array.data()), num_bytes); + ICHECK(run_arg.model_id >= 0) << "Failed to load model!"; + num_loaded++; + // Allocate input quantize and dequant buffer + run_arg.i_q_buf = tvmc_cb_.mrvl_tvmc_ml_io_alloc(run_arg.model_id, input_quantize, nullptr); + i_d_buf = tvmc_cb_.mrvl_tvmc_ml_io_alloc(run_arg.model_id, input_dequantize, nullptr); + // Allocate output quantize and dequant buffer + run_arg.o_q_buf = tvmc_cb_.mrvl_tvmc_ml_io_alloc(run_arg.model_id, output_quantize, nullptr); + o_d_buf = tvmc_cb_.mrvl_tvmc_ml_io_alloc(run_arg.model_id, output_dequantize, nullptr); + } + } +}; + +runtime::Module MarvellHardwareModuleRuntimeCreate(const String& symbol_name, + const String& nodes_json, const String& bin_code, + int num_input, int num_output, int batch_size) { + auto n = make_object(symbol_name, nodes_json, bin_code, num_input, + num_output, batch_size); + return runtime::Module(n); +} + +bool MarvellHardwareModuleNode::initialized_model = false; +int MarvellHardwareModuleNode::num_loaded = 0; +bool MarvellHardwareModuleNode::use_dpdk_cb = false; +ml_tvmc_cb MarvellHardwareModuleNode::tvmc_cb_ = {}; +ml_dpdk_cb MarvellHardwareModuleNode::dpdk_cb_ = {}; + +TVM_REGISTER_GLOBAL("runtime.mrvl_hw_runtime_create") + .set_body_typed(MarvellHardwareModuleRuntimeCreate); +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_mrvl_hw") + .set_body_typed(MarvellHardwareModuleNode::LoadFromBinary); +} // namespace contrib +} // namespace runtime +} // namespace tvm