From b5d3187d1cb5b62b2fd5fe449ea614da6e9ecc65 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Date: Sat, 18 May 2024 14:13:24 -0700
Subject: [PATCH] Make it possible to load both CPU and CUDA models using the
 same runner (#815)

Wrap the attempt to load a model in `try {} catch (std::runtime_error) {}`
and attempt to create the model on GPU first, since an attempt to load a CPU
model on CUDA destroys the CUDA context (bug fixes against PyTorch are
coming, tracked in https://github.com/pytorch/pytorch/issues/126547).

Also, fix two bugs in the repo:
- Initialize `Tokenizer::initialized_` to false
- Change the name of the tokenizer file in a workflow from `tokenizer.bin`
  to `tokenizer.model`

Fixes https://github.com/pytorch/torchchat/issues/709

Test plan:
```
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model_cpu.so --device cpu
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model.so
./cmake-out/aoti_run ./model.so -z checkpoints/stories15M/tokenizer.model
./cmake-out/aoti_run ./model_cpu.so -z checkpoints/stories15M/tokenizer.model
```
---
 .github/workflows/runner-cuda-dtype.yml |  2 +-
 runner/aoti.cmake                       |  4 +++-
 runner/run.cpp                          | 24 +++++++++++++++++-------
 tokenizer/tokenizer.h                   |  2 +-
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml
index 76b340999..334b12f1e 100644
--- a/.github/workflows/runner-cuda-dtype.yml
+++ b/.github/workflows/runner-cuda-dtype.yml
@@ -58,7 +58,7 @@ jobs:
 
           python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
 
-          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 
           echo "**********************************************"
           echo "******** INT4 HQQ group-wise quantized *******"
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index b5b4f4352..2ae9a4079 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -13,8 +13,10 @@ if(Torch_FOUND)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")
 
   add_executable(aoti_run runner/run.cpp)
-
   target_compile_options(aoti_run PUBLIC -D__AOTI_MODEL__)
+  if(DEFINED TORCH_CUDA_LIBRARIES)
+    target_compile_options(aoti_run PUBLIC -DUSE_CUDA)
+  endif()
   target_include_directories(aoti_run PRIVATE ${TORCHCHAT_ROOT}/runner)
   target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
   set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
diff --git a/runner/run.cpp b/runner/run.cpp
index e572bfe99..54e334c51 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -24,7 +24,10 @@
 
 #ifdef __AOTI_MODEL__
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-torch::Device cpu_device(torch::kCPU);
+#ifdef USE_CUDA
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
+#endif
+torch::Device aoti_device(torch::kCPU);
 
 #else // __ET_MODEL__
 #include <executorch/extension/module/module.h>
@@ -82,7 +85,7 @@ typedef struct {
   RunState state; // buffers for the "wave" of activations in the forward pass
 
 #ifdef __AOTI_MODEL__
-  torch::inductor::AOTIModelContainerRunnerCpu* runner;
+  torch::inductor::AOTIModelContainerRunner* runner;
 #else // __ET_MODEL__
   Module* runner;
 #endif
@@ -132,9 +135,16 @@ void build_transformer(
   malloc_run_state(&t->state, &t->config);
 
 #ifdef __AOTI_MODEL__
-  t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(
-      /* path to model DSO */ model_path,
-      /* thread pool size */ 1);
+#ifdef USE_CUDA
+  try {
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
+    aoti_device = torch::Device(torch::kCUDA);
+  } catch (std::runtime_error& e) {
+#else
+  {
+#endif
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
+  }
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
@@ -186,11 +196,11 @@ float* forward(Transformer* transformer, int token, int pos) {
   torch::Tensor token_tensor =
       torch::from_blob(token_buffer, {1, 1}, torch::kLong);
   torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
-  std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};
+  std::vector<torch::Tensor> inputs{token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};
 
   torch::Tensor result = transformer->runner->run(inputs)[0]
                              .to(torch::dtype(torch::kFloat32))
-                             .to(cpu_device);
+                             .to(torch::kCPU);
   auto logits = result[0].data_ptr();
 #else // __ET_MODEL__
   ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h
index 08a81ced8..9e1977b71 100644
--- a/tokenizer/tokenizer.h
+++ b/tokenizer/tokenizer.h
@@ -50,7 +50,7 @@ class Tokenizer {
   }
 
  protected:
-  bool initialized_;
+  bool initialized_ = false;
   int32_t vocab_size_;
   uint64_t bos_tok_, eos_tok_;
 };
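
For reviewers who want the loading logic in isolation: below is a minimal sketch of the CUDA-first/CPU-fallback pattern that `build_transformer()` adopts in this patch. It assumes a libtorch build with the AOTInductor runner headers and that `USE_CUDA` is defined as in `runner/aoti.cmake`; the helper name `load_runner` and the `device` out-parameter are illustrative and not part of the torchchat code.

```
#include <stdexcept>
#include <string>

#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#ifdef USE_CUDA
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif

// Hypothetical helper mirroring the patch: try the CUDA runner first and fall
// back to the CPU runner, recording which device input tensors must live on.
torch::inductor::AOTIModelContainerRunner* load_runner(
    const std::string& model_path,
    torch::Device& device) {
#ifdef USE_CUDA
  try {
    // Try the CUDA runner first. Per the commit message, attempting a CPU
    // model load on CUDA destroys the CUDA context (pytorch/pytorch#126547),
    // so the GPU attempt has to come before any CPU fallback; the patch
    // relies on this constructor throwing std::runtime_error for a CPU DSO.
    auto* runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
    device = torch::Device(torch::kCUDA);
    return runner;
  } catch (const std::runtime_error&) {
    // CPU-only DSO: fall through to the CPU runner below.
  }
#endif
  device = torch::Device(torch::kCPU);
  return new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
}
```

A caller would construct `torch::Device dev(torch::kCPU);`, call `load_runner(path, dev)`, and move input tensors to `dev` before `runner->run(...)`, which is what the `forward()` change above does with `aoti_device`.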