Make it possible to load both CPU and CUDA models using same runner (#815)

This is done by wrapping the attempt to load the model in `try {} catch (std::runtime_error) {}` and trying to create the model on the GPU first, as an attempt to load a CPU model on CUDA destroys the CUDA context (bugs/fixes against PyTorch are coming, tracked in pytorch/pytorch#126547).
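
A condensed sketch of the resulting load logic (the `load_runner` helper is only for illustration; in the actual change the logic lives in `build_transformer` in `runner/run.cpp`, shown in the diff below):

```
#include <stdexcept>
#include <string>

#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#ifdef USE_CUDA
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif

// Try the CUDA runner first; if the DSO cannot be loaded on the GPU the
// constructor throws std::runtime_error and we fall back to the CPU runner.
// aoti_device records where inputs must be moved before calling run().
torch::inductor::AOTIModelContainerRunner* load_runner(
    const std::string& model_path,
    torch::Device& aoti_device) {
#ifdef USE_CUDA
  try {
    auto* runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
    aoti_device = torch::Device(torch::kCUDA);
    return runner;
  } catch (std::runtime_error& e) {
#else
  {
#endif
    aoti_device = torch::Device(torch::kCPU);
    return new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
  }
}
```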

Also, fix two bugs in the repo:
 - Initialize `Tokenizer::initialized_` to false
 - Change name of the tokenizer file in a workflow from `tokenizer.bin` to `tokenizer.model`


Fixes #709

Test plan:
```
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model_cpu.so --device cpu
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model.so
./cmake-out/aoti_run ./model.so -z checkpoints/stories15M/tokenizer.model
./cmake-out/aoti_run ./model_cpu.so -z checkpoints/stories15M/tokenizer.model
```
malfet committed Jul 17, 2024
1 parent a31c5ee commit b5d3187
Showing 4 changed files with 22 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runner-cuda-dtype.yml
@@ -58,7 +58,7 @@ jobs:
python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
-./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
echo "**********************************************"
echo "******** INT4 HQQ group-wise quantized *******"
4 changes: 3 additions & 1 deletion runner/aoti.cmake
@@ -13,8 +13,10 @@ if(Torch_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")

add_executable(aoti_run runner/run.cpp)

target_compile_options(aoti_run PUBLIC -D__AOTI_MODEL__)
+if(DEFINED TORCH_CUDA_LIBRARIES)
+target_compile_options(aoti_run PUBLIC -DUSE_CUDA)
+endif()
target_include_directories(aoti_run PRIVATE ${TORCHCHAT_ROOT}/runner)
target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
24 changes: 17 additions & 7 deletions runner/run.cpp
@@ -24,7 +24,10 @@

#ifdef __AOTI_MODEL__
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-torch::Device cpu_device(torch::kCPU);
+#ifdef USE_CUDA
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
+#endif
+torch::Device aoti_device(torch::kCPU);

#else // __ET_MODEL__
#include <executorch/extension/module/module.h>
@@ -82,7 +85,7 @@ typedef struct {
RunState state; // buffers for the "wave" of activations in the forward pass

#ifdef __AOTI_MODEL__
-torch::inductor::AOTIModelContainerRunnerCpu* runner;
+torch::inductor::AOTIModelContainerRunner* runner;
#else // __ET_MODEL__
Module* runner;
#endif
@@ -132,9 +135,16 @@ void build_transformer(
malloc_run_state(&t->state, &t->config);

#ifdef __AOTI_MODEL__
-t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(
-/* path to model DSO */ model_path,
-/* thread pool size */ 1);
+#ifdef USE_CUDA
+try {
+t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
+aoti_device = torch::Device(torch::kCUDA);
+} catch (std::runtime_error& e) {
+#else
+{
+#endif
+t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
+}
#else //__ET_MODEL__
t->runner = new Module(
/* path to PTE model */ model_path,
@@ -186,11 +196,11 @@ float* forward(Transformer* transformer, int token, int pos) {
torch::Tensor token_tensor =
torch::from_blob(token_buffer, {1, 1}, torch::kLong);
torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
-std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};
+std::vector<torch::Tensor> inputs{token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};

torch::Tensor result = transformer->runner->run(inputs)[0]
.to(torch::dtype(torch::kFloat32))
-.to(cpu_device);
+.to(torch::kCPU);
auto logits = result[0].data_ptr();
#else // __ET_MODEL__
ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
2 changes: 1 addition & 1 deletion tokenizer/tokenizer.h
@@ -50,7 +50,7 @@ class Tokenizer {
}

protected:
-bool initialized_;
+bool initialized_ = false;
int32_t vocab_size_;
uint64_t bos_tok_, eos_tok_;
};
