From b5d3187d1cb5b62b2fd5fe449ea614da6e9ecc65 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Date: Sat, 18 May 2024 14:13:24 -0700
Subject: [PATCH] Make it possible to load both CPU and CUDA models using the
 same runner (#815)

Wrap the attempt to load a model in `try {} catch (std::runtime_error) {}`
and attempt to create the model on GPU first, since an attempt to load a CPU
model on CUDA destroys the CUDA context (bug fixes against PyTorch are
coming, tracked in https://github.com/pytorch/pytorch/issues/126547).

Also, fix two bugs in the repo:
- Initialize `Tokenizer::initialized_` to false
- Change the name of the tokenizer file in a workflow from `tokenizer.bin`
  to `tokenizer.model`

Fixes https://github.com/pytorch/torchchat/issues/709

Test plan:
```
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model_cpu.so --device cpu
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model.so
./cmake-out/aoti_run ./model.so -z checkpoints/stories15M/tokenizer.model
./cmake-out/aoti_run ./model_cpu.so -z checkpoints/stories15M/tokenizer.model
```
---
 .github/workflows/runner-cuda-dtype.yml |  2 +-
 runner/aoti.cmake                       |  4 +++-
 runner/run.cpp                          | 24 +++++++++++++++++-------
 tokenizer/tokenizer.h                   |  2 +-
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml
index 76b340999..334b12f1e 100644
--- a/.github/workflows/runner-cuda-dtype.yml
+++ b/.github/workflows/runner-cuda-dtype.yml
@@ -58,7 +58,7 @@ jobs:
 
           python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
 
-          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 
           echo "**********************************************"
           echo "******** INT4 HQQ group-wise quantized *******"
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index b5b4f4352..2ae9a4079 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -13,8 +13,10 @@ if(Torch_FOUND)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")
 
   add_executable(aoti_run runner/run.cpp)
-
   target_compile_options(aoti_run PUBLIC -D__AOTI_MODEL__)
+  if(DEFINED TORCH_CUDA_LIBRARIES)
+    target_compile_options(aoti_run PUBLIC -DUSE_CUDA)
+  endif()
   target_include_directories(aoti_run PRIVATE ${TORCHCHAT_ROOT}/runner)
   target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
   set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
diff --git a/runner/run.cpp b/runner/run.cpp
index e572bfe99..54e334c51 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -24,7 +24,10 @@
 
 #ifdef __AOTI_MODEL__
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-torch::Device cpu_device(torch::kCPU);
+#ifdef USE_CUDA
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
+#endif
+torch::Device aoti_device(torch::kCPU);
 
 #else // __ET_MODEL__
 #include <executorch/extension/module/module.h>
@@ -82,7 +85,7 @@ typedef struct {
   RunState state; // buffers for the "wave" of activations in the forward pass
 
 #ifdef __AOTI_MODEL__
-  torch::inductor::AOTIModelContainerRunnerCpu* runner;
+  torch::inductor::AOTIModelContainerRunner* runner;
 #else // __ET_MODEL__
   Module* runner;
 #endif
@@ -132,9 +135,16 @@ void build_transformer(
   malloc_run_state(&t->state, &t->config);
 
 #ifdef __AOTI_MODEL__
-  t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(
-      /* path to model DSO */ model_path,
-      /* thread pool size */ 1);
+#ifdef USE_CUDA
+  try {
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
+    aoti_device = torch::Device(torch::kCUDA);
+  } catch (std::runtime_error& e) {
+#else
+  {
+#endif
+    t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
+  }
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
@@ -186,11 +196,11 @@ float* forward(Transformer* transformer, int token, int pos) {
   torch::Tensor token_tensor =
       torch::from_blob(token_buffer, {1, 1}, torch::kLong);
   torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
-  std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};
+  std::vector<torch::Tensor> inputs{token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};
 
   torch::Tensor result = transformer->runner->run(inputs)[0]
                              .to(torch::dtype(torch::kFloat32))
-                             .to(cpu_device);
+                             .to(torch::kCPU);
   auto logits = result[0].data_ptr();
 #else // __ET_MODEL__
   ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h
index 08a81ced8..9e1977b71 100644
--- a/tokenizer/tokenizer.h
+++ b/tokenizer/tokenizer.h
@@ -50,7 +50,7 @@ class Tokenizer {
   }
 
  protected:
-  bool initialized_;
+  bool initialized_ = false;
   int32_t vocab_size_;
   uint64_t bos_tok_, eos_tok_;
 };
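
For reviewers who want the loading logic in isolation: below is a minimal sketch of the CUDA-first/CPU-fallback pattern that `build_transformer()` adopts in this patch. It assumes a libtorch build with the AOTInductor runner headers and that `USE_CUDA` is defined as in `runner/aoti.cmake`; the helper name `load_runner` and the `device` out-parameter are illustrative and not part of the torchchat code.

```
#include <stdexcept>
#include <string>

#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#ifdef USE_CUDA
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif

// Hypothetical helper mirroring the patch: try the CUDA runner first and fall
// back to the CPU runner, recording which device input tensors must live on.
torch::inductor::AOTIModelContainerRunner* load_runner(
    const std::string& model_path,
    torch::Device& device) {
#ifdef USE_CUDA
  try {
    // Try the CUDA runner first. Per the commit message, attempting a CPU
    // model load on CUDA destroys the CUDA context (pytorch/pytorch#126547),
    // so the GPU attempt has to come before any CPU fallback; the patch
    // relies on this constructor throwing std::runtime_error for a CPU DSO.
    auto* runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
    device = torch::Device(torch::kCUDA);
    return runner;
  } catch (const std::runtime_error&) {
    // CPU-only DSO: fall through to the CPU runner below.
  }
#endif
  device = torch::Device(torch::kCPU);
  return new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
}
```

A caller would construct `torch::Device dev(torch::kCPU);`, call `load_runner(path, dev)`, and move input tensors to `dev` before `runner->run(...)`, which is what the `forward()` change above does with `aoti_device`.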