Make it possible to load both CPU and CUDA models using same runner (#815)

This is done by wrapping the attempt to load the model in `try {} catch (std::runtime_error) {}` and trying to create the model on the GPU first, as an attempt to load a CPU model on CUDA destroys the CUDA context (bugs/fixes against PyTorch are coming, tracked in pytorch/pytorch#126547).
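
A condensed sketch of the resulting load logic (the `load_runner` helper is only for illustration; in the actual change the logic lives in `build_transformer` in `runner/run.cpp`, shown in the diff below):

```
#include <stdexcept>
#include <string>

#include <torch/torch.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#ifdef USE_CUDA
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif

// Try the CUDA runner first; if the DSO cannot be loaded on the GPU the
// constructor throws std::runtime_error and we fall back to the CPU runner.
// aoti_device records where inputs must be moved before calling run().
torch::inductor::AOTIModelContainerRunner* load_runner(
    const std::string& model_path,
    torch::Device& aoti_device) {
#ifdef USE_CUDA
  try {
    auto* runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
    aoti_device = torch::Device(torch::kCUDA);
    return runner;
  } catch (std::runtime_error& e) {
#else
  {
#endif
    aoti_device = torch::Device(torch::kCPU);
    return new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
  }
}
```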

Also, fix two bugs in the repo:
 - Initialize `Tokenizer::initialized_` to false
 - Change name of the tokenizer file in a workflow from `tokenizer.bin` to `tokenizer.model`


Fixes #709

Test plan:
```
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model_cpu.so --device cpu
python3 torchchat.py export --checkpoint-path checkpoints/stories15M/model.pth --output-dso-path model.so
./cmake-out/aoti_run ./model.so -z checkpoints/stories15M/tokenizer.model
./cmake-out/aoti_run ./model_cpu.so -z checkpoints/stories15M/tokenizer.model
```
malfet committed Jul 17, 2024
1 parent a31c5ee commit b5d3187
Showing 4 changed files with 22 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runner-cuda-dtype.yml
@@ -58,7 +58,7 @@ jobs:
python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
-./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"
+./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
echo "**********************************************"
echo "******** INT4 HQQ group-wise quantized *******"
4 changes: 3 additions & 1 deletion runner/aoti.cmake
@@ -13,8 +13,10 @@ if(Torch_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")

add_executable(aoti_run runner/run.cpp)

target_compile_options(aoti_run PUBLIC -D__AOTI_MODEL__)
+if(DEFINED TORCH_CUDA_LIBRARIES)
+target_compile_options(aoti_run PUBLIC -DUSE_CUDA)
+endif()
target_include_directories(aoti_run PRIVATE ${TORCHCHAT_ROOT}/runner)
target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
24 changes: 17 additions & 7 deletions runner/run.cpp
@@ -24,7 +24,10 @@

#ifdef __AOTI_MODEL__
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-torch::Device cpu_device(torch::kCPU);
+#ifdef USE_CUDA
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
+#endif
+torch::Device aoti_device(torch::kCPU);

#else // __ET_MODEL__
#include <executorch/extension/module/module.h>
@@ -82,7 +85,7 @@ typedef struct {
RunState state; // buffers for the "wave" of activations in the forward pass

#ifdef __AOTI_MODEL__
-torch::inductor::AOTIModelContainerRunnerCpu* runner;
+torch::inductor::AOTIModelContainerRunner* runner;
#else // __ET_MODEL__
Module* runner;
#endif
@@ -132,9 +135,16 @@ void build_transformer(
malloc_run_state(&t->state, &t->config);

#ifdef __AOTI_MODEL__
-t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(
-/* path to model DSO */ model_path,
-/* thread pool size */ 1);
+#ifdef USE_CUDA
+try {
+t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
+aoti_device = torch::Device(torch::kCUDA);
+} catch (std::runtime_error& e) {
+#else
+{
+#endif
+t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
+}
#else //__ET_MODEL__
t->runner = new Module(
/* path to PTE model */ model_path,
@@ -186,11 +196,11 @@ float* forward(Transformer* transformer, int token, int pos) {
torch::Tensor token_tensor =
torch::from_blob(token_buffer, {1, 1}, torch::kLong);
torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
-std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};
+std::vector<torch::Tensor> inputs{token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};

torch::Tensor result = transformer->runner->run(inputs)[0]
.to(torch::dtype(torch::kFloat32))
-.to(cpu_device);
+.to(torch::kCPU);
auto logits = result[0].data_ptr();
#else // __ET_MODEL__
ManagedTensor pos_managed(pos_buffer, sizeof(int64_t), {1}, ScalarType::Long);
2 changes: 1 addition & 1 deletion tokenizer/tokenizer.h
@@ -50,7 +50,7 @@ class Tokenizer {
}

protected:
-bool initialized_;
+bool initialized_ = false;
int32_t vocab_size_;
uint64_t bos_tok_, eos_tok_;
};
