From 949248fb9427a064413a0b282f456fcda4111cda Mon Sep 17 00:00:00 2001
From: kingbri
Date: Wed, 14 Feb 2024 21:44:04 -0500
Subject: [PATCH] Config: Add experimental torch cuda malloc backend

This option can save some VRAM, but has a chance of erroring out. Add it
to the experimental section of the config.

Signed-off-by: kingbri
---
 common/args.py    | 5 +++++
 config_sample.yml | 6 +++++-
 main.py           | 6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/common/args.py b/common/args.py
index a0745aee..d3f19d8d 100644
--- a/common/args.py
+++ b/common/args.py
@@ -140,3 +140,8 @@ def add_developer_args(parser: argparse.ArgumentParser):
         type=str_to_bool,
         help="Disables API request streaming",
     )
+    developer_group.add_argument(
+        "--cuda-malloc-backend",
+        type=str_to_bool,
+        help="Enables the experimental torch CUDA malloc backend",
+    )
diff --git a/config_sample.yml b/config_sample.yml
index a8750b68..96dd2fd8 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -35,7 +35,7 @@ sampling:
   # WARNING: Using this can result in a generation speed penalty
   #override_preset:
 
-# Options for development
+# Options for development and experimentation
 developer:
   # Skips exllamav2 version check (default: False)
   # It's highly recommended to update your dependencies rather than enabling this flag
@@ -46,6 +46,10 @@ developer:
   # A kill switch for turning off SSE in the API server
   #disable_request_streaming: False
 
+  # Enable the torch CUDA malloc backend (default: False)
+  # This can save a few MB of VRAM, but carries a risk of errors. Use at your own risk.
+  #cuda_malloc_backend: False
+
 # Options for model overrides and loading
 model:
   # Overrides the directory to look for models (default: models)
diff --git a/main.py b/main.py
index edb17477..85907ef8 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""
+import os
 import pathlib
 import uvicorn
 from asyncio import CancelledError
@@ -600,6 +601,11 @@ def entrypoint(args: Optional[dict] = None):
     else:
         check_exllama_version()
 
+    # Enable the experimental CUDA malloc backend
+    if unwrap(developer_config.get("cuda_malloc_backend"), False):
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+        logger.warning("Enabled the experimental CUDA malloc backend.")
+
     network_config = get_network_config()
 
     # Initialize auth keys
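
For context: the new code path switches PyTorch's CUDA allocator to the
cudaMallocAsync backend through the PYTORCH_CUDA_ALLOC_CONF environment
variable, which only takes effect if it is set before torch initializes
CUDA; that is why the patch sets it in entrypoint() before any model is
loaded. Below is a minimal standalone sketch (not part of this patch) to
confirm the backend took effect, assuming a CUDA-enabled torch build
recent enough to expose torch.cuda.get_allocator_backend():

    import os

    # Must be set before the first CUDA call, mirroring what the patch
    # does in entrypoint() before any model loads.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

    import torch

    if torch.cuda.is_available():
        torch.empty(1024, device="cuda")  # force allocator initialization
        # Prints "cudaMallocAsync" if the override took effect,
        # "native" otherwise.
        print(torch.cuda.get_allocator_backend())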
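
With the patch applied, the option can be enabled either by uncommenting
cuda_malloc_backend in the config file (copied from config_sample.yml)
and setting it to True, or with the new CLI flag, e.g.
"python main.py --cuda-malloc-backend True". Since the environment
variable is read once at CUDA initialization, changing the setting
requires a server restart.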