
fix: disable chunked prefill and prefix caching for multimodal models (
AlpinDale authored Dec 27, 2024
1 parent c951a54 commit 271879a
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion aphrodite/engine/args_tools.py
@@ -956,6 +956,13 @@ def create_engine_config(self, ) -> EngineConfig:
             override_neuron_config=self.override_neuron_config
         )
 
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+                self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,
@@ -989,7 +996,9 @@ def create_engine_config(self, ) -> EngineConfig:
         # If not explicitly set, enable chunked prefill by default for
         # long context (> 32K) models. This is to avoid OOM errors in the
         # initial memory profiling phase.
-        if use_long_context:
+        # Chunked prefill is currently disabled for multimodal models by
+        # default.
+        if use_long_context and not model_config.is_multimodal_model:
             is_gpu = device_config.device_type == "cuda"
             use_sliding_window = (model_config.get_sliding_window()
                                   is not None)
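For readers who want to see the change outside the diff context, below is a minimal, self-contained sketch of the two guards this commit introduces. The classes FakeEngineArgs and FakeModelConfig are hypothetical stand-ins, not Aphrodite's real EngineArgs or ModelConfig; only the two conditionals mirror the diff, everything else is illustrative.

import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class FakeModelConfig:
    # Stand-in for the real ModelConfig; only the flag the diff checks.
    is_multimodal_model: bool = True


@dataclass
class FakeEngineArgs:
    # Stand-in for the real EngineArgs; None means "not explicitly set".
    enable_prefix_caching: bool = True
    enable_chunked_prefill: Optional[bool] = None

    def apply_multimodal_overrides(self, model_config: FakeModelConfig,
                                   use_long_context: bool) -> None:
        # First guard from the diff: prefix caching is forced off for
        # multimodal models, with a warning if the user requested it.
        if model_config.is_multimodal_model:
            if self.enable_prefix_caching:
                logger.warning(
                    "--enable-prefix-caching is currently not "
                    "supported for multimodal models and has been disabled.")
                self.enable_prefix_caching = False

        # Second guard from the diff: the long-context default that turns
        # chunked prefill on no longer applies to multimodal models.
        if use_long_context and not model_config.is_multimodal_model:
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = True


args = FakeEngineArgs()
args.apply_multimodal_overrides(FakeModelConfig(), use_long_context=True)
print(args.enable_prefix_caching)   # False: disabled for the multimodal model
print(args.enable_chunked_prefill)  # None: not enabled by the default path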
