
fix: disable chunked prefill and prefix caching for multimodal models (
AlpinDale authored Dec 27, 2024
1 parent c951a54 commit 271879a
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion aphrodite/engine/args_tools.py
@@ -956,6 +956,13 @@ def create_engine_config(self, ) -> EngineConfig:
             override_neuron_config=self.override_neuron_config
         )
 
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+                self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,
@@ -989,7 +996,9 @@ def create_engine_config(self, ) -> EngineConfig:
         # If not explicitly set, enable chunked prefill by default for
         # long context (> 32K) models. This is to avoid OOM errors in the
         # initial memory profiling phase.
-        if use_long_context:
+        # Chunked prefill is currently disabled for multimodal models by
+        # default.
+        if use_long_context and not model_config.is_multimodal_model:
             is_gpu = device_config.device_type == "cuda"
             use_sliding_window = (model_config.get_sliding_window()
                                   is not None)
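For readers who want to see the change outside the diff context, below is a minimal, self-contained sketch of the two guards this commit introduces. The classes FakeEngineArgs and FakeModelConfig are hypothetical stand-ins, not Aphrodite's real EngineArgs or ModelConfig; only the two conditionals mirror the diff, everything else is illustrative.

import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class FakeModelConfig:
    # Stand-in for the real ModelConfig; only the flag the diff checks.
    is_multimodal_model: bool = True


@dataclass
class FakeEngineArgs:
    # Stand-in for the real EngineArgs; None means "not explicitly set".
    enable_prefix_caching: bool = True
    enable_chunked_prefill: Optional[bool] = None

    def apply_multimodal_overrides(self, model_config: FakeModelConfig,
                                   use_long_context: bool) -> None:
        # First guard from the diff: prefix caching is forced off for
        # multimodal models, with a warning if the user requested it.
        if model_config.is_multimodal_model:
            if self.enable_prefix_caching:
                logger.warning(
                    "--enable-prefix-caching is currently not "
                    "supported for multimodal models and has been disabled.")
                self.enable_prefix_caching = False

        # Second guard from the diff: the long-context default that turns
        # chunked prefill on no longer applies to multimodal models.
        if use_long_context and not model_config.is_multimodal_model:
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = True


args = FakeEngineArgs()
args.apply_multimodal_overrides(FakeModelConfig(), use_long_context=True)
print(args.enable_prefix_caching)   # False: disabled for the multimodal model
print(args.enable_chunked_prefill)  # None: not enabled by the default path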
