Skip to content

Commit

Permalink
[Platforms] Refactor openvino code
Browse files Browse the repository at this point in the history
  • Loading branch information
ji-huazhong committed Nov 22, 2024
1 parent 11fcf0e commit 865339e
Showing 1 changed file with 73 additions and 73 deletions.
146 changes: 73 additions & 73 deletions vllm/executor/openvino_executor.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typing import List, Set, Tuple

import openvino as ov
import openvino.properties.hint as hints
# import openvino as ov
# import openvino.properties.hint as hints
import torch

Check failure on line 5 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (F401)

vllm/executor/openvino_executor.py:5:8: F401 `torch` imported but unused

import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig
# from vllm.config import CacheConfig, ModelConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip,
from vllm.utils import (get_distributed_init_method, get_ip,
get_open_port, make_async)
from vllm.worker.worker_base import WorkerWrapperBase

Expand All @@ -30,10 +30,10 @@ def _init_executor(self) -> None:
current_platform.is_openvino_gpu(), \
"OpenVINO backend supports only CPU and GPU devices"

self.ov_core = ov.Core()
self.model_config = _verify_and_get_model_config(self.model_config)
self.cache_config = _verify_and_get_cache_config(
self.ov_core, self.cache_config)
# self.ov_core = ov.Core()
# self.model_config = _verify_and_get_model_config(self.model_config)
# self.cache_config = _verify_and_get_cache_config(
# self.ov_core, self.cache_config)

# Instantiate the worker and load the model to CPU.
self._init_worker()
Expand Down Expand Up @@ -132,68 +132,68 @@ async def check_health_async(self) -> None:
return


def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
if config.dtype != torch.float32:
logger.warning(
f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
)
config.dtype = torch.float32
if not config.enforce_eager:
logger.warning(
"CUDA graph is not supported on OpenVINO backend, fallback to the "
"eager mode.")
config.enforce_eager = True
return config


def _verify_and_get_cache_config(ov_core: ov.Core,
                                 config: CacheConfig) -> CacheConfig:
    """Normalize the KV-cache configuration for the OpenVINO backend.

    Selects the cache dtype (u8 / bf16 / f16) from the
    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var and the device's
    inference precision, forces the device-optimal block size (32 on
    CPU, 16 on GPU), and converts VLLM_OPENVINO_KVCACHE_SPACE (GB) into
    a byte count, defaulting to 4 GB on CPU when unset.

    Args:
        ov_core: OpenVINO Core used to query the device's
            inference-precision hint.
        config: Cache config mutated in place and returned.

    Raises:
        RuntimeError: if VLLM_OPENVINO_KVCACHE_SPACE is negative.
    """
    # Hoisted: the platform does not change between the three checks below.
    on_cpu = current_platform.is_openvino_cpu()

    # --- KV cache dtype ---
    if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
        if on_cpu:
            logger.info("KV cache type is overridden to u8 via "
                        "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
            config.cache_dtype = ov.Type.u8
        else:
            # u8 precision is a CPU-only knob; GPU always uses f16.
            # Fixed: the two literals previously concatenated to
            # "...PRECISION isignored..." (missing space).
            logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                        "ignored for GPU, f16 data type will be used.")
            config.cache_dtype = ov.Type.f16
    elif on_cpu and ov_core.get_property(
            envs.VLLM_OPENVINO_DEVICE,
            hints.inference_precision) == ov.Type.bf16:
        # Match the CPU's native bf16 inference precision.
        config.cache_dtype = ov.Type.bf16
    else:
        config.cache_dtype = ov.Type.f16

    # --- block size: force the device-optimal value ---
    if on_cpu:
        if config.block_size != 32:
            logger.info(
                f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}"  # noqa: G004, E501
            )
            config.block_size = 32
    elif config.block_size != 16:
        logger.info(
            f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}"  # noqa: G004, E501
        )
        config.block_size = 16

    # --- KV cache space (GB -> bytes) ---
    kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
    if kv_cache_space < 0:
        # Fixed message: zero is accepted (it triggers the CPU default
        # below), so the expectation is "non-negative", not "positive".
        raise RuntimeError(
            "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
            f" {kv_cache_space}, expect a non-negative integer value.")
    if kv_cache_space == 0 and on_cpu:
        config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
        logger.warning(
            "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
            "for OpenVINO backend is not set, using 4 by default.")
    else:
        config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore

    return config
# def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
# if config.dtype != torch.float32:
# logger.warning(
# f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
# )
# config.dtype = torch.float32
# if not config.enforce_eager:
# logger.warning(
# "CUDA graph is not supported on OpenVINO backend, fallback to the "

Check failure on line 143 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:143:81: E501 Line too long (81 > 80)
# "eager mode.")
# config.enforce_eager = True
# return config


# def _verify_and_get_cache_config(ov_core: ov.Core,
# config: CacheConfig) -> CacheConfig:
# if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
# if not current_platform.is_openvino_cpu():
# logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
# "ignored for GPU, f16 data type will be used.")
# config.cache_dtype = ov.Type.f16
# else:
# logger.info("KV cache type is overridden to u8 via "
# "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
# config.cache_dtype = ov.Type.u8
# else:
# if current_platform.is_openvino_cpu():
# ov_device = envs.VLLM_OPENVINO_DEVICE
# inference_precision = ov_core.get_property(
# ov_device, hints.inference_precision)
# if inference_precision == ov.Type.bf16:
# config.cache_dtype = ov.Type.bf16
# else:
# config.cache_dtype = ov.Type.f16
# else:
# config.cache_dtype = ov.Type.f16

# if current_platform.is_openvino_cpu():
# if config.block_size != 32:
# logger.info(
# f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501
# )
# config.block_size = 32
# else:
# if config.block_size != 16:
# logger.info(
# f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}" # noqa: G004, E501
# )
# config.block_size = 16

# kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
# if kv_cache_space >= 0:
# if kv_cache_space == 0 and current_platform.is_openvino_cpu():
# config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore

Check failure on line 188 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:188:81: E501 Line too long (81 > 80)
# logger.warning(
# "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
# "for OpenVINO backend is not set, using 4 by default.")
# else:
# config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore

Check failure on line 193 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:193:81: E501 Line too long (94 > 80)
# else:
# raise RuntimeError(
# "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
# f" {kv_cache_space}, expect a positive integer value.")

# return config

0 comments on commit 865339e

Please sign in to comment.