Skip to content

Commit

Permalink
[Platforms] Refactor openvino code
Browse files Browse the repository at this point in the history
  • Loading branch information
ji-huazhong committed Nov 22, 2024
1 parent 11fcf0e commit 865339e
Showing 1 changed file with 73 additions and 73 deletions.
146 changes: 73 additions & 73 deletions vllm/executor/openvino_executor.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typing import List, Set, Tuple

import openvino as ov
import openvino.properties.hint as hints
# import openvino as ov
# import openvino.properties.hint as hints
import torch

Check failure on line 5 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (F401)

vllm/executor/openvino_executor.py:5:8: F401 `torch` imported but unused

import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig
# from vllm.config import CacheConfig, ModelConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip,
from vllm.utils import (get_distributed_init_method, get_ip,
get_open_port, make_async)
from vllm.worker.worker_base import WorkerWrapperBase

Expand All @@ -30,10 +30,10 @@ def _init_executor(self) -> None:
current_platform.is_openvino_gpu(), \
"OpenVINO backend supports only CPU and GPU devices"

self.ov_core = ov.Core()
self.model_config = _verify_and_get_model_config(self.model_config)
self.cache_config = _verify_and_get_cache_config(
self.ov_core, self.cache_config)
# self.ov_core = ov.Core()
# self.model_config = _verify_and_get_model_config(self.model_config)
# self.cache_config = _verify_and_get_cache_config(
# self.ov_core, self.cache_config)

# Instantiate the worker and load the model to CPU.
self._init_worker()
Expand Down Expand Up @@ -132,68 +132,68 @@ async def check_health_async(self) -> None:
return


def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
if config.dtype != torch.float32:
logger.warning(
f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
)
config.dtype = torch.float32
if not config.enforce_eager:
logger.warning(
"CUDA graph is not supported on OpenVINO backend, fallback to the "
"eager mode.")
config.enforce_eager = True
return config


def _verify_and_get_cache_config(ov_core: ov.Core,
                                 config: CacheConfig) -> CacheConfig:
    """Normalize the KV-cache configuration for the OpenVINO backend.

    Selects the cache dtype (u8 / bf16 / f16) from the
    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var and the device's
    inference precision, forces the device-optimal block size (32 on
    CPU, 16 on GPU), and converts VLLM_OPENVINO_KVCACHE_SPACE (GB) into
    a byte count, defaulting to 4 GB on CPU when unset.

    Args:
        ov_core: OpenVINO Core used to query the device's
            inference-precision hint.
        config: Cache config mutated in place and returned.

    Raises:
        RuntimeError: if VLLM_OPENVINO_KVCACHE_SPACE is negative.
    """
    # Hoisted: the platform does not change between the three checks below.
    on_cpu = current_platform.is_openvino_cpu()

    # --- KV cache dtype ---
    if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
        if on_cpu:
            logger.info("KV cache type is overridden to u8 via "
                        "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
            config.cache_dtype = ov.Type.u8
        else:
            # u8 precision is a CPU-only knob; GPU always uses f16.
            # Fixed: the two literals previously concatenated to
            # "...PRECISION isignored..." (missing space).
            logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                        "ignored for GPU, f16 data type will be used.")
            config.cache_dtype = ov.Type.f16
    elif on_cpu and ov_core.get_property(
            envs.VLLM_OPENVINO_DEVICE,
            hints.inference_precision) == ov.Type.bf16:
        # Match the CPU's native bf16 inference precision.
        config.cache_dtype = ov.Type.bf16
    else:
        config.cache_dtype = ov.Type.f16

    # --- block size: force the device-optimal value ---
    if on_cpu:
        if config.block_size != 32:
            logger.info(
                f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}"  # noqa: G004, E501
            )
            config.block_size = 32
    elif config.block_size != 16:
        logger.info(
            f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}"  # noqa: G004, E501
        )
        config.block_size = 16

    # --- KV cache space (GB -> bytes) ---
    kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
    if kv_cache_space < 0:
        # Fixed message: zero is accepted (it triggers the CPU default
        # below), so the expectation is "non-negative", not "positive".
        raise RuntimeError(
            "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
            f" {kv_cache_space}, expect a non-negative integer value.")
    if kv_cache_space == 0 and on_cpu:
        config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
        logger.warning(
            "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
            "for OpenVINO backend is not set, using 4 by default.")
    else:
        config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore

    return config
# def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
# if config.dtype != torch.float32:
# logger.warning(
# f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
# )
# config.dtype = torch.float32
# if not config.enforce_eager:
# logger.warning(
# "CUDA graph is not supported on OpenVINO backend, fallback to the "

Check failure on line 143 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:143:81: E501 Line too long (81 > 80)
# "eager mode.")
# config.enforce_eager = True
# return config


# def _verify_and_get_cache_config(ov_core: ov.Core,
# config: CacheConfig) -> CacheConfig:
# if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
# if not current_platform.is_openvino_cpu():
# logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
# "ignored for GPU, f16 data type will be used.")
# config.cache_dtype = ov.Type.f16
# else:
# logger.info("KV cache type is overridden to u8 via "
# "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
# config.cache_dtype = ov.Type.u8
# else:
# if current_platform.is_openvino_cpu():
# ov_device = envs.VLLM_OPENVINO_DEVICE
# inference_precision = ov_core.get_property(
# ov_device, hints.inference_precision)
# if inference_precision == ov.Type.bf16:
# config.cache_dtype = ov.Type.bf16
# else:
# config.cache_dtype = ov.Type.f16
# else:
# config.cache_dtype = ov.Type.f16

# if current_platform.is_openvino_cpu():
# if config.block_size != 32:
# logger.info(
# f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501
# )
# config.block_size = 32
# else:
# if config.block_size != 16:
# logger.info(
# f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}" # noqa: G004, E501
# )
# config.block_size = 16

# kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
# if kv_cache_space >= 0:
# if kv_cache_space == 0 and current_platform.is_openvino_cpu():
# config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore

Check failure on line 188 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:188:81: E501 Line too long (81 > 80)
# logger.warning(
# "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
# "for OpenVINO backend is not set, using 4 by default.")
# else:
# config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore

Check failure on line 193 in vllm/executor/openvino_executor.py

View workflow job for this annotation

GitHub Actions / ruff (3.12)

Ruff (E501)

vllm/executor/openvino_executor.py:193:81: E501 Line too long (94 > 80)
# else:
# raise RuntimeError(
# "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
# f" {kv_cache_space}, expect a positive integer value.")

# return config

0 comments on commit 865339e

Please sign in to comment.