diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b563c96343f92..32520cafe5c01 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -106,17 +106,32 @@ steps: source_file_dependencies: - vllm/ commands: - - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s entrypoints/openai - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests +# Plugin test need a single pipeline, since it will install a new fake platform plugin. +- label: Generic Plugin Test + working_dir: "/vllm-workspace/tests" + fast_check: true + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models + commands: + # test model plugin first since it needs to install a new fake platform when platform plugin is installed. + - pip install -e ./plugins/vllm_add_dummy_model + - VLLM_PLUGINS='' pytest -v -s plugins/test_model_plugin_disabled.py + - pytest -v -s plugins/test_model_plugin.py + # test platform plugin second. + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins/test_platform_plugin.py + - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -333,8 +348,6 @@ steps: - vllm/ - tests/models commands: - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py @@ -469,6 +482,7 @@ steps: - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # distributed test need to be run on 2 gpus, move this test to plugins test once the plugin test runs on 2 gpus. - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md index 79aff757518f2..6a2b5258e07d1 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/source/design/plugin_system.md @@ -43,7 +43,7 @@ Every plugin has three parts: ## What Can Plugins Do? -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. +Currently, the primary use case for plugins is to register custom, out-of-the-tree models or platforms into vLLM. 
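As a rough sketch (modeled on the `vllm_add_dummy_platform` test plugin added in this change; the package and class names below are purely illustrative), an out-of-tree package wires a callable into the `vllm.general_plugins` entry-point group in its `setup.py`, and that callable performs the actual registration:

```python
# vllm_add_my_platform/__init__.py -- hypothetical out-of-tree plugin package.
# Its setup.py exposes this module's register() under the
# "vllm.general_plugins" entry-point group so vLLM can discover it.
from vllm import PlatformRegistry


def register():
    # Register the out-of-tree platform under a device name, pointing vLLM
    # at the Platform subclass by its fully qualified class name.
    PlatformRegistry.register_platform(
        "my_platform", "vllm_add_my_platform.my_platform.MyPlatform")
    # Make it the active platform for this process.
    PlatformRegistry.set_current_platform("my_platform")
```

vLLM invokes this entry point while loading general plugins at import time, so the platform is in place before any platform-dependent code runs.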
This is done by calling `ModelRegistry.register_model` or `PlatformRegistry.register_platform` to register the model or platform. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. ## Guidelines for Writing Plugins diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index 62e77a2f77597..2d157b4c13828 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,5 +1,4 @@ -from ..entrypoints.openai.test_oot_registration import ( - run_and_test_dummy_opt_api_server) +from ..plugins.test_model_plugin import run_and_test_dummy_opt_api_server def test_distributed_oot(dummy_opt_path: str): diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py deleted file mode 100644 index b25cb1d0e7222..0000000000000 --- a/tests/entrypoints/openai/test_oot_registration.py +++ /dev/null @@ -1,42 +0,0 @@ -from ...utils import VLLM_PATH, RemoteOpenAIServer - -chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" -assert chatml_jinja_path.exists() - - -def run_and_test_dummy_opt_api_server(model, tp=1): - # the model is registered through the plugin - server_args = [ - "--gpu-memory-utilization", - "0.10", - "--dtype", - "float32", - "--chat-template", - str(chatml_jinja_path), - "--load-format", - "dummy", - "-tp", - f"{tp}", - ] - with RemoteOpenAIServer(model, server_args) as server: - client = server.get_client() - completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Hello!" - }], - temperature=0, - ) - generated_text = completion.choices[0].message.content - assert generated_text is not None - # make sure only the first token is generated - rest = generated_text.replace("", "") - assert rest == "" - - -def test_oot_registration_for_api_server(dummy_opt_path: str): - run_and_test_dummy_opt_api_server(dummy_opt_path) diff --git a/tests/plugins/__init__.py b/tests/plugins/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_oot_registration.py b/tests/plugins/test_model_plugin.py similarity index 63% rename from tests/models/test_oot_registration.py rename to tests/plugins/test_model_plugin.py index 2c413a633896a..4676edc7db991 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/plugins/test_model_plugin.py @@ -1,19 +1,10 @@ import os -import pytest - from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset -from ..utils import fork_new_process_for_each_test - - -@fork_new_process_for_each_test -def test_plugin(dummy_opt_path): - os.environ["VLLM_PLUGINS"] = "" - with pytest.raises(Exception) as excinfo: - LLM(model=dummy_opt_path, load_format="dummy") - assert "are not supported for now" in str(excinfo.value) +from ..utils import (VLLM_PATH, RemoteOpenAIServer, + fork_new_process_for_each_test) @fork_new_process_for_each_test @@ -78,3 +69,45 @@ def test_oot_registration_multimodal(dummy_llava_path): # make sure only the first token is generated rest = generated_text.replace(first_token, "") assert rest == "" + + +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() + + +def run_and_test_dummy_opt_api_server(model, tp=1): + # the model is registered through the plugin + server_args = [ + 
"--gpu-memory-utilization", + "0.10", + "--dtype", + "float32", + "--chat-template", + str(chatml_jinja_path), + "--load-format", + "dummy", + "-tp", + f"{tp}", + ] + with RemoteOpenAIServer(model, server_args) as server: + client = server.get_client() + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Hello!" + }], + temperature=0, + ) + generated_text = completion.choices[0].message.content + assert generated_text is not None + # make sure only the first token is generated + rest = generated_text.replace("", "") + assert rest == "" + + +def test_oot_registration_for_api_server(dummy_opt_path: str): + run_and_test_dummy_opt_api_server(dummy_opt_path) diff --git a/tests/plugins/test_model_plugin_disabled.py b/tests/plugins/test_model_plugin_disabled.py new file mode 100644 index 0000000000000..57927a43527ee --- /dev/null +++ b/tests/plugins/test_model_plugin_disabled.py @@ -0,0 +1,12 @@ +import pytest + +from vllm import LLM + +# The test in this file should be run with env VLLM_PLUGINS='', for example: +# VLLM_PLUGINS='' pytest -v -s test_model_plugin_disabled.py + + +def test_plugin_disabled(dummy_opt_path): + with pytest.raises(Exception) as excinfo: + LLM(model=dummy_opt_path, load_format="dummy") + assert "are not supported for now" in str(excinfo.value) diff --git a/tests/plugins/test_platform_plugin.py b/tests/plugins/test_platform_plugin.py new file mode 100644 index 0000000000000..7519817dfa95e --- /dev/null +++ b/tests/plugins/test_platform_plugin.py @@ -0,0 +1,10 @@ +from vllm.platforms import PlatformRegistry, current_platform + + +def test_current_platform_register(): + # make sure the platform is registered + assert PlatformRegistry.current_platform == "my_platform" + # make sure the platform is loaded + assert current_platform.device_name == "dummy" + assert current_platform.is_async_output_supported(enforce_eager=True) \ + is False diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py new file mode 100644 index 0000000000000..0b73d173040ab --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup + +setup(name='vllm_add_dummy_platform', + version='0.1', + packages=['vllm_add_dummy_platform'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_platform:register"] + }) diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py new file mode 100644 index 0000000000000..8435c365446d7 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -0,0 +1,9 @@ +from vllm import PlatformRegistry + + +def register(): + # Register the dummy platform + PlatformRegistry.register_platform( + "my_platform", "vllm_add_dummy_platform.my_platform.DummyPlatform") + # Set the current platform to the dummy platform + PlatformRegistry.set_current_platform("my_platform") diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_attention.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_attention.py new file mode 100644 index 0000000000000..8c0df08fa29b8 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_attention.py @@ -0,0 +1,13 @@ +class DummyAttentionImpl: + + def forward(self): + pass + + +class 
DummyAttentionBackend: + + def __init__(self): + pass + + def get_impl_cls(self): + return DummyAttentionImpl diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_model_runner.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_model_runner.py new file mode 100644 index 0000000000000..1d9060b6b7e68 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_model_runner.py @@ -0,0 +1,7 @@ +from my_attention import DummyAttentionBackend + + +class DummyModelRunner: + + def __init__(self): + self.attn_backend = DummyAttentionBackend() diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_platform.py new file mode 100644 index 0000000000000..87a8969ff473c --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_platform.py @@ -0,0 +1,26 @@ +from typing import Optional + +from vllm.config import VllmConfig +from vllm.platforms import Platform, PlatformEnum + + +class DummyPlatform(Platform): + _enum = PlatformEnum.UNSPECIFIED + device_name = "dummy" + + def __init__(self): + super().__init__() + + @classmethod + def get_device_name(cls) -> str: + return "dummy" + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + parallel_config.worker_cls = \ + "vllm_add_dummy_platform.my_worker.DummyWorker" + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_worker.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_worker.py new file mode 100644 index 0000000000000..a144df2438b20 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/my_worker.py @@ -0,0 +1,14 @@ +from typing import List + +from my_model_runner import DummyModelRunner + + +class DummyCacheEngine: + pass + + +class DummyWorker: + + def __init__(self): + self.cache_engine = List[DummyCacheEngine] + self.model_runner = DummyModelRunner() diff --git a/vllm/__init__.py b/vllm/__init__.py index 45252b93e3d54..fa91a8947b95e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -12,16 +12,23 @@ EmbeddingRequestOutput, PoolingOutput, PoolingRequestOutput, RequestOutput, ScoringOutput, ScoringRequestOutput) +from vllm.platforms.registry import PlatformRegistry +from vllm.plugins import load_general_plugins from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from .version import __version__, __version_tuple__ +# Load general plugins first when the module is imported to make sure that all +# necessary global variables are set. Such as the `current_platform`. 
+load_general_plugins() + __all__ = [ "__version__", "__version_tuple__", "LLM", "ModelRegistry", + "PlatformRegistry", "PromptType", "TextPrompt", "TokensPrompt", diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 69b6d1e4648df..94efa83f03022 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -264,15 +264,6 @@ def unified_attention_fake( return torch.empty_like(query).contiguous() -direct_register_custom_op( - op_name="unified_attention", - op_func=unified_attention, - mutates_args=["kv_cache"], - fake_impl=unified_attention_fake, - dispatch_key=current_platform.dispatch_key, -) - - def unified_attention_with_output( query: torch.Tensor, key: torch.Tensor, @@ -308,10 +299,19 @@ def unified_attention_with_output_fake( return -direct_register_custom_op( - op_name="unified_attention_with_output", - op_func=unified_attention_with_output, - mutates_args=["kv_cache", "output"], - fake_impl=unified_attention_with_output_fake, - dispatch_key=current_platform.dispatch_key, -) +def register_custom_ops(): + """Register custom ops for attention.""" + direct_register_custom_op( + op_name="unified_attention", + op_func=unified_attention, + mutates_args=["kv_cache"], + fake_impl=unified_attention_fake, + dispatch_key=current_platform.dispatch_key, + ) + direct_register_custom_op( + op_name="unified_attention_with_output", + op_func=unified_attention_with_output, + mutates_args=["kv_cache", "output"], + fake_impl=unified_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, + ) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 419237c252ffd..7f00d15f6a4d8 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,123 +1,51 @@ +from typing import Any + from .interface import _Backend # noqa: F401 from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform - -current_platform: Platform - -# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because -# they only indicate the build configuration, not the runtime environment. -# For example, people can install a cuda build of pytorch but run on tpu. - -is_tpu = False -try: - # While it's technically possible to install libtpu on a non-TPU machine, - # this is a very uncommon scenario. Therefore, we assume that libtpu is - # installed if and only if the machine has TPUs. - import libtpu # noqa: F401 - is_tpu = True -except Exception: - pass - -is_cuda = False - -try: - import pynvml - pynvml.nvmlInit() - try: - if pynvml.nvmlDeviceGetCount() > 0: - is_cuda = True - finally: - pynvml.nvmlShutdown() -except Exception: - # CUDA is supported on Jetson, but NVML may not be. - import os - - def cuda_is_jetson() -> bool: - return os.path.isfile("/etc/nv_tegra_release") \ - or os.path.exists("/sys/class/tegra-firmware") - - if cuda_is_jetson(): - is_cuda = True - -is_rocm = False - -try: - import amdsmi - amdsmi.amdsmi_init() - try: - if len(amdsmi.amdsmi_get_processor_handles()) > 0: - is_rocm = True - finally: - amdsmi.amdsmi_shut_down() -except Exception: - pass - -is_hpu = False -try: - from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None -except Exception: - pass - -is_xpu = False - -try: - # installed IPEX if the machine has XPUs. 
- import intel_extension_for_pytorch # noqa: F401 - import oneccl_bindings_for_pytorch # noqa: F401 - import torch - if hasattr(torch, 'xpu') and torch.xpu.is_available(): - is_xpu = True -except Exception: - pass - -is_cpu = False -try: - from importlib.metadata import version - is_cpu = "cpu" in version("vllm") -except Exception: - pass - -is_neuron = False -try: - import transformers_neuronx # noqa: F401 - is_neuron = True -except ImportError: - pass - -is_openvino = False -try: - from importlib.metadata import version - is_openvino = "openvino" in version("vllm") -except Exception: - pass - -if is_tpu: - # people might install pytorch built with cuda but run on tpu - # so we need to check tpu first - from .tpu import TpuPlatform - current_platform = TpuPlatform() -elif is_cuda: - from .cuda import CudaPlatform - current_platform = CudaPlatform() -elif is_rocm: - from .rocm import RocmPlatform - current_platform = RocmPlatform() -elif is_hpu: - from .hpu import HpuPlatform - current_platform = HpuPlatform() -elif is_xpu: - from .xpu import XPUPlatform - current_platform = XPUPlatform() -elif is_cpu: - from .cpu import CpuPlatform - current_platform = CpuPlatform() -elif is_neuron: - from .neuron import NeuronPlatform - current_platform = NeuronPlatform() -elif is_openvino: - from .openvino import OpenVinoPlatform - current_platform = OpenVinoPlatform() -else: - current_platform = UnspecifiedPlatform() +from .registry import PlatformRegistry, detect_current_platform + +_current_platform: Platform = UnspecifiedPlatform() + + +def initialize_current_platform(): + """Initialize the current platform. This function is called when loading + the vllm plugin.""" + # Get the current platform from the registry first. If the current + # platform is not set, try to detect the current platform. + global _current_platform + if PlatformRegistry.current_platform is not None: + _current_platform = PlatformRegistry.get_current_platform_cls()() + else: + _current_platform = detect_current_platform() + + # Register custom ops for the current platform. + from vllm.attention.layer import register_custom_ops + register_custom_ops() + + +class CurrentPlatform(Platform): + """A wrapper that provides an interface to the current platform. + + `current_platform` is imported to many modules once vLLM is imported. + Updating `current_platform` value directly will not work in those modules. + So it needs the wrapper here to provide a dynamic platform loading + mechanism. + + This class can make sure that the `current_platform` is always up-to-date. + """ + + def __getattribute__(self, name: str) -> Any: + """If the attribute is not found, go pass to the current platform.""" + # Use __getattribute__ to here to get the attribute from the current + # platform. It doesn't work to use __getattr__ because it will be called + # only when the attribute is not found. Since CurrentPlatform inherits + # from Platform, __getattr__ will not be called. + global _current_platform + # Go pass to the current platform. + return _current_platform.__getattribute__(name) + + +# The global variable for other modules to use. 
+current_platform = CurrentPlatform() __all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] diff --git a/vllm/platforms/registry.py b/vllm/platforms/registry.py new file mode 100644 index 0000000000000..9e8a4f3da64ce --- /dev/null +++ b/vllm/platforms/registry.py @@ -0,0 +1,171 @@ +from dataclasses import dataclass, field +from typing import Callable, Dict, Optional + +from vllm import utils +from vllm.platforms import Platform + +from .interface import UnspecifiedPlatform + +# The list of supported in-tree platforms. Update this list when adding/removing +# platforms. +_VLLM_PLATFORMS = { + "cpu": "vllm.platforms.cpu.CpuPlatform", + "cuda": "vllm.platforms.cuda.CudaPlatform", + "hpu": "vllm.platforms.hpu.HpuPlatform", + "neuron": "vllm.platforms.neuron.NeuronPlatform", + "openvino": "vllm.platforms.openvino.OpenVinoPlatform", + "rocm": "vllm.platforms.rocm.RocmPlatform", + "tpu": "vllm.platforms.tpu.TpuPlatform", + "xpu": "vllm.platforms.xpu.XPUPlatform", +} + + +@dataclass +class _PlatformRegistry: + # The mapping from device name to platform class string. + platforms: Dict[str, str] = field(default_factory=dict) + # The current platform name. + current_platform: Optional[str] = None + + def _load_platform_cls(self, device_name: str) -> Callable: + """Load a platform object by device name.""" + if device_name not in self.platforms: + raise ValueError( + f"Platform {device_name} not registered. " + f"Available platforms: {list(self.platforms.keys())}") + platform_cls_str = self.platforms[device_name] + return utils.resolve_obj_by_qualname(platform_cls_str) + + def register_platform(self, device_name: str, platform: str): + """Register a platform by device name. This function is called by the + platform plugin.""" + if device_name in self.platforms: + raise ValueError(f"Platform {device_name} already registered.") + self.platforms[device_name] = platform + + def set_current_platform(self, device_name: str): + """Set the current platform by device name.""" + if device_name not in self.platforms: + raise ValueError( + f"Platform {device_name} not registered. " + f"Available platforms: {list(self.platforms.keys())}") + self.current_platform = device_name + + def get_current_platform_cls(self) -> Callable: + """Get the current platform object.""" + if self.current_platform is None: + raise ValueError("No current platform set.") + return self._load_platform_cls(self.current_platform) + + +PlatformRegistry = _PlatformRegistry({ + device_name: platform + for device_name, platform in _VLLM_PLATFORMS.items() +}) + + +def detect_current_platform() -> Platform: + """Detect the current platform by checking the installed packages.""" + CurrentPlatform: Optional[type[Platform]] = None + # NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because + # they only indicate the build configuration, not the runtime environment. + # For example, people can install a cuda build of pytorch but run on tpu. + + # Load TPU Platform + try: + # While it's technically possible to install libtpu on a non-TPU + # machine, this is a very uncommon scenario. Therefore, we assume that + # libtpu is installed if and only if the machine has TPUs. 
+ import libtpu # noqa: F401 + + from .tpu import TpuPlatform as CurrentPlatform + except Exception: + pass + + # Load CUDA Platform + if not CurrentPlatform: + try: + import pynvml + pynvml.nvmlInit() + try: + if pynvml.nvmlDeviceGetCount() > 0: + from .cuda import CudaPlatform as CurrentPlatform + finally: + pynvml.nvmlShutdown() + except Exception: + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): + from .cuda import CudaPlatform as CurrentPlatform + + # Load ROCm Platform + if not CurrentPlatform: + try: + import amdsmi + amdsmi.amdsmi_init() + try: + if len(amdsmi.amdsmi_get_processor_handles()) > 0: + from .rocm import RocmPlatform as CurrentPlatform + finally: + amdsmi.amdsmi_shut_down() + except Exception: + pass + + # Load HPU Platform + if not CurrentPlatform: + try: + from importlib import util + assert util.find_spec('habana_frameworks') is not None + from .hpu import HpuPlatform as CurrentPlatform + except Exception: + pass + + # Load XPU Platform + if not CurrentPlatform: + try: + # installed IPEX if the machine has XPUs. + import intel_extension_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch # noqa: F401 + import torch + if hasattr(torch, 'xpu') and torch.xpu.is_available(): + from .xpu import XPUPlatform as CurrentPlatform + except Exception: + pass + + # Load CPU Platform + if not CurrentPlatform: + try: + from importlib.metadata import version + assert "cpu" in version("vllm") + from .cpu import CpuPlatform as CurrentPlatform + except Exception: + pass + + # Load Neuron Platform + if not CurrentPlatform: + try: + import transformers_neuronx # noqa: F401 + + from .neuron import NeuronPlatform as CurrentPlatform + except ImportError: + pass + + # Load OpenVINO Platform + if not CurrentPlatform: + try: + from importlib.metadata import version + assert "openvino" in version("vllm") + from .openvino import OpenVinoPlatform as CurrentPlatform + except Exception: + pass + + if CurrentPlatform: + PlatformRegistry.set_current_platform(CurrentPlatform.device_name) + return CurrentPlatform() + + return UnspecifiedPlatform() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 17f604ea0e202..0336b69a78376 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,7 +4,7 @@ import torch import vllm.envs as envs -from vllm.platforms import current_platform +from vllm.platforms import current_platform, initialize_current_platform logger = logging.getLogger(__name__) @@ -17,6 +17,44 @@ def load_general_plugins(): processes. They should be designed in a way that they can be loaded multiple times without causing issues. 
""" + global plugins_loaded + if not plugins_loaded: + import sys + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + allowed_plugins = envs.VLLM_PLUGINS + + discovered_plugins = entry_points(group='vllm.general_plugins') + if len(discovered_plugins) == 0: + logger.debug("No plugins found.") + else: + logger.info("Available plugins:") + for plugin in discovered_plugins: + logger.info("name=%s, value=%s, group=%s", plugin.name, + plugin.value, plugin.group) + if allowed_plugins is None: + logger.info("all available plugins will be loaded.") + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + else: + logger.info("plugins to load: %s", allowed_plugins) + for plugin in discovered_plugins: + if allowed_plugins is None or plugin.name in allowed_plugins: + try: + func = plugin.load() + func() + logger.info("plugin %s loaded.", plugin.name) + except Exception: + logger.exception("Failed to load plugin %s", + plugin.name) + # initialize current platform should be called after all plugins are + # loaded. + initialize_current_platform() + + plugins_loaded = True # all processes created by vllm will load plugins, # and here we can inject some common environment variables @@ -42,38 +80,3 @@ def load_general_plugins(): # requires enabling lazy collectives # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - - global plugins_loaded - if plugins_loaded: - return - plugins_loaded = True - import sys - if sys.version_info < (3, 10): - from importlib_metadata import entry_points - else: - from importlib.metadata import entry_points - - allowed_plugins = envs.VLLM_PLUGINS - - discovered_plugins = entry_points(group='vllm.general_plugins') - if len(discovered_plugins) == 0: - logger.debug("No plugins found.") - return - logger.info("Available plugins:") - for plugin in discovered_plugins: - logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, - plugin.group) - if allowed_plugins is None: - logger.info("all available plugins will be loaded.") - logger.info("set environment variable VLLM_PLUGINS to control" - " which plugins to load.") - else: - logger.info("plugins to load: %s", allowed_plugins) - for plugin in discovered_plugins: - if allowed_plugins is None or plugin.name in allowed_plugins: - try: - func = plugin.load() - func() - logger.info("plugin %s loaded.", plugin.name) - except Exception: - logger.exception("Failed to load plugin %s", plugin.name)