From 5b15bde5399cbcb1052bfb49584f81ed300cd4ac Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 21 Jun 2024 12:44:29 -0400 Subject: [PATCH 001/392] [Doc] Documentation on supported hardware for quantization methods (#5745) --- docs/source/index.rst | 1 + docs/source/quantization/fp8.rst | 4 ++- .../quantization/supported_hardware.rst | 30 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 docs/source/quantization/supported_hardware.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 8795a865c3db..05133eb6d867 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -100,6 +100,7 @@ Documentation :maxdepth: 1 :caption: Quantization + quantization/supported_hardware quantization/auto_awq quantization/fp8 quantization/fp8_e5m2_kvcache diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index 312a564595cc..09f3136644c3 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -3,7 +3,9 @@ FP8 ================== -vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are supported. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst new file mode 100644 index 000000000000..df445e00a395 --- /dev/null +++ b/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,30 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅" indicates that the quantization method is supported on the specified hardware. +- "❌" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
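A quick way to confirm support on a given machine is to load a model with the quantization method set explicitly; vLLM should then fail at engine construction, rather than at generation time, if the method is not available on the detected hardware. A minimal sketch (the model name is only a placeholder):

.. code-block:: python

    from vllm import LLM

    # Request FP8 weight/activation quantization explicitly; see the FP8 page
    # and the table above for which GPUs can run it.
    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", quantization="fp8")
    outputs = llm.generate("Hello, my name is")
    print(outputs[0].outputs[0].text)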
+ +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file From f1e72cc19a21928400b63743d5fe164ec8ed30e8 Mon Sep 17 00:00:00 2001 From: zhyncs Date: Sat, 22 Jun 2024 03:15:48 +0800 Subject: [PATCH 002/392] [BugFix] exclude version 1.15.0 for modelscope (#5668) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5b3e682a8016..d031d98c5b7e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -172,7 +172,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image From 7187507301aa8361407e04be42d0d50680891493 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 21 Jun 2024 14:04:26 -0700 Subject: [PATCH 003/392] [ci][test] fix ca test in main (#5746) --- .buildkite/test-pipeline.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c337a81d4a0d..0b87e6280f0b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,6 +197,9 @@ steps: gpu: a100 num_gpus: 4 commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py From f5dda63eb5fcb5624b93fa5f09da01d5372bbce4 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Fri, 21 Jun 2024 15:42:46 -0700 Subject: [PATCH 004/392] [LoRA] Add support for pinning lora adapters in the LRU cache (#5603) --- tests/lora/test_lora_manager.py | 64 +++++++++++++++++++++++ vllm/engine/llm_engine.py | 3 ++ vllm/executor/cpu_executor.py | 3 ++ vllm/executor/distributed_gpu_executor.py | 7 +++ vllm/executor/executor_base.py | 4 ++ vllm/executor/gpu_executor.py | 4 ++ vllm/executor/neuron_executor.py | 3 ++ vllm/lora/models.py | 26 +++++++++ vllm/lora/worker_manager.py | 3 ++ vllm/utils.py | 43 +++++++++++++-- vllm/worker/model_runner.py | 5 ++ vllm/worker/worker.py | 3 ++ vllm/worker/worker_base.py | 8 +++ 13 files changed, 171 insertions(+), 5 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 51a56b121ae2..2133bce14957 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -209,6 +209,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.activate_lora(3) assert manager.lora_index_to_id[0] == 2 assert manager.lora_index_to_id[1] == 3 + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_lora(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_lora(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_lora(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_lora(3) + assert manager.pin_lora(1) + with pytest.raises(RuntimeError): + assert manager.pin_lora(2) + assert 
manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_lora(2) + + assert manager.deactivate_lora(3) + assert manager.pin_lora(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_lora(3) + with pytest.raises(ValueError): + assert manager.pin_lora(3) def test_lru_lora_model_manager(dist_init, dummy_model): @@ -288,6 +316,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == set() assert all(x is None for x in manager.lora_index_to_id) + # pinning + assert manager.add_lora(model_lora3) + assert manager.activate_lora(3) + assert manager.add_lora(model_lora4) + assert manager.activate_lora(4) + assert set(manager.list_loras()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_lora(1) + assert manager.pin_lora(3) + # Remove manually + assert manager.remove_lora(3) + assert not manager.remove_lora(3) + + assert set(manager.list_loras()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_lora(model_lora1) + assert manager.pin_lora(1) + assert manager.add_lora(model_lora2) + assert manager.activate_lora(2) + + assert set(manager.list_loras()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_lora() + assert set(manager.list_loras()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_lora() + + assert set(manager.list_loras()) == {1} + def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 75d417f525e3..f7eae257fdd1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1009,6 +1009,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_executor.list_loras() + def pin_lora(self, lora_id: int) -> bool: + return self.model_executor.pin_lora(lora_id) + def check_health(self) -> None: self.model_executor.check_health() diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index a2212459f034..6137cecd881d 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -84,6 +84,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index f7c608af1ad3..235b5bc47021 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -100,6 +100,13 @@ def remove_lora(self, lora_id: int) -> bool: lora_id=lora_id, ) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
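+        # Pinning is broadcast to every worker so that each worker's own
+        # LoRA cache keeps this adapter resident.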
+ return self._run_workers( + "pin_lora", + lora_id=lora_id, + ) + def list_loras(self) -> Set[int]: return self._run_workers("list_loras") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 4d01939c2e38..7c2520b5a64f 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -86,6 +86,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError # type: ignore + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 3ad201f4757e..0a654200ed79 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -99,6 +99,10 @@ def remove_lora(self, lora_id: int) -> bool: assert lora_id > 0, "lora_id must be greater than 0." return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index e7f0e887921b..c5e2fb0f6773 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -65,6 +65,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.driver_worker.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.driver_worker.list_loras() diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3e82856866d8..afb9ba455067 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -525,6 +525,12 @@ def remove_lora(self, lora_id: int) -> bool: self.long_lora_context.offsets_by_lora_id.pop(lora_id, None) return bool(self._registered_loras.pop(lora_id, None)) + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + raise NotImplementedError( + "Pinning is not supported in LoRAModelManager." + "Use LRUCacheLoRAModelManager for pinning") # type: ignore + # TODO see if this can be vectorized def _set_lora_mapping(self, mapping: LoRAMapping) -> None: (base_indices, sampler_indices, sampler_indices_padded, @@ -777,6 +783,26 @@ def remove_oldest_lora(self) -> bool: return True return False + def pin_lora(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + self._pin_lora_in_cpu_cache(lora_id) + self._pin_lora_in_gpu_cache(lora_id) + return True + + def _pin_lora_in_cpu_cache(self, lora_id: int): + try: + self._registered_loras.pin(lora_id) + except ValueError as err: + raise ValueError("Pinning failed. 
" + f"LoRA {lora_id} is not registered.") from err + + def _pin_lora_in_gpu_cache(self, lora_id: int): + if lora_id not in self._active_loras: + # move lora to gpu if not already active + self.activate_lora(lora_id) + + self._active_loras.pin(lora_id) + def create_lora_manager( model: nn.Module, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 498b2b9ddb18..ca4903c23bca 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -221,6 +221,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self._lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self._lora_manager.pin_lora(lora_id) + def remove_all_loras(self): self._lora_manager.remove_all_loras() diff --git a/vllm/utils.py b/vllm/utils.py index 27a7b1042d88..ce5c377eff2d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -15,7 +15,7 @@ from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, - Hashable, List, Optional, OrderedDict, Tuple, TypeVar, + Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union) import numpy as np @@ -44,6 +44,13 @@ T = TypeVar("T") +class _Sentinel: + ... + + +ALL_PINNED_SENTINEL = _Sentinel() + + class Device(enum.Enum): GPU = enum.auto() CPU = enum.auto() @@ -67,6 +74,7 @@ class LRUCache(Generic[T]): def __init__(self, capacity: int): self.cache: OrderedDict[Hashable, T] = OrderedDict() + self.pinned_items: Set[Hashable] = set() self.capacity = capacity def __contains__(self, key: Hashable) -> bool: @@ -102,14 +110,36 @@ def put(self, key: Hashable, value: T) -> None: self.cache.move_to_end(key) self._remove_old_if_needed() + def pin(self, key: Hashable) -> None: + """ + Pins a key in the cache preventing it from being + evicted in the LRU order. 
+ """ + if key not in self.cache: + raise ValueError(f"Cannot pin key: {key} not in cache.") + self.pinned_items.add(key) + + def _unpin(self, key: Hashable) -> None: + self.pinned_items.remove(key) + def _on_remove(self, key: Hashable, value: Optional[T]): pass - def remove_oldest(self): + def remove_oldest(self, remove_pinned=False): if not self.cache: return - key, value = self.cache.popitem(last=False) - self._on_remove(key, value) + + if not remove_pinned: + # pop the oldest item in the cache that is not pinned + lru_key = next( + (key for key in self.cache if key not in self.pinned_items), + ALL_PINNED_SENTINEL) + if lru_key is ALL_PINNED_SENTINEL: + raise RuntimeError("All items are pinned, " + "cannot remove oldest from the cache.") + else: + lru_key = next(iter(self.cache)) + self.pop(lru_key) def _remove_old_if_needed(self) -> None: while len(self.cache) > self.capacity: @@ -120,13 +150,16 @@ def pop(self, default_value: Optional[T] = None) -> Optional[T]: run_on_remove = key in self.cache value: Optional[T] = self.cache.pop(key, default_value) + # remove from pinned items + if key in self.pinned_items: + self._unpin(key) if run_on_remove: self._on_remove(key, value) return value def clear(self): while len(self.cache) > 0: - self.remove_oldest() + self.remove_oldest(remove_pinned=True) self.cache.clear() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e24835a1ea7f..a321eafce1a2 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -878,6 +878,11 @@ def remove_lora(self, lora_id: int) -> bool: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_lora(lora_id) + def list_loras(self) -> Set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e334ffbb755b..c60764ef1bed 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -333,6 +333,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_runner.remove_lora(lora_id) + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + def list_loras(self) -> Set[int]: return self.model_runner.list_loras() diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 3d52fd71ec4b..dc09718de4a3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -70,6 +70,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError + @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError @@ -86,6 +90,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise ValueError(f"{type(self)} does not support LoRA") + def pin_lora(self, lora_id: int) -> bool: + return ValueError( + f"{type(self)} does not support LoRA") # type: ignore + def list_loras(self) -> Set[int]: raise ValueError(f"{type(self)} does not support LoRA") From cf90ae01237018f70573f69c599d26648ff7740b Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 22 Jun 2024 08:09:34 +0800 Subject: [PATCH 005/392] [CI][Hardware][Intel GPU] add Intel GPU(XPU) ci pipeline (#5616) --- .buildkite/test-template-aws.j2 | 10 
++++++++-- README.md | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index fb34b787e0cb..1a7fb44c2ecc 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -42,12 +42,18 @@ steps: command: bash .buildkite/run-neuron-test.sh soft_fail: false - - label: "Intel Test" + - label: "Intel CPU Test" depends_on: ~ agents: - queue: intel + queue: intel-cpu command: bash .buildkite/run-cpu-test.sh + - label: "Intel GPU Test" + depends_on: ~ + agents: + queue: intel-gpu + command: bash .buildkite/run-xpu-test.sh + {% for step in steps %} {% if step.gpu == "a100" %} - label: "{{ step.label }}" diff --git a/README.md b/README.md index c24768bf7817..3e0da945d9be 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs +- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support From 9c62db07ed8ee28d9f1a0e6ac215446d49532008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Sat, 22 Jun 2024 10:07:08 +0800 Subject: [PATCH 006/392] [Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs (#5710) Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d22ea6b79de0..b6ea6ab39664 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -28,6 +28,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput +from vllm.utils import print_warning_once class QWenMLP(nn.Module): @@ -288,6 +289,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip loading visual weights to support Qwen-VL models + # in cases with text-only inputs + # TODO: add support for Qwen-VL + if (name not in params_dict + and name.startswith("transformer.visual.")): + print_warning_once( + "Only text inputs are allowed. 
Images won't be handled " + "until Qwen-VL models are fully supported.") + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From ff9ddbceee63efba6ba1f8d4dc66a92f1191da04 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Fri, 21 Jun 2024 20:33:12 -0700 Subject: [PATCH 007/392] [Misc] Remove #4789 workaround left in vllm/entrypoints/openai/run_batch.py (#5756) --- vllm/entrypoints/openai/run_batch.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 488ac89710b6..dac6c2b4cd48 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,5 +1,4 @@ import asyncio -import sys from io import StringIO from typing import Awaitable, List @@ -137,9 +136,6 @@ async def main(args): output_buffer.seek(0) await write_file(args.output_file, output_buffer.read().strip()) - # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789 - sys.exit(0) - if __name__ == "__main__": args = parse_args() From 0cbc1d2b4ff9e3afa32ffd2d5d308c136c2d15e3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Jun 2024 22:25:14 -0700 Subject: [PATCH 008/392] [Bugfix] Fix pin_lora error in TPU executor (#5760) --- vllm/executor/tpu_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 7061ad85f88c..5ed00e137410 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -82,6 +82,9 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for TPU backend.") + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for TPU backend.") + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for TPU backend.") From 8c00f9c15d13aed34b129b31c32a227be230e218 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Jun 2024 23:09:40 -0700 Subject: [PATCH 009/392] [Docs][TPU] Add installation tip for TPU (#5761) --- .../getting_started/tpu-installation.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 3627600e1f23..e96aabbb6327 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -73,3 +73,21 @@ Next, build vLLM from source. This will only take a few seconds: .. code-block:: console $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +.. tip:: + + If you encounter the following error: + + .. code-block:: console + + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + + + You can install OpenBLAS with the following command: + + .. 
code-block:: console + + $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + From 832ea88fcb4819037b685fb47b3a0de37f2804d3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 22 Jun 2024 10:00:43 -0700 Subject: [PATCH 010/392] [core][distributed] improve shared memory broadcast (#5754) --- .../device_communicators/shm_broadcast.py | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 119befcf6405..c44bd2f11ee8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -48,6 +48,26 @@ def __init__(self, | written_flag | reader0_flag | reader1_flag | ... | readerN_flag | +--------------+--------------+--------------+-----+--------------+ + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + During creation, `name` is None and the buffer is created. We can pass the created object to other processes by pickling it. 
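    Only the buffer's name and sizes are pickled, not the data it holds.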
The other processes will get the name of the shared memory and open it, so that they can access the @@ -81,10 +101,6 @@ def __init__(self, lambda *args, **kwargs: None): self.shared_memory = shared_memory.SharedMemory(name=name) assert self.shared_memory.size == self.total_bytes_of_buffer - with memoryview(self.shared_memory.buf[self.metadata_offset:] - ) as metadata_buffer: - tensor = torch.frombuffer(metadata_buffer, dtype=torch.uint8) - assert torch.all(tensor == 0) def __reduce__(self): return ( @@ -163,11 +179,15 @@ def acquire_write(self): yield buf # caller has written to the buffer - # mark the block as written - metadata_buffer[0] = 1 + # NOTE: order is important here + # first set the read flags to 0 + # then set the written flag to 1 + # otherwise, the readers may think they already read the block for i in range(1, self.buffer.n_reader + 1): # set read flag to 0, meaning it is not read yet metadata_buffer[i] = 0 + # mark the block as written + metadata_buffer[0] = 1 break @contextmanager @@ -247,13 +267,15 @@ def create_from_process_group(pg: ProcessGroup, buffer: ShmRingBuffer if group_rank == writer_rank: buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) - dist.broadcast_object_list([buffer], src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list([buffer], + src=global_ranks[writer_rank], + group=pg) return ShmRingBufferIO(buffer, -1) else: recv = [None] - dist.broadcast_object_list(recv, src=global_ranks[writer_rank]) - dist.barrier(pg) + dist.broadcast_object_list(recv, + src=global_ranks[writer_rank], + group=pg) buffer = recv[0] # type: ignore rest_ranks = [r for r in ranks_inside_group if r != writer_rank] return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) From 6c916ac8a80d1b2f4e0d0113a67767dc254a3598 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 24 Jun 2024 02:37:11 +0530 Subject: [PATCH 011/392] [BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744) Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/cutlass_w8a8/common.hpp | 8 +++ .../cutlass_w8a8/scaled_mm_c2x.cu | 52 ++++++++++++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 23d0587bbdc5..bf04bb400790 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -17,3 +17,11 @@ inline uint32_t next_pow_2(uint32_t const num) { return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 740b9fb64a75..38a20a1727d1 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -250,12 +250,39 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(status); } +template +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. 
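+  // The opt-in limit is queried once (for device 0) and cached in a
+  // function-local static.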
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + template typename Epilogue> struct sm80_config_default { // This config is used in 2 cases, // - M in (128, inf) // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -271,6 +298,7 @@ struct sm80_config_M64 { // This config is used in 2 cases, // - M in (32, 64] // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -284,6 +312,7 @@ template typename Epilogue> struct sm80_config_M32 { // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; @@ -297,6 +326,7 @@ template typename Epilogue> struct sm80_config_M16 { // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes static_assert(std::is_same()); using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; @@ -331,35 +361,45 @@ void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, using Cutlass2xGemmM16 = typename sm80_config_M16::Cutlass2xGemm; + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. 
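+  // (Per the config comments above, this choice needs 61440 bytes of
+  // shared memory.)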
+ using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + uint32_t const m = a.size(0); uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 32) { // M in (16, 32] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 64) { // M in (32, 64] - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else if (mp2 <= 128) { // M in (64, 128] uint32_t const n = out.size(1); bool const small_n = n < 8192; if (small_n) { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } else { - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } else { // M in (128, inf) - return cutlass_gemm_caller( + return fallback_cutlass_gemm_caller( out, a, b, std::forward(args)...); } } From 5d4d90536fa24c032bb91ae629b7b4958e045b03 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:42:28 -0400 Subject: [PATCH 012/392] [Distributed] Add send and recv helpers (#5719) --- tests/distributed/test_comm_ops.py | 78 +++++++- tests/distributed/test_custom_all_reduce.py | 5 +- tests/distributed/test_pynccl.py | 16 +- tests/utils.py | 2 +- .../device_communicators/pynccl.py | 14 +- vllm/distributed/parallel_state.py | 187 ++++++++++++++++++ 6 files changed, 278 insertions(+), 24 deletions(-) diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 53654dc40d10..bf0f31df02fa 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -8,12 +8,11 @@ import ray import torch -from vllm.distributed import (broadcast_tensor_dict, +from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from ..utils import (init_test_distributed_environment, - multi_process_tensor_parallel) +from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) @@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, assert torch.allclose(recv_dict["f"], test_dict["f"]) +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + test_dict = { + # device tensor + "a": torch.arange(8, dtype=torch.float32, device="cuda"), + # CPU tensor + "b": torch.arange(16, dtype=torch.int8, device="cpu"), + "c": "test", + "d": [1, 2, 3], + "e": { + "a": 1, + "b": 2 + }, + # empty tensor + "f": torch.tensor([], dtype=torch.float32, device="cuda"), + } + + if not get_pp_group().is_first_rank: + recv_dict = get_pp_group().recv_tensor_dict() + + if not get_pp_group().is_last_rank: + get_pp_group().send_tensor_dict(test_dict) + + if not get_pp_group().is_first_rank: + assert len(recv_dict) == len(test_dict) + assert torch.allclose(recv_dict["a"], test_dict["a"]) + assert torch.allclose(recv_dict["b"], test_dict["b"]) + assert recv_dict["c"] == test_dict["c"] + assert recv_dict["d"] == 
test_dict["d"] + assert recv_dict["e"] == test_dict["e"] + assert torch.allclose(recv_dict["f"], test_dict["f"]) + + +@ray.remote(num_gpus=1, max_calls=1) +def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, + distributed_init_port: str): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + size = 64 + test_tensor = torch.arange(64, dtype=torch.float32, device="cuda") + + if not get_pp_group().is_first_rank: + recv_tensor = get_pp_group().recv(size, dtype=torch.float32) + + if not get_pp_group().is_last_rank: + get_pp_group().send(test_tensor) + + if not get_pp_group().is_first_rank: + assert torch.allclose(test_tensor, recv_tensor) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize("tp_size", [2]) @@ -113,4 +174,13 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_tensor_parallel(tp_size, 1, test_target) + multi_process_parallel(tp_size, 1, test_target) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("pp_size", [2]) +@pytest.mark.parametrize( + "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) +def test_multi_process_pipeline_parallel(pp_size, test_target): + multi_process_parallel(1, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 9a39160b8a46..3c281a45fcaf 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -12,8 +12,7 @@ get_tp_group, graph_capture) from ..utils import (ensure_model_parallel_initialized, - init_test_distributed_environment, - multi_process_tensor_parallel) + init_test_distributed_environment, multi_process_parallel) random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] @@ -113,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 964dbc5423e7..e0e424439e3a 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -168,9 +168,13 @@ def send_recv_worker_fn(): dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): if pynccl_comm.rank == 0: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() assert result == 1 @@ -203,9 +207,13 @@ def multiple_send_recv_worker_fn(): device=device) with pynccl_comm.change_state(enable=True): if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor) + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % + pynccl_comm.world_size) else: - pynccl_comm.recv(tensor) + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank 
- 1) % + pynccl_comm.world_size) result = tensor.mean().cpu().item() if torch.distributed.get_rank() in [0, 2]: assert result == 1 diff --git a/tests/utils.py b/tests/utils.py index bc30515c8310..174efca4af53 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -129,7 +129,7 @@ def init_test_distributed_environment( ensure_model_parallel_initialized(tp_size, pp_size) -def multi_process_tensor_parallel( +def multi_process_parallel( tp_size: int, pp_size: int, test_target, diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 83eec264b6f8..731956654567 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -121,10 +121,7 @@ def all_reduce(self, ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) - def send(self, - tensor: torch.Tensor, - dst: Optional[int] = None, - stream=None): + def send(self, tensor: torch.Tensor, dst: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -132,16 +129,11 @@ def send(self, f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if dst is None: - dst = (self.rank + 1) % self.world_size self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) - def recv(self, - tensor: torch.Tensor, - src: Optional[int] = None, - stream=None): + def recv(self, tensor: torch.Tensor, src: int, stream=None): if self.disabled: return assert tensor.device == self.device, ( @@ -149,8 +141,6 @@ def recv(self, f"but the input tensor is on {tensor.device}") if stream is None: stream = self.stream - if src is None: - src = (self.rank - 1) % self.world_size self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5188fadbb92a..5f1decb376af 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -20,6 +20,7 @@ steps. """ import contextlib +import pickle from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass @@ -28,6 +29,7 @@ from unittest.mock import patch import torch +import torch.distributed from torch.distributed import Backend, ProcessGroup import vllm.envs as envs @@ -180,6 +182,16 @@ def last_rank(self): """Return the global rank of the last process in the group""" return self.ranks[-1] + @property + def is_first_rank(self): + """Return whether the caller is the first process in the group""" + return self.rank == self.first_rank + + @property + def is_last_rank(self): + """Return whether the caller is the last process in the group""" + return self.rank == self.last_rank + @property def next_rank(self): """Return the global rank of the process that follows the caller""" @@ -374,6 +386,70 @@ def broadcast_object_list(self, group=self.device_group) return obj_list + def send_object(self, obj: Any, dst: int) -> None: + """Send the input object list to the destination rank.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + assert dst != self.rank, ( + "Invalid destination rank. 
Destination rank is the same " + "as the current rank.") + + # Serialize object to tensor and get the size as well + object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) + + size_tensor = torch.tensor([object_tensor.numel()], + dtype=torch.long, + device="cpu") + + # Send object size + + torch.distributed.send(size_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + # Send object + torch.distributed.send(object_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + return None + + def recv_object(self, src: int) -> Any: + """Receive the input object list from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + + assert src < self.world_size, f"Invalid src rank ({src})" + + assert src != self.rank, ( + "Invalid source rank. Source rank is the same as the current rank." + ) + + size_tensor = torch.empty(1, dtype=torch.long, device="cpu") + + # Receive object size + rank_size = torch.distributed.recv(size_tensor, + src=src, + group=self.cpu_group) + + # Tensor to receive serialized objects into. + object_tensor = torch.empty( # type: ignore[call-overload] + size_tensor.item(), # type: ignore[arg-type] + dtype=torch.uint8, + device="cpu") + + rank_object = torch.distributed.recv(object_tensor, + src=src, + group=self.cpu_group) + + assert rank_object == rank_size, ( + "Received object sender rank does not match the size sender rank.") + + obj = pickle.loads(object_tensor.numpy().tobytes()) + + return obj + def broadcast_tensor_dict( self, tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, @@ -459,6 +535,88 @@ def broadcast_tensor_dict( async_handle.wait() return tensor_dict + def send_tensor_dict( + self, + tensor_dict: Dict[Any, Union[torch.Tensor, Any]], + dst: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Send the input tensor dictionary. + NOTE: `dst` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + + if dst is None: + dst = self.next_rank + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), f"Expecting a dictionary, got {type(tensor_dict)}" + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `send_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.send_object(metadata_list, dst=dst) + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip sending empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.send(tensor, dst=dst, group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.send(tensor, dst=dst, group=group) + return None + + def recv_tensor_dict( + self, + src: Optional[int] = None + ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]: + """Recv the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. 
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return None + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = self.prev_rank + assert src < self.world_size, f"Invalid src rank ({src})" + + recv_metadata_list = self.recv_object(src=src) + tensor_dict = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv(tensor, + src=src, + group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, src=src, group=group) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + return tensor_dict + def barrier(self): """Barrier synchronization among the group. NOTE: don't use `device_group` here! `barrier` in NCCL is @@ -468,6 +626,35 @@ def barrier(self): """ torch.distributed.barrier(group=self.cpu_group) + def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: + dst = self.next_rank + + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.send(tensor, dst) + else: + torch.distributed.send(tensor, self.ranks[dst], self.device_group) + + def recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + if src is None: + src = self.prev_rank + + tensor = torch.empty(size, dtype=dtype, device=self.device) + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.recv(tensor, src) + else: + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + return tensor + def destroy(self): if self.device_group is not None: torch.distributed.destroy_process_group(self.device_group) From edd5fe5fa29b8f9cc5fa37a30cc7211e0ff37067 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 24 Jun 2024 12:11:53 +0800 Subject: [PATCH 013/392] [Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772) --- requirements-cpu.txt | 1 + requirements-cuda.txt | 2 + requirements-test.txt | 1 - tests/models/test_phi3v.py | 4 ++ vllm/model_executor/models/phi3v.py | 69 +++++++++++++++++++++++++++-- 5 files changed, 72 insertions(+), 5 deletions(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 8b7d86e68621..21acee91d7b5 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -3,4 +3,5 @@ # Dependencies for x86_64 CPUs torch == 2.3.1+cpu +torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. 
\ No newline at end of file diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 353617983596..10596ed85d60 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,5 +5,7 @@ ray >= 2.9 nvidia-ml-py # for pynvml package torch == 2.3.0 +# These must be updated alongside torch +torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0 diff --git a/requirements-test.txt b/requirements-test.txt index fef0ede7be0f..8b68e0e93966 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,7 +14,6 @@ peft requests ray sentence-transformers # required for embedding -torchvision # required for the image processor of phi3v # Benchmarking aiohttp diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 23454759827d..a29d50df4c4e 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -22,6 +22,7 @@ def iter_phi3v_configs(model_name: str): image_hw_to_feature_size = { (1008, 1344): 1921, + (2016, 2688): 1933, } for (h, w), f in image_hw_to_feature_size.items(): @@ -75,6 +76,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] # Since we use _attn_implementation="eager" for hf_runner, here is # numeric difference for longer context and test can't pass +@pytest.mark.xfail( + reason="Inconsistent image processor being used due to lack " + "of support for dynamic image token replacement") @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index fa20a7c5903d..dac832a686c2 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -13,14 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
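# A quick worked example of the hd-transform helpers introduced further down
# in this patch (sizes here are illustrative only): for a 640x480 input,
# ratio = 640/480, and the largest scale with scale * ceil(scale / ratio) <= 16
# is 4, giving new_width = 4 * 336 = 1344 and new_height = int(1344 / ratio) = 1008.
# 1008 is already a multiple of 336, so no padding is added and the image is
# resized to 1344x1008, which matches the (1008, 1344) shape exercised in
# tests/models/test_phi3v.py.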
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict +import numpy as np import torch import torch.nn as nn +from PIL import Image from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -32,9 +35,11 @@ from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import get_dummy_image_data +from vllm.multimodal.image import ImagePixelData, get_dummy_image_data from vllm.sequence import SamplerOutput +logger = init_logger(__name__) + _KEYS_TO_MODIFY_MAPPING = { "model.vision_embed_tokens": "vision_embed_tokens", } @@ -268,7 +273,63 @@ class Phi3VImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -@MULTIMODAL_REGISTRY.register_image_pixel_input() +# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_padded_size(width, height, padding_unit=336): + target_height = int(np.ceil(height / padding_unit) * padding_unit) + top_padding = int((target_height - height) / 2) + bottom_padding = target_height - height - top_padding + padded_width = width + padded_height = height + top_padding + bottom_padding + return padded_width, padded_height + + +# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py +def calc_hd_transform_size(width, height, hd_num=16): + transposed = False + if width < height: + width, height = height, width + transposed = True + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + + new_width = int(scale * 336) + new_height = int(new_width / ratio) + + padded_width, padded_height = calc_padded_size(new_width, new_height) + + if transposed: + padded_width, padded_height = padded_height, padded_width + + return padded_width, padded_height + + +def _image_processor( + data: ImagePixelData, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Dict[str, torch.Tensor]: + image = data.image + + if isinstance(image, Image.Image): + # Temporary patch before dynamic number of image tokens is supported + _, _, h, w = vlm_config.image_input_shape + if (w, h) != calc_hd_transform_size(image.width, image.height): + logger.warning( + "Dynamic image shape is currently not supported. 
" + "Resizing input image to (%d, %d).", w, h) + + data.image = image.resize((w, h)) + + return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + ._default_input_processor(data, model_config, vlm_config) + + +@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor) @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class Phi3VForCausalLM(VisionLanguageModelBase): From c2462129521a64b62ace77b28641d2e3bec5831c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 24 Jun 2024 00:37:42 -0700 Subject: [PATCH 014/392] [doc][faq] add warning to download models for every nodes (#5783) --- docs/source/serving/distributed_serving.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index b0c45dbf7026..2a7937a9189c 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,4 +35,7 @@ To scale vLLM beyond a single machine, install and start a `Ray runtime -After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. \ No newline at end of file +After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. + +.. warning:: + Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes. From e72dc6cb3507d914eec8dfd0d5c7b9478f6a8ccc Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:26:17 -0400 Subject: [PATCH 015/392] [Doc] Add "Suggest edit" button to doc pages (#5789) --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index ca26dcec4bb5..af1f22b23c12 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,7 @@ 'path_to_docs': 'docs/source', 'repository_url': 'https://github.com/vllm-project/vllm', 'use_repository_button': True, + 'use_edit_page_button': True, } # Add any paths that contain custom static files (such as style sheets) here, From 1744cc99ba9bdefea8f3f798cf51ed650b81a98e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Jun 2024 13:48:55 -0400 Subject: [PATCH 016/392] [Doc] Add Phi-3-medium to list of supported models (#5788) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f4673dc27092..47737ae52520 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -129,7 +129,7 @@ Alongside each architecture, we include some popular models that use it. - ✅︎ * - :code:`Phi3ForCausalLM` - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc. + - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. 
- * - :code:`Phi3SmallForCausalLM` - Phi-3-Small From ba991d5c84adbc0685075af88333c688ddb06011 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Mon, 24 Jun 2024 16:01:19 -0700 Subject: [PATCH 017/392] [Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args (#5795) --- vllm/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index ce5c377eff2d..f0c7df5cf8c2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -822,7 +822,13 @@ def parse_args(self, args=None, namespace=None): processed_args = [] for arg in args: if arg.startswith('--'): - processed_args.append('--' + arg[len('--'):].replace('_', '-')) + if '=' in arg: + key, value = arg.split('=', 1) + key = '--' + key[len('--'):].replace('_', '-') + processed_args.append(f'{key}={value}') + else: + processed_args.append('--' + + arg[len('--'):].replace('_', '-')) else: processed_args.append(arg) From e9de9dd551ac595a9f3825fcd1507deceef4f332 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 24 Jun 2024 21:09:02 -0700 Subject: [PATCH 018/392] [ci] Remove aws template (#5757) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 7 +- .buildkite/test-template-aws.j2 | 145 -------------------------------- 2 files changed, 5 insertions(+), 147 deletions(-) delete mode 100644 .buildkite/test-template-aws.j2 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0b87e6280f0b..19b1bce16288 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + steps: - label: Regression Test diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 1a7fb44c2ecc..000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,145 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel CPU Test" - depends_on: ~ - agents: - queue: intel-cpu - command: bash .buildkite/run-cpu-test.sh - - - label: "Intel GPU Test" - depends_on: ~ - agents: - queue: intel-gpu - command: bash .buildkite/run-xpu-test.sh - - {% for step in steps %} - {% if step.gpu == "a100" %} - - label: "{{ step.label }}" - agents: - queue: a100-queue - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - {% else %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endif %} - {% endfor %} From f23871e9eead900d6146961ca894f5bc91f30f5e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 25 Jun 2024 16:25:03 +0800 Subject: [PATCH 019/392] [Doc] Add notice about breaking changes to VLMs (#5818) --- docs/source/models/vlm.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 
70ac82e2005b..de55a1a09919 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -5,6 +5,9 @@ Using VLMs vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM. +.. important:: + We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. + Engine Arguments ---------------- @@ -39,6 +42,10 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` image_feature_size=576, ) +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + + To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. @@ -63,6 +70,9 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS A code example can be found in `examples/llava_example.py `_. +.. important:: + We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model. + Online OpenAI Vision API Compatible Inference ---------------------------------------------- @@ -89,6 +99,9 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with --image-feature-size 576 \ --chat-template template_llava.jinja +.. important:: + We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration. + To consume the server, you can use the OpenAI client like in the example below: .. code-block:: python From 2ce5d6688bae64e467640b05e73af2888e93afcf Mon Sep 17 00:00:00 2001 From: Woo-Yeon Lee Date: Tue, 25 Jun 2024 18:56:06 +0900 Subject: [PATCH 020/392] [Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414) --- .buildkite/test-pipeline.yaml | 3 +- benchmarks/benchmark_latency.py | 6 + .../e2e/test_integration_dist_tp2.py | 111 +++++++++++++ ...n_dist.py => test_integration_dist_tp4.py} | 41 +++-- vllm/config.py | 24 ++- vllm/distributed/parallel_state.py | 76 ++++++--- vllm/engine/arg_utils.py | 10 ++ vllm/spec_decode/multi_step_worker.py | 11 +- vllm/spec_decode/proposer_worker_base.py | 4 +- .../spec_decode/smaller_tp_proposer_worker.py | 149 ++++++++++++++++++ vllm/spec_decode/spec_decode_worker.py | 12 +- 11 files changed, 388 insertions(+), 59 deletions(-) create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py rename tests/spec_decode/e2e/{test_integration_dist.py => test_integration_dist_tp4.py} (62%) create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 19b1bce16288..10cfe35d85be 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -54,7 +54,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - pytest -v -s 
spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -71,6 +71,7 @@ steps: # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a4cf0632b779..f3d00e456f15 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,8 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, @@ -127,6 +129,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py new file mode 100644 index 000000000000..5534b80c0aaa --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -0,0 +1,111 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from vllm.utils import is_hip + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. 
+ 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when tensor parallelism is used. + """ + if is_hip(): + pytest.skip("hip is not well-supported yet") + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py similarity index 62% rename from tests/spec_decode/e2e/test_integration_dist.py rename to tests/spec_decode/e2e/test_integration_dist_tp4.py index d444ef24cbfd..56cb0147d9e4 100644 --- a/tests/spec_decode/e2e/test_integration_dist.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -5,16 +5,16 @@ import pytest import torch -from vllm.utils import is_hip - from .conftest import run_greedy_equality_correctness_test -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") @pytest.mark.parametrize( "common_llm_kwargs", [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", # Skip cuda graph recording for fast test. @@ -22,7 +22,7 @@ # Required for spec decode. "use_v2_block_manager": True, - "tensor_parallel_size": 2, + "tensor_parallel_size": 4, # Use AsyncLLM engine, so that the engine runs in its own process. # Otherwise, since vLLM does not follow true SPMD, the test runner @@ -31,35 +31,30 @@ # second run of the test to fail with internal NCCL error. 
"use_async": True, }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }, - { - "speculative_model": "[ngram]", "num_speculative_tokens": 5, - "ngram_prompt_lookup_max": 3, }, ]) -@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( - "output_len", + "test_llm_kwargs", [ - # Use smaller output len for fast test. - 32, + #TODO(wooyeon): add spec_draft_dp=2 case + { + "speculative_draft_tensor_parallel_size": 1, + }, ]) +@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - """Verify greedy equality when tensor parallelism is used. +def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. """ - if is_hip(): - pytest.skip("hip is not well-supported yet") run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, - max_output_len=output_len, + max_output_len=32, force_output_len=True) diff --git a/vllm/config.py b/vllm/config.py index 8d004902fe4f..0217a2b56992 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -797,6 +797,7 @@ def maybe_create_spec_config( target_parallel_config: ParallelConfig, target_dtype: str, speculative_model: Optional[str], + speculative_draft_tensor_parallel_size: Optional[int], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, @@ -819,6 +820,8 @@ def maybe_create_spec_config( target_dtype (str): The data type used for the target model. speculative_model (Optional[str]): The name of the speculative model, if provided. + speculative_draft_tensor_parallel_size (Optional[int]): The degree + of the tensor parallelism for the draft model. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. Will default to the number in the draft model config if present, otherwise is required. @@ -939,7 +942,8 @@ def maybe_create_spec_config( draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( - target_parallel_config)) + target_parallel_config, + speculative_draft_tensor_parallel_size)) if num_speculative_tokens is None: raise ValueError( @@ -993,16 +997,26 @@ def _maybe_override_draft_max_model_len( @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig) -> ParallelConfig: + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int] + ) -> ParallelConfig: """Create a parallel config for use by the draft worker. - This is mostly a copy of the target parallel config. In the future the - draft worker can have a different parallel strategy, e.g. TP=1. + This is mostly a copy of the target parallel config, except the tp_size. 
""" + if speculative_draft_tensor_parallel_size is None: + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size + elif speculative_draft_tensor_parallel_size != 1: + # TODO(wooyeon): allow tp values larger than 1 + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be" + f"other value than 1") + draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. pipeline_parallel_size, - tensor_parallel_size=target_parallel_config.tensor_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, distributed_executor_backend=target_parallel_config. distributed_executor_backend, max_parallel_loading_workers=target_parallel_config. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5f1decb376af..a7a806b05568 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -676,6 +676,28 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def init_world_group(ranks: List[int], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=[ranks], + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=False, + use_custom_allreduce=False, + ) + + +def init_model_parallel_group(group_ranks: List[List[int]], local_rank: int, + backend: str) -> GroupCoordinator: + return GroupCoordinator( + group_ranks=group_ranks, + local_rank=local_rank, + torch_distributed_backend=backend, + use_pynccl=True, + use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, + ) + + _TP: Optional[GroupCoordinator] = None @@ -764,13 +786,7 @@ def init_distributed_environment( global _WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) - _WORLD = GroupCoordinator( - group_ranks=[ranks], - local_rank=local_rank, - torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=False, - ) + _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") @@ -827,13 +843,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - _TP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _TP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) # Build the pipeline model-parallel groups. num_pipeline_model_parallel_groups: int = (world_size // @@ -845,13 +856,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - _PP = GroupCoordinator( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE, - ) + _PP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, backend) def ensure_model_parallel_initialized( @@ -887,6 +893,34 @@ def model_parallel_is_initialized(): return (_TP is not None and _PP is not None) +_TP_STATE_PATCHED = False + + +@contextmanager +def patch_tensor_parallel_group(tp_group: GroupCoordinator): + """Patch the tp group temporarily until this function ends. 
+ + This method is for draft workers of speculative decoding to run draft model + with different tp degree from that of target model workers. + + Args: + tp_group (GroupCoordinator): the tp group coordinator + """ + global _TP_STATE_PATCHED + assert not _TP_STATE_PATCHED, "Should not call when it's already patched" + + _TP_STATE_PATCHED = True + old_tp_group = get_tp_group() + global _TP + _TP = tp_group + try: + yield + finally: + # restore the original state + _TP_STATE_PATCHED = False + _TP = old_tp_group + + def get_tensor_model_parallel_world_size(): """Return world size for the tensor model parallel group.""" return get_tp_group().world_size diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ef31612420c9..16374098b23d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -94,6 +94,7 @@ class EngineArgs: guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. speculative_model: Optional[str] = None + speculative_draft_tensor_parallel_size: Optional[int] = None num_speculative_tokens: Optional[int] = None speculative_max_model_len: Optional[int] = None speculative_disable_by_batch_size: Optional[int] = None @@ -537,6 +538,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding.') + parser.add_argument( + '--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=EngineArgs.speculative_draft_tensor_parallel_size, + help='Number of tensor parallel replicas for ' + 'the draft model in speculative decoding.') parser.add_argument( '--speculative-max-model-len', @@ -686,6 +694,8 @@ def create_engine_config(self, ) -> EngineConfig: target_parallel_config=parallel_config, target_dtype=self.dtype, speculative_model=self.speculative_model, + speculative_draft_tensor_parallel_size = \ + self.speculative_draft_tensor_parallel_size, num_speculative_tokens=self.num_speculative_tokens, speculative_disable_by_batch_size=self. speculative_disable_by_batch_size, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 668ceefe6175..e469fd7c3a16 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) -from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -28,9 +29,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Lazy initialization list. - self._proposer: Top1Proposer + self._proposer: SpeculativeProposer - def init_device(self): + def init_device(self) -> None: super().init_device() self._proposer = Top1Proposer( @@ -40,7 +41,7 @@ def init_device(self): max_proposal_len=self.max_model_len, ) - def set_include_gpu_probs_tensor(self): + def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for multi_step_worker self.model_runner.model.sampler.include_gpu_probs_tensor = True @@ -73,7 +74,7 @@ def sampler_output( # Run model sample_len times. 
model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): - model_output = super().execute_model( + model_output: List[SamplerOutput] = super().execute_model( execute_model_req=copied_execute_model_req) assert (len(model_output) == 1 ), "composing multistep workers not supported" diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index fd67ceb912ee..b691659fb292 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -3,10 +3,10 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class ProposerWorkerBase(WorkerBase, SpeculativeProposer): +class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer): """Interface for proposer workers""" @abstractmethod diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py new file mode 100644 index 000000000000..b78e4489513f --- /dev/null +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -0,0 +1,149 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.distributed.parallel_state import (get_tp_group, + init_model_parallel_group, + patch_tensor_parallel_group) +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase + +logger = init_logger(__name__) + + +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + To implement this feature, this class differs behavior based on is_dummy + flag, where dummy means worker that does not participate draft generation. + Participating workers use a smaller tp group by patching vLLM's tensor + parallel group temporarily during forward passes of draft models. + """ + + @classmethod + def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, + target_tensor_parallel_size: int): + """Wrap the worker in a SmallerTpProposerWorker if necessary. + """ + if draft_tensor_parallel_size == target_tensor_parallel_size: + return worker + + # gpu ranks that will generate draft tokens together + draft_ranks = list(range(draft_tensor_parallel_size)) + + logger.info("Wrapping {%s} in {%s}", type(worker), cls) + return cls(worker, draft_ranks) + + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): + """Create a SmallerTpProposerWorker. + + Args: + worker (MultiStepWorker): an actual worker wrapped with this class + draft_ranks (List[int]): if this value is given, only the GPU ranks + written in this value participate in draft generation + """ + self._worker = worker + self._draft_ranks = draft_ranks + + # init during init_device + self._is_dummy = False + self._tp_group = None + + def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. 
+ """ + return patch_tensor_parallel_group(self._tp_group) + + def init_device(self) -> None: + self._is_dummy = get_tp_group().rank not in self._draft_ranks + + # dummy workers do nothing + if self._is_dummy: + return + + # creates tp process group containing only a subset of gpu ranks + local_rank = get_tp_group().local_rank + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) + + with self._patch_tensor_parallel_group(): + self._worker.init_device() + + def set_include_gpu_probs_tensor(self) -> None: + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + if self._is_dummy: + # this case is not used now + return -1, -1 + + with self._patch_tensor_parallel_group(): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + # Do not check _is_dummy, as it's always called by get_spec_proposals + return self._worker.sampler_output(execute_model_req, sample_len) + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. 
+ """ + if self._is_dummy: + return SpeculativeProposals(None, None, None) + + with self._patch_tensor_parallel_group(): + return self._worker.get_spec_proposals(execute_model_req) + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self._is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return self._worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self._is_dummy: + # by returning zero, target worker can use the entire kv cache space + return 0 + + return self._worker.get_cache_block_size_bytes() + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 58d3461a2518..5089e3dd556e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,7 @@ import torch -from vllm.config import SpeculativeConfig +from vllm.config import ParallelConfig, SpeculativeConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -18,6 +18,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker from vllm.spec_decode.util import (create_sequence_group_output, get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, @@ -90,7 +91,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: WorkerBase, + scorer_worker: Worker, draft_worker_kwargs: Dict[str, Any], disable_by_batch_size: Optional[int], ) -> "SpecDecodeWorker": @@ -111,7 +112,14 @@ def create_worker( proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) disable_bonus_tokens = False else: + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'parallel_config'] + draft_tp = draft_parallel_config.tensor_parallel_size + target_tp = scorer_worker.parallel_config.tensor_parallel_size + proposer_worker = MultiStepWorker(**draft_worker_kwargs) + proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( + proposer_worker, draft_tp, target_tp) logger.info("Configuring SpecDecodeWorker with proposer=%s", type(proposer_worker)) From 7b993143014c95844b380a5b05eebd14ad77b7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 26 Jun 2024 00:41:36 +0800 Subject: [PATCH 021/392] [Misc] Remove useless code in cpu_worker (#5824) --- vllm/worker/cpu_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3ee394f9912e..914df0c7df0e 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -277,7 +277,6 @@ def execute_model( assert seq_group_metadata_list is not None num_seq_groups: int = len(seq_group_metadata_list) assert execute_model_req is not None - blocks_to_copy = execute_model_req.blocks_to_copy blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device="cpu", dtype=torch.int64).view(-1, 2) From 67882dbb44186d781ab6db9eaec08f6616dc86bd Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 25 Jun 2024 10:15:10 -0700 Subject: [PATCH 022/392] [Core] Add fault tolerance for `RayTokenizerGroupPool` (#5748) --- 
tests/tokenization/test_tokenizer_group.py | 99 ++++++++++++++++ vllm/engine/async_llm_engine.py | 2 + vllm/engine/llm_engine.py | 2 + .../tokenizer_group/base_tokenizer_group.py | 4 + .../tokenizer_group/ray_tokenizer_group.py | 112 ++++++++++++++---- 5 files changed, 195 insertions(+), 24 deletions(-) diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 31571dbfff6f..1b9a59075042 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,5 +1,7 @@ import asyncio import os +import sys +from typing import List, Optional from unittest.mock import patch import pytest @@ -100,3 +102,100 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool): max_num_seqs=1, max_input_length=None) tokenizer_pool.ping() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) +async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): + """Test that Ray tokenizer pool group can recover from failures and + if that's not possible, mark itself as unhealthy.""" + + class FailingTokenizerGroup(TokenizerGroup): + + def __init__(self, + *args, + fail_at: Optional[List[int]] = None, + **kwargs): + super().__init__(*args, **kwargs) + self.i = 0 + self.fail_at = fail_at or [] + + def encode(self, *args, **kwargs): + self.i += 1 + if self.i in self.fail_at: + sys.exit(1) + return super().encode(*args, **kwargs) + + class FailingRayTokenizerGroupPool(RayTokenizerGroupPool): + _worker_cls = FailingTokenizerGroup + + # Fail at first iteration + fail_at = [1] + tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Modify fail at to not fail at all (will be re-read when actor is + # re-initialized). + fail_at[0] = 1000 + + # We should recover successfully. + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # Check that we have a new actor + assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) + assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors + + # Fail at first iteration + fail_at = [1] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + fail_at=fail_at) + + # We should fail after re-initialization. + with pytest.raises(RuntimeError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + + # check_health should raise the same thing + with pytest.raises(RuntimeError): + tokenizer_group_pool.check_health() + + # Ensure that non-ActorDiedErrors are still propagated correctly and do not + # cause a re-initialization. 
+ fail_at = [] + tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=2, + fail_at=fail_at) + tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() + + # Prompt too long error + with pytest.raises(ValueError): + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt" * 100, + lora_request=None) + await tokenizer_group_pool.encode_async(request_id="1", + prompt="prompt", + lora_request=None) + # Actors should stay the same. + assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df25eb111e87..7994b873fe9b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -310,6 +310,8 @@ async def add_request_async( ) async def check_health_async(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f7eae257fdd1..0ad957ef9f95 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1013,6 +1013,8 @@ def pin_lora(self, lora_id: int) -> bool: return self.model_executor.pin_lora(lora_id) def check_health(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() self.model_executor.check_health() def is_tracing_enabled(self) -> bool: diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 3cce96e06d1a..18fbd894f1c0 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -53,3 +53,7 @@ async def get_lora_tokenizer_async( ) -> "PreTrainedTokenizer": """Get a tokenizer for a LoRA request.""" pass + + def check_health(self): + """Raise exception if the tokenizer group is unhealthy.""" + return diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 7c605416854b..21ec2b52bc95 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -2,17 +2,21 @@ import os from typing import List, Optional +from ray.exceptions import ActorDiedError from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from transformers import PreTrainedTokenizer from vllm.config import TokenizerPoolConfig from vllm.executor.ray_utils import ray +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( BaseTokenizerGroup) from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( TokenizerGroup) +logger = init_logger(__name__) + class RayTokenizerGroupPool(BaseTokenizerGroup): """A Ray-based pool of TokenizerGroups for async tokenization.""" @@ -46,24 +50,28 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, ray_actor_options: dict, **tokenizer_config): # Store a local copy of the TokenizerGroup for quick access # to underlying HF tokenizers. 
+ self._tokenizer_config = { + "tokenizer_id": tokenizer_id, + "enable_lora": enable_lora, + "max_num_seqs": max_num_seqs, + "max_input_length": max_input_length, + **tokenizer_config + } self._local_tokenizer_group = self._worker_cls( - tokenizer_id=tokenizer_id, - enable_lora=enable_lora, - max_num_seqs=max_num_seqs, - max_input_length=max_input_length, - **tokenizer_config, - ) - - ray_tokenizer_group_cls = ray.remote( + **self._tokenizer_config, ) + + self._ray_tokenizer_group_cls = ray.remote( self._worker_cls).options(**ray_actor_options) - self.tokenizer_actors = [ - ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora, - max_num_seqs, max_input_length, - **tokenizer_config) - for _ in range(num_actors) - ] + self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)] self._idle_actors: Optional[asyncio.Queue] = None + # If set, actor is unhealthy. Will reraise on the next + # check_health call. + self._exception: Optional[ActorDiedError] = None + + def _init_actor(self) -> ray.ObjectRef: + return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config) + @property def pool_size(self) -> int: return len(self.tokenizer_actors) @@ -78,6 +86,22 @@ def _ensure_queue_initialized(self): for actor in self.tokenizer_actors: self._idle_actors.put_nowait(actor) + def _finalize_encode(self, actor: ray.ObjectRef, + original_actor: ray.ObjectRef, actor_is_alive: bool): + assert self._idle_actors is not None + # Cleanup the dead actor. + if not actor_is_alive or original_actor is not actor: + self.tokenizer_actors.remove(original_actor) + if actor_is_alive: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + # Add back the new actor. + if original_actor is not actor: + self.tokenizer_actors.append(actor) + def encode(self, prompt: str, request_id: Optional[str] = None, @@ -88,23 +112,41 @@ def encode(self, The actor is then put back in the queue for future use. This is blocking. """ + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None if self._idle_actors.empty(): raise RuntimeError("No idle actors available.") actor = self._idle_actors.get_nowait() + actor_is_alive = True + original_actor = actor try: ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request)) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request)) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret async def encode_async( @@ -120,20 +162,37 @@ async def encode_async( The actor is then put back in the queue for future use. This is non-blocking. 
""" + self.check_health() self._ensure_queue_initialized() assert self._idle_actors is not None actor = await self._idle_actors.get() + actor_is_alive = True + original_actor = actor try: ret = await actor.encode.remote(request_id=request_id, prompt=prompt, lora_request=lora_request) + except ActorDiedError as e: + # If the actor is dead, we first try to reinitialize it. + logger.warning("%s died with ActorDiedError, reinitializing.", + actor, + exc_info=e) + actor = self._init_actor() + try: + ret = await actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + except ActorDiedError as e: + logger.error( + "%s died for second time in a row, marking " + "RayTokenizerGroupPool as unhealthy.", actor) + actor_is_alive = False + if not self._exception: + self._exception = e + self.check_health() finally: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) + self._finalize_encode(actor, original_actor, actor_is_alive) return ret def get_max_input_len(self, @@ -155,6 +214,11 @@ async def get_lora_tokenizer_async( return await self._local_tokenizer_group.get_lora_tokenizer_async( lora_request) + def check_health(self): + if self._exception: + raise RuntimeError( + "TokenizerGroupPool is unhealthy.") from self._exception + def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None: """Copy over all current process environment variables to the runtime_env. From c18ebfdd71d16eb18617676b0b1d82ebde0027f0 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 25 Jun 2024 12:10:28 -0700 Subject: [PATCH 023/392] [doc][distributed] add both gloo and nccl tests (#5834) --- docs/source/getting_started/debugging.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index a22bba1478ab..4cd34769ecfb 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -28,8 +28,8 @@ If it crashes, and the error trace shows somewhere around ``self.graph.replay()` Here are some common issues that can cause hangs: -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. -- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly. +- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. 
+- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. .. code-block:: python @@ -41,7 +41,14 @@ Here are some common issues that can cause hangs: dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.cuda.synchronize() value = data.mean().item() - assert value == dist.get_world_size() + world_size = dist.get_world_size() + assert value == world_size, f"Expected {world_size}, got {value}" + + gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") + cpu_data = torch.FloatTensor([1,] * 128) + dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) + value = cpu_data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" .. tip:: From d9b34baeddc7f48a526dc610429a3c8670b3b339 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Jun 2024 15:18:03 -0400 Subject: [PATCH 024/392] [CI/Build] Add unit testing for FlexibleArgumentParser (#5798) --- tests/test_utils.py | 61 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b674ea6a85c..8203b5d2f960 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,8 @@ import pytest -from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators +from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs, + get_open_port, merge_async_iterators) from .utils import error_on_warning @@ -130,3 +131,61 @@ def test_get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: s3.bind(("localhost", get_open_port())) os.environ.pop("VLLM_PORT") + + +# Tests for FlexibleArgumentParser +@pytest.fixture +def parser(): + parser = FlexibleArgumentParser() + parser.add_argument('--image-input-type', + choices=['pixel_values', 'image_features']) + parser.add_argument('--model-name') + parser.add_argument('--batch-size', type=int) + parser.add_argument('--enable-feature', action='store_true') + return parser + + +def test_underscore_to_dash(parser): + args = parser.parse_args(['--image_input_type', 'pixel_values']) + assert args.image_input_type == 'pixel_values' + + +def test_mixed_usage(parser): + args = parser.parse_args([ + '--image_input_type', 'image_features', '--model-name', + 'facebook/opt-125m' + ]) + assert args.image_input_type == 'image_features' + assert args.model_name == 'facebook/opt-125m' + + +def test_with_equals_sign(parser): + args = parser.parse_args( + ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m']) + assert args.image_input_type == 'pixel_values' + assert args.model_name == 'facebook/opt-125m' + + +def test_with_int_value(parser): + args = parser.parse_args(['--batch_size', '32']) + assert args.batch_size == 32 + args = parser.parse_args(['--batch-size', '32']) + assert args.batch_size == 32 + + +def test_with_bool_flag(parser): + args = parser.parse_args(['--enable_feature']) + assert args.enable_feature is True + args = parser.parse_args(['--enable-feature']) + assert args.enable_feature is True + + +def test_invalid_choice(parser): + with pytest.raises(SystemExit): + parser.parse_args(['--image_input_type', 'invalid_choice']) + + +def test_missing_required_argument(parser): + parser.add_argument('--required-arg', required=True) + with pytest.raises(SystemExit): + parser.parse_args([]) From dd248f76756adba4a1637b882e79ab639f957feb Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 25 Jun 
2024 15:23:35 -0400 Subject: [PATCH 025/392] [Misc] Update `w4a16` `compressed-tensors` support to include `w8a16` (#5794) --- tests/quantization/test_compressed_tensors.py | 23 +++++++-------- .../compressed_tensors/compressed_tensors.py | 28 +++++++++++-------- .../compressed_tensors/schemes/__init__.py | 5 ++-- .../schemes/compressed_tensors_w4a16_24.py | 1 + ...s_w4a16.py => compressed_tensors_wNa16.py} | 5 ++-- 5 files changed, 36 insertions(+), 26 deletions(-) rename vllm/model_executor/layers/quantization/compressed_tensors/schemes/{compressed_tensors_w4a16.py => compressed_tensors_wNa16.py} (98%) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index aaa366335d19..6eb7ff72fb11 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -8,9 +8,9 @@ from vllm import SamplingParams from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod, CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) @pytest.mark.parametrize("model_args", [ @@ -74,26 +74,27 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): assert qkv_proj.weight.dtype is torch.int8 -@pytest.mark.parametrize("w4a16_args", [ - ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None), - ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128), -]) -def test_compressed_tensors_w4a16(vllm_runner, w4a16_args): - model, strategy, group = w4a16_args +@pytest.mark.parametrize( + "wNa16_args", + [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), + ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) +def test_compressed_tensors_w4a16(vllm_runner, wNa16_args): + model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) assert qkv_proj.scheme.strategy == strategy assert qkv_proj.scheme.group_size == group assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.weight_packed.pack_factor == 8 + assert qkv_proj.weight_packed.pack_factor == pack_factor def test_compressed_tensors_w4a16_marlin24(vllm_runner): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 44dd024afe74..c69e2f3bcf9f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -7,9 +7,10 @@ from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, 
CompressedTensorsW4A16, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken, - CompressedTensorsW8A8StaticTensor) + W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) @@ -108,26 +109,31 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, return is_8_bits and is_token and is_symmetric and is_dynamic - def _is_w4a16(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_wNa16_group_channel(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: input_quant_none = input_quant is None - is_4_bits = weight_quant.num_bits == 4 is_symmetric = weight_quant.symmetric + is_channel_group = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value + or weight_quant.strategy == QuantizationStrategy.GROUP.value) is_static = not weight_quant.dynamic - return is_4_bits and input_quant_none and is_symmetric and is_static + return (is_channel_group and input_quant_none and is_symmetric + and is_static) def _get_schema(self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": - if self._is_w4a16(weight_quant, input_quant): - if self.quant_format == CompressionFormat.marlin_24.value: + if self._is_wNa16_group_channel(weight_quant, input_quant): + if (self.quant_format == CompressionFormat.marlin_24.value + and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if self.quant_format == CompressionFormat.pack_quantized.value: - return CompressedTensorsW4A16( + if (self.quant_format == CompressionFormat.pack_quantized.value + and weight_quant.num_bits in WNA16_SUPPORTED_BITS): + return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, strategy=weight_quant.strategy, group_size=weight_quant.group_size) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 3c95aa11fc76..f6d20ce2c6f7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,10 +1,11 @@ from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401 CompressedTensorsUnquantized) -from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401 from .compressed_tensors_w4a16_24 import ( # noqa: F401 - CompressedTensorsW4A16Sparse24) + W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 CompressedTensorsW8A8DynamicToken) from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 CompressedTensorsW8A8StaticTensor) +from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS # noqa: F401 +from .compressed_tensors_wNa16 import CompressedTensorsWNA16 # noqa: F401 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 
d7e04ddb8d94..607029c819dd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -11,6 +11,7 @@ from vllm.model_executor.utils import set_weight_attrs __all__ = ["CompressedTensorsW4A16Sparse24"] +W4A16SPARSE24_SUPPORTED_BITS = [4] class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py similarity index 98% rename from vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 373458cfffe0..7707ea6ee94b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -11,10 +11,11 @@ marlin_permute_scales) from vllm.model_executor.utils import set_weight_attrs -__all__ = ["CompressedTensorsW4A16"] +__all__ = ["CompressedTensorsWNA16"] +WNA16_SUPPORTED_BITS = [4, 8] -class CompressedTensorsW4A16(CompressedTensorsScheme): +class CompressedTensorsWNA16(CompressedTensorsScheme): def __init__(self, strategy: str, From bc34937d68e9715d8416457539fb528301cf6269 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 25 Jun 2024 15:25:52 -0700 Subject: [PATCH 026/392] [Hardware][TPU] Refactor TPU backend (#5831) --- vllm/executor/tpu_executor.py | 58 +++++++++++++++++++++------------ vllm/worker/tpu_model_runner.py | 4 +++ vllm/worker/tpu_worker.py | 35 +++++++++++++------- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 5ed00e137410..7fe5349c987a 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -1,4 +1,4 @@ -from typing import List, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import torch @@ -26,29 +26,45 @@ def _init_executor(self) -> None: self.model_config.dtype = torch.bfloat16 # Instantiate the worker and load the model to the device. 
- self._init_worker() - - def _init_worker(self): - from vllm.worker.tpu_worker import TPUWorker + self.driver_worker = self._create_worker() + self.driver_worker.init_device() + self.driver_worker.load_model() - assert self.parallel_config.world_size == 1, ( - "TPUExecutor currently only supports a single TPU chip.") - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = TPUWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - self.cache_config, - self.load_config, - self.vision_language_config, - local_rank=0, - rank=0, + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None, + ) -> Dict[str, Any]: + """Return worker init args for a given rank.""" + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, distributed_init_method=distributed_init_method, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None, + ): + from vllm.worker.tpu_worker import TPUWorker + + worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return worker def initialize_cache( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 5003d3b0ca44..2d8fffe5ac16 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -33,6 +33,7 @@ def __init__( cache_config: CacheConfig, load_config: LoadConfig, vision_language_config: Optional[VisionLanguageConfig] = None, + is_driver_worker: bool = False, ): self.model_config = model_config self.parallel_config = parallel_config @@ -41,6 +42,7 @@ def __init__( self.cache_config = cache_config self.load_config = load_config self.vision_language_config = vision_language_config + self.is_driver_worker = is_driver_worker self.block_size = self.cache_config.block_size self.max_num_blocks_per_seq = (self.model_config.max_model_len // @@ -373,6 +375,8 @@ def _execute_model( inputs = self.prepare_inputs(seq_group_metadata_list) next_token_ids = self.model(inputs[0], inputs[1], kv_caches, *inputs[2:]) + if not self.is_driver_worker: + return [] next_token_ids = next_token_ids.cpu().tolist() i = 0 diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 04576015dadb..828bb89d70ba 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -34,6 +34,7 @@ def __init__( local_rank: int, rank: int, distributed_init_method: str, + is_driver_worker: bool, ) -> None: self.model_config = model_config self.parallel_config = parallel_config @@ -45,6 +46,7 @@ def __init__( self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker assert self.device_config.device_type == "tpu" if self.cache_config.cache_dtype == "auto": @@ -53,10 +55,14 @@ def __init__( self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_config.cache_dtype] - self.model_runner = 
TPUModelRunner(model_config, parallel_config, - scheduler_config, device_config, - cache_config, load_config, - vision_language_config) + self.model_runner = TPUModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config, + vision_language_config, + is_driver_worker=is_driver_worker) def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" @@ -175,16 +181,13 @@ def get_cache_block_size_bytes(self) -> int: def execute_model( self, - execute_model_req: Optional[ExecuteModelRequest] = None + execute_model_req: Optional[ExecuteModelRequest] = None, ) -> List[SamplerOutput]: - if execute_model_req is None: - return [] - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - num_seq_groups = len(seq_group_metadata_list) - if num_seq_groups == 0: + if not self.is_driver_worker: + self._execute_model_non_driver() return [] + assert execute_model_req is not None # Currently, TPUWorker does not support swapping. # TODO(woosuk): Support block copying. assert len(execute_model_req.blocks_to_swap_in) == 0, ( @@ -193,6 +196,16 @@ def execute_model( "Swapping is not supported for the TPU backend.") assert len(execute_model_req.blocks_to_copy) == 0 + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + assert len(seq_group_metadata_list) > 0 output = self.model_runner.execute_model(seq_group_metadata_list, self.tpu_cache) return [output] + + def start_worker_execution_loop(self) -> None: + while self._execute_model_non_driver(): + pass + + def _execute_model_non_driver(self) -> bool: + self.model_runner.execute_model(None, self.tpu_cache) + return True From dd793d1de59b5efad25f4794b68cb935824c7a11 Mon Sep 17 00:00:00 2001 From: Matt Wong <156021403+mawong-amd@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:56:15 -0500 Subject: [PATCH 027/392] [Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422) --- CMakeLists.txt | 20 +- Dockerfile.rocm | 209 ++++++++++++------ cmake/utils.cmake | 20 +- .../getting_started/amd-installation.rst | 6 +- tests/async_engine/test_openapi_server_ray.py | 4 +- tests/distributed/test_utils.py | 17 +- tests/entrypoints/test_openai_embedding.py | 4 +- tests/entrypoints/test_openai_server.py | 4 +- tests/entrypoints/test_openai_vision.py | 4 +- tests/utils.py | 38 +++- vllm/config.py | 10 +- .../custom_all_reduce_utils.py | 11 +- vllm/executor/multiproc_gpu_executor.py | 8 +- vllm/utils.py | 16 +- vllm/worker/worker_base.py | 10 +- 15 files changed, 259 insertions(+), 122 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa15b632cdd3..801429096eaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,8 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # versions are derived from Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +97,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. 
enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 6bda696859c8..652f04adf895 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,34 +1,35 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Tested and supported base rocm/pytorch images +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to build CK-based flash-attention +# If 0, will not build flash attention +# This is useful for gfx target where flash-attention is not supported +# (i.e. those that do not appear in `FA_GFX_ARCHS`) +# Triton FA is used by default on ROCm now so this is unnecessary. 
+ARG BUILD_FA="1" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" - -# whether to build triton on rocm +# Whether to build triton on rocm ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="0ef1848" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -39,79 +40,159 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +RUN pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.4.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ + *"rocm-6.0"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ + *"rocm-6.1"*) \ + pip uninstall -y torch \ + && pip install --no-cache-dir --pre torch==2.4.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ + && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ + && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ + *) ;; esac \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . \ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ - && python3 setup.py install \ - && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ - && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ - && cd .. + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.0"*) \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ + && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y amdsmi; +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 071e16336dfa..4869cad54113 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if (${GPU_LANG} STREQUAL "HIP") # # `GPU_ARCHES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. 
# - + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() # # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - - set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") - set(${GPU_ARCHES}) - foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) + foreach (_ARCH ${HIP_ARCHITECTURES}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() @@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if(NOT ${GPU_ARCHES}) message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 61fcd45a2634..cc41d47296f8 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -88,7 +88,7 @@ Option 2: Build from source - `Pytorch `_ - `hipBLAS `_ -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`. +For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`. Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started `_ @@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl $ cd vllm $ pip install -U -r requirements-rocm.txt - $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation + $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation .. tip:: - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention. + - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - The ROCm version of pytorch, ideally, should match the ROCm driver version. diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index cc05d79e5687..332937b874e9 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,7 +4,7 @@ # and debugging. 
import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @@ -12,7 +12,7 @@ @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 49d11daca9ae..9ff11b0d27b1 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,8 +1,8 @@ -import os - import ray -from vllm.utils import cuda_device_count_stateless +import vllm.envs as envs +from vllm.utils import (cuda_device_count_stateless, is_hip, + update_environment_variables) @ray.remote @@ -12,16 +12,21 @@ def get_count(self): return cuda_device_count_stateless() def set_cuda_visible_devices(self, cuda_visible_devices: str): - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) def get_cuda_visible_devices(self): - return os.environ["CUDA_VISIBLE_DEVICES"] + return envs.CUDA_VISIBLE_DEVICES def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - + if is_hip(): + # Set HIP_VISIBLE_DEVICES == CUDA_VISIBLE_DEVICES. Conversion + # is handled by `update_environment_variables` + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES}) actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() assert sorted(ray.get( diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py index 2496d2ac3e97..45f701733df0 100644 --- a/tests/entrypoints/test_openai_embedding.py +++ b/tests/entrypoints/test_openai_embedding.py @@ -2,7 +2,7 @@ import pytest import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -11,7 +11,7 @@ @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c22a675ff123..5196d8181550 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -16,7 +16,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -81,7 +81,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index 03dc5d1161f0..0e8d88b76ffe 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -8,7 +8,7 @@ from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer MODEL_NAME = "llava-hf/llava-1.5-7b-hf" LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / @@ -27,7 +27,7 @@ @pytest.fixture(scope="module") def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) + ray.init() yield ray.shutdown() diff --git a/tests/utils.py 
b/tests/utils.py index 174efca4af53..2a5f82b91c42 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,9 +15,30 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.utils import get_open_port, is_hip -if (not is_hip()): +if is_hip(): + from amdsmi import (amdsmi_get_gpu_vram_usage, + amdsmi_get_processor_handles, amdsmi_init, + amdsmi_shut_down) + + @contextmanager + def _nvml(): + try: + amdsmi_init() + yield + finally: + amdsmi_shut_down() +else: from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit) + nvmlInit, nvmlShutdown) + + @contextmanager + def _nvml(): + try: + nvmlInit() + yield + finally: + nvmlShutdown() + # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) @@ -160,20 +181,25 @@ def error_on_warning(): yield +@_nvml() def wait_for_gpu_memory_to_clear(devices: List[int], threshold_bytes: int, timeout_s: float = 120) -> None: # Use nvml instead of pytorch to reduce measurement error from torch cuda # context. - nvmlInit() start_time = time.time() while True: output: Dict[int, str] = {} output_raw: Dict[int, float] = {} for device in devices: - dev_handle = nvmlDeviceGetHandleByIndex(device) - mem_info = nvmlDeviceGetMemoryInfo(dev_handle) - gb_used = mem_info.used / 2**30 + if is_hip(): + dev_handle = amdsmi_get_processor_handles()[device] + mem_info = amdsmi_get_gpu_vram_usage(dev_handle) + gb_used = mem_info["vram_used"] / 2**10 + else: + dev_handle = nvmlDeviceGetHandleByIndex(device) + mem_info = nvmlDeviceGetMemoryInfo(dev_handle) + gb_used = mem_info.used / 2**30 output_raw[device] = gb_used output[device] = f'{gb_used:.02f}' diff --git a/vllm/config.py b/vllm/config.py index 0217a2b56992..0c4d770e4684 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -7,13 +7,15 @@ import torch from transformers import PretrainedConfig, PreTrainedTokenizerBase +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_tpu, is_xpu) + is_hip, is_neuron, is_tpu, is_xpu, + update_environment_variables) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -634,6 +636,12 @@ def __init__( self.distributed_executor_backend = backend logger.info("Defaulting to use %s for distributed inference", backend) + # If CUDA_VISIBLE_DEVICES is set on ROCm prior to vLLM init, + # propagate changes to HIP_VISIBLE_DEVICES (conversion handled by + # the update_environment_variables function) + if is_hip() and envs.CUDA_VISIBLE_DEVICES: + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES}) self._verify_args() diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index d3e41fa71067..6f1aaed9881a 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -13,7 +13,8 @@ import vllm.envs as envs from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger -from vllm.utils import cuda_device_count_stateless +from vllm.utils import 
(cuda_device_count_stateless, + update_environment_variables) logger = init_logger(__name__) @@ -24,7 +25,8 @@ def producer(batch_src: Sequence[int], result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) lib = CudaRTLibrary() for i in batch_src: @@ -56,7 +58,8 @@ def consumer(batch_tgt: Sequence[int], result_queue, cuda_visible_devices: Optional[str] = None): if cuda_visible_devices is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + update_environment_variables( + {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) lib = CudaRTLibrary() for j in batch_tgt: @@ -123,7 +126,7 @@ def can_actually_p2p( processes for testing all pairs of GPUs in batch. The trick is to reset the device after each test (which is not available in PyTorch). """ # noqa - cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES # pass the CUDA_VISIBLE_DEVICES to the child process # to make sure they see the same set of GPUs diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index e63e5a3a027f..a5b1d27f2759 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -11,7 +11,8 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (cuda_device_count_stateless, get_distributed_init_method, get_open_port, - get_vllm_instance_id, make_async) + get_vllm_instance_id, make_async, + update_environment_variables) logger = init_logger(__name__) @@ -25,8 +26,9 @@ def _init_executor(self) -> None: # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = (",".join( - map(str, range(world_size)))) + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() diff --git a/vllm/utils.py b/vllm/utils.py index f0c7df5cf8c2..92abdb3fb9b1 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -376,6 +376,10 @@ def get_open_port() -> int: def update_environment_variables(envs: Dict[str, str]): + if is_hip() and "CUDA_VISIBLE_DEVICES" in envs: + # Propagate changes to CUDA_VISIBLE_DEVICES to + # ROCm's HIP_VISIBLE_DEVICES as well + envs["HIP_VISIBLE_DEVICES"] = envs["CUDA_VISIBLE_DEVICES"] for k, v in envs.items(): if k in os.environ and os.environ[k] != v: logger.warning( @@ -779,9 +783,14 @@ def _cuda_device_count_stateless( if not torch.cuda._is_compiled(): return 0 - # bypass _device_count_nvml() if rocm (not supported) - nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml() - r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count + if is_hip(): + # ROCm uses amdsmi instead of nvml for stateless device count + # This requires a sufficiently modern version of Torch 2.4.0 + raw_count = torch.cuda._device_count_amdsmi() if (hasattr( + torch.cuda, "_device_count_amdsmi")) else -1 + else: + raw_count = torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count return r @@ -795,7 +804,6 @@ def cuda_device_count_stateless() -> int: # This can be removed and simply replaced with torch.cuda.get_device_count # after 
https://github.com/pytorch/pytorch/pull/122815 is released. - return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index dc09718de4a3..99482aa93bc5 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (enable_trace_function_call_for_thread, +from vllm.utils import (enable_trace_function_call_for_thread, is_hip, update_environment_variables) logger = init_logger(__name__) @@ -125,6 +125,14 @@ def update_environment_variables(envs: Dict[str, str]) -> None: # overwriting CUDA_VISIBLE_DEVICES is desired behavior # suppress the warning in `update_environment_variables` del os.environ[key] + if is_hip(): + hip_env_var = "HIP_VISIBLE_DEVICES" + if hip_env_var in os.environ: + logger.warning( + "Ignoring pre-set environment variable `%s=%s` as " + "%s has also been set, which takes precedence.", + hip_env_var, os.environ[hip_env_var], key) + os.environ.pop(hip_env_var, None) update_environment_variables(envs) def init_worker(self, *args, **kwargs): From f178e56c68d97e3a29a8a885a09dd61f8d534732 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 25 Jun 2024 16:58:23 -0700 Subject: [PATCH 028/392] [Hardware][TPU] Raise errors for unsupported sampling params (#5850) --- vllm/worker/tpu_model_runner.py | 63 +++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 2d8fffe5ac16..2c70c1f917a0 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -20,6 +20,8 @@ logger = init_logger(__name__) _PAD_SLOT_ID = 0 # FIXME(woosuk) +# FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow. +_ENABLE_TOP_P = False class TPUModelRunner: @@ -339,9 +341,34 @@ def _prepare_sample( assert seq_group_metadata.sampling_params is not None sampling_params = seq_group_metadata.sampling_params + # NOTE(woosuk): Here we mimic argmax sampling by applying a very + # low temperature. This is not accurate. 
t.append(sampling_params.temperature if sampling_params.temperature >= 1e-5 else 1e-5) + if sampling_params.top_p != 1 and not _ENABLE_TOP_P: + raise NotImplementedError( + "Top-p sampling is currently disabled for the TPU backend " + "due to performance issues.") p.append(sampling_params.top_p) + if sampling_params.top_k != -1: + raise NotImplementedError( + "Top-k sampling is currently disabled for the TPU backend " + "due to performance issues.") + if sampling_params.best_of > 1: + raise NotImplementedError( + "best_of > 1 is not currently supported by the TPU " + "backend.") + if sampling_params.use_beam_search: + raise NotImplementedError( + "Beam search is not supported by the TPU backend.") + if sampling_params.logprobs is not None: + raise NotImplementedError( + "logprobs is not currently supported by the TPU backend.") + if sampling_params.prompt_logprobs is not None: + raise NotImplementedError( + "prompt_logprobs is not currently supported by the TPU " + "backend.") + num_paddings = padded_batch_size - len(seq_group_metadata_list) t += [1.0] * num_paddings p += [1.0] * num_paddings @@ -350,35 +377,32 @@ def _prepare_sample( p = torch.tensor(p, dtype=torch.float32, device=self.device) return t, p - def prepare_inputs( + def _execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ): - assert seq_group_metadata_list is not None + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> List[CompletionSequenceGroupOutput]: + # Prepare inputs. assert len(seq_group_metadata_list) > 0 # NOTE: We assume that all sequences in the group are all prompts or # all decodes. - if seq_group_metadata_list[0].is_prompt: + is_prompt = seq_group_metadata_list[0].is_prompt + if is_prompt: inputs = self._prepare_prompt(seq_group_metadata_list) else: inputs = self._prepare_decode(seq_group_metadata_list) padded_batch_size = inputs[0].shape[0] - sample_inputs = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) - return inputs + sample_inputs + t, p = self._prepare_sample(seq_group_metadata_list, padded_batch_size) - def _execute_model( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> List[CompletionSequenceGroupOutput]: - inputs = self.prepare_inputs(seq_group_metadata_list) + # Execute the model. next_token_ids = self.model(inputs[0], inputs[1], kv_caches, - *inputs[2:]) - if not self.is_driver_worker: - return [] + *inputs[2:], t, p) + # Retrieve the outputs to CPU. next_token_ids = next_token_ids.cpu().tolist() + # NOTE(woosuk): Minimal code to construct the sampler outputs. + # The TPU backend does not reuse the sampler, since the TPU backend + # does not support the advanced sampling parameters such as logprobs. i = 0 sampler_outputs = [] for seq_group_metadata in seq_group_metadata_list: @@ -400,6 +424,7 @@ def execute_model( kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> SamplerOutput: assert seq_group_metadata_list is not None + assert len(seq_group_metadata_list) > 0 if seq_group_metadata_list[0].is_prompt: # NOTE(woosuk): To reduce the compilation time, we only compile the # prefill inputs with batch size 1. Because the scheduler is not @@ -492,8 +517,8 @@ def forward( logits = self.model.compute_logits(hidden_states, sampling_metadata) logits = logits / t.unsqueeze(dim=1) - # FIXME(woosuk): Disabled top-p sampling since it's too slow. 
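As an aside on the argmax-mimicking trick noted above: dividing the logits by a temperature this close to zero pushes essentially all of the softmax mass onto the highest-scoring token, so the multinomial draw that follows behaves like greedy decoding. A minimal standalone check of that behavior in plain PyTorch, with made-up logits and independent of the TPU runner:

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
t = torch.tensor([1e-5])  # the near-zero "temperature" used to mimic argmax
probs = torch.softmax(logits / t.unsqueeze(dim=1), dim=-1, dtype=torch.float32)
# Practically all probability mass lands on the argmax token, so the sample
# is deterministic for any reasonable spread of logits.
next_token = torch.multinomial(probs, num_samples=1).squeeze(dim=1)
assert next_token.item() == torch.argmax(logits, dim=-1).item()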
- # logits = _apply_top_p(logits, p.unsqueeze(dim=1)) + if _ENABLE_TOP_P: + logits = _apply_top_p(logits, p.unsqueeze(dim=1)) probs = torch.softmax(logits, dim=-1, dtype=torch.float32) # FIXME(woosuk): best_of > 1 is not supported. next_token_ids = torch.multinomial(probs, num_samples=1).squeeze(dim=1) From c2a8ac75e03aec19dad397a8e64377d37c67239a Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 26 Jun 2024 01:04:08 +0100 Subject: [PATCH 029/392] [CI/Build] Add E2E tests for MLPSpeculator (#5791) Signed-off-by: Thomas Parnell --- tests/spec_decode/e2e/test_mlp_correctness.py | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py new file mode 100644 index 000000000000..9a9f2acbb8f3 --- /dev/null +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -0,0 +1,216 @@ +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. + +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, MLPSpeculator would not break the +correctess for the target model outputs. +""" + +import pytest + +from .conftest import run_greedy_equality_correctness_test + +# main model +MAIN_MODEL = "ibm-granite/granite-3b-code-instruct" + +# speculative model +SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator" + +# max. number of speculative tokens: this corresponds to +# n_predict in the config.json of the speculator model. +MAX_SPEC_TOKENS = 5 + +# precision +PRECISION = "float16" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Print spec metrics. + "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality with different batch size.""" + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. 
+ "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator, + test_llm_generator, + batch_size: int, + output_len: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": SPEC_MODEL, + "num_speculative_tokens": k, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_different_k(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": SPEC_MODEL, + "speculative_disable_by_batch_size": 4 + }]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) From 82079729ccd0830ce77fcc5fd7ea2be3bf81ccaf Mon Sep 17 00:00:00 2001 From: aws-patlange <90803007+aws-patlange@users.noreply.github.com> Date: Tue, 25 Jun 2024 19:52:10 -0700 Subject: [PATCH 030/392] [Bugfix] Fix assertion in NeuronExecutor (#5841) --- vllm/executor/neuron_executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c5e2fb0f6773..1a3329749fde 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,9 +48,9 @@ def initialize_cache(self, num_gpu_blocks: int, def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - assert (execute_model_req.blocks_to_swap_in == {} - and execute_model_req.blocks_to_swap_out == {} - and execute_model_req.blocks_to_copy == {}), ( + assert (not execute_model_req.blocks_to_swap_in + and not execute_model_req.blocks_to_swap_out + and not execute_model_req.blocks_to_copy), ( "Cache operations are not supported for Neuron backend.") assert execute_model_req.num_lookahead_slots == 0, ( "lookahead not supported for Neuron backend.") From dda4811591fdb90d263bc9b8ac522436369aef13 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 25 Jun 2024 20:30:03 -0700 Subject: [PATCH 031/392] [Core] Refactor Worker and ModelRunner to consolidate control plane communication (#5408) Signed-off-by: Stephanie Wang Signed-off-by: Stephanie Co-authored-by: Stephanie --- tests/worker/test_model_input.py | 152 ++++++++ tests/worker/test_model_runner.py | 57 +-- vllm/attention/backends/abstract.py | 6 +- vllm/attention/backends/blocksparse_attn.py | 4 +- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 4 +- vllm/attention/backends/ipex_attn.py | 4 +- vllm/attention/backends/pallas.py | 4 +- vllm/attention/backends/rocm_flash_attn.py | 4 +- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/executor/distributed_gpu_executor.py | 16 +- vllm/executor/executor_base.py | 4 +- vllm/executor/gpu_executor.py | 2 +- vllm/executor/multiproc_gpu_executor.py | 8 +- vllm/executor/neuron_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 5 +- vllm/sequence.py | 3 +- vllm/spec_decode/mlp_speculator_worker.py | 3 +- vllm/worker/cpu_model_runner.py | 161 +++++---- vllm/worker/cpu_worker.py | 85 ++--- vllm/worker/embedding_model_runner.py | 129 +++---- vllm/worker/model_runner.py | 367 +++++++++++--------- vllm/worker/model_runner_base.py | 157 +++++++++ vllm/worker/neuron_model_runner.py | 64 +++- vllm/worker/neuron_worker.py | 39 +-- vllm/worker/worker.py | 129 ++----- vllm/worker/worker_base.py | 170 ++++++++- vllm/worker/xpu_model_runner.py | 91 ++++- 29 files changed, 1108 insertions(+), 575 deletions(-) create mode 100644 tests/worker/test_model_input.py create mode 100644 vllm/worker/model_runner_base.py diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py new file mode 100644 index 000000000000..ae818ee360f1 --- /dev/null +++ b/tests/worker/test_model_input.py @@ -0,0 +1,152 @@ +import dataclasses +from typing import List, Tuple, Type + +import torch + +from vllm.attention import AttentionMetadata +from vllm.attention.backends.abstract import AttentionBackend +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.pooling_metadata 
import PoolingMetadata +from vllm.worker.embedding_model_runner import ( + ModelInputForGPUWithPoolingMetadata) +from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + + +class MockAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + raise NotImplementedError + + @staticmethod + def get_impl_cls(): + raise NotImplementedError + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return AttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + raise NotImplementedError + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + pass + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + pass + + +def test_model_runner_input(): + sampling_metadata = SamplingMetadata( + ["seq_group"], + "selected_token_indices", + "categorized_sample_indices", + "num_prompts", + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithSamplingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + sampling_metadata=sampling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) + + # Test round trip serialization. + tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. + assert isinstance(received_model_input, + ModelInputForGPUWithSamplingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # For sampling metadata, only selected_token_indices is copied. 
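For intuition about what the assertions in these round-trip tests check: the round trip is only expected to preserve fields that are cheap and safe to send between workers (tensors and simple scalars), while driver-side metadata such as the sampling seq_groups or the pooling metadata is deliberately left out of the broadcast payload; for sampling metadata only the selected_token_indices tensor survives, exactly as the assertions verify. A much-simplified toy version of that pattern (illustrative only, not the actual vLLM ModelInput classes, even though the method names mirror them):

import dataclasses
from typing import Any, Dict, Optional

import torch


@dataclasses.dataclass
class ToyModelInput:
    input_tokens: Optional[torch.Tensor] = None
    input_positions: Optional[torch.Tensor] = None
    # Driver-local metadata: intentionally not broadcast.
    pooling_metadata: Optional[Any] = None

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        # Only the tensor fields travel over the control plane.
        return {
            "input_tokens": self.input_tokens,
            "input_positions": self.input_positions,
        }

    @classmethod
    def from_broadcasted_tensor_dict(cls, d: Dict[str, Any]) -> "ToyModelInput":
        return cls(**d)


src = ToyModelInput(torch.ones(4), torch.arange(4), pooling_metadata=object())
dst = ToyModelInput.from_broadcasted_tensor_dict(
    src.as_broadcastable_tensor_dict())
assert torch.equal(dst.input_tokens, src.input_tokens)
assert dst.pooling_metadata is None  # dropped from the broadcast payload

Keeping the payload to tensors and simple values is what makes it cheap to ship to the non-driver workers as a single tensor dict.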
+ assert (received_model_input.sampling_metadata.selected_token_indices == + sampling_metadata.selected_token_indices) + assert received_model_input.sampling_metadata.seq_groups is None + + +def test_embedding_model_runner_input(): + pooling_metadata = PoolingMetadata( + seq_groups=[[0]], + seq_data={}, + prompt_lens=[1], + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithPoolingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + pooling_metadata=pooling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata) + + # Test round trip serialization. + tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. + assert isinstance(received_model_input, + ModelInputForGPUWithPoolingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # Pooling metadata is not broadcast. 
+ assert received_model_input.pooling_metadata is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index dd0d3bf5082d..e1775790c0a0 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -61,12 +61,13 @@ def test_prepare_prompt(batch_size): expected_selected_token_indices.append(selected_token_start_idx + seq_len - 1) selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input(seq_group_metadata_list) + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata return_seq_lens = model_input.seq_lens - slot_mapping = model_input.slot_mapping + slot_mapping = attn_metadata.slot_mapping assert return_seq_lens == seq_lens assert len(slot_mapping) == len(input_tokens) @@ -174,10 +175,11 @@ def test_prepare_decode_cuda_graph(batch_size): assert seq_group_metadata.token_chunk_size == 1 seq_group_metadata_list.append(seq_group_metadata) - model_input = model_runner._prepare_model_input(seq_group_metadata_list) + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, model_input.input_positions, - model_input.attn_metadata, model_input.slot_mapping) + model_input.attn_metadata, model_input.attn_metadata.slot_mapping) assert len(slot_mapping) == len(input_tokens) expected_bs = _get_graph_batch_size(len(seq_group_metadata_list)) @@ -259,32 +261,29 @@ def test_empty_seq_group(): enforce_eager=False, ) seq_group_metadata_list: List[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input(seq_group_metadata_list) - input_tokens, input_positions, attn_metadata, slot_mapping = ( + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) + input_tokens, input_positions, attn_metadata = ( model_input.input_tokens, model_input.input_positions, model_input.attn_metadata, - model_input.slot_mapping, ) - assert len(input_tokens) == 0 - assert len(input_positions) == 0 + assert input_tokens is None + assert input_positions is None assert attn_metadata is None - assert len(slot_mapping) == 0 - - model_input = model_runner._prepare_model_input(seq_group_metadata_list) - (input_tokens, input_positions, attn_metadata, slot_mapping, - return_seq_lens) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.attn_metadata, - model_input.slot_mapping, - model_input.seq_lens, - ) - assert len(input_tokens) == 0 - assert len(input_positions) == 0 + + model_input = model_runner._prepare_model_input_tensors( + seq_group_metadata_list) + (input_tokens, input_positions, attn_metadata, return_seq_lens) = ( + model_input.input_tokens, + model_input.input_positions, + model_input.attn_metadata, + model_input.seq_lens, + ) + assert input_tokens is None + assert input_positions is None assert attn_metadata is None - assert len(slot_mapping) == 0 - assert len(return_seq_lens) == 0 + assert return_seq_lens is None @pytest.fixture @@ -353,8 +352,12 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): seq_group_metadata_list.append(seq_group_metadata) decode_metadata_list.append(seq_group_metadata) - (input_tokens, input_positions, attn_metadata, _, _, _, - _) = model_runner.prepare_input_tensors(seq_group_metadata_list) + model_input = 
model_runner.prepare_model_input(seq_group_metadata_list) + (input_tokens, input_positions, attn_metadata) = ( + model_input.input_tokens, + model_input.input_positions, + model_input.attn_metadata, + ) prefill_meta_actual = attn_metadata.prefill_metadata decode_meta_actual = attn_metadata.decode_metadata @@ -367,7 +370,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): # Verify attn metadata is consistent. We don't need to test individual # values here because they are tested above. - attn_metadata = model_runner._prepare_model_input( + attn_metadata = model_runner._prepare_model_input_tensors( seq_group_metadata_list).attn_metadata for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 6396103bf5ef..40768532f59c 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -21,9 +21,13 @@ def get_impl_cls() -> Type["AttentionImpl"]: @staticmethod @abstractmethod - def make_metadata(*args, **kwargs) -> "AttentionMetadata": + def get_metadata_cls() -> Type["AttentionMetadata"]: raise NotImplementedError + @classmethod + def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": + return cls.get_metadata_cls()(*args, **kwargs) + @staticmethod @abstractmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index dce2b83615b7..7b4578fcd8b9 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -90,8 +90,8 @@ def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: return BlocksparseFlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "BlocksparseFlashAttentionMetadata": - return BlocksparseFlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return BlocksparseFlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 1c48e2a0bb33..8cb5c3101a80 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -25,8 +25,8 @@ def get_impl_cls() -> Type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "FlashAttentionMetadata": - return FlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7b7959d257fa..535d30b55bc9 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -22,8 +22,8 @@ def get_impl_cls() -> Type["FlashInferImpl"]: return FlashInferImpl @staticmethod - def make_metadata(*args, **kwargs) -> "FlashInferMetadata": - return FlashInferMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashInferMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index f09b24f2a030..5114bfa6e158 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -25,8 +25,8 @@ def get_impl_cls() -> Type["IpexAttnBackendImpl"]: return IpexAttnBackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "IpexAttnMetadata": - return IpexAttnMetadata(*args, 
**kwargs) + def get_metadata_cls() -> Type["IpexAttnMetadata"]: + return IpexAttnMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b203c5ec54c9..62b4a144fc44 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -16,8 +16,8 @@ def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "PallasMetadata": - return PallasMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["PallasMetadata"]: + return PallasMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 9294068c64d1..81fabdbdfc83 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -25,8 +25,8 @@ def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: return ROCmFlashAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "ROCmFlashAttentionMetadata": - return ROCmFlashAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return ROCmFlashAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c01e0a0a3a19..63f8466da931 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -31,8 +31,8 @@ def get_impl_cls() -> Type["TorchSDPABackendImpl"]: return TorchSDPABackendImpl @staticmethod - def make_metadata(*args, **kwargs) -> "TorchSDPAMetadata": - return TorchSDPAMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return TorchSDPAMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 0fecd9f6e610..ff449c3ff74f 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -28,8 +28,8 @@ def get_impl_cls() -> Type["XFormersImpl"]: return XFormersImpl @staticmethod - def make_metadata(*args, **kwargs) -> "XFormersMetadata": - return XFormersMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return XFormersMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index 235b5bc47021..d8693e636ac8 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -64,8 +64,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks=num_cpu_blocks) def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[SamplerOutput]]: if self.parallel_worker_tasks is None: self.parallel_worker_tasks = self._run_workers( "start_worker_execution_loop", @@ -79,7 +79,7 @@ def stop_remote_worker_execution_loop(self) -> None: if self.parallel_worker_tasks is None: return - self._driver_execute_model() + self._driver_execute_model(execute_model_req=None) parallel_worker_tasks = self.parallel_worker_tasks self.parallel_worker_tasks = None # Ensure that workers exit model loop cleanly @@ -123,13 +123,13 @@ def save_sharded_state( @abstractmethod def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: 
Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. + Passing None will cause the driver to stop the model execution loop + running in each of the remote workers. In this case, this method + returns None. Otherwise, this method returns the model output. """ raise NotImplementedError diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 7c2520b5a64f..d7c19622e270 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -69,8 +69,8 @@ def initialize_cache(self, num_gpu_blocks: int, @abstractmethod def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[SamplerOutput]]: """Executes at least one model step on the given sequences.""" raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 0a654200ed79..5522b5322e66 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -87,7 +87,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: def execute_model( self, execute_model_req: ExecuteModelRequest - ) -> List[Union[SamplerOutput, PoolerOutput]]: + ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: output = self.driver_worker.execute_model(execute_model_req) return output diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index a5b1d27f2759..6aebb4702889 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -78,16 +78,14 @@ def shutdown(self): worker_monitor.close() def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution loop running in each of the remote workers. """ - return self.driver_worker.execute_model( - execute_model_req=execute_model_req) + return self.driver_worker.execute_model(execute_model_req) def _run_workers( self, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 1a3329749fde..53107dada996 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -55,8 +55,7 @@ def execute_model( assert execute_model_req.num_lookahead_slots == 0, ( "lookahead not supported for Neuron backend.") - output = self.driver_worker.execute_model( - execute_model_req.seq_group_metadata_list) + output = self.driver_worker.execute_model(execute_model_req) return output def add_lora(self, lora_request: LoRARequest) -> bool: diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index fc83c552888a..faa500c2d79c 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -190,9 +190,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers) def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. 
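Editor's note: the loop protocol introduced by these executor changes is that the driver keeps remote workers spinning in start_worker_execution_loop, and stop_remote_worker_execution_loop ends it by calling _driver_execute_model with execute_model_req=None, which makes the workers' execute_model return None and exit. The sketch below imitates that handshake with a plain queue.Queue standing in for vLLM's broadcast channel; the thread, the queue, and all names here are illustrative stand-ins, not the actual transport.

    import queue
    import threading

    def worker_execution_loop(channel: "queue.Queue", results: list) -> None:
        # Mirrors start_worker_execution_loop: keep executing until the
        # driver sends an empty request (None), then exit cleanly.
        while True:
            request = channel.get()
            if request is None:
                return
            results.append(f"executed {request}")

    channel: "queue.Queue" = queue.Queue()
    results: list = []
    t = threading.Thread(target=worker_execution_loop, args=(channel, results))
    t.start()

    for step in range(3):      # driver issues a few execute_model calls
        channel.put(f"batch-{step}")
    channel.put(None)          # analogous to stop_remote_worker_execution_loop
    t.join()
    print(results)             # ['executed batch-0', 'executed batch-1', 'executed batch-2']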
Passing None will cause the driver to stop the model execution diff --git a/vllm/sequence.py b/vllm/sequence.py index 287e1b9df616..0925d15461fd 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -887,7 +887,8 @@ def prune(self, @dataclass class ExecuteModelRequest: - """The model execution request.""" + """The model execution request, containing CPU metadata only. The LLM + engine should create an instance of this class for each request batch.""" # The sequence group metadata list. seq_group_metadata_list: List[SequenceGroupMetadata] # Blocks to swap in. List of CPU -> GPU block number. diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index 0926e13bedab..6c1c8da57d18 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -7,7 +7,6 @@ SequenceGroupMetadata) from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.worker.model_runner import ModelInput class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): @@ -56,7 +55,7 @@ def _prepare_input_tensors( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, List[int], List[int]]: if not seq_group_metadata_list: - return ModelInput.empty(self.device) + return torch.empty(0, device=self.device), [], [] input_tokens: List[int] = [] seq_lens: List[int] = [] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index d539f56937be..e3464c0d3900 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,5 +1,6 @@ from collections import defaultdict -from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -8,20 +9,64 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) _PAD_SLOT_ID = -1 -class CPUModelRunner: +@dataclass(frozen=True) +class CPUModelInput(ModelRunnerInputBase): + """ + Used by the CPUModelRunner. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + attn_metadata: Optional["AttentionMetadata"] = None + sampling_metadata: Optional["SamplingMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["CPUModelInput"], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None + ) -> "CPUModelInput": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class CPUModelRunner(ModelRunnerBase[CPUModelInput]): def __init__( self, @@ -270,86 +315,70 @@ def _prepare_decode( attn_metadata, ) - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> CPUModelInput: + return CPUModelInput.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Optional[Dict[str, torch.Tensor]]]: + ) -> CPUModelInput: multi_modal_kwargs = None - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode(seq_group_metadata_list) - seq_lens = [] - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since CPU worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens, - self.device, - pin_memory=False) - # Broadcast the metadata. - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - } - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
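Editor's note: CPUModelInput above follows the serialization pattern shared by all of the new model-input dataclasses: the driver flattens the input into a dict of broadcastable values with as_broadcastable_tensor_dict, and the receiving side rebuilds the frozen dataclass with from_broadcasted_tensor_dict. A minimal, self-contained sketch of that round trip is shown below; it uses Python lists in place of tensors and an ordinary dict in place of the broadcast, so the class and field names are illustrative only.

    from dataclasses import dataclass
    from typing import Any, Dict, List, Optional

    @dataclass(frozen=True)
    class ToyModelInput:
        # Stand-in for a ModelRunnerInputBase subclass; real inputs hold tensors.
        input_tokens: Optional[List[int]] = None
        input_positions: Optional[List[int]] = None

        def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
            # Flatten to a plain dict so it can be sent to other ranks.
            return {
                "input_tokens": self.input_tokens,
                "input_positions": self.input_positions,
            }

        @classmethod
        def from_broadcasted_tensor_dict(
                cls, tensor_dict: Dict[str, Any]) -> "ToyModelInput":
            # Rebuild the frozen dataclass on the receiving rank.
            return cls(**tensor_dict)

    original = ToyModelInput(input_tokens=[1, 2, 3], input_positions=[0, 1, 2])
    wire = original.as_broadcastable_tensor_dict()   # what would be broadcast
    received = ToyModelInput.from_broadcasted_tensor_dict(wire)
    assert received == original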
+ if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_kwargs + ) = self._prepare_prompt(seq_group_metadata_list) else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - seq_data=None, - seq_lens=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - generators=None, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, multi_modal_kwargs) + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode(seq_group_metadata_list) + seq_lens = [] + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # query_lens is not needed if chunked prefill is not + # supported. Since CPU worker doesn't support chunked prefill + # just use seq_lens instead. + seq_lens, + self.device, + pin_memory=False) + return CPUModelInput( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + sampling_metadata=sampling_metadata, + ) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: CPUModelInput, kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, - multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - model_executable = self.model execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } - if self.vision_language_config and multi_modal_input is not None: - execute_model_kwargs.update(multi_modal_input) + if (self.vision_language_config + and model_input.multi_modal_kwargs is not None): + execute_model_kwargs.update(model_input.multi_modal_kwargs) hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -358,6 +387,6 @@ def execute_model( # Sample the next token. 
output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 914df0c7df0e..30ee262c7a8b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,5 +1,5 @@ """A CPU worker class.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import torch import torch.distributed @@ -8,15 +8,15 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, + LoraNotSupportedWorkerBase, WorkerInput) logger = init_logger(__name__) @@ -110,7 +110,7 @@ def get_cache_block_size( return dtype_size * total -class CPUWorker(LoraNotSupportedWorkerBase): +class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. The worker is @@ -154,7 +154,7 @@ def __init__( # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = CPUModelRunner( + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, scheduler_config, @@ -255,54 +255,37 @@ def _init_cache_engine(self) -> None: for layer_cache in self.cpu_cache: layer_cache.fill_(0) - def cache_copy( + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return self.cpu_cache + + def execute_worker( self, - blocks_to_copy: torch.Tensor, + worker_input: WorkerInput, ) -> None: - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine.copy(worker_input.blocks_to_copy) @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: - - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - if self.is_driver_worker: - assert seq_group_metadata_list is not None - num_seq_groups: int = len(seq_group_metadata_list) - assert execute_model_req is not None - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device="cpu", - dtype=torch.int64).view(-1, 2) - assert len(execute_model_req.blocks_to_swap_in) == 0 - assert len(execute_model_req.blocks_to_swap_out) == 0 - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_copy": execute_model_req.blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - 
num_seq_groups = data["num_seq_groups"] - blocks_to_copy = data["blocks_to_copy"] - - self.cache_copy(blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.cpu_cache) - - # CPU worker only supports single-step execution. - return [output] + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + assert execute_model_req is not None + num_seq_groups: int = len(execute_model_req.seq_group_metadata_list) + blocks_to_copy = execute_model_req.blocks_to_copy + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device="cpu", + dtype=torch.int64).view(-1, 2) + assert len(execute_model_req.blocks_to_swap_in) == 0 + assert len(execute_model_req.blocks_to_swap_out) == 0 + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_copy=blocks_to_copy, + ) def init_distributed_environment(self) -> None: """Initialize the distributed environment.""" diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 465130d10e2f..3c8dfa2c6d8d 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -1,24 +1,32 @@ -from typing import Dict, List, Optional, Set, Tuple +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Type import torch -from vllm.attention import AttentionMetadata from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata -from vllm.worker.model_runner import ModelRunner +from vllm.worker.model_runner import GPUModelRunnerBase, ModelInputForGPU logger = init_logger(__name__) -class EmbeddingModelRunner(ModelRunner): +@dataclasses.dataclass(frozen=True) +class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): + """ + Used by the EmbeddingModelRunner. + """ + pooling_metadata: Optional["PoolingMetadata"] = None + + +class EmbeddingModelRunner( + GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): + _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( + ModelInputForGPUWithPoolingMetadata) def __init__( self, @@ -47,21 +55,22 @@ def __init__( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + model_input: ModelInputForGPUWithPoolingMetadata, kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: - (input_tokens, input_positions, attn_metadata, pooling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) # Currently cuda graph is only supported by the decode phase. 
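Editor's note: ModelInputForGPUWithPoolingMetadata adds a single pooling field on top of the shared GPU input, and the runner fills it in with dataclasses.replace rather than mutating the frozen instance. A compact illustration of that idiom, using made-up field names rather than the real ones, is:

    import dataclasses
    from typing import List, Optional

    @dataclasses.dataclass(frozen=True)
    class BaseInput:
        input_tokens: Optional[List[int]] = None

    @dataclasses.dataclass(frozen=True)
    class InputWithPooling(BaseInput):
        # Extra metadata used by only one runner; defaults keep the base usable.
        pooling_groups: Optional[List[int]] = None

    base = InputWithPooling(input_tokens=[1, 2, 3])
    # Frozen dataclasses cannot be mutated, so extra metadata is attached by
    # creating a modified copy, the same dataclasses.replace(...) idiom as in
    # the diff above.
    full = dataclasses.replace(base, pooling_groups=[0])
    assert full.input_tokens == [1, 2, 3] and full.pooling_groups == [0]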
- prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] model_executable = self.graph_runners[graph_batch_size] else: model_executable = self.model @@ -70,13 +79,14 @@ def execute_model( kv_caches = [None] * num_layers execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + execute_model_kwargs.update({"image_input": multi_modal_kwargs}) hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. @@ -84,66 +94,31 @@ def execute_model( return None return self.model.pooler(hidden_states=hidden_states, - pooling_metadata=pooling_metadata) + pooling_metadata=model_input.pooling_metadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, + Any]) -> ModelInputForGPUWithPoolingMetadata: + return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) - def prepare_input_tensors( + def prepare_model_input( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - # Prepare input tensors. 
- ( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - _, - lora_mapping, - lora_requests, - multi_modal_kwargs, - slot_mapping, - num_prefill_tokens, - num_decode_tokens, - num_prefills, - ) = self._prepare_model_input(seq_group_metadata_list) - # Prepare PoolingMetadata - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - seq_lens) - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - } - if attn_metadata: - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") - if metadata_dict: - attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - attn_metadata = None - pooling_metadata = PoolingMetadata(seq_groups=None, - seq_data=None, - prompt_lens=None) - - return (input_tokens, input_positions, attn_metadata, pooling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs) + ) -> ModelInputForGPUWithPoolingMetadata: + assert seq_group_metadata_list is not None + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list) + # Prepare PoolingMetadata. + assert model_input.seq_lens is not None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + model_input.seq_lens) + + return dataclasses.replace(model_input, + pooling_metadata=pooling_metadata) def _prepare_pooling( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a321eafce1a2..9fdb2ea5dd4e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,8 +1,10 @@ +import dataclasses import gc import time import warnings from collections import defaultdict -from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import numpy as np import torch @@ -12,7 +14,6 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import graph_capture from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -26,6 +27,15 @@ from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, is_pin_memory_available, make_tensor_with_pad) +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) @@ -39,40 +49,90 @@ ] _NUM_WARMUP_ITERS = 2 +TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") -class ModelInput(NamedTuple): - 
input_tokens: torch.Tensor - input_positions: torch.Tensor - attn_metadata: Optional[AttentionMetadata] - seq_lens: List[int] - query_lens: List[int] - lora_mapping: Optional[LoRAMapping] - lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, torch.Tensor] - slot_mapping: torch.Tensor - num_prefill_tokens: int - num_decode_tokens: int - num_prefills: int - @classmethod - def empty(cls, device): - return ModelInput( - input_tokens=torch.empty(0, device=device), - input_positions=torch.empty(0, device=device), - attn_metadata=None, - seq_lens=[], - query_lens=[], - lora_mapping=None, - lora_requests=set(), - multi_modal_kwargs={}, - slot_mapping=torch.empty(0, device=device), - num_prefill_tokens=0, - num_decode_tokens=0, - num_prefills=0, - ) +@dataclasses.dataclass(frozen=True) +class ModelInputForGPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. + """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForGPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForGPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None + # Used for speculative decoding. We do not broadcast it because it is only + # used by the driver worker. 
+ is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict -class ModelRunner: + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForGPUWithSamplingMetadata": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): + """ + Helper class for shared methods between GPU model runners. + """ + _model_input_cls: Type[TModelInputForGPU] def __init__( self, @@ -241,11 +301,13 @@ def get_max_block_per_batch(self) -> int: block_size = self.block_size return (self.max_seq_len_to_capture + block_size - 1) // block_size - def _prepare_model_input( + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> ModelInput: - """Prepare the model input based on a given sequence group. + ) -> TModelInputForGPU: + """Helper method to prepare the model input based on a given sequence + group. Prepares metadata needed for the base model forward pass but not + metadata for possible additional steps, e.g., sampling. The API assumes seq_group_metadata_list is sorted by prefill -> decode. @@ -296,7 +358,7 @@ def _prepare_model_input( paged_kv_last_page_len: List[int] = [] if len(seq_group_metadata_list) == 0: - return ModelInput.empty(self.device) + return self._model_input_cls() if self.sliding_window is not None: sliding_window_blocks = (self.sliding_window + self.block_size - @@ -646,7 +708,7 @@ def _prepare_model_input( for k, v in multi_modal_kwargs_list.items() } - return ModelInput( + return self._model_input_cls( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, attn_metadata=attn_metadata, @@ -655,132 +717,8 @@ def _prepare_model_input( lora_mapping=lora_mapping, lora_requests=lora_requests, multi_modal_kwargs=multi_modal_kwargs, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - num_prefills=num_prefills, - ) - - def prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - # Prepare input tensors. 
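Editor's note: GPUModelRunnerBase is generic over its input type; the TModelInputForGPU TypeVar ties the prepare and execute signatures to one concrete input dataclass, and _model_input_cls lets shared helpers such as _prepare_model_input_tensors construct the right class. The toy hierarchy below shows the shape of that pattern; the names are invented and it is not the actual vLLM class layout.

    from dataclasses import dataclass
    from typing import Generic, List, Type, TypeVar

    @dataclass(frozen=True)
    class ToyInput:
        tokens: List[int]

    TInput = TypeVar("TInput", bound=ToyInput)

    class ToyRunnerBase(Generic[TInput]):
        # Subclasses point this at their own input dataclass so shared helper
        # methods can build instances without knowing the concrete type.
        _model_input_cls: Type[TInput]

        def _prepare_input_tensors(self, tokens: List[int]) -> TInput:
            return self._model_input_cls(tokens=tokens)

    @dataclass(frozen=True)
    class ToyInputWithSampling(ToyInput):
        temperature: float = 1.0

    class ToySamplingRunner(ToyRunnerBase[ToyInputWithSampling]):
        _model_input_cls = ToyInputWithSampling

    runner = ToySamplingRunner()
    prepared = runner._prepare_input_tensors([1, 2, 3])
    assert isinstance(prepared, ToyInputWithSampling)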
- ( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - query_lens, - lora_mapping, - lora_requests, - multi_modal_kwargs, - slot_mapping, - num_prefill_tokens, - num_decode_tokens, - num_prefills, - ) = self._prepare_model_input(seq_group_metadata_list) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - } - if attn_metadata: - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") - if metadata_dict: - attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - attn_metadata = None - sampling_metadata = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping, - multi_modal_kwargs) - - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[torch.Tensor], - ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_kwargs - ) = self.prepare_input_tensors(seq_group_metadata_list) - - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) - - # Currently cuda graph is only supported by the decode phase. - prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata - if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] - model_executable = self.graph_runners[graph_batch_size] - else: - model_executable = self.model - - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - **multi_modal_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return None - - # Sample the next token. - output: SamplerOutput = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, ) - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert seq_group_metadata_list is not None - if seq_group_metadata_list[0].is_prompt: - hidden_states = hidden_states.index_select( - 0, sampling_metadata.selected_token_indices) - output.hidden_states = hidden_states - - return output - @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
@@ -853,7 +791,8 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + model_input = self.prepare_model_input(seqs) + self.execute_model(model_input, kv_caches) torch.cuda.synchronize() return @@ -986,6 +925,110 @@ def vocab_size(self) -> int: return self.model_config.get_vocab_size() +class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + ModelInputForGPUWithSamplingMetadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForGPUWithSamplingMetadata: + return ( + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + + The result tensors and data structure also batches input in prefill + -> decode order. For example, + + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + + If cuda graph is required, this API automatically pads inputs. + """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + self.pin_memory) + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + ) -> SamplerOutput: + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + + # Currently cuda graph is only supported by the decode phase. + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = self.graph_runners[graph_batch_size] + else: + model_executable = self.model + + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + **multi_modal_kwargs, + ) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) + + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return None + + # Sample the next token. 
+ output: SamplerOutput = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + if model_input.is_prompt: + assert model_input.sampling_metadata is not None + hidden_states = hidden_states.index_select( + 0, model_input.sampling_metadata.selected_token_indices) + output.hidden_states = hidden_states + + return output + + class CUDAGraphRunner: def __init__(self, model: nn.Module): diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py new file mode 100644 index 000000000000..9b1706035a33 --- /dev/null +++ b/vllm/worker/model_runner_base.py @@ -0,0 +1,157 @@ +import dataclasses +from abc import ABC, abstractmethod +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + +if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend + from vllm.model_executor import SamplingMetadata + +T = TypeVar('T', bound="ModelRunnerInputBase") + + +def _add_attn_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + attn_metadata: Optional["AttentionMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + AttentionMetadata fields. + """ + if attn_metadata is not None: + tensor_dict.update(attn_metadata.asdict_zerocopy()) + + +def _init_attn_metadata_from_tensor_dict( + attn_backend: "AttentionBackend", + tensor_dict: Dict[str, Any], +) -> Dict[str, Any]: + """ + Helper method to initialize AttentionMetadata based on an + AttentionBackend and broadcastable AttentionMetadata fields. + """ + # Extract the fields used to create AttentionMetadata. + valid_attn_kwargs = {} + for field in dataclasses.fields(attn_backend.get_metadata_cls()): + val = tensor_dict.pop(field.name, None) + if val is not None: + valid_attn_kwargs[field.name] = val + + attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) + tensor_dict["attn_metadata"] = attn_metadata + return tensor_dict + + +def _init_sampling_metadata_from_tensor_dict( # type: ignore + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Helper method to initialize SamplingMetadata based on broadcastable + SamplingMetadata fields. + """ + from vllm.model_executor import SamplingMetadata + + selected_token_indices = tensor_dict.pop("selected_token_indices", None) + # An empty SamplingMetadata to signal that the worker should skip + # sampling. + if selected_token_indices is not None: + tensor_dict["sampling_metadata"] = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + return tensor_dict + + +def _add_sampling_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + sampling_metadata: Optional["SamplingMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + SamplingMetadata fields. + """ + if sampling_metadata is not None: + tensor_dict["selected_token_indices"] = ( + sampling_metadata.selected_token_indices) + + +@dataclasses.dataclass(frozen=True) +class ModelRunnerInputBase(ABC): + """Local inputs to each worker's model runner. May contain + device-specific data. Different worker backends may have different methods + of converting from the global ExecuteModelRequest produced by the LLM + engine to the worker-local ModelRunnerInputBase objects. 
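Editor's note: the _init_*_from_tensor_dict helpers in the new model_runner_base.py share one trick: pop exactly the fields declared on a target dataclass out of the broadcast dict, construct the object, and leave the remaining keys in place for other consumers. A generic, dependency-free version of that trick might look like the following; ToyMetadata is a placeholder, not a vLLM type.

    import dataclasses
    from typing import Any, Dict, Type, TypeVar

    T = TypeVar("T")

    def init_from_tensor_dict(target_cls: Type[T],
                              tensor_dict: Dict[str, Any]) -> T:
        # Pop only the keys that correspond to fields of target_cls, mirroring
        # _init_attn_metadata_from_tensor_dict in the diff above.
        kwargs = {}
        for f in dataclasses.fields(target_cls):
            value = tensor_dict.pop(f.name, None)
            if value is not None:
                kwargs[f.name] = value
        return target_cls(**kwargs)

    @dataclasses.dataclass
    class ToyMetadata:
        num_prefills: int = 0
        num_decode_tokens: int = 0

    payload = {"num_prefills": 2, "num_decode_tokens": 8, "input_tokens": [1, 2]}
    meta = init_from_tensor_dict(ToyMetadata, payload)
    assert meta.num_prefills == 2
    assert payload == {"input_tokens": [1, 2]}   # consumed keys were popped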
+ + Model runners that support multi-GPU execution should define a + ModelRunnerInputBase subclass, add their required fields, and specify how to + serialize/deserialize a ModelInput for broadcast between workers. + """ + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + """ + Extract broadcastable fields. Override for fields that require some + custom deserialization. + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def from_broadcasted_tensor_dict( + cls: Type[T], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> T: + """ + Pop fields from the given tensor_dict and populate a new instance of + ModelRunnerInputBase. + """ + raise NotImplementedError + + +class ModelRunnerBase(ABC, Generic[T]): + """ + Model runner interface that abstracts a particular hardware and/or type of + model. Model execution may communicate data with model runners in other + processes, but it should not include control plane metadata communication. + + Each ModelRunnerBase subclass should define a corresponding + ModelRunnerInputBase subclass. + """ + + @abstractmethod + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> T: + """ + Make an instance of a ModelRunnerInputBase from the broadcasted tensor + dict. + """ + raise NotImplementedError + + @abstractmethod + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> T: + """ + Prepare the inputs to ModelRunnerBase.execute_model from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError + + @torch.inference_mode() + def execute_model( + self, + model_input: T, + kv_caches: Optional[List[torch.Tensor]], + ) -> Optional[SamplerOutput]: + """ + Execute the model on the given input. + """ + raise NotImplementedError diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a336be04e124..fec2c97e7388 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -10,11 +11,39 @@ from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) -class NeuronModelRunner: +@dataclass(frozen=True) +class ModelInputForNeuron(ModelRunnerInputBase): + """ + Used by the NeuronModelRunner. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + input_block_ids: Optional[torch.Tensor] = None + sampling_metadata: Optional["SamplingMetadata"] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + raise NotImplementedError("ModelInputForNeuron cannot be broadcast.") + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForNeuron": + assert attn_backend is None + return cls.from_broadcasted_tensor_dict(tensor_dict) + + +class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): def __init__( self, @@ -139,10 +168,14 @@ def _prepare_decode( return input_tokens, input_positions, input_block_ids - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: + return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) + + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]: + ) -> ModelInputForNeuron: # NOTE: We assume that all sequences in the group are all prompts or # all decodes. is_prompt = seq_group_metadata_list[0].is_prompt @@ -164,30 +197,31 @@ def prepare_input_tensors( self.device, self.pin_memory) - return (input_tokens, input_positions, input_block_ids, - sampling_metadata) + return ModelInputForNeuron(input_tokens=input_tokens, + input_positions=input_positions, + input_block_ids=input_block_ids, + sampling_metadata=sampling_metadata) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: ModelInputForNeuron, + kv_caches: Optional[List[torch.Tensor]] = None, ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, input_block_ids, sampling_metadata - ) = self.prepare_input_tensors(seq_group_metadata_list) - hidden_states = self.model( - input_ids=input_tokens, - positions=input_positions, - input_block_ids=input_block_ids, + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + input_block_ids=model_input.input_block_ids, ) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Sample the next token. 
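Editor's note: the Neuron runner here is the smallest example of the contract every runner now follows: prepare_model_input turns scheduler metadata into a single device-ready input object, and execute_model consumes only that object plus the caches. The sketch below captures just that two-phase shape with toy types; it is a schematic, not the NeuronModelRunner API.

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass(frozen=True)
    class PreparedInput:
        input_tokens: List[int]
        input_positions: List[int]

    class ToyRunner:
        def prepare_model_input(self, prompts: List[List[int]]) -> PreparedInput:
            # Phase 1: flatten request metadata into one immutable input object.
            tokens = [t for prompt in prompts for t in prompt]
            positions = [i for prompt in prompts for i in range(len(prompt))]
            return PreparedInput(input_tokens=tokens, input_positions=positions)

        def execute_model(self, model_input: PreparedInput,
                          kv_caches: Optional[List[object]] = None) -> int:
            # Phase 2: run purely from the prepared input; no scheduler state here.
            return len(model_input.input_tokens)

    runner = ToyRunner()
    prepared = runner.prepare_model_input([[1, 2, 3], [4, 5]])
    assert runner.execute_model(prepared) == 5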
output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d0e6aaed180e..307c107ddef7 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,5 +1,5 @@ """A Neuron worker class.""" -from typing import List, Tuple +from typing import List, Optional, Tuple import torch import torch.distributed @@ -7,12 +7,13 @@ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, + LoraNotSupportedWorkerBase, WorkerInput) -class NeuronWorker(LoraNotSupportedWorkerBase): +class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): """A worker class that executes the model on a group of neuron cores. """ @@ -34,8 +35,9 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = NeuronModelRunner(model_config, parallel_config, - scheduler_config, device_config) + self.model_runner: NeuronModelRunner = NeuronModelRunner( + model_config, parallel_config, scheduler_config, device_config) + self.is_driver_worker = True def init_device(self) -> None: # Set random seed. @@ -73,22 +75,19 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> List[SamplerOutput]: - num_seq_groups = len(seq_group_metadata_list) + @property + def do_metadata_broadcast(self) -> bool: + return False - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return None - output = self.model_runner.execute_model(seq_group_metadata_list) - - # Neuron worker only supports single-step output. Wrap the output in a - # list to conform to interface. - return [output] + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + return WorkerInput(num_seq_groups=len( + execute_model_req.seq_group_metadata_list), ) def get_cache_block_size_bytes(self) -> int: """Determine the size in bytes of a cache block. 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c60764ef1bed..e1944a4f1d63 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -9,21 +9,20 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner -from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput -class Worker(WorkerBase): +class Worker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single GPU. The worker is responsible for @@ -78,9 +77,10 @@ def __init__( or (speculative_config.draft_model_config.hf_config.model_type != "mlp_speculator") else {"return_hidden_states": True} - ModelRunnerClass = (EmbeddingModelRunner if - self.model_config.embedding_mode else ModelRunner) - self.model_runner = ModelRunnerClass( + ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner + if self.model_config.embedding_mode: + ModelRunnerClass = EmbeddingModelRunner + self.model_runner: GPUModelRunnerBase = ModelRunnerClass( model_config, parallel_config, scheduler_config, @@ -225,40 +225,18 @@ def _warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) - def cache_swap( - self, - blocks_to_swap_in: torch.Tensor, - blocks_to_swap_out: torch.Tensor, - blocks_to_copy: torch.Tensor, - ) -> None: - # Issue cache operations. - if blocks_to_swap_in.numel() > 0: - self.cache_engine.swap_in(blocks_to_swap_in) - if blocks_to_swap_out.numel() > 0: - self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[torch.Tensor]]: + return self.gpu_cache @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[Union[SamplerOutput, PoolerOutput]]: - if not self.is_driver_worker: - self._execute_model_non_driver() - return [] - - if execute_model_req is None: - # This signals that there's no more requests to process for now. - # All workers are running infinite loop with broadcast_tensor_dict, - # and it stops the loop when the driver broadcasts an empty input. - # Send an empty input to notify all other workers to stop their - # execution loop. 
- broadcast_tensor_dict({}, src=0) - return [] - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - num_seq_groups = len(seq_group_metadata_list) + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + num_seq_groups = len(execute_model_req.seq_group_metadata_list) # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. # they contain parameters to launch cudamemcpyasync. blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, @@ -273,59 +251,26 @@ def execute_model( blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, device=self.device, dtype=torch.int64).view(-1, 2) - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - output = self.model_runner.execute_model(seq_group_metadata_list, - self.gpu_cache) - - # Worker only supports single-step execution. Wrap the output in a list - # to conform to interface. - return [output] + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ - while self._execute_model_non_driver(): - pass - - def _execute_model_non_driver(self) -> bool: - """Execute model in parallel worker. - - Returns True iff there are remaining sequences to process. - """ - assert not self.is_driver_worker - data = broadcast_tensor_dict(src=0) - if not data: - return False - - num_seq_groups = data.get("num_seq_groups", 0) - blocks_to_swap_in = data.get("blocks_to_swap_in") - blocks_to_swap_out = data.get("blocks_to_swap_out") - blocks_to_copy = data.get("blocks_to_copy") - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return False - - self.model_runner.execute_model(None, self.gpu_cache) - return True + def execute_worker(self, worker_input: WorkerInput) -> None: + # Issue cache operations. 
+ if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine.swap_in(worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine.swap_out(worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine.copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 99482aa93bc5..1df60eb1f38c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,20 +1,26 @@ +import dataclasses import importlib import os from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +import torch + +from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (enable_trace_function_call_for_thread, is_hip, update_environment_variables) +from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase logger = init_logger(__name__) class WorkerBase(ABC): """Worker interface that allows vLLM to cleanly separate implementations for - different hardware. + different hardware. Also abstracts control plane communication, e.g., to + communicate request metadata to other workers. """ @abstractmethod @@ -46,13 +52,23 @@ def initialize_cache(self, num_gpu_blocks: int, """ raise NotImplementedError + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. + + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. + """ + while True: + output = self.execute_model(execute_model_req=None) + if output is None: + return None + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" + ) -> Optional[List[SamplerOutput]]: raise NotImplementedError @abstractmethod @@ -98,6 +114,150 @@ def list_loras(self) -> Set[int]: raise ValueError(f"{type(self)} does not support LoRA") +@dataclasses.dataclass(frozen=True) +class WorkerInput: + """Local inputs to each worker. May contain device-specific data. These + fields should be broadcastable to other workers. + """ + + num_seq_groups: Optional[int] = None + blocks_to_swap_in: Optional[torch.Tensor] = None + blocks_to_swap_out: Optional[torch.Tensor] = None + blocks_to_copy: Optional[torch.Tensor] = None + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["WorkerInput"], + tensor_dict: Dict[str, Any], + ) -> "WorkerInput": + """ + Pop fields from the given tensor_dict and populate a new instance of + WorkerInput. + """ + return cls( + num_seq_groups=tensor_dict.pop("num_seq_groups"), + blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), + blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), + blocks_to_copy=tensor_dict.pop("blocks_to_copy"), + ) + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + """ + Extract broadcastable fields. 
+ """ + tensor_dict = { + "num_seq_groups": self.num_seq_groups, + "blocks_to_swap_in": self.blocks_to_swap_in, + "blocks_to_swap_out": self.blocks_to_swap_out, + "blocks_to_copy": self.blocks_to_copy, + } + + return tensor_dict + + +class LocalOrDistributedWorkerBase(WorkerBase): + """ + Partial implementation of WorkerBase that has a default `execute_model` + definition to perform metadata transfer between workers when in distributed + mode. Subclasses of this interface should use model runners that inherit + from ModelRunnerBase, and should only need to implement worker-local logic. + If custom control plane logic is needed to transfer metadata, or if the + model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. + """ + is_driver_worker: bool + model_runner: ModelRunnerBase + + @property + @abstractmethod + def do_metadata_broadcast(self) -> bool: + """ + Used by the default `execute_model` to check whether broadcast is + needed to transfer request inputs from the driver worker to other + workers in the TP group. If WorkerBase subclass only supports + single-worker execution, then this method should return False. + """ + raise NotImplementedError + + @property + @abstractmethod + def kv_cache(self) -> Optional[List[torch.Tensor]]: + """ + Get the kv cache to pass to the worker's model runner. Used by the + default `execute_model`. If the worker's model runner does not follow + the ModelRunnerBase interface, then inherit from WorkerBase instead. + """ + raise NotImplementedError + + @abstractmethod + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + """ + Prepare the inputs to WorkerBase.execute_worker from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError + + @abstractmethod + def execute_worker(self, worker_input: WorkerInput) -> None: + """ + Process an execution request. + """ + raise NotImplementedError + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + """Executes at least one model step on the given sequences, unless no + sequences are provided.""" + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. Send an empty input to + # notify all other workers to stop their execution loop. + broadcast_tensor_dict({}, src=0) + return None + + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + model_input: ModelRunnerInputBase = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list)) + + if self.do_metadata_broadcast: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update( + model_input.as_broadcastable_tensor_dict()) + broadcast_tensor_dict(broadcast_data, src=0) + else: + assert self.do_metadata_broadcast + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + worker_input = WorkerInput.from_broadcasted_tensor_dict( + broadcast_data) + model_input = ( + self.model_runner. + make_model_input_from_broadcasted_tensor_dict(broadcast_data)) + + self.execute_worker(worker_input) + + # If there is no input, we don't need to execute the model. 
+ if worker_input.num_seq_groups == 0: + return [] + + output = self.model_runner.execute_model(model_input, self.kv_cache) + # Worker only supports single-step execution. Wrap the output in a + # list to conform to interface. + return [output] + + class WorkerWrapperBase: """ The whole point of this class is to lazily initialize the worker. diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f30de703e805..d9124a788a69 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -14,6 +15,15 @@ from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend logger = init_logger(__name__) @@ -24,7 +34,42 @@ ] -class XPUModelRunner: +@dataclass(frozen=True) +class ModelInputForXPU(ModelRunnerInputBase): + """ + Used by the XPUModelRunner. + """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + attn_metadata: Optional["AttentionMetadata"] = None + sampling_metadata: Optional["SamplingMetadata"] = None + multi_modal_input: Optional[Dict[str, torch.Tensor]] = None + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["ModelInputForXPU"], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForXPU": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): def __init__( self, @@ -130,15 +175,22 @@ def profile_run(self) -> None: # Run the model with the dummy inputs.
num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + model_input = self.prepare_model_input(seqs) + self.execute_model(model_input, kv_caches) torch.xpu.synchronize() return - def prepare_input_tensors( + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, Any]) -> ModelInputForXPU: + return (ModelInputForXPU.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Optional[torch.Tensor]]: + ) -> ModelInputForXPU: multi_modal_input = None if self.is_driver_worker: # NOTE: We assume that all sequences in the group are all prompts or @@ -185,8 +237,11 @@ def prepare_input_tensors( num_prompts=0, ) - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, multi_modal_input) + return ModelInputForXPU(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + sampling_metadata=sampling_metadata, + multi_modal_input=multi_modal_input) def _prepare_decode( self, @@ -277,27 +332,25 @@ def _prepare_decode( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: ModelInputForXPU, kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, - multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - model_executable = self.model execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, + "input_ids": model_input.input_tokens, + "positions": model_input.input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": model_input.attn_metadata, } if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + execute_model_kwargs.update( + {"image_input": model_input.multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -306,7 +359,7 @@ def execute_model( # Sample the next token. output = self.model.sample( logits=logits, - sampling_metadata=sampling_metadata, + sampling_metadata=model_input.sampling_metadata, ) return output From 3aa7b6cf66890c042ebecf9e8094f4f5e3dbf96e Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 25 Jun 2024 20:34:25 -0700 Subject: [PATCH 032/392] [Misc][Doc] Add Example of using OpenAI Server with VLM (#5832) --- docs/source/models/vlm.rst | 2 + examples/openai_vision_api_client.py | 90 ++++++++++++++++++++++++++++ vllm/multimodal/utils.py | 12 +++- 3 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 examples/openai_vision_api_client.py diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index de55a1a09919..1837dd2aa89f 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -130,6 +130,8 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat response:", chat_response) +A full code example can be found in `examples/openai_vision_api_client.py `_. + .. 
note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py new file mode 100644 index 000000000000..26f2aa651fca --- /dev/null +++ b/examples/openai_vision_api_client.py @@ -0,0 +1,90 @@ +"""An example showing how to use vLLM to serve VLMs. + +Launch the vLLM server with the following command: +python -m vllm.entrypoints.openai.api_server \ + --model llava-hf/llava-1.5-7b-hf \ + --image-input-type pixel_values \ + --image-token-id 32000 \ + --image-input-shape 1,3,336,336 \ + --image-feature-size 576 \ + --chat-template template_llava.jinja +""" +import base64 + +import requests +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +# Use image url in the payload +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_url.choices[0].message.content +print(f"Chat completion output:{result}") + + +# Use base64 encoded image in the payload +def encode_image_base64_from_url(image_url: str) -> str: + """Encode an image retrieved from a remote url to base64 format.""" + + with requests.get(image_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + + +image_base64 = encode_image_base64_from_url(image_url=image_url) +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_base64.choices[0].message.content +print(f"Chat completion output:{result}") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 509f791d27c6..0cf2c057f892 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,7 @@ import base64 from io import BytesIO from typing import Optional, Union +from urllib.parse import urlparse import aiohttp from PIL import Image @@ -28,6 +29,10 @@ async def fetch_image(cls, image_url: str) -> Image.Image: """Load PIL image from a url or base64 encoded openai GPT4V format""" if image_url.startswith('http'): + parsed_url = urlparse(image_url) + if parsed_url.scheme not in ["http", "https"]: + raise ValueError("Invalid 'image_url': A valid 'image_url' " + "must have scheme 'http' or 'https'.") # Avoid circular import from vllm import __version__ as VLLM_VERSION @@ -44,8 +49,9 @@ async def fetch_image(cls, image_url: str) -> Image.Image: image = load_image_from_base64(image_url.split(',', 1)[1]) else: - raise ValueError("Invalid image url: A valid image url must start " - "with either 'data:image' or 'http'.") + raise ValueError( + "Invalid 'image_url': A valid 'image_url' must start " + "with either 'data:image' or 'http'.") return image @@ -56,7 +62,7 @@ async def async_get_and_parse_image(image_url: str) -> ImagePixelData: def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: - """encode image to base64 format.""" + """Encode a pillow image to base64 format.""" buffered = BytesIO() if format == 'JPEG': From 515080ad2fd93cc8e363ff43b90a9df18cfd71ff Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 25 Jun 2024 21:56:02 -0700 Subject: [PATCH 033/392] [bugfix][distributed] fix shm broadcast when the queue size is full (#5801) --- tests/distributed/test_shm_broadcast.py | 49 +++++++++---- .../device_communicators/shm_broadcast.py | 73 +++++++++++-------- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index d92900ffce00..2c2466f81bb8 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,7 +1,9 @@ import multiprocessing import random import time +from typing import List +import numpy as np import torch.distributed as dist from vllm.distributed.device_communicators.shm_broadcast import ( @@ -9,6 +11,14 @@ from vllm.utils import update_environment_variables +def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: + np.random.seed(seed) + sizes = np.random.randint(1, 10_000, n) + # on average, each array will have 5k elements + # with int64, each array will have 40kb + return [np.random.randint(1, 100, i) for i in sizes] + + def distributed_run(fn, world_size): number_of_processes = world_size processes = [] @@ -47,24 +57,31 @@ def wrapped_fn(env): def worker_fn(): writer_rank = 2 broadcaster = ShmRingBufferIO.create_from_process_group( - dist.group.WORLD, 1024, 2, writer_rank) + dist.group.WORLD, 1024 * 1024, 2, writer_rank) + if dist.get_rank() == writer_rank: + seed = random.randint(0, 1000) + dist.broadcast_object_list([seed], writer_rank) + else: + recv = [None] + dist.broadcast_object_list(recv, writer_rank) + seed = recv[0] # type: ignore + dist.barrier() + # in case we find a race condition + # print the seed so that we can reproduce the error + print(f"Rank {dist.get_rank()} got 
seed {seed}") + # test broadcasting with about 400MB of data + N = 10_000 if dist.get_rank() == writer_rank: - time.sleep(random.random()) - broadcaster.broadcast_object(0) - time.sleep(random.random()) - broadcaster.broadcast_object({}) - time.sleep(random.random()) - broadcaster.broadcast_object([]) + arrs = get_arrays(N, seed) + for x in arrs: + broadcaster.broadcast_object(x) + time.sleep(random.random() / 1000) else: - time.sleep(random.random()) - a = broadcaster.broadcast_object(None) - time.sleep(random.random()) - b = broadcaster.broadcast_object(None) - time.sleep(random.random()) - c = broadcaster.broadcast_object(None) - assert a == 0 - assert b == {} - assert c == [] + arrs = get_arrays(N, seed) + for x in arrs: + y = broadcaster.broadcast_object(None) + assert np.array_equal(x, y) + time.sleep(random.random() / 1000) dist.barrier() diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index c44bd2f11ee8..550271f881df 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -14,6 +14,12 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL +# time to wait if the queue is full or empty +# if we sleep for too short, it will consume too much CPU +# if we sleep for too long, it will slow down the writer/reader +# 0.1 us is a good balance +RINGBUFFER_SLEEP_INTERVAL = 1e-7 + logger = init_logger(__name__) @@ -145,8 +151,7 @@ def __init__(self, buffer: ShmRingBuffer, reader_rank: int): @contextmanager def acquire_write(self): assert self._is_writer, "Only writers can acquire write" - start_index = self.current_idx - start_time = time.time() + start_time = time.monotonic() n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: @@ -154,19 +159,21 @@ def acquire_write(self): written_flag = metadata_buffer[0] if written_flag and read_count != self.buffer.n_reader: # this block is written and not read by all readers - # try to write to the next block - self.current_idx = (self.current_idx + - 1) % self.buffer.max_chunks - if self.current_idx == start_index: - # no empty block found - if time.time( - ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa - logger.warning( - "No available block found in %s second. ", - VLLM_RINGBUFFER_WARNING_INTERVAL) - n_warning += 1 - # wait for a while (0.1 us) - time.sleep(1e-7) + # for writers, `self.current_idx` is the next block to write + # if this block is not ready to write, + # we need to wait until it is read by all readers + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + continue # found a block that is either # (1) not written @@ -188,13 +195,14 @@ def acquire_write(self): metadata_buffer[i] = 0 # mark the block as written metadata_buffer[0] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks break @contextmanager def acquire_read(self): assert self._is_reader, "Only readers can acquire read" - start_index = self.current_idx - start_time = time.time() + start_time = time.monotonic() n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: @@ -204,19 +212,22 @@ def acquire_read(self): # this block is either # (1) not written # (2) already read by this reader - # try to read the next block - self.current_idx = (self.current_idx + - 1) % self.buffer.max_chunks - if self.current_idx == start_index: - # no block found - if time.time( - ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa - logger.warning( - "No available block found in %s second. ", - VLLM_RINGBUFFER_WARNING_INTERVAL) - n_warning += 1 - # wait for a while (0.1 us) - time.sleep(1e-7) + + # for readers, `self.current_idx` is the next block to read + # if this block is not ready, + # we need to wait until it is written + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + continue # found a block that is not read by this reader # let caller read from the buffer @@ -226,6 +237,8 @@ def acquire_read(self): # caller has read from the buffer # set the read flag metadata_buffer[self.reader_rank + 1] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks break def enqueue(self, obj): From 6806998bf9c7f24d710d9017c901e9e9a30757d5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Jun 2024 00:15:22 -0700 Subject: [PATCH 034/392] [Bugfix] Fix embedding to support 2D inputs (#5829) --- vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 1a26c5c63fed..4650b2c2458d 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -306,11 +306,11 @@ def forward(self, input_): self.shard_indices.added_vocab_end_index) else: masked_input = input_ - # Get the embeddings. + # Get the embeddings. output_parallel = F.embedding(masked_input.long(), self.weight) # Mask the output embedding. if self.tp_size > 1: - output_parallel.masked_fill_(input_mask.unsqueeze(1), 0) + output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) # Reduce across all the model parallel GPUs. 
output = tensor_model_parallel_all_reduce(output_parallel) return output From 3439c5a8e3a1cdab9bf7c4430455ace06be1f28d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 26 Jun 2024 00:58:23 -0700 Subject: [PATCH 035/392] [Bugfix][TPU] Fix KV cache size calculation (#5860) --- vllm/worker/tpu_worker.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 828bb89d70ba..cd72c7119909 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -118,14 +118,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: xm.wait_device_ops() m = xm.get_memory_info(self.device) - program_size = 1024 * 1024 * 1024 # 1GB - free_bytes = max(m["bytes_limit"] - m["bytes_used"] - program_size, 0) - kv_cache_bytes = int(free_bytes * - self.cache_config.gpu_memory_utilization) - kv_cache_dtype_btyes = get_dtype_size(self.cache_dtype) + total_memory_size = m["bytes_limit"] + usable_memory_size = int(total_memory_size * + self.cache_config.gpu_memory_utilization) + profiled = m["bytes_used"] # Weights + intermediate activations. + kv_cache_bytes = max(usable_memory_size - profiled, 0) + dtype_bytes = get_dtype_size(self.cache_dtype) block_size = self.cache_config.block_size num_tpu_blocks = (kv_cache_bytes // - (kv_cache_dtype_btyes * block_size * num_layers * 2 * + (dtype_bytes * block_size * num_layers * 2 * head_size * num_kv_heads)) num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. return num_tpu_blocks, 0 From 6984c02a2735d4d08426d2c426c34b6d73bee89e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 26 Jun 2024 16:02:34 +0800 Subject: [PATCH 036/392] [CI/Build] Refactor image test assets (#5821) --- tests/conftest.py | 111 ++++++++++++++++++----------- tests/models/test_llava.py | 26 +++---- tests/models/test_llava_next.py | 30 ++++---- tests/models/test_phi3v.py | 28 ++++---- tests/multimodal/test_processor.py | 24 +++---- 5 files changed, 127 insertions(+), 92 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c..9d00c7676694 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,12 @@ import contextlib import gc import os -from typing import Any, Dict, List, Optional, Tuple, TypeVar +from collections import UserList +from dataclasses import dataclass +from functools import cached_property +from pathlib import Path +from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict, + TypeVar) import pytest import torch @@ -28,21 +33,8 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -# Multi modal related -# You can use `.buildkite/download-images.sh` to download the assets -PIXEL_VALUES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"] -] -IMAGE_FEATURES_FILES = [ - os.path.join(_TEST_DIR, "images", filename) for filename in - ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"] -] -IMAGE_FILES = [ - os.path.join(_TEST_DIR, "images", filename) - for filename in ["stop_sign.jpg", "cherry_blossom.jpg"] -] -assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES) +_IMAGE_DIR = Path(_TEST_DIR) / "images" +"""You can use `.buildkite/download-images.sh` to download the assets.""" def _read_prompts(filename: str) -> List[str]: with open(filename, "r") as f: return prompts
+@dataclass(frozen=True) +class ImageAsset: + name: Literal["stop_sign", "cherry_blossom"] + + @cached_property + def pixel_values(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt") + + @cached_property + def image_features(self) -> torch.Tensor: + return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt") + + @cached_property + def pil_image(self) -> Image.Image: + return Image.open(_IMAGE_DIR / f"{self.name}.jpg") + + def for_hf(self) -> Image.Image: + return self.pil_image + + def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData: + image_input_type = vision_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + if image_input_type == ImageInputType.IMAGE_FEATURES: + return ImageFeatureData(self.image_features) + if image_input_type == ImageInputType.PIXEL_VALUES: + return ImagePixelData(self.pil_image) + + raise NotImplementedError + + +class _ImageAssetPrompts(TypedDict): + stop_sign: str + cherry_blossom: str + + +class _ImageAssets(UserList[ImageAsset]): + + def __init__(self) -> None: + super().__init__( + [ImageAsset("stop_sign"), + ImageAsset("cherry_blossom")]) + + def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: + """ + Convenience method to define the prompt for each test image. + + The order of the returned prompts matches the order of the + assets when iterating through this object. + """ + return [prompts["stop_sign"], prompts["cherry_blossom"]] + + +IMAGE_ASSETS = _ImageAssets() +"""Singleton instance of :class:`_ImageAssets`.""" + + def cleanup(): destroy_model_parallel() destroy_distributed_environment() @@ -81,31 +130,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): cleanup() -@pytest.fixture(scope="session") -def hf_images() -> List[Image.Image]: - return [Image.open(filename) for filename in IMAGE_FILES] - - -@pytest.fixture() -def vllm_images(request) -> List[MultiModalData]: - vision_language_config = request.getfixturevalue("model_and_config")[1] - if vision_language_config.image_input_type == ( - VisionLanguageConfig.ImageInputType.IMAGE_FEATURES): - return [ - ImageFeatureData(torch.load(filename)) - for filename in IMAGE_FEATURES_FILES - ] - else: - return [ - ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES - ] - - -@pytest.fixture() -def vllm_image_tensors(request) -> List[torch.Tensor]: - return [torch.load(filename) for filename in PIXEL_VALUES_FILES] - - @pytest.fixture def example_prompts() -> List[str]: prompts = [] @@ -122,6 +146,11 @@ def example_long_prompts() -> List[str]: return prompts +@pytest.fixture(scope="session") +def image_assets() -> _ImageAssets: + return IMAGE_ASSETS + + _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index b41c69f72b05..ac1d2ece62b2 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -5,17 +5,17 @@ from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "\nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": "\nUSER: What is the season?\nASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +}) def iter_llava_configs(model_name: str): @@ -49,28 +49,28 @@ def 
vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". """ - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -81,6 +81,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 0eca5cb5330c..d36e503871ca 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -5,7 +5,7 @@ from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm @@ -15,12 +15,12 @@ "questions.") # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - f"{_PREFACE} \nUSER: What's the content of the image? ASSISTANT:", - f"{_PREFACE} \nUSER: What is the season? ASSISTANT:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + f"{_PREFACE} \nUSER: What's the content of the image?\nASSISTANT:", + "cherry_blossom": + f"{_PREFACE} \nUSER: What is the season?\nASSISTANT:", +}) def iter_llava_next_configs(model_name: str): @@ -56,20 +56,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". 
""" - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str @pytest.mark.xfail( @@ -78,8 +78,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -90,6 +90,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index a29d50df4c4e..03c130466836 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -6,17 +6,17 @@ from vllm.config import VisionLanguageConfig from vllm.utils import is_cpu -from ..conftest import IMAGE_FILES +from ..conftest import IMAGE_ASSETS pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 - "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + "cherry_blossom": + "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", # noqa: E501 +}) def iter_phi3v_configs(model_name: str): @@ -50,22 +50,22 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... It also reduces `output_str` from "bla" to "bla". 
""" - input_ids, output_str = vllm_output + output_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id tokenizer = AutoTokenizer.from_pretrained(model_id) image_token_str = tokenizer.decode(image_token_id) - hf_input_ids = [ - input_id if input_id != image_token_id else 0 - for idx, input_id in enumerate(input_ids) + hf_output_ids = [ + token_id if token_id != image_token_id else 0 + for idx, token_id in enumerate(output_ids) ] hf_output_str = output_str \ .replace(image_token_str * vlm_config.image_feature_size, "") \ .replace("", " ").replace("<|user|>", "") \ .replace("<|end|>\n<|assistant|>", " ") - return hf_input_ids, hf_output_str + return hf_output_ids, hf_output_str target_dtype = "half" @@ -82,8 +82,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, hf_images, vllm_images, - model_and_config, dtype: str, max_tokens: int) -> None: +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. @@ -94,6 +94,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] # use eager mode for hf runner, since phi3_v didn't work with flash_attn hf_model_kwargs = {"_attn_implementation": "eager"} diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 51c352361702..9ac48dfab678 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -10,7 +10,7 @@ @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_clip_image_processor(hf_images, dtype): +def test_clip_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype): reason="Inconsistent image processor being used due to lack " "of support for dynamic image token replacement") @pytest.mark.parametrize("dtype", ["half", "float"]) -def test_llava_next_image_processor(hf_images, dtype): +def test_llava_next_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-v1.6-34b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype): image_processor_revision=None, ) - for image in hf_images: + for asset in image_assets: hf_result = hf_processor.preprocess( - image, + asset.pil_image, return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -107,7 
+107,7 @@ def test_llava_next_image_processor(hf_images, dtype): @pytest.mark.xfail( reason="Example image pixels were not processed using HuggingFace") @pytest.mark.parametrize("dtype", ["float"]) -def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): +def test_image_pixel_types(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" IMAGE_HEIGHT = IMAGE_WIDTH = 560 @@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype): image_processor_revision=None, ) - for image, tensor in zip(hf_images, vllm_image_tensors): + for asset in image_assets: image_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(image), + ImagePixelData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) tensor_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(tensor), + ImagePixelData(asset.pixel_values), model_config=model_config, vlm_config=vlm_config, ) From 5bfd1bbc9831fed39632f071f16bb62373ec1249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Wed, 26 Jun 2024 11:16:00 -0400 Subject: [PATCH 037/392] [Kernel] Adding bias epilogue support for `cutlass_scaled_mm` (#5560) Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 3 +- csrc/ops.h | 3 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 228 +++++++++++++----- .../cutlass_w8a8/scaled_mm_c3x.cu | 139 ++++++++--- .../cutlass_w8a8/scaled_mm_entry.cu | 32 ++- csrc/torch_bindings.cpp | 2 +- tests/kernels/test_cutlass.py | 100 +++++--- vllm/_custom_ops.py | 10 +- 8 files changed, 383 insertions(+), 134 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 801429096eaa..ede9192cd1db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") diff --git a/csrc/ops.h b/csrc/ops.h index 6f0a7143c916..ae04150eaf75 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -96,7 +96,8 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); + torch::Tensor const& b_scales, + c10::optional const& bias); #endif diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 38a20a1727d1..6ce25c5ac897 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel { }; /* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. 
These scales may be either per-tensor or - per row or column. -*/ + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. + */ template -struct ScaledEpilogue { - private: +struct ScaledEpilogueBase { + protected: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -102,6 +90,32 @@ struct ScaledEpilogue { using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< cutlass::multiplies, float, float, @@ -134,6 +148,53 @@ struct ScaledEpilogue { } }; +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, @@ -168,13 +229,13 @@ struct cutlass_2x_gemm { // clang-format off using RowMajor = typename cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = + using KernelType = ArchGuard 
typename Epilogue, + typename... EpilogueArgs> +void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_args) { TORCH_CHECK(a.dtype() == torch::kInt8); TORCH_CHECK(b.dtype() == torch::kInt8); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; @@ -420,78 +480,130 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, if (out.dtype() == torch::kBFloat16) { return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); return cutlass_gemm_caller>( - out, a, b, a_scales, b_scales); + Epilogue, TileShape, WarpShape, InstructionShape, 2>>( + out, a, b, std::forward(epilogue_args)...); } } -void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, +void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); + torch::Tensor const& b_scales, + c10::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm75_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm75_epilogue(out, a, b, a_scales, + b_scales); + } +} + +template