diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index e15a2312ec5ae..bcf25d2631042 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -11,6 +11,7 @@
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -287,7 +288,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
 
 
 def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
-    device_name = torch.cuda.get_device_name().replace(" ", "_")
+    device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 6642bb8e71bd1..c7557dc34ff64 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -44,6 +44,35 @@ def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
 
 
+@lru_cache(maxsize=8)
+@with_nvml_context
+def get_physical_device_name(device_id: int = 0) -> str:
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+    return pynvml.nvmlDeviceGetName(handle)
+
+
+@with_nvml_context
+def warn_if_different_devices():
+    device_ids: int = pynvml.nvmlDeviceGetCount()
+    if device_ids > 1:
+        device_names = [get_physical_device_name(i) for i in range(device_ids)]
+        if len(set(device_names)) > 1 and os.environ.get(
+                "CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
+            logger.warning(
+                "Detected different devices in the system: \n%s\nPlease"
+                " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
+                "avoid unexpected behavior.", "\n".join(device_names))
+
+
+try:
+    from sphinx.ext.autodoc.mock import _MockModule
+
+    if not isinstance(pynvml, _MockModule):
+        warn_if_different_devices()
+except ModuleNotFoundError:
+    warn_if_different_devices()
+
+
 def device_id_to_physical_device_id(device_id: int) -> int:
     if "CUDA_VISIBLE_DEVICES" in os.environ:
         device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
@@ -61,6 +90,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         physical_device_id = device_id_to_physical_device_id(device_id)
         return get_physical_device_capability(physical_device_id)
 
+    @staticmethod
+    def get_device_name(device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        return get_physical_device_name(physical_device_id)
+
     @staticmethod
     @with_nvml_context
     def is_full_nvlink(physical_device_ids: List[int]) -> bool:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 0760f9554fb78..25b6f26676ef0 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -27,6 +27,10 @@ def is_tpu(self) -> bool:
     def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         raise NotImplementedError
 
+    @staticmethod
+    def get_device_name(device_id: int = 0) -> str:
+        raise NotImplementedError
+
     @staticmethod
     def inference_mode():
         """A device-specific wrapper of `torch.inference_mode`.
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 36b3ba8f7d1bb..3f6f5adee5a56 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -13,3 +13,8 @@ class RocmPlatform(Platform):
     @lru_cache(maxsize=8)
     def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         return torch.cuda.get_device_capability(device_id)
+
+    @staticmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 4c30f20ff076e..8f4372e20d2e7 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -368,7 +368,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     if torch_dtype == torch.bfloat16:
         compute_capability = current_platform.get_device_capability()
         if compute_capability[0] < 8:
-            gpu_name = torch.cuda.get_device_name()
+            gpu_name = current_platform.get_device_name()
             raise ValueError(
                 "Bfloat16 is only supported on GPUs with compute capability "
                 f"of at least 8.0. Your {gpu_name} GPU has compute capability "
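For reference, a minimal usage sketch (not part of the diff): after this change, call sites obtain the device name through `current_platform.get_device_name()` instead of `torch.cuda.get_device_name()`, and a new backend only needs to override the `get_device_name` hook added to `Platform`. The `MyAcceleratorPlatform` class and the device name string below are hypothetical illustrations, not code from this change.

```python
# Hypothetical sketch, not part of this diff: how a backend would override the
# new Platform.get_device_name() hook and how call sites consume it.
from functools import lru_cache

from vllm.platforms import current_platform
from vllm.platforms.interface import Platform


class MyAcceleratorPlatform(Platform):  # illustrative name only

    @staticmethod
    @lru_cache(maxsize=8)
    def get_device_name(device_id: int = 0) -> str:
        # Return a stable, human-readable device name; call sites such as
        # get_config_file_name() replace spaces with underscores.
        return "MyAccelerator X1"


# Call sites dispatch through the platform layer rather than torch.cuda:
device_name = current_platform.get_device_name().replace(" ", "_")
```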