diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad562d9c996f3..15b9cfe677a57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -343,7 +343,7 @@ add_custom_target(default)
 
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
-  add_dependencies(default _custom_C)
+  message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
 
@@ -357,7 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   endif()
 endif()
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  message(STATUS "Enabling custom extension.")
+  add_dependencies(default _custom_C)
 endif()
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 6d0dd31d346f1..83a483075f8a4 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -178,6 +178,8 @@ RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
 RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
     if ls /install/*.deb; then \
     dpkg -i /install/*.deb \
+    # RCCL needs to be installed twice
+    && dpkg -i /install/*.deb \
     && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
     && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \
     fi
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index cc42839a975d0..cc60d21e717b5 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -4,3 +4,4 @@
 # Dependencies for AMD GPUs
 ray >= 2.10.0
 pytest-asyncio
+pandas # Required for fp8 linear
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 29b5fe77ae705..476301a216c48 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -92,34 +92,10 @@ def apply(self,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         weight = layer.weight
-        if is_hip() and x.dtype == torch.float16 and x.view(-1, x.size(-1)).shape[0] == 1:
-            batched = False
-            if x.dim() == 3:
-                inp = x.view(-1, x.size(-1))
-                batched = True
-            else:
-                inp = x
-            m, k = weight.shape[0], inp.shape[1]
-            out = torch.empty(inp.shape[0],
-                              weight.shape[0],
-                              dtype=inp.dtype,
-                              device='cuda')
-            if (k == 8192 and
-                    (m == 1280 or m == 7168)) or (k == 3584 and m == 8192):
-                _custom_C.LLMM1(weight, inp, out, 8)
-            elif k <= 8192 and k % 8 == 0 and m % 4 == 0:
-                _custom_C.LLMM1(weight, inp, out, 4)
-            else:
-                out = F.linear(inp, weight)
-            if batched:
-                out = out.view(x.shape[0], x.shape[1], weight.shape[0])
-            if bias is not None:
-                out = out + bias
-            return out
         if self.separate_bias_add:
             if bias is not None:
-                return F.linear(x, weight) + bias
-            return F.linear(x, weight)
+                return tgemm.mm(x, weight) + bias
+            return tgemm.mm(x, weight)
         elif bias is not None:
             return F.linear(x, weight, bias)
         return tgemm.mm(x, weight)