diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad562d9c996f3..15b9cfe677a57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -343,7 +343,7 @@ add_custom_target(default)
 
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
-  add_dependencies(default _custom_C)
+  message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
 
@@ -357,7 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   endif()
 endif()
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  message(STATUS "Enabling custom extension.")
+  add_dependencies(default _custom_C)
 endif()
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 6d0dd31d346f1..83a483075f8a4 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -178,6 +178,8 @@ RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
 RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
     if ls /install/*.deb; then \
     dpkg -i /install/*.deb \
+    # RCCL needs to be installed twice
+    && dpkg -i /install/*.deb \
     && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
     && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \
     fi
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index cc42839a975d0..cc60d21e717b5 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -4,3 +4,4 @@
 # Dependencies for AMD GPUs
 ray >= 2.10.0
 pytest-asyncio
+pandas # Required for fp8 linear
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 29b5fe77ae705..476301a216c48 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -92,34 +92,10 @@ def apply(self,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         weight = layer.weight
-        if is_hip() and x.dtype == torch.float16 and x.view(-1, x.size(-1)).shape[0] == 1:
-            batched = False
-            if x.dim() == 3:
-                inp = x.view(-1, x.size(-1))
-                batched = True
-            else:
-                inp = x
-            m, k = weight.shape[0], inp.shape[1]
-            out = torch.empty(inp.shape[0],
-                              weight.shape[0],
-                              dtype=inp.dtype,
-                              device='cuda')
-            if (k == 8192 and
-                    (m == 1280 or m == 7168)) or (k == 3584 and m == 8192):
-                _custom_C.LLMM1(weight, inp, out, 8)
-            elif k <= 8192 and k % 8 == 0 and m % 4 == 0:
-                _custom_C.LLMM1(weight, inp, out, 4)
-            else:
-                out = F.linear(inp, weight)
-            if batched:
-                out = out.view(x.shape[0], x.shape[1], weight.shape[0])
-            if bias is not None:
-                out = out + bias
-            return out
         if self.separate_bias_add:
             if bias is not None:
-                return F.linear(x, weight) + bias
-            return F.linear(x, weight)
+                return tgemm.mm(x, weight) + bias
+            return tgemm.mm(x, weight)
         elif bias is not None:
             return F.linear(x, weight, bias)
         return tgemm.mm(x, weight)