Linter and formatting fixes

ROCm · hongxiayang · Jan 15, 2025 · Dec 24, 2024 · Dec 24, 2024 · Dec 24, 2024
commit 02962b68f270c07cf724b77b981d543c099b8b80
diff --git a/benchmarks/kernels/benchmark_mixtral_moe_rocm.py b/benchmarks/kernels/benchmark_mixtral_moe_rocm.py
@@ -9,7 +9,6 @@
 import triton.language as tl
 from tqdm import tqdm
 
-import vllm._moe_C as moe_kernels
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import (get_config_file_name,
                                                   invoke_fused_moe_kernel,
@@ -225,7 +224,7 @@ def run_timing(
     )
 
     w1 = torch.rand(
-        (num_total_experts, 2 * shard_intermediate_size, d_model+128),
+        (num_total_experts, 2 * shard_intermediate_size, d_model + 128),
         device=hidden_states.device,
         dtype=hidden_states.dtype,
     )
@@ -326,8 +325,7 @@ def run_timing(
             compute_type=(tl.bfloat16 if hidden_states.dtype == torch.bfloat16
                           else tl.float16),
             use_fp8_w8a8=False,
-            use_int8_w8a16=False
-        )
+            use_int8_w8a16=False)
 
         ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
 
@@ -348,8 +346,7 @@ def run_timing(
             compute_type=(tl.bfloat16 if hidden_states.dtype == torch.bfloat16
                           else tl.float16),
             use_fp8_w8a8=False,
-            use_int8_w8a16=False
-        )
+            use_int8_w8a16=False)
 
     end_event.record()
     end_event.synchronize()

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
@@ -43,9 +43,9 @@ def benchmark_config(
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     padding_size = 0
     if envs.VLLM_MOE_PADDING and not (use_fp8_w8a8 or use_int8_w8a16):
-        padding_size = 128 # fp16 padding size
+        padding_size = 128  # fp16 padding size
     if envs.VLLM_FP8_PADDING and use_fp8_w8a8:
-        padding_size = 256 # fp8 padding size. Ignoring int8 for now
+        padding_size = 256  # fp8 padding size. Ignoring int8 for now
 
     if use_int8_w8a16:
         w1 = torch.randint(-127,

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
@@ -28,19 +28,20 @@
 NUM_EXPERTS = [8, 64]
 TOP_KS = [2, 6]
 
+
 def permute_weight(x: torch.Tensor) -> torch.Tensor:
     ## Hardcode BLOCK_K and BLOCK_N
     BK = 128
     BN = 128
     x_ = x.clone()
-    x_ = x_.view(x.shape[0],
-                 x.shape[1]//BN, BN//16, 16,
-                     x.shape[2]//BK, BK//32, 4, 8)
-    x_ = x_.permute(0,1,5,2,6,4,3,7)
+    x_ = x_.view(x.shape[0], x.shape[1] // BN, BN // 16, 16, x.shape[2] // BK,
+                 BK // 32, 4, 8)
+    x_ = x_.permute(0, 1, 5, 2, 6, 4, 3, 7)
     x_ = x_.contiguous()
-    x_ = x_.view(x.shape[0], x.shape[1], x.shape[2]);
+    x_ = x_.view(x.shape[0], x.shape[1], x.shape[2])
     return x_
 
+
 @pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -77,6 +78,7 @@ def test_fused_moe(
                                atol=1e-2,
                                rtol=0)
 
+
 @pytest.mark.parametrize("m", [1, 64, 96, 1000, 237])
 @pytest.mark.parametrize("n", [14336])
 @pytest.mark.parametrize("k", [4096])
@@ -101,7 +103,12 @@ def test_amd_moe_1(
         w2_shuffled = permute_weight(w2.data)
 
     score = torch.randn((m, e), device='cuda', dtype=dtype)
-    triton_output = fused_moe(a, w1_shuffled, w2_shuffled, score, topk, renormalize=False)
+    triton_output = fused_moe(a,
+                              w1_shuffled,
+                              w2_shuffled,
+                              score,
+                              topk,
+                              renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     assert torch.allclose(triton_output, torch_output, atol=2e-2, rtol=0)
 
@@ -130,10 +137,16 @@ def test_amd_moe_2(
         w2_shuffled = permute_weight(w2.data)
 
     score = torch.randn((m, e), device='cuda', dtype=dtype)
-    triton_output = fused_moe(a, w1_shuffled, w2_shuffled, score, topk, renormalize=False)
+    triton_output = fused_moe(a,
+                              w1_shuffled,
+                              w2_shuffled,
+                              score,
+                              topk,
+                              renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     assert torch.allclose(triton_output, torch_output, atol=2e-1, rtol=0)
 
+
 @pytest.mark.parametrize("dtype",
                          [torch.float32, torch.float16, torch.bfloat16])
 @torch.inference_mode()
@@ -168,9 +181,9 @@ def test_mixtral_moe(dtype: torch.dtype):
 
     # pad the weight if using padding
     if envs.VLLM_MOE_PADDING:
-        vllm_moe.experts.w13_weight = Parameter(F.pad(
-            vllm_moe.experts.w13_weight, (0, 128), "constant", 0),
-                                                requires_grad=False)[..., :-128]
+        vllm_moe.experts.w13_weight = Parameter(
+            F.pad(vllm_moe.experts.w13_weight, (0, 128), "constant", 0),
+            requires_grad=False)[..., :-128]
         torch.cuda.empty_cache()
         vllm_moe.experts.w2_weight = Parameter(F.pad(
             vllm_moe.experts.w2_weight, (0, 128), "constant", 0),

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
@@ -951,9 +951,6 @@ def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
     torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
                                   token_expert_indicies, gating_output)
 
-def moe_sum(input: torch.Tensor, output: torch.Tensor):
-    torch.ops._moe_C.moe_sum(input, output)
-
 
 if supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):
 

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
@@ -27,8 +27,8 @@
 ARTIFICIAL_PREEMPTION_PROB = 0.5
 ARTIFICIAL_PREEMPTION_MAX_CNT = 500
 
-VLLM_SCHED_PREFILL_COUNT = int(
-    os.getenv("VLLM_SCHED_PREFILL_COUNT", 0))  # noqa
+VLLM_SCHED_PREFILL_COUNT = int(os.getenv("VLLM_SCHED_PREFILL_COUNT",
+                                         0))  # noqa
 
 
 class PreemptionMode(enum.Enum):
@@ -340,12 +340,14 @@ def __init__(
         self.lora_config = lora_config
         self.prefill_timeout = 0
 
-        # slightly hackey, but if you specify prefill batch count, the delay factor
-        # needs to exist, otherwise we will always skip.  Default will be equal to
-        # VLLM_SCHED_PREFILL_COUNT, as they should be roughly the same.
+        # slightly hacky, but if you specify prefill batch count,
+        # the delay factor needs to exist, otherwise we will always skip.
+        # Default will be equal to VLLM_SCHED_PREFILL_COUNT,
+        # as they should be roughly the same.
         # Recommend setting with --scheduler-delay-factor and experimenting
         # On command line
-        if VLLM_SCHED_PREFILL_COUNT > 0 and self.scheduler_config.delay_factor == 0:
+        if VLLM_SCHED_PREFILL_COUNT > 0 and \
+            self.scheduler_config.delay_factor == 0:
             self.scheduler_config.delay_factor = VLLM_SCHED_PREFILL_COUNT
         version = "selfattn"
         if (self.scheduler_config.runner_type == "pooling"
@@ -934,7 +936,8 @@ def _schedule_prefills(
 
         leftover_waiting_sequences: Deque[SequenceGroup] = deque()
 
-        while (VLLM_SCHED_PREFILL_COUNT <= len(waiting_queue) or self._passed_delay(time.time())) and waiting_queue:
+        while (len(waiting_queue) >= VLLM_SCHED_PREFILL_COUNT
+               or self._passed_delay(time.time())) and waiting_queue:
             seq_group = waiting_queue[0]
 
             waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)

diff --git a/vllm/envs.py b/vllm/envs.py
@@ -540,7 +540,7 @@ def get_default_config_root():
     # Pad the weight for moe kernel or not
     "VLLM_FP8_PADDING":
     lambda: bool(int(os.getenv("VLLM_FP8_PADDING", "1"))),
-        
+
     # shuffle the weight for moe kernel or not
     "VLLM_MOE_SHUFFLE":
     lambda: bool(int(os.getenv("VLLM_MOE_SHUFFLE", "0"))),

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -61,19 +61,20 @@ def apply(
     ) -> torch.Tensor:
         raise NotImplementedError
 
+
 def permute_weight_fp16(x: torch.Tensor) -> torch.Tensor:
     ## Hardcode BLOCK_K and BLOCK_N
     BK = 128
     BN = 128
     x_ = x
-    x_ = x_.view(x.shape[0],
-                 x.shape[1]//BN, BN//16, 16,
-                 x.shape[2]//BK, BK//32, 4, 8)
-    x_ = x_.permute(0,1,5,2,6,4,3,7)
+    x_ = x_.view(x.shape[0], x.shape[1] // BN, BN // 16, 16, x.shape[2] // BK,
+                 BK // 32, 4, 8)
+    x_ = x_.permute(0, 1, 5, 2, 6, 4, 3, 7)
     x_ = x_.contiguous()
     x_ = x_.view(x.shape[0], x.shape[1], x.shape[2])
     return x_
 
+
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -375,21 +375,22 @@ def apply(self,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
             use_per_token_if_dynamic=False)
 
+
 def permute_weight_fp8(x: torch.Tensor) -> torch.Tensor:
     ## Hardcode BLOCK_K and BLOCK_N
     BK = 256
-    BN = 64 #256 #128
+    BN = 64  #256 #128
     x_ = x
 
-    x_ = x_.view(x.shape[0],
-                 x.shape[1]//BN, BN//16, 16,
-                 x.shape[2]//BK, BK//(4 * 16), 4, 16)
+    x_ = x_.view(x.shape[0], x.shape[1] // BN, BN // 16, 16, x.shape[2] // BK,
+                 BK // (4 * 16), 4, 16)
 
-    x_ = x_.permute(0,1,5,2,6,4,3,7)
+    x_ = x_.permute(0, 1, 5, 2, 6, 4, 3, 7)
     x_ = x_.contiguous()
     x_ = x_.view(x.shape[0], x.shape[1], x.shape[2])
     return x_
 
+
 class Fp8MoEMethod(FusedMoEMethodBase):
     """MoE method for FP8.
     Supports loading FP8 checkpoints with static weight scale and
@@ -556,7 +557,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
                                                  requires_grad=False)
 
             if envs.VLLM_MOE_SHUFFLE:
-                layer.w13_weight.data = permute_weight_fp8(layer.w13_weight.data)
+                layer.w13_weight.data = permute_weight_fp8(
+                    layer.w13_weight.data)
                 layer.w2_weight.data = permute_weight_fp8(layer.w2_weight.data)
 
             if envs.VLLM_MOE_PADDING:
@@ -647,7 +649,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
                                                         requires_grad=False)
 
             if envs.VLLM_MOE_SHUFFLE:
-                layer.w13_weight.data = permute_weight_fp8(layer.w13_weight.data)
+                layer.w13_weight.data = permute_weight_fp8(
+                    layer.w13_weight.data)
                 layer.w2_weight.data = permute_weight_fp8(layer.w2_weight.data)
 
             if envs.VLLM_MOE_PADDING:
@@ -666,6 +669,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 torch.cuda.empty_cache()
 
             return
+
     def apply(
         self,
         layer: torch.nn.Module,