
[Kernel] Add per-tensor and per-token AZP epilogues (vllm-project#5941)
Co-authored-by: Tyler Michael Smith <[email protected]>
ProExpertProg and tlrmchlsmth authored Aug 6, 2024
1 parent 5c60c8c commit 8d59dbb
Showing 11 changed files with 1,175 additions and 153 deletions.
185 changes: 107 additions & 78 deletions benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -32,7 +32,6 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor:

def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5

@@ -44,59 +43,18 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
raise ValueError("unsupported dtype")


- # impl
-
-
- def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
- scale_b: torch.Tensor,
- out_dtype: torch.dtype) -> torch.Tensor:
- return torch.mm(a, b)
-
-
- def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
- scale_b: torch.Tensor,
- out_dtype: torch.dtype) -> torch.Tensor:
- return torch._scaled_mm(a,
- b,
- scale_a=scale_a,
- scale_b=scale_b,
- out_dtype=out_dtype)
-
-
- def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
- scale_a: torch.Tensor, scale_b: torch.Tensor,
- out_dtype: torch.dtype) -> torch.Tensor:
- return torch._scaled_mm(a,
- b,
- scale_a=scale_a,
- scale_b=scale_b,
- out_dtype=out_dtype,
- use_fast_accum=True)
-
-
- def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
- scale_b: torch.Tensor,
- out_dtype: torch.dtype) -> torch.Tensor:
- return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
-
-
# bench
- def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
- scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
- sub_label: str, fn: Callable, description: str) -> TMeasurement:
-
+ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
+ **kwargs) -> TMeasurement:
min_run_time = 1

globals = {
- "a": a,
- "b": b,
- "scale_a": scale_a,
- "scale_b": scale_b,
- "out_dtype": out_dtype,
+ "args": args,
+ "kwargs": kwargs,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(a, b, scale_a, scale_b, out_dtype)",
+ stmt="fn(*args, **kwargs)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -110,26 +68,58 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+ azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
+ azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)

timers = []
# pytorch impl - bfloat16
timers.append(
- bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
- b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
- torch.bfloat16, label, sub_label, pytorch_mm_impl,
- "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16),
+ b.to(dtype=torch.bfloat16)))

# pytorch impl - float16
timers.append(
- bench_fn(a.to(dtype=torch.float16, device="cuda"),
- b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
- torch.float16, label, sub_label, pytorch_mm_impl,
- "pytorch_fp16_fp16_fp16_matmul-no-scales"))
+ bench_fn(label, sub_label,
+ "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
+ a.to(dtype=torch.float16), b.to(dtype=torch.float16)))

# cutlass impl
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
+
+ # cutlass with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+ bias))
+
+ # cutlass with azp per-tensor
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
+ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+ torch.bfloat16, azp_adj))
+
+ # cutlass with azp per-tensor + bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
+ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+ torch.bfloat16, azp_adj, None, bias))
+
+ # cutlass with azp per-token
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
+ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+ torch.bfloat16, azp_adj, azp))
+
+ # cutlass with azp per-token + bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
+ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
+ torch.bfloat16, azp_adj, azp, bias))

return timers

@@ -140,46 +130,88 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

timers = []

# pytorch impl w. bf16
timers.append(
- bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
- b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
- torch.bfloat16, label, sub_label, pytorch_mm_impl,
- "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
+ torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
+ b.to(dtype=torch.bfloat16, device="cuda")))

# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16))

# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- pytorch_fp8_impl_fast_accum,
- "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.bfloat16,
+ use_fast_accum=True))

# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16))

# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- pytorch_fp8_impl_fast_accum,
- "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
+ bench_fn(label,
+ sub_label,
+ "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
+ torch._scaled_mm,
+ a,
+ b,
+ scale_a=scale_a,
+ scale_b=scale_b,
+ out_dtype=torch.float16,
+ use_fast_accum=True))

# cutlass impl: bf16 output
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
- cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
+ torch.bfloat16))
# cutlass impl: fp16 output
timers.append(
- bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
- cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))

+ # cutlass impl: bf16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
+ bias))
+
+ # cutlass impl: fp16 output, with bias
+ timers.append(
+ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
+ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
+ bias.to(dtype=torch.float16)))

return timers


@@ -200,7 +232,6 @@ def print_timers(timers: Iterable[TMeasurement]):

def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@@ -216,7 +247,6 @@ def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
-
print(f"== All Results {base_description} ====")
print_timers(data)

@@ -251,7 +281,6 @@ def run_range_bench(args):


def run_model_bench(args):
-
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
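In the new benchmark cases above, azp and azp_adj are zero-filled placeholders, so they exercise the AZP kernel paths without showing where those tensors come from. The sketch below illustrates one way they could be produced under a standard asymmetric int8 quantization scheme; the helper name, the quantization formulas, and the azp * b_q.sum(dim=0) construction of azp_adj are assumptions for illustration, not taken from this commit.

import torch

def make_azp_args_per_tensor(a_fp: torch.Tensor, b_q: torch.Tensor):
    # Hypothetical helper: asymmetric per-tensor quantization of the
    # activations, plus the azp_adj term an AZP epilogue would consume.
    qmin, qmax = -128, 127
    scale_a = (a_fp.max() - a_fp.min()) / (qmax - qmin)
    azp = (qmin - torch.round(a_fp.min() / scale_a)).to(torch.int32)
    a_q = torch.clamp(torch.round(a_fp / scale_a) + azp, qmin, qmax).to(torch.int8)
    # Since a_fp ~= scale_a * (a_q - azp), the matmul correction term is
    # azp * colsum(b_q), folded here into a single per-column tensor.
    azp_adj = azp * b_q.to(torch.int32).sum(dim=0)
    return a_q, scale_a, azp, azp_adj

For the per-token variants, azp would instead hold one zero point per row of a and azp_adj would be just b_q.sum(dim=0); that pairing is likewise an assumption based on the per-tensor/per-token naming in the benchmark.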
8 changes: 8 additions & 0 deletions csrc/ops.h
@@ -128,6 +128,14 @@ void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& bias);

+ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
+ torch::Tensor const& b,
+ torch::Tensor const& a_scales,
+ torch::Tensor const& b_scales,
+ torch::Tensor const& azp_adj,
+ c10::optional<torch::Tensor> const& azp,
+ c10::optional<torch::Tensor> const& bias);
+
torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
torch::Tensor const& b_q_weight,
torch::Tensor const& s_tok,
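For reference, here is a plain-PyTorch sketch of what an AZP-corrected scaled matmul is expected to produce, written against the argument order of cutlass_scaled_mm_azp above. The epilogue semantics assumed here (per-tensor AZP pre-folded into azp_adj, per-token AZP applied as azp[m] * azp_adj[n]) are illustrative assumptions, not a statement of what the kernel guarantees.

import torch

def ref_scaled_mm_azp(a_q, b_q, scale_a, scale_b, out_dtype,
                      azp_adj, azp=None, bias=None):
    # Accumulate in float64 so the int8 dot products stay exact.
    acc = a_q.to(torch.float64) @ b_q.to(torch.float64)
    if azp is None:
        # Per-tensor AZP: the scalar zero point is assumed to be pre-folded
        # into azp_adj by the caller (e.g. azp * b_q.sum(dim=0)).
        acc = acc - azp_adj.to(torch.float64)
    else:
        # Per-token AZP: one zero point per row of a_q, with azp_adj assumed
        # to hold the column sums of b_q.
        acc = acc - azp.to(torch.float64).view(-1, 1) * azp_adj.to(torch.float64)
    out = scale_a.to(torch.float64) * scale_b.to(torch.float64) * acc
    if bias is not None:
        out = out + bias.to(torch.float64)
    return out.to(out_dtype)

A call mirrors the benchmark above, e.g. ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias) for the per-token-with-bias case.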
