Xpu support (#407)
## Summary
Replica of #396.
Adds XPU support so that all tests, benchmarks, etc. run on XPUs, i.e. Intel
GPUs.

## Details
The `infer_device()` function is moved into its own module, `liger_kernel/utils.py`.
Every file that previously hard-coded `"cuda"` now imports `infer_device` and uses
the value returned by a call to `infer_device()` in place of the hard-coded string.
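
For reference, here is a minimal sketch of what such a device-inference helper can
look like; the actual `infer_device()` in `liger_kernel/utils.py` may differ in detail:

```python
# Hypothetical sketch of a device-inference helper; not the exact
# implementation in liger_kernel/utils.py.
import torch


def infer_device() -> str:
    """Return the accelerator backend to target: "cuda", "xpu", or "cpu"."""
    if torch.cuda.is_available():  # NVIDIA (and ROCm) GPUs report as "cuda"
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel GPUs
        return "xpu"
    return "cpu"
```

The benchmark and test scripts then call `device = infer_device()` once at module
level and pass `device` to tensor constructors and `.to(...)` calls, as the diffs
below show.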

## Testing Done
A100 80GB PCIe, RTX 3060, Intel Data Center GPU Max 1550

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [ ] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

---------

Co-authored-by: Shao Tang <[email protected]>
mgrabban and lancerts authored Nov 23, 2024
1 parent 90fb5e4 commit 7e3683e
Showing 48 changed files with 365 additions and 252 deletions.
6 changes: 3 additions & 3 deletions benchmark/scripts/benchmark_cpo_loss.py
@@ -13,6 +13,9 @@
 )
 
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
@@ -66,7 +69,6 @@ def bench_memory_fused_linear_cpo_loss(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
 
@@ -107,8 +109,6 @@ def bench_speed_fused_linear_cpo_loss(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    device = "cuda"
-
     torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
 
11 changes: 7 additions & 4 deletions benchmark/scripts/benchmark_cross_entropy.py
@@ -11,6 +11,9 @@
 )
 
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 def bench_memory_cross_entropy(
@@ -24,8 +27,8 @@ def bench_memory_cross_entropy(
     B = input.extra_benchmark_config["B"]
     T = input.extra_benchmark_config["T"]
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda")
-    target = torch.randint(V, (B * T, 1), device="cuda").squeeze(1)
+    _input = torch.randn(B * T, V, requires_grad=True, device=device)
+    target = torch.randint(V, (B * T, 1), device=device).squeeze(1)
 
     def fwd():
         if provider == "liger":
@@ -57,8 +60,8 @@ def bench_speed_cross_entropy(
     B = input.extra_benchmark_config["B"]
     T = input.extra_benchmark_config["T"]
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda")
-    target = torch.randint(V, (B * T, 1), device="cuda").squeeze(1)
+    _input = torch.randn(B * T, V, requires_grad=True, device=device)
+    target = torch.randint(V, (B * T, 1), device=device).squeeze(1)
 
     def fwd():
         if provider == "liger":
5 changes: 3 additions & 2 deletions benchmark/scripts/benchmark_dpo_loss.py
@@ -12,6 +12,9 @@
 )
 
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 class TorchDPOLoss(torch.nn.Module):
@@ -79,7 +82,6 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
     ignore_index = input.extra_benchmark_config["ignore_index"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_dpo_loss = TorchDPOLoss(
         H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
     ).to(device)
@@ -127,7 +129,6 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    device = "cuda"
     torch_dpo_loss = TorchDPOLoss(
         H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
     ).to(device)
7 changes: 3 additions & 4 deletions benchmark/scripts/benchmark_embedding.py
@@ -11,6 +11,9 @@
 )
 
 from liger_kernel.transformers.experimental.embedding import LigerEmbedding
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 # NOTE: For torch compile, we will just use default inductor settings. No further customization
 # is needed.
@@ -26,8 +29,6 @@ def bench_speed_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
     D = input.extra_benchmark_config["D"]
     dtype = input.extra_benchmark_config["dtype"]
 
-    device = "cuda"
-
     torch_emb = Embedding(V, D).to(device).to(dtype)
     liger_emb = LigerEmbedding(V, D).to(device).to(dtype)
     torch_compile_emb = torch.compile(torch_emb)
@@ -68,8 +69,6 @@ def bench_memory_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
     D = input.extra_benchmark_config["D"]
     dtype = input.extra_benchmark_config["dtype"]
 
-    device = "cuda"
-
     torch_emb = Embedding(V, D).to(device).to(dtype)
     liger_emb = LigerEmbedding(V, D).to(device).to(dtype)
     torch_compile_emb = torch.compile(torch_emb)
6 changes: 3 additions & 3 deletions benchmark/scripts/benchmark_fused_linear_cross_entropy.py
@@ -12,6 +12,9 @@
 from liger_kernel.transformers.fused_linear_cross_entropy import (
     LigerFusedLinearCrossEntropyLoss,
 )
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 class TorchLMHeadCE(torch.nn.Module):
@@ -65,7 +68,6 @@ def bench_memory_fused_linear_cross_entropy(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
 
@@ -105,8 +107,6 @@ def bench_speed_fused_linear_cross_entropy(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    device = "cuda"
-
     torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
 
5 changes: 3 additions & 2 deletions benchmark/scripts/benchmark_fused_linear_jsd.py
@@ -10,6 +10,9 @@
 )
 
 from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 class TorchJSD(torch.nn.Module):
@@ -134,7 +137,6 @@ def bench_memory_fused_linear_jsd(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_lm_head_jsd = TorchLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
     liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
 
@@ -183,7 +185,6 @@ def bench_speed_fused_linear_jsd(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_lm_head_jsd = TorchLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
     liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
 
5 changes: 3 additions & 2 deletions benchmark/scripts/benchmark_geglu.py
@@ -12,6 +12,9 @@
 )
 
 from liger_kernel.transformers.geglu import LigerGEGLUMLP
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 def bench_speed_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
@@ -31,7 +34,6 @@ def bench_speed_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutpu
     )
 
     x_shape = (bsz, seq_len, hidden_size)
-    device = "cuda"
 
     # initialize input
     x = torch.randn(*x_shape, device=device, dtype=dtype, requires_grad=True)
@@ -99,7 +101,6 @@ def bench_memory_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutp
     )
 
     x_shape = (bsz, seq_len, hidden_size)
-    device = "cuda"
     # initialize input
     x = torch.randn(*x_shape, device=device, dtype=dtype, requires_grad=True)
 
15 changes: 9 additions & 6 deletions benchmark/scripts/benchmark_group_norm.py
@@ -10,6 +10,9 @@
 )
 
 from liger_kernel.transformers.group_norm import LigerGroupNorm
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
@@ -26,12 +29,12 @@ def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
     x_shape = (M, C, H)
     triton_ln = LigerGroupNorm(
         num_channels=C, num_groups=C // channels_per_group, eps=eps
-    ).to("cuda")
+    ).to(device)
     torch_ln = torch.nn.GroupNorm(
         num_groups=C // channels_per_group, num_channels=C, eps=eps
-    ).to("cuda")
+    ).to(device)
 
-    x = torch.randn(x_shape, dtype=dtype, device="cuda")
+    x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
     x.requires_grad_(True)
 
@@ -83,12 +86,12 @@ def bench_memory_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRu
     x_shape = (M, C, H)
     triton_ln = LigerGroupNorm(
         num_channels=C, num_groups=C // channels_per_group, eps=eps
-    ).to("cuda")
+    ).to(device)
    torch_ln = torch.nn.GroupNorm(
         num_groups=C // channels_per_group, num_channels=C, eps=eps
-    ).to("cuda")
+    ).to(device)
 
-    x = torch.randn(x_shape, dtype=dtype, device="cuda")
+    x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
     x.requires_grad_(True)
 
11 changes: 7 additions & 4 deletions benchmark/scripts/benchmark_jsd.py
@@ -10,6 +10,9 @@
 )
 
 from liger_kernel.transformers.jsd import LigerJSD
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 class TorchJSD(torch.nn.Module):
@@ -56,10 +59,10 @@ def bench_speed_jsd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     torch_jsd = TorchJSD()
     liger_jsd = LigerJSD()
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").log_softmax(
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(
         dim=-1
     )
-    target = torch.randn(B * T, V, device="cuda").log_softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).log_softmax(dim=-1)
 
     def fwd():
         if input.kernel_provider == "liger":
@@ -101,10 +104,10 @@ def bench_memory_jsd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput
     V = input.x
     B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"]
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").log_softmax(
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(
         dim=-1
     )
-    target = torch.randn(B * T, V, device="cuda").log_softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).log_softmax(dim=-1)
 
     def fwd():
         if input.kernel_provider == "liger":
11 changes: 7 additions & 4 deletions benchmark/scripts/benchmark_kl_div.py
@@ -11,6 +11,9 @@
 )
 
 from liger_kernel.transformers.kl_div import LigerKLDIVLoss
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 S, E = 12, 18
 
@@ -22,10 +25,10 @@ def bench_speed_kldiv(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutpu
     torch_kl_div = nn.KLDivLoss(reduction=reduction)
     liger_kl_div = LigerKLDIVLoss(reduction=reduction)
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").log_softmax(
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(
         dim=-1
     )
-    target = torch.randn(B * T, V, device="cuda").softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).softmax(dim=-1)
 
     def fwd():
         if input.kernel_provider == "liger":
@@ -68,10 +71,10 @@ def bench_memory_kldiv(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutp
     V = input.x
     B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"]
 
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").log_softmax(
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(
         dim=-1
     )
-    target = torch.randn(B * T, V, device="cuda").softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).softmax(dim=-1)
 
     def fwd():
         if input.kernel_provider == "liger":
15 changes: 9 additions & 6 deletions benchmark/scripts/benchmark_layer_norm.py
@@ -10,6 +10,9 @@
 )
 
 from liger_kernel.transformers.layer_norm import LigerLayerNorm
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 
 def bench_speed_layer_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
@@ -22,10 +25,10 @@ def bench_speed_layer_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
     dtype = extra_benchmark_config["dtype"]
 
     x_shape = (M, N)
-    triton_ln = LigerLayerNorm(hidden_size=N).to("cuda")
-    torch_ln = torch.nn.LayerNorm(N, eps=eps).to("cuda")
+    triton_ln = LigerLayerNorm(hidden_size=N).to(device)
+    torch_ln = torch.nn.LayerNorm(N, eps=eps).to(device)
 
-    x = torch.randn(x_shape, dtype=dtype, device="cuda")
+    x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
     x.requires_grad_(True)
 
@@ -73,10 +76,10 @@ def bench_memory_layer_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRu
 
     x_shape = (M, N)
 
-    triton_ln = LigerLayerNorm(hidden_size=N).to("cuda")
-    torch_ln = torch.nn.LayerNorm(N, eps=eps).to("cuda")
+    triton_ln = LigerLayerNorm(hidden_size=N).to(device)
+    torch_ln = torch.nn.LayerNorm(N, eps=eps).to(device)
 
-    x = torch.randn(x_shape, dtype=dtype, device="cuda")
+    x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
     x.requires_grad_(True)
 
6 changes: 3 additions & 3 deletions benchmark/scripts/benchmark_orpo_loss.py
@@ -13,6 +13,9 @@
 )
 
 from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction
+from liger_kernel.utils import infer_device
+
+device = infer_device()
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
@@ -66,7 +69,6 @@ def bench_memory_fused_linear_orpo_loss(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
 
-    device = "cuda"
     torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
 
@@ -107,8 +109,6 @@ def bench_speed_fused_linear_orpo_loss(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
 
-    device = "cuda"
-
     torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
 