doc: update the docstring related to alibi (#147)
Follow-up of #146.
yzh119 authored Mar 3, 2024
1 parent 383518b commit bf2117b
Showing 4 changed files with 14 additions and 17 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -72,13 +72,13 @@ num_qo_heads = 32
q = torch.randn(num_qo_heads, head_dim).half().to(0)

o = flashinfer.single_decode_with_kv_cache(q, k, v) # decode attention without RoPE on-the-fly
-o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="LLAMA") # decode with LLaMA style RoPE on-the-fly
+o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ROPE_LLAMA") # decode with LLaMA style RoPE on-the-fly

# append attention
append_qo_len = 128
q = torch.randn(append_qo_len, num_qo_heads, head_dim).half().to(0) # append attention, the last 128 tokens in the KV-Cache are the new tokens
o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True) # append attention without RoPE on-the-fly, apply causal mask
-o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask
+o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="ROPE_LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask

# prefill attention
qo_len = 2048
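
The README change above only renames the RoPE mode from ``LLAMA`` to ``ROPE_LLAMA``; the docstrings updated below additionally list ``ALIBI``. As a minimal sketch (not part of this commit), a decode call using the ALiBi mode would look like the following, assuming the NHD k/v layout and illustrative tensor sizes (the actual ``kv_len``, ``num_kv_heads``, and ``head_dim`` values are not shown in this diff):

```python
import torch
import flashinfer

# Illustrative sizes only -- the full README defines its own values.
kv_len, num_kv_heads, num_qo_heads, head_dim = 4096, 32, 32, 128

k = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
v = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
q = torch.randn(num_qo_heads, head_dim).half().to(0)

# Decode attention with ALiBi bias applied inside the kernel instead of RoPE.
o_alibi = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ALIBI")
```
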
3 changes: 0 additions & 3 deletions python/flashinfer/cascade.py
@@ -419,9 +419,6 @@ def begin_forward(
The dimension of the heads
page_size : int
The page size of the paged kv cache
-pos_encoding_mode : str
-Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
data_type : Union[str, torch.dtype]
The data type of the paged kv cache
12 changes: 6 additions & 6 deletions python/flashinfer/decode.py
@@ -79,7 +79,7 @@ def single_decode_with_kv_cache(
The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
sm_scale : Optional[float]
The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
rope_scale : Optional[float]
@@ -168,7 +168,7 @@ def batch_decode_with_padded_kv_cache(
The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
sm_scale : Optional[float]
The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
rope_scale : Optional[float]
@@ -257,7 +257,7 @@ def batch_decode_with_padded_kv_cache_return_lse(
The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
sm_scale : Optional[float]
The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
rope_scale : Optional[float]
@@ -456,7 +456,7 @@ def begin_forward(
The page size of the paged kv cache
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
data_type : Union[str, torch.dtype]
The data type of the paged kv cache
@@ -525,7 +525,7 @@ def forward(
:attr:`kv_layout` is ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
sm_scale : Optional[float]
The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
rope_scale : Optional[float]
@@ -586,7 +586,7 @@ def forward_return_lse(
:attr:`kv_layout` is ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
sm_scale : Optional[float]
The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
rope_scale : Optional[float]
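
Every ``pos_encoding_mode`` entry in ``decode.py`` now advertises the same three values. Below is a rough sketch of the padded batch-decode entry point exercising each documented mode; the keyword names come from these docstrings, but the tensor shapes and sizes are assumptions rather than anything shown in this diff:

```python
import torch
import flashinfer

# Assumed sizes and NHD padded layout: [batch_size, padded_kv_len, num_kv_heads, head_dim].
batch_size, padded_kv_len = 8, 1024
num_qo_heads, num_kv_heads, head_dim = 32, 32, 128

q = torch.randn(batch_size, num_qo_heads, head_dim).half().to(0)
k_padded = torch.randn(batch_size, padded_kv_len, num_kv_heads, head_dim).half().to(0)
v_padded = torch.randn(batch_size, padded_kv_len, num_kv_heads, head_dim).half().to(0)

# One call per documented mode: no positional encoding, LLaMA-style RoPE, ALiBi.
for mode in ("NONE", "ROPE_LLAMA", "ALIBI"):
    o = flashinfer.batch_decode_with_padded_kv_cache(
        q, k_padded, v_padded, kv_layout="NHD", pos_encoding_mode=mode
    )
```
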
12 changes: 6 additions & 6 deletions python/flashinfer/prefill.py
@@ -93,7 +93,7 @@ def single_prefill_with_kv_cache(
The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
@@ -191,7 +191,7 @@ def single_prefill_with_kv_cache_return_lse(
The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
@@ -460,7 +460,7 @@ def forward(
Whether to apply causal mask to the attention matrix.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
@@ -529,7 +529,7 @@ def forward_return_lse(
Whether to apply causal mask to the attention matrix.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
@@ -744,7 +744,7 @@ def forward(
Whether to apply causal mask to the attention matrix.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
@@ -811,7 +811,7 @@ def forward_return_lse(
Whether to apply causal mask to the attention matrix.
pos_encoding_mode : str
Whether to apply RoPE on-the-fly inside attention kernels, could be
-``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+``NONE``/``ROPE_LLAMA``(LLAMA style rotary embedding)/``ALIBI``.
allow_fp16_qk_reduction : bool
Whether to use f16 for qk reduction (faster at the cost of slight precision
loss).
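
``prefill.py`` receives the same docstring update for its single-request functions and batch wrappers. A sketch of a causal append/prefill call using the ALiBi mode, mirroring the README snippet above; the sequence lengths and head counts are illustrative assumptions:

```python
import torch
import flashinfer

# Assumed sizes; q is [qo_len, num_qo_heads, head_dim], k/v are [kv_len, num_kv_heads, head_dim] (NHD).
qo_len, kv_len = 2048, 4096
num_qo_heads, num_kv_heads, head_dim = 32, 32, 128

q = torch.randn(qo_len, num_qo_heads, head_dim).half().to(0)
k = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
v = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)

# Causal prefill/append attention with ALiBi slopes added to the logits in-kernel.
o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="ALIBI")
```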
