From 38fb9423cb30a996dbc991a2294ace97443008ae Mon Sep 17 00:00:00 2001
From: Eric Buehler <65165915+EricLBuehler@users.noreply.github.com>
Date: Sun, 28 Jul 2024 04:14:44 -0400
Subject: [PATCH] Allow setting PagedAttention KV cache allocation from
 context size (#640)

* Support paged attn memory allocation via context size

* Slightly better logging

* Connect it to the apis

* Clippy
---
 Cargo.lock                                |  1 -
 mistralrs-bench/Cargo.toml                |  1 -
 mistralrs-bench/src/main.rs               | 59 ++++++++++++----
 .../src/dummy_paged_attention/mod.rs      | 38 ++++++++---
 mistralrs-core/src/lib.rs                 |  2 +-
 mistralrs-core/src/paged_attention/mod.rs | 38 ++++++++---
 mistralrs-paged-attn/build.rs             | 42 ++++--------
 .../src/backend/paged_attention.rs        |  4 +-
 mistralrs-pyo3/mistralrs.pyi              | 12 +++-
 mistralrs-pyo3/src/lib.rs                 | 68 +++++++++++++------
 mistralrs-server/src/main.rs              | 67 +++++++++++++-----
 mistralrs/examples/paged_attn/main.rs     |  6 +-
 12 files changed, 229 insertions(+), 109 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 020b3d794..94e108be1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2319,7 +2319,6 @@ dependencies = [
  "candle-core",
  "clap",
  "cli-table",
- "either",
  "mistralrs-core",
  "serde",
  "serde_json",
diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml
index 0c8ec0d86..354938595 100644
--- a/mistralrs-bench/Cargo.toml
+++ b/mistralrs-bench/Cargo.toml
@@ -20,7 +20,6 @@ serde_json.workspace = true
 clap.workspace = true
 mistralrs-core = { version = "0.2.2", path = "../mistralrs-core" }
 tracing.workspace = true
-either.workspace = true
 tokio.workspace = true
 cli-table = "0.4.7"
diff --git a/mistralrs-bench/src/main.rs b/mistralrs-bench/src/main.rs
index ddc93a7c5..249073e59 100644
--- a/mistralrs-bench/src/main.rs
+++ b/mistralrs-bench/src/main.rs
@@ -1,12 +1,11 @@
 use candle_core::Device;
 use clap::Parser;
 use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table};
-use either::Either;
 use mistralrs_core::{
     initialize_logging, paged_attn_supported, Constraint, DefaultSchedulerMethod,
-    DeviceLayerMapMetadata, DeviceMapMetadata, Loader, LoaderBuilder, MistralRs, MistralRsBuilder,
-    ModelDType, ModelSelected, NormalRequest, PagedAttentionConfig, Request, RequestMessage,
-    Response, SamplingParams, SchedulerConfig, TokenSource, Usage,
+    DeviceLayerMapMetadata, DeviceMapMetadata, Loader, LoaderBuilder, MemoryGpuConfig, MistralRs,
+    MistralRsBuilder, ModelDType, ModelSelected, NormalRequest, PagedAttentionConfig, Request,
+    RequestMessage, Response, SamplingParams, SchedulerConfig, TokenSource, Usage,
 };
 use std::fmt::Display;
 use std::sync::Arc;
@@ -292,6 +291,12 @@ struct Args {
     #[arg(long = "pa-gpu-mem-usage")]
     paged_attn_gpu_mem_usage: Option<f32>,
 
+    /// Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold)
+    /// when using PagedAttention, which is only supported on CUDA and is always automatically activated.
+    /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`.
+    #[arg(long = "pa-ctxt-len")]
+    paged_ctxt_len: Option<usize>,
+
     /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32.
     /// PagedAttention is only supported on CUDA and is always automatically activated.
     #[arg(long = "pa-blk-size")]
@@ -383,31 +388,55 @@ fn main() -> anyhow::Result<()> {
         args.paged_attn_block_size,
         args.paged_attn_gpu_mem,
         args.paged_attn_gpu_mem_usage,
+        args.paged_ctxt_len,
         paged_attn_supported(),
         args.no_paged_attn,
     ) {
-        (block_size, None, None, true, false) => Some(PagedAttentionConfig::new(
+        (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new(
             block_size,
             512,
-            Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
+            MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
         )?),
-        (block_size, Some(m), None, true, false) => {
-            Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?)
-        }
-        (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new(
+        (block_size, None, None, Some(ctxt), true, false) => Some(PagedAttentionConfig::new(
+            block_size,
+            512,
+            MemoryGpuConfig::ContextSize(ctxt),
+        )?),
+        (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new(
             block_size,
             512,
-            Either::Right(f),
+            MemoryGpuConfig::Utilization(f),
         )?),
-        (block_size, Some(_m), Some(f), true, false) => {
-            info!("Both memory size and usage were specified, defaulting to the usage value.");
+        (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new(
+            block_size,
+            512,
+            MemoryGpuConfig::Amount(m),
+        )?),
+        (block_size, Some(_m), Some(f), None, true, false) => {
+            info!("Both memory size and usage were specified, defaulting to the usage value.");
+            Some(PagedAttentionConfig::new(
+                block_size,
+                512,
+                MemoryGpuConfig::Utilization(f),
+            )?)
+        }
+        (block_size, Some(_m), None, Some(ctxt), true, false) => {
+            info!("Both memory size and context length were specified, defaulting to the context length value.");
+            Some(PagedAttentionConfig::new(
+                block_size,
+                512,
+                MemoryGpuConfig::ContextSize(ctxt),
+            )?)
+        }
+        (block_size, None, Some(f), Some(_ctxt), true, false) => {
+            info!("Both context length and usage were specified, defaulting to the usage value.");
             Some(PagedAttentionConfig::new(
                 block_size,
                 512,
-                Either::Right(f),
+                MemoryGpuConfig::Utilization(f),
             )?)
         }
-        (_, _, _, _, _) => None,
+        (_, _, _, _, _, _) => None,
     };
 
     let pipeline = loader.load_model_from_hf(
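Taken together, the seven arms of this match implement one precedence rule: an explicit memory utilization beats a context length, which beats an absolute MB amount, with 90% utilization as the fallback when nothing is set. The helper below is a minimal standalone sketch of that rule; `resolve_mem_gpu` is a hypothetical name and is not part of the patch, while `MemoryGpuConfig` is the enum the patch introduces.

```rust
use mistralrs_core::MemoryGpuConfig;

// Hypothetical helper (not in this patch) mirroring the precedence encoded
// by the match arms: usage > context length > amount, defaulting to 90%.
fn resolve_mem_gpu(
    mem_mb: Option<usize>,
    usage: Option<f32>,
    ctxt_len: Option<usize>,
) -> MemoryGpuConfig {
    match (mem_mb, usage, ctxt_len) {
        (_, Some(f), _) => MemoryGpuConfig::Utilization(f),
        (_, None, Some(toks)) => MemoryGpuConfig::ContextSize(toks),
        (Some(mb), None, None) => MemoryGpuConfig::Amount(mb),
        (None, None, None) => MemoryGpuConfig::Utilization(0.9),
    }
}
```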
#[arg(long = "pa-blk-size")] @@ -383,31 +388,55 @@ fn main() -> anyhow::Result<()> { args.paged_attn_block_size, args.paged_attn_gpu_mem, args.paged_attn_gpu_mem_usage, + args.paged_ctxt_len, paged_attn_supported(), args.no_paged_attn, ) { - (block_size, None, None, true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory + MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory )?), - (block_size, Some(m), None, true, false) => { - Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?) - } - (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, Some(ctxt), true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?), + (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?), - (block_size, Some(_m), Some(f), true, false) => { - info!("Both memory size and usage were specified, defaulting to the usage value."); + (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Amount(m), + )?), + (block_size, Some(_m), Some(f), None, true, false) => { + info!("Both memory size, and usage were specified, defaulting to the usage value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?) + } + (block_size, Some(_m), None, Some(ctxt), true, false) => { + info!("All memory size and ctxt len, defaulting to the context len value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?) + } + (block_size, None, Some(f), Some(_ctxt), true, false) => { + info!("Both ctxt len and usage were specified, defaulting to the usage value."); Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?) 
diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs
index 31ebaec58..274237e08 100644
--- a/mistralrs-core/src/lib.rs
+++ b/mistralrs-core/src/lib.rs
@@ -59,7 +59,7 @@ mod xlora_models;
 
 pub use amoe::{AnyMoeConfig, AnyMoeExpertType};
 pub use device_map::{DeviceLayerMapMetadata, DeviceMapMetadata, LayerDeviceMapper};
-pub use paged_attention::PagedAttentionConfig;
+pub use paged_attention::{MemoryGpuConfig, PagedAttentionConfig};
 pub use pipeline::{
     chat_template::ChatTemplate, AnyMoeLoader, AnyMoePipeline, GGMLLoader, GGMLLoaderBuilder,
     GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GemmaLoader,
diff --git a/mistralrs-core/src/paged_attention/mod.rs b/mistralrs-core/src/paged_attention/mod.rs
index 7ebabd232..6b354fdfb 100644
--- a/mistralrs-core/src/paged_attention/mod.rs
+++ b/mistralrs-core/src/paged_attention/mod.rs
@@ -16,7 +16,6 @@ pub use block_engine_sequence::BlockEngineSequence;
 pub use cache_engine::{CacheConfig, CacheEngine};
 use candle_core::{DType, Device};
 pub use config::{ModelConfigLike, ModelConfigMetadata};
-use either::Either;
 pub use layers::PagedAttention;
 pub use scheduler::{
     PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
@@ -30,14 +29,14 @@ use tracing::info;
 pub struct PagedAttentionConfig {
     pub(crate) block_size: Option<usize>,
     pub(crate) mem_cpu: usize,
-    pub(crate) mem_gpu: Either<usize, f32>,
+    pub(crate) mem_gpu: MemoryGpuConfig,
 }
 
 impl PagedAttentionConfig {
     pub fn new(
         block_size: Option<usize>,
         mem_cpu: usize,
-        mem_gpu: Either<usize, f32>,
+        mem_gpu: MemoryGpuConfig,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             block_size,
@@ -52,6 +51,14 @@ pub enum AttentionImplementation {
     PagedAttention,
 }
 
+#[derive(Clone, Copy)]
+#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)]
+pub enum MemoryGpuConfig {
+    Amount(usize),
+    Utilization(f32),
+    ContextSize(usize),
+}
+
 // See `pagedattention.cu` CALL_V1_LAUNCHER_BLOCK_SIZE
 const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32];
 
@@ -69,9 +76,20 @@ macro_rules! mb_to_blocks {
     };
 }
 
+macro_rules! ctxt_to_blocks {
+    ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => {
+        $context_len
+            * $dtype_size
+            * $config.num_kv_heads()
+            * ($config.hidden_size() / $config.num_attn_heads())
+            * $config.num_layers()
+            * 2
+    };
+}
+
 /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32.
 pub fn calculate_cache_config(
-    mem_gpu: Either<usize, f32>,
+    mem_gpu: MemoryGpuConfig,
     mem_cpu: usize,
     block_size: Option<usize>,
     dtype: DType,
@@ -86,16 +104,18 @@ pub fn calculate_cache_config(
 
     #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
    let mem_gpu = match mem_gpu {
-        Either::Left(v) => v,
-        Either::Right(f) => {
+        MemoryGpuConfig::Amount(v) => v,
+        MemoryGpuConfig::Utilization(f) => {
             let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
             let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
             let used = total - free;
-            let size = (total * f - used) as usize;
-            info!("Allocating {size} MB for PagedAttention KV cache");
-            size
+            (total * f - used) as usize
+        }
+        MemoryGpuConfig::ContextSize(toks) => {
+            ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB
         }
     };
+    info!("Allocating {mem_gpu} MB for PagedAttention KV cache");
 
     let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
     let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
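`mb_to_blocks!` then performs the inverse conversion: the MB figure chosen above, whichever variant produced it, is divided by the number of bytes one block of `block_size` tokens occupies. A sketch under the same assumed 7B-class shapes as the previous example; `SIZE_IN_MB = 1024 * 1024` is an assumption made to match the crate's constant.

```rust
// Sketch of the block arithmetic in `calculate_cache_config`.
const SIZE_IN_MB: usize = 1024 * 1024; // assumed value of the crate constant

fn main() {
    let (dtype_size, block_size, kv_heads, head_dim, layers) = (2, 32, 32, 128, 32);
    // Bytes held by one block: block_size tokens of K and V per head per layer.
    let block_bytes = dtype_size * block_size * kv_heads * head_dim * layers * 2;
    let num_gpu_blocks = 2048 * SIZE_IN_MB / block_bytes;
    // 2048 MB buys 128 blocks of 32 tokens: the same 4096-token budget that
    // `MemoryGpuConfig::ContextSize(4096)` would round-trip to.
    assert_eq!(num_gpu_blocks, 128);
}
```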
diff --git a/mistralrs-paged-attn/build.rs b/mistralrs-paged-attn/build.rs
index e8c6a4f3f..e1e6ee544 100644
--- a/mistralrs-paged-attn/build.rs
+++ b/mistralrs-paged-attn/build.rs
@@ -5,32 +5,29 @@ const CUDA_NVCC_FLAGS: Option<&'static str> = option_env!("CUDA_NVCC_FLAGS");
 
 #[cfg(all(feature = "cuda", target_family = "unix"))]
 fn main() -> Result<()> {
-    use std::fs;
-    use std::fs::read_to_string;
     use std::fs::OpenOptions;
     use std::io::prelude::*;
     use std::path::PathBuf;
 
     const OTHER_CONTENT: &str = r#"
 #[cfg(all(feature = "cuda", target_family = "unix"))]
-mod ffi;
+pub const COPY_BLOCKS_KERNEL: &str =
+    include_str!(concat!(env!("OUT_DIR"), "/copy_blocks_kernel.ptx"));
+#[cfg(all(feature = "cuda", target_family = "unix"))]
+pub const PAGEDATTENTION: &str = include_str!(concat!(env!("OUT_DIR"), "/pagedattention.ptx"));
+#[cfg(all(feature = "cuda", target_family = "unix"))]
+pub const RESHAPE_AND_CACHE_KERNEL: &str =
+    include_str!(concat!(env!("OUT_DIR"), "/reshape_and_cache_kernel.ptx"));
+
 #[cfg(all(feature = "cuda", target_family = "unix"))]
 mod backend;
+#[cfg(all(feature = "cuda", target_family = "unix"))]
+mod ffi;
 
 #[cfg(all(feature = "cuda", target_family = "unix"))]
-pub use backend::{{copy_blocks, paged_attention, reshape_and_cache, swap_blocks}};
+pub use backend::{copy_blocks, paged_attention, reshape_and_cache, swap_blocks};
 "#;
 
-    fn read_lines(filename: &str) -> Vec<String> {
-        let mut result = Vec::new();
-
-        for line in read_to_string(filename).unwrap().lines() {
-            result.push(line.to_string())
-        }
-
-        result
-    }
-
     println!("cargo:rerun-if-changed=build.rs");
     println!("cargo:rerun-if-changed=src/pagedattention.cu");
     println!("cargo:rerun-if-changed=src/copy_blocks_kernel.cu");
@@ -57,20 +54,11 @@ fn main() -> Result<()> {
     println!("cargo:rustc-link-lib=pagedattention");
     println!("cargo:rustc-link-lib=dylib=cudart");
 
-    let contents = read_lines("src/lib.rs");
-    for line in contents {
-        if line == "pub mod ffi;" {
-            return Ok(());
-        }
-    }
-    let ct = fs::read_to_string("src/lib.rs")?;
-    if !ct.contains(OTHER_CONTENT) {
-        let mut file = OpenOptions::new().append(true).open("src/lib.rs").unwrap();
+    let mut file = OpenOptions::new().write(true).open("src/lib.rs").unwrap();
 
-        // Add the other stuff back
-        if let Err(e) = writeln!(file, "{OTHER_CONTENT}") {
-            anyhow::bail!("Error while building dependencies: {:?}\n", e)
-        }
+    // Regenerate the CUDA-gated contents of src/lib.rs unconditionally
+    if let Err(e) = writeln!(file, "{OTHER_CONTENT}") {
+        anyhow::bail!("Error while building dependencies: {:?}\n", e)
     }
     Ok(())
 }
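One caveat on the simplified rewrite logic: `OpenOptions::new().write(true)` starts writing at offset zero but does not shrink the file, so if the generated block ever became shorter than what `src/lib.rs` already contains, stale trailing bytes would survive. The patch always writes the same fixed content, so this is harmless here; a defensive variant (an assumption, not part of the change) would add `truncate(true)`:

```rust
use std::fs::{File, OpenOptions};

// Defensive variant (not in this patch): truncate before rewriting so a
// shorter `OTHER_CONTENT` can never leave trailing garbage behind.
fn open_lib_rs() -> std::io::Result<File> {
    OpenOptions::new()
        .write(true)
        .truncate(true)
        .open("src/lib.rs")
}
```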
/// * `key_cache` - Key cache paged tensor of shape `(num_blocks, num_heads_kv, head_size / x, block_size, x)`
-/// with `x` being the size of an element in bytes.
+///   with `x` being the size of an element in bytes.
 /// * `value_cache` - Value cache paged tensor of shape `(num_blocks, num_heads_kv, head_size, block_size)`.
 /// * `block_tables` - Padded table associating blocks to each sequence of shape `(num_sequences, max_context_len // block_size)`
 /// * `context_lens` - Tensor associating lengths to each sequence of shape `(num_sequences)`
@@ -439,7 +439,7 @@ fn update_cache<
 /// * `key` - Key tensor of shape `(num_tokens, num_heads, head_size)`.
 /// * `value` - Value tensor of shape `(num_tokens, num_heads, head_size)`.
 /// * `key_cache` - Key cache paged tensor of shape `(num_blocks, num_heads, head_size / x, block_size, x)`
-/// with `x` being the size of an element in bytes.
+///   with `x` being the size of an element in bytes.
 /// * `value_cache` - Value cache paged tensor of shape `(num_blocks, num_heads, head_size, block_size)`.
 /// * `slot_mapping` - Mapping associating a slot to each token of shape `(num_tokens)`.
 pub fn reshape_and_cache(
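The doc comments corrected above describe the paged cache layouts these kernels consume. Taking the comment at face value (`x` is the element size in bytes, so 2 for f16), the five-dimensional key cache shape splits `head_size` into `head_size / x` chunks of `x` elements. The concrete sizes below are illustrative assumptions, not values from the patch:

```rust
// Illustrative only: the paged key-cache shape described in the doc
// comments, with `x` taken as the element size in bytes (2 for f16).
fn key_cache_shape(
    num_blocks: usize,
    num_heads_kv: usize,
    head_size: usize,
    block_size: usize,
    x: usize,
) -> [usize; 5] {
    [num_blocks, num_heads_kv, head_size / x, block_size, x]
}

fn main() {
    // 128 blocks of 32 tokens, 32 KV heads, head_size 128, f16 elements.
    assert_eq!(key_cache_shape(128, 32, 128, 32, 2), [128, 32, 64, 32, 2]);
}
```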
diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi
index 7d392c433..edecc7a92 100644
--- a/mistralrs-pyo3/mistralrs.pyi
+++ b/mistralrs-pyo3/mistralrs.pyi
@@ -203,8 +203,16 @@ class Runner:
             the corresponding number of layers.
         - `in_situ_quant` sets the optional in-situ quantization for models that are not quantized (not GGUF or GGML).
         - `anymoe_config` specifies the AnyMoE config. If this is set, then the model will be loaded as an AnyMoE model.
-        - `pa_gpu_mem` sets GPU memory to allocate for KV cache with PagedAttention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is
-            CUDA, it will default to using 90% of the total memory after allocation of the KV cache. PagedAttention is only supported on CUDA and is always automatically activated.
+        - `pa_gpu_mem`: GPU memory to allocate for KV cache with PagedAttention in MBs.
+            PagedAttention is only supported on CUDA and is always automatically activated.
+            The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`.
+        - `pa_gpu_mem_usage`: Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1.
+            If this is not set and the device is CUDA, it will default to `0.9`.
+            PagedAttention is only supported on CUDA and is always automatically activated.
+            The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`.
+        - `pa_ctxt_len`: Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold)
+            when using PagedAttention, which is only supported on CUDA and is always automatically activated.
+            The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`.
         - `pa_blk_size` sets the block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32.
             PagedAttention is only supported on CUDA and is always automatically activated.
         - `no_paged_attn` disables PagedAttention on CUDA
diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs
index 8ff8ad95a..c008c306d 100644
--- a/mistralrs-pyo3/src/lib.rs
+++ b/mistralrs-pyo3/src/lib.rs
@@ -21,10 +21,10 @@ use candle_core::Device;
 use mistralrs_core::{
     initialize_logging, paged_attn_supported, AnyMoeLoader, ChatCompletionResponse,
     CompletionResponse, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata,
-    DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader, MistralRs,
-    MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig,
-    PagedAttentionConfig, Request as _Request, RequestMessage, Response, SamplingParams,
-    SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource,
+    DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader,
+    MemoryGpuConfig, MistralRs, MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest,
+    NormalSpecificConfig, PagedAttentionConfig, Request as _Request, RequestMessage, Response,
+    SamplingParams, SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource,
     VisionLoaderBuilder, VisionSpecificConfig,
 };
 use pyo3::{
@@ -300,6 +300,8 @@ impl Runner {
         in_situ_quant = None,
         anymoe_config = None,
         pa_gpu_mem = None,
+        pa_gpu_mem_usage = None,
+        pa_ctxt_len = None,
         pa_blk_size = None,
         no_paged_attn = false,
     ))]
@@ -315,7 +317,9 @@ impl Runner {
         num_device_layers: Option<Vec<String>>,
         in_situ_quant: Option<String>,
         anymoe_config: Option<AnyMoeConfig>,
-        pa_gpu_mem: Option<Either<usize, f32>>,
+        pa_gpu_mem: Option<usize>,
+        pa_gpu_mem_usage: Option<f32>,
+        pa_ctxt_len: Option<usize>,
         pa_blk_size: Option<usize>,
         no_paged_attn: bool,
     ) -> PyResult<Self> {
@@ -428,22 +432,44 @@ impl Runner {
         // Allocate 0.5 GB of CPU memory just as a placeholder.
         // Nothing happens here as we have no `swap_out`, see `_preempt_by_swap`.
-        let cache_config = match (
-            pa_blk_size,
-            pa_gpu_mem,
-            paged_attn_supported(),
-            no_paged_attn,
-        ) {
-            (block_size, None, true, false) => Some(PagedAttentionConfig::new(
-                block_size,
-                512,
-                Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
-            )?),
-            (block_size, Some(either), true, false) => {
-                Some(PagedAttentionConfig::new(block_size, 512, either)?)
-            }
-            (_, _, _, _) => None,
-        };
+        let cache_config =
+            match (
+                pa_blk_size,
+                pa_gpu_mem,
+                pa_gpu_mem_usage,
+                pa_ctxt_len,
+                paged_attn_supported(),
+                no_paged_attn,
+            ) {
+                (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new(
+                    block_size,
+                    512,
+                    MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
+                )?),
+                (block_size, None, None, Some(ctxt), true, false) => Some(
+                    PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::ContextSize(ctxt))?,
+                ),
+                (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new(
+                    block_size,
+                    512,
+                    MemoryGpuConfig::Utilization(f),
+                )?),
+                (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new(
+                    block_size,
+                    512,
+                    MemoryGpuConfig::Amount(m),
+                )?),
+                (block_size, Some(_m), Some(f), None, true, false) => Some(
+                    PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::Utilization(f))?,
+                ),
+                (block_size, Some(_m), None, Some(ctxt), true, false) => Some(
+                    PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::ContextSize(ctxt))?,
+                ),
+                (block_size, None, Some(f), Some(_ctxt), true, false) => Some(
+                    PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::Utilization(f))?,
+                ),
+                (_, _, _, _, _, _) => None,
+            };
 
         let pipeline = loader
             .load_model_from_hf(
#[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, + /// Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold) + /// when using PagedAttention, which is only supported on CUDA and is always automatically activated. + /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`. + #[arg(long = "pa-ctxt-len")] + paged_ctxt_len: Option, + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] @@ -351,31 +358,55 @@ async fn main() -> Result<()> { args.paged_attn_block_size, args.paged_attn_gpu_mem, args.paged_attn_gpu_mem_usage, + args.paged_ctxt_len, paged_attn_supported(), args.no_paged_attn, ) { - (block_size, None, None, true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory + MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory )?), - (block_size, Some(m), None, true, false) => { - Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?) - } - (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, Some(ctxt), true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?), + (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?), + (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Amount(m), )?), - (block_size, Some(_m), Some(f), true, false) => { - info!("Both memory size and usage were specified, defaulting to the usage value."); + (block_size, Some(_m), Some(f), None, true, false) => { + info!("Both memory size, and usage were specified, defaulting to the usage value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?) + } + (block_size, Some(_m), None, Some(ctxt), true, false) => { + info!("All memory size and ctxt len, defaulting to the context len value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?) + } + (block_size, None, Some(f), Some(_ctxt), true, false) => { + info!("Both ctxt len and usage were specified, defaulting to the usage value."); Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?) 
diff --git a/mistralrs/examples/paged_attn/main.rs b/mistralrs/examples/paged_attn/main.rs
index a6fb014a1..5e8bdbc52 100644
--- a/mistralrs/examples/paged_attn/main.rs
+++ b/mistralrs/examples/paged_attn/main.rs
@@ -10,8 +10,8 @@ use std::sync::Arc;
 use tokio::sync::mpsc::channel;
 
 use mistralrs::{
-    Constraint, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder, ModelDType,
-    NormalLoaderBuilder, NormalLoaderType, NormalRequest, NormalSpecificConfig,
+    Constraint, Device, DeviceMapMetadata, MemoryGpuConfig, MistralRs, MistralRsBuilder,
+    ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalRequest, NormalSpecificConfig,
     PagedAttentionConfig, Request, RequestMessage, Response, Result, SamplingParams,
     SchedulerConfig, TokenSource,
 };
@@ -51,7 +51,7 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> {
         Some(PagedAttentionConfig::new(
             Some(32),
             1024,
-            Either::Right(0.9),
+            MemoryGpuConfig::Utilization(0.9),
         )?), // Automatically determine memory usage
     )?;
     let config = pipeline