This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Merge pull request #325 from LLukas22/feat/cuda-opencl-acceleration
CUDA/OpenCL Acceleration
philpax authored Jul 16, 2023
2 parents 0269796 + d815857 commit 3062a08
Showing 31 changed files with 1,244 additions and 634 deletions.
2 changes: 2 additions & 0 deletions binaries/generate-ggml-bindings/src/main.rs
@@ -23,6 +23,8 @@ fn main() {

fn generate_main(ggml_path: &Path, src_path: &Path) {
let bindings = bindgen::Builder::default()
.header(ggml_path.join("ggml.h").to_str().unwrap().to_string())
.allowlist_file(r".*ggml.h")
.header(ggml_path.join("k_quants.h").to_string_lossy())
.allowlist_file(r".*k_quants.h")
// Suppress some warnings
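The two added lines register ggml's k_quants.h with bindgen and allowlist it so only its declarations are emitted. A minimal standalone sketch of that header-plus-allowlist pattern, with a purely hypothetical ggml-cuda.h shown in comments to illustrate how further accelerator headers would be wired in (output path is also an assumption, not taken from this commit):

use std::path::Path;

// Sketch of the bindgen pattern used above; the output path and the commented
// CUDA header are hypothetical, not taken from this commit.
fn generate_bindings(ggml_path: &Path, out_path: &Path) {
    bindgen::Builder::default()
        .header(ggml_path.join("ggml.h").to_string_lossy())
        .allowlist_file(r".*ggml.h")
        .header(ggml_path.join("k_quants.h").to_string_lossy())
        .allowlist_file(r".*k_quants.h")
        // A CUDA header would follow the same shape:
        // .header(ggml_path.join("ggml-cuda.h").to_string_lossy())
        // .allowlist_file(r".*ggml-cuda.h")
        .generate()
        .expect("unable to generate bindings")
        .write_to_file(out_path.join("bindings.rs"))
        .expect("couldn't write bindings");
}
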
9 changes: 7 additions & 2 deletions binaries/llm-cli/src/cli_args.rs
@@ -335,6 +335,8 @@ impl Generate {
memory_k_type: mem_typ,
memory_v_type: mem_typ,
use_gpu: self.use_gpu,
n_batch: self.batch_size,
n_threads: self.num_threads(),
}
}

@@ -348,8 +350,6 @@

pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters {
InferenceParameters {
n_threads: self.num_threads(),
n_batch: self.batch_size,
sampler: Arc::new(llm::samplers::TopPTopK {
top_k: self.top_k,
top_p: self.top_p,
@@ -457,6 +457,10 @@ pub struct ModelLoad {
/// LoRA adapter to use for the model
#[arg(long, num_args(0..))]
pub lora_paths: Option<Vec<PathBuf>>,

/// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU.
#[arg(long)]
pub gpu_layers: Option<usize>,
}
impl ModelLoad {
pub fn load(&self, use_gpu: bool) -> eyre::Result<Box<dyn Model>> {
@@ -465,6 +469,7 @@ impl ModelLoad {
context_size: self.num_ctx_tokens,
lora_adapters: self.lora_paths.clone(),
use_gpu,
gpu_layers: self.gpu_layers,
};

let mut sp = Some(spinoff::Spinner::new(
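For downstream users of the llm crate, the new options end up on ModelParameters. A minimal sketch of constructing them by hand (the field names come from the diff above; the layer count is an arbitrary example, and Default is assumed for the remaining fields, as in earlier releases):

// Sketch only: assumes ModelParameters still implements Default.
fn gpu_model_parameters() -> llm::ModelParameters {
    llm::ModelParameters {
        use_gpu: true,        // enable the CUDA/OpenCL/Metal backend
        gpu_layers: Some(32), // offload 32 layers; None offloads all of them
        ..Default::default()
    }
}
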
6 changes: 2 additions & 4 deletions binaries/llm-cli/src/interactive.rs
@@ -34,7 +34,7 @@ pub fn repl(
.as_deref()
.map(|template| util::process_prompt(template, &line))
.unwrap_or(line);
feed_prompt_with_spinner(model, &mut session, &parameters, prompt)?;
feed_prompt_with_spinner(model, &mut session, prompt)?;

session.infer::<Infallible>(
model,
@@ -79,7 +79,7 @@ pub fn chat(args: &Chat) -> eyre::Result<()> {

let model = model.as_ref();
let mut session = create_session(model, inference_session_config);
feed_prompt_with_spinner(model, &mut session, &parameters, prelude_prompt)?;
feed_prompt_with_spinner(model, &mut session, prelude_prompt)?;

readline_loop(|raw_line| {
let prompt = {
@@ -134,7 +134,6 @@ fn initialize_common_state(
fn feed_prompt_with_spinner(
model: &dyn llm::Model,
session: &mut llm::InferenceSession,
parameters: &llm::InferenceParameters,
mut prompt: String,
) -> eyre::Result<()> {
// Add a newline to the beginning of the prompt if the last character in the session is not a newline
@@ -145,7 +144,6 @@ fn feed_prompt_with_spinner(
let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None);
let result = session.feed_prompt(
model,
parameters,
&prompt,
// OutputRequest
&mut Default::default(),
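After this change, feed_prompt no longer takes an InferenceParameters argument; threading and batching are configured on the session instead. A hedged sketch of the new call shape (trait bounds and re-export paths are assumed from the diffs in this commit):

use std::convert::Infallible;

// Sketch of the post-change feed_prompt call; the always-continue callback
// mirrors the helpers used elsewhere in this commit.
fn feed(
    model: &dyn llm::Model,
    session: &mut llm::InferenceSession,
    prompt: &str,
) -> Result<(), llm::InferenceError> {
    session.feed_prompt(
        model,
        prompt,
        // OutputRequest
        &mut Default::default(),
        |_bytes: &[u8]| Ok::<llm::InferenceFeedback, Infallible>(llm::InferenceFeedback::Continue),
    )
}
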
12 changes: 3 additions & 9 deletions binaries/llm-cli/src/main.rs
@@ -113,16 +113,10 @@ fn perplexity(args: &cli_args::Perplexity) -> eyre::Result<()> {
let model = args.model_load.load(args.generate.use_gpu)?;
let (mut session, _) =
snapshot::read_or_create_session(model.as_ref(), None, None, inference_session_config);
let parameters = args.generate.inference_parameters(model.eot_token_id());

session.perplexity(
model.as_ref(),
&parameters,
prompt.as_str(),
|chunk, perplexity| {
println!("Perplexity[{chunk}]: {perplexity}");
},
)?;
session.perplexity(model.as_ref(), prompt.as_str(), |chunk, perplexity| {
println!("Perplexity[{chunk}]: {perplexity}");
})?;

Ok(())
}
2 changes: 1 addition & 1 deletion binaries/llm-test/src/delete.rs
@@ -64,7 +64,7 @@ fn feed_prompt(
model: &impl Model,
output: &mut OutputRequest,
) -> Result<(), llm::InferenceError> {
session.feed_prompt(model, &Default::default(), prompt, output, always_continue)
session.feed_prompt(model, prompt, output, always_continue)
}

fn always_continue(_: &[u8]) -> Result<InferenceFeedback, Infallible> {
18 changes: 6 additions & 12 deletions binaries/llm-test/src/inference.rs
@@ -4,7 +4,7 @@

use std::{convert::Infallible, sync::Arc};

use llm::InferenceStats;
use llm::{InferenceSessionConfig, InferenceStats};

use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta};

@@ -15,14 +15,11 @@ pub(crate) fn can_infer(
expected_output: Option<&str>,
maximum_token_count: usize,
) -> anyhow::Result<TestCaseReport> {
let mut session = model.start_session(Default::default());
let (actual_output, res) = run_inference(
model,
model_config,
&mut session,
input,
maximum_token_count,
);
let mut session = model.start_session(InferenceSessionConfig {
n_threads: model_config.threads,
..Default::default()
});
let (actual_output, res) = run_inference(model, &mut session, input, maximum_token_count);

// Process the results
Ok(TestCaseReport {
@@ -58,7 +55,6 @@

fn run_inference(
model: &dyn llm::Model,
model_config: &ModelConfig,
session: &mut llm::InferenceSession,
input: &str,
maximum_token_count: usize,
@@ -70,8 +66,6 @@ fn run_inference(
&llm::InferenceRequest {
prompt: input.into(),
parameters: &llm::InferenceParameters {
n_threads: model_config.threads,
n_batch: 1,
sampler: Arc::new(DeterministicSampler),
},
play_back_previous_tokens: false,
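The thread and batch settings that used to live on InferenceParameters now belong to InferenceSessionConfig, so they are fixed when the session is created. A minimal sketch (other config fields are assumed to keep their Default values):

// Sketch: n_threads/n_batch are session-level settings after this commit.
fn start_session(model: &dyn llm::Model, threads: usize, batch: usize) -> llm::InferenceSession {
    model.start_session(llm::InferenceSessionConfig {
        n_threads: threads,
        n_batch: batch,
        ..Default::default()
    })
}
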
4 changes: 1 addition & 3 deletions binaries/llm-test/src/tokens.rs
@@ -65,9 +65,7 @@ fn feed_prompt(
model: &impl Model,
output: &mut OutputRequest,
) -> Result<(), llm::InferenceError> {
session.feed_prompt(model, &Default::default(), prompt, output, |x| {
always_continue(x)
})
session.feed_prompt(model, prompt, output, always_continue)
}

fn always_continue(_: &[u8]) -> Result<InferenceFeedback, Infallible> {
57 changes: 21 additions & 36 deletions crates/ggml/src/metal.rs → crates/ggml/src/accelerator/metal.rs
@@ -1,6 +1,6 @@
//! Metal support.
use crate::{sys::metal, Buffer, ComputationGraph, Context, Tensor};
use std::{ffi::c_void, ptr::NonNull, sync::Arc};
use std::{ptr::NonNull, sync::Arc};

/// Acts as a RAII-guard over a `sys::metal::ggml_metal_context`, allocating via
/// `ggml_metal_init` and dropping via `ggml_metal_free`.
@@ -14,8 +14,8 @@ pub struct MetalContext {

impl MetalContext {
/// Create a new Metal context
pub fn new() -> Self {
let raw = unsafe { metal::ggml_metal_init() };
pub fn new(n_threads: usize) -> Self {
let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) };

MetalContext {
contexts: vec![],
@@ -45,47 +45,32 @@ impl MetalContext {

/// Add a context's memory as buffer to this Metal context
pub fn add_context(&mut self, from_context: Arc<Context>) {
if self.ref_context(from_context.clone()) {
unsafe {
let raw_context = from_context.ptr.as_ptr();

let (data_ptr, data_size): (*mut c_void, usize) =
if let Some(ref mmap) = from_context.mmap {
// This is a bit naughty...
(mmap.as_ptr().cast_mut().cast(), mmap.len())
} else {
(
ggml_sys::ggml_get_mem_buffer(raw_context),
ggml_sys::ggml_get_mem_size(raw_context),
)
};

let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context);
assert!(
metal::ggml_metal_add_buffer(
self.ptr.as_ptr(),
"wt\0".as_ptr().cast(), // FIXME provide an actual name
data_ptr,
data_size,
max_size
),
"Could not add weight buffer to metal context"
);
}
if !self.ref_context(from_context.clone()) {
return;
}
}
}

impl Default for MetalContext {
fn default() -> Self {
Self::new()
unsafe {
let raw_context = from_context.as_ptr();
let (data_ptr, data_size) = from_context.storage().as_ptr_and_size(&from_context);
let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context);
assert!(
metal::ggml_metal_add_buffer(
self.ptr.as_ptr(),
"wt\0".as_ptr().cast(), // FIXME provide an actual name
data_ptr,
data_size,
max_size
),
"Could not add weight buffer to metal context"
);
}
}
}

impl MetalContext {
/// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before.
fn ref_context(&mut self, context: Arc<Context>) -> bool {
if self.contexts.iter().any(|c| c.ptr == context.ptr) {
if self.contexts.iter().any(|c| *c == context) {
false
} else {
self.contexts.push(context);
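MetalContext::new now takes the thread count up front (ggml_metal_init requires it), and buffer registration goes through the context's storage accessor instead of poking at the mmap directly. A tiny usage sketch, assuming the accelerator module is public and the metal feature is enabled:

// Sketch only: module path assumed from the new crates/ggml/src/accelerator layout.
#[cfg(feature = "metal")]
fn make_metal_context(n_threads: usize) -> ggml::accelerator::metal::MetalContext {
    ggml::accelerator::metal::MetalContext::new(n_threads)
}
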
94 changes: 94 additions & 0 deletions crates/ggml/src/accelerator/mod.rs
@@ -0,0 +1,94 @@
//! Functionality related to hardware acceleration of GGML (GPU, etc.)
use crate::sys;

#[cfg(feature = "metal")]
pub mod metal;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
/// Accelerators supported by `ggml`.
pub enum Accelerator {
/// CuBLAS accelerated
CuBLAS,
/// CLBlast accelerated
CLBlast,
/// Metal accelerated
Metal,
/// No accelerator (CPU only)
None,
}

/// Returns the accelerator `ggml` was compiled with.
pub fn get_accelerator() -> Accelerator {
#[cfg(feature = "cublas")]
return Accelerator::CuBLAS;
#[cfg(feature = "clblast")]
return Accelerator::CLBlast;
#[cfg(feature = "metal")]
return Accelerator::Metal;
#[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))]
return Accelerator::None;
}

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
/// Backend to use for a tensor.
pub enum Backend {
/// CPU backend
#[default]
Cpu,
/// GPU backend
Gpu,
/// Multi-GPU backend
GpuSplit,
}

impl From<Backend> for sys::ggml_backend {
fn from(b: Backend) -> Self {
match b {
Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU,
Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU,
Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT,
}
}
}

impl TryFrom<sys::ggml_backend> for Backend {
type Error = ();
fn try_from(b: sys::ggml_backend) -> Result<Self, Self::Error> {
match b {
sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu),
sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu),
sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit),
_ => Err(()),
}
}
}

/// Initializes the accelerator. If ggml-sys is compiled with CUDA or CLBlast support, this function will initialize the accelerator; otherwise it is a no-op.
#[allow(unused_variables)]
pub fn initialize(device: i32) {
#[cfg(feature = "cublas")]
unsafe {
//TODO: Make this configurable
sys::cuda::ggml_init_cublas();
sys::cuda::ggml_cuda_set_main_device(device);
let split = 1.0f32;
sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
}
}

/// Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size; otherwise it is a no-op.
#[allow(unused_variables)]
pub fn set_scratch_size(size: usize) {
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_set_scratch_size(size);
}
}

/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory; otherwise it is a no-op.
pub fn free_scratch() {
#[cfg(feature = "cublas")]
unsafe {
sys::cuda::ggml_cuda_free_scratch();
}
}
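
A short sketch of how the new module could be driven from the ggml crate: query the compiled-in accelerator, then initialize it. The device index 0 is an arbitrary example, and initialize/set_scratch_size are no-ops unless the cublas feature is enabled (module assumed to be public):

// Sketch exercising the new accelerator API.
fn init_accelerator() {
    use ggml::accelerator::{get_accelerator, initialize, Accelerator};

    match get_accelerator() {
        Accelerator::None => println!("no accelerator compiled in; staying on CPU"),
        other => {
            println!("using accelerator: {other:?}");
            initialize(0); // main device index; only meaningful for CUDA builds
        }
    }
}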
(Diffs for the remaining changed files were not loaded in this capture.)
