From b033d7bf26a691ac7255c5aba00e06a8065432c5 Mon Sep 17 00:00:00 2001
From: Laurent Sorber
Date: Fri, 16 Aug 2024 14:00:39 +0200
Subject: [PATCH] feat: optimize config for CPU and GPU

---
 src/raglite/_config.py | 22 +++++++++++++++++-----
 tests/conftest.py      | 13 ++-----------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/raglite/_config.py b/src/raglite/_config.py
index e178078..f217712 100644
--- a/src/raglite/_config.py
+++ b/src/raglite/_config.py
@@ -5,20 +5,31 @@
 import numpy as np
 import numpy.typing as npt
-from llama_cpp import Llama, LlamaRAMCache  # type: ignore[attr-defined]
+from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload  # type: ignore[attr-defined]
 from sqlalchemy.engine import URL
 
 
 @lru_cache(maxsize=1)
 def default_llm() -> Llama:
     """Get default LLM."""
+    # Select the best available LLM for the given accelerator.
+    if llama_supports_gpu_offload():
+        # Llama-3.1-8B-instruct on GPU.
+        repo_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"  # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
+        filename = "*Q4_K_M.gguf"
+    else:
+        # Phi-3.1-mini-128k-instruct on CPU.
+        repo_id = "bartowski/Phi-3.1-mini-128k-instruct-GGUF"  # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct
+        filename = "*Q4_K_M.gguf"
+    # Load the LLM.
     llm = Llama.from_pretrained(
-        repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",  # https://github.com/meta-llama/llama-models
-        filename="*Q4_K_M.gguf",
+        repo_id=repo_id,
+        filename=filename,
         n_ctx=8192,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,
     )
+    # Enable caching.
     llm.set_cache(LlamaRAMCache())
     return llm
 
 
@@ -26,9 +37,10 @@ def default_llm() -> Llama:
 @lru_cache(maxsize=1)
 def default_embedder() -> Llama:
     """Get default embedder."""
+    # Load the embedder.
     embedder = Llama.from_pretrained(
-        repo_id="ChristianAzinn/snowflake-arctic-embed-l-gguf",  # https://github.com/Snowflake-Labs/arctic-embed
-        filename="*f16.GGUF",
+        repo_id="yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF",  # https://github.com/Snowflake-Labs/arctic-embed
+        filename="*q8_0.gguf",
         n_ctx=0,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,

diff --git a/tests/conftest.py b/tests/conftest.py
index 52a5d6d..c647d6a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 """Fixtures for the tests."""
 
 import pytest
-from llama_cpp import Llama, LlamaRAMCache  # type: ignore[attr-defined]
+from llama_cpp import Llama
 
 from raglite import RAGLiteConfig
 
@@ -9,15 +9,6 @@
 @pytest.fixture()
 def simple_config() -> RAGLiteConfig:
     """Create a lightweight in-memory config for testing."""
-    # Use a lightweight LLM.
-    llm = Llama.from_pretrained(
-        repo_id="bartowski/Phi-3.1-mini-4k-instruct-GGUF",  # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
-        filename="*Q4_K_M.gguf",
-        n_ctx=4096,  # 0 = Use the model's context size (default is 512).
-        n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
-        verbose=False,
-    )
-    llm.set_cache(LlamaRAMCache())
     # Use a lightweight embedder.
     embedder = Llama.from_pretrained(
         repo_id="ChristianAzinn/snowflake-arctic-embed-xs-gguf",  # https://github.com/Snowflake-Labs/arctic-embed
@@ -30,5 +21,5 @@ def simple_config() -> RAGLiteConfig:
     # Use an in-memory SQLite database.
     db_url = "sqlite:///:memory:"
     # Create the config.
-    config = RAGLiteConfig(llm=llm, embedder=embedder, db_url=db_url)
+    config = RAGLiteConfig(embedder=embedder, db_url=db_url)
     return config
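
Note: a minimal sketch of how the new defaults behave, using only names this
diff introduces. It assumes llama-cpp-python is installed and that the GGUF
models are downloaded from Hugging Face on first use (as Llama.from_pretrained
does); it is illustrative, not part of the patch.

    from llama_cpp import llama_supports_gpu_offload
    from raglite._config import default_embedder, default_llm

    # default_llm() now selects its model at call time:
    # - GPU offload available -> bartowski/Meta-Llama-3.1-8B-Instruct-GGUF (Q4_K_M)
    # - CPU only              -> bartowski/Phi-3.1-mini-128k-instruct-GGUF (Q4_K_M)
    llm = default_llm()

    # The default embedder is now a Q8_0 quant of snowflake-arctic-embed-m-v1.5
    # on both CPU and GPU (previously an f16 quant of the large variant).
    embedder = default_embedder()

    print("GPU offload available:", llama_supports_gpu_offload())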