Commit b033d7b

feat: optimize config for CPU and GPU

lsorber committed Aug 16, 2024
1 parent f9b92cf · commit b033d7b
Showing 2 changed files with 19 additions and 16 deletions.
src/raglite/_config.py · 22 changes: 17 additions & 5 deletions

@@ -5,30 +5,42 @@

 import numpy as np
 import numpy.typing as npt
-from llama_cpp import Llama, LlamaRAMCache  # type: ignore[attr-defined]
+from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload  # type: ignore[attr-defined]
 from sqlalchemy.engine import URL


 @lru_cache(maxsize=1)
 def default_llm() -> Llama:
     """Get default LLM."""
+    # Select the best available LLM for the given accelerator.
+    if llama_supports_gpu_offload():
+        # Llama-3.1-8B-instruct on GPU.
+        repo_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"  # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
+        filename = "*Q4_K_M.gguf"
+    else:
+        # Phi-3.1-mini-128k-instruct on CPU.
+        repo_id = "bartowski/Phi-3.1-mini-128k-instruct-GGUF"  # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct
+        filename = "*Q4_K_M.gguf"
     # Load the LLM.
     llm = Llama.from_pretrained(
-        repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",  # https://github.com/meta-llama/llama-models
-        filename="*Q4_K_M.gguf",
+        repo_id=repo_id,
+        filename=filename,
         n_ctx=8192,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,
     )
     # Enable caching.
     llm.set_cache(LlamaRAMCache())
     return llm


 @lru_cache(maxsize=1)
 def default_embedder() -> Llama:
     """Get default embedder."""
     # Load the embedder.
     embedder = Llama.from_pretrained(
-        repo_id="ChristianAzinn/snowflake-arctic-embed-l-gguf",  # https://github.com/Snowflake-Labs/arctic-embed
-        filename="*f16.GGUF",
+        repo_id="yishan-wang/snowflake-arctic-embed-m-v1.5-Q8_0-GGUF",  # https://github.com/Snowflake-Labs/arctic-embed
+        filename="*q8_0.gguf",
         n_ctx=0,  # 0 = Use the model's context size (default is 512).
         n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
         verbose=False,
 … (remainder of hunk not shown)
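
For context, a minimal sketch (not part of the commit) of how the new accelerator check behaves at runtime. It uses only names visible in the diff above plus create_chat_completion, a standard llama-cpp-python method; the import path raglite._config follows the file path in this diff.

    from llama_cpp import llama_supports_gpu_offload

    from raglite._config import default_llm

    # True when llama.cpp was compiled with GPU offload (e.g. CUDA or Metal);
    # this is the same check default_llm() now uses to choose between
    # Llama-3.1-8B (GPU) and Phi-3.1-mini-128k (CPU).
    print("GPU offload available:", llama_supports_gpu_offload())

    # @lru_cache(maxsize=1) means the model is downloaded and loaded once;
    # subsequent calls return the same Llama instance.
    llm = default_llm()
    reply = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Say hello in one word."}],
        max_tokens=8,
    )
    print(reply["choices"][0]["message"]["content"])
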
tests/conftest.py · 13 changes: 2 additions & 11 deletions

@@ -1,23 +1,14 @@
 """Fixtures for the tests."""

 import pytest
-from llama_cpp import Llama, LlamaRAMCache  # type: ignore[attr-defined]
+from llama_cpp import Llama

 from raglite import RAGLiteConfig


 @pytest.fixture()
 def simple_config() -> RAGLiteConfig:
     """Create a lightweight in-memory config for testing."""
-    # Use a lightweight LLM.
-    llm = Llama.from_pretrained(
-        repo_id="bartowski/Phi-3.1-mini-4k-instruct-GGUF",  # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
-        filename="*Q4_K_M.gguf",
-        n_ctx=4096,  # 0 = Use the model's context size (default is 512).
-        n_gpu_layers=-1,  # -1 = Offload all layers to the GPU (default is 0).
-        verbose=False,
-    )
-    llm.set_cache(LlamaRAMCache())
     # Use a lightweight embedder.
     embedder = Llama.from_pretrained(
         repo_id="ChristianAzinn/snowflake-arctic-embed-xs-gguf",  # https://github.com/Snowflake-Labs/arctic-embed
 … (collapsed lines not shown)
@@ -30,5 +21,5 @@ def simple_config() -> RAGLiteConfig:
     # Use an in-memory SQLite database.
     db_url = "sqlite:///:memory:"
     # Create the config.
-    config = RAGLiteConfig(llm=llm, embedder=embedder, db_url=db_url)
+    config = RAGLiteConfig(embedder=embedder, db_url=db_url)
     return config
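
A minimal sketch (not part of the commit) of a test consuming the simplified fixture; the test name and assertion are illustrative, not taken from the repository.

    from raglite import RAGLiteConfig


    def test_simple_config(simple_config: RAGLiteConfig) -> None:
        # The fixture no longer constructs its own LLM; after this commit,
        # RAGLiteConfig presumably falls back to the accelerator-aware
        # default_llm() from raglite._config for its llm field.
        assert isinstance(simple_config, RAGLiteConfig)
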
