diff --git a/README.md b/README.md
index 86ed1bf2..8f5d96ec 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 A package for RAG applications.
 
 ______________________________________________________________________
 
-[![Code Coverage](https://img.shields.io/badge/Coverage-72%25-yellow.svg)](https://github.com/alexandrainst/ragger/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-82%25-yellowgreen.svg)](https://github.com/alexandrainst/ragger/tree/main/tests)
 
 Developer(s):
diff --git a/config/config.yaml b/config/config.yaml
index 21a7eb72..81d8f828 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -15,11 +15,12 @@ document_store_filename: document_store.jsonl
 
 # EmbeddingStore parameters
 embedding_store_type: numpy
+num_documents_to_retrieve: 3
 
 # Embedder parameters
 embedder_type: e5
+document_text_field: text
 embedder_id: intfloat/multilingual-e5-large
-num_documents_to_retrieve: 3
 
 # Generator parameters
 generator_type: openai
diff --git a/src/ragger/embedder.py b/src/ragger/embedder.py
index b0bfc8bf..5ef51f43 100644
--- a/src/ragger/embedder.py
+++ b/src/ragger/embedder.py
@@ -1,12 +1,21 @@
 """Embed documents using a pre-trained model."""
 
+import logging
+import os
+import re
 from abc import ABC, abstractmethod
 
 import numpy as np
+import torch
 from omegaconf import DictConfig
+from sentence_transformers import SentenceTransformer
 
 from .utils import Document
 
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+logger = logging.getLogger(__name__)
+
 
 class Embedder(ABC):
     """An abstract embedder, which embeds documents using a pre-trained model."""
@@ -37,4 +46,89 @@ def embed_documents(self, documents: list[Document]) -> np.ndarray:
 class E5Embedder(Embedder):
     """An embedder that uses an E5 model to embed documents."""
 
-    pass
+    def __init__(self, config: DictConfig) -> None:
+        """Initialise the E5 embedder.
+
+        Args:
+            config:
+                The Hydra configuration.
+        """
+        super().__init__(config)
+        self.embedder = SentenceTransformer(self.config.embedder_id)
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    def embed_documents(self, documents: list[Document]) -> np.ndarray:
+        """Embed a list of documents using an E5 model.
+
+        Args:
+            documents:
+                A list of documents to embed.
+
+        Returns:
+            An array of embeddings, where each row corresponds to a document.
+        """
+        # Prepare the texts for embedding
+        texts = [document.text for document in documents]
+        prepared_texts = self._prepare_texts_for_embedding(texts=texts)
+
+        # Embed the texts
+        embeddings = self.embedder.encode(
+            sentences=prepared_texts,
+            normalize_embeddings=True,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+        )
+        assert isinstance(embeddings, np.ndarray)
+        return embeddings
+
+    def embed_query(self, query: str) -> np.ndarray:
+        """Embed a query.
+
+        Args:
+            query:
+                A query.
+
+        Returns:
+            The embedding of the query.
+        """
+        prepared_query = self._prepare_query_for_embedding(query=query)
+        query_embedding = self.embedder.encode(
+            sentences=[prepared_query],
+            normalize_embeddings=True,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+            device=self.device,
+        )[0]
+        return query_embedding
+
+    def _prepare_texts_for_embedding(self, texts: list[str]) -> list[str]:
+        """This prepares texts for embedding.
+
+        The precise preparation depends on the embedding model and use case.
+
+        Args:
+            texts:
+                The texts to prepare.
+
+        Returns:
+            The prepared texts.
+        """
+        passages = [
+            "passage: " + re.sub(r"^passage: ", "", passage) for passage in texts
+        ]
+        return passages
+
+    def _prepare_query_for_embedding(self, query: str) -> str:
+        """This prepares a query for embedding.
+
+        The precise preparation depends on the embedding model.
+
+        Args:
+            query:
+                A query.
+
+        Returns:
+            A prepared query.
+        """
+        query = "query: " + re.sub(r"^query: ", "", query)
+        return query
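A note on the preparation helpers above: E5 models expect every input to carry a `"query: "` or `"passage: "` prefix, and the `re.sub` calls strip any existing prefix before re-adding it, making the preparation idempotent. A minimal standalone sketch of that behaviour (`prepare_passage` is a hypothetical mirror of `_prepare_texts_for_embedding`, not part of the package):

```python
import re

def prepare_passage(text: str) -> str:
    # Strip any existing "passage: " prefix before re-adding it, so that
    # preparing a text twice gives the same result
    return "passage: " + re.sub(r"^passage: ", "", text)

assert prepare_passage("Hello, world!") == "passage: Hello, world!"
assert prepare_passage("passage: Hello, world!") == "passage: Hello, world!"
```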
+ """ + passages = [ + "passage: " + re.sub(r"^passage: ", "", passage) for passage in texts + ] + return passages + + def _prepare_query_for_embedding(self, query: str) -> str: + """This prepares a query for embedding. + + The precise preparation depends on the embedding model. + + Args: + query: + A query. + + Returns: + A prepared query. + """ + query = "query: " + re.sub(r"^query: ", "", query) + return query diff --git a/src/ragger/embedding_store.py b/src/ragger/embedding_store.py index 9fec9667..852d1866 100644 --- a/src/ragger/embedding_store.py +++ b/src/ragger/embedding_store.py @@ -1,9 +1,11 @@ """Store and fetch embeddings from a database.""" from abc import ABC, abstractmethod +from pathlib import Path import numpy as np from omegaconf import DictConfig +from transformers import AutoConfig from .utils import Index @@ -47,4 +49,84 @@ def get_nearest_neighbours(self, embedding: np.ndarray) -> list[Index]: class NumpyEmbeddingStore(EmbeddingStore): """An embedding store that fetches embeddings from a NumPy file.""" - pass + def __init__(self, config: DictConfig) -> None: + """Initialise the NumPy embedding store. + + Args: + config: + The Hydra configuration. + """ + super().__init__(config) + self.embedding_dim = self._get_embedding_dimension() + self.embeddings = np.zeros((0, self.embedding_dim)) + + def _get_embedding_dimension(self) -> int: + """This returns the embedding dimension for the embedding model. + + Returns: + The embedding dimension. + """ + model_config = AutoConfig.from_pretrained(self.config.embedder_id) + return model_config.hidden_size + + def add_embeddings(self, embeddings: list[np.ndarray]) -> None: + """Add embeddings to the store. + + Args: + embeddings: + A list of embeddings to add to the store. + """ + self.embeddings = np.vstack([self.embeddings, np.array(embeddings)]) + + def reset(self) -> None: + """This resets the embeddings store.""" + self.embeddings = np.zeros((0, self.embedding_dim)) + + def save(self, path: Path | str) -> None: + """This saves the embeddings store to disk. + + This will store the embeddings in `npy`-file, called + `embeddings.npy`. + + Args: + path: + The path to the embeddings store in. + """ + path = Path(path) + np.save(file=path, arr=self.embeddings) + + def load(self, path: Path | str) -> None: + """This loads the embeddings store from disk. + + Args: + path: + The path to the zip file to load the embeddings store from. + """ + path = Path(path) + embeddings = np.load(file=path, allow_pickle=False) + assert self.embedding_dim == embeddings.shape[1] + self.embeddings = embeddings + + def get_nearest_neighbours(self, embedding: np.ndarray) -> list[Index]: + """Get the nearest neighbours to a given embedding. + + Args: + embedding: + The embedding to find nearest neighbours for. + + Returns: + A list of indices of the nearest neighbours. + + Raises: + ValueError: + If the number of documents in the store is less than the number of + documents to retrieve. + """ + if self.embeddings.shape[0] < self.config.num_documents_to_retrieve: + raise ValueError( + "The number of documents in the store is less than the number of " + "documents to retrieve." 
diff --git a/tests/test_embedder.py b/tests/test_embedder.py
index a7c885cd..f2c34962 100644
--- a/tests/test_embedder.py
+++ b/tests/test_embedder.py
@@ -1 +1,52 @@
 """Unit tests for the `embedder` module."""
+
+from typing import Generator
+
+import numpy as np
+import pytest
+from omegaconf import DictConfig
+from ragger.embedder import E5Embedder, Embedder
+from ragger.utils import Document
+
+
+class TestE5Embedder:
+    """Tests for the `E5Embedder` class."""
+
+    @pytest.fixture(scope="class")
+    def embedder(self) -> Generator[E5Embedder, None, None]:
+        """Initialise an E5Embedder for testing."""
+        config = DictConfig(dict(embedder_id="intfloat/multilingual-e5-large"))
+        embedder = E5Embedder(config=config)
+        yield embedder
+
+    @pytest.fixture(scope="class")
+    def documents(self) -> list[Document]:
+        """Initialise a list of documents for testing."""
+        return [
+            Document(id="1", text="Hello, world!"),
+            Document(id="2", text="Goodbye, world!"),
+        ]
+
+    @pytest.fixture(scope="class")
+    def query(self) -> str:
+        """Initialise a query for testing."""
+        return "Hello, world!"
+
+    def test_is_embedder(self):
+        """Test that the E5Embedder is an Embedder."""
+        assert issubclass(E5Embedder, Embedder)
+
+    def test_initialisation(self, embedder):
+        """Test that the Embedder can be initialised."""
+        assert embedder
+
+    def test_embed(self, embedder, documents):
+        """Test that the Embedder can embed text."""
+        embeddings = embedder.embed_documents(documents)
+        assert isinstance(embeddings, np.ndarray)
+        assert embeddings.shape[0] == len(documents)
+
+    def test_embed_query(self, embedder, query):
+        """Test that the Embedder can embed a query."""
+        embeddings = embedder.embed_query(query)
+        assert isinstance(embeddings, np.ndarray)
diff --git a/tests/test_embedding_store.py b/tests/test_embedding_store.py
index d48004fa..feba5891 100644
--- a/tests/test_embedding_store.py
+++ b/tests/test_embedding_store.py
@@ -1 +1,76 @@
 """Unit tests for the `embedding_store` module."""
+
+from tempfile import NamedTemporaryFile
+from typing import Generator
+
+import numpy as np
+import pytest
+from omegaconf import DictConfig
+from ragger.embedding_store import EmbeddingStore, NumpyEmbeddingStore
+
+
+class TestNumpyEmbeddingStore:
+    """Tests for the `NumpyEmbeddingStore` class."""
+
+    @pytest.fixture(scope="class")
+    def embedding_store(self) -> Generator[NumpyEmbeddingStore, None, None]:
+        """Initialise a NumpyEmbeddingStore for testing."""
+        config = DictConfig(
+            dict(
+                num_documents_to_retrieve=2,
+                embedder_id="intfloat/multilingual-e5-large",
+            )
+        )
+        store = NumpyEmbeddingStore(config=config)
+        yield store
+
+    @pytest.fixture(scope="class")
+    def embeddings(self, embedding_store) -> list[np.ndarray]:
+        """Initialise a list of embeddings for testing."""
+        return [
+            np.ones(shape=(embedding_store.embedding_dim,)),
+            np.zeros(shape=(embedding_store.embedding_dim,)),
+        ]
+
+    def test_is_embedding_store(self):
+        """Test that the NumpyEmbeddingStore is an EmbeddingStore."""
+        assert issubclass(NumpyEmbeddingStore, EmbeddingStore)
+
+    def test_initialisation(self, embedding_store):
+        """Test that the NumpyEmbeddingStore can be initialised."""
+        assert embedding_store
+
+    def test_add_embeddings(self, embedding_store, embeddings):
+        """Test that embeddings can be added to the NumpyEmbeddingStore."""
+        embedding_store.add_embeddings(embeddings)
+        assert len(embedding_store.embeddings) == 2
+        assert np.array_equal(embedding_store.embeddings[0], embeddings[0])
+        assert np.array_equal(embedding_store.embeddings[1], embeddings[1])
+        embedding_store.reset()
+
+    def test_get_nearest_neighbours(self, embedding_store, embeddings):
+        """Test that the nearest neighbours to an embedding can be found."""
+        embedding_store.add_embeddings(embeddings)
+        neighbours = embedding_store.get_nearest_neighbours(embeddings[0])
+        assert np.array_equal(np.array(neighbours), np.array([0, 1]))
+        neighbours = embedding_store.get_nearest_neighbours(embeddings[1])
+        assert np.array_equal(np.array(neighbours), np.array([1, 0]))
+        embedding_store.reset()
+
+    def test_reset(self, embedding_store, embeddings):
+        """Test that the NumpyEmbeddingStore can be reset."""
+        embedding_store.add_embeddings(embeddings)
+        embedding_store.reset()
+        assert embedding_store.embeddings.shape == (0, embedding_store.embedding_dim)
+        embedding_store.reset()
+
+    def test_save_load(self, embedding_store, embeddings):
+        """Test that the NumpyEmbeddingStore can be saved and loaded."""
+        embedding_store.add_embeddings(embeddings)
+        new_store = NumpyEmbeddingStore(embedding_store.config)
+        with NamedTemporaryFile(suffix=".npy") as file:
+            embedding_store.save(file.name)
+            new_store.load(file.name)
+            assert np.array_equal(new_store.embeddings, embedding_store.embeddings)
+            assert new_store.embedding_dim == embedding_store.embedding_dim
+        embedding_store.reset()
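Taken together, the new classes compose into a simple retrieval flow. A rough end-to-end sketch (untested, for illustration only), assuming a Hydra config with the same keys as `config/config.yaml` above and a `Document` with `id` and `text` fields as in the tests:

```python
from omegaconf import DictConfig
from ragger.embedder import E5Embedder
from ragger.embedding_store import NumpyEmbeddingStore
from ragger.utils import Document

config = DictConfig(
    dict(
        embedder_id="intfloat/multilingual-e5-large",
        num_documents_to_retrieve=2,
    )
)
embedder = E5Embedder(config=config)
store = NumpyEmbeddingStore(config=config)

documents = [
    Document(id="1", text="Hello, world!"),
    Document(id="2", text="Goodbye, world!"),
    Document(id="3", text="The weather is nice today."),
]

# Embed and index the documents, then retrieve the nearest ones to a query
store.add_embeddings(list(embedder.embed_documents(documents)))
indices = store.get_nearest_neighbours(embedder.embed_query("A greeting"))
print([documents[i].text for i in indices])
```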