From 2d0f5f72dbace850014c5cbda91343f8e3d0cfb4 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 16:35:53 +0200 Subject: [PATCH 01/58] feat: Use vllm server --- src/ragger/generator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index bd0e3c93..dca62782 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -54,9 +54,14 @@ def __init__(self, config: DictConfig) -> None: """ super().__init__(config=config) logging.getLogger("httpx").setLevel(logging.CRITICAL) - api_key = os.environ[self.config.generator.api_key_variable_name] + api_key = os.environ[self.config.generator.api_key_variable_name].strip('"') + # self.client = OpenAI( + # api_key=api_key, timeout=self.config.generator.timeout + # ) self.client = OpenAI( - api_key=api_key.strip('"'), timeout=self.config.generator.timeout + base_url="http://localhost:8000/v1", + api_key=api_key, + timeout=self.config.generator.timeout, ) def generate( From 223cd969f04d87bd1718e83c9c20ab929cbd7c3d Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 16:38:11 +0200 Subject: [PATCH 02/58] fix: Use self.model --- src/ragger/generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index dca62782..58ed1e3e 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -63,6 +63,7 @@ def __init__(self, config: DictConfig) -> None: api_key=api_key, timeout=self.config.generator.timeout, ) + self.model = self.client.models.list().data[0].id def generate( self, query: str, documents: list[Document] @@ -98,7 +99,7 @@ def generate( ] model_output = self.client.chat.completions.create( messages=messages, - model=self.config.generator.model, + model=self.model, # self.config.generator.model, max_tokens=self.config.generator.max_tokens, temperature=self.config.generator.temperature, stream=self.config.generator.stream, From 50c907f3238d653b0a936aa972f1d35f0dc61d08 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 16:39:44 +0200 Subject: [PATCH 03/58] debug --- .pre-commit-config.yaml | 2 +- src/ragger/generator.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13ef8b68..a4564770 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: hooks: - id: end-of-file-fixer - id: trailing-whitespace - - id: debug-statements + #- id: debug-statements - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.3.2 hooks: diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 58ed1e3e..35978082 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -15,7 +15,6 @@ ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, ) -from openai.types.chat.completion_create_params import ResponseFormat from pydantic import ValidationError from pydantic_core import from_json @@ -103,9 +102,10 @@ def generate( max_tokens=self.config.generator.max_tokens, temperature=self.config.generator.temperature, stream=self.config.generator.stream, - stop=[""], - response_format=ResponseFormat(type="json_object"), + # stop=[""], + # response_format=ResponseFormat(type="json_object"), ) + breakpoint() if isinstance(model_output, Stream): def streamer() -> typing.Generator[GeneratedAnswer, None, None]: From d997f2845d3f53666d24313c181be54f68c64f72 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: 
Tue, 21 May 2024 16:45:52 +0200 Subject: [PATCH 04/58] debug --- src/ragger/generator.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 35978082..0bfcede5 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -11,10 +11,6 @@ from jinja2 import TemplateError from omegaconf import DictConfig from openai import OpenAI, Stream -from openai.types.chat import ( - ChatCompletionSystemMessageParam, - ChatCompletionUserMessageParam, -) from pydantic import ValidationError from pydantic_core import from_json @@ -83,10 +79,8 @@ def generate( "documents..." ) messages = [ - ChatCompletionSystemMessageParam( - role="system", content=self.config.generator.system_prompt - ), - ChatCompletionUserMessageParam( + dict(role="system", content=self.config.generator.system_prompt), + dict( role="user", content=self.config.generator.prompt.format( documents=json.dumps( From 24fd935fb274e5e7b4ee050655258a74c362e5a7 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:00:31 +0200 Subject: [PATCH 05/58] feat: Try using guided_json --- src/ragger/generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 0bfcede5..da0c8d5c 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -96,8 +96,9 @@ def generate( max_tokens=self.config.generator.max_tokens, temperature=self.config.generator.temperature, stream=self.config.generator.stream, - # stop=[""], + stop=[""], # response_format=ResponseFormat(type="json_object"), + guided_json=GeneratedAnswer, ) breakpoint() if isinstance(model_output, Stream): From 984079f9b62a370d150f83e8c079e18824dc7490 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:02:54 +0200 Subject: [PATCH 06/58] fix: Use extra_body --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index da0c8d5c..201be994 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -98,7 +98,7 @@ def generate( stream=self.config.generator.stream, stop=[""], # response_format=ResponseFormat(type="json_object"), - guided_json=GeneratedAnswer, + extra_body=dict(guided_json=GeneratedAnswer), ) breakpoint() if isinstance(model_output, Stream): From 309baac1e8bf8edcf5d59642ff81db47945503b3 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:05:58 +0200 Subject: [PATCH 07/58] fix: Use json --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 201be994..7b3f78e5 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -98,7 +98,7 @@ def generate( stream=self.config.generator.stream, stop=[""], # response_format=ResponseFormat(type="json_object"), - extra_body=dict(guided_json=GeneratedAnswer), + extra_body=dict(guided_json=GeneratedAnswer.model_dump_json()), ) breakpoint() if isinstance(model_output, Stream): From 35fab6cbf04ace91303d0d7ed5f955e6f77a1af7 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:07:42 +0200 Subject: [PATCH 08/58] fix: Use model_json_schema --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 7b3f78e5..8fc1fa4a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -98,7 +98,7 @@ def generate( 
stream=self.config.generator.stream, stop=[""], # response_format=ResponseFormat(type="json_object"), - extra_body=dict(guided_json=GeneratedAnswer.model_dump_json()), + extra_body=dict(guided_json=GeneratedAnswer.model_json_schema()), ) breakpoint() if isinstance(model_output, Stream): From 9b630014a22c3e36d53db61dda84a3961b203c9e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:09:10 +0200 Subject: [PATCH 09/58] chore: Include response_format --- src/ragger/generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 8fc1fa4a..4207da3a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -11,6 +11,7 @@ from jinja2 import TemplateError from omegaconf import DictConfig from openai import OpenAI, Stream +from openai.types.chat.completion_create_params import ResponseFormat from pydantic import ValidationError from pydantic_core import from_json @@ -97,10 +98,9 @@ def generate( temperature=self.config.generator.temperature, stream=self.config.generator.stream, stop=[""], - # response_format=ResponseFormat(type="json_object"), + response_format=ResponseFormat(type="json_object"), extra_body=dict(guided_json=GeneratedAnswer.model_json_schema()), ) - breakpoint() if isinstance(model_output, Stream): def streamer() -> typing.Generator[GeneratedAnswer, None, None]: From 0061921d066d7cc97e85863eeac1c24bbf160c98 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:10:37 +0200 Subject: [PATCH 10/58] chore: Logging --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 4207da3a..1d441565 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -111,6 +111,7 @@ def streamer() -> typing.Generator[GeneratedAnswer, None, None]: if chunk_str is None: break generated_output += chunk_str + logger.info(f"Generated output: {generated_output!r}") try: generated_dict = from_json( data=generated_output, allow_partial=True From 86745cb129801cb43bd13c9954281e67642dabe8 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Tue, 21 May 2024 17:12:27 +0200 Subject: [PATCH 11/58] chore: Remove logging --- src/ragger/generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 1d441565..4207da3a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -111,7 +111,6 @@ def streamer() -> typing.Generator[GeneratedAnswer, None, None]: if chunk_str is None: break generated_output += chunk_str - logger.info(f"Generated output: {generated_output!r}") try: generated_dict = from_json( data=generated_output, allow_partial=True From 25c8e0711eb009154ae9dcfda9ad7fc16f5d7490 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 10:51:03 +0200 Subject: [PATCH 12/58] debug --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 4207da3a..6126a6b9 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -107,6 +107,7 @@ def streamer() -> typing.Generator[GeneratedAnswer, None, None]: generated_output = "" generated_obj = GeneratedAnswer(sources=[]) for chunk in model_output: + breakpoint() chunk_str = chunk.choices[0].delta.content if chunk_str is None: break From 7e0528c85d14d7cf8261e339b4472d8ea2965f8e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 10:53:18 +0200 Subject: [PATCH 13/58] 
fix: Do not break streaming if chunk_str is None --- src/ragger/generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 6126a6b9..79725761 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -107,10 +107,9 @@ def streamer() -> typing.Generator[GeneratedAnswer, None, None]: generated_output = "" generated_obj = GeneratedAnswer(sources=[]) for chunk in model_output: - breakpoint() chunk_str = chunk.choices[0].delta.content if chunk_str is None: - break + continue generated_output += chunk_str try: generated_dict = from_json( From 4fef8324b77ca9907ff1f0bd6a3cab382910a027 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 10:56:48 +0200 Subject: [PATCH 14/58] debug --- src/ragger/generator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 79725761..c12ee2fd 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -11,6 +11,10 @@ from jinja2 import TemplateError from omegaconf import DictConfig from openai import OpenAI, Stream +from openai.types.chat import ( + ChatCompletionSystemMessageParam, + ChatCompletionUserMessageParam, +) from openai.types.chat.completion_create_params import ResponseFormat from pydantic import ValidationError from pydantic_core import from_json @@ -80,8 +84,10 @@ def generate( "documents..." ) messages = [ - dict(role="system", content=self.config.generator.system_prompt), - dict( + ChatCompletionSystemMessageParam( + role="system", content=self.config.generator.system_prompt + ), + ChatCompletionUserMessageParam( role="user", content=self.config.generator.prompt.format( documents=json.dumps( @@ -101,6 +107,7 @@ def generate( response_format=ResponseFormat(type="json_object"), extra_body=dict(guided_json=GeneratedAnswer.model_json_schema()), ) + breakpoint() if isinstance(model_output, Stream): def streamer() -> typing.Generator[GeneratedAnswer, None, None]: From 05a96d3780fd8074c7755b524077f888dba37440 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:13:55 +0200 Subject: [PATCH 15/58] feat: Spawn new vLLM server if not already running --- config/generator/vllm.yaml | 1 + src/ragger/generator.py | 161 +++++++++---------------------------- 2 files changed, 38 insertions(+), 124 deletions(-) diff --git a/config/generator/vllm.yaml b/config/generator/vllm.yaml index 43f702b5..8c9c4fe2 100644 --- a/config/generator/vllm.yaml +++ b/config/generator/vllm.yaml @@ -7,3 +7,4 @@ max_tokens: 256 stream: true system_prompt: ${..language.system_prompt} prompt: ${..language.prompt} +server: null diff --git a/src/ragger/generator.py b/src/ragger/generator.py index c12ee2fd..a7dc9a79 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -1,14 +1,13 @@ """Generation of an answer from a query and a list of relevant documents.""" -import importlib.util import json import logging import os +import subprocess import typing import torch from dotenv import load_dotenv -from jinja2 import TemplateError from omegaconf import DictConfig from openai import OpenAI, Stream from openai.types.chat import ( @@ -20,21 +19,6 @@ from pydantic_core import from_json from .data_models import Document, GeneratedAnswer, Generator -from .utils import clear_memory - -if importlib.util.find_spec("vllm") is not None: - from vllm import LLM, SamplingParams - from vllm.model_executor.guided_decoding.outlines_logits_processors import ( - 
JSONLogitsProcessor, - ) - - try: - from vllm.model_executor.parallel_utils.parallel_state import ( - destroy_model_parallel, - ) - except ImportError: - from vllm.distributed.parallel_state import destroy_model_parallel - load_dotenv() @@ -54,16 +38,18 @@ def __init__(self, config: DictConfig) -> None: """ super().__init__(config=config) logging.getLogger("httpx").setLevel(logging.CRITICAL) - api_key = os.environ[self.config.generator.api_key_variable_name].strip('"') - # self.client = OpenAI( - # api_key=api_key, timeout=self.config.generator.timeout - # ) + + api_key = os.getenv(self.config.generator.api_key_variable_name) + if isinstance(api_key, str): + api_key = api_key.strip('"') + + self.server = ( + config.generator.server if hasattr(config.generator, "server") else None + ) + self.client = OpenAI( - base_url="http://localhost:8000/v1", - api_key=api_key, - timeout=self.config.generator.timeout, + base_url=self.server, api_key=api_key, timeout=self.config.generator.timeout ) - self.model = self.client.models.list().data[0].id def generate( self, query: str, documents: list[Document] @@ -99,7 +85,7 @@ def generate( ] model_output = self.client.chat.completions.create( messages=messages, - model=self.model, # self.config.generator.model, + model=self.config.generator.model, max_tokens=self.config.generator.max_tokens, temperature=self.config.generator.temperature, stream=self.config.generator.stream, @@ -107,7 +93,6 @@ def generate( response_format=ResponseFormat(type="json_object"), extra_body=dict(guided_json=GeneratedAnswer.model_json_schema()), ) - breakpoint() if isinstance(model_output, Stream): def streamer() -> typing.Generator[GeneratedAnswer, None, None]: @@ -182,7 +167,7 @@ def streamer() -> typing.Generator[GeneratedAnswer, None, None]: return generated_obj -class VllmGenerator(Generator): +class VllmGenerator(OpenaiGenerator): """A generator that uses a vLLM model to generate answers.""" def __init__(self, config: DictConfig) -> None: @@ -192,110 +177,38 @@ def __init__(self, config: DictConfig) -> None: config: The Hydra configuration. """ - super().__init__(config=config) - if not torch.cuda.is_available(): raise RuntimeError( "The `vLLMGenerator` requires a CUDA-compatible GPU to run. " "Please ensure that a compatible GPU is available and try again." ) - # We need to remove the model from GPU memory before creating a new one - destroy_model_parallel() - clear_memory() - - self.model = LLM( - model=config.generator.model, - gpu_memory_utilization=config.generator.gpu_memory_utilization, - max_model_len=config.generator.max_model_len, - seed=config.random_seed, - tensor_parallel_size=torch.cuda.device_count(), - ) - self.tokenizer = self.model.get_tokenizer() - self.logits_processor = JSONLogitsProcessor( - schema=GeneratedAnswer, tokenizer=self.tokenizer, whitespace_pattern=r" ?" - ) - - def generate( - self, query: str, documents: list[Document] - ) -> GeneratedAnswer | typing.Generator[GeneratedAnswer, None, None]: - """Generate an answer from a query and relevant documents. - - Args: - query: - The query to answer. - documents: - The relevant documents. - - Returns: - The generated answer. - """ - logger.info( - f"Generating answer for the query {query!r} and {len(documents):,} " - "documents..." 
- ) - - system_prompt = self.config.generator.system_prompt - user_prompt = self.config.generator.prompt.format( - documents=json.dumps([document.model_dump() for document in documents]), - query=query, - ) - - chat_template_kwargs = dict( - chat_template=self.tokenizer.chat_template, - add_generation_prompt=True, - tokenize=False, - ) - try: - prompt = self.tokenizer.apply_chat_template( - conversation=[ - dict(role="system", content=system_prompt), - dict(role="user", content=user_prompt), - ], - **chat_template_kwargs, - ) - except TemplateError: - prompt = self.tokenizer.apply_chat_template( - conversation=[ - dict(role="user", content=system_prompt + "\n\n" + user_prompt) + # If an inference server isn't already running then start a new server in a + # background process and store the process ID + breakpoint() + self.server_process: subprocess.Popen | None + if config.generator.server is None: + self.server_process = subprocess.Popen( + args=[ + "python", + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + config.generator.model, + "--max-model-len", + str(config.generator.max_model_len), + "--gpu-memory-utilization", + str(config.generator.gpu_memory_utilization), ], - **chat_template_kwargs, - ) - - sampling_params = SamplingParams( - max_tokens=self.config.generator.max_tokens, - temperature=self.config.generator.temperature, - stop=[""], - logits_processors=[self.logits_processor], - ) - - model_output = self.model.generate( - prompts=[prompt], sampling_params=sampling_params - ) - generated_output = model_output[0].outputs[0].text - - try: - generated_dict = json.loads(generated_output) - except json.JSONDecodeError: - raise ValueError( - f"Could not decode JSON from model output: {generated_output}" + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) + else: + self.server_process = None - try: - generated_obj = GeneratedAnswer.model_validate(generated_dict) - except ValidationError: - raise ValueError(f"Could not validate model output: {generated_dict}") - - logger.info(f"Generated answer: {generated_obj.answer!r}") - return generated_obj + super().__init__(config=config) def __del__(self) -> None: - """Clear the GPU memory used by the model, and remove the model itself.""" - if hasattr(self, "model"): - del self.model - del self - try: - destroy_model_parallel() - except ImportError: - pass - clear_memory() + """Close down the vLLM server, if we started a new one.""" + if self.server_process is not None: + self.server_process.kill() From ff9a16ec5a35d6177196108e68e243ace342b0b8 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:16:40 +0200 Subject: [PATCH 16/58] fix: Do not use api_key if running vLLM generator --- src/ragger/generator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index a7dc9a79..62833bf0 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -39,9 +39,11 @@ def __init__(self, config: DictConfig) -> None: super().__init__(config=config) logging.getLogger("httpx").setLevel(logging.CRITICAL) - api_key = os.getenv(self.config.generator.api_key_variable_name) - if isinstance(api_key, str): - api_key = api_key.strip('"') + if hasattr(config.generator, "api_key_variable_name"): + env_var_name = config.generator.api_key_variable_name + api_key = os.environ[env_var_name].strip('"') + else: + api_key = None self.server = ( config.generator.server if hasattr(config.generator, "server") else None From 
d5b8f6b3161951bcb1d0faa334959df5b921cfe5 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:17:31 +0200 Subject: [PATCH 17/58] fix: vLLM config --- config/generator/vllm.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/config/generator/vllm.yaml b/config/generator/vllm.yaml index 8c9c4fe2..96692f3c 100644 --- a/config/generator/vllm.yaml +++ b/config/generator/vllm.yaml @@ -1,10 +1,11 @@ name: vllm model: ThatsGroes/munin-SkoleGPTOpenOrca-7b-16bit -max_model_len: 10_000 -gpu_memory_utilization: 0.95 temperature: 0.0 max_tokens: 256 stream: true +timeout: 60 system_prompt: ${..language.system_prompt} prompt: ${..language.prompt} +max_model_len: 10_000 +gpu_memory_utilization: 0.95 server: null From c1dc7153ea35ecac3b667fa1ae85ec75cb64d103 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:17:50 +0200 Subject: [PATCH 18/58] chore: Remove breakpoint --- src/ragger/generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 62833bf0..4f376f55 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -187,7 +187,6 @@ def __init__(self, config: DictConfig) -> None: # If an inference server isn't already running then start a new server in a # background process and store the process ID - breakpoint() self.server_process: subprocess.Popen | None if config.generator.server is None: self.server_process = subprocess.Popen( From a73d98db37ef931adfd0528dd4a2a3bb62b4239b Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:18:28 +0200 Subject: [PATCH 19/58] debug --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 4f376f55..2e4194a2 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -85,6 +85,7 @@ def generate( ), ), ] + breakpoint() model_output = self.client.chat.completions.create( messages=messages, model=self.config.generator.model, From b9e72f076bde8a4f1d1d6ba190530435ca0d5c60 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:20:37 +0200 Subject: [PATCH 20/58] debug --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 2e4194a2..6455a1c4 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -48,6 +48,7 @@ def __init__(self, config: DictConfig) -> None: self.server = ( config.generator.server if hasattr(config.generator, "server") else None ) + breakpoint() self.client = OpenAI( base_url=self.server, api_key=api_key, timeout=self.config.generator.timeout @@ -85,7 +86,6 @@ def generate( ), ), ] - breakpoint() model_output = self.client.chat.completions.create( messages=messages, model=self.config.generator.model, From 6622b7a474781d4db5852fb87f496c5aec475d65 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:22:12 +0200 Subject: [PATCH 21/58] fix: Set server after booting it --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 6455a1c4..4326e45f 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -48,7 +48,6 @@ def __init__(self, config: DictConfig) -> None: self.server = ( config.generator.server if hasattr(config.generator, "server") else None ) - breakpoint() self.client = OpenAI( base_url=self.server, api_key=api_key, 
timeout=self.config.generator.timeout @@ -205,6 +204,7 @@ def __init__(self, config: DictConfig) -> None: stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) + config.generator.server = "http://localhost:8000/v1" else: self.server_process = None From acde691bcb56340b84a22dd6fe88475062490854 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:22:56 +0200 Subject: [PATCH 22/58] debug --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 4326e45f..7e9670bc 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -205,6 +205,7 @@ def __init__(self, config: DictConfig) -> None: stderr=subprocess.DEVNULL, ) config.generator.server = "http://localhost:8000/v1" + breakpoint() else: self.server_process = None From ef4c33e3285c0698ede3c8ff63b9fec260c95914 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:23:37 +0200 Subject: [PATCH 23/58] debug --- src/ragger/generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 7e9670bc..a689b94f 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -200,9 +200,9 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.max_model_len), "--gpu-memory-utilization", str(config.generator.gpu_memory_utilization), - ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, + ] + # stdout=subprocess.DEVNULL, + # stderr=subprocess.DEVNULL, ) config.generator.server = "http://localhost:8000/v1" breakpoint() From 00b38bd64301c1477ce83563ea570afeda54ffbf Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:26:13 +0200 Subject: [PATCH 24/58] fix: Add sleep after server start --- src/ragger/generator.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index a689b94f..5a21c27c 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -5,6 +5,7 @@ import os import subprocess import typing +from time import sleep import torch from dotenv import load_dotenv @@ -200,12 +201,13 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.max_model_len), "--gpu-memory-utilization", str(config.generator.gpu_memory_utilization), - ] - # stdout=subprocess.DEVNULL, - # stderr=subprocess.DEVNULL, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) + logger.info("Starting vLLM server...") + sleep(10) config.generator.server = "http://localhost:8000/v1" - breakpoint() else: self.server_process = None From d532c77b6fdcf08a558566ec872e0f584ecf51a8 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 11:34:36 +0200 Subject: [PATCH 25/58] fix: Only require CUDA to start the vLLM inference server, not to use one --- src/ragger/generator.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 5a21c27c..dfb7cb39 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -180,16 +180,17 @@ def __init__(self, config: DictConfig) -> None: config: The Hydra configuration. """ - if not torch.cuda.is_available(): - raise RuntimeError( - "The `vLLMGenerator` requires a CUDA-compatible GPU to run. " - "Please ensure that a compatible GPU is available and try again." 
- ) - # If an inference server isn't already running then start a new server in a # background process and store the process ID self.server_process: subprocess.Popen | None if config.generator.server is None: + # We can only run the inference server if CUDA is available + if not torch.cuda.is_available(): + raise RuntimeError( + "The `vLLMGenerator` requires a CUDA-compatible GPU to run. " + "Please ensure that a compatible GPU is available and try again." + ) + self.server_process = subprocess.Popen( args=[ "python", From dc0be2c0fa337f4b30c1d982668f2b80057bff7d Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:22:26 +0200 Subject: [PATCH 26/58] fix: Only set `guided_json` if using vLLM --- src/ragger/generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index dfb7cb39..c15ecc1a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -86,6 +86,11 @@ def generate( ), ), ] + + extra_body = dict() + if self.config.generator.name == "vllm": + extra_body["guided_json"] = GeneratedAnswer.model_json_schema() + model_output = self.client.chat.completions.create( messages=messages, model=self.config.generator.model, @@ -94,8 +99,9 @@ def generate( stream=self.config.generator.stream, stop=[""], response_format=ResponseFormat(type="json_object"), - extra_body=dict(guided_json=GeneratedAnswer.model_json_schema()), + extra_body=extra_body, ) + if isinstance(model_output, Stream): def streamer() -> typing.Generator[GeneratedAnswer, None, None]: From 3fd39c66d7d66ad48236cd83dd0524bc05cef6ee Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:26:50 +0200 Subject: [PATCH 27/58] tests: vLLM tests --- tests/conftest.py | 6 ++++-- tests/test_generator.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3f8d4f50..08de9848 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,13 +114,15 @@ def vllm_generator_params(system_prompt, prompt) -> typing.Generator[dict, None, yield dict( name="vllm", model="ThatsGroes/munin-SkoleGPTOpenOrca-7b-16bit", - max_model_len=10_000, - gpu_memory_utilization=0.95, temperature=0.0, max_tokens=128, stream=False, + timeout=60, system_prompt=system_prompt, prompt=prompt, + max_model_len=10_000, + gpu_memory_utilization=0.95, + server=None, ) diff --git a/tests/test_generator.py b/tests/test_generator.py index 2d93da1d..65316f33 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -95,7 +95,6 @@ def test_initialisation(self, config) -> None: """Test that the generator is initialised correctly.""" generator = VllmGenerator(config=config) assert generator - del generator def test_generate(self, config, query, documents) -> None: """Test that the generator generates an answer.""" From ad04ed1254cd94c64088da68234bdea36e9d0171 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:41:50 +0200 Subject: [PATCH 28/58] feat: Add more args to vLLM server --- config/generator/vllm.yaml | 1 + src/ragger/generator.py | 14 ++++++++++++-- tests/conftest.py | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/config/generator/vllm.yaml b/config/generator/vllm.yaml index 96692f3c..8052e172 100644 --- a/config/generator/vllm.yaml +++ b/config/generator/vllm.yaml @@ -9,3 +9,4 @@ prompt: ${..language.prompt} max_model_len: 10_000 gpu_memory_utilization: 0.95 server: null +port: 8000 diff --git a/src/ragger/generator.py 
b/src/ragger/generator.py index c15ecc1a..bc9e6af8 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -18,6 +18,7 @@ from openai.types.chat.completion_create_params import ResponseFormat from pydantic import ValidationError from pydantic_core import from_json +from transformers import AutoTokenizer from .data_models import Document, GeneratedAnswer, Generator @@ -47,7 +48,9 @@ def __init__(self, config: DictConfig) -> None: api_key = None self.server = ( - config.generator.server if hasattr(config.generator, "server") else None + f"{config.generator.server}:{config.generator.host}/v1" + if hasattr(config.generator, "server") + else None ) self.client = OpenAI( @@ -197,6 +200,8 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." ) + config.generator.server = "http://localhost" + tokenizer = AutoTokenizer.from_pretrained(config.generator.model) self.server_process = subprocess.Popen( args=[ "python", @@ -208,13 +213,18 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.max_model_len), "--gpu-memory-utilization", str(config.generator.gpu_memory_utilization), + "--chat-template", + tokenizer.chat_template, + "--host", + config.generator.server, + "--port", + str(config.generator.port), ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) logger.info("Starting vLLM server...") sleep(10) - config.generator.server = "http://localhost:8000/v1" else: self.server_process = None diff --git a/tests/conftest.py b/tests/conftest.py index 08de9848..c6f5a65a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -123,6 +123,7 @@ def vllm_generator_params(system_prompt, prompt) -> typing.Generator[dict, None, max_model_len=10_000, gpu_memory_utilization=0.95, server=None, + port=9999, ) From 226a88a888992465097069c1e4a65245aaf73fe6 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:42:35 +0200 Subject: [PATCH 29/58] fix: Typo --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index bc9e6af8..3e1db36c 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -48,7 +48,7 @@ def __init__(self, config: DictConfig) -> None: api_key = None self.server = ( - f"{config.generator.server}:{config.generator.host}/v1" + f"{config.generator.server}:{config.generator.port}/v1" if hasattr(config.generator, "server") else None ) From e7262269d73d86ad0d089992272c8a98bb0f1575 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:43:18 +0200 Subject: [PATCH 30/58] debug --- src/ragger/generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 3e1db36c..1b35176c 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -219,9 +219,9 @@ def __init__(self, config: DictConfig) -> None: config.generator.server, "--port", str(config.generator.port), - ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, + ] + # stdout=subprocess.DEVNULL, + # stderr=subprocess.DEVNULL, ) logger.info("Starting vLLM server...") sleep(10) From dedb0327de90d064d76f97aa3a1e8bfb14e9c600 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:44:01 +0200 Subject: [PATCH 31/58] fix: Up vLLM startup sleep time --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 
1b35176c..22392b86 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -224,7 +224,7 @@ def __init__(self, config: DictConfig) -> None: # stderr=subprocess.DEVNULL, ) logger.info("Starting vLLM server...") - sleep(10) + sleep(20) else: self.server_process = None From 598f2862476c8b5a86aed07ce6ba840a48a37a27 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:46:15 +0200 Subject: [PATCH 32/58] debug --- src/ragger/generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 22392b86..dc7fcea6 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -215,8 +215,8 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.gpu_memory_utilization), "--chat-template", tokenizer.chat_template, - "--host", - config.generator.server, + # "--host", + # config.generator.server, "--port", str(config.generator.port), ] From 36e6d0b8c9fa2127fb75ae67775ebf98978a1f96 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:47:29 +0200 Subject: [PATCH 33/58] debug --- src/ragger/generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index dc7fcea6..91c97663 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -200,7 +200,7 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." ) - config.generator.server = "http://localhost" + config.generator.server = "http://0.0.0.0" tokenizer = AutoTokenizer.from_pretrained(config.generator.model) self.server_process = subprocess.Popen( args=[ @@ -215,8 +215,8 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.gpu_memory_utilization), "--chat-template", tokenizer.chat_template, - # "--host", - # config.generator.server, + "--host", + config.generator.server, "--port", str(config.generator.port), ] From 192721a2d9de91d4996dd8c76bb46bbffe1de7b9 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:48:47 +0200 Subject: [PATCH 34/58] debug --- src/ragger/generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 91c97663..30e85cc3 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -215,10 +215,10 @@ def __init__(self, config: DictConfig) -> None: str(config.generator.gpu_memory_utilization), "--chat-template", tokenizer.chat_template, - "--host", - config.generator.server, - "--port", - str(config.generator.port), + # "--host", + # config.generator.server, + # "--port", + # str(config.generator.port), ] # stdout=subprocess.DEVNULL, # stderr=subprocess.DEVNULL, From 89a56c256d814b733b5ae1fde9bef4b971f7edd3 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:50:57 +0200 Subject: [PATCH 35/58] debug --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 30e85cc3..370e5ff0 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -200,7 +200,7 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." 
) - config.generator.server = "http://0.0.0.0" + config.generator.server = "0.0.0.0" tokenizer = AutoTokenizer.from_pretrained(config.generator.model) self.server_process = subprocess.Popen( args=[ From a2936b68f58d8b9b615be1fa2f519256e9feead2 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:51:58 +0200 Subject: [PATCH 36/58] fix: Add port back in --- src/ragger/generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 370e5ff0..3897cf7b 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -217,8 +217,8 @@ def __init__(self, config: DictConfig) -> None: tokenizer.chat_template, # "--host", # config.generator.server, - # "--port", - # str(config.generator.port), + "--port", + str(config.generator.port), ] # stdout=subprocess.DEVNULL, # stderr=subprocess.DEVNULL, From e414942b560d4323faff848f9cc0d5fcd93b6a62 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 12:54:41 +0200 Subject: [PATCH 37/58] fix: Set up self.server in OpenaiGenerator correctly --- src/ragger/generator.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 3897cf7b..68e70b88 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -47,11 +47,14 @@ def __init__(self, config: DictConfig) -> None: else: api_key = None - self.server = ( - f"{config.generator.server}:{config.generator.port}/v1" - if hasattr(config.generator, "server") - else None - ) + self.server: str | None + if hasattr(config.generator, "server"): + host = config.generator.server + if not host.startswith("http"): + host = f"http://{host}" + self.server = f"{host}:{config.generator.port}/v1" + else: + self.server = None self.client = OpenAI( base_url=self.server, api_key=api_key, timeout=self.config.generator.timeout From 6d01292ff9e9533ec524c50e1c5f64a5fd3e5f33 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:01:45 +0200 Subject: [PATCH 38/58] debug --- src/ragger/generator.py | 61 +++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 68e70b88..4fdeb314 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -203,36 +203,49 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." ) + self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) config.generator.server = "0.0.0.0" - tokenizer = AutoTokenizer.from_pretrained(config.generator.model) - self.server_process = subprocess.Popen( - args=[ - "python", - "-m", - "vllm.entrypoints.openai.api_server", - "--model", - config.generator.model, - "--max-model-len", - str(config.generator.max_model_len), - "--gpu-memory-utilization", - str(config.generator.gpu_memory_utilization), - "--chat-template", - tokenizer.chat_template, - # "--host", - # config.generator.server, - "--port", - str(config.generator.port), - ] - # stdout=subprocess.DEVNULL, - # stderr=subprocess.DEVNULL, - ) - logger.info("Starting vLLM server...") - sleep(20) + self.server_process = self.start_inference_server() else: self.server_process = None super().__init__(config=config) + def start_inference_server(self) -> subprocess.Popen: + """Start the vLLM inference server. + + Returns: + The inference server process. 
+ """ + logger.info("Starting vLLM server...") + process = subprocess.Popen( + args=[ + "python", + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + self.config.generator.model, + "--max-model-len", + str(self.config.generator.max_model_len), + "--gpu-memory-utilization", + str(self.config.generator.gpu_memory_utilization), + "--chat-template", + self.tokenizer.chat_template, + # "--host", + # self.config.generator.server, + "--port", + str(self.config.generator.port), + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout = process.stdout + for _ in range(20): + if stdout is not None: + print(stdout.readline()) + sleep(1) + return process + def __del__(self) -> None: """Close down the vLLM server, if we started a new one.""" if self.server_process is not None: From 905dd975b0ae8bb75433f4983e84c05bfaca9ac9 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:02:31 +0200 Subject: [PATCH 39/58] fix: Store config in VllmGenerator --- src/ragger/generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 4fdeb314..ca10ac69 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -192,6 +192,8 @@ def __init__(self, config: DictConfig) -> None: config: The Hydra configuration. """ + self.config = config + # If an inference server isn't already running then start a new server in a # background process and store the process ID self.server_process: subprocess.Popen | None From 63cf5c9def4f525c1ff0952a5e5fbbde0ff7af4d Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:04:04 +0200 Subject: [PATCH 40/58] debug --- src/ragger/generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index ca10ac69..5ccdd151 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -242,9 +242,11 @@ def start_inference_server(self) -> subprocess.Popen: stderr=subprocess.PIPE, ) stdout = process.stdout + stderr = process.stderr for _ in range(20): - if stdout is not None: - print(stdout.readline()) + if stdout is not None and stderr is not None: + print("STDOUT:", stdout.readline()) + print("STDERR:", stderr.readline()) sleep(1) return process From 837ce283bd17bbe27f6bbacf07e9334368417a63 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:07:08 +0200 Subject: [PATCH 41/58] feat: Check manually if Uvicorn server has started --- src/ragger/generator.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 5ccdd151..e81abb2a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -238,16 +238,19 @@ def start_inference_server(self) -> subprocess.Popen: "--port", str(self.config.generator.port), ], - stdout=subprocess.PIPE, + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, ) - stdout = process.stdout stderr = process.stderr - for _ in range(20): - if stdout is not None and stderr is not None: - print("STDOUT:", stdout.readline()) - print("STDERR:", stderr.readline()) + assert stderr is not None + for seconds in range(self.config.generator.timeout): + update = stderr.readline().decode("utf-8") + if "Uvicorn running" in update: + logger.info(f"vLLM server started after {seconds} seconds.") + break sleep(1) + else: + raise RuntimeError("vLLM server failed to start.") return process def __del__(self) -> None: From 14e5d338c49a67987dd3787fdf3eb2d541bf2400 Mon Sep 17 
00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:08:57 +0200 Subject: [PATCH 42/58] feat: Block stderr when loading tokenizer --- src/ragger/generator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index e81abb2a..281610c4 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -4,6 +4,7 @@ import logging import os import subprocess +import sys import typing from time import sleep @@ -205,7 +206,11 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." ) + # Load the tokenizer without printing any logs + sys.stderr = open(os.devnull, "w") self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) + sys.stderr = sys.__stderr__ + config.generator.server = "0.0.0.0" self.server_process = self.start_inference_server() else: From 7c1298ccc480b42355baf957f3aed0eeb7432d87 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:09:30 +0200 Subject: [PATCH 43/58] debug --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 281610c4..618dd5ff 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -208,6 +208,7 @@ def __init__(self, config: DictConfig) -> None: # Load the tokenizer without printing any logs sys.stderr = open(os.devnull, "w") + breakpoint() self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) sys.stderr = sys.__stderr__ From bc1641bd92f47d097961d6aad2b7690bc82ac7fa Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:11:30 +0200 Subject: [PATCH 44/58] refactor: Use HiddenPrints --- src/ragger/generator.py | 9 ++++----- src/ragger/utils.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 618dd5ff..bafd18f7 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -4,7 +4,6 @@ import logging import os import subprocess -import sys import typing from time import sleep @@ -21,6 +20,8 @@ from pydantic_core import from_json from transformers import AutoTokenizer +from ragger.utils import HiddenPrints + from .data_models import Document, GeneratedAnswer, Generator load_dotenv() @@ -207,10 +208,8 @@ def __init__(self, config: DictConfig) -> None: ) # Load the tokenizer without printing any logs - sys.stderr = open(os.devnull, "w") - breakpoint() - self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) - sys.stderr = sys.__stderr__ + with HiddenPrints(): + self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) config.generator.server = "0.0.0.0" self.server_process = self.start_inference_server() diff --git a/src/ragger/utils.py b/src/ragger/utils.py index 5b4c4225..b0a149f4 100644 --- a/src/ragger/utils.py +++ b/src/ragger/utils.py @@ -2,7 +2,9 @@ import gc import importlib +import os import re +import sys from typing import Type import torch @@ -138,3 +140,21 @@ def load_ragger_components(config: DictConfig) -> Components: class_name=config.generator.name, component_type="generator" ), ) + + +class HiddenPrints: + """Context manager which removes all terminal output.""" + + def __enter__(self): + """Enter the context manager.""" + self._original_stdout = sys.stdout + self._original_stderr = sys.stderr + sys.stdout = open(os.devnull, "w") + sys.stderr = open(os.devnull, "w") + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit 
the context manager.""" + sys.stdout.close() + sys.stderr.close() + sys.stdout = self._original_stdout + sys.stderr = self._original_stderr From 67de367e4f898a8981527ff23d2a7c1970a33b92 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:13:45 +0200 Subject: [PATCH 45/58] fix: Block transformers logging --- src/ragger/generator.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index bafd18f7..08a22c2d 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -20,8 +20,6 @@ from pydantic_core import from_json from transformers import AutoTokenizer -from ragger.utils import HiddenPrints - from .data_models import Document, GeneratedAnswer, Generator load_dotenv() @@ -195,6 +193,7 @@ def __init__(self, config: DictConfig) -> None: The Hydra configuration. """ self.config = config + logging.getLogger("transformers").setLevel(logging.CRITICAL) # If an inference server isn't already running then start a new server in a # background process and store the process ID @@ -207,9 +206,7 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." ) - # Load the tokenizer without printing any logs - with HiddenPrints(): - self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) + self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) config.generator.server = "0.0.0.0" self.server_process = self.start_inference_server() From aaae8cb6791779c5375ead3504af902ed3a63f9e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:15:28 +0200 Subject: [PATCH 46/58] feat: Add --host back in --- src/ragger/generator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index 08a22c2d..e2d7b3cd 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -206,9 +206,8 @@ def __init__(self, config: DictConfig) -> None: "Please ensure that a compatible GPU is available and try again." 
) - self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) - config.generator.server = "0.0.0.0" + self.tokenizer = AutoTokenizer.from_pretrained(config.generator.model) self.server_process = self.start_inference_server() else: self.server_process = None @@ -235,8 +234,8 @@ def start_inference_server(self) -> subprocess.Popen: str(self.config.generator.gpu_memory_utilization), "--chat-template", self.tokenizer.chat_template, - # "--host", - # self.config.generator.server, + "--host", + self.config.generator.server, "--port", str(self.config.generator.port), ], From 8b67836fb92219273afbe1129b9db5b02b091ef0 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:16:26 +0200 Subject: [PATCH 47/58] debug --- src/ragger/generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index e2d7b3cd..fafc9d06 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -256,5 +256,6 @@ def start_inference_server(self) -> subprocess.Popen: def __del__(self) -> None: """Close down the vLLM server, if we started a new one.""" + breakpoint() if self.server_process is not None: self.server_process.kill() From 9a93b7bbe631f282605cc118effa77a9d4323796 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:17:52 +0200 Subject: [PATCH 48/58] fix: Add `del self` in `__del__` --- src/ragger/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragger/generator.py b/src/ragger/generator.py index fafc9d06..8f6e788a 100644 --- a/src/ragger/generator.py +++ b/src/ragger/generator.py @@ -256,6 +256,6 @@ def start_inference_server(self) -> subprocess.Popen: def __del__(self) -> None: """Close down the vLLM server, if we started a new one.""" - breakpoint() if self.server_process is not None: self.server_process.kill() + del self From 3e15ea228264d7650846bd9ba6884af2f303badf Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:20:30 +0200 Subject: [PATCH 49/58] chore: Ignore ResourceWarning in pytest --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 41e2b580..e69d1271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,6 +95,7 @@ filterwarnings = [ "ignore::DeprecationWarning", "ignore::PendingDeprecationWarning", "ignore::ImportWarning", + "ignore::ResourceWarning", ] log_cli_level = "info" testpaths = [ From 38cb04726e356de1059f0156515322a3d35c0f5e Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:25:31 +0200 Subject: [PATCH 50/58] tests: Initialise the VllmGenerator fewer times in tests --- tests/test_generator.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 65316f33..bc75b8a5 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -1,6 +1,7 @@ """Unit tests for the `generator` module.""" import typing +from copy import deepcopy import pytest import torch @@ -83,38 +84,47 @@ class TestVllmGenerator: """Tests for the `VllmGenerator` class.""" @pytest.fixture(scope="class") - def config(self, vllm_generator_params) -> typing.Generator[DictConfig, None, None]: + def generator( + self, vllm_generator_params + ) -> typing.Generator[VllmGenerator, None, None]: """Initialise a configuration for testing.""" - yield DictConfig(dict(random_seed=703, generator=vllm_generator_params)) + config = DictConfig(dict(random_seed=703, 
generator=vllm_generator_params)) + yield VllmGenerator(config=config) + + @pytest.fixture(scope="class") + def generator_with_few_max_tokens( + self, vllm_generator_params + ) -> typing.Generator[VllmGenerator, None, None]: + """Initialise a configuration for testing.""" + params = deepcopy(vllm_generator_params) + params["max_tokens"] = 1 + params["port"] = 9998 + config = DictConfig(dict(random_seed=703, generator=params)) + yield VllmGenerator(config=config) def test_is_generator(self) -> None: """Test that the VllmGenerator is a Generator.""" assert issubclass(VllmGenerator, Generator) - def test_initialisation(self, config) -> None: + def test_initialisation(self, generator) -> None: """Test that the generator is initialised correctly.""" - generator = VllmGenerator(config=config) assert generator - def test_generate(self, config, query, documents) -> None: + def test_generate(self, generator, query, documents) -> None: """Test that the generator generates an answer.""" - generator = VllmGenerator(config=config) answer = generator.generate(query=query, documents=documents) expected = GeneratedAnswer(answer="Uerop", sources=["2"]) assert answer == expected - def test_error_if_not_json(self, config, query, documents) -> None: + def test_error_if_not_json( + self, generator_with_few_max_tokens, query, documents + ) -> None: """Test that the generator raises an error if the output is not JSON.""" - old_max_tokens = config.generator.max_tokens - config.generator.max_tokens = 1 - generator = VllmGenerator(config=config) with pytest.raises(ValueError): - generator.generate(query=query, documents=documents) - config.generator.max_tokens = old_max_tokens + generator_with_few_max_tokens.generate(query=query, documents=documents) - def test_error_if_not_valid_types(self, config, query, documents) -> None: + def test_error_if_not_valid_types(self, generator, query, documents) -> None: """Test that the generator raises an error if the output is not JSON.""" - generator = VllmGenerator(config=config) bad_prompt = 'Inkludér kilderne i key\'en "kilder" i stedet for "sources".' 
with pytest.raises(ValueError): generator.generate(query=f"{query}\n{bad_prompt}", documents=documents) From 2c3ff566b2991cb891c8964f650cfcd3f6a5707c Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:26:45 +0200 Subject: [PATCH 51/58] fix: Do not hardcode different ports --- tests/test_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index bc75b8a5..03d6ba82 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -98,7 +98,7 @@ def generator_with_few_max_tokens( """Initialise a configuration for testing.""" params = deepcopy(vllm_generator_params) params["max_tokens"] = 1 - params["port"] = 9998 + params["port"] = params["port"] - 1 config = DictConfig(dict(random_seed=703, generator=params)) yield VllmGenerator(config=config) From 9b2fddc3c2d3260795d0b68518b9af5f4388df15 Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:28:53 +0200 Subject: [PATCH 52/58] tests: Use same VllmGenerator --- tests/test_generator.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 03d6ba82..1cd97e35 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -91,17 +91,6 @@ def generator( config = DictConfig(dict(random_seed=703, generator=vllm_generator_params)) yield VllmGenerator(config=config) - @pytest.fixture(scope="class") - def generator_with_few_max_tokens( - self, vllm_generator_params - ) -> typing.Generator[VllmGenerator, None, None]: - """Initialise a configuration for testing.""" - params = deepcopy(vllm_generator_params) - params["max_tokens"] = 1 - params["port"] = params["port"] - 1 - config = DictConfig(dict(random_seed=703, generator=params)) - yield VllmGenerator(config=config) - def test_is_generator(self) -> None: """Test that the VllmGenerator is a Generator.""" assert issubclass(VllmGenerator, Generator) @@ -116,12 +105,15 @@ def test_generate(self, generator, query, documents) -> None: expected = GeneratedAnswer(answer="Uerop", sources=["2"]) assert answer == expected - def test_error_if_not_json( - self, generator_with_few_max_tokens, query, documents - ) -> None: + def test_error_if_not_json(self, generator, query, documents) -> None: """Test that the generator raises an error if the output is not JSON.""" + old_config = generator.config + config_copy = deepcopy(old_config) + config_copy.generator.max_tokens = 1 + generator.config = config_copy with pytest.raises(ValueError): - generator_with_few_max_tokens.generate(query=query, documents=documents) + generator.generate(query=query, documents=documents) + generator.config = old_config def test_error_if_not_valid_types(self, generator, query, documents) -> None: """Test that the generator raises an error if the output is not JSON.""" From 21abef2c9011e53ec784044ebdbf68463659c91f Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:31:13 +0200 Subject: [PATCH 53/58] tests: Remove validity check test, as it is impossible with VllmGenerator --- tests/test_generator.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 1cd97e35..6da980a9 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -72,7 +72,7 @@ def test_error_if_not_json(self, config, query, documents) -> None: config.generator.max_tokens = old_max_tokens def test_error_if_not_valid_types(self, config, query, 

     def test_error_if_not_valid_types(self, config, query, documents) -> None:
-        """Test that the generator raises an error if the output is not JSON."""
+        """Test that the generator raises an error if the JSON isn't valid."""
         generator = OpenaiGenerator(config=config)
         bad_prompt = 'Inkludér kilderne i key\'en "kilder" i stedet for "sources".'
         with pytest.raises(ValueError):
@@ -114,9 +114,3 @@ def test_error_if_not_json(self, generator, query, documents) -> None:
         with pytest.raises(ValueError):
             generator.generate(query=query, documents=documents)
         generator.config = old_config
-
-    def test_error_if_not_valid_types(self, generator, query, documents) -> None:
-        """Test that the generator raises an error if the output is not JSON."""
-        bad_prompt = 'Inkludér kilderne i key\'en "kilder" i stedet for "sources".'
-        with pytest.raises(ValueError):
-            generator.generate(query=f"{query}\n{bad_prompt}", documents=documents)

From 65e538afbaa6742e47dd4d2e4eee1e89c4b5c33b Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 22 May 2024 13:32:14 +0200
Subject: [PATCH 54/58] tests: Remove random_seed from VllmGenerator config

---
 tests/test_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_generator.py b/tests/test_generator.py
index 6da980a9..4ec769ff 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -88,7 +88,7 @@ def generator(
         self, vllm_generator_params
     ) -> typing.Generator[VllmGenerator, None, None]:
         """Initialise a configuration for testing."""
-        config = DictConfig(dict(random_seed=703, generator=vllm_generator_params))
+        config = DictConfig(dict(generator=vllm_generator_params))
         yield VllmGenerator(config=config)

From 6facb53f3da707053b8c505328affe48c75a43f4 Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 22 May 2024 13:34:02 +0200
Subject: [PATCH 55/58] docs: Add comments

---
 src/ragger/generator.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/ragger/generator.py b/src/ragger/generator.py
index 8f6e788a..a613d4d0 100644
--- a/src/ragger/generator.py
+++ b/src/ragger/generator.py
@@ -221,6 +221,8 @@ def start_inference_server(self) -> subprocess.Popen:
             The inference server process.
         """
         logger.info("Starting vLLM server...")
+
+        # Start server using the vLLM entrypoint
         process = subprocess.Popen(
             args=[
                 "python",
@@ -242,6 +244,8 @@ def start_inference_server(self) -> subprocess.Popen:
             stdout=subprocess.DEVNULL,
             stderr=subprocess.PIPE,
         )
+
+        # Wait for the server to start
         stderr = process.stderr
         assert stderr is not None
         for seconds in range(self.config.generator.timeout):
@@ -252,6 +256,7 @@ def start_inference_server(self) -> subprocess.Popen:
             sleep(1)
         else:
             raise RuntimeError("vLLM server failed to start.")
+
         return process

     def __del__(self) -> None:

From aa64ac495842c11a37b099200f7c40ef2179956a Mon Sep 17 00:00:00 2001
From: Dan Saattrup Nielsen
Date: Wed, 22 May 2024 13:36:57 +0200
Subject: [PATCH 56/58] fix: Raise ValueError in get_component_by_name if module or class don't exist

---
 src/ragger/utils.py | 18 +++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/ragger/utils.py b/src/ragger/utils.py
index b0a149f4..d58aad52 100644
--- a/src/ragger/utils.py
+++ b/src/ragger/utils.py
@@ -105,15 +105,27 @@ def get_component_by_name(class_name: str, component_type: str) -> Type:
     Returns:
         The class.
+
+    Raises:
+        ValueError:
+            If the module or class cannot be found.
""" # Get the snake_case and PascalCase version of the class name full_class_name = f"{class_name}_{component_type}" name_pascal = snake_to_pascal(snake_string=full_class_name) - # Get the class from the module + # Get the module module_name = f"ragger.{component_type}" - module = importlib.import_module(name=module_name) - class_: Type = getattr(module, name_pascal) + try: + module = importlib.import_module(name=module_name) + except ModuleNotFoundError: + raise ValueError(f"Module {module_name!r}' not found.") + + # Get the class from the module + try: + class_: Type = getattr(module, name_pascal) + except AttributeError: + raise ValueError(f"Class {name_pascal!r} not found in module {module_name!r}.") return class_ From 213950bd7f114cff136105de36d2a641291da6d5 Mon Sep 17 00:00:00 2001 From: jDan Saattrup Nielsen Date: Wed, 22 May 2024 11:38:41 +0000 Subject: [PATCH 57/58] docs: Update coverage badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index de647d4b..a4087550 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A repository for general-purpose RAG applications. ______________________________________________________________________ -[![Code Coverage](https://img.shields.io/badge/Coverage-68%25-yellow.svg)](https://github.com/alexandrainst/ragger/tree/main/tests) +[![Code Coverage](https://img.shields.io/badge/Coverage-70%25-yellow.svg)](https://github.com/alexandrainst/ragger/tree/main/tests) Developer(s): From 46c61efdbf6b977087a5632aa5cec2394acbd12b Mon Sep 17 00:00:00 2001 From: Dan Saattrup Nielsen Date: Wed, 22 May 2024 13:39:35 +0200 Subject: [PATCH 58/58] chore: Re-instate pre-commit hook --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4564770..13ef8b68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: hooks: - id: end-of-file-fixer - id: trailing-whitespace - #- id: debug-statements + - id: debug-statements - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.3.2 hooks: