-
Notifications
You must be signed in to change notification settings - Fork 761
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[#432] Add Groq Provider - chat completions #609
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the terms described in the LICENSE file in | ||
# the root directory of this source tree. | ||
|
||
from pydantic import BaseModel | ||
|
||
from llama_stack.apis.inference import Inference | ||
|
||
from .config import GroqConfig | ||
|
||
|
||
# Pydantic model with a single required string field, ``groq_api_key``.
# NOTE(review): presumably this validates the per-request provider data that the
# Groq adapter reads (it accesses ``provider_data.groq_api_key``) -- confirm wiring.
class GroqProviderDataValidator(BaseModel):
    groq_api_key: str
|
||
|
||
async def get_adapter_impl(config: GroqConfig, _deps) -> Inference:
    """Build the Groq inference adapter for the given provider config.

    Args:
        config: Groq provider configuration; must be a ``GroqConfig``.
        _deps: Accepted but not used by this function.

    Returns:
        A ``GroqInferenceAdapter`` constructed from ``config``.

    Raises:
        RuntimeError: If ``config`` is not a ``GroqConfig`` instance.
    """
    # Deferred import: the adapter module is loaded only when an adapter is
    # actually requested.
    from .groq import GroqInferenceAdapter

    if isinstance(config, GroqConfig):
        return GroqInferenceAdapter(config)
    raise RuntimeError(f"Unexpected config type: {type(config)}")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the terms described in the LICENSE file in | ||
# the root directory of this source tree. | ||
|
||
from typing import Optional | ||
|
||
from llama_models.schema_utils import json_schema_type | ||
from pydantic import BaseModel, Field | ||
|
||
|
||
# Configuration model for the Groq provider.
@json_schema_type
class GroqConfig(BaseModel):
    # Optional API key; ``None`` when no key is configured here.
    # The Groq client library loads the GROQ_API_KEY environment variable by default
    api_key: Optional[str] = Field(
        description="The Groq API key",
        default=None,
    )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the terms described in the LICENSE file in | ||
# the root directory of this source tree. | ||
|
||
import warnings | ||
from typing import AsyncIterator, List, Optional, Union | ||
|
||
from groq import Groq | ||
from llama_models.datatypes import SamplingParams | ||
from llama_models.llama3.api.datatypes import ToolDefinition, ToolPromptFormat | ||
from llama_models.sku_list import CoreModelId | ||
|
||
from llama_stack.apis.inference import ( | ||
ChatCompletionRequest, | ||
ChatCompletionResponse, | ||
ChatCompletionResponseStreamChunk, | ||
CompletionResponse, | ||
CompletionResponseStreamChunk, | ||
EmbeddingsResponse, | ||
Inference, | ||
InterleavedContent, | ||
LogProbConfig, | ||
Message, | ||
ResponseFormat, | ||
ToolChoice, | ||
) | ||
from llama_stack.distribution.request_headers import NeedsRequestProviderData | ||
from llama_stack.providers.remote.inference.groq.config import GroqConfig | ||
from llama_stack.providers.utils.inference.model_registry import ( | ||
build_model_alias, | ||
build_model_alias_with_just_provider_model_id, | ||
ModelRegistryHelper, | ||
) | ||
from .groq_utils import ( | ||
convert_chat_completion_request, | ||
convert_chat_completion_response, | ||
convert_chat_completion_response_stream, | ||
) | ||
|
||
# Groq provider model ids paired with the core Llama model ids they are
# registered under. Order is preserved from the original list.
_MODEL_ALIASES = [
    build_model_alias("llama3-8b-8192", CoreModelId.llama3_1_8b_instruct.value),
    build_model_alias_with_just_provider_model_id(
        "llama-3.1-8b-instant", CoreModelId.llama3_1_8b_instruct.value
    ),
    build_model_alias("llama3-70b-8192", CoreModelId.llama3_70b_instruct.value),
    # NOTE(review): the meaning of Groq's "-versatile" suffix was asked about in
    # review and not confirmed.
    build_model_alias(
        "llama-3.3-70b-versatile", CoreModelId.llama3_3_70b_instruct.value
    ),
    # Groq only contains a preview version for llama-3.2-3b
    # Preview models aren't recommended for production use, but we include this one
    # to pass the test fixture
    # TODO(aidand): Replace this with a stable model once Groq supports it
    build_model_alias("llama-3.2-3b-preview", CoreModelId.llama3_2_3b_instruct.value),
]
|
||
|
||
class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderData):
    """Inference adapter that sends chat completions to Groq.

    Only ``chat_completion`` is implemented; ``completion`` and ``embeddings``
    raise ``NotImplementedError``. The API key is taken from the provider
    config when set, otherwise from the per-request provider data.
    """

    _config: GroqConfig

    def __init__(self, config: GroqConfig):
        """Register the Groq model aliases and keep the provider config."""
        ModelRegistryHelper.__init__(self, model_aliases=_MODEL_ALIASES)
        self._config = config

    def completion(
        self,
        model_id: str,
        content: InterleavedContent,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
        """Not implemented: always raises ``NotImplementedError``."""
        # Groq doesn't support non-chat completion as of time of writing
        raise NotImplementedError()

    async def chat_completion(
        self,
        model_id: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[
            ToolPromptFormat
        ] = None,  # API default is ToolPromptFormat.json, we default to None to detect user input
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[
        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
    ]:
        """Run a chat completion against Groq.

        The model id is first resolved to the provider model id. The arguments
        are packed into a ``ChatCompletionRequest``, converted to Groq request
        parameters, and sent through the Groq client.

        Returns:
            The converted response stream when ``stream`` is truthy, otherwise
            the converted single response.

        Raises:
            ValueError: If no API key is available (see ``_get_client``).
        """
        model_id = self.get_provider_model_id(model_id)
        if model_id == "llama-3.2-3b-preview":
            warnings.warn(
                "Groq only contains a preview version for llama-3.2-3b-instruct. "
                "Preview models aren't recommended for production use. "
                "They can be discontinued on short notice."
            )

        request = convert_chat_completion_request(
            request=ChatCompletionRequest(
                model=model_id,
                messages=messages,
                sampling_params=sampling_params,
                response_format=response_format,
                tools=tools,
                tool_choice=tool_choice,
                tool_prompt_format=tool_prompt_format,
                stream=stream,
                logprobs=logprobs,
            )
        )

        response = self._get_client().chat.completions.create(**request)

        if stream:
            return convert_chat_completion_response_stream(response)
        return convert_chat_completion_response(response)

    async def embeddings(
        self,
        model_id: str,
        contents: List[InterleavedContent],
    ) -> EmbeddingsResponse:
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def _get_client(self) -> Groq:
        """Create a Groq client using the configured or per-request API key.

        Returns:
            A ``Groq`` client built with the config's ``api_key`` when it is
            not ``None``; otherwise with ``groq_api_key`` from the request
            provider data.

        Raises:
            ValueError: If the config has no key and the request provider data
                is missing or carries an empty ``groq_api_key``.
        """
        if self._config.api_key is not None:
            # The constructor stores the config as ``_config``; reading
            # ``self.config`` here was a typo that broke the configured-key path.
            return Groq(api_key=self._config.api_key)

        provider_data = self.get_request_provider_data()
        if provider_data is None or not provider_data.groq_api_key:
            raise ValueError(
                'Pass Groq API Key in the header X-LlamaStack-ProviderData as { "groq_api_key": "<your api key>" }'
            )
        return Groq(api_key=provider_data.groq_api_key)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the terms described in the LICENSE file in | ||
# the root directory of this source tree. | ||
|
||
import warnings | ||
from typing import AsyncGenerator, Literal | ||
|
||
from groq import Stream | ||
from groq.types.chat.chat_completion import ChatCompletion | ||
from groq.types.chat.chat_completion_assistant_message_param import ( | ||
ChatCompletionAssistantMessageParam, | ||
) | ||
from groq.types.chat.chat_completion_chunk import ChatCompletionChunk | ||
from groq.types.chat.chat_completion_message_param import ChatCompletionMessageParam | ||
from groq.types.chat.chat_completion_system_message_param import ( | ||
ChatCompletionSystemMessageParam, | ||
) | ||
from groq.types.chat.chat_completion_user_message_param import ( | ||
ChatCompletionUserMessageParam, | ||
) | ||
|
||
from groq.types.chat.completion_create_params import CompletionCreateParams | ||
|
||
from llama_stack.apis.inference import ( | ||
ChatCompletionRequest, | ||
ChatCompletionResponse, | ||
ChatCompletionResponseEvent, | ||
ChatCompletionResponseEventType, | ||
ChatCompletionResponseStreamChunk, | ||
CompletionMessage, | ||
Message, | ||
StopReason, | ||
) | ||
|
||
|
||
def convert_chat_completion_request(
    request: ChatCompletionRequest,
) -> CompletionCreateParams:
    """
    Translate a ChatCompletionRequest into Groq chat-completion parameters.

    Features that are not forwarded to Groq (logprobs, response_format, a
    non-default repetition_penalty, tools) trigger a warning instead of an
    error; the request is still converted without them.
    """

    if request.logprobs:
        # Groq doesn't support logprobs at the time of writing
        warnings.warn("logprobs are not supported yet")

    if request.response_format:
        # Groq's JSON mode is beta at the time of writing
        warnings.warn("response_format is not supported yet")

    sampling = request.sampling_params
    if sampling.repetition_penalty != 1.0:
        # Groq exposes frequency_penalty, but it does not appear to mean the same
        # thing as sampling_params.repetition_penalty:
        #   - frequency_penalty defaults to 0 and is a float between -2.0 and 2.0
        #   - repetition_penalty defaults to 1 and is often set between 1.0 and 2.0
        # so it is left out for now.
        warnings.warn("repetition_penalty is not supported")

    if request.tools:
        # Tool calling is planned for a follow-up change (per PR review discussion).
        warnings.warn("tools are not supported yet")

    groq_messages = [_convert_message(msg) for msg in request.messages]
    return CompletionCreateParams(
        model=request.model,
        messages=groq_messages,
        logprobs=None,
        frequency_penalty=None,
        stream=request.stream,
        # A falsy max_tokens (e.g. 0) is sent as None.
        max_tokens=sampling.max_tokens or None,
        temperature=sampling.temperature,
        top_p=sampling.top_p,
    )
|
||
|
||
def _convert_message(message: Message) -> ChatCompletionMessageParam:
    """Convert a message into the Groq message param matching its role.

    Supported roles are "system", "user" and "assistant"; the message content
    is passed through unchanged.

    Raises:
        ValueError: If the message role is none of the supported roles.
    """
    param_types_by_role = (
        ("system", ChatCompletionSystemMessageParam),
        ("user", ChatCompletionUserMessageParam),
        ("assistant", ChatCompletionAssistantMessageParam),
    )
    for role_name, param_type in param_types_by_role:
        if message.role == role_name:
            return param_type(role=role_name, content=message.content)
    raise ValueError(f"Invalid message role: {message.role}")
|
||
|
||
def convert_chat_completion_response(
    response: ChatCompletion,
) -> ChatCompletionResponse:
    """Convert a non-streaming Groq chat completion into a ChatCompletionResponse.

    Only the first choice is used; its content and mapped finish reason become
    the completion message.
    """
    # groq only supports n=1 at time of writing, so there is only one choice
    first_choice = response.choices[0]
    content = first_choice.message.content
    stop_reason = _map_finish_reason_to_stop_reason(first_choice.finish_reason)
    completion_message = CompletionMessage(content=content, stop_reason=stop_reason)
    return ChatCompletionResponse(completion_message=completion_message)
|
||
|
||
def _map_finish_reason_to_stop_reason(
    finish_reason: Literal["stop", "length", "tool_calls"]
) -> StopReason:
    """
    Map a Groq chat completion finish_reason onto a StopReason.

    finish_reason: Literal["stop", "length", "tool_calls"]
    - stop -> model hit a natural stop point or a provided stop sequence
    - length -> maximum number of tokens specified in the request was reached
    - tool_calls -> model called a tool

    Raises NotImplementedError for "tool_calls" and ValueError for any other
    unrecognised value.
    """
    if finish_reason == "stop":
        return StopReason.end_of_turn
    if finish_reason == "length":
        return StopReason.out_of_tokens
    if finish_reason == "tool_calls":
        # Per PR review: not expected to be reachable yet, because tools are not
        # forwarded to Groq in the request.
        raise NotImplementedError("tool_calls is not supported yet")
    raise ValueError(f"Invalid finish reason: {finish_reason}")
|
||
|
||
# TODO(review): a reviewer suggested moving this into a general util module in a
# later change; whether it is too coupled to Groq types for that was left open.
async def convert_chat_completion_response_stream(
    stream: Stream[ChatCompletionChunk],
) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
    """Convert a Groq chat completion stream into response stream chunks.

    Yields one chunk per Groq chunk (the first with event type ``start``, the
    rest ``progress``), using the first choice's delta content or "" when it
    is empty. A final ``complete`` chunk carries the last finish reason seen,
    mapped to a stop reason.

    If no chunk carried a finish reason (including an empty stream), the final
    chunk is emitted with ``stop_reason=None`` instead of failing on an unbound
    local variable.
    """
    # Bound up front so the final event can always be built.
    # NOTE(review): assumes ChatCompletionResponseEvent accepts stop_reason=None;
    # the progress events below already omit it -- confirm against the schema.
    stop_reason = None

    event_type = ChatCompletionResponseEventType.start
    # The stream is consumed with a plain (synchronous) for loop.
    for chunk in stream:
        choice = chunk.choices[0]

        # We assume there's only one finish_reason for the entire stream.
        # We collect the last finish_reason
        if choice.finish_reason:
            stop_reason = _map_finish_reason_to_stop_reason(choice.finish_reason)

        yield ChatCompletionResponseStreamChunk(
            event=ChatCompletionResponseEvent(
                event_type=event_type,
                delta=choice.delta.content or "",
                logprobs=None,
            )
        )
        # Every event after the first one is a progress event.
        event_type = ChatCompletionResponseEventType.progress

    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.complete,
            delta="",
            logprobs=None,
            stop_reason=stop_reason,
        )
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit - the groq library will read GROQ_API_KEY env (https://github.com/groq/groq-python/blob/main/src/groq/_client.py#L86), consider adding a comment here so people in the LS codebase know this expectation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's not rely on environment variables for code that we expect to run in llama-stack server. We would want to take in the api key as a config variable in run.yaml when we spin up the server
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@raghotham, I believe that's the behaviour at the moment. This is how fireworks and together define their configs:
llama-stack/llama_stack/providers/remote/inference/fireworks/config.py
Lines 19 to 22 in 96e158e
llama-stack/llama_stack/providers/remote/inference/together/config.py
Lines 19 to 22 in 6765fd7
And it's in the run.yaml that you define the environment variable:
llama-stack/llama_stack/templates/together/run.yaml
Lines 18 to 20 in 516e1a3
wdyt?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes @aidando73, this is correct. Note that both Together and Fireworks also support grabbing the API key from headers via the NeedsProviderData mixin. You can add that if you feel like it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done - added the mixin
Added some client code in the test plan as well to test