Complete 'requests' library removal (BerriAI#7350)
* refactor: initial commit moving watsonx_text to base_llm_http_handler + clarifying new provider directory structure

* refactor(watsonx/completion/handler.py): move to using base llm http handler

removes 'requests' library usage

* fix(watsonx_text/transformation.py): fix result transformation

migrates to transformation.py, for usage with base llm http handler

* fix(streaming_handler.py): migrate watsonx streaming to transformation.py

ensures streaming works with base llm http handler

* fix(streaming_handler.py): fix streaming linting errors and remove watsonx conditional logic

* fix(watsonx/): fix chat route post completion route refactor

* refactor(watsonx/embed): refactor watsonx to use base llm http handler for embedding calls as well

* refactor(base.py): remove requests library usage from litellm

* build(pyproject.toml): remove requests library usage

* fix: fix linting errors

* fix: fix linting errors

* fix(types/utils.py): fix validation errors for modelresponsestream

* fix(replicate/handler.py): fix linting errors

* fix(litellm_logging.py): handle modelresponsestream object

* fix(streaming_handler.py): fix modelresponsestream args

* fix: remove unused imports

* test: fix test

* fix: fix test

* test: fix test

* test: fix tests

* test: fix test

* test: fix patch target

* test: fix test
krrishdholakia authored Dec 22, 2024
1 parent 8b1ea40 commit 3671829
Showing 39 changed files with 2,147 additions and 2,279 deletions.
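For context, here is a minimal sketch (not part of the diff) of the call paths this commit refactors: watsonx completion, chat, and embedding requests now go through litellm's shared httpx-based handler rather than the `requests` library. The model ids and environment variable names below are illustrative placeholders, not values taken from this commit.

```python
# Illustrative usage only; model ids and env var names are placeholders.
import os

import litellm

os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com"  # placeholder
os.environ["WATSONX_APIKEY"] = "<api-key>"                       # placeholder
os.environ["WATSONX_PROJECT_ID"] = "<project-id>"                # placeholder

# Chat/text completion (watsonx/completion + watsonx/chat refactor)
response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "Hello, watsonx"}],
)
print(response.choices[0].message.content)

# Embeddings (watsonx/embed refactor)
embedding = litellm.embedding(
    model="watsonx/ibm/slate-30m-english-rtrvr",  # placeholder model id
    input=["vectorize this sentence"],
)
print(len(embedding.data))
```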
1 change: 1 addition & 0 deletions .gitignore
@@ -67,3 +67,4 @@ litellm/tests/langfuse.log
litellm/proxy/google-cloud-sdk/*
tests/llm_translation/log.txt
venv/
tests/local_testing/log.txt
8 changes: 4 additions & 4 deletions docs/my-website/docs/adding_provider/directory_structure.md
@@ -5,16 +5,16 @@ When adding a new provider, you need to create a directory for the provider that
```
litellm/llms/
└── provider_name/
├── completion/
├── completion/ # use when endpoint is equivalent to openai's `/v1/completions`
│ ├── handler.py
│ └── transformation.py
├── chat/
├── chat/ # use when endpoint is equivalent to openai's `/v1/chat/completions`
│ ├── handler.py
│ └── transformation.py
├── embed/
├── embed/ # use when endpoint is equivalent to openai's `/v1/embeddings`
│ ├── handler.py
│ └── transformation.py
└── rerank/
└── rerank/ # use when endpoint is equivalent to cohere's `/rerank` endpoint.
├── handler.py
└── transformation.py
```
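As a rough illustration of the layout above, a new provider's `chat/transformation.py` might look like the hypothetical skeleton below. The class name, method signatures, and request shape are assumptions for illustration; the real contract is defined by the base classes under `litellm/llms/base_llm/`.

```python
class MyProviderChatConfig:
    """Hypothetical sketch: maps litellm's OpenAI-style chat inputs to a provider's wire format."""

    def validate_environment(self, headers: dict, api_key: str) -> dict:
        # Attach provider auth; the header name is illustrative only.
        headers["Authorization"] = f"Bearer {api_key}"
        return headers

    def transform_request(
        self, model: str, messages: list, optional_params: dict
    ) -> dict:
        # Convert OpenAI-style messages into a provider-specific request body.
        return {
            "model_id": model,
            "input": [{"role": m["role"], "content": m["content"]} for m in messages],
            **optional_params,
        }
```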
2 changes: 2 additions & 0 deletions litellm/__init__.py
@@ -991,6 +991,7 @@ def add_known_models():
get_api_base,
get_first_chars_messages,
ModelResponse,
ModelResponseStream,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
@@ -1157,6 +1158,7 @@ def add_known_models():
from .llms.azure.chat.o1_transformation import AzureOpenAIO1Config
from .llms.watsonx.completion.transformation import IBMWatsonXAIConfig
from .llms.watsonx.chat.transformation import IBMWatsonXChatConfig
from .llms.watsonx.embed.transformation import IBMWatsonXEmbeddingConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (
11 changes: 5 additions & 6 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -43,6 +43,7 @@
ImageResponse,
LiteLLMLoggingBaseClass,
ModelResponse,
ModelResponseStream,
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
@@ -741,6 +742,7 @@ def _response_cost_calculator(
self,
result: Union[
ModelResponse,
ModelResponseStream,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
@@ -848,6 +850,7 @@ def _success_handler_helper_fn(
): # handle streaming separately
if (
isinstance(result, ModelResponse)
or isinstance(result, ModelResponseStream)
or isinstance(result, EmbeddingResponse)
or isinstance(result, ImageResponse)
or isinstance(result, TranscriptionResponse)
@@ -955,6 +958,7 @@ def success_handler( # noqa: PLR0915
if self.stream and (
isinstance(result, litellm.ModelResponse)
or isinstance(result, TextCompletionResponse)
or isinstance(result, ModelResponseStream)
):
complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
@@ -966,19 +970,13 @@
streaming_chunks=self.sync_streaming_chunks,
is_async=False,
)
_caching_complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
] = None
if complete_streaming_response is not None:
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
_caching_complete_streaming_response = copy.deepcopy(
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
@@ -1474,6 +1472,7 @@ async def async_success_handler( # noqa: PLR0915
] = None
if self.stream is True and (
isinstance(result, litellm.ModelResponse)
or isinstance(result, litellm.ModelResponseStream)
or isinstance(result, TextCompletionResponse)
):
complete_streaming_response: Optional[
8 changes: 6 additions & 2 deletions litellm/litellm_core_utils/logging_utils.py
@@ -2,7 +2,11 @@
from typing import TYPE_CHECKING, Any, List, Optional, Union

from litellm._logging import verbose_logger
from litellm.types.utils import ModelResponse, TextCompletionResponse
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
TextCompletionResponse,
)

if TYPE_CHECKING:
from litellm import ModelResponse as _ModelResponse
@@ -38,7 +42,7 @@ def convert_litellm_response_object_to_str(


def _assemble_complete_response_from_streaming_chunks(
result: Union[ModelResponse, TextCompletionResponse],
result: Union[ModelResponse, TextCompletionResponse, ModelResponseStream],
start_time: datetime,
end_time: datetime,
request_kwargs: dict,
118 changes: 47 additions & 71 deletions litellm/litellm_core_utils/streaming_handler.py
@@ -5,7 +5,7 @@
import traceback
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, List, Optional
from typing import Any, Callable, Dict, List, Optional, cast

import httpx
from pydantic import BaseModel
@@ -611,44 +611,6 @@ def handle_ollama_chat_stream(self, chunk):
except Exception as e:
raise e

def handle_watsonx_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "generated_text" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {
"text": "",
"is_finished": False,
"prompt_tokens": 0,
"completion_tokens": 0,
}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
results = parsed_response.get("results", [])
if len(results) > 0:
text = results[0].get("generated_text", "")
finish_reason = results[0].get("stop_reason")
is_finished = finish_reason != "not_finished"
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": results[0].get("input_token_count", 0),
"completion_tokens": results[0].get("generated_token_count", 0),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e

def handle_triton_stream(self, chunk):
try:
if isinstance(chunk, dict):
@@ -702,9 +664,18 @@ def model_response_creator(
# pop model keyword
chunk.pop("model", None)

model_response = ModelResponse(
stream=True, model=_model, stream_options=self.stream_options, **chunk
)
chunk_dict = {}
for key, value in chunk.items():
if key != "stream":
chunk_dict[key] = value

args = {
"model": _model,
"stream_options": self.stream_options,
**chunk_dict,
}

model_response = ModelResponseStream(**args)
if self.response_id is not None:
model_response.id = self.response_id
else:
Expand Down Expand Up @@ -742,9 +713,9 @@ def is_delta_empty(self, delta: Delta) -> bool:

def return_processed_chunk_logic( # noqa
self,
completion_obj: dict,
completion_obj: Dict[str, Any],
model_response: ModelResponseStream,
response_obj: dict,
response_obj: Dict[str, Any],
):

print_verbose(
@@ -887,11 +858,11 @@

def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: dict = {}
response_obj: Dict[str, Any] = {}

try:
# return this for all models
completion_obj = {"content": ""}
completion_obj: Dict[str, Any] = {"content": ""}
from litellm.types.utils import GenericStreamingChunk as GChunk

if (
@@ -1089,11 +1060,6 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "watsonx":
response_obj = self.handle_watsonx_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "triton":
response_obj = self.handle_triton_stream(chunk)
completion_obj["content"] = response_obj["text"]
@@ -1158,7 +1124,7 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
self.received_finish_reason = response_obj["finish_reason"]
else: # openai / azure chat model
if self.custom_llm_provider == "azure":
if hasattr(chunk, "model"):
if isinstance(chunk, BaseModel) and hasattr(chunk, "model"):
# for azure, we need to pass the model from the orignal chunk
self.model = chunk.model
response_obj = self.handle_openai_chat_completion_chunk(chunk)
Expand Down Expand Up @@ -1190,21 +1156,29 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915

if response_obj["usage"] is not None:
if isinstance(response_obj["usage"], dict):
model_response.usage = litellm.Usage(
prompt_tokens=response_obj["usage"].get(
"prompt_tokens", None
)
or None,
completion_tokens=response_obj["usage"].get(
"completion_tokens", None
)
or None,
total_tokens=response_obj["usage"].get("total_tokens", None)
or None,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=response_obj["usage"].get(
"prompt_tokens", None
)
or None,
completion_tokens=response_obj["usage"].get(
"completion_tokens", None
)
or None,
total_tokens=response_obj["usage"].get(
"total_tokens", None
)
or None,
),
)
elif isinstance(response_obj["usage"], BaseModel):
model_response.usage = litellm.Usage(
**response_obj["usage"].model_dump()
setattr(
model_response,
"usage",
litellm.Usage(**response_obj["usage"].model_dump()),
)

model_response.model = self.model
@@ -1337,7 +1311,7 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
raise StopIteration
except Exception as e:
traceback.format_exc()
e.message = str(e)
setattr(e, "message", str(e))
raise exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
@@ -1434,7 +1408,9 @@ def __next__(self): # noqa: PLR0915
print_verbose(
f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
)
response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
response: Optional[ModelResponseStream] = self.chunk_creator(
chunk=chunk
)
print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")

if response is None:
@@ -1597,7 +1573,7 @@ async def __anext__(self): # noqa: PLR0915
# __anext__ also calls async_success_handler, which does logging
print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")

processed_chunk: Optional[ModelResponse] = self.chunk_creator(
processed_chunk: Optional[ModelResponseStream] = self.chunk_creator(
chunk=chunk
)
print_verbose(
@@ -1624,7 +1600,7 @@ async def __anext__(self): # noqa: PLR0915
if self.logging_obj._llm_caching_handler is not None:
asyncio.create_task(
self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
processed_chunk=processed_chunk,
processed_chunk=cast(ModelResponse, processed_chunk),
)
)

@@ -1663,8 +1639,8 @@ async def __anext__(self): # noqa: PLR0915
chunk = next(self.completion_stream)
if chunk is not None and chunk != b"":
print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
processed_chunk: Optional[ModelResponse] = self.chunk_creator(
chunk=chunk
processed_chunk: Optional[ModelResponseStream] = (
self.chunk_creator(chunk=chunk)
)
print_verbose(
f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
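A hedged consumer-side view of the streaming change in this file: chunks built by `CustomStreamWrapper.chunk_creator` are now typed as `ModelResponseStream` instead of `ModelResponse`, while keeping the OpenAI-style `choices[0].delta` shape. The model id below is a placeholder.

```python
# Illustrative only; the model id is a placeholder.
import litellm
from litellm.types.utils import ModelResponseStream

stream = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "stream a short answer"}],
    stream=True,
)

for chunk in stream:
    if isinstance(chunk, ModelResponseStream):  # the new per-chunk type
        print(chunk.choices[0].delta.content or "", end="")
```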
5 changes: 2 additions & 3 deletions litellm/llms/base.py
@@ -2,7 +2,6 @@
from typing import Any, Optional, Union

import httpx
import requests

import litellm
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
@@ -16,7 +15,7 @@ class BaseLLM:
def process_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
response: httpx.Response,
model_response: ModelResponse,
stream: bool,
logging_obj: Any,
@@ -35,7 +34,7 @@ def process_response(
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
response: httpx.Response,
model_response: TextCompletionResponse,
stream: bool,
logging_obj: Any,
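Since `requests.Response` is replaced by `httpx.Response` in these signatures, a rough sketch of the httpx-only request pattern is shown below. The endpoint, headers, and payload are placeholders and are not taken from litellm's handlers.

```python
# Minimal httpx request/response round trip; all values are placeholders.
import httpx

with httpx.Client(timeout=30.0) as client:
    resp: httpx.Response = client.post(
        "https://api.example-provider.com/v1/generate",  # placeholder endpoint
        headers={"Authorization": "Bearer <api-key>"},
        json={"prompt": "hello"},
    )
    resp.raise_for_status()
    data = resp.json()  # same .json()/.status_code surface as requests.Response
```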
8 changes: 7 additions & 1 deletion litellm/llms/base_llm/chat/transformation.py
@@ -107,7 +107,13 @@ def validate_environment(
) -> dict:
pass

def get_complete_url(self, api_base: str, model: str) -> str:
def get_complete_url(
self,
api_base: str,
model: str,
optional_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
OPTIONAL
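An illustrative (not authoritative) override of the extended `get_complete_url` hook: the new `optional_params` and `stream` arguments let a provider choose its endpoint per request. The paths and the `api_version` parameter below are placeholders.

```python
# Hypothetical provider config; paths and query parameters are placeholders.
from typing import Optional


class MyProviderConfig:
    def get_complete_url(
        self,
        api_base: str,
        model: str,
        optional_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        # Pick a streaming vs. non-streaming path and append an API version
        # pulled from optional_params; both are placeholder conventions.
        base = api_base.rstrip("/")
        path = "/v1/text/generation_stream" if stream else "/v1/text/generation"
        version = optional_params.get("api_version", "2024-03-14")
        return f"{base}{path}?version={version}"
```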