Complete 'requests' library removal (BerriAI#7350)
* refactor: initial commit moving watsonx_text to base_llm_http_handler + clarifying new provider directory structure

* refactor(watsonx/completion/handler.py): move to using base llm http handler

removes 'requests' library usage

* fix(watsonx_text/transformation.py): fix result transformation

migrates to transformation.py, for usage with base llm http handler

* fix(streaming_handler.py): migrate watsonx streaming to transformation.py

ensures streaming works with base llm http handler

* fix(streaming_handler.py): fix streaming linting errors and remove watsonx conditional logic

* fix(watsonx/): fix chat route post completion route refactor

* refactor(watsonx/embed): refactor watsonx to use base llm http handler for embedding calls as well

* refactor(base.py): remove requests library usage from litellm

* build(pyproject.toml): remove requests library usage

* fix: fix linting errors

* fix: fix linting errors

* fix(types/utils.py): fix validation errors for modelresponsestream

* fix(replicate/handler.py): fix linting errors

* fix(litellm_logging.py): handle modelresponsestream object

* fix(streaming_handler.py): fix modelresponsestream args

* fix: remove unused imports

* test: fix test

* fix: fix test

* test: fix test

* test: fix tests

* test: fix test

* test: fix patch target

* test: fix test
krrishdholakia authored Dec 22, 2024
1 parent 8b1ea40 commit 3671829
Showing 39 changed files with 2,147 additions and 2,279 deletions.
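For context, here is a minimal sketch (not part of the diff) of the call paths this commit refactors: watsonx completion, chat, and embedding requests now go through litellm's shared httpx-based handler rather than the `requests` library. The model ids and environment variable names below are illustrative placeholders, not values taken from this commit.

```python
# Illustrative usage only; model ids and env var names are placeholders.
import os

import litellm

os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com"  # placeholder
os.environ["WATSONX_APIKEY"] = "<api-key>"                       # placeholder
os.environ["WATSONX_PROJECT_ID"] = "<project-id>"                # placeholder

# Chat/text completion (watsonx/completion + watsonx/chat refactor)
response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "Hello, watsonx"}],
)
print(response.choices[0].message.content)

# Embeddings (watsonx/embed refactor)
embedding = litellm.embedding(
    model="watsonx/ibm/slate-30m-english-rtrvr",  # placeholder model id
    input=["vectorize this sentence"],
)
print(len(embedding.data))
```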
1 change: 1 addition & 0 deletions .gitignore
@@ -67,3 +67,4 @@ litellm/tests/langfuse.log
litellm/proxy/google-cloud-sdk/*
tests/llm_translation/log.txt
venv/
tests/local_testing/log.txt
8 changes: 4 additions & 4 deletions docs/my-website/docs/adding_provider/directory_structure.md
@@ -5,16 +5,16 @@ When adding a new provider, you need to create a directory for the provider that
```
litellm/llms/
└── provider_name/
├── completion/
├── completion/ # use when endpoint is equivalent to openai's `/v1/completions`
│ ├── handler.py
│ └── transformation.py
├── chat/
├── chat/ # use when endpoint is equivalent to openai's `/v1/chat/completions`
│ ├── handler.py
│ └── transformation.py
├── embed/
├── embed/ # use when endpoint is equivalent to openai's `/v1/embeddings`
│ ├── handler.py
│ └── transformation.py
└── rerank/
└── rerank/ # use when endpoint is equivalent to cohere's `/rerank` endpoint.
├── handler.py
└── transformation.py
```
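As a rough illustration of the layout above, a new provider's `chat/transformation.py` might look like the hypothetical skeleton below. The class name, method signatures, and request shape are assumptions for illustration; the real contract is defined by the base classes under `litellm/llms/base_llm/`.

```python
class MyProviderChatConfig:
    """Hypothetical sketch: maps litellm's OpenAI-style chat inputs to a provider's wire format."""

    def validate_environment(self, headers: dict, api_key: str) -> dict:
        # Attach provider auth; the header name is illustrative only.
        headers["Authorization"] = f"Bearer {api_key}"
        return headers

    def transform_request(
        self, model: str, messages: list, optional_params: dict
    ) -> dict:
        # Convert OpenAI-style messages into a provider-specific request body.
        return {
            "model_id": model,
            "input": [{"role": m["role"], "content": m["content"]} for m in messages],
            **optional_params,
        }
```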
2 changes: 2 additions & 0 deletions litellm/__init__.py
@@ -991,6 +991,7 @@ def add_known_models():
get_api_base,
get_first_chars_messages,
ModelResponse,
ModelResponseStream,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
@@ -1157,6 +1158,7 @@ def add_known_models():
from .llms.azure.chat.o1_transformation import AzureOpenAIO1Config
from .llms.watsonx.completion.transformation import IBMWatsonXAIConfig
from .llms.watsonx.chat.transformation import IBMWatsonXChatConfig
from .llms.watsonx.embed.transformation import IBMWatsonXEmbeddingConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (
11 changes: 5 additions & 6 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -43,6 +43,7 @@
ImageResponse,
LiteLLMLoggingBaseClass,
ModelResponse,
ModelResponseStream,
StandardCallbackDynamicParams,
StandardLoggingAdditionalHeaders,
StandardLoggingHiddenParams,
@@ -741,6 +742,7 @@ def _response_cost_calculator(
self,
result: Union[
ModelResponse,
ModelResponseStream,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
@@ -848,6 +850,7 @@ def _success_handler_helper_fn(
): # handle streaming separately
if (
isinstance(result, ModelResponse)
or isinstance(result, ModelResponseStream)
or isinstance(result, EmbeddingResponse)
or isinstance(result, ImageResponse)
or isinstance(result, TranscriptionResponse)
@@ -955,6 +958,7 @@ def success_handler( # noqa: PLR0915
if self.stream and (
isinstance(result, litellm.ModelResponse)
or isinstance(result, TextCompletionResponse)
or isinstance(result, ModelResponseStream)
):
complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
@@ -966,19 +970,13 @@
streaming_chunks=self.sync_streaming_chunks,
is_async=False,
)
_caching_complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
] = None
if complete_streaming_response is not None:
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
_caching_complete_streaming_response = copy.deepcopy(
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
@@ -1474,6 +1472,7 @@ async def async_success_handler( # noqa: PLR0915
] = None
if self.stream is True and (
isinstance(result, litellm.ModelResponse)
or isinstance(result, litellm.ModelResponseStream)
or isinstance(result, TextCompletionResponse)
):
complete_streaming_response: Optional[
8 changes: 6 additions & 2 deletions litellm/litellm_core_utils/logging_utils.py
@@ -2,7 +2,11 @@
from typing import TYPE_CHECKING, Any, List, Optional, Union

from litellm._logging import verbose_logger
from litellm.types.utils import ModelResponse, TextCompletionResponse
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
TextCompletionResponse,
)

if TYPE_CHECKING:
from litellm import ModelResponse as _ModelResponse
@@ -38,7 +42,7 @@ def convert_litellm_response_object_to_str(


def _assemble_complete_response_from_streaming_chunks(
result: Union[ModelResponse, TextCompletionResponse],
result: Union[ModelResponse, TextCompletionResponse, ModelResponseStream],
start_time: datetime,
end_time: datetime,
request_kwargs: dict,
118 changes: 47 additions & 71 deletions litellm/litellm_core_utils/streaming_handler.py
@@ -5,7 +5,7 @@
import traceback
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, List, Optional
from typing import Any, Callable, Dict, List, Optional, cast

import httpx
from pydantic import BaseModel
@@ -611,44 +611,6 @@ def handle_ollama_chat_stream(self, chunk):
except Exception as e:
raise e

def handle_watsonx_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "generated_text" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {
"text": "",
"is_finished": False,
"prompt_tokens": 0,
"completion_tokens": 0,
}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
results = parsed_response.get("results", [])
if len(results) > 0:
text = results[0].get("generated_text", "")
finish_reason = results[0].get("stop_reason")
is_finished = finish_reason != "not_finished"
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": results[0].get("input_token_count", 0),
"completion_tokens": results[0].get("generated_token_count", 0),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e

def handle_triton_stream(self, chunk):
try:
if isinstance(chunk, dict):
@@ -702,9 +664,18 @@ def model_response_creator(
# pop model keyword
chunk.pop("model", None)

model_response = ModelResponse(
stream=True, model=_model, stream_options=self.stream_options, **chunk
)
chunk_dict = {}
for key, value in chunk.items():
if key != "stream":
chunk_dict[key] = value

args = {
"model": _model,
"stream_options": self.stream_options,
**chunk_dict,
}

model_response = ModelResponseStream(**args)
if self.response_id is not None:
model_response.id = self.response_id
else:
Expand Down Expand Up @@ -742,9 +713,9 @@ def is_delta_empty(self, delta: Delta) -> bool:

def return_processed_chunk_logic( # noqa
self,
completion_obj: dict,
completion_obj: Dict[str, Any],
model_response: ModelResponseStream,
response_obj: dict,
response_obj: Dict[str, Any],
):

print_verbose(
@@ -887,11 +858,11 @@

def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: dict = {}
response_obj: Dict[str, Any] = {}

try:
# return this for all models
completion_obj = {"content": ""}
completion_obj: Dict[str, Any] = {"content": ""}
from litellm.types.utils import GenericStreamingChunk as GChunk

if (
@@ -1089,11 +1060,6 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "watsonx":
response_obj = self.handle_watsonx_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "triton":
response_obj = self.handle_triton_stream(chunk)
completion_obj["content"] = response_obj["text"]
@@ -1158,7 +1124,7 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
self.received_finish_reason = response_obj["finish_reason"]
else: # openai / azure chat model
if self.custom_llm_provider == "azure":
if hasattr(chunk, "model"):
if isinstance(chunk, BaseModel) and hasattr(chunk, "model"):
# for azure, we need to pass the model from the orignal chunk
self.model = chunk.model
response_obj = self.handle_openai_chat_completion_chunk(chunk)
Expand Down Expand Up @@ -1190,21 +1156,29 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915

if response_obj["usage"] is not None:
if isinstance(response_obj["usage"], dict):
model_response.usage = litellm.Usage(
prompt_tokens=response_obj["usage"].get(
"prompt_tokens", None
)
or None,
completion_tokens=response_obj["usage"].get(
"completion_tokens", None
)
or None,
total_tokens=response_obj["usage"].get("total_tokens", None)
or None,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=response_obj["usage"].get(
"prompt_tokens", None
)
or None,
completion_tokens=response_obj["usage"].get(
"completion_tokens", None
)
or None,
total_tokens=response_obj["usage"].get(
"total_tokens", None
)
or None,
),
)
elif isinstance(response_obj["usage"], BaseModel):
model_response.usage = litellm.Usage(
**response_obj["usage"].model_dump()
setattr(
model_response,
"usage",
litellm.Usage(**response_obj["usage"].model_dump()),
)

model_response.model = self.model
@@ -1337,7 +1311,7 @@ def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
raise StopIteration
except Exception as e:
traceback.format_exc()
e.message = str(e)
setattr(e, "message", str(e))
raise exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
@@ -1434,7 +1408,9 @@ def __next__(self): # noqa: PLR0915
print_verbose(
f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
)
response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
response: Optional[ModelResponseStream] = self.chunk_creator(
chunk=chunk
)
print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")

if response is None:
@@ -1597,7 +1573,7 @@ async def __anext__(self): # noqa: PLR0915
# __anext__ also calls async_success_handler, which does logging
print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")

processed_chunk: Optional[ModelResponse] = self.chunk_creator(
processed_chunk: Optional[ModelResponseStream] = self.chunk_creator(
chunk=chunk
)
print_verbose(
@@ -1624,7 +1600,7 @@ async def __anext__(self): # noqa: PLR0915
if self.logging_obj._llm_caching_handler is not None:
asyncio.create_task(
self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
processed_chunk=processed_chunk,
processed_chunk=cast(ModelResponse, processed_chunk),
)
)

@@ -1663,8 +1639,8 @@ async def __anext__(self): # noqa: PLR0915
chunk = next(self.completion_stream)
if chunk is not None and chunk != b"":
print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
processed_chunk: Optional[ModelResponse] = self.chunk_creator(
chunk=chunk
processed_chunk: Optional[ModelResponseStream] = (
self.chunk_creator(chunk=chunk)
)
print_verbose(
f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
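A hedged consumer-side view of the streaming change in this file: chunks built by `CustomStreamWrapper.chunk_creator` are now typed as `ModelResponseStream` instead of `ModelResponse`, while keeping the OpenAI-style `choices[0].delta` shape. The model id below is a placeholder.

```python
# Illustrative only; the model id is a placeholder.
import litellm
from litellm.types.utils import ModelResponseStream

stream = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # placeholder model id
    messages=[{"role": "user", "content": "stream a short answer"}],
    stream=True,
)

for chunk in stream:
    if isinstance(chunk, ModelResponseStream):  # the new per-chunk type
        print(chunk.choices[0].delta.content or "", end="")
```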
5 changes: 2 additions & 3 deletions litellm/llms/base.py
@@ -2,7 +2,6 @@
from typing import Any, Optional, Union

import httpx
import requests

import litellm
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
@@ -16,7 +15,7 @@ class BaseLLM:
def process_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
response: httpx.Response,
model_response: ModelResponse,
stream: bool,
logging_obj: Any,
@@ -35,7 +34,7 @@ def process_response(
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
response: httpx.Response,
model_response: TextCompletionResponse,
stream: bool,
logging_obj: Any,
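Since `requests.Response` is replaced by `httpx.Response` in these signatures, a rough sketch of the httpx-only request pattern is shown below. The endpoint, headers, and payload are placeholders and are not taken from litellm's handlers.

```python
# Minimal httpx request/response round trip; all values are placeholders.
import httpx

with httpx.Client(timeout=30.0) as client:
    resp: httpx.Response = client.post(
        "https://api.example-provider.com/v1/generate",  # placeholder endpoint
        headers={"Authorization": "Bearer <api-key>"},
        json={"prompt": "hello"},
    )
    resp.raise_for_status()
    data = resp.json()  # same .json()/.status_code surface as requests.Response
```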
8 changes: 7 additions & 1 deletion litellm/llms/base_llm/chat/transformation.py
@@ -107,7 +107,13 @@ def validate_environment(
) -> dict:
pass

def get_complete_url(self, api_base: str, model: str) -> str:
def get_complete_url(
self,
api_base: str,
model: str,
optional_params: dict,
stream: Optional[bool] = None,
) -> str:
"""
OPTIONAL
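An illustrative (not authoritative) override of the extended `get_complete_url` hook: the new `optional_params` and `stream` arguments let a provider choose its endpoint per request. The paths and the `api_version` parameter below are placeholders.

```python
# Hypothetical provider config; paths and query parameters are placeholders.
from typing import Optional


class MyProviderConfig:
    def get_complete_url(
        self,
        api_base: str,
        model: str,
        optional_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        # Pick a streaming vs. non-streaming path and append an API version
        # pulled from optional_params; both are placeholder conventions.
        base = api_base.rstrip("/")
        path = "/v1/text/generation_stream" if stream else "/v1/text/generation"
        version = optional_params.get("api_version", "2024-03-14")
        return f"{base}{path}?version={version}"
```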