Is your feature request related to a problem? Please describe.
Several models, when they receive a large input, return an error instead of completing with what they have. That disrupts our workflow. We have only one more month to enjoy the amazing Solar Pro for free. Halp!
Describe the solution you'd like
Adding a trim parameter to generation.
Describe alternatives you've considered
Scissors and a knife. Now I have a split keyboard.
Additional context
I was planning to make a humble contribution, but I guess it may never happen.
LiteLLM implements it natively:
https://github.com/BerriAI/litellm/blob/60baa65e0ec4827d27c3b15c14a4f921b1e66121/litellm/utils.py

# LiteLLM token trimmer
# this code is borrowed from https://github.com/KillianLucas/tokentrim/blob/main/tokentrim/tokentrim.py
# Credits for this code go to Killian Lucas
def trim_messages(
    messages,
    model: Optional[str] = None,
    trim_ratio: float = 0.75,
    return_response_tokens: bool = False,
    max_tokens=None,
):
    """
    Trim a list of messages to fit within a model's token limit.

    Args:
        messages: Input messages to be trimmed. Each message is a dictionary with 'role' and 'content'.
        model: The LiteLLM model being used (determines the token limit).
        trim_ratio: Target ratio of tokens to use after trimming. Default is 0.75, meaning it will trim messages so they use about 75% of the model's token limit.
        return_response_tokens: If True, also return the number of tokens left available for the response after trimming.
        max_tokens: Instead of specifying a model or trim_ratio, you can specify this directly.

    Returns:
        Trimmed messages and optionally the number of tokens available for response.
    """
    # Initialize max_tokens
    # if users pass in max tokens, trim to this amount
    messages = copy.deepcopy(messages)
    try:
        if max_tokens is None:
            # Check if model is valid
            if model in litellm.model_cost:
                max_tokens_for_model = litellm.model_cost[model].get(
                    "max_input_tokens", litellm.model_cost[model]["max_tokens"]
                )
                max_tokens = int(max_tokens_for_model * trim_ratio)
            else:
                # if user did not specify max (input) tokens
                # or passed an llm litellm does not know
                # do nothing, just return messages
                return messages
        # ... (excerpt truncated; see the linked litellm/utils.py for the full function)
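For reference, a minimal usage sketch of the trimmer above (the model name and message contents here are just placeholders):

from litellm.utils import trim_messages

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "...a very long prompt..."},
]

# Trim to roughly 75% of the model's input limit before sending the request
trimmed = trim_messages(messages, model="gpt-4o-mini")

# Or enforce an explicit token budget
trimmed = trim_messages(messages, model="gpt-4o-mini", max_tokens=100000)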
I stupidly forgot about it and implemented a half-baked version as follows:
import tiktoken

def adjust_prompt_length(prompt, max_tokens=115000):
    # Get the encoding
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")
    # Count the number of tokens in the prompt
    num_tokens = len(encoding.encode(prompt))
    # If the number of tokens exceeds the maximum, reduce the prompt size
    if num_tokens > max_tokens:
        # "logger" is assumed to be a logging.Logger configured elsewhere
        logger.warning(
            f"Prompt exceeds maximum length of {max_tokens} tokens. Truncating..."
        )
        # Calculate the percentage to keep
        keep_ratio = max_tokens / num_tokens
        # Find the split point (approximate: cuts by characters, not tokens)
        split_point = int(len(prompt) * keep_ratio)
        # Split the prompt
        truncated_content = prompt[:split_point]
        return truncated_content
    return prompt
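For comparison, a token-exact variant of the same idea (just a sketch; the model name is only used to pick the tokenizer): encode once, keep at most max_tokens tokens, and decode back to text instead of cutting by a character ratio.

import tiktoken

def truncate_by_tokens(prompt, max_tokens=115000):
    # Encode once, keep at most max_tokens tokens, and decode back to text
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")
    tokens = encoding.encode(prompt)
    if len(tokens) <= max_tokens:
        return prompt
    return encoding.decode(tokens[:max_tokens])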
When generating multi-turn dialogues, though, there is another issue: some models require strict user/assistant/user/assistant alternation. I implemented that as follows (a much less elegant solution):
# Assumed context elsewhere in the script: import asyncio, httpx; from openai import AsyncOpenAI;
# pick_random_key() and base_url are defined for the Solar endpoint.
async def generate_message(model, messages_dataset):
    print("Starting generate_message function")
    # Create messages_to_llm with correct role structure
    messages_to_llm = []
    for i, msg in enumerate(messages_dataset):
        if i == 0:
            # First message is always system
            messages_to_llm.append({"role": "system", "content": msg["content"]})
        elif i % 2 == 1:
            # Odd-indexed messages are user
            messages_to_llm.append({"role": "user", "content": msg["content"]})
        else:
            # Even-indexed messages are assistant
            messages_to_llm.append({"role": "assistant", "content": msg["content"]})
    print(f"Created messages_to_llm with {len(messages_to_llm)} messages")

    # Ensure the last message is from the user
    # Safeguard in case of an empty messages_to_llm list
    if messages_to_llm and messages_to_llm[-1]["role"] != "user":
        messages_to_llm.append({"role": "user", "content": "Please continue."})
        print("Ensured last message is from user")

    print("Final messages_to_llm: ", model)
    print(messages_to_llm[-1]["content"])

    max_retries = 500
    for attempt in range(max_retries):
        try:
            SOLAR_KEY = await pick_random_key()
            print(SOLAR_KEY)
            client = AsyncOpenAI(api_key=SOLAR_KEY, base_url=base_url)
            response = await asyncio.wait_for(
                client.chat.completions.create(
                    messages=messages_to_llm,
                    model=model,
                    # max_tokens=4000,
                    temperature=0.7,
                    stream=False,
                ),
                timeout=60,
            )
            await asyncio.sleep(5)  # Corrected to await
            if response and response.choices and response.choices[0].message.content:
                message = response.choices[0].message.content.strip()
                print(f"The model is: {model} and the message is: {message}")
                if "<STOP>" in message:
                    message = message.replace("<STOP>", "").strip()
                    return message, True
                else:
                    return message, False
        except httpx.HTTPStatusError as e:  # or requests.HTTPError / openai.APIStatusError, depending on the client
            if e.response.status_code == 400:
                print(f"Bad request error (400) on task. Attempt {attempt + 1}.")
                try:
                    # Only cut if there are more than 4 messages in total
                    if len(messages_to_llm) > 4:
                        print("Message too long, cutting user-assistant pair...")
                        # Remove second (user) and third (assistant) messages
                        del messages_to_llm[1:3]  # Removes elements at index 1 and 2
                        print(f"New length of messages_to_llm: {len(messages_to_llm)}")
                    else:
                        print("Message length is acceptable. No need to cut.")
                        break  # Exit the loop as message size is small enough
                except IndexError as err:
                    print(f"Index error occurred: {err}")
                    break  # Safely exit if there's an index issue
                await asyncio.sleep(5)
                continue  # Retry after cutting
            else:
                # Handle other HTTP errors but do not cut messages
                print(f"HTTP error {e.response.status_code} occurred, retrying...")
                await asyncio.sleep(5)
                continue
        except asyncio.TimeoutError:
            print(f"Timeout occurred. Retrying... (Attempt {attempt + 1})")
            await asyncio.sleep(5)  # Corrected to await
            continue

    print("Exiting generate_message function")
    individual_message = "INTERRUPTED DUE TO REPEATED ERRORS"
    await asyncio.sleep(5)
    return individual_message, True
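A minimal sketch of how the function above can be driven (the dataset contents and model name are placeholders):

import asyncio

async def main():
    # Turns must go system, user, assistant, user, ... with the last turn from the user
    messages_dataset = [
        {"content": "You are a helpful assistant."},   # system
        {"content": "First user turn"},                # user
        {"content": "First assistant turn"},           # assistant
        {"content": "Second user turn"},               # user
    ]
    message, stopped = await generate_message("solar-pro", messages_dataset)
    print(stopped, message)

asyncio.run(main())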