# Fix token budgeting (#824)
### Motivation and Context
OpenAI inserts a message under the hood that we don't account for, which throws off our token budgeting. As a result, we sometimes send too many tokens when we approach a model's token limit, causing request errors.

### Description
Account for the message OpenAI inserts into our requests when computing the remaining token budget.
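
For illustration, here is a rough sketch of the adjusted budget math. All numbers below are hypothetical except the ~20-token constant for the hidden system message:

```csharp
// Hypothetical numbers for illustration only.
const int completionTokenLimit = 4096;   // model context window (assumed)
const int responseTokenLimit = 1024;     // reserved for the model's reply (assumed)
const int extraOpenAiMessageTokens = 20; // hidden "Assistant is a large language model." message

int contextMessagesTokens = 2900;        // measured prompt + chat history cost (assumed)
int userMessageTokens = 150;             // measured current user message cost (assumed)

// Before this fix, the hidden message was not subtracted, so the budget came
// out ~20 tokens too high and could overflow the model's limit.
int remainingBudget = completionTokenLimit
    - extraOpenAiMessageTokens
    - contextMessagesTokens
    - userMessageTokens
    - responseTokenLimit; // 4096 - 20 - 2900 - 150 - 1024 = 2 (22 before the fix)

Console.WriteLine(remainingBudget);
```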

### Contribution Checklist
- [ ] The code builds clean without any errors or warnings
- [ ] The PR follows the [Contribution
Guidelines](https://github.com/microsoft/chat-copilot/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/chat-copilot/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [ ] All unit tests pass, and I have added new tests where possible
- [ ] I didn't break anyone 😄
glahaye authored Mar 1, 2024
1 parent 4559fce commit e018f90
Showing 2 changed files with 19 additions and 13 deletions.
27 changes: 17 additions & 10 deletions webapi/Plugins/Chat/ChatPlugin.cs
@@ -241,16 +241,15 @@ private async Task<string> GetAllowedChatHistoryAsync(
}

var promptRole = chatMessage.AuthorRole == CopilotChatMessage.AuthorRoles.Bot ? AuthorRole.System : AuthorRole.User;
var tokenCount = chatHistory is not null ? TokenUtils.GetContextMessageTokenCount(promptRole, formattedMessage) : TokenUtils.TokenCount(formattedMessage);
int tokenCount = chatHistory is not null ? TokenUtils.GetContextMessageTokenCount(promptRole, formattedMessage) : TokenUtils.TokenCount(formattedMessage);

if (remainingToken - tokenCount >= 0)
{
historyText = $"{formattedMessage}\n{historyText}";
if (chatMessage.AuthorRole == CopilotChatMessage.AuthorRoles.Bot)
{
// Message doesn't have to be formatted for bot. This helps with asserting a natural language response from the LLM (no date or author preamble).
var botMessage = chatMessage.Content;
allottedChatHistory.AddAssistantMessage(botMessage.Trim());
allottedChatHistory.AddAssistantMessage(chatMessage.Content.Trim());
}
else
{
@@ -330,7 +329,6 @@ private async Task<CopilotChatMessage> GetChatResponseAsync(string chatId, strin
// Render system instruction components and create the meta-prompt template
var systemInstructions = await AsyncUtils.SafeInvokeAsync(
() => this.RenderSystemInstructions(chatId, chatContext, cancellationToken), nameof(RenderSystemInstructions));
var chatCompletion = this._kernel.GetRequiredService<IChatCompletionService>();
ChatHistory chatHistory = new(systemInstructions);

// Bypass audience extraction if Auth is disabled
@@ -351,7 +349,6 @@ private async Task<CopilotChatMessage> GetChatResponseAsync(string chatId, strin
chatHistory.AddSystemMessage(userIntent);

// Calculate the remaining token budget.
await this.UpdateBotResponseStatusOnClientAsync(chatId, "Calculating remaining token budget", cancellationToken);
var remainingTokenBudget = this.GetChatContextTokenLimit(chatHistory, userMessage.ToFormattedString());

// Query relevant semantic and document memories
@@ -397,7 +394,8 @@ private async Task<string> RenderSystemInstructions(string chatId, KernelArgumen
}

/// <summary>
/// Helper function to handle final steps of bot response generation, including streaming to client, generating semantic text memory, calculating final token usages, and saving to chat history.
/// Helper function to handle final steps of bot response generation, including streaming to client,
/// generating semantic text memory, calculating final token usages, and saving to chat history.
/// </summary>
/// <param name="chatId">The chat ID</param>
/// <param name="userId">The user ID</param>
@@ -461,12 +459,14 @@ await AsyncUtils.SafeInvokeAsync(
}

/// <summary>
/// Helper function that creates the correct context variables to
/// extract the audience from a conversation history.
/// Extract the list of participants from the conversation history.
/// Note that only those who have spoken will be included.
/// </summary>
/// <param name="context">Kernel context variables.</param>
/// <param name="cancellationToken">The cancellation token.</param>
private async Task<string> GetAudienceAsync(KernelArguments context, CancellationToken cancellationToken)
{
// Clone the context to avoid modifying the original context variables
KernelArguments audienceContext = new(context);
var audience = await this.ExtractAudienceAsync(audienceContext, cancellationToken);

@@ -481,12 +481,13 @@ private async Task<string> GetAudienceAsync(KernelArguments context, Cancellatio
}

/// <summary>
/// Helper function that creates the correct context variables to
/// extract the user intent from the conversation history.
/// Extract user intent from the conversation history.
/// </summary>
/// <param name="context">Kernel context.</param>
/// <param name="cancellationToken">The cancellation token.</param>
private async Task<string> GetUserIntentAsync(KernelArguments context, CancellationToken cancellationToken)
{
// Clone the context to avoid modifying the original context variables
KernelArguments intentContext = new(context);
string userIntent = await this.ExtractUserIntentAsync(intentContext, cancellationToken);

@@ -636,7 +637,13 @@ private OpenAIPromptExecutionSettings CreateIntentCompletionSettings()
/// <returns>The remaining token limit.</returns>
private int GetChatContextTokenLimit(ChatHistory promptTemplate, string userInput = "")
{
// OpenAI inserts a message under the hood:
// "content": "Assistant is a large language model.","role": "system"
// This burns just under 20 tokens which need to be accounted for.
const int ExtraOpenAiMessageTokens = 20;

return this._promptOptions.CompletionTokenLimit
- ExtraOpenAiMessageTokens
- TokenUtils.GetContextMessagesTokenCount(promptTemplate)
- TokenUtils.GetContextMessageTokenCount(AuthorRole.User, userInput) // User message has to be included in chat history allowance
- this._promptOptions.ResponseTokenLimit;
5 changes: 2 additions & 3 deletions webapi/Plugins/Utils/TokenUtils.cs
@@ -121,14 +121,13 @@ internal static int TokenCount(string text)
/// <summary>
/// Rough token costing of ChatHistory's message object.
/// Follows the syntax defined by Azure OpenAI's ChatMessage object: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#chatmessage
/// e.g., "message": {"role":"assistant","content":"Yes }
/// e.g., "message": {"role":"assistant","content":"Yes" }
/// </summary>
/// <param name="authorRole">Author role of the message.</param>
/// <param name="content">Content of the message.</param>
internal static int GetContextMessageTokenCount(AuthorRole authorRole, string? content)
{
var tokenCount = authorRole == AuthorRole.System ? TokenCount("\n") : 0;
return tokenCount + TokenCount($"role:{authorRole.Label}") + TokenCount($"content:{content}");
return TokenCount($"role:{authorRole.Label}") + TokenCount($"content:{content}\n");
}

/// <summary>
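
As a usage sketch (the call site and message string are hypothetical, and `TokenUtils` is the webapi project's own internal helper, so a call like this would have to live in the same assembly), the per-message estimate is the token count of the serialized role plus the content with its trailing newline:

```csharp
using CopilotChat.WebApi.Plugins.Utils;        // namespace assumed from the file path
using Microsoft.SemanticKernel.ChatCompletion; // AuthorRole

// Rough cost of one chat message, mirroring the updated formula:
// TokenCount($"role:{role.Label}") + TokenCount($"content:{content}\n")
int cost = TokenUtils.GetContextMessageTokenCount(
    AuthorRole.User,
    "What's the weather like today?"); // hypothetical content

Console.WriteLine($"Estimated message cost: {cost} tokens");
```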
