fix the first token to pass the correct logits and input id to next token chooser

Signed-off-by: Wang, Yi A <[email protected]>
huggingface · Sep 17, 2023 · a27b2a1 · a27b2a1
1 parent 8d1a432
commit a27b2a1
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 12 deletions.
diff --git a/text-generation-inference/server/text_generation_server/models/causal_lm.py b/text-generation-inference/server/text_generation_server/models/causal_lm.py
@@ -653,7 +653,12 @@ def generate_token(self, batch: CausalLMBatch) -> Tuple[List[Generation], Option
             top_token_logprobs,
         ) in enumerate(iterator):
             # Select next token
-            next_token_id, logprobs = next_token_chooser(all_input_ids.view(1, -1), logits[-1:, :])
+            if self.is_optimized_for_gaudi and logits.shape[-2] > 1:
+                next_token_id, logprobs = next_token_chooser(
+                    all_input_ids[0:input_length].view(1, -1), logits[input_length - 1 : input_length, :]
+                )
+            else:
+                next_token_id, logprobs = next_token_chooser(all_input_ids[0:input_length].view(1, -1), logits[-1:, :])
 
             # Append next token to all tokens
             if self.is_optimized_for_gaudi:

diff --git a/text-generation-inference/server/text_generation_server/utils/logits_process.py b/text-generation-inference/server/text_generation_server/utils/logits_process.py
@@ -43,8 +43,7 @@ def __init__(
         self.static_next_logprob = None
 
     def __call__(self, scores):
-        # use hpu graph only for > 1st token
-        if self.hpu_graph is None and scores.shape[-2] == 1:
+        if self.hpu_graph is None:
             self.static_scores = scores.clone().contiguous()
             self.static_warped_scores = scores.clone().contiguous()
             self.static_next_logprob = scores.clone().contiguous()
@@ -59,16 +58,14 @@ def __call__(self, scores):
                 # Compute logprobs
                 self.static_next_logprob.copy_(torch.log_softmax(self.static_warped_scores, -1))
 
-        if scores.shape[-2] == 1:
-            self.static_scores.copy_(scores)
-            self.hpu_graph.replay()
+        self.static_scores.copy_(scores)
+        self.hpu_graph.replay()
 
-            return self.static_warped_scores, self.static_next_logprob
-        else:
-            # 1st token disposal
-            for warper in self.warpers:
-                scores = warper(None, scores)
-            return scores, torch.log_softmax(scores, -1)
+        return self.static_warped_scores, self.static_next_logprob
+        # cpu branch
+        for warper in self.warpers:
+            scores = warper(None, scores)
+        return scores, torch.log_softmax(scores, -1)
 
 
 @lru_cache(10)