diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index ba12da87..6caec8f7 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -1524,14 +1524,14 @@ bool LlamaServerContext::UpdateSlots() {
       }
 
       // entire prompt has been processed - start decoding new tokens
-      if (slot.n_past == slot.num_prompt_tokens) {
+      if (has_images || slot.n_past == slot.num_prompt_tokens) {
         slot.state = SlotState::kProcessing;
         slot.command = SlotCommand::kNone;
 
-        GGML_ASSERT(batch.n_tokens > 0);
-
         // extract the logits only for the last token
-        batch.logits[batch.n_tokens - 1] = true;
+        if (batch.n_tokens > 0) {
+          batch.logits[batch.n_tokens - 1] = true;
+        }
 
         slot.n_decoded = 0;
         slot.i_batch = batch.n_tokens - 1;