From fae8faa135942918a7c9ecc8c2fc26be7f61140d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 23 Aug 2023 22:56:50 +0300
Subject: [PATCH] perplexity : add log for start of tokenization

---
 examples/perplexity/perplexity.cpp | 4 ++++
 llama.cpp                          | 1 +
 2 files changed, 5 insertions(+)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9a39529d50875..a7bd9db2a3fd3 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -41,6 +41,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
     const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
     const bool add_bos = is_spm;
 
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
     auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     const int calc_chunk = params.n_ctx;
@@ -152,6 +154,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
     const bool add_bos = is_spm;
 
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
     auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     const int n_chunk_max = tokens.size() / params.n_ctx;
diff --git a/llama.cpp b/llama.cpp
index bdf2437015b52..f2dc4da1db344 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3321,6 +3321,7 @@ struct llm_tokenizer_bpe {
     }
 
     // probably not 100% correct
+    // TODO: this is quite slow - how to make it more efficient?
     static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
         std::vector<std::string> words;
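
For reference, a minimal standalone sketch (not part of the patch) of the logging idiom used above: __func__ is a standard predefined identifier (C99/C++11) that expands to the name of the enclosing function, so the identical fprintf line reports as "perplexity_v2: ..." or "perplexity: ..." depending on which function is running. The tokenize_stage function below is hypothetical, for illustration only.

#include <cstdio>

// Hypothetical stand-in for perplexity()/perplexity_v2(); here
// __func__ expands to "tokenize_stage".
static void tokenize_stage() {
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
}

int main() {
    tokenize_stage(); // prints: tokenize_stage: tokenizing the input ..
    return 0;
}

With the patch applied, the new message appears on stderr just before ::llama_tokenize is called, so the otherwise silent tokenization of a large prompt is visible to the user.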