Skip to content

Commit

Permalink
perplexity : add log for start of tokenization
Browse files (browse the repository at this point in the history)
  • Loading branch information
ggerganov committed Aug 23, 2023
1 parent 630d8b4 commit fae8faa
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 0 deletions.
4 changes: 4 additions & 0 deletions examples/perplexity/perplexity.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -41,6 +41,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;

fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

const int calc_chunk = params.n_ctx;
Expand Down Expand Up @@ -152,6 +154,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;

fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

const int n_chunk_max = tokens.size() / params.n_ctx;
Expand Down
1 change: 1 addition & 0 deletions llama.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -3321,6 +3321,7 @@ struct llm_tokenizer_bpe {
}

// probably not 100% correct
// TODO: this is quite slow - how to make it more efficient?
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
std::vector<std::string> words;

Expand Down

0 comments on commit fae8faa

Please sign in to comment.