Commit 0b53b8b

llama : add API for token type

ggml-ci

ggerganov committed Aug 21, 2023
1 parent 8d177ed commit 0b53b8b
Showing 6 changed files with 115 additions and 116 deletions.
33 changes: 18 additions & 15 deletions convert.py
@@ -241,17 +241,19 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            added_tokens = {}
+
        vocab_size: int = len(self.bpe_tokenizer)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
        self.fname_added_tokens   = fname_added_tokens

    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
        tokenizer = self.bpe_tokenizer
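For context, the ID check in the hunk above enforces that entries in added_tokens.json extend the base vocabulary contiguously. A minimal sketch of the invariant, with hypothetical values (a 32000-token base vocabulary and a two-entry added_tokens.json; neither is from the commit):

    # Hypothetical values for illustration; not part of the commit.
    vocab_size = 32000
    added_tokens = {"<pad>": 32000, "<sep>": 32001}   # contents of added_tokens.json

    expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
    actual_ids   = sorted(added_tokens.values())
    assert expected_ids == actual_ids == [32000, 32001]   # sequential, starting right after the base vocab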
@@ -261,12 +263,12 @@ def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for i, item in enumerate(tokenizer):
            text: bytes = item.encode("utf-8")
            score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for text in self.added_tokens_list:
            score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        yield from self.bpe_tokens()
@@ -304,27 +306,27 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)

-            toktype = 1 # default to normal token type
+            toktype = gguf.TokenType.NORMAL
            if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

            if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
            if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for text in self.added_tokens_list:
            score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        yield from self.sentencepiece_tokens()
@@ -725,6 +727,7 @@ def __init__(self, fname_out: Path) -> None:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

    def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name                ("llama")
        self.gguf.add_context_length      (params.n_ctx)
        self.gguf.add_embedding_length    (params.n_embd)
        self.gguf.add_block_count         (params.n_layer)
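With this commit, every vocab class in convert.py yields (text, score, toktype) triples instead of (text, score) pairs. A minimal sketch of the consuming side, assuming the GGUFWriter exposes add_token_list, add_token_scores and add_token_types (method names assumed here; they are not shown in this excerpt):

    import gguf

    def write_vocab_sketch(writer: "gguf.GGUFWriter", vocab) -> None:
        tokens, scores, toktypes = [], [], []
        # every vocab class above now yields (bytes, float, gguf.TokenType) triples
        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        writer.add_token_list(tokens)     # assumed writer method
        writer.add_token_scores(scores)   # assumed writer method
        writer.add_token_types(toktypes)  # assumed; IntEnum members serialize as plain ints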
44 changes: 20 additions & 24 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
@@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab->id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            float score = scores[i];
-            vocab->id_to_token[i].tok = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
        }
        llama_free(lctx);
        llama_free_model(lmodel);
    } else { // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
        llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
        }
    }
}
@@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
    // uint32_t n_vocab = model->hparams.n_vocab;
    // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_score = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_score.tok.size());
-    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    //     const auto & token_data = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_data.tok.size());
+    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+    //     file.write_raw(&token_data.score, sizeof(token_data.score));
    // }
    //
    // // stuff AK weights into GG weights one by one.
33 changes: 14 additions & 19 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
@@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    // // write_vocab
    // uint32_t n_vocab = model->hparams.n_vocab;
    // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_score = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_score.tok.size());
-    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    //     const auto & token_data = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_data.tok.size());
+    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+    //     file.write_raw(&token_data.score, sizeof(token_data.score));
    // }
    // // write tensors
    // write_tensor(&file, model->tok_embeddings);
@@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {

    struct llama_vocab vocab;
    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            float score = scores[i];
-            vocab.id_to_token[i].tok = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }

10 changes: 10 additions & 0 deletions gguf.py
@@ -61,6 +61,7 @@
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV    = "tokenizer.rwkv.world"

+
#
# recommended mapping of model tensor names for storage in gguf
#
@@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:

    return tensor_map

+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
#
# implementation
#
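Because TokenType derives from IntEnum, its members behave as the plain integers that end up in the GGUF metadata. Note that 0 is deliberately left unassigned; it presumably corresponds to the undefined type (LLAMA_TOKEN_TYPE_UNDEFINED) that the C++ converters above use as a default. A small usage sketch:

    from gguf import TokenType

    assert TokenType.USER_DEFINED == 4   # members compare equal to raw ints
    assert int(TokenType.BYTE) == 6

    # recover the symbolic name from a raw type value read back from a file
    print(TokenType(3).name)   # -> CONTROL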
Diffs for the remaining 2 changed files are not shown.