forked from ggerganov/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 21
/
falcon_convert.py
206 lines (182 loc) · 8.18 KB
/
falcon_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Based on: https://github.com/KerfuffleV2/ggml-falcon/blob/feat-improve-falcon-convert-hf/examples/falcon/convert-hf-to-ggml.py
# Convert Hugging Face fine-tuned bloom-like models to ggml format
#
# Usage:
#
# python3 models/convert-h5-to-ggml.py
#
# This script is similar to "convert-pt-to-ggml.py"
#
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
from safetensors import safe_open
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
if len(sys.argv) < 3:
print("INFO: GGML V1 files produced are meant to be finalized through examples/falcon_quantize which will bring them to latest version and precision of choice");
print("Usage: python falcon_convert.py model_directory output_directory [use-f32]")
# print(" num_parts: number of pytorch parts, use 0 if not a multipart model. example: 9")
print(" model_directory: name of the directory and model you convert (it should be a subdirectory)")
print(" output-directory: directory where the output file will be written")
print(" use-f32: if present, use float32 instead of float16 (f32 is recommended)")
sys.exit(1)
# num_parts = int(sys.argv[1])
dir_model = sys.argv[1] # name and dir of model
dir_out = sys.argv[2] # output directory
# make sure the output directory exists
os.makedirs(dir_out, exist_ok=True)
safetensors = False
num_parts = 0
for file_name in os.listdir(dir_model):
if "pytorch_model" in file_name and '-' in file_name and '.' in file_name:
num_parts = max(num_parts, int(file_name.split('-')[-1].split('.')[0]))
elif "safetensors" in file_name and '-' in file_name and '.' in file_name:
num_parts = max(num_parts, int(file_name.split('-')[-1].split('.')[0]))
safetensors = True
num_parts = int(num_parts)
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 3:
ftype = 0
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# print(tokenizer)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()
# if n_head not found, use num_attention_heads (make sure no python error key access)
# old: n_head = hparams["n_head"]
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
n_hidden_layers = hparams.get("num_hidden_layers", hparams.get("n_layer"))
n_head_kv = hparams.get("num_kv_heads", hparams.get("n_head_kv", 1))
head_dim = hparams["hidden_size"] // n_head
print("* Loading model from: ", dir_model)
fname_out = dir_out + f"/ggml-model-{dir_model.split('/')[-1]}-{ftype_str[ftype]}.bin"
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex (version 1) - possibly change to ggfc ?
fout.write(struct.pack("i", 1))
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_head_kv))
fout.write(struct.pack("i", n_hidden_layers))
fout.write(struct.pack("i", 40 if "n_head_kv" in hparams else 7)) # obsolete field that breaks ggml compatibility - todo again remove one day
fout.write(struct.pack("i", ftype))
print(f'Vocab size: {hparams["vocab_size"]}')
print(f'Hidden size: {hparams["hidden_size"]}')
print(f'Number of heads: {n_head}')
print(f'Number of layers: {n_hidden_layers}')
print(f'Number of head_kv: {n_head_kv}')
print(f'Number of head_dim: {head_dim}')
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}
for i in range(hparams["vocab_size"]):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
padding_token = f"[PAD{i}]".encode("utf8")
text = bytearray(padding_token)
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", 0.0)) # falcon uses bpe on RefinedWeb - no probability scores used
if num_parts == 0:
partnames= ('pytorch_model.bin',)
else:
if safetensors == True:
partnames = ([f'model-{n:05}-of-{num_parts:05}.safetensors' for n in range(1, num_parts + 1)])
else:
partnames = (f'pytorch_model-{n:05}-of-{num_parts:05}.bin' for n in range(1, num_parts + 1))
for partname in partnames:
filename = f'{dir_model}/{partname}'
if not os.path.isfile(filename):
continue
print(f'\n* Loading part: {partname}')
if safetensors == True:
model = safe_open(filename, framework="pt", device="cpu")
else:
model = torch.load(filename, map_location = 'cpu')
for name in model.keys():
src = name
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
if safetensors == True:
data = model.get_tensor(name)
else:
data = model[src]
old_dtype = data.dtype
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
if "query_key_value" in src:
qkv = data.view(
n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
# model[src] = torch.cat((q,k,v)).reshape_as(model[src])
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
n_dims = len(data.shape)
ftype_cur = 1 if ftype == 1 and n_dims > 1 else 0
data_dtype = data.dtype
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f' |', name, data.shape, '->', data.dtype)
# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str)
# data
data.tofile(fout)
fout.close()
print("Done. Output file: " + fname_out)
print("")