# generate.py (forked from Lightning-AI/lit-llama)
import sys
import time
from pathlib import Path
from typing import Optional

import lightning as L
import torch

from lit_llama import LLaMA, Tokenizer, as_8_bit_quantized


@torch.no_grad()
def generate(
    model: torch.nn.Module,
    idx: torch.Tensor,
    max_new_tokens: int,
    max_seq_length: int,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use.
        idx: Tensor of shape (B, T) with indices of the prompt sequence.
        max_new_tokens: The number of new tokens to generate.
        max_seq_length: The maximum sequence length allowed.
        temperature: Scales the predicted logits by 1 / temperature.
        top_k: If specified, only sample among the tokens with the k highest probabilities.
    """
    # create an empty tensor of the expected final shape and fill in the current tokens
    B, T = idx.shape
    T_new = T + max_new_tokens
    empty = torch.empty(B, T_new, dtype=idx.dtype, device=idx.device)
    empty[:, :T] = idx
    idx = empty

    # generate max_new_tokens tokens
    for t in range(T, T_new):
        # ignore the not-filled-yet tokens
        idx_cond = idx[:, :t]
        # if the sequence context is growing too long we must crop it at max_seq_length
        idx_cond = idx_cond if t <= max_seq_length else idx_cond[:, -max_seq_length:]
        # forward
        logits = model(idx_cond)
        logits = logits[:, -1] / temperature

        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float("Inf")

        probs = torch.nn.functional.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)

        # concatenate the new column
        idx[:, t:] = idx_next

    return idx
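
# Illustration only (not part of the original script): what the temperature /
# top-k filtering inside generate() does on a toy batch of logits. The tensor
# values below are made up for the example.
#
#   logits = torch.tensor([[4.0, 2.0, 1.0, -1.0]]) / 0.8       # temperature = 0.8 sharpens the distribution
#   v, _ = torch.topk(logits, k=2)                              # the two largest logits per row
#   logits[logits < v[:, [-1]]] = -float("Inf")                 # mask everything below the k-th largest value
#   probs = torch.nn.functional.softmax(logits, dim=-1)         # probability mass only on the top-2 tokens
#   idx_next = torch.multinomial(probs, num_samples=1)          # sampled token index, shape (1, 1)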


def main(
    prompt: str = "Hello, my name is",
    *,
    num_samples: int = 1,
    max_new_tokens: int = 50,
    top_k: int = 200,
    temperature: float = 0.8,
    # compilation fails as it does not support torch.complex64 for RoPE
    # compile: bool = False,
    accelerator: str = "auto",
    checkpoint_path: Optional[Path] = None,
    tokenizer_path: Optional[Path] = None,
    model_size: str = "7B",
    quantize: bool = False,
) -> None:
"""Generates text samples based on a pre-trained LLaMA model and tokenizer.
Args:
prompt: The prompt string to use for generating the samples.
num_samples: The number of text samples to generate.
max_new_tokens: The number of generation steps to take.
top_k: The number of top most probable tokens to consider in the sampling process.
temperature: A value controlling the randomness of the sampling process. Higher values result in more random
samples.
# compile: Whether to compile the model.
accelerator: The hardware to run on. Possible choices are:
``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``.
checkpoint_path: The checkpoint path to load.
tokenizer_path: The tokenizer path to load.
quantize: Whether to quantize the model using the `LLM.int8()` method
"""
    if not checkpoint_path:
        checkpoint_path = Path(f"./checkpoints/lit-llama/{model_size}/state_dict.pth")
    if not tokenizer_path:
        tokenizer_path = Path("./checkpoints/lit-llama/tokenizer.model")
    assert checkpoint_path.is_file()
    assert tokenizer_path.is_file()
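    # With the defaults above, the files are expected to live under ./checkpoints/lit-llama/, e.g.:
    #   ./checkpoints/lit-llama/tokenizer.model
    #   ./checkpoints/lit-llama/7B/state_dict.pth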
    fabric = L.Fabric(accelerator=accelerator, devices=1)

    with as_8_bit_quantized(fabric.device, enabled=quantize):
        print("Loading model ...", file=sys.stderr)
        t0 = time.time()
        model = LLaMA.from_name(model_size)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint)
        print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr)

    model.eval()

    # if compile:
    #     model = torch.compile(model)

    model = fabric.setup_module(model)
    tokenizer = Tokenizer(tokenizer_path)
    encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False, device=fabric.device)
    encoded_prompt = encoded_prompt[None, :]  # add batch dimension

    L.seed_everything(1234)
    t0 = time.perf_counter()

    for _ in range(num_samples):
        y = generate(
            model,
            encoded_prompt,
            max_new_tokens,
            model.config.block_size,  # type: ignore[union-attr,arg-type]
            temperature=temperature,
            top_k=top_k,
        )[0]  # unpack batch dimension
        print(tokenizer.decode(y))

    t = time.perf_counter() - t0
    print(f"\n\nTime for inference: {t:.02f} sec total, {num_samples * max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")

    CLI(main)
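
# Example invocation (a sketch; jsonargparse's CLI maps the keyword arguments of
# ``main`` to command-line flags, so the exact flags follow the signature above):
#
#   python generate.py --prompt "Hello, my name is" --num_samples 2 --quantize true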