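"""take_test.py

Run a multiple-choice trivia quiz against either the OpenAI completion API
(GPT-3 / GPT-3.5) or a local 4-bit LLaMA model (plain, Alpaca-LoRA, or
GPT4All-LoRA), grade each answer, and write a score report to a text file.

Example invocations (flags as defined by the argument parser in main();
file paths are illustrative placeholders):

    python take_test.py --trivia questions.json
    python take_test.py --use-gpt3-5 --openai-key <KEY> --trivia questions.json
"""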
import argparse
import json
import time

import openai  # requires the pre-1.0 openai package (openai.Completion / openai.error)
import torch
from transformers import GenerationConfig

from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from peft import PeftModel
from peft.tuners.lora import Linear4bitLt

def load_trivia_questions(file_path):
    with open(file_path, 'r') as file:
        trivia_data = json.load(file)
    return trivia_data
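
# Expected shape of the trivia file, inferred from how generate_question_string()
# and grade_answers() read the data (illustrative example, not a bundled file):
#
# [
#   {
#     "question": "What is the capital of France?",
#     "answers": [
#       {"choice": "A", "text": "Paris", "correct": true},
#       {"choice": "B", "text": "Lyon", "correct": false}
#     ]
#   }
# ]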

def generate_question_string(question_data):
    question = question_data['question']
    # One " <choice>. <text>" line per answer, with no trailing newline after the last one.
    choices = [
        f" {answer['choice']}. {answer['text']}\n" if answer != question_data['answers'][-1]
        else f" {answer['choice']}. {answer['text']}"
        for answer in question_data['answers']
    ]
    return f"{question}\n{''.join(choices)}"

def grade_answers(question_data, llm_answer):
    correct_answer = None
    for answer in question_data['answers']:
        if answer['correct']:
            correct_answer = answer
            break

    if correct_answer is None:
        return "No correct answer found"

    normalized_llm_answer = llm_answer.lower().strip()
    normalized_correct_answer = correct_answer['text'].lower().strip()

    # Lower-cased full answer text appears somewhere in the LLM's answer
    if normalized_correct_answer in normalized_llm_answer:
        return f"{correct_answer['choice']}. {correct_answer['text']} (correct)"

    # Upper-case " A." or " B." or " C." or " D." or " E.", for instance
    if f" {correct_answer['choice']}." in llm_answer:
        return f"{correct_answer['choice']}. {correct_answer['text']} (correct)"

    # Upper-case "(A)" or "(B)" or "(C)" or "(D)" or "(E)", for instance
    if f"({correct_answer['choice']})" in llm_answer:
        return f"{correct_answer['choice']}. {correct_answer['text']} (correct)"

    # Treat an explicit "I don't know" (or a bare "d"/"d." choice) as uncertain
    if "i don't know" in normalized_llm_answer or normalized_llm_answer == "d" or normalized_llm_answer == "d.":
        return f"{llm_answer} (uncertain)"

    return f"{llm_answer} (incorrect {correct_answer['choice']}.)"

def query_openai_gpt(prompt, engine):
    while True:
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                max_tokens=50,
                temperature=0.1,
            )
            return response.choices[0].text.strip()
        except openai.error.RateLimitError:
            print("Rate limit exceeded. Pausing for one minute...")
            time.sleep(60)
            continue
        except Exception as e:
            print(f"Error: {e}")
            break

# Module-level device selection: runs at import time and prompts for a GPU
# index when more than one device is available.
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Found {device_count} GPU(s) available.")
    device_index = 0
    if device_count > 1:
        device_index = int(input(f"Select device index (0-{device_count-1}): "))
    device = f"cuda:{device_index}"
    print(f"Using device: {device}")
else:
    device = "cpu"
    print("No GPU available, using CPU.")

def query_model(
    prompt,
    model,
    tokenizer,
    temperature=0.1,
    max_new_tokens=50,
    **kwargs,
):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=0.75,
        top_k=40,
        num_beams=2,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    # Keep only the text between "### Response:" and any follow-on "### Instruction:"
    response = output.split("### Response:")[1].strip()
    return response.split("### Instruction:")[0].strip()

def main():
    parser = argparse.ArgumentParser(description='Run trivia quiz with GPT-3 or a local model.')
    parser.add_argument('--use-gpt3', action='store_true', help='Use GPT-3')
    parser.add_argument('--use-gpt3-5', action='store_true', help='Use GPT-3.5')
    parser.add_argument('--use-gpt4all', action='store_true', help='Use GPT4All')
    parser.add_argument('--use-llama', action='store_true', help='Use Llama')
    parser.add_argument('--openai-key', type=str, help='OpenAI API key')
    parser.add_argument('--trivia', type=str, help='File path to trivia questions')
    args = parser.parse_args()

    use_gpt_3 = args.use_gpt3 or args.use_gpt3_5
    use_gpt4all = args.use_gpt4all

    if use_gpt_3 and use_gpt4all:
        print("Can't use both GPT-3 and GPT4All at the same time.")
        return

    if use_gpt_3 and not args.openai_key:
        print("Please provide an OpenAI API key with the --openai-key argument.")
        return

    if use_gpt_3:
        openai.api_key = args.openai_key

    if not use_gpt_3:
        # Local inference: load the 4-bit LLaMA base, then optionally apply a LoRA.
        if use_gpt4all:
            config_path = './models/llama-7b-hf/'
            model_path = './weights/llama-7b-4bit.pt'
            lora_path = './loras/gpt4all-lora/'
        else:
            config_path = './models/llama-7b-hf/'
            model_path = './weights/llama-7b-4bit.pt'
            lora_path = './loras/alpaca7B-lora/'

        model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)
        if not args.use_llama:
            model = PeftModel.from_pretrained(model, lora_path)

        print('Fitting 4bit scales and zeros to half')
        for n, m in model.named_modules():
            if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
                m.zeros = m.zeros.half()
                m.scales = m.scales.half()
                m.bias = m.bias.half()

    file_path = args.trivia
    trivia_data = load_trivia_questions(file_path)

    total_score = 0
    incorrect = []
    unknown = []

    if args.use_gpt3_5:
        model_name = "text-davinci-003"
    elif use_gpt_3:
        model_name = "text-davinci-002"
    elif args.use_llama:
        model_name = "llama-4bit"
    elif args.use_gpt4all:
        model_name = "gpt4all-4bit"
    else:
        model_name = "alpaca-lora-4bit"

    # Scoring: 2 points for a correct answer, 1 point for an uncertain
    # ("I don't know") answer, 0 for an incorrect one.
    for i, question_data in enumerate(trivia_data):
        question_string = generate_question_string(question_data)
        prompt = generate_prompt(question_string)
        print(f"Question {i+1}: {question_string}")

        if use_gpt_3:
            llm_answer = query_openai_gpt(prompt, model_name)
        else:
            llm_answer = query_model(prompt, model, tokenizer)

        answer_output = grade_answers(question_data, llm_answer)
        print(f"Answer: {answer_output}\n")

        if "(correct)" in answer_output:
            total_score += 2
        elif "(incorrect" in answer_output:
            incorrect.append((i+1, question_string, answer_output))
        else:
            total_score += 1
            unknown.append((i+1, question_string, answer_output))

    with open(f"test_results_{file_path}_{model_name}.txt", 'w') as f:
        f.write(f"Total score: {total_score} of {len(trivia_data) * 2}\n")
        i = len(incorrect)
        u = len(unknown)
        f.write(f"Correct: {len(trivia_data) - i - u}\n")

        if i:
            f.write(f"\nIncorrect: {i}\n")
            for question_num, question_string, answer_output in incorrect:
                f.write(f"Question {question_num}: {question_string.strip()}\n{answer_output.strip()}\n\n")

        if u:
            f.write(f"Unknown: {u}\n")
            for question_num, question_string, answer_output in unknown:
                f.write(f"Question {question_num}: {question_string.strip()}\n{answer_output.strip()}\n\n")

    print(f"Total score: {total_score} of {len(trivia_data) * 2}\n", end='')
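
# Note: the Alpaca-style template below and query_model() above must agree on
# the "### Instruction:" / "### Response:" markers, since query_model() uses
# them to slice the model's reply out of the decoded output.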
def generate_prompt(instruction):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. Only answer the question. Keep token limit low.

### Instruction:
{instruction}

### Response:\n
"""

if __name__ == '__main__':
    main()