import time
import datetime
import torch
from torch.utils.data import DataLoader, TensorDataset
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split


class Dataset():
    def __init__(self, tokenizer, test_size=0.1, valid_size=0.05, tokenizer_type="KoBertTokenizer", batch_size=256):
        self.tokenizer = tokenizer
        self.test_size = test_size
        self.valid_size = valid_size
        self.tokenizer_type = tokenizer_type
        self.batch_size = batch_size

    def make_dataloder(self, data, data_type="lstm"):
        # Tokenize the raw sentences and wrap (input_ids, label, idx) in a DataLoader.
        sentences = list(data["document"])
        if data_type == "lstm":
            tokens = self.tokenizer.batch_encode_plus(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # tokens = []
            # for i in range(0, len(sentences), batch_size):
            #     tokens.append(self.tokenizer.batch_encode_plus(sentences[i : i + batch_size], return_tensors="pt", padding="max_length", truncation=True, max_length=512))
            # tokens = tokens[0]
        else:
            tokens = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        X = tokens["input_ids"]
        Y = torch.tensor(list(data["label"]))
        idx = torch.tensor(list(data["idx"]))
        dataset = TensorDataset(X, Y, idx)
        return DataLoader(dataset, batch_size=self.batch_size, pin_memory=True)

    def load_data(self, df):
        # Split the DataFrame into train/valid/test sets and build a DataLoader for each.
        print("Start loading data!")
        start = time.time()
        train_data, test_data = train_test_split(df, test_size=self.test_size, random_state=1)
        train_data, valid_data = train_test_split(train_data, test_size=self.valid_size, random_state=1)
        train_iter = self.make_dataloder(train_data)
        test_iter = self.make_dataloder(test_data)
        valid_iter = self.make_dataloder(valid_data)
        end = time.time()
        print("load data time : {}".format(datetime.timedelta(seconds=end - start)))
        return train_iter, test_iter, valid_iter
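

# Usage sketch (illustrative, not part of the original module): the DataFrame is
# assumed to have "document", "label" and "idx" columns, and `tokenizer` is a
# Hugging Face-style tokenizer matching the model being trained.
# dataset = Dataset(tokenizer, test_size=0.1, valid_size=0.05, batch_size=256)
# train_iter, test_iter, valid_iter = dataset.load_data(df)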
import os


def createFolder(directory):
    # Create the directory if it does not already exist.
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: creating directory ' + directory)


def make_tokenizer(start=2000, end=10000, step=1000, data_file='dataset.txt'):
    # Train WordPiece tokenizers with several vocabulary sizes and save each one.
    for vocab_size in range(start, end, step):
        tokenizer = BertWordPieceTokenizer(lowercase=False)
        if vocab_size > 2000:
            limit_alphabet = 1500
        else:
            limit_alphabet = 1000
        min_frequency = 5
        print(vocab_size)
        tokenizer.train(files=data_file,
                        vocab_size=vocab_size,
                        limit_alphabet=limit_alphabet,
                        min_frequency=min_frequency)
        vocab_path = 'tokenizer/vocab_size_{}/'.format(vocab_size)
        createFolder(vocab_path)
        tokenizer.save_model(vocab_path)
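

# Usage sketch (illustrative): 'dataset.txt' is assumed to be a plain-text corpus
# with one sentence per line; each trained vocabulary is written under
# tokenizer/vocab_size_<N>/.
# make_tokenizer(start=2000, end=10000, step=1000, data_file='dataset.txt')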
import pickle


def input_list_positive_or_negative(sentences, model, tokenizer):
    # Run the (teacher) model on a batch of sentences and return its logits as lists.
    tokens = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    with torch.no_grad():  # inference only, no gradients needed
        sequence_output = model(tokens["input_ids"], attention_mask=tokens["attention_mask"])
    return sequence_output[0].cpu().tolist()


def make_teacher_output(df, teacher_path, tokenizer, Classification, batch_size=32):
    # Load the fine-tuned teacher and record its logits for every example, keyed by idx.
    checkpoint = teacher_path + "pytorch_model.bin"
    config_file = teacher_path + "config.json"
    teacher_model = Classification.from_pretrained(checkpoint, config=config_file).to("cuda")
    teacher_model.eval()  # disable dropout for deterministic teacher outputs
    document_list = list(df["document"])
    idx_list = list(df["idx"])
    teacher_output = {}
    n = len(document_list)
    for i in range(0, n, batch_size):
        output_list = input_list_positive_or_negative(document_list[i:i + batch_size], teacher_model, tokenizer)
        now_idx = idx_list[i:i + batch_size]
        for output, idx in zip(output_list, now_idx):
            teacher_output[idx] = output
        if (i // batch_size) % 100 == 0:  # log progress every 100 batches
            print(i, i / n)
    return teacher_output


def get_teacher_output(teacher_path, tokenizer=None, Classification=None, df=None, batch_size=None):
    # Reuse cached teacher outputs if present; otherwise compute and cache them.
    teacher_output_path = teacher_path + "teacher_output.pickle"
    if os.path.isfile(teacher_output_path):
        with open(teacher_output_path, 'rb') as fr:
            teacher_output = pickle.load(fr)
    else:
        teacher_output = make_teacher_output(df, teacher_path, tokenizer, Classification, batch_size)
        with open(teacher_output_path, 'wb') as fw:
            pickle.dump(teacher_output, fw)  # cache so later runs can load from disk
    return teacher_output
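

# Usage sketch (illustrative): `Classification` is assumed to be a Hugging Face
# sequence-classification class such as BertForSequenceClassification, and the
# teacher directory to contain pytorch_model.bin and config.json; the path below
# is a placeholder.
# teacher_output = get_teacher_output("teacher/", tokenizer=tokenizer,
#                                     Classification=BertForSequenceClassification,
#                                     df=df, batch_size=32)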
from prettytable import PrettyTable


def count_parameters(model):
    # Print a per-module table of trainable parameter counts and return the total.
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params