-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_onehot.py
77 lines (57 loc) · 2.37 KB
/
my_onehot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import torch
from sklearn import preprocessing
from text_processing import get_nlp_pipeline, word_tokenization
def get_tokens(text, _pipeline):
    """Tokenize *text* with the NLP pipeline identified by *_pipeline*."""
    nlp = get_nlp_pipeline(_pipeline)
    tokens = word_tokenization(text, nlp, _pipeline)
    return tokens
def init_token2idx(text_list, _pipeline):
    """Build token<->index vocabularies from a corpus of texts.

    Tokenizes every text in *text_list* with the pipeline named by
    *_pipeline*, collects the unique tokens, adds the "UNK" placeholder,
    and assigns indices in sorted-token order.

    Returns:
        (token2idx_dict, idx2token_dict): forward and inverse mappings.
    """
    nlp = get_nlp_pipeline(_pipeline)
    tokens = set()
    for text in text_list:
        tokens.update(word_tokenization(text, nlp, _pipeline))
    # Union (not list concatenation) so "UNK" is not duplicated when it
    # already occurs in the corpus — the original `list(set(...)) + ["UNK"]`
    # would then enumerate "UNK" twice, leaving one index unreachable in
    # token2idx_dict and two idx2token entries for the same token.
    vocab = sorted(tokens | {"UNK"})
    token2idx_dict = {tok: idx for idx, tok in enumerate(vocab)}
    idx2token_dict = {idx: tok for idx, tok in enumerate(vocab)}
    return token2idx_dict, idx2token_dict
def tk2idx(_text, _pipeline, token2idx_dict, unk_ignore):
    """Convert the tokens of *_text* into vocabulary indices.

    Args:
        _text: raw text to tokenize.
        _pipeline: name of the NLP pipeline to use.
        token2idx_dict: token -> index mapping (must contain "UNK").
        unk_ignore: truthy -> out-of-vocabulary tokens map to the "UNK"
            index; falsy -> an out-of-vocabulary token raises KeyError.

    Returns:
        list[int]: one index per token.
    """
    nlp = get_nlp_pipeline(_pipeline)
    tokens = word_tokenization(_text, nlp, _pipeline)
    # Idiomatic truthiness test instead of `== True`; dict.get with a
    # hoisted UNK index replaces the per-token membership-check + lookup.
    if unk_ignore:
        unk_idx = token2idx_dict["UNK"]
        return [token2idx_dict.get(tk, unk_idx) for tk in tokens]
    return [token2idx_dict[tk] for tk in tokens]
def custom_one_hot_encoding(_idx_list, dim):
    """Return one independent one-hot float tensor of length *dim* per index.

    Args:
        _idx_list: iterable of integer indices, each in [0, dim).
        dim: length of each one-hot vector.

    Returns:
        list[torch.Tensor]: one float tensor per input index.
    """
    identity = torch.eye(dim)
    # .clone() so each returned tensor owns its storage, matching the
    # original's independently-allocated zeros+set tensors.
    return [identity[idx].clone() for idx in _idx_list]
def tensor2token(_tensor, idx2token_dict):
    """Decode a one-hot tensor back to its token.

    Expects exactly one element of *_tensor* to equal 1; `.item()` raises
    otherwise, preserving the original single-hot contract.
    """
    hot = torch.nonzero(_tensor == 1, as_tuple=False)
    return idx2token_dict[int(hot.item())]
def build_onehot_encoding_model(unk_ignore):
    """Create a scikit-learn OneHotEncoder.

    Args:
        unk_ignore: truthy -> encoder ignores unseen categories at
            transform time (all-zero row); falsy -> unseen categories
            raise, which is sklearn's default ('error') behavior.

    Returns:
        sklearn.preprocessing.OneHotEncoder: an unfitted encoder.
    """
    # Truthiness instead of `== True`; 'error' is OneHotEncoder's
    # documented default for handle_unknown, so both branches collapse.
    strategy = 'ignore' if unk_ignore else 'error'
    return preprocessing.OneHotEncoder(handle_unknown=strategy)
def get_onehot_encoding(text_list, cur_text, _nlp_pipeline, _unk_igsnore):
    """One-hot encode the tokens of *cur_text* against a vocabulary
    built from *text_list*.

    NOTE(review): the parameter name `_unk_igsnore` looks like a typo for
    `_unk_ignore`; kept unchanged so existing keyword callers still work.

    Returns:
        torch.Tensor: float tensor of shape (num_tokens, vocab_size).
    """
    encoder = build_onehot_encoding_model(_unk_igsnore)
    vocab, _ = init_token2idx(text_list, _nlp_pipeline)
    encoder.fit([[token] for token in vocab])
    current_tokens = [[tok] for tok in get_tokens(cur_text, _nlp_pipeline)]
    encoded = encoder.transform(current_tokens).toarray()
    return torch.tensor(encoded, dtype=torch.float)