-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
87 lines (75 loc) · 2.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pathlib
import pickle
import torch
import torch.functional as F
from pathlib import Path
from typing import List, Dict, Union, Any
from cfg import RetrieverCFG
# Fastest cosine similarity calc at the moment.
# %%timeit -n 10
#similarity_calc(question_embedd, torch.rand((15000, 1024)))
#114 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
def similarity_calc(embeda: torch.Tensor, embedb: torch.Tensor, normalize: bool = False) -> torch.Tensor:
"""
If normalize set to True, then torch.functional.normalize is used
"""
assert embeda.shape[-1] == embedb.shape[-1], 'Embeddings does not have equal shape'
if normalize:
embeda = F.normalize(embeda)
embedb = F.normalize(embedb)
return embeda @ embedb.T
def get_most_similar(similarities: torch.Tensor) -> torch.Tensor:
"""
Return argmax index from tensor of similarities
"""
return torch.argmax(similarities, dim = 1)
def path_exists(path: Union[str, pathlib.Path]) -> bool:
if isinstance(path, str):
path = pathlib.Path(path)
return pathlib.Path(path).exists()
def load_pickle(path: Union[str, pathlib.Path]) -> Any:
if path_exists(path):
with open(path, 'rb') as outfile:
output = pickle.load(outfile)
return output
else:
raise ValueError("Provided path does not exist")
def load_tensors(path: Union[str, pathlib.Path]) -> torch.Tensor:
if path_exists(path):
return torch.load(path, map_location= torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
else:
raise ValueError("Provided path does not exist")
#TODO: remove
def padding_token(input):
return self.tokenizer(
self.input_text,
return_tensors="pt",
padding=True,
truncation = True,
max_length=512.
)
def batch_chunk(self, input_base: List) -> torch.Tensor:
"""
Return sliced input based, base on chunk_size
provided in RetrieverCFG.chunk_size argument
Args:
input_base: List
Consist knowledge base, stored in dictionary.
This is a chunked one.
We just want to extending to chunk_size, to calculate more embeddings at once.
Returns:
List[List]
List of lists which consist of nested lists of lists.
"""
emb_len = len(input_base)
chunk_size = TranslatorCFG.chunk_size
n_ixes = int(emb_len/TranslatorCFG.chunk_size)
return [input_base[(ix*chunk_size):(ix+1)*chunk_size] if ix != (n_ixes+1) else input[ix:] for ix in range(n_ixes+1)]
def save_pickle(object, outpath) -> None:
with open(outpath, 'wb') as outfile:
pickle.dump(object, outfile)
def translate_base(input, unit_name, outdict: Dict):
tr = Translator()
all_chunk = tr.batch_chunk(input)
output = tr.process_batch(input_base = all_chunk)
outdict[unit_name] = output