-
Notifications
You must be signed in to change notification settings - Fork 6
/
dataset.py
80 lines (59 loc) · 2.09 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import numpy as np
from collections import Counter
import itertools
# Seed NumPy's global random number generator with a fixed value at import
# time (presumably so that runs are reproducible).
np.random.seed(13)
def _pad_sequences(sequences, pad_tok, max_length):
    """Pad or truncate each sequence to ``max_length`` items.

    Args:
        sequences: an iterable of list or tuple
        pad_tok: the token to pad with
        max_length: the length every returned sublist is brought to

    Returns:
        a tuple ``(sequence_padded, sequence_length)``: the first element is
        a list of lists where each sublist has the same length, the second
        holds ``min(len(seq), max_length)`` for every input sequence, i.e.
        how many original items were kept
    """
    sequence_padded, sequence_length = [], []
    for seq in sequences:
        seq = list(seq)
        # For sequences longer than max_length the multiplier is clamped to
        # 0, so they are only truncated and receive no padding.
        padding = [pad_tok] * max(max_length - len(seq), 0)
        sequence_padded.append(seq[:max_length] + padding)
        sequence_length.append(min(len(seq), max_length))
    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok):
    """Pad every sequence to the length of the longest one.

    Args:
        sequences: an iterable (list, generator, ...) of list or tuple
        pad_tok: the token to pad with

    Returns:
        a tuple ``(sequence_padded, sequence_length)``: the padded sequences
        as a list of lists where each sublist has the same length, and the
        original length of every sequence; ``([], [])`` for empty input
    """
    # The input is traversed twice (once to find the longest sequence, once
    # to pad), so a one-shot iterator must be materialized first.
    sequences = list(sequences)
    # default=0 keeps an empty batch from raising ValueError in max().
    max_length = max(map(len, sequences), default=0)
    return _pad_sequences(sequences, pad_tok, max_length)
class Dataset:
    """Sentences and labels read from ``<data_name>.word.txt`` and
    ``<data_name>.label.txt``.

    After loading, ``words`` holds one list of vocabulary ids per line of the
    word file and ``labels`` holds one list of ints per line of the label
    file.
    """

    def __init__(self, data_name, vocab_words=None, init=True):
        """
        Args:
            data_name: path prefix shared by the word and label files
            vocab_words: mapping from word to id; must contain ``'$UNK$'``
                whenever a word outside the mapping is encountered
            init: when true, load the data immediately and then drop the
                ``vocab_words`` attribute
        """
        self.data_name = data_name
        self.vocab_words = vocab_words
        self.words = None
        self.labels = None
        if not init:
            return
        self._process_data()
        self._clean_data()

    def _clean_data(self):
        """Delete the ``vocab_words`` attribute from this instance."""
        del self.vocab_words

    def _process_data(self):
        """Read the raw files and store id-encoded words and labels."""
        sentences, labels = self._parse_raw()
        encoded = []
        for sentence in sentences:
            ids = []
            for token in sentence:
                # Words missing from the vocabulary map to the '$UNK$' id.
                if token in self.vocab_words:
                    ids.append(self.vocab_words[token])
                else:
                    ids.append(self.vocab_words['$UNK$'])
            encoded.append(ids)
        self.words = encoded
        self.labels = labels

    def _parse_raw(self):
        """Read both files and split every line on whitespace.

        Returns:
            a tuple ``(all_words, all_labels)``: a list of token lists (one
            per line of the word file) and a list of int lists (one per line
            of the label file)
        """
        word_path = '{}.word.txt'.format(self.data_name)
        with open(word_path, 'r') as word_file:
            all_words = [line.split() for line in word_file]
        label_path = '{}.label.txt'.format(self.data_name)
        with open(label_path, 'r') as label_file:
            all_labels = [
                [int(field) for field in line.split()]
                for line in label_file
            ]
        return all_words, all_labels