-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsmote.py
157 lines (128 loc) · 6.65 KB
/
smote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import torch
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import sys, os, time, argparse
"""
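Build lagged feature vectors (graph images, tweet embeddings, MACD values),
balance the two classes with SMOTE (Synthetic Minority Over-sampling Technique),
and save the resampled arrays split back into their separate parts.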
"""
# Module-level counters and ticker list; nothing else in this script reads them.
total_labels = 0
total_pos = 0
tickers = []

# Input directories. The same file name is looked up under each of them.
# Alternative graph source:
# graphs_directory = '/work/nlp/b.irving/stock/graphsPreprocessed'
graphs_directory = '/work/nlp/b.irving/stock/graphs'
tweets_directory = '/work/nlp/b.irving/stock/tweets_2'
labels_directory = '/work/nlp/b.irving/stock/labels_2'
macd_directory = '/work/nlp/b.irving/stock/macd'


def _add_int_flag(arg_parser, short_flag, long_flag, default, description):
    """Register one integer command-line option with its default value."""
    arg_parser.add_argument(
        short_flag, long_flag, type=int, help=description, default=default
    )


# Command-line options: window length and the size of each feature part.
parser = argparse.ArgumentParser()
_add_int_flag(parser, '-l', '--lag', 5, 'Lag period')
_add_int_flag(parser, '-c', '--channels', 4, 'Number of channels in the image')
_add_int_flag(parser, '-ih', '--image_height', 224, 'Height of the images')
_add_int_flag(parser, '-iw', '--image_width', 224, 'Width of the images')
_add_int_flag(parser, '-t', '--tweet_embedding_dim', 128, 'Dimension of the tweet embedding')
_add_int_flag(parser, '-md', '--macd_dim', 4, 'Dimension of the macd information')
args = parser.parse_args()

lag, channels = args.lag, args.channels
image_height, image_width = args.image_height, args.image_width
tweet_embedding_dim, macd_dim = args.tweet_embedding_dim, args.macd_dim

# Flattened size of one image, and of one full sample covering `lag` entries
# of image + tweet embedding + MACD values.
image_dim = channels * image_height * image_width
feature_vector_length = lag * (image_dim + tweet_embedding_dim + macd_dim)
# Per-class accumulators of flattened feature vectors, filled across all files.
all_positives = []
all_negatives = []
print('Gathering features...')
# Every file name found in the tweets directory is also expected to exist in
# the graphs, labels and MACD directories. The listing is sorted because
# os.listdir returns names in arbitrary order, which would make the sample
# order differ between runs.
for filename in sorted(os.listdir(tweets_directory)):
    print(filename)
    # Construct the full file paths
    tweets_file_path = os.path.join(tweets_directory, filename)
    graph_file_path = os.path.join(graphs_directory, filename)
    labels_file_path = os.path.join(labels_directory, filename)
    macd_file_path = os.path.join(macd_directory, filename)

    tweets = torch.load(tweets_file_path, map_location=torch.device('cpu')).cpu()
    graphs = torch.load(graph_file_path, map_location=torch.device('cpu')).cpu()
    macds = torch.load(macd_file_path, map_location=torch.device('cpu')).cpu()
    labels = torch.load(labels_file_path, map_location=torch.device('cpu')).cpu()

    # NOTE(review): a complete window only needs index >= lag - 1, so the
    # windows ending at index lag - 1 and lag are skipped here. The original
    # `index > lag` threshold is kept so regenerated datasets match earlier
    # runs -- confirm whether skipping them is intended.
    for index in range(lag + 1, labels.shape[0]):
        # The `lag` consecutive entries ending at (and including) `index`.
        window = slice(index - (lag - 1), index + 1)
        # One row per entry in the window: [graph | tweet | macd].
        per_day = np.concatenate(
            (
                graphs[window].numpy().reshape(lag, -1),
                tweets[window].numpy().reshape(lag, -1),
                macds[window].numpy().reshape(lag, -1),
            ),
            axis=1,
        )
        # Flatten row by row, so the vector is laid out entry by entry
        # ([graph | tweet | macd] repeated `lag` times). Raises if the sizes
        # do not match the configured dimensions.
        combined_features = per_day.reshape(feature_vector_length)
        # Any label other than 0 is treated as positive.
        if labels[index] == 0:
            all_negatives.append(combined_features)
        else:
            all_positives.append(combined_features)

if not all_positives or not all_negatives:
    # np.stack would fail on an empty list with a far less helpful message.
    raise ValueError(
        'Need at least one positive and one negative sample; got '
        f'{len(all_positives)} positive and {len(all_negatives)} negative.'
    )

# Stack each class into a (num_samples, feature_vector_length) array and build
# matching label vectors (1 for positive, 0 for negative).
all_positives_array = np.stack(all_positives, axis=0)
all_positives_labels = np.ones(all_positives_array.shape[0])
all_negatives_array = np.stack(all_negatives, axis=0)
all_negatives_labels = np.zeros(all_negatives_array.shape[0])

# Positives first, then negatives, in both the features and the labels.
all_xs = np.concatenate((all_positives_array, all_negatives_array), axis=0)
all_ys = np.concatenate((all_positives_labels, all_negatives_labels), axis=0)
np.save(f'/work/nlp/b.irving/stock/complete/all_xs_{lag}.npy', all_xs)
np.save(f'/work/nlp/b.irving/stock/complete/all_ys_{lag}.npy', all_ys)
print('Features gathered.')
# To skip feature gathering on a rerun, previously saved arrays can be loaded
# instead:
# all_xs = np.load('all_xs_10.npy')
# all_ys = np.load('all_ys_10.npy')

# Oversample with SMOTE using its default settings. This step is slow on
# feature vectors of this length.
# NOTE(review): despite the "minority" in the names, fit_resample presumably
# returns the whole resampled dataset, not only the synthetic rows -- confirm
# against the imbalanced-learn documentation.
smote = SMOTE()
X_minority_resampled, y_minority_resampled = smote.fit_resample(all_xs, all_ys)

# Persist the resampled features and labels, tagged with the lag period.
np.save(
    f'/work/nlp/b.irving/stock/complete/x_resampled_{lag}.npy',
    X_minority_resampled,
)
np.save(
    f'/work/nlp/b.irving/stock/complete/y_resampled_{lag}.npy',
    y_minority_resampled,
)
print('Resampling complete.')
# To skip resampling on a rerun, previously saved arrays can be loaded instead:
# X_minority_resampled = np.load('x_resampled.npy')
# y_minority_resampled = np.load('y_resampled.npy')

print('Saving images and tweets...')
# Each feature row was built by concatenating graph, tweet and MACD values
# along the last axis of a (1, lag, -1) array and then flattening it. The flat
# row is therefore laid out entry by entry:
#   [graph_0 | tweet_0 | macd_0 | graph_1 | tweet_1 | macd_1 | ...]
# and NOT as [all graphs | all tweets | all macds]. Slicing the flat row at
# fixed offsets mixes the parts together, so restore the (lag, per-entry) axes
# first and slice along the last axis. Using the configured dimensions instead
# of literal offsets also keeps this correct for non-default CLI arguments.
per_day = X_minority_resampled.reshape(
    -1, lag, image_dim + tweet_embedding_dim + macd_dim
)
tweet_start = image_dim
macd_start = image_dim + tweet_embedding_dim

images = per_day[:, :, :tweet_start]
print('images', images.shape)
# assumes each graph was flattened from (channels, height, width) -- TODO confirm
images_to_save = images.reshape(-1, lag, channels, image_height, image_width)
print('images to save', images_to_save.shape)

tweets = per_day[:, :, tweet_start:macd_start]
print('tweets', tweets.shape)
tweets_to_save = tweets.reshape(-1, lag, tweet_embedding_dim)
print('tweets to save', tweets_to_save.shape)

macds = per_day[:, :, macd_start:]
print('MACD', macds.shape)
macds_to_save = macds.reshape(-1, lag, macd_dim)
print('MACDs to save', macds_to_save.shape)

# One output file per part, tagged with the lag period.
np.save(f'/work/nlp/b.irving/stock/complete/graphs_{lag}.npy', images_to_save)
np.save(f'/work/nlp/b.irving/stock/complete/tweets_{lag}.npy', tweets_to_save)
np.save(f'/work/nlp/b.irving/stock/complete/macds_{lag}.npy', macds_to_save)
print('Process complete.')