-
Notifications
You must be signed in to change notification settings - Fork 510
/
stock_prediction.py
60 lines (48 loc) · 1.84 KB
/
stock_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from utils import data_string_to_float, status_calc
# The percentage by which a stock has to beat the S&P500 to be considered a 'buy'.
# Passed to status_calc when the training labels are generated, and echoed in the
# summary line printed by predict_stocks.
OUTPERFORMANCE = 10
def build_data_set():
    """
    Loads keystats.csv and converts it into training inputs for scikit-learn.

    Rows containing any missing value are discarded. Every column from
    position 6 onwards (the "Date" column is the index and is not counted)
    is used as a feature. One label per remaining row is produced by
    status_calc from the stock and S&P500 percentage changes, using the
    OUTPERFORMANCE threshold.

    :return: (X_train, y_train) -- the feature values as a numpy array and
        the labels as a plain list, in matching row order
    """
    keystats = pd.read_csv("keystats.csv", index_col="Date").dropna(
        axis=0, how="any"
    )

    # Labels: status_calc compares each stock's change against the index's
    # change (presumably flagging rows that beat it by the threshold --
    # see utils.status_calc for the exact rule).
    stock_change = keystats["stock_p_change"]
    index_change = keystats["SP500_p_change"]
    y_train = list(status_calc(stock_change, index_change, OUTPERFORMANCE))

    # Features: everything after the first six columns.
    feature_columns = keystats.columns[6:]
    X_train = keystats[feature_columns].values

    return X_train, y_train
def predict_stocks():
    """
    Trains a random forest on the data from build_data_set and applies it to
    forward_sample.csv, printing the tickers predicted to outperform.

    Rows of forward_sample.csv containing any missing value are discarded,
    and every column from position 6 onwards (the "Date" column is the index
    and is not counted) is used as a feature, mirroring build_data_set.

    :return: list of the predicted tickers, or None when no stock is predicted
    """
    X_train, y_train = build_data_set()
    # Remove the random_state parameter to generate actual predictions
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # Now we get the actual data from which we want to generate predictions.
    data = pd.read_csv("forward_sample.csv", index_col="Date")
    data.dropna(axis=0, how="any", inplace=True)
    features = data.columns[6:]
    X_test = data[features].values
    z = data["Ticker"].values

    # Get the predicted tickers
    y_pred = clf.predict(X_test)
    # Coerce the predictions to an explicit boolean mask. Indexing `z` with an
    # integer 0/1 array would be interpreted by numpy as positions (picking
    # z[0] and z[1] repeatedly) instead of as a filter; astype(bool) is a
    # no-op when the labels are already boolean.
    buy_mask = y_pred.astype(bool)

    if not buy_mask.any():
        print("No stocks predicted!")
        return None

    invest_list = z[buy_mask].tolist()
    print(
        f"{len(invest_list)} stocks predicted to outperform the S&P500 by more than {OUTPERFORMANCE}%:"
    )
    print(" ".join(invest_list))
    return invest_list
if __name__ == "__main__":
    # Script entry point: announce the run, then train the model and print
    # the predicted tickers.
    print("Building dataset and predicting stocks...")
    predict_stocks()