forked from robertmartin8/MachineLearningStocks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
backtesting.py
86 lines (72 loc) · 3.6 KB
/
backtesting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from utils import status_calc
def backtest():
    """
    Run a simple backtest of a Random Forest stock-picking strategy.

    The dataset in ``keystats.csv`` is randomly split into a train set and a test set, and a
    Random Forest classifier is fitted to the train set. The accuracy and precision of the
    classifier on the test set are printed, followed by a report comparing the average return
    of the stocks predicted to outperform against the S&P500 return for those same rows.

    If the classifier makes no positive predictions, "No stocks predicted!" is printed and the
    performance report is skipped, because there are no trades to average over.

    Please note that there is a methodological flaw in this backtest which will give deceptively
    good results, so the results here should not encourage you to live trade.

    Returns:
        None. All results are printed to stdout.
    """
    # Build the dataset, and drop any rows with missing values
    data_df = pd.read_csv("keystats.csv", index_col="Date")
    data_df.dropna(axis=0, how="any", inplace=True)

    # Every column from position 6 onwards is treated as a feature.
    features = data_df.columns[6:]
    X = data_df[features].values

    # The labels are generated by applying the status_calc to the dataframe.
    # '1' if a stock beats the S&P500 by more than x%, else '0'. Here x is the
    # outperformance parameter, which is set to 10 by default but can be redefined.
    y = list(
        status_calc(
            data_df["stock_p_change"], data_df["SP500_p_change"], outperformance=10
        )
    )

    # z is required for us to track returns
    z = np.array(data_df[["stock_p_change", "SP500_p_change"]])

    # Generate the train set and test set by randomly splitting the dataset
    X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
        X, y, z, test_size=0.2
    )

    # Instantiate a RandomForestClassifier with 100 trees, then fit it to the training data
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # Generate the predictions, then print test set accuracy and precision
    y_pred = clf.predict(X_test)
    print("Classifier performance\n", "=" * 20)
    print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
    print(f"Precision score: {precision_score(y_test, y_pred): .2f}")

    # Convert the predictions to an explicit boolean mask. Indexing z_test with
    # integer 0/1 labels would select rows 0 and 1 repeatedly instead of selecting
    # the predicted rows, so the mask must be boolean whatever dtype the labels have.
    buy_mask = np.asarray(y_pred).astype(bool)

    # The number of positive predictions is the number of True entries in the mask.
    num_positive_predictions = int(buy_mask.sum())
    if num_positive_predictions == 0:
        # Nothing was 'bought', so the averages below would divide by zero.
        print("No stocks predicted!")
        return

    # Recall that z_test stores the change in stock price in column 0, and the
    # change in S&P500 price in column 1.
    # Whenever a stock is predicted to outperform (y_pred = 1), we 'buy' that stock
    # and simultaneously `buy` the index for comparison.
    stock_returns = 1 + z_test[buy_mask, 0] / 100
    market_returns = 1 + z_test[buy_mask, 1] / 100

    # Calculate the average growth for each stock we predicted 'buy'
    # and the corresponding index growth
    avg_predicted_stock_growth = stock_returns.sum() / num_positive_predictions
    index_growth = market_returns.sum() / num_positive_predictions
    percentage_stock_returns = 100 * (avg_predicted_stock_growth - 1)
    percentage_market_returns = 100 * (index_growth - 1)
    total_outperformance = percentage_stock_returns - percentage_market_returns

    print("\n Stock prediction performance report \n", "=" * 40)
    print("Total Trades:", num_positive_predictions)
    print(f"Average return for stock predictions: {percentage_stock_returns: .1f} %")
    print(
        f"Average market return in the same period: {percentage_market_returns: .1f}% "
    )
    print(
        f"Compared to the index, our strategy earns {total_outperformance: .1f} percentage points more"
    )
# Run the backtest only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    backtest()