forked from robertmartin8/MachineLearningStocks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
current_data.py
156 lines (133 loc) · 4.74 KB
/
current_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import pandas as pd
import os
import re
import time
import requests
import numpy as np
from tqdm import tqdm
from utils import data_string_to_float
# Directory holding the fundamental data. check_yahoo() lists this directory
# to obtain the tickers it downloads.
statspath = "intraQuarter/_KeyStats/"

# Labels of the statistics that forward() searches for in each downloaded
# page. The order here is the order of the feature columns in the output.
features = [
    # --- Valuation measures ---
    "Market Cap",
    "Enterprise Value",
    "Trailing P/E",
    "Forward P/E",
    "PEG Ratio",
    "Price/Sales",
    "Price/Book",
    "Enterprise Value/Revenue",
    "Enterprise Value/EBITDA",
    # --- Financials ---
    "Profit Margin",
    "Operating Margin",
    "Return on Assets",
    "Return on Equity",
    "Revenue",
    "Revenue Per Share",
    "Quarterly Revenue Growth",
    "Gross Profit",
    "EBITDA",
    "Net Income Avi to Common",
    "Diluted EPS",
    "Quarterly Earnings Growth",
    "Total Cash",
    "Total Cash Per Share",
    "Total Debt",
    "Total Debt/Equity",
    "Current Ratio",
    "Book Value Per Share",
    "Operating Cash Flow",
    "Levered Free Cash Flow",
    # --- Trading information ---
    "Beta",
    "50-Day Moving Average",
    "200-Day Moving Average",
    "Avg Vol (3 month)",
    "Shares Outstanding",
    "Float",
    "% Held by Insiders",
    "% Held by Institutions",
    "Shares Short",
    "Short Ratio",
    "Short % of Float",
    # NOTE(review): no closing parenthesis. The label is only used as a regex
    # prefix, so this is presumably intentional -- confirm before "fixing" it.
    "Shares Short (prior month",
]
def check_yahoo(timeout=30):
    """
    Download the Yahoo Finance key-statistics page for every ticker found in `statspath`.

    The ticker names are the directory entries of `statspath`. Each page is saved as
    `forward/<ticker>.html`. Failures for an individual ticker are printed and skipped,
    so one bad ticker does not abort the whole run.

    :param timeout: seconds to wait for the server before giving up on a request.
        Without a timeout a single stalled connection would block the loop forever.
    :return: None. The result is a directory named `forward/` filled with the html
        files for each ticker.
    """
    # Create the directory where we will store the current data.
    # exist_ok=True makes this a no-op when the directory is already there.
    os.makedirs("forward/", exist_ok=True)

    # Retrieve a list of tickers from the fundamental data folder
    ticker_list = os.listdir(statspath)

    # Required in macOS to remove the hidden index file.
    if ".DS_Store" in ticker_list:
        ticker_list.remove(".DS_Store")

    for ticker in tqdm(ticker_list, desc="Download progress:", unit="tickers"):
        try:
            link = f"http://finance.yahoo.com/quote/{ticker.upper()}/key-statistics"
            resp = requests.get(link, timeout=timeout)

            # Write results to forward/
            save = f"forward/{ticker}.html"
            with open(save, "w") as file:
                file.write(resp.text)

        # Deliberately broad: this is a best-effort bulk download, so any failure
        # (network, timeout, file system) is reported and the loop moves on.
        except Exception as e:
            print(f"{ticker}: {str(e)}\n")
            # Brief pause after a failure before trying the next ticker.
            time.sleep(2)
def forward():
    """
    Creates the forward sample by parsing the current data html files that we downloaded in check_yahoo().

    Every file in `forward/` yields one row. The ticker is taken from the file name,
    and each entry of `features` is located in the page with a regular expression.
    Features that cannot be found end up as NaN. The index and price columns
    (Date, Unix, Price, stock_p_change, SP500, SP500_p_change) are filled with 0
    placeholders.

    :return: a pandas dataframe containing all of the current data for each ticker
        (empty, but with all columns, when `forward/` contains no files).
    """
    # In addition to the features, we need some index variables
    # (date, unix timestamp, ticker), and of course the dependent variables (prices).
    df_columns = [
        "Date",
        "Unix",
        "Ticker",
        "Price",
        "stock_p_change",
        "SP500",
        "SP500_p_change",
    ] + features

    tickerfile_list = os.listdir("forward/")

    # Required in macOS to remove the hidden index file.
    if ".DS_Store" in tickerfile_list:
        tickerfile_list.remove(".DS_Store")

    # The patterns depend only on the feature labels, so compile them once here
    # rather than once per file. Each one looks for the first number (or N/A, >0,
    # NaN marker) that appears after an occurrence of the label and is directly
    # followed by a closing </td> or </span>.
    patterns = [
        re.compile(
            r">"
            + re.escape(variable)
            + r".*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?"
            r"(</td>|</span>)",
            flags=re.DOTALL,
        )
        for variable in features
    ]

    # Rows are collected in a plain list and turned into a dataframe once at the
    # end. DataFrame.append was removed in pandas 2.0, and it copied the whole
    # frame on every call.
    rows = []

    # This is the actual parsing. This needs to be fixed every time yahoo changes their UI.
    for tickerfile in tqdm(tickerfile_list, desc="Parsing progress:", unit="tickers"):
        ticker = tickerfile.split(".html")[0].upper()

        # Use a context manager so the file handle is closed right away.
        with open(f"forward/{tickerfile}") as html_file:
            source = html_file.read()

        # Remove commas from the html to make parsing easier.
        source = source.replace(",", "")

        # Regex search for the different variables in the html file, then append to value_list
        value_list = []
        for pattern in patterns:
            match = pattern.search(source)
            if match is None:
                # The data may not be present. Process accordingly.
                value_list.append("N/A")
            else:
                # Dealing with number formatting
                value_list.append(data_string_to_float(match.group(1)))

        # The ticker and the features form one row of the dataframe.
        rows.append([0, 0, ticker, 0, 0, 0, 0] + value_list)

    df = pd.DataFrame(rows, columns=df_columns)
    return df.replace("N/A", np.nan)
if __name__ == "__main__":
    # Script entry point: refresh the downloaded pages first, then parse them
    # and write the resulting table to forward_sample.csv without the index.
    check_yahoo()
    forward().to_csv("forward_sample.csv", index=False)