import requests
from bs4 import BeautifulSoup as soup
import re
from datetime import datetime as dt
from tenacity import retry, stop_after_attempt, wait_fixed
import psycopg2
import datetime
import pytz
class InvalidArticleLink(Exception):
def __init__(self, link):
self.link = link
self.message = f'Something is wrong with this link: {self.link}'
super().__init__(self.message)
class InvalidArticleText(Exception):
def __init__(self, article):
self.article = article
self.message = f'Something is wrong with the article text: {self.article}'
super().__init__(self.message)
class Proxies:
url = 'https://free-proxy-list.net/'
def __init__(self):
self.header = {'User-Agent': 'Mozilla/5.0'}
self.proxyList = []
self.currentProxy = None
self.getProxiesDefault()
def getProxiesDefault(self):
"""
        Makes a request to the free proxy URL and parses the resulting HTML to find all the proxies and their ports
:return: N/A
"""
try:
            req = requests.get(self.url, headers=self.header)  # sends the request with headers
            url = req.content  # the raw HTML of the proxy list page
except requests.exceptions.RequestException as e:
print('Error getting Proxies')
raise e
else:
            pageSoup = soup(url, "html.parser")  # parses the HTML into a BeautifulSoup object
            rows = pageSoup.findAll("tr")  # finds all the rows in the proxy table, if any
            rows = rows[1:300]  # skips the column headers and caps the number of rows parsed
self.proxyList = []
for row in rows:
cols = row.findAll('td')
cols = [element.text for element in cols]
                IP = cols[0]  # the IP address, the first element of the cols list
                portNum = cols[1]  # the port number, the second element of the cols list
                proxy = IP + ":" + portNum  # concatenates the IP and port
                protocol = cols[6]  # the "Https" column, which is either "yes" or "no"
                if protocol == "yes":  # checks if the proxy supports HTTPS
                    self.proxyList.append(proxy)
def refreshProxies(self):
"""
Refreshes the proxy list. Use this only if you need to manually refresh the list and apply some changes on top
of the regular list refresh.
:return: N/A
"""
self.getProxiesDefault()
# self.currentProxy = self.proxyList[0]
# return self.currentProxy
def getNextProxy(self):
"""
        Gets the next proxy in the list and refreshes the list if the current proxy is the last one in the list.
:return: The next proxy available
"""
if self.currentProxy is None:
self.currentProxy = self.proxyList[0]
return self.currentProxy
elif self.currentProxy != self.proxyList[-1]:
self.currentProxy = self.proxyList[self.proxyList.index(self.currentProxy) + 1]
return self.currentProxy
else:
self.getProxiesDefault()
self.currentProxy = self.proxyList[0]
return self.currentProxy
def checkProxy(self):
"""
Checks to see if the current proxy is working by testing a connection to google using the proxy
:return: the status code or the error that arises
"""
userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        buildProxy = {'http': self.currentProxy, 'https': self.currentProxy}  # routes both HTTP and HTTPS through the proxy
try:
r = requests.get('https://www.google.com', headers={'User-Agent': userAgent}, proxies=buildProxy, timeout=8)
return r.status_code
except (requests.exceptions.Timeout,
requests.exceptions.ProxyError,
requests.exceptions.SSLError,
requests.exceptions.ConnectionError) as e:
return e
class GNWData:
"""
The main class used for pulling GlobeNewsWire data.
"""
# sets the class attribute RSSurl to the RSS feed url that we want to parse data from
RSSurl = 'https://www.globenewswire.com/Atom/search/srvpLA0OZACn9KBGWOftivocEmgym4Tjxh69n0TAmMM%3d'
def __init__(self, oldHeadlines):
"""
Initializes instance variables
        :param oldHeadlines: A list of article headlines that were previously scanned by another GNWData instance.
                             Pass None on the first scan, when no previous data pull was made, or when duplicates
                             are checked in another manner.
"""
self.timePulled = None
self.headlines = []
self.oldHeadlines = oldHeadlines
self.entriesList = None
self.htmlData()
self.removeOld()
@classmethod
def changeRSSurl(cls, link):
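        """
        Changes the class-level RSS feed URL used by all GNWData instances.
        :param link: the new RSS/Atom feed URL
        """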
cls.RSSurl = link
def htmlData(self):
"""
        Pulls the raw entry data from the RSS link and parses it into a list of entry elements. Filtering out
        entries that have already been scanned is done separately by removeOld().
        :return: N/A
"""
try:
            # records the data pull time as a naive UTC datetime object
            self.timePulled = dt.utcnow()
            content = requests.get(self.RSSurl).text
        except requests.exceptions.RequestException as e:
            print(e)
            raise
else:
pageSoup = soup(content, 'html.parser')
self.entriesList = pageSoup.find_all('entry')
def removeOld(self):
"""
Filters through the pulled entries to find and remove the ones that have already been scanned previously
:return: N/A
"""
if self.oldHeadlines is not None:
validEntries = []
for item in self.entriesList:
headline = item.find('title').getText()
# sets self.headlines to all the valid entry headlines
if headline not in self.oldHeadlines:
self.headlines.append(headline)
validEntries.append(item)
self.entriesList = validEntries
else:
# sets self.headlines to all the headlines pulled
for item in self.entriesList:
headline = item.find('title').getText()
self.headlines.append(headline)
class Entry:
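    # exchange name markers used by getTicker() to locate a ticker symbol reference in the article text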
exchanges = ['Nasdaq', 'NASDAQ', 'NYSE', 'OTC', 'Symbol', 'OTCQB', 'OTCPK', 'OTCBB', 'OTC Pink', 'OTC.PK',
'OTC PINK', 'OTCMKTS', 'OTCQX', 'OTC BB', 'OTC Markets']
def __init__(self, entry, proxies):
"""
Initializes class instance variables
:param entry: A parsable BeautifulSoup object that contains the info for one article summary/entry
:param proxies: An instance of the Proxies class
"""
self.entry = entry
self.proxies = proxies # sets the instance variable self.proxies as the Proxies class instance
self.proxyErrorCounter = 0
        # runs the methods in the correct order on instance creation so that their results are accessible as attributes
self.link = self.getLink()
self.timeArticleReleased = self.getTimeRelease()
self.page_soup = self.makeRequest()
self.article = self.getArticle()
self.ticker = self.getTicker()
self.headline = self.getHeadline()
def getLink(self):
"""
Parses the entry to find the link to the article
:return: The link
"""
self.link = self.entry.find('id').getText()
return self.link
def getTimeRelease(self):
"""
        Retrieves the time the article was released from the entry and converts it to a naive datetime object that
        is already in UTC
:return: the time the article was released as a NAIVE datetime object in UTC
"""
self.timeArticleReleased = self.entry.find('updated').getText()
self.timeArticleReleased = dt.strptime(self.timeArticleReleased, '%Y-%m-%dT%H:%M:%SZ')
return self.timeArticleReleased
    # if an error arises during the request, this decorator retries the function; if the error persists, it is
    # propagated up to the code that called the method in the first place
@retry(stop=stop_after_attempt(6), wait=wait_fixed(10))
def makeRequest(self):
"""
        Makes a request to the link to grab the HTML and parse the results. Sets self.page_soup to the soup object of
        the HTML returned by the request.
        :return: the BeautifulSoup object of the article page
"""
try:
content = requests.get(self.link, proxies={'https': self.proxies.getNextProxy()}).content
        except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema):
            print(f'Malformed article link: {self.link}')
            raise InvalidArticleLink(self.link)
        except requests.exceptions.ProxyError:  # catches proxy errors during the request
            print('Proxy Error, trying the next one...')
            self.proxyErrorCounter += 1
            if self.proxyErrorCounter >= 4:
                self.proxies.refreshProxies()  # refreshes the proxy list
                self.proxyErrorCounter = 0
            raise  # re-raises so the retry decorator tries again with the next proxy
        except requests.exceptions.RequestException as e:
            print(e)
            raise
        except Exception as e:
            print(e)
            raise  # re-raises so the retry decorator can retry instead of silently returning None
else:
self.page_soup = soup(content, 'html.parser')
return self.page_soup
def getHeadline(self):
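        """
        Extracts the headline from the parsed article page.
        :return: the headline text
        """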
self.headline = self.page_soup.find('h1', {'class': 'article-headline'}).text
return self.headline
def getArticle(self):
"""
        Extracts the article body text from the parsed article page.
        :return: the article text
"""
self.article = self.page_soup.find('span', {'class': 'article-body'}).text
return self.article
def getTicker(self):
"""
Parses the article text and finds the stock ticker if there is one. InvalidArticleText error is raised if there
is a problem parsing the text for any reason.
:return: the stock ticker if there is one and None if there isn't
"""
try:
for exchange in self.exchanges:
if exchange in self.article:
exchangeTuple = self.article.partition(exchange)
foundCloseBracket = False
foundOpenBracket = False
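                    # scans up to 35 characters on either side of the exchange name for the enclosing parentheses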
for i in range(-1, -35, -1):
if exchangeTuple[0][i] == '(':
foundOpenBracket = True
break
elif exchangeTuple[0][i] == ')':
foundOpenBracket = False
break
for i in range(35):
if exchangeTuple[2][i] == ')':
foundCloseBracket = True
break
elif exchangeTuple[2][i] == '(':
foundCloseBracket = False
break
if foundOpenBracket and foundCloseBracket:
for i in range(30):
if exchangeTuple[2][i] == ':':
secondString = exchangeTuple[2].partition(':')
symbolString = secondString[2].strip()[:5]
                                regex = re.compile(r'[^;)\s]+')
symbol = regex.search(symbolString).group(0)
if symbol.isalpha():
self.ticker = symbol
return self.ticker
else:
return None
except TypeError:
raise InvalidArticleText(self.article)
except IndexError:
print(exchangeTuple)
def handler(event=None, context=None):
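    """
    AWS Lambda entry point; delegates to lambda_function().
    """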
lambda_function()
return
def lambda_function():
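    """
    Pulls the latest GlobeNewsWire entries, parses each one, and inserts the results into the Postgres database.
    """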
data = GNWData(None)
proxies = Proxies()
try:
connection = psycopg2.connect(user="timolegros",
password="Nashville2020",
host="quadko-paris.ca0fcwommpnv.eu-west-3.rds.amazonaws.com",
port="5432",
database="GlobeNewsWire")
cursor = connection.cursor()
except (Exception, psycopg2.Error) as error:
print(error)
raise error
# gets the 10 most recent articles from oldest to newest
data = data.entriesList[9::-1]
recordedTickers = []
for item in data:
try:
entry = Entry(item, proxies)
except Exception:
continue
try:
cursor.execute("""INSERT INTO "Main"("Ticker", "Headline", "PubTime", "Link", "ArticleText")
VALUES (%s, %s, %s, %s, %s);""",
(entry.ticker, entry.headline, entry.timeArticleReleased, entry.link, entry.article))
connection.commit()
# recordedTickers.append(entry.ticker)
except Exception as error:
print(error)
connection.rollback()
if connection:
cursor.close()
connection.close()
# try:
# connection = psycopg2.connect(user="timolegros",
# password="Nashville2020",
# host="quadko-paris.ca0fcwommpnv.eu-west-3.rds.amazonaws.com",
# port="5432",
# database="Main")
#
# cursor = connection.cursor()
#
# except (Exception, psycopg2.Error) as error:
# print(error)
# raise error
#
# currentTime = datetime.datetime.now(pytz.utc)
# for item in recordedTickers:
# try:
# cursor.execute("""INSERT INTO "Tickers"("Ticker", "LastUpdate")
# VALUES (%s, %s);""",
# (item, currentTime))
# connection.commit()
# except Exception as error:
# print(error)
# connection.rollback()
#
# if connection:
# cursor.close()
# connection.close()
# # calls the twitter lambda to update all the tweets for the new news
# lambda_client = boto3.client("lambda")
# response = lambda_client.invoke(
# FunctionName='TwitterLambda',
# InvocationType='Event',
# LogType='None',
# Payload='Some Data',
# )
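# A minimal local-usage sketch (no database writes), assuming network access and at least one working proxy:
#
#   proxies = Proxies()
#   data = GNWData(None)
#   for item in data.entriesList[:3]:
#       entry = Entry(item, proxies)
#       print(entry.headline, entry.ticker, entry.link)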
if __name__ == '__main__':
handler()