forked from je-suis-tm/web-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CME2.py
124 lines (97 loc) · 4.25 KB
/
CME2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 9 11:33:03 2018
"""
#previously, in CME1
#i said scraping CME was effortless
#the CME technical team must have heard that
#because they changed the website from an xml structure to a json query
#that does not stop us, it only takes one extra step
#here is the trick
#before we actually go to the website of CME quotes
#we press ctrl+shift+i in chrome or f12 in ie
#so that we can inspect the elements of the website
#then we switch to the network monitor
#where we are able to see all the network activity
#including the request where the data of CME is coming from
#that request is the one this script sends directly
import pandas as pd
import requests
import os
#NOTE(review): the working directory is hard-coded to drive H:
#relative paths used later in the script (e.g. 'cme.csv' in main) resolve against it
#and this call raises OSError on a machine where H:/ does not exist
os.chdir('H:/')
#
def scrape(commodity_code,timeout=30):
    """Request the CME futures quotes of one commodity.

    The CME quotes page loads its data from a json endpoint, e.g.
    http://www.cmegroup.com/CmeWS/mvc/Quotes/Future/437/G
    where 437 is the product code of gold.

    How to find the code of a commodity (an issue raised by maysam19,
    https://github.com/je-suis-tm/web-scraping/issues/1):

    * for a handful of commodities, open each product page, e.g.
      https://www.cmegroup.com/trading/metals/precious/gold.html
      right click, select view page source and search for
      /CmeWS/mvc/Quotes/Future/
    * for many commodities, use the json file that contains the codes
      of each commodity in cme,
      https://www.cmegroup.com/CmeWS/mvc/ProductSlate/V2/List

    Parameters
    ----------
    commodity_code : str or int
        CME product code, inserted into the request url.
    timeout : float or tuple, optional
        Seconds to wait for the server, handed to requests as is.
        Defaults to 30 so that a stalled connection cannot hang the script.

    Returns
    -------
    requests.Response
        The response as received; the http status is not checked here.

    Raises
    ------
    requests.exceptions.RequestException
        When the connection fails or the timeout expires.
    """
    url='http://www.cmegroup.com/CmeWS/mvc/Quotes/Future/%s/G'%(commodity_code)

    #the with block closes the session and its pooled connections
    #the body is downloaded before session.get returns (no streaming)
    #so the response stays readable after the session is closed
    with requests.Session() as session:

        #cme officially forbids scraping
        #so a header is used to disguise the request as a browser
        session.headers.update(
            {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'})

        response=session.get(url,timeout=timeout)

    return response
#
def etl(commodity_code,commodity_name):
    """Download the quotes of one commodity and reshape them into a table.

    Parameters
    ----------
    commodity_code : str or int
        CME product code, passed on to scrape.
    commodity_name : str
        Label written to the 'name' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the 'quotes' list in the json payload, with
        the columns 'prior settle', 'expiration date', 'volume' (float),
        'name' and 'front month' (True on the row(s) with the largest
        volume). When the download fails, the error is printed and an
        empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        When the response body is not valid json or a volume is not numeric.
    KeyError
        When the payload or one of its quotes lacks an expected key.
    """
    columns=['prior settle','expiration date','volume','name','front month']

    try:
        response=scrape(commodity_code)
    except Exception as e:
        #best effort: report the failure and hand back an empty table
        #falling through would hit an unbound response variable
        print(e)
        return pd.DataFrame(columns=columns)

    print(response)

    #think of json file as dictionaries inside dictionaries
    #only the list under 'quotes' is needed
    #so it is read directly instead of loading the whole payload into a dataframe
    quotes=response.json()['quotes']

    #each quote is a normal dictionary
    #we use the key to get the value and form a new dataframe as output
    #only prior settle price and expiration date are needed
    #volume is used to detect the front month contract
    output=pd.DataFrame()
    output['prior settle']=[i['priorSettle'] for i in quotes]
    output['expiration date']=[i['expirationDate'] for i in quotes]
    output['volume']=[i['volume'] for i in quotes]

    #Series.replace only matches whole cell values
    #the thousands separator sits inside the text, which requires .str.replace
    output['volume']=output['volume'].astype(str).str.replace(',','').astype(float)

    output['name']=commodity_name

    #Series.max returns nan on an empty table where builtin max would raise
    output['front month']=output['volume']==output['volume'].max()

    return output
#
def main():
    """Run etl for silver, gold, palladium and copper and export the result to cme.csv."""

    #cme product code paired with the label that etl writes to the 'name' column
    metals=(
        ('458','silver'),
        ('437','gold'),
        ('445','palladium'),
        ('438','copper'),
    )

    frames=[etl(code,label) for code,label in metals]

    #stack the four tables on top of each other, then export
    combined=pd.concat(frames)
    combined.to_csv('cme.csv',encoding='utf_8_sig')


if __name__ == "__main__":
    main()