#this is a script to scrape a website that requires login
#make sure you understand the basics of a webpage
#you should go through other simple scrapers in this repo before moving to this one
# https://github.com/je-suis-tm/web-scraping
#in the following context
#the script is trying to get some articles from a website
#this website called cqf only allows pdf download for registered users
import requests
from bs4 import BeautifulSoup as bs
import re
import os
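
#note: the pdf later gets written to the working directory
#change this path to wherever you want the file saved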
os.chdir('d:/')

def main():
    #input your username and password
    #ideally we should not store the password in plain text
    #we should use getpass as follows
    """
    import getpass
    password=getpass.getpass('input password:')
    """
    session=requests.Session()
    username=''
    password=''
    prefix='https://www.cqfinstitute.org/cqf-access/nojs/'
    login_url='https://www.cqfinstitute.org/user/login?destination=cqf-access/nojs/'

    #the first stage is to get a list of what you want
    response=session.get('https://www.cqfinstitute.org/articles')
    page=bs(response.content,'html.parser')

    #in this case, we just need to find a list of all the articles
    #each article is assigned a code
    #we only need (prefix+code) to visit the article download page
    articlelist=page.find_all('a',class_='use-ajax ctools-modal-cqf-popup')
    d={}
    for i in articlelist:
        if i.text:
            d[i.text]=re.search(r'(?<=nojs/)\d*',
                                i.get('href')).group()
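
    #for illustration: assuming a scraped href looks like
    # "/cqf-access/nojs/12345" (a made-up code)
    #the lookbehind regex above would capture "12345"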

    #d is a dictionary that maps every article title to its code
    #for simplicity, we only want to get the first article
    target=d[list(d.keys())[0]]

    #the second stage is authentication
    #for websites without captcha or other methods to detect bots
    #it will be as simple as follows
    #if we need to go through captcha or other human verification
    #we can use a neural network to recognize the image
    #or download the image and let a human identify it
    #this script will not cover that part (cuz i am lazy)
    #you may wonder where i got the headers and data from
    #before writing any script at all
    #we should use a browser to log in and go through the process
    #while typing username and password in the browser
    #we can right click and inspect element
    #in chrome, simply ctrl+shift+i
    #the tabs at the top of the popup window are elements, console, sources, network...
    #we select the network monitor before we log in
    #next, we click the sign in button
    #and we should see a lot of traffic in the network monitor
    #usually there is a request called login or sign-in or auth
    #when we click it, we can see our username and password in the form data
    #voila, that is everything we need to post
    #an easy way is to copy the request as powershell and paste it in our ide
    #we just need to restructure headers and form data in a pythonic way
    #normally we dont include cookies as they may expire after a few weeks
    #and we can find the login url in the request url section
    auth=session.post(login_url+target,
                      headers={"Cache-Control":"max-age=0",
                               "Origin":"https://www.cqfinstitute.org",
                               "Upgrade-Insecure-Requests":"1",
                               "DNT":"1",
                               "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
                               "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                               "Referer":"https://www.cqfinstitute.org/user/login?destination=cqf-access/nojs/"+target,
                               "Accept-Encoding":"gzip, deflate, br",
                               "Accept-Language":"en-US,en;q=0.9"},
                      data={'name': username,
                            'pass': password,
                            'form_id': 'user_login',
                            'device_authentication_operating_system': 'Windows 10 64-bit',
                            'device_authentication_browser': 'Chrome',
                            'op': 'Log in'})
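
    #before moving on, it is worth a quick look at the login response
    #a rough check (sketch), assuming a failed login simply
    #sends back the login form we just posted to (form_id user_login)
    """
    if auth.status_code!=200 or 'user_login' in auth.text:
        print('login may have failed')
    """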

    #normally when we finish login
    #we should take a look at the response
    #in most cases, the login response is a json
    #we need to find something like token or auth
    #and update the session header as follows
    """
    token=auth.json()["token"]
    session.headers.update({"Authorization": 'Token %s'%token})
    """
    #for this particular site, the session cookie set during login
    #is enough, so we can move on without a token

    #once we officially sign in as a user
    #the third stage is to download the pdf
    response=session.get(prefix+target)
    page=bs(response.content,'html.parser')
    pdf_link=page.find('div',class_='file file-ext').find('a').get('href')
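
    #note this assumes the href is an absolute url
    #if the site served a relative path instead, a sketch of the fix
    """
    from urllib.parse import urljoin
    pdf_link=urljoin('https://www.cqfinstitute.org',pdf_link)
    """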
    pdf=session.get(pdf_link)
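
    #requests will happily save an error page under a .pdf name
    #a quick sanity check (sketch) before writing to disk
    """
    pdf.raise_for_status()
    """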

    with open('a.pdf','wb') as f:
        f.write(pdf.content)

    return
if __name__ == "__main__":
main()