from argparse import ArgumentParser

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
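
# Third-party dependencies: install with
#   pip install requests beautifulsoup4 selenium webdriver-manager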
"""
Main Process:
Step 1. Parse command line arguments
Step 2. Find suburls or links of the web page
Step 3. Add raw links/urls to a list
Step 4. Remove duplicates of the list and output
"""
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--ignore-ssl-errors')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
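
# Global toggle; set from the --download flag in main().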
download_img = False
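
# Minimal print-based logging helpers.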
class LOG:
    @staticmethod
    def warn(msg):
        print("warning: " + msg)

    @staticmethod
    def error(msg):
        print("error: " + msg)

    @staticmethod
    def info(msg):
        print("info: " + msg)

    @staticmethod
    def decoration(output):
        # Print a section header followed by a dashed rule.
        print('\n', output, '\n', '--' * 20)
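
# Static URL and HTML utilities shared by the commands below.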
class HELPER:
    @staticmethod
    def if_url(url):
        # True when the href/src is already an absolute URL.
        return 'http' in url or 'www.' in url

    @staticmethod
    def remove_url_ends(url):
        # Reduce a URL to scheme + host, e.g.
        # 'https://example.com/a/b' -> 'https://example.com'.
        try:
            if 'https' in url:
                return 'https://' + url.split('//')[1].split('/')[0]
            return 'http://' + url.split('//')[1].split('/')[0]
        except IndexError:
            LOG.error("please input the full URL, including the scheme")

    @staticmethod
    def download_image(name, url):
        with open(name, 'wb') as f:
            f.write(requests.get(url).content)

    @staticmethod
    def get_html(parent_url):
        # Render the page in the headless browser, then parse it.
        driver.get(parent_url)
        return BeautifulSoup(driver.page_source, 'html.parser')
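
# High-level commands; each maps onto one of the CLI flags in main().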
class Instructions(HELPER):
    # Not wired to a CLI flag yet.
    def find_files(self, parent_url):
        LOG.decoration('Files:')
        url_list = []
        suffix = input('file suffix: ')
        for x in self.find_links(parent_url):
            if suffix in x:
                print(x)
                url_list.append(x)
        if len(url_list) == 0:
            print("Couldn't find anything...")

    # Find the sub-URLs or links of the web page.
    def find_links(self, parent_url):
        url_list = []
        url_prefix = HELPER.remove_url_ends(parent_url)
        for link in HELPER.get_html(parent_url).find_all('a'):
            try:
                href = link.get('href')
                if not href:
                    continue
                if HELPER.if_url(href):
                    url_list.append(href)
                else:
                    url_list.append(url_prefix + href)
            except Exception as e:
                LOG.warn(str(e))
        return list(set(url_list))

    def find_img(self, parent_url):
        url_list = []
        url_prefix = HELPER.remove_url_ends(parent_url)  # scheme + host
        html = HELPER.get_html(parent_url)
        for link in html.find_all('img'):
            src = link.get('src')
            if not src:
                continue
            url_list.append(src if HELPER.if_url(src) else url_prefix + src)
        return list(set(url_list))
    def find_all_img(self, url):
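        # The original left this method as a stub. A minimal sketch, assuming
        # the documented '-all_img' behaviour means: the page's own images plus
        # those of every page it links to, de-duplicated.
        url_list = list(self.find_img(url))
        for sub_url in self.find_links(url):
            try:
                url_list.extend(self.find_img(sub_url))
            except Exception as e:
                LOG.warn(str(e))
        url_list = list(set(url_list))
        LOG.decoration('All images:')
        for i, img in enumerate(url_list, start=1):
            print(f'[{i}] {img}')
        return url_list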
    def find_all_links(self, url):
        print(self.find_links(url))
def main():
    # Parse the command line arguments.
    parser = ArgumentParser()
    parser.add_argument("url", nargs="?", default="", help='The URL of the target website/webpage')
    parser.add_argument('-img', help='Find all of the images on the webpage', action='store_true')
    parser.add_argument('-all_img', help='Find all of the images on the webpage and its subpages', action='store_true')
    parser.add_argument('-link', help='Find the sub-URLs/links on the webpage', action='store_true')
    parser.add_argument('-all_links', help='Find all of the sub-URLs/links on the webpage', action='store_true')
    parser.add_argument('--download', help='Download the images that were found', action='store_true')
    args = parser.parse_args()
    global download_img
    download_img = args.download
    instruc = Instructions()
    if args.img:
        url_list = instruc.find_img(args.url)
        LOG.decoration('Images:')
        for i, img_url in enumerate(url_list, start=1):
            print(f'[{i}] {img_url}')
            if not download_img:
                continue
            try:
                # Every image is saved under a numbered .png name,
                # whatever its real format.
                HELPER.download_image(f'img[{i}].png', img_url)
                LOG.info('downloaded successfully...')
            except Exception:
                LOG.error('invalid image...')
    if args.all_img:
        instruc.find_all_img(args.url)
    if args.link:
        url_list = instruc.find_links(args.url)
        LOG.decoration('Links:')
        for i, link_url in enumerate(url_list, start=1):
            print(f'[{i}] {link_url}')
    if args.all_links:
        instruc.find_all_links(args.url)


if __name__ == '__main__':
    main()