forked from krm1337/Mobile-Phone-Dataset-GSMArena
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gsmarena_scraping.py
168 lines (152 loc) · 7.18 KB
/
gsmarena_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import json
import platform
# Class gsmarena scrap the website phones models and its devices and save to csv file individually.
# Scrapes phone brands and per-device specifications from GSMArena and
# saves one CSV file per brand into the GSMArenaDataset folder.
class Gsmarena():

    def __init__(self):
        """Initialise shared state: CSV columns, site URL and output paths."""
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]  # base CSV columns; grows as new spec rows are seen
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website url
        self.new_folder_name = 'GSMArenaDataset'  # Folder name on which files going to save.
        # os.getcwd() is portable, so no platform branch and no shelling out
        # to `pwd` is needed to build the dataset folder's absolute path.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)

    def crawl_html_page(self, sub_url):
        """Fetch self.url + sub_url and return the parsed BeautifulSoup tree.

        Prints a message and exits the script on any request failure.
        """
        url = self.url + sub_url  # Url for html content parsing.
        # GSMArena blocks the default python-requests UA; send a browser-like one.
        header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
        # https://github.com/saschazesiger/Free-Proxies
        # NOTE(review): free proxies rot quickly — this address likely needs
        # replacing with a live one before each run.
        http_proxy = "103.127.243.31:1080"
        proxy_servers = {
            'http': "http://" + http_proxy,
            'https': "https://" + http_proxy,
        }
        time.sleep(30)  # throttle so the site does not block this IP
        try:
            page = requests.get(url, timeout=5, headers=header, proxies=proxy_servers)
            # Parse the html data returned by the requested url.
            return BeautifulSoup(page.text, 'html.parser')
        except Exception:
            # Any network/parse failure is fatal for a scraping run.
            print("Please check your network connection and re-run the script.")
            exit()

    def crawl_phones_models(self, phone_brand_link):
        """Return the list of device-page links for one brand, across all
        of the brand's paginated listing pages."""
        links = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            # Single-page brand: only the brand page itself.
            nav_link = [phone_brand_link]
        else:
            nav_link = [a['href'] for a in nav_data.findAll('a')]
            # The brand page itself is page 1; put it first.
            nav_link.insert(0, phone_brand_link)
        for page_link in nav_link:
            page_soup = self.crawl_html_page(page_link)
            section = page_soup.find(class_='section-body')
            for anchor in section.findAll('a'):
                links.append(anchor['href'])
        return links

    def crawl_phone_brands(self):
        """Return [[brand_slug, brand_name, brand_href], ...] from the makers page."""
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        for a in table.find_all('a'):
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands

    def crawl_phones_models_specification(self, link, phone_brand):
        """Scrape one device page and return its specification as a dict.

        Side effect: appends any newly seen spec label to self.features so the
        brand CSV header covers every column encountered.
        """
        phone_data = {}
        soup = self.crawl_html_page(link)
        model_name = soup.find(class_='specs-phone-name-title').text
        model_img_html = soup.find(class_='specs-photo-main')
        model_img = model_img_html.find('img')['src']
        phone_data.update({"Brand": phone_brand})
        phone_data.update({"Model Name": model_name})
        phone_data.update({"Model Image": model_img})
        for table in soup.findAll('table'):
            for row in table.findAll('tr'):
                cells = [td.getText().strip().replace("\n", "") for td in row.findAll('td')]
                # Guard BEFORE indexing: header/malformed rows have no <td>
                # cells (the original crashed on temp[0] here).
                if len(cells) < 2:
                    continue
                key = cells[0]
                if key in phone_data:
                    key = key + '_1'  # disambiguate repeated labels (e.g. two "Video" rows)
                if key not in self.features:
                    self.features.append(key)
                phone_data.update({key: cells[1]})
        return phone_data

    def create_folder(self):
        """Create the 'GSMArenaDataset' folder if it does not already exist."""
        if not os.path.exists(self.new_folder_name):
            print("Creating ", self.new_folder_name, " Folder....")
            # os.makedirs is portable and avoids shelling out to `mkdir`.
            os.makedirs(self.new_folder_name)
            print("Folder Created.")
        else:
            print(self.new_folder_name, "directory already exists")

    def check_file_exists(self):
        """Return the list of filenames already present in the dataset folder."""
        return os.listdir(self.absolute_path)

    def save_specification_to_file(self):
        """Scrape every brand and write one <Brand>.csv per brand, skipping
        brands whose CSV already exists."""
        phone_brand = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brand:
            phones_data = []
            csv_name = brand[0].title() + '.csv'
            if csv_name not in files_list:
                link = self.crawl_phones_models(brand[2])
                print("Working on", brand[0].title(), "brand.")
                for model_value, value in enumerate(link, start=1):
                    datum = self.crawl_phones_models_specification(value, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed ", model_value, "/", len(link))
                # newline='' + utf-8 lets csv handle quoting/encoding itself;
                # rows are written as str (the original encoded values to
                # bytes, which left literal b'...' artifacts in the file).
                with open(os.path.join(self.absolute_path, csv_name), "w", newline='', encoding='utf-8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    dict_writer.writerows(phones_data)
                print("Data loaded in the file")
            else:
                print(brand[0].title() + '.csv file already in your directory.')
# Script entry point: build a Gsmarena scraper and run one full scrape.
# (The original wrapped this in `i = 1; while i == 1:` — an unconditional
# infinite loop that re-ran the scrape forever and busy-spun on import.)
if __name__ == "__main__":
    obj = Gsmarena()
    try:
        obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("File has been stopped due to KeyBoard Interruption.")