-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.py
61 lines (48 loc) · 1.8 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import requests
from bs4 import BeautifulSoup
import json
def _text_or_none(soup, tag, css_class):
    """Return the text of the first ``tag`` carrying ``css_class``, or None if absent."""
    node = soup.find(tag, {"class": css_class})
    return node.text if node is not None else None


def scrape_instructables(link, timeout=30):
    """Fetch one Instructables page and extract a summary of it.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A dict with the keys ``header_title``, ``youtube_url``,
        ``view_count``, ``favorite_count`` and ``comment_count`` (each a
        string), plus ``steps`` (list of step-title strings) and
        ``supplies`` (list of supply strings). A scalar field whose element
        is missing from the page is reported as the string ``"None"``; the
        two lists are empty when nothing matches.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()

    # NOTE(review): decoding is forced to ISO-8859-1 as in the original code;
    # confirm the pages are not actually served as UTF-8.
    soup = BeautifulSoup(response.content, "html.parser",
                         from_encoding="iso-8859-1")

    # The first <iframe> on the page is taken to be the embedded video.
    iframe = soup.find("iframe")
    # .get() avoids a KeyError for an iframe that carries no src attribute.
    youtube_url = iframe.get("src") if iframe is not None else None

    step_titles = [
        heading.text
        for heading in soup.find_all("h2", {"class": "step-title"})
    ]

    # Supplies are read from the first <ul> inside the first step body; either
    # element may be missing, in which case the list stays empty.
    supply_list = []
    first_step_body = soup.find("div", {"class": "step-body"})
    if first_step_body is not None:
        supplies_ul = first_step_body.find("ul")
        if supplies_ul is not None:
            supply_list = [item.text for item in supplies_ul.find_all("li")]

    # str() keeps the historical output shape: absent values become "None".
    return {
        "header_title": str(_text_or_none(soup, "h1", "header-title")),
        "youtube_url": str(youtube_url),
        "view_count": str(_text_or_none(soup, "p", "view-count")),
        "favorite_count": str(_text_or_none(soup, "p", "favorite-count")),
        "comment_count": str(_text_or_none(soup, "p", "comment-count")),
        "steps": step_titles,
        "supplies": supply_list,
    }
# Pages to scrape. The result for the N-th entry (1-based) is written to
# "urlN.json" in the current working directory.
url_list = [
    "https://www.instructables.com/Building-a-Self-Driving-Boat-ArduPilot-Rover/",
    "https://www.instructables.com/Hydraulic-Craft-Stick-Box/",
    "https://www.instructables.com/How-to-Make-a-Self-Watering-Plant-Stand/",
]

for i, url in enumerate(url_list):
    # Scrape before opening the output file: opening with "w" creates or
    # truncates it immediately, so a failed scrape would otherwise leave an
    # empty, invalid JSON file behind.
    scraped = scrape_instructables(url)
    with open("url" + str(i + 1) + ".json", "w", encoding="utf-8") as f:
        f.write(json.dumps(scraped))