-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
46 lines (37 loc) · 1.47 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Simple breadth-first web crawler: fetches each page, stores its raw HTML
# in a SQLite database, and queues every hyperlink found on the page.
# NOTE(review): no robots.txt handling, rate limiting, or depth limit.
from urllib.request import urlopen
from urllib.parse import urljoin
import re
import sqlite3
import time
import sys

# Seed URL(s) for the crawl frontier.  The list is extended while the
# for-loop below iterates it, which is how new pages get visited.
urls = ["https://glittery-croquembouche-c25561.netlify.app/"]
# Tracks every URL ever queued, so duplicates are not appended to the
# frontier over and over (the original list grew without bound).
seen = set(urls)

# Compiled once instead of re-parsed on every page; matches the value of
# any href="..." or href='...' attribute.
HREF_RE = re.compile(r'''href=["'](.[^"']+)["']''')

# Open (or create) the on-disk database that holds crawled pages.
conn = sqlite3.connect('garlic.db')
c = conn.cursor()
# One row per crawled page: (url, raw HTML content).
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
# Index speeds up the per-URL "already crawled?" lookup below.
c.execute('''CREATE INDEX IF NOT EXISTS url_index ON websites (url)''')
conn.commit()

try:
    # Python's for-loop walks a list by index, so URLs appended during
    # iteration are visited too -- this gives breadth-first order.
    for url in urls:
        # Skip pages already stored (also dedupes across separate runs).
        c.execute('''SELECT 1 FROM websites WHERE url = ?''', (url,))
        if c.fetchone() is not None:
            continue
        try:
            # errors='replace' keeps pages that are not valid UTF-8
            # instead of silently discarding them.
            content = urlopen(url).read().decode('utf-8', errors='replace')
        except (OSError, ValueError):
            # Unreachable host, HTTP error, or malformed URL: skip this
            # page.  (Was a bare `except:`, which also swallowed
            # KeyboardInterrupt and made the crawler impossible to stop.)
            continue
        c.execute('''INSERT INTO websites VALUES (?, ?)''', (url, content))
        conn.commit()
        # Resolve relative links against the current page before queueing.
        # The original queued hrefs verbatim, so every relative link failed
        # in urlopen and was silently dropped by the bare except.
        for link in HREF_RE.findall(content):
            absolute = urljoin(url, link)
            # Only follow web links; ignore mailto:, javascript:, etc.
            if absolute.startswith(('http://', 'https://')) and absolute not in seen:
                seen.add(absolute)
                urls.append(absolute)
finally:
    # Always release the database handle, even on error or Ctrl-C.
    conn.close()