crawl.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Set the starting URL (include the scheme, otherwise requests raises MissingSchema)
base_url = "http://127.0.0.1:1234"

# Initialize a set to store visited URLs
visited_urls = set()

# Define a function to crawl a URL and check for dead links
def crawl(url):
    # Add the URL to the visited set
    visited_urls.add(url)

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the response was successful
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {url}")
        return

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all links on the page
    links = soup.find_all("a")

    # Crawl each link
    for link in links:
        href = link.get("href")
        if href is not None:
            # Join the link URL with the current URL to handle relative links
            full_url = urljoin(url, href)

            # Parse the full URL to check if it's in the same domain
            parsed_url = urlparse(full_url)
            if parsed_url.netloc == urlparse(base_url).netloc:
                # Check if the URL has already been visited
                if full_url not in visited_urls:
                    crawl(full_url)

# Start crawling from the base URL
crawl(base_url)

# Print the number of visited URLs
print(f"Visited {len(visited_urls)} URLs")