ScraPDF.py
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin  # resolves relative links against a base URL


def download_file(url, filename):
    """Downloads a file from the specified URL and saves it with the given filename.

    Args:
        url (str): The URL of the file to download.
        filename (str): The name of the file to save.
    """
    try:
        # A timeout keeps a stalled server from hanging the download forever
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()  # Raise an exception for failed downloads
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        print(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {filename}: {e}")


def check_robots_txt(url):
    """Checks the website's robots.txt for download restrictions.

    Args:
        url (str): The base URL of the website.

    Returns:
        bool: True if downloading is allowed, False otherwise.
    """
    robots_url = f"{url}/robots.txt"
    try:
        response = requests.get(robots_url, timeout=30)
        if response.status_code == 200:
            for line in response.text.splitlines():
                if line.lower().startswith("disallow:"):
                    parts = line.split()
                    # An empty "Disallow:" rule permits everything, so only
                    # rules that name a path can block the crawl
                    if len(parts) > 1 and parts[1] in url:
                        print(f"Downloading from {url} is disallowed by robots.txt")
                        return False
        else:
            print(f"robots.txt not found or inaccessible at {robots_url}")
    except requests.exceptions.RequestException as e:
        print(f"Error checking robots.txt: {e}")
    return True
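

# A sketch of the same check built on the standard library's
# urllib.robotparser, which implements the robots.txt matching rules more
# faithfully than the substring test above. The function name robots_allows
# and the "*" user agent are illustrative assumptions, not part of the
# original script.
from urllib.robotparser import RobotFileParser

def robots_allows(base_url, target_url):
    """Returns True if robots.txt at base_url permits fetching target_url."""
    parser = RobotFileParser()
    parser.set_url(f"{base_url}/robots.txt")
    try:
        parser.read()  # fetches and parses robots.txt
    except OSError:
        return True  # treat an unreachable robots.txt as permissive
    return parser.can_fetch("*", target_url)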


def download_images_and_pdfs(url):
    """Downloads PDFs, JPGs, and PNGs from the specified URL and its subpages.

    Args:
        url (str): The base URL of the website to crawl.
    """
    if not check_robots_txt(url):
        return
    visited = set()  # Keep track of visited URLs to avoid duplicates

    def crawl(page_url):
        if page_url in visited:
            return
        visited.add(page_url)
        try:
            response = requests.get(page_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and href.endswith(('.pdf', '.jpg', '.png')):
                    # urljoin handles relative paths correctly, unlike naive
                    # string concatenation
                    download_url = urljoin(page_url, href)
                    filename = os.path.basename(download_url)
                    if filename:
                        download_file(download_url, filename)
            for img in soup.find_all('img'):
                src = img.get('src')
                if src and src.endswith(('.jpg', '.png')):
                    download_url = urljoin(page_url, src)
                    filename = os.path.basename(download_url)
                    if filename:
                        download_file(download_url, filename)
            # Recursively crawl subpages (be mindful of crawl depth and politeness)
            for link in soup.find_all('a', href=True):
                subpage_url = urljoin(page_url, link['href'])
                # Stay within the base site; the outer url is the crawl root
                if subpage_url.startswith(url) and subpage_url not in visited:
                    crawl(subpage_url)
        except requests.exceptions.RequestException as e:
            print(f"Error crawling {page_url}: {e}")

    crawl(url)
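

# Politeness sketch: crawl() above issues requests back to back. One minimal
# mitigation (the 1-second default delay is an illustrative assumption, not a
# value from the original script) is a throttled drop-in for requests.get:
import time

def polite_get(url, delay=1.0):
    """Sleeps briefly before fetching url, to avoid hammering the server."""
    time.sleep(delay)  # pause between successive requests
    return requests.get(url, timeout=30)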


# Example usage (replace with the actual website URL)
if __name__ == "__main__":
    website_url = "https://www.efluniversity.ac.in/images/Meet A Leading Light/"
    download_images_and_pdfs(website_url)