# internal-link-analysys.py
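"""Analyse a site's internal link structure from its XML sitemap.

Fetches every URL from a sitemap (or sitemapindex), crawls each page for
internal links, computes PageRank over the resulting link graph, visualises
it, and saves a CSV report of link counts and orphan pages.

Dependencies (the 'xml' parser used below requires lxml):
    pip install requests beautifulsoup4 lxml pandas networkx matplotlib

Usage: set sitemap_url in main() to your site's sitemap, then run the script.
"""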
import re
from urllib.parse import urljoin, urlparse

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch and parse a sitemap or sitemapindex, returning the set of page URLs
def get_sitemap_urls(sitemap_url):
    try:
        response = requests.get(sitemap_url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            urls = set()
            # If it's a sitemapindex, recursively fetch URLs from nested sitemaps
            if soup.find_all('sitemap'):
                for sitemap in soup.find_all('sitemap'):
                    nested_sitemap_url = sitemap.find('loc').text
                    urls.update(get_sitemap_urls(nested_sitemap_url))
            else:
                # Extract URLs from a regular sitemap
                for url_loc in soup.find_all('loc'):
                    url = url_loc.text
                    # Avoid crawling images and excluded URLs
                    if not is_image(url) and not should_exclude_url(url):
                        urls.add(url)
            return urls
        else:
            print(f"Failed to fetch sitemap: {sitemap_url}")
            return set()
    except Exception as e:
        print(f"Error fetching sitemap: {e}")
        return set()

# Return True if the URL points to an image (avoid crawling images)
def is_image(url):
    return bool(re.search(r'\.(jpg|jpeg|png|gif|bmp|svg|webp)$', url, re.IGNORECASE))

# Return True if the URL is an archive page (tag/category/author) to exclude
def should_exclude_url(url):
    return bool(re.search(r'/(tag|category|author)/', url, re.IGNORECASE))

# Fetch a page and return the set of internal links it contains
def get_links(url, base_domain):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find all anchor tags with href attributes
            links = set()
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Convert relative links to absolute
                full_url = urljoin(url, href)
                # Keep links on the same domain that are not images or excluded URLs
                if (base_domain in urlparse(full_url).netloc
                        and not is_image(full_url)
                        and not should_exclude_url(full_url)):
                    links.add(full_url)
            return links
        else:
            print(f"Failed to fetch {url}")
            return set()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return set()

# Crawl the site based on sitemap URLs
def crawl_sitemap(sitemap_urls, base_domain):
    crawled = set()
    internal_links_map = {}
    for current_url in sitemap_urls:
        if current_url in crawled:
            continue
        print(f"Crawling: {current_url}")
        links = get_links(current_url, base_domain)
        # Restrict targets to sitemap URLs only
        internal_links_map[current_url] = links.intersection(sitemap_urls)
        crawled.add(current_url)
    return internal_links_map

# Analyse the link structure: outgoing link counts and orphan pages
def analyse_links(internal_links_map):
    page_link_count = {page: len(links) for page, links in internal_links_map.items()}
    all_pages = set(internal_links_map.keys())
    linked_pages = {link for links in internal_links_map.values() for link in links}
    orphan_pages = all_pages - linked_pages  # Pages with no incoming links
    return page_link_count, orphan_pages

# Compute PageRank using networkx
def compute_pagerank(internal_links_map):
    G = nx.DiGraph()
    # Add every crawled page as a node so pages without links still get a score
    G.add_nodes_from(internal_links_map)
    # Add edges (from -> to) to the graph
    for page, links in internal_links_map.items():
        for link in links:
            G.add_edge(page, link)
    # Compute PageRank
    return nx.pagerank(G, alpha=0.85)

# Visualise the internal link structure
def visualise_internal_links(internal_links_map, pagerank):
    G = nx.DiGraph()
    # Add nodes and edges
    G.add_nodes_from(internal_links_map)
    for page, links in internal_links_map.items():
        for link in links:
            G.add_edge(page, link)
    # Draw the graph
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G)
    # Size nodes by PageRank, looked up per node so sizes match node order
    node_size = [pagerank.get(node, 0) * 10000 for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color="skyblue", alpha=0.8)
    # Draw edges and labels
    nx.draw_networkx_edges(G, pos, arrowstyle='->', arrowsize=10, edge_color="gray", alpha=0.7)
    nx.draw_networkx_labels(G, pos, font_size=10)
    plt.title("Internal Link Structure Visualised by PageRank")
    plt.show()

# Save report to CSV
def save_report(page_link_count, orphan_pages, pagerank, output_file="internal_link_report.csv"):
    df = pd.DataFrame(page_link_count.items(), columns=['Page', 'Internal Links Count'])
    df['PageRank'] = df['Page'].map(pagerank)
    df['Is Orphan'] = df['Page'].apply(lambda x: x in orphan_pages)
    df.sort_values(by='Internal Links Count', ascending=True, inplace=True)
    df.to_csv(output_file, index=False)
    print(f"Report saved to {output_file}")

# Main function
def main():
    sitemap_url = "https://psualatberat.com/sitemap_index.xml"  # Replace with your site's sitemap URL
    base_domain = urlparse(sitemap_url).netloc
    # Get URLs from the sitemap (supports sitemapindex)
    sitemap_urls = get_sitemap_urls(sitemap_url)
    if not sitemap_urls:
        print("No URLs found in the sitemap.")
        return
    # Crawl the site based on the sitemap
    internal_links_map = crawl_sitemap(sitemap_urls, base_domain)
    # Analyse link structure
    page_link_count, orphan_pages = analyse_links(internal_links_map)
    # Compute PageRank
    pagerank = compute_pagerank(internal_links_map)
    # Visualise the internal link structure
    visualise_internal_links(internal_links_map, pagerank)
    # Save the results
    save_report(page_link_count, orphan_pages, pagerank)


if __name__ == "__main__":
    main()