generated from shuding/nextra-docs-template
-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #105 from sei-protocol/Cordt-actions
Add github actions workflow to crawl docs pages for broken URLs
- Loading branch information
Showing
2 changed files
with
93 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Weekly scheduled workflow that crawls the docs pages for broken URLs
# by running scripts/urlcheck.py; findings are printed to the job log.
name: Broken URL Check

on:
  schedule:
    # Runs every Monday at 00:00 UTC
    - cron: '0 0 * * 1'
  workflow_dispatch: # Allows manual triggering of the workflow

defaults:
  run:
    shell: bash

jobs:
  url-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Latest available Python 3 release is sufficient for the script.
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Run link checker
        run: |
          python scripts/urlcheck.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import os
import re
import socket
from urllib.parse import urlsplit

import requests
|
||
def check_url_status(url):
    """Return ``(status_code, reason)`` for *url*, or ``(None, error)``.

    A HEAD request is tried first because it is cheap, but many servers
    reject HEAD with 405 Method Not Allowed even though the page exists,
    which previously produced false broken-link reports.  On a 405 the
    check is retried with a streamed GET (the body is not downloaded).

    Returns:
        tuple: ``(int status_code, str reason)`` on any HTTP response,
        or ``(None, str error_message)`` when the request itself failed
        (DNS error, timeout, connection refused, ...).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        if response.status_code == 405:
            # Server disallows HEAD; fall back to GET. stream=True defers
            # downloading the response body.
            response = requests.get(url, allow_redirects=True, timeout=5, stream=True)
        return response.status_code, response.reason
    except requests.RequestException as e:
        return None, str(e)
|
||
def find_urls(text):
    """Extract all http(s) URLs from *text* and return them as a list.

    URLs embedded in prose are frequently followed by punctuation that
    the greedy character class would otherwise swallow (e.g.
    ``see https://example.com.`` matched ``https://example.com.``),
    producing spurious broken-link reports.  Trailing sentence
    punctuation is therefore stripped from each match.
    """
    # Only match URLs starting with http:// or https://; stop at
    # whitespace, quotes, angle brackets and closing parens.
    url_pattern = re.compile(r'https?://[^\s"\'<>\)]*')
    return [match.rstrip('.,;:!?') for match in url_pattern.findall(text)]
|
||
def is_valid_url(url):
    """Return True if *url*'s host resolves in DNS, else False.

    Uses ``urlsplit`` to extract the hostname so that URLs with an
    explicit port or userinfo are handled correctly: the previous
    regex-based extraction passed the raw netloc (``host:port``) to
    ``socket.gethostbyname``, which always fails for such URLs.
    """
    try:
        host = urlsplit(url).hostname  # None when url has no netloc
        if not host:
            return False
        socket.gethostbyname(host)  # raises socket.gaierror if unresolvable
        return True
    except (socket.gaierror, ValueError):
        # ValueError covers malformed netlocs (e.g. invalid port specs).
        return False
|
||
def check_files_in_directory(directory):
    """Walk *directory* and return a list of broken-URL findings.

    Every ``.md``/``.mdx`` file under *directory* is scanned line by
    line.  Each finding is a dict with ``file``, ``line``, ``url``,
    ``status_code`` and ``reason`` keys.  Status codes 200, 403 and 415
    are treated as acceptable and skipped, as are URLs whose host does
    not resolve and requests that failed outright (status is None).
    """
    acceptable = {200, 403, 415}  # excluded from the report
    findings = []

    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            # Check both .md and .mdx files
            if not filename.endswith(('.md', '.mdx')):
                continue
            path = os.path.join(root, filename)
            with open(path, 'r', encoding='utf-8') as handle:
                for lineno, text in enumerate(handle, 1):
                    for url in find_urls(text):
                        if not is_valid_url(url):
                            continue
                        status, reason = check_url_status(url)
                        if status and status not in acceptable:
                            findings.append({
                                'file': path,
                                'line': lineno,
                                'url': url,
                                'status_code': status,
                                'reason': reason,
                            })
    return findings
|
||
def generate_report(report):
    """Print each broken-URL finding to stdout, separated by a rule.

    *report* is the list of dicts produced by check_files_in_directory.
    """
    separator = "-" * 40
    for entry in report:
        block = (
            f"File: {entry['file']}, Line: {entry['line']}",
            f"URL: {entry['url']}",
            f"Status Code: {entry['status_code']}, Reason: {entry['reason']}",
            separator,
        )
        print("\n".join(block))
|
||
if __name__ == "__main__":
    # Scan the docs tree and print any broken URLs that were found.
    target_directory = './pages/'  # path to check
    findings = check_files_in_directory(target_directory)
    generate_report(findings)