-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathurl_domain_counter.py
35 lines (28 loc) · 1.24 KB
/
url_domain_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
from urllib.parse import urlparse
from collections import Counter
from tkinter import Tk, filedialog
# Zero-based position of the spreadsheet column that holds the URLs (column C).
URL_COLUMN_INDEX = 2


def count_domains(urls):
    """Count how often each network location occurs in *urls*.

    Args:
        urls: Iterable of cell values. Entries that are not ``str`` (such as
            the NaN pandas uses for empty cells) are ignored.

    Returns:
        A ``collections.Counter`` mapping each ``netloc`` (as returned by
        ``urllib.parse.urlparse``) to its number of occurrences, in
        first-seen order.
    """
    counts = Counter()
    for value in urls:
        if not isinstance(value, str):
            continue
        try:
            netloc = urlparse(value.strip()).netloc
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. unbalanced IPv6
            # brackets); one bad cell must not abort the whole run.
            continue
        # Strings without a network location (scheme-less URLs, plain text)
        # would otherwise all be tallied under an empty "domain".
        if netloc:
            counts[netloc] += 1
    return counts


def main():
    """Ask for an Excel file, count the domains in column C, save the result.

    The input and output paths are chosen through Tkinter file dialogs.
    Cancelling either dialog prints a message and ends the run without
    writing anything.
    """
    root = Tk()
    root.withdraw()  # Hide the Tkinter main window; only the dialogs are shown
    try:
        file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
        if not file_path:
            print("Input file not selected.")
            return

        df = pd.read_excel(file_path)
        # Guard the positional lookup: a sheet with fewer than three columns
        # would make iloc raise IndexError.
        if df.shape[1] <= URL_COLUMN_INDEX:
            print("Input file has no third column (C) to read URLs from.")
            return

        domain_counts = count_domains(df.iloc[:, URL_COLUMN_INDEX])
        result_df = pd.DataFrame(list(domain_counts.items()), columns=['Domain', 'Count'])

        output_file_path = filedialog.asksaveasfilename(defaultextension=".xlsx", filetypes=[("Excel files", "*.xlsx *.xls")])
        if not output_file_path:
            print("Output file not selected.")
            return

        result_df.to_excel(output_file_path, index=False)
        print(f"Results saved to {output_file_path}.")
    finally:
        # Release the hidden root window even when a step above fails.
        root.destroy()


if __name__ == "__main__":
    main()