AutoCite_CLI.py
# Citation Machine
import re, datetime, urllib.request, sys
from bs4 import BeautifulSoup
from dateutil import parser
'''
web_address = "https://www.straitstimes.com/singapore/condo-conflicts"
web_address = "https://www.channelnewsasia.com/news/asia/southeast-asian-leaders-meet-us-china-trade-war-asean-summit-12057538"
web_address = "https://blog.seedly.sg/telegram-channels-that-every-millennials-singapore-must-have/"
web_address = "https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href"
web_address = "https://www.channelnewsasia.com/news/asia/southeast-asian-leaders-meet-us-china-trade-war-asean-summit-12057538"
web_address = "https://www.dissentmagazine.org/online_articles/can-democratic-socialism-rise-in-rural-america"
'''
def citation_components(web_address):
    req = urllib.request.Request(web_address, headers={"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Page title from <title> tag | Website title from <title> tag or from URL
    try:
        title_text = str(soup.title.contents[0])  # Text inside the <title> element
    except (AttributeError, IndexError):
        title_text = ""  # In case there is no <title> tag
    title_segments = title_text.split(" - ")
    if len(title_segments) > 1:  # If the website title comes after "-" in the page title
        website_title = title_segments[-1]
        page_title = " - ".join(title_segments[:-1])
    else:
        try:
            # Captures the label just before the top-level domain, e.g. "straitstimes" in straitstimes.com
            website_title = re.search(r"([^.\/]+?)(?:\.(?:sg|net|com|org|gov|edu|int|eu|us))+", web_address).group(1)
        except AttributeError:
            # Fall back to the full host name if no known top-level domain matched
            website_title = re.search(r"(?:http[s]*:\/\/)([^\/]+)", web_address).group(1)
        website_title = website_title.capitalize()  # Capitalise the first letter of the string
        page_title = title_text
    if page_title == "":
        page_title = website_title  # For when there is no <title> tag
    page_title = page_title.strip()
    # Search for an author via the first href containing "author"
    first_name = last_name = ""  # Default when no author link is found
    try:
        for a in soup.find_all('a', href=True):
            if "author" in a['href']:
                author_path = a['href']
                author_name = re.search(r"\/([^\/]+)$", author_path).group(1)
                author_name = author_name.split("-")
                first_name = author_name[0].capitalize()
                last_name = author_name[-1].capitalize()
                concat_name = first_name + last_name
                if '=' in concat_name or '?' in concat_name or '+' in concat_name:
                    first_name = last_name = ""  # Discard query-string fragments mistaken for names
                else:
                    break  # Keep the first valid author found
    except Exception:
        first_name = last_name = ""
    # Accessed date
    today = datetime.date.today()
    date_accessed = today.strftime("%B %d, %Y")
    # Published date
    date_published = ""
    try:
        time_element = soup.find('time')
        if time_element.has_attr('datetime') and time_element["datetime"] != "":  # Take the date from the attribute
            web_datetime = time_element["datetime"]
            date = parser.parse(web_datetime)
        else:  # Take the date from the tag contents
            date_published = time_element.contents[0]
            date = parser.parse(date_published)
        date_published = date.strftime("%B %d, %Y")
    except Exception:
        date_published = ""
    return (first_name, last_name, page_title, website_title, date_published, date_accessed)
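# A minimal sketch of calling citation_components on its own (assuming network access
# and a page that exposes a <title>, an author link, and a <time> tag):
#
#   first, last, page, site, published, accessed = citation_components(
#       "https://www.straitstimes.com/singapore/condo-conflicts")
#   print(last + ", " + first, "-", page, "(" + site + ")")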
def chicago_compile(web_address):
    first_name, last_name, page_title, website_title, date_published, date_accessed = citation_components(web_address)
    # Compiling the citation
    if first_name != "" and last_name != "":
        citation = last_name + ", " + first_name + ". "
    else:
        citation = ""
    citation += '"' + page_title + '." ' + website_title + ", "
    if date_published != "":
        citation += date_published + ", "
    citation += web_address + ". Accessed " + date_accessed + "."
    return citation
def apa_compile(web_address):
    first_name, last_name, page_title, website_title, date_published, date_accessed = citation_components(web_address)
    no_author = False
    if first_name != "" and last_name != "":
        citation = last_name + ", " + first_name[0] + ". "
    else:
        citation = page_title + ' '  # With no author, the title leads the citation
        no_author = True
    if date_published != "":
        citation += "(" + date_published + "). "
    else:
        citation += "(n.d.). "
    if not no_author:
        citation += page_title + ". "
    citation += "Retrieved from " + web_address
    return citation
# Main
if len(sys.argv) == 1:
    print('''
    USAGE: AutoCite_CLI.py URL FORMAT
    Possible formats:
        chicago (default)
        apa
    Notes:
        Ensure that URL begins with either "http://" or "https://"
    ''')
    sys.exit()
web_address = sys.argv[1]
print("Citing", web_address + "...")
try:
    citation_format = sys.argv[2].lower()
    print("Citation format set as", citation_format)
except IndexError:
    citation_format = "chicago"
    print("Citation format defaulted to", citation_format)
if citation_format == "chicago":
    print(chicago_compile(web_address))
else:
    print(apa_compile(web_address))
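# Example invocations (URLs are illustrative; output depends on each page's markup):
#   python AutoCite_CLI.py https://www.straitstimes.com/singapore/condo-conflicts
#   python AutoCite_CLI.py https://www.straitstimes.com/singapore/condo-conflicts apa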