authorizeFASTAndLCAuthorityHeadings.py
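# Validates headings against the Library of Congress (id.loc.gov) and FAST
# (id.worldcat.org) authority services. Reads a CSV of terms, checks each one
# against the vocabulary named in its row, and writes any matched URIs and
# labels to a timestamped results CSV.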
import requests
import pandas as pd
import argparse
from datetime import datetime
from bs4 import BeautifulSoup as Soup
from rdflib import Namespace, Graph, URIRef
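# Take the input CSV path from -f/--file, or prompt for it interactively.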
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file')
args = parser.parse_args()
if args.file:
    filename = args.file
else:
    filename = input('Enter filename (including \'.csv\'): ')
dt = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
df_1 = pd.read_csv(filename, header=0)
searchTerms = df_1.to_dict('records')
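# Each CSV row becomes a dict; the loop below expects 'vocab' and 'term' columns.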
# Configuration for requests.
headers = {'User-Agent': 'Custom user agent'}
lc = requests.Session()
ft = requests.Session()
baseURL = 'http://id.loc.gov/authorities/'
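# FAST SRU search URL pieces: the term is wrapped in URL-encoded quotes and the
# RDF record schema is requested so identifiers and labels can be parsed out.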
fastURL = 'http://id.worldcat.org/fast/search?query=cql.any+all+%22'
fastPara = '%22&fl=oclc.heading&recordSchema=info:srw/schema/1/rdf-v2.0'
mads = Namespace('http://www.loc.gov/mads/rdf/v1#')
auth = URIRef('http://id.loc.gov/authorities/')
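# Map the 'vocab' values used in the input CSV to id.loc.gov authority paths.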
authorities = {'lcnaf': 'names',
               'lcsh': 'subjects',
               'genre': 'genreForms'}
def get_graph(url):
    # Fetch an RDF document from id.loc.gov and parse it into an rdflib Graph.
    # Returns None if the request times out.
    g = Graph()
    try:
        response = lc.get(url, timeout=30, headers=headers)
        parsed_graph = g.parse(data=response.text)
    except requests.exceptions.Timeout:
        parsed_graph = None
    return parsed_graph
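# Check each term: LC vocabularies go through the known-label service and the
# parsed MADS graph; 'fast' terms go through the FAST SRU search instead.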
all_items = []
for item in searchTerms:
    vocab = item.get('vocab')
    searchTerm = item.get('term')
    print(vocab, searchTerm)
    searchTerm = searchTerm.rstrip('.')
    result = {'term': searchTerm}
    authType = authorities.get(vocab)
    if vocab != 'fast':
        # Resolve the label against the LC known-label service.
        url = baseURL + authType + '/label/' + searchTerm
        data = lc.get(url, timeout=30, headers=headers)
        foundName = data.ok
        if foundName:
            newURL = data.url.replace('.html', '')
            print(newURL)
            graph = get_graph(newURL + '.nt')
            if graph is not None:
                # Record the URI and label only on an exact match of the
                # authoritative label within the expected authority file.
                for subject, label in graph.subject_objects(mads.authoritativeLabel):
                    if auth + authType in subject:
                        if label.value == searchTerm:
                            print('Heading validated')
                            result['authURI'] = subject
                            result['authLabel'] = label.value
    else:
        # Query the FAST SRU endpoint and parse the first returned record.
        data = ft.get(fastURL + searchTerm + fastPara)
        soup = Soup(data.content, features='lxml')
        record = soup.find('record')
        if record is not None:
            identifier = record.find('dct:identifier').string
            authLabel = str(record.find('skos:preflabel').string)
            print(authLabel)
            if authLabel == searchTerm:
                print('Heading validated')
                result['authLabel'] = authLabel
                result['authURI'] = identifier
    all_items.append(result)
df = pd.DataFrame.from_dict(all_items)
print(df.columns)
print(df.head())
df.to_csv('authorizedHeadingResults_' + dt + '.csv', index=False)
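# Example invocation (illustrative; the filename and rows are assumptions,
# not taken from the script itself):
#   python authorizeFASTAndLCAuthorityHeadings.py -f headings.csv
# where headings.csv might look like:
#   vocab,term
#   lcsh,Cooking
#   fast,Cooking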