-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsearchForStringMatchInFAST.py
101 lines (90 loc) · 3.76 KB
/
searchForStringMatchInFAST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import requests
import csv
from datetime import datetime
import argparse
from fuzzywuzzy import fuzz
import pandas as pd
# Some config for FAST APIs.
api_base_url = "http://fast.oclc.org/searchfast/fastsuggest"
fast_uri_base = "http://id.worldcat.org/fast/{0}"

# Command-line interface: the input CSV can be passed with -f/--file.
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file', help='Enter filename with csv.')
args = parser.parse_args()

# Fall back to an interactive prompt when no (or an empty) filename was given.
filename = args.file or input("Enter filename (including '.csv'): ")

# Timestamp used to build the name of the results file; taken after the
# filename is known, exactly where the script has always taken it.
dt = f'{datetime.now():%Y-%m-%d_%H.%M.%S}'
# Find exact matches from FAST API.
def fastExact_function(search_subject):
    """Record the first exact (or near-exact) FAST heading for a subject.

    Queries the FAST suggest endpoint for ``search_subject`` and walks the
    returned ``docs`` in order. The first doc whose ``auth`` value equals
    the subject, or whose fuzzywuzzy ``token_sort_ratio`` against the
    subject is at least 95, is written into the module-level
    ``result_dict`` under the keys ``auth_name`` and ``fast_id``. Nothing
    is written when no doc qualifies or the response cannot be used.

    Args:
        search_subject: Subject string to look up.

    Returns:
        None. The outcome is communicated through ``result_dict``.
    """
    # Imported locally so this function does not rely on a new file-level
    # import.
    from urllib.parse import quote

    # Percent-encode the subject so characters such as '&', '#' or '+'
    # remain part of the query value instead of being read as URL syntax.
    fast_url = api_base_url + '?&query=' + quote(search_subject)
    fast_url += '&queryIndex=suggestall&queryReturn=suggestall,idroot,auth,tag,raw&suggest=autoSubject&rows=5&wt=json'

    try:
        data = requests.get(fast_url).json()
    except ValueError:
        # The body was not valid JSON; report it and treat it as "no match"
        # so the remaining rows can still be processed.
        print('FAST returned a non-JSON response for: ' + search_subject)
        return

    response = data.get('response') if isinstance(data, dict) else None
    if not isinstance(response, dict):
        return
    # A missing numFound is treated as zero hits rather than compared
    # against an int (which would raise TypeError).
    if not (response.get('numFound') or 0) > 0:
        return

    for info in response.get('docs') or []:
        auth_name = info.get('auth')
        fast_id = info.get('idroot')
        ratio = fuzz.token_sort_ratio(auth_name, search_subject)
        # Accept scores of 95 and above; testing for exactly 95 would
        # reject the even closer matches scoring 96-100.
        if auth_name == search_subject or ratio >= 95:
            result_dict['auth_name'] = auth_name
            result_dict['fast_id'] = fast_id
            break
# Find close matches from FAST API
def fastClose_function(search_subject):
    """Record up to five candidate FAST headings for a subject.

    Queries the FAST suggest endpoint for ``search_subject`` and copies the
    ``auth`` and ``idroot`` values of the first five returned ``docs`` into
    the module-level ``result_dict`` under the keys ``<n>_auth_name`` and
    ``<n>_fast_id``, where ``<n>`` is the zero-based position of the doc.
    Nothing is written when the response reports no hits or cannot be used.

    Args:
        search_subject: Subject string to look up.

    Returns:
        None. The outcome is communicated through ``result_dict``.
    """
    # Imported locally so this function does not rely on a new file-level
    # import.
    from urllib.parse import quote

    # Percent-encode the subject so characters such as '&', '#' or '+'
    # remain part of the query value instead of being read as URL syntax.
    fast_url = api_base_url + '?&query=' + quote(search_subject)
    fast_url += '&queryIndex=suggestall&queryReturn=suggestall,idroot,auth,tag,raw&suggest=autoSubject&rows=5&wt=json'

    try:
        data = requests.get(fast_url).json()
    except ValueError:
        # The body was not valid JSON; report it and record no candidates
        # so the remaining rows can still be processed.
        print('FAST returned a non-JSON response for: ' + search_subject)
        return

    response = data.get('response') if isinstance(data, dict) else None
    if not isinstance(response, dict):
        return
    # A missing numFound is treated as zero hits rather than compared
    # against an int (which would raise TypeError).
    if not (response.get('numFound') or 0) > 0:
        return

    candidates = response.get('docs') or []
    for count, info in enumerate(candidates[:5]):
        prefix = str(count) + '_'
        result_dict[prefix + 'auth_name'] = info.get('auth')
        result_dict[prefix + 'fast_id'] = info.get('idroot')
# Driver: look up every subject in the input CSV and collect one result
# dict per row. The input must provide 'itemID' and 'subjects' columns.
result_list = []
# newline='' lets the csv module handle line endings itself, so quoted
# fields containing embedded newlines are parsed correctly.
with open(filename, newline='') as itemMetadataFile:
    itemMetadata = csv.DictReader(itemMetadataFile)
    for row in itemMetadata:
        # result_dict is deliberately a module-level name: both lookup
        # functions write their findings into it rather than returning them.
        result_dict = {}
        itemID = row['itemID']
        result_dict['itemID'] = itemID
        search_subject = row['subjects'].strip()
        result_dict['old_subject'] = search_subject
        print(search_subject)
        # Improve quality of API search.
        search_subject = search_subject.replace(' -- ', ' ')
        search_subject = search_subject.replace('-', ' ')
        search_subject = search_subject.rstrip('.')
        # Try for an exact match first; fall back to the list of close
        # candidates only when no exact match was recorded.
        fastExact_function(search_subject)
        if result_dict.get('auth_name') is None:
            fastClose_function(search_subject)
        print(result_dict)
        result_list.append(result_dict)

# One row per input item; columns are the union of keys seen across rows.
df = pd.DataFrame(result_list)
print(df.columns)
# head must be called: printing the bare attribute shows the bound method's
# repr instead of a preview of the first rows.
print(df.head())
df.to_csv('fastResults_' + dt + '.csv', index=False)