forked from seljaseppala/eu_corpus_compiler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_cellar_ids.py
executable file
·137 lines (100 loc) · 4.15 KB
/
get_cellar_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
# coding=utf-8
"""
Program to send a SPARQL query to the EU SPARQL endpoint and
return a list of CELLAR IDs and other information related to each document.
It includes a function to create a list of CELLAR ids to be used for downloading the corresponding files.
The program also has a function to return a list of CELLAR ids
from a CSV file that contains a set of information about each document.
"""
import os
import pandas as pd
from datetime import datetime
from utils.file_utils import text_to_str, to_json_output_file, print_list_to_file
from SPARQLWrapper import SPARQLWrapper, JSON, POST
def get_cellar_info_from_endpoint(sparql_query):
    """
    Run the given sparql_query against the EU SPARQL endpoint
    and return the converted JSON results.
    The query is sent with the POST method and JSON is requested
    as the return format.
    :param sparql_query: str
    :return: json dict
    """
    # Endpoint URL (the original note dates it 2020-06-12).
    endpoint_url = "http://publications.europa.eu/webapi/rdf/sparql"

    # Configure the SPARQLWrapper client for this single query.
    client = SPARQLWrapper(endpoint_url)
    client.setQuery(sparql_query)
    client.setMethod(POST)
    client.setReturnFormat(JSON)

    # Execute the query and convert the response to the requested format.
    return client.query().convert()
def get_cellar_ids_from_csv_file(file_path):
    """
    Get the list of CELLAR ids from the CSV file in the given file_path.
    Each id is the last '/'-separated segment of a value in the
    'cellarURIs' column. Rows with no value in that column are skipped.
    Input file format:
    cellarURIs,lang,mtypes,workTypes,subjects,subject_ids
    :param file_path: file path str
    :return: list of CELLAR id str
    """
    # Read the CSV into a pandas data frame (df)
    df = pd.read_csv(file_path, delimiter=',')
    # An empty cellarURIs cell is read as NaN (a float), which has no
    # split() method; drop such rows instead of failing on them.
    uri_column = df.loc[:, 'cellarURIs'].dropna()
    # The CELLAR id is whatever follows the last '/' of the URI.
    return [uri.split('/')[-1] for uri in uri_column]
def get_cellar_ids_from_json_results(cellar_results):
    """
    Create a list of CELLAR ids from the given cellar_results JSON dictionary and return the list.
    The bindings are read from cellar_results["results"]["bindings"]; the id of
    each binding is the last '/'-separated segment of its "cellarURIs" value.
    :param cellar_results: dict
    :return: list of cellar ids
    """
    bindings = cellar_results["results"]["bindings"]
    # Iterate over the bindings directly rather than indexing by position.
    return [binding["cellarURIs"]["value"].split('/')[-1] for binding in bindings]
def query_results_to_json(query_results):
    """
    Output query results to json file.
    The file is sparql_query_results/query_results.json; the directory
    is created first if it does not exist yet.
    :param query_results: dict
    :return: None
    """
    output_path = 'sparql_query_results/query_results.json'
    # Make sure the output directory exists before handing the path to the writer.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Usage: to_json_output_file(file_name, data)
    to_json_output_file(output_path, query_results)
def cellar_ids_to_file(id_list, timestamp=''):
    """
    Output the list of CELLAR ids to txt file.
    The file is written to the cellar_ids/ directory, which is created if
    it does not exist, and is named cellar_ids_<timestamp>.txt. With the
    default empty timestamp the name is cellar_ids_.txt.
    :param id_list: list
    :param timestamp: str, optional suffix for the output file name
    :return: None
    """
    dir_name = "cellar_ids/"
    os.makedirs(os.path.dirname(dir_name), exist_ok=True)
    # Insert the timestamp between the fixed prefix and the extension so
    # that successive runs can write to distinct files.
    file_name = dir_name + 'cellar_ids_{}.txt'.format(timestamp)
    # Usage: print_list_to_file(file_name, data)
    print_list_to_file(file_name, id_list)
if __name__ == '__main__':
    # Run timestamp (YYYYmmdd-HHMMSS), passed to the id-list writer below.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

    # Load the SPARQL query text from file and send it to the EU SPARQL endpoint
    # to retrieve the CELLAR information records.
    sparql_query = text_to_str('sparql_queries/financial_domain_sparql_2019-01-07.rq')
    sparql_query_results = get_cellar_info_from_endpoint(sparql_query)

    # Keep a copy of the raw query results as a JSON file.
    query_results_to_json(sparql_query_results)

    # Build the list of CELLAR ids from the JSON query results.
    id_list = get_cellar_ids_from_json_results(sparql_query_results)

    # Alternative source for the ids: if a CSV file with the columns
    # cellarURIs,lang,mtypes,workTypes,subjects,subject_ids is already
    # available (e.g., copy-pasted from browser results, such as
    # 'sparql_query_results/query_results_2019-01-07.csv'), the ids can be
    # read from it with get_cellar_ids_from_csv_file(path) instead.

    # Write the CELLAR ids to a txt file, with each id on a new line.
    # NOTE(review): this call passes two arguments; confirm that
    # cellar_ids_to_file accepts a timestamp argument.
    cellar_ids_to_file(id_list, timestamp)