-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheu_rules_metadata_extractor.py
291 lines (240 loc) · 11.6 KB
/
eu_rules_metadata_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""
Script to extract metadata of EU legislative documents from the CELLAR database
SPARQL endpoint URL to access CELLAR is here: http://publications.europa.eu/webapi/rdf/sparql
If any fields have missing values, an attempt is made to scrape these from the EURLEX website.
Website: http://eur-lex.europa.eu/
"""
import csv
from SPARQLWrapper import SPARQLWrapper, TURTLE, JSON
from rdflib import Graph, Literal
import pandas as pd
import os
import requests
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import argparse
import sys
from os.path import exists
argParser = argparse.ArgumentParser(description='EURLEX PDF and HTML legislative documents metadata downloader')
required = argParser.add_argument_group('required arguments')
required.add_argument("-in", "--input", required=True, help="Path to input CSV file (single column, no header, list of celex identifiers). Find more info about CELEX identifiers here: http://eur-lex.europa.eu/content/help/eurlex-content/celex-number.html")
required.add_argument("-out", "--output", required=True, help="Path to a CSV file to store the metadata in e.g. 'path/to/metadata.csv'. ")
args = argParser.parse_args()
if args.input is None:
sys.exit('No input file specified. Type "python eu_rules_metadata_extractor.py -h" for usage help.')
if args.output is None:
sys.exit('No output file specified. Type "python eu_rules_metadata_extractor.py -h" for usage help.')
IN_CELEX_FILE = str(args.input)
OUT_METADATA_FILE = str(args.output)
# import list of celex numbers of legislation for which to extract metadata
file = open(IN_CELEX_FILE, "r", encoding='utf-8')
data = list(csv.reader(file, delimiter=","))
file.close()
celex_nums = []
for item in data:
celex_nums.append(item[0])
# Todo: if possible, extract the number of times that a file has been discussed in the Council or its preparatory bodies (under Procedure tab)
# Todo: if possible, extract the responsible EP committee
# the names of properties in CELLAR use the CDM ontology terminology / ontology: https://op.europa.eu/en/web/eu-vocabularies/cdm
# the names used on the EURLEX website are different than the CDM ones. We specify a mapping here:
property_mapping = {
"celex" : "http://publications.europa.eu/ontology/cdm#resource_legal_id_celex",
"author" : "http://publications.europa.eu/ontology/cdm#work_created_by_agent",
"responsible_body" : "http://publications.europa.eu/ontology/cdm#regulation_service_responsible",
"form" : "http://publications.europa.eu/ontology/cdm#resource_legal_type",
"title" : "http://publications.europa.eu/ontology/cdm#work_title", # (or scrape by p tag with id = "englishTitle")
"addressee" : "http://publications.europa.eu/ontology/cdm#resource_legal_addresses_country", # only for decisions - (get string from XML result)
"date_adoption" : "http://publications.europa.eu/ontology/cdm#work_date_document",
"date_in_force" : "http://publications.europa.eu/ontology/cdm#resource_legal_date_entry-into-force",
"date_end_validity" : "http://publications.europa.eu/ontology/cdm#resource_legal_date_end-of-validity", # (and cdm:resource_legal_in-force)
"directory_code" : "http://publications.europa.eu/ontology/cdm#resource_legal_is_about_concept_directory-code", # (get string from XML result)
"procedure_code" : "http://publications.europa.eu/ontology/cdm#procedure_code_interinstitutional_basis_legal", # (or cdm:resource_legal_information_miscellaneous)
"eurovoc" : "http://publications.europa.eu/ontology/cdm#work_is_about_concept_eurovoc", # (get string from XML result)
"subject_matters" : "http://publications.europa.eu/ontology/cdm#resource_legal_is_about_subject-matter" # (get string from XML result)
}
def get_title(celex):
"""
Function to extract the title summary of legislation from EURLEX website if it is not available in CELLAR
Parameters:
- celex: str
unique identifier for legislation e.g. '32016R0679'
Return:
- Summary title (str) of legislation
"""
reg_base_url = "https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{}"
r = None
try:
r = requests.get(reg_base_url.format(celex))
except:
return ''
if r is not None:
soup = BeautifulSoup(r.content, 'lxml-xml')
res = soup.find('meta', attrs={"property" : "eli:title", "lang" : "en"})
if res:
return res.get('content')
else:
return ''
else:
return ''
def execute_sparql_query_and_return_results(sparql, query):
"""
Function to get metadata for a specific piece of legislation (indicated by a unique celex number).
This is done by executing a sparql query
Parameters:
- sparql: SPARQLWrapper instance
initialised with endpoint URL
- query: str
SPARQL query string to execute
Returns:
- SPARQL query results in TURTLE format
"""
sparql.setReturnFormat(TURTLE)
sparql.setQuery(query)
try:
results = sparql.query().convert()
return results.decode("utf-8")
except Exception as e:
print(e)
return -1
def get_string_label(sparql, uri, pred, celex_num):
"""
Gets the string representation of a coded metadata field for legislation.
For example, the directory code field for legislation is a numerical identifier for the policy area.
An example directory code looks like this: '11.60.40.20' and its URL representation
looks like this: http://publications.europa.eu/resource/authority/dir-eu-legal-act/11604020
The string label for this code is 'Anti-dumping measures'
Parameters:
- sparql : SPARQLWrapper instance
initialised with endpoint URL
- uri : str
URL representation of the directory code
- pred : str
The metadata field (using the EURLEX name for that field and not the CDM one)
- celex_num : str
The CELEX identifier for the legislation
Returns:
- String representation of the given metadata property value
"""
if (pred == 'directory_code'):
# for directory code we only need the most general policy area
# not the most fine-grained topic of the legislation
dc_uri_parts = uri.split('/')
new_dc_uri_parts = dc_uri_parts[:-1]
# print('new_dc_uri_parts: ', new_dc_uri_parts)
dc = dc_uri_parts[-1][:2] # most general part of directory code
# print('dc: ', dc)
new_dc_uri_parts.append(dc)
uri = '/'.join(new_dc_uri_parts)
# print('uri: ', uri)
sparql.setReturnFormat(JSON) # return JSON format response
query = """
SELECT (str(?o) as ?label) WHERE {{
<{}> skos:prefLabel ?o .
FILTER (lang(?o) = "en")
}}
"""
query = query.format(uri) # insert URI value into query
sparql.setQuery(query)
try: # execute query
results = sparql.query().convert()
except:
return ''
# return query results
val = results["results"]["bindings"][0]["label"]["value"]
# print('val: ', val)
return val
def get_metadata_for_legal_acts(celex_nums, endpoint_url):
"""
Extracts all the metadata for a given legislative document
Parameters:
- celex_nums : List
list of unique celex identifiers for legislation to extract
- endpoint_url : str
SPARQL endpoint URL (CELLAR) where to extract the metadata from
Returns:
- Metadata as (list of lists) where each list in the larger list
represents all the metadata property values for a single legislative document
(the list represents a table row of values for that legislative document)
"""
sparql = SPARQLWrapper(endpoint_url)
CDM_PREFIX = "http://publications.europa.eu/ontology/cdm#"
metadata_query_base = """
PREFIX cdm: <{}>
CONSTRUCT {{?s ?p ?o}}
WHERE {{
SELECT DISTINCT ?s ?p ?o WHERE {{
?s cdm:resource_legal_id_celex "{}"^^xsd:string .
?s ?p ?o .
}}
}}
"""
metadata = []
processed_celex = []
metadata_header_row = ['celex', 'author', 'responsible_body', 'form', 'title', 'addressee', 'date_adoption', 'date_in_force', 'date_end_validity', 'directory_code', 'procedure_code', 'eurovoc', 'subject_matters']
metadata.append(metadata_header_row)
idx = 1
num_celexes = len(celex_nums)
for celex_num in celex_nums:
if (celex_num not in processed_celex) and (celex_num not in ['celex']):
print(idx, '/', num_celexes)
current_row = {
"celex" : [],
"author" : [],
"responsible_body" : [],
"form" : [],
"title" : [],
"addressee" : [],
"date_adoption" : [],
"date_in_force" : [],
"date_end_validity" : [],
"directory_code" : [],
"procedure_code" : [],
"eurovoc" : [],
"subject_matters" : []
}
idx += 1
metadata_query = metadata_query_base.format(CDM_PREFIX, celex_num)
query_result = execute_sparql_query_and_return_results(sparql, metadata_query)
if (query_result != -1):
g = Graph().parse(data=str(query_result), format='turtle')
for s, p, o in g.triples((None, None, None)):
if str(p) in list(property_mapping.values()):
keys = [k for k, v in property_mapping.items() if v == str(p)]
predicate = keys[0]
if isinstance(o, Literal):
current_row[predicate].append(str(o))
else:
current_row[predicate].append(get_string_label(sparql, str(o), predicate, celex_num))
for item in current_row:
if len(current_row[item]) == 0:
current_row[item] = ''
elif len(current_row[item]) == 1:
current_row[item] = current_row[item][0]
else:
current_row[item] = ' | '.join(list(set(current_row[item])))
# If the title summary is missing then try to scrape the title
# from the EURLEX website
if (len(current_row['title']) == 0):
current_row['title'] = get_title(celex_num)
new_current_row = []
for field in current_row:
new_current_row.append(current_row[field])
metadata.append(new_current_row)
return metadata
import time
# get the start time
st = time.time()
# Extract all metadata for input list of celex identifiers
SPARQL_ENDPOINT_URL = "http://publications.europa.eu/webapi/rdf/sparql"
metadata = get_metadata_for_legal_acts(celex_nums, SPARQL_ENDPOINT_URL)
# get the end time
et = time.time()
# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')
# saving the results to file: opening the csv file in 'w+' mode
file = open(OUT_METADATA_FILE, 'w', newline ='', encoding='utf-8')
with file:
write = csv.writer(file)
write.writerows(metadata)