-
Notifications
You must be signed in to change notification settings - Fork 18
/
sparql_statistics.py
87 lines (73 loc) · 3.13 KB
/
sparql_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from io import StringIO
from urllib import request, parse
import pandas as pd
from processor import Processor
import os
class ProcessorSPARQL(Processor):
    """Processor that pulls dataset metadata from the statistics.gov.scot SPARQL endpoint.

    The query result is requested as CSV, de-duplicated by dataset name,
    renamed to the ODS column layout and written to a CSV file under ``data``.
    """

    def __init__(self):
        super().__init__(type="sparql")

    # SPARQL Dataset Query for API
    def get_sparql_query(self):
        """Return the SPARQL query that selects every dataset with its optional metadata."""
        # NOTE: the query text is kept flush-left so the string sent to the
        # endpoint is not padded with source-code indentation.
        return """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX folder: <http://publishmydata.com/def/ontology/folder/>
SELECT ?uri ?name ?creator ?publisher ?issued ?modified ?licence ?comment ?theme
WHERE {
?uri rdf:type <http://publishmydata.com/def/dataset#Dataset>.
OPTIONAL { ?uri rdfs:label ?name. }
OPTIONAL { ?uri dcterms:publisher/rdfs:label ?publisher.}
OPTIONAL { ?uri dcterms:creator/rdfs:label ?creator.}
OPTIONAL { ?uri dcterms:issued ?issued.}
OPTIONAL { ?uri dcterms:modified ?modified.}
OPTIONAL { ?uri dcterms:license ?licence.}
OPTIONAL { ?uri rdfs:comment ?comment.}
OPTIONAL {
?uri dcat:theme ?themeUri.
?themeUri folder:inTree <http://statistics.gov.scot/def/concept/folders/themes>;
rdfs:label ?theme.
}
}
"""

    def get_datasets(self, owner, start_url, fname):
        """Query the SPARQL endpoint and write the dataset list as an ODS-style CSV.

        Args:
            owner: Not used by this processor.
            start_url: Not used; the endpoint URL is fixed in the method body.
            fname: Not used; the output is always written to
                ``data/scotgov-datasets-sparql.csv`` (relative to the current
                working directory).

        Raises:
            urllib.error.URLError: If the HTTP request to the endpoint fails.
            OSError: If the output file cannot be written.
        """
        sparql = self.get_sparql_query()
        data = parse.urlencode({"query": sparql}).encode()

        # API REQUEST (passing ``data`` makes urllib issue a POST)
        req = request.Request("http://statistics.gov.scot/sparql", data=data)
        req.add_header("Accept", "text/csv")
        req.add_header("Content-Type", "application/x-www-form-urlencoded")

        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(req) as resp:
            csv_text = resp.read().decode()

        # Decoding response and adding to pandas dataframe
        df = pd.read_csv(StringIO(csv_text))

        # Dropping Duplicate Datasets by Filtering Latest Issued Dataset
        df_unique = df.sort_values("issued", ascending=False).drop_duplicates(
            subset="name", keep="first"
        )

        # Fallback values for those datasets missing an owner: use the
        # publisher, and if that is missing too, "Scottish Government".
        # Done column-wise because assigning to the rows yielded by
        # DataFrame.iterrows() is not guaranteed to modify the frame.
        creator = (
            df_unique["creator"]
            .fillna(df_unique["publisher"])
            .fillna("Scottish Government")
        )

        # Renaming Column Names to ODS Format
        df_ods = (
            df_unique.assign(creator=creator)
            .rename(
                columns={
                    "name": "title",
                    "theme": "category",
                    "creator": "organization",
                    "comment": "notes",
                    "issued": "date_created",
                    "modified": "date_updated",
                    "uri": "url",
                }
            )
            .drop(columns=["publisher"])
        )

        # File Path (fixed; the ``fname`` argument is intentionally not consulted)
        out_path = os.path.join("data", "scotgov-datasets-sparql.csv")
        df_ods.to_csv(out_path, index=False)
# Module-level processor instance; it is constructed whenever this module is
# imported or executed.
processor = ProcessorSPARQL()

# Run the processing only when the file is executed directly as a script.
# NOTE(review): process() is not defined in this file - presumably inherited
# from Processor; confirm against the base class.
if __name__ == "__main__":
    processor.process()