-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Wrangling Lesson 6.5 - Preparing for Database
221 lines (198 loc) · 6.93 KB
/
Data Wrangling Lesson 6.5 - Preparing for Database
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:
{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
"version":"2",
"changeset":"17206049",
"timestamp":"2013-08-03T16:43:42Z",
"user":"linuxUser16",
"uid":"1219059"
},
"pos": [41.9757030, -87.6921867],
"address": {
"housenumber": "5157",
"postcode": "60625",
"street": "North Lincoln Ave"
},
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}
You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB.
Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to
update the street names before you save them to JSON.
In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
- attributes in the CREATED array should be added under a key "created"
- attributes for latitude and longitude should be added to a "pos" array,
for use in geospatial indexing. Make sure the values inside "pos" array are floats
and not strings.
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
the tag should be ignored, for example:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
should be turned into:
{...
"address": {
"housenumber": "5158",
"street": "North Lincoln Avenue"
},
"amenity": "pharmacy",
...
}
- for "way" specifically:
<nd ref="305896090"/>
<nd ref="1719825889"/>
should be turned into
"node_refs": ["305896090", "1719825889"]
"""
# Matches a string made up solely of lowercase ASCII letters and underscores
# (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Matches two runs of lowercase letters/underscores joined by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single character from this set:
# = + / & < > ; ' " ? % # $ @ , . space, tab, carriage return, newline.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Element attribute names that shape_element stores under the "created" key
# instead of at the top level of the shaped dictionary.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def grab_tag(element):
    """Collect the <tag> children of an OSM element into a dictionary.

    Each tag's "k" attribute is handled as follows:
      - keys containing a character matched by ``problemchars`` are ignored;
      - "addr:<field>" keys are stored as ``address[<field>]``;
      - "addr:<field>:<extra>" keys (a further colon after the prefix) are
        ignored;
      - every other key, with or without a colon, is stored unchanged.

    Args:
        element: an ElementTree element (typically a "node" or "way").

    Returns:
        A dict mapping tag keys to their "v" values.  An "address"
        sub-dictionary is included only when at least one address field was
        collected.

    Raises:
        KeyError: if a <tag> element lacks a "k" or "v" attribute.
    """
    addr_prefix = "addr:"
    tag_dict = {}
    address = {}
    for tag in element.iter("tag"):
        tag_key = tag.attrib['k']
        tag_value = tag.attrib['v']
        if problemchars.search(tag_key):
            continue
        if tag_key.startswith(addr_prefix):
            address_key = tag_key[len(addr_prefix):]
            # A further colon (e.g. "addr:street:name") only splits a value
            # that is already present in full, so such keys are dropped.
            if ":" not in address_key:
                address[address_key] = tag_value
        else:
            # Non-address keys keep their full name, colon included, so that
            # "building:levels" is not truncated or merged with another key.
            tag_dict[tag_key] = tag_value
    if address:
        tag_dict["address"] = address
    return tag_dict
def grab_nd(element):
    """Collect the node references listed under an element.

    Args:
        element: an ElementTree element whose <nd> descendants carry a "ref"
            attribute (a "way" in OSM data).

    Returns:
        A dict with the single key "node_refs" mapping to the list of "ref"
        values in document order; the list is empty when there are no <nd>
        elements.

    Raises:
        KeyError: if an <nd> element lacks a "ref" attribute.
    """
    return {"node_refs": [nd.attrib['ref'] for nd in element.iter("nd")]}
def shape_element(element):
    """Convert an OSM "node" or "way" element into a JSON-ready dictionary.

    Attributes named in ``CREATED`` are grouped under "created"; "lat" and
    "lon" are converted to floats and stored together as "pos" ([lat, lon]);
    every other attribute becomes a top-level key.  The element's tag name is
    stored under "type", the result of ``grab_tag`` is merged in, and for a
    "way" the result of ``grab_nd`` is merged in as well.

    Args:
        element: an ElementTree element.

    Returns:
        The shaped dict for "node" and "way" elements, otherwise None.

    Raises:
        ValueError: if a "lat" or "lon" attribute is not a valid float.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {"created": {}}
    lat = None
    lon = None
    for attr, value in element.attrib.items():
        if attr in CREATED:
            node["created"][attr] = value
        elif attr == "lat":
            lat = float(value)
        elif attr == "lon":
            lon = float(value)
        else:
            node[attr] = value

    # None (rather than 0.0) marks a missing coordinate, so elements lying on
    # the equator or the prime meridian still get their position recorded.
    if lat is not None and lon is not None:
        node["pos"] = [lat, lon]

    node["type"] = element.tag

    # NOTE: tag keys are merged after the attributes, so a tag whose key
    # matches an existing entry (for example "type") replaces that entry.
    node.update(grab_tag(element))

    if element.tag == "way":
        node.update(grab_nd(element))
    return node
'''
"created": {
"version":"2",
"changeset":"17206049",
"timestamp":"2013-08-03T16:43:42Z",
"user":"linuxUser16",
"uid":"1219059"
}
'''
'''
for tagAttrName, tagAttrValue in tag.attributes.items():
if tagAttrName == "k":
'''
def process_map(file_in, pretty=False):
    """Shape every node/way of an OSM XML file and save them as JSON.

    The output file is named "<file_in>.json" and receives one JSON document
    per shaped element, each followed by a newline.

    Args:
        file_in: path of the OSM XML file to parse.
        pretty: when true, each JSON document is indented by 2 spaces (and
            therefore spans several lines); otherwise it is written compactly.

    Returns:
        The list of shaped element dictionaries, in document order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el is None:
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
            # The shaped dict holds only plain values, so the parsed subtree
            # can be released; otherwise the whole XML tree stays in memory.
            element.clear()
    return data
def test():
    """Run process_map on 'example.osm' and check the first and last records.

    Prints the first and last shaped elements and asserts that they match the
    expected values for the sample file.  As a side effect process_map writes
    its JSON output next to the input file.

    Raises:
        AssertionError: if the shaped data differs from the expected values.
    """
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    data = process_map('example.osm', True)
    correct_first_elem = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": {
            "changeset": "11129782",
            "user": "bbmiller",
            "version": "7",
            "uid": "451048",
            "timestamp": "2012-03-28T18:31:23Z"
        }
    }
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(data[0])
    assert data[0] == correct_first_elem
    print(data[-1])
    assert data[-1]["address"] == {
        "street": "West Lexington St.",
        "housenumber": "1412"
    }
    assert data[-1]["node_refs"] == [
        "2199822281", "2199822390", "2199822392", "2199822369",
        "2199822370", "2199822284", "2199822281",
    ]
# Run the self-check only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    test()