-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_utility.py
483 lines (310 loc) · 16.2 KB
/
build_utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
"""
Author: Miranda Lv
Date: 06/12/2018
Description:
"""
import pandas as pd
import urllib2
from bs4 import BeautifulSoup
import shortuuid
import numpy as np
import sys
import json
import os
import copy
import geopandas as gpd
from shapely.geometry import LineString, shape, asShape
import geojson
from pandas.io.json import json_normalize
import shapely
import warnings
class BuilderClass(object):
    """
    Assemble geocoded project locations into one merged GeoJSON file and shapefile.

    On construction the builder reads a tab-separated GeoSheet, comma-separated
    transaction and project tables and the first geometry of the country (ADM0)
    boundary file, and opens a plain-text report (``outtxt``) that receives
    progress and error messages. Call ``close()`` when finished so the report
    is flushed to disk.
    """

    def __init__(self, geosheet=None, outgeojson=None, outtxt=None, geoboundaries=None, newgeosheet=None,
                 country=None, countryadm0=None, transactions=None, deflation_file=None, projects=None):
        """
        :param geosheet: path of the tab-separated GeoSheet
        :param outgeojson: path of the merged geojson file to write
        :param outtxt: path of the text report; opened for writing (truncated)
        :param geoboundaries: root folder of the GeoBoundaries geojson files
        :param newgeosheet: path where the GeoSheet with location ids and full urls is saved
        :param country: country identifier (stored, not used by the methods of this class)
        :param countryadm0: vector file whose first geometry is the country boundary
        :param transactions: path of the comma-separated transactions table
        :param deflation_file: path of the tab-separated deflator table
        :param projects: path of the comma-separated projects table
        """
        self.geosheet = pd.read_csv(geosheet, encoding='utf-8', sep='\t')
        self.outgeojson = outgeojson
        self.geoboundaries = geoboundaries
        self.outtxt = open(outtxt, "w")
        self.newgeosheet = newgeosheet
        self.country = country
        self.country_geom = gpd.read_file(countryadm0)['geometry'][0]
        self.transactions = pd.read_csv(transactions, encoding='utf-8', sep=',')
        self.deflation_file = deflation_file
        self.projects = pd.read_csv(projects, encoding='utf-8', sep=',')

    def close(self):
        """
        Close the text report opened by the constructor. Safe to call more than once.
        """
        if not self.outtxt.closed:
            self.outtxt.close()

    @staticmethod
    def _list_geojson(folder):
        """
        :param folder: directory to scan (not recursive)
        :return: paths of the regular files in ``folder`` whose name ends with "geojson"
        """
        return [os.path.join(folder, f) for f in os.listdir(folder)
                if os.path.isfile(os.path.join(folder, f)) and f.endswith("geojson")]

    @staticmethod
    def _location_path(proj_loc_id):
        """
        :param proj_loc_id: the "<project_id>_<location_id>" identifier
        :return: relative path of the per-location geojson file
        """
        return "processing/geographic/" + proj_loc_id + ".geojson"

    def build_dataset(self):
        """
        Merge the per-location geojson files found under
        <cwd>/processing/geographic, join the ancillary tables, export a
        shapefile, then warn about every location whose first geometry is not
        contained in the country boundary.
        """
        alljsonpath = os.path.join(os.getcwd(), 'processing', 'geographic')
        self.merge_geojson(alljsonpath)
        self.merge_ancillary()
        self.geojson2shp()
        country_shape = shape(self.country_geom)
        for path in self._list_geojson(alljsonpath):
            file_geom = shape(gpd.read_file(path)['geometry'][0])
            # Overlapping geometries are by definition not contained, so the
            # former "or overlaps(...)" clause never changed the outcome.
            if not country_shape.contains(file_geom):
                warnings.warn("Polygon %s is out of the country.." % (os.path.basename(path)))

    def get_full_url(self):
        """
        This function is used to 1). create an unique location id; 2). retrieve the full geojson url;
        3). download (or extract from GeoBoundaries) one geojson file per location.
        :return: None; the GeoSheet with location id and geojson link is saved to ``newgeosheet``
        """
        self.outtxt.write("Start retrieving geojson url.\n")
        print("Start retrieving geojson url.")
        self.geosheet.dropna(how="all", inplace=True)
        self.outtxt.write("Creating unique location id......\n")
        print("Creating unique location id......")
        # convert project id to integer
        self.geosheet["project_id"] = self.geosheet["project_id"].astype(int)
        # one fresh unique id per remaining row
        self.geosheet['location_id'] = [self.create_id() for _ in range(len(self.geosheet))]
        # project_location_id is used for the geojson file names
        self.geosheet['project_location_id'] = (
            self.geosheet['project_id'].astype(str) + '_' + self.geosheet['location_id'])
        # get the full geojson link
        self.geosheet["full_url"] = self.geosheet["GeoJSON Link or Feature ID"].apply(self.get_geojson)
        # this geosheet has unique location ids and direct geojson urls
        self.geosheet.to_csv(self.newgeosheet, encoding='utf-8', sep='\t', index=False)
        grouped_df = self.geosheet.groupby(["full_url", "project_location_id"])
        for (full_url, proj_loc_id), _ in grouped_df:
            self.get_feature_geojson(full_url, proj_loc_id)
        print("Finish creating unique location id....")
        print("Finish retrieving geojson url.")
        self.outtxt.write("Finish creating unique location id....\n")
        self.outtxt.write("Finish retrieving geojson url.\n")

    def create_id(self):
        """
        :return: a new identifier produced by ``shortuuid.uuid()``
        """
        return shortuuid.uuid()

    # get geojson url from gist
    def get_geojson(self, url):
        """
        :param url: The gist url from column "GeoJSON Link or Feature ID" in GeoSheet
        :return: The url for geocoded geojson file; ``url`` itself when the page cannot
                 be fetched or parsed (e.g. the value is a feature id rather than a url)
        """
        baseurl = "https://gist.github.com"
        try:
            html = urllib2.urlopen(url).read()
            soup = BeautifulSoup(html, 'html.parser')
            element = soup.find("a", class_="btn btn-sm ")
            return baseurl + element.attrs["href"]
        except Exception:
            # deliberate best effort: anything that is not a readable gist page
            # is passed through untouched
            return url

    # get geojson file for GeoBoundary features
    def get_feature_geojson(self, url, filename):
        """
        Download the geojson behind ``url`` and store it as the location's file.
        When the download or JSON parsing fails, ``url`` is treated as a
        GeoBoundaries feature id instead.
        :param url: geojson url, or a GeoBoundaries feature id
        :param filename: the project_location_id, used as the file name (without extension)
        """
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                content = response.read()
            finally:
                response.close()
            jsv = json.loads(content)
        except Exception:
            print("Feature from GeoBoundaries.")
            # the fallback always receives the untouched project_location_id
            self.get_geoboundary_feature(url, filename)
            return
        with open(self._location_path(filename), "w") as jsonfile:
            json.dump(jsv, jsonfile)

    def merge_geojson(self, inpath):
        """
        This function is used to merge multiple geojson files into one. Source: https://gist.github.com/migurski/3759608
        File names must look like "<project_id>_<location_id>.geojson". LineString
        locations are buffered into polygons, and the first feature of every file
        gets its properties replaced by the project/location ids.
        :param inpath: input geojson folder directory
        :return: None; a geojson file that includes all geocoded locations is written to ``outgeojson``
        :raises Exception: when a file is not a FeatureCollection with a non-empty feature list
        """
        print("Start merging geojson files...")
        self.outtxt.write("Start merging geojson files...\n")
        outjson = dict(type='FeatureCollection', features=[])
        count = 0
        for infile in self._list_geojson(inpath):
            count += 1
            name_parts = os.path.splitext(os.path.basename(infile))[0].split("_")
            project_id = int(float(name_parts[0]))
            location_id = str(name_parts[1])
            # add project location id to the output geojson
            property_dict = dict()
            property_dict["project_id"] = project_id
            property_dict["location_id"] = location_id
            property_dict["project_location_id"] = "_".join([str(project_id), location_id])
            with open(infile) as handle:
                injsonfile = json.load(handle)
            # validate the shape before any feature is dereferenced
            if not isinstance(injsonfile, dict) or injsonfile.get('type', None) != 'FeatureCollection':
                raise Exception('Sorry, "%s" does not look like GeoJSON' % infile)
            features = injsonfile.get('features', None)
            if not isinstance(features, list) or not features:
                raise Exception('Sorry, "%s" does not look like GeoJSON' % infile)
            newjson = self.geom_check(injsonfile, project_id, location_id)
            if newjson["features"][0]["geometry"]["type"] == "LineString":
                newjson = self.buffer_line(newjson)
            newjson["features"][0]["properties"] = property_dict
            outjson['features'] += newjson['features']
        print("-------------------------")
        print("There are %s geocoded locations" % (count))
        with open(self.outgeojson, "w") as output:
            json.dump(outjson, output)

    def merge_ancillary(self):
        """
        Merge the geojson file with the geosheet, transaction and project tables,
        and calculate the even split commitment of each location
        (transaction_value divided by the number of locations of its project).
        :return: None; ``outgeojson`` is overwritten with the enriched features
        """
        # add location count information to geojson file
        merged_geojson = gpd.read_file(self.outgeojson)
        location_counts = merged_geojson.project_id.value_counts()
        # built from index/values so the result does not depend on how the
        # installed pandas version names the value_counts() column
        count_df = pd.DataFrame({'merge_id': location_counts.index, 'counts': location_counts.values})
        merge_df = merged_geojson.merge(count_df, how='left', left_on='project_id', right_on='merge_id')
        # add geosheet to geojson file
        newgeosheet = pd.read_csv(self.newgeosheet, encoding='utf-8', sep='\t')
        outdf = merge_df.merge(newgeosheet, how='left', on='project_location_id')
        # add transaction info to geojson file
        full_df = outdf.merge(self.transactions, how='left', left_on='project_id_x', right_on='project_id')
        full_df['even_split_commitment'] = full_df.transaction_value / full_df.counts
        full_df['location_id'] = full_df['location_id_x']
        keep_columns = ['project_location_id', 'project_id', 'location_id', 'Location Name',
                        'Identified Location Type',
                        'Geocoded Location Type', 'Source URL', 'GeoJSON Link or Feature ID',
                        'Geoparsing Notes', 'Geocoding and Review Note', 'full_url', 'geometry',
                        'transaction_value', 'even_split_commitment']
        full_df = pd.DataFrame(full_df[keep_columns], columns=keep_columns)
        # merge project level information
        full_proj_df = full_df.merge(self.projects, how='left', on='project_id').set_geometry('geometry')
        with open(self.outgeojson, "w") as output:
            json.dump(json.loads(full_proj_df.to_json()), output)

    def geom_check(self, injson, proj_id, loc_id):
        """
        This function is used to check the geometry of each geocoded location.
        A collection with exactly one feature is returned untouched. For a
        collection with several features:
            - the first feature must be a LineString or a Polygon
            - every feature must have the same geometry type as the first one
            - several LineString features are joined into a single line
        Every problem is printed and written to the report.
        :param injson: individual geojson
        :param proj_id: project id
        :param loc_id: location id
        :return: geojson with fixed geometry (multi-line features), otherwise ``injson`` itself
        """
        geom_types = ["LineString", "Polygon"]
        geoms = injson["features"]
        if len(geoms) == 1:
            return injson

        # multi-features
        dest_geom_type = geoms[0]["geometry"]["type"]
        if dest_geom_type not in geom_types:  # cannot be point feature
            print("Geometry Error: geometry types of project %s location %s is not correct." % (proj_id, loc_id))
            self.outtxt.write("Geometry Error: geometry types of project %s location %s is not correct. \n" % (proj_id, loc_id))
            return injson

        # compare every later feature (not the first one with itself)
        for feature in geoms[1:]:
            if feature["geometry"]["type"] != dest_geom_type:
                print("Geometry Error: there are multiple geometry types in project %s location %s" % (proj_id, loc_id))
                self.outtxt.write("Geometry Error: there are multiple geometry types in project %s location %s \n" % (proj_id, loc_id))
                return injson

        print("Geometry Error: there are multiple geometry features in project %s location %s" % (proj_id, loc_id))
        self.outtxt.write("Geometry Error: there are multiple geometry features in project %s location %s \n" % (proj_id, loc_id))
        if dest_geom_type == "LineString":
            self.outtxt.write("Correct multiline geometry for project %s location %s \n" % (proj_id, loc_id))
            return self.connect_lines(injson)
        return injson

    def connect_lines(self, injson):
        """
        :param injson: the individual geojson that has multiple line features
        :return: a new FeatureCollection with one line feature: a copy of the first
                 feature whose coordinates are those of all features, in order
                 (``injson`` is not modified)
        """
        newjson = dict(type='FeatureCollection', features=[])
        newjson["features"].append(copy.deepcopy(injson["features"][0]))
        coords = []
        for feature in injson["features"]:
            coords.extend(feature["geometry"]["coordinates"])
        newjson["features"][0]["geometry"]["coordinates"] = coords
        return newjson

    def get_geoboundary_feature(self, feat_id, proj_loc_id):
        """
        Extract one feature from the GeoBoundaries file
        <geoboundaries>/<country>/<country>_<adm>/<country>_<adm>.geojson and save it
        as the location's geojson file. Failures are reported (printed and written
        to the report) instead of being raised.
        :param feat_id: the identical feature id that is used to track the location feature;
                        its first two "_"-separated parts are the country and the admin level
        :param proj_loc_id: the proj_loc_id field
        :return: None; a geojson file of the feature is written
        """
        try:
            id_parts = feat_id.split("_")
            country = id_parts[0]
            adms = id_parts[1]
            jsonpath = os.path.join(self.geoboundaries, "%s/%s_%s/%s_%s.geojson" % (country, country, adms, country, adms))
            geo_data = gpd.read_file(jsonpath)
            feat_geo = geo_data[geo_data["feature_id"] == feat_id]
            with open(self._location_path(proj_loc_id), "w") as output:
                json.dump(json.loads(feat_geo.to_json()), output)
        except Exception as err:
            # still best effort, but no longer silent; %r keeps the text ASCII-safe
            message = "GeoBoundaries Error: could not retrieve feature %r for location %s (%r)" % (feat_id, proj_loc_id, err)
            print(message)
            self.outtxt.write(message + "\n")

    def buffer_line(self, injson, distance=0.0001):
        """
        :param injson: the input geojson whose first feature is a LineString (modified in place)
        :param distance: buffer distance in coordinate units; default is 0.0001 degree, around 10 meters
        :return: ``injson`` with the first feature replaced by the buffered polygon
        """
        line = LineString(injson["features"][0]["geometry"]["coordinates"])
        buffered_line = line.buffer(distance).__geo_interface__
        # the geo_interface returns geometries in tuple pairs
        # however the geojson has list pairs geometries
        # NOTE(review): only the exterior ring ([0]) is kept, so holes of a
        # buffered loop are dropped - confirm this is intended.
        poly_tuples = buffered_line["coordinates"][0]
        poly_lists = [[list(point) for point in poly_tuples]]
        injson["features"][0]["geometry"]["coordinates"] = poly_lists
        injson["features"][0]["geometry"]["type"] = "Polygon"
        return injson

    # validate geojson
    def parse(self, text):
        """
        :param text: JSON document as a string
        :return: the decoded object, or None (after printing the error) when ``text`` is not valid JSON
        """
        try:
            return json.loads(text)
        except ValueError as e:
            print('invalid json: %s' % e)
            return None  # or: raise

    def add_count(self):
        """
        Placeholder, not implemented yet. Planned:
            - count the number of line features that have been buffered
            - count the number of polygon features
            - count the total number of final product
        """
        return

    def location_type_check(self):
        """
        This function is used to check the identified location type with geocoded location type
        :return: None; one line per mismatching (project, location) is written into the report
        """
        newdf = pd.read_csv(self.newgeosheet, encoding='utf-8', sep='\t')
        mismatch = newdf['Identified Location Type'] != newdf['Geocoded Location Type']
        grouped_df = newdf[mismatch].groupby(["project_id", "location_id"])
        self.outtxt.write("Start checking the discrepancy of identified location type with geocoded location type.\n")
        for name, _ in grouped_df:
            self.outtxt.write("Location type discrepancy check failed for project %s and location %s...\n" % (name[0], name[1]))
        return

    def spatial_scrub(self, geocoded_geom):
        """
        :param geocoded_geom: geometry of geocoded feature
        :return: None
        :raises ValueError: when the country geometry does not contain ``geocoded_geom``
        """
        if not shape(self.country_geom).contains(shape(geocoded_geom)):
            # a bare string cannot be raised; use a real exception type
            raise ValueError('not passed spatial scrub')

    def deflation(self, year, currency, val):
        """
        Multiply ``val`` by the deflator listed for ``currency`` and ``year`` in the
        tab-separated deflator table (columns currency_val, transaction_year, deflator).
        :param year: transaction year
        :param currency: value looked up in the currency_val column
        :param val: value to deflate
        :return: deflator * val, using the first matching row
        :raises ValueError: when the currency, or the year for that currency, is not in the table
        """
        deflation_sheet = pd.read_csv(self.deflation_file, encoding='utf-8', sep='\t')
        df_iso = deflation_sheet[deflation_sheet['currency_val'] == currency]
        if df_iso.empty:
            raise ValueError('donor_iso3 \'%s\' not found in deflator table' % (currency))
        matches = df_iso.loc[df_iso['transaction_year'] == year, 'deflator']
        if matches.empty:
            raise ValueError("Year not found for: %s in the year of %s" % (currency, year))
        # positional access: the row label of the match is rarely 0
        return matches.iloc[0] * val

    def geojson2shp(self):
        """
        This script is to convert the merged geojson file to shapefile
        :return: None; merged_locations.shp is written in the same directory as the geojson file
        """
        df = gpd.read_file(self.outgeojson)
        gdf = gpd.GeoDataFrame(df, geometry=df.geometry)
        gdf.crs = {'init': 'epsg:4326'}
        filename = os.path.join(os.path.dirname(self.outgeojson), 'merged_locations.shp')
        gdf.to_file(driver='ESRI Shapefile', filename=filename)