Skip to content

Commit

Permalink
Merge pull request #299 from shankari/fix_common_trips
Browse files Browse the repository at this point in the history
Clean up current implementation of the common trips
  • Loading branch information
shankari authored Jun 30, 2016
2 parents ddd428a + 9ba9511 commit 4f8c206
Show file tree
Hide file tree
Showing 20 changed files with 301 additions and 319 deletions.
8 changes: 1 addition & 7 deletions emission/analysis/modelling/tour_model/K_medoid.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
# Standard imports
from __future__ import division
import logging
import numpy as np
import math
import random
import time

# Our imports
from emission.core.get_database import get_routeDistanceMatrix_db,get_routeCluster_db,get_section_db
from emission.core.common import calDistance, getDisplayModes
from emission.core.get_database import get_routeDistanceMatrix_db,get_section_db
from emission.analysis.modelling.tour_model.trajectory_matching.route_matching import fullMatchDistance,getRoute
from emission.analysis.modelling.tour_model.trajectory_matching.LCS import lcsScore

Sections=get_section_db()

Expand Down
61 changes: 16 additions & 45 deletions emission/analysis/modelling/tour_model/cluster_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Standard imports
import math
import datetime
import uuid as uu
import sys
import logging
Expand All @@ -10,13 +9,6 @@
import emission.analysis.modelling.tour_model.similarity as similarity
import emission.analysis.modelling.tour_model.featurization as featurization
import emission.analysis.modelling.tour_model.representatives as representatives

from emission.core.wrapper.trip_old import Trip, Section, Fake_Trip

import emission.core.wrapper.trip as ecwt
import emission.core.wrapper.section as ecws
import emission.storage.decorations.trip_queries as ecsdtq
import emission.storage.decorations.section_queries as ecsdsq
import emission.storage.decorations.analysis_timeseries_queries as esda

"""
Expand All @@ -41,74 +33,53 @@
"""

#read the data from the database.
def read_data(uuid=None, size=None, old=True):
def read_data(uuid=None):
db = edb.get_trip_db()
if not old:
logging.debug("not old")
trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid,
time_query=None, geo_query=None)
return trips

if old:
data = []
trip_db = db
if uuid:
trips = trip_db.find({'user_id' : uuid, 'type' : 'move'})
else:
trips = trip_db.find({'type' : 'move'})
for t in trips:
try:
trip = Trip.trip_from_json(t)
except:
continue
if not (trip.trip_start_location and trip.trip_end_location and trip.start_time):
continue
data.append(trip)
if size:
if len(data) == size:
break
return data
trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid,
time_query=None, geo_query=None)
logging.info("After reading data, returning %s trips" % len(trips))
return trips

#put the data into bins and cut off the lower portion of the bins
def remove_noise(data, radius, old=True):
def remove_noise(data, radius):
if not data:
return [], []
sim = similarity.similarity(data, radius, old)
sim = similarity.similarity(data, radius)
sim.bin_data()
logging.debug('number of bins before filtering: %d' % len(sim.bins))
sim.delete_bins()
logging.debug('number of bins after filtering: %d' % len(sim.bins))
return sim.newdata, sim.bins

#cluster the data using k-means
def cluster(data, bins, old=True):
def cluster(data, bins):
if not data:
return 0, [], []
feat = featurization.featurization(data, old=old)
feat = featurization.featurization(data)
min = bins
max = int(math.ceil(1.5 * bins))
feat.cluster(min_clusters=min, max_clusters=max)
logging.debug('number of clusters: %d' % feat.clusters)
return feat.clusters, feat.labels, feat.data

#prepare the data for the tour model
def cluster_to_tour_model(data, labels, old=True):
def cluster_to_tour_model(data, labels):
if not data:
return []
repy = representatives.representatives(data, labels, old=old)
repy = representatives.representatives(data, labels)
repy.list_clusters()
repy.get_reps()
repy.locations()
logging.debug('number of locations: %d' % repy.num_locations)
repy.cluster_dict()
return repy.tour_dict

def main(uuid=None, old=True):
data = read_data(uuid, old=old)
def main(uuid=None):
data = read_data(uuid)
logging.debug("len(data) is %d" % len(data))
data, bins = remove_noise(data, 300, old=old)
n, labels, data = cluster(data, len(bins), old=old)
tour_dict = cluster_to_tour_model(data, labels, old=old)
data, bins = remove_noise(data, 300)
n, labels, data = cluster(data, len(bins))
tour_dict = cluster_to_tour_model(data, labels)
return tour_dict

if __name__=='__main__':
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import logging

import emission.analysis.modelling.tour_model.tour_model_matrix as tm ##here
import emission.core.get_database as edb
import emission.core.wrapper.trip_old as trip
import emission.analysis.modelling.tour_model.tour_model_matrix as tm
import emission.analysis.modelling.tour_model.cluster_pipeline as eamtcp
from uuid import UUID
import random, datetime, sys
Expand Down
33 changes: 10 additions & 23 deletions emission/analysis/modelling/tour_model/featurization.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
# Standard imports
import logging
import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import math
import numpy
from sklearn.cluster import KMeans
from sklearn import metrics
import sys

# our imports
from emission.core.wrapper.trip_old import Trip, Coordinate
from kmedoid import kmedoids
import emission.storage.decorations.trip_queries as esdtq


"""
Expand All @@ -25,9 +20,8 @@
"""
class featurization:

def __init__(self, data, old=True):
def __init__(self, data):
self.data = data
self.is_old = old
if not self.data:
self.data = []
self.calculate_points()
Expand All @@ -41,21 +35,14 @@ def calculate_points(self):
if not self.data:
return
for trip in self.data:
if self.is_old:
start = trip.trip_start_location
end = trip.trip_end_location
else:
try:
start = trip.data.start_loc["coordinates"]
end = trip.data.end_loc["coordinates"]
except:
continue
try:
start = trip.data.start_loc["coordinates"]
end = trip.data.end_loc["coordinates"]
except:
continue
if not (start and end):
raise AttributeError('each trip must have valid start and end locations')
if self.is_old:
self.points.append([start.lon, start.lat, end.lon, end.lat])
else:
self.points.append([start[0], start[1], end[0], end[1]])
self.points.append([start[0], start[1], end[0], end[1]])

#cluster the data. input options:
# - name (optional): the clustering algorithm to use. Options are 'kmeans' or 'kmedoids'. Default is kmeans.
Expand All @@ -68,7 +55,7 @@ def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
logging.debug("min_clusters < 2, setting min_clusters = 2")
min_clusters = 2
if min_clusters > len(self.points):
sys.stderr.write('Maximum number of clusters is the number of data points.\n')
sys.stderr.write('Minimum number of clusters %d is greater than the number of data points %d.\n' % (min_clusters, len(self.points)))
min_clusters = len(self.points)-1
if max_clusters == None:
logging.debug("max_clusters is None, setting max_clusters = %d" % (len(self.points) - 1))
Expand Down Expand Up @@ -138,8 +125,8 @@ def check_clusters(self):
if not self.labels:
logging.debug('Please cluster before analyzing clusters.')
return
logging.debug('number of clusters is %d' % str(self.clusters))
logging.debug('silhouette score is %d' % str(self.sil))
logging.debug('number of clusters is %d' % self.clusters)
logging.debug('silhouette score is %s' % self.sil)

#map the clusters
#TODO - move this to a file in emission.analysis.plotting to map clusters from the database
Expand Down
85 changes: 48 additions & 37 deletions emission/analysis/modelling/tour_model/representatives.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import numpy
import math
import copy
import geojson as gj

# our imports
from emission.core.wrapper.trip_old import Trip, Coordinate
import emission.core.wrapper.trip as ecwt
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda


Expand All @@ -26,9 +28,8 @@

class representatives:

def __init__(self, data, labels, old=True):
def __init__(self, data, labels):
self.data = data
self.is_old = old
if not self.data:
self.data = []
self.labels = labels
Expand All @@ -52,37 +53,46 @@ def list_clusters(self):
self.clusters[a].append(self.data[i])

#get the representatives for each cluster
#I don't understand wtf this does
# Why are we getting the mean of the start and end points in the cluster and
# creating a fake trip from it? Why not just pick a real representative of
# of the trips? Alternatively, why not create a new data structure to represent
# that this is a reconstructed trip that has no bearing in reality? What does
# it even mean that we have a trip with only a start and end point and no
# actual start or end times?
def get_reps(self):
self.reps = []
if not self.data:
return
for cluster in self.clusters:
for i, cluster in enumerate(self.clusters):
logging.debug("Considering cluster %d = %s" % (i, cluster))
points = [[], [], [], []]
for c in cluster:
if self.is_old:
points[0].append(c.trip_start_location.lat)
points[1].append(c.trip_start_location.lon)
points[2].append(c.trip_end_location.lat)
points[3].append(c.trip_end_location.lon)
else:
# We want (lat, lon) to be consistent with old above.
# But in the new, our data is in geojson so it is (lon, lat).
# Fix it by flipping the order of the indices
# Note also that we want to use the locations of the start
# and end places, not of the start point of the trip, which
# may be some distance away due to geofencing.
start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
c.data.start_place)
end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
c.data.end_place)
points[0].append(start_place.data.location["coordinates"][1])
points[1].append(start_place.data.location["coordinates"][0])
points[2].append(end_place.data.location["coordinates"][1])
points[3].append(end_place.data.location["coordinates"][0])
logging.debug("in representatives, endpoints have len = %s" %
len(points))

# If this cluster has no points, we skip it
if len(cluster) == 0:
logging.info("Cluster %d = %s, has length %d, skipping" %
(i, cluster, len(cluster)))
continue

for j, c in enumerate(cluster):
logging.debug("Consider point %d = %s" % (j, c))
start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
c.data.start_place)
end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
c.data.end_place)
points[0].append(start_place.data.location["coordinates"][1]) # lat
points[1].append(start_place.data.location["coordinates"][0]) # lng
points[2].append(end_place.data.location["coordinates"][1]) # lat
points[3].append(end_place.data.location["coordinates"][0]) # lng
logging.debug("in representatives, endpoints have len = %s" %
len(points))
centers = numpy.mean(points, axis=1)
a = Trip(None, None, None, None, None, None, Coordinate(centers[0], centers[1]), Coordinate(centers[2], centers[3]))
logging.debug("For cluster %d, centers are %s" % (i, centers))
t = ecwt.Trip({
"start_loc": gj.Point([centers[1], centers[0]]),
"end_loc": gj.Point([centers[3], centers[2]])
})
a = ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip", t)
self.reps.append(a)

#map the representatives
Expand Down Expand Up @@ -134,12 +144,12 @@ def locations(self):
locs = []
for b in bin:
if b[0] == 'start':
point = self.reps[b[1]].trip_start_location
point = self.reps[b[1]].data.start_loc
if b[0] == 'end':
point = self.reps[b[1]].trip_end_location
locs.append([point.lat, point.lon])
point = self.reps[b[1]].data.end_loc
locs.append(point.coordinates)
locs = numpy.mean(locs, axis=0)
coord = Coordinate(locs[0], locs[1])
coord = [locs[0], locs[1]]
self.locs.append(coord)

#create the input to the tour graph
Expand Down Expand Up @@ -198,15 +208,16 @@ def cluster_dict(self):
#check whether a point is close to all points in a bin
def match(self, label, a, bin):
if label == 'start':
pointa = self.reps[a].trip_start_location
pointa = self.reps[a].data.start_loc
elif label == 'end':
pointa = self.reps[a].trip_end_location
pointa = self.reps[a].data.end_loc
for b in bin:
if b[0] == 'start':
pointb = self.reps[b[1]].trip_start_location
pointb = self.reps[b[1]].data.start_loc
elif b[0] == 'end':
pointb = self.reps[b[1]].trip_end_location
if self.distance(pointa.lat, pointa.lon, pointb.lat, pointb.lon) > 300:
pointb = self.reps[b[1]].data.end_loc
if self.distance(pointa.coordinates[1], pointa.coordinates[0],
pointb.coordinates[1], pointb.coordinates[0]) > 300:
return False
return True

Expand Down
Loading

0 comments on commit 4f8c206

Please sign in to comment.