Merge pull request #299 from shankari/fix_common_trips

Clean up current implementation of the common trips
e-mission · Jun 30, 2016 · 4f8c206 · 4f8c206
2 parents ddd428a + 9ba9511
commit 4f8c206
Show file tree

Hide file tree

Showing 20 changed files with 301 additions and 319 deletions.
diff --git a/emission/analysis/modelling/tour_model/K_medoid.py b/emission/analysis/modelling/tour_model/K_medoid.py
@@ -1,16 +1,10 @@
 # Standard imports
 from __future__ import division
-import logging
-import numpy as np
-import math
 import random
-import time
 
 # Our imports
-from emission.core.get_database import get_routeDistanceMatrix_db,get_routeCluster_db,get_section_db
-from emission.core.common import calDistance, getDisplayModes
+from emission.core.get_database import get_routeDistanceMatrix_db,get_section_db
 from emission.analysis.modelling.tour_model.trajectory_matching.route_matching import fullMatchDistance,getRoute
-from emission.analysis.modelling.tour_model.trajectory_matching.LCS import lcsScore
 
 Sections=get_section_db()
 

diff --git a/emission/analysis/modelling/tour_model/cluster_pipeline.py b/emission/analysis/modelling/tour_model/cluster_pipeline.py
@@ -1,6 +1,5 @@
 # Standard imports
 import math
-import datetime
 import uuid as uu
 import sys
 import logging
@@ -10,13 +9,6 @@
 import emission.analysis.modelling.tour_model.similarity as similarity
 import emission.analysis.modelling.tour_model.featurization as featurization
 import emission.analysis.modelling.tour_model.representatives as representatives
-
-from emission.core.wrapper.trip_old import Trip, Section, Fake_Trip
-
-import emission.core.wrapper.trip as ecwt
-import emission.core.wrapper.section as ecws
-import emission.storage.decorations.trip_queries as ecsdtq
-import emission.storage.decorations.section_queries as ecsdsq
 import emission.storage.decorations.analysis_timeseries_queries as esda
 
 """
@@ -41,74 +33,53 @@
 """
 
 #read the data from the database. 
-def read_data(uuid=None, size=None, old=True):
+def read_data(uuid=None):
     db = edb.get_trip_db()
-    if not old:
-        logging.debug("not old")
-        trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid,
-                                 time_query=None, geo_query=None)
-        return trips
-
-    if old:
-        data = []
-        trip_db = db
-        if uuid:
-            trips = trip_db.find({'user_id' : uuid, 'type' : 'move'})
-        else:
-            trips = trip_db.find({'type' : 'move'})
-        for t in trips:
-            try: 
-                trip = Trip.trip_from_json(t)
-            except:
-                continue
-            if not (trip.trip_start_location and trip.trip_end_location and trip.start_time):
-                continue
-            data.append(trip)
-            if size:
-                if len(data) == size:
-                    break
-        return data
+    trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid,
+                             time_query=None, geo_query=None)
+    logging.info("After reading data, returning %s trips" % len(trips))
+    return trips
 
 #put the data into bins and cut off the lower portion of the bins
-def remove_noise(data, radius, old=True):
+def remove_noise(data, radius):
     if not data:
         return [], []
-    sim = similarity.similarity(data, radius, old)
+    sim = similarity.similarity(data, radius)
     sim.bin_data()
     logging.debug('number of bins before filtering: %d' % len(sim.bins))
     sim.delete_bins()
     logging.debug('number of bins after filtering: %d' % len(sim.bins))
     return sim.newdata, sim.bins
 
 #cluster the data using k-means
-def cluster(data, bins, old=True):
+def cluster(data, bins):
     if not data:
         return 0, [], []
-    feat = featurization.featurization(data, old=old)
+    feat = featurization.featurization(data)
     min = bins
     max = int(math.ceil(1.5 * bins))
     feat.cluster(min_clusters=min, max_clusters=max)
     logging.debug('number of clusters: %d' % feat.clusters)
     return feat.clusters, feat.labels, feat.data
 
 #prepare the data for the tour model
-def cluster_to_tour_model(data, labels, old=True):
+def cluster_to_tour_model(data, labels):
     if not data:
         return []
-    repy = representatives.representatives(data, labels, old=old)
+    repy = representatives.representatives(data, labels)
     repy.list_clusters()
     repy.get_reps()
     repy.locations()
     logging.debug('number of locations: %d' % repy.num_locations)
     repy.cluster_dict()
     return repy.tour_dict
 
-def main(uuid=None, old=True):
-    data = read_data(uuid, old=old)
+def main(uuid=None):
+    data = read_data(uuid)
     logging.debug("len(data) is %d" % len(data))
-    data, bins = remove_noise(data, 300, old=old)
-    n, labels, data = cluster(data, len(bins), old=old)
-    tour_dict = cluster_to_tour_model(data, labels, old=old)
+    data, bins = remove_noise(data, 300)
+    n, labels, data = cluster(data, len(bins))
+    tour_dict = cluster_to_tour_model(data, labels)
     return tour_dict
 
 if __name__=='__main__':

diff --git a/emission/analysis/modelling/tour_model/create_tour_model_matrix.py b/emission/analysis/modelling/tour_model/create_tour_model_matrix.py
@@ -1,8 +1,6 @@
 import logging
 
-import emission.analysis.modelling.tour_model.tour_model_matrix as tm ##here
-import emission.core.get_database as edb
-import emission.core.wrapper.trip_old as trip
+import emission.analysis.modelling.tour_model.tour_model_matrix as tm
 import emission.analysis.modelling.tour_model.cluster_pipeline as eamtcp
 from uuid import UUID
 import random, datetime, sys

diff --git a/emission/analysis/modelling/tour_model/featurization.py b/emission/analysis/modelling/tour_model/featurization.py
@@ -1,18 +1,13 @@
 # Standard imports
 import logging
-import matplotlib
-# matplotlib.use('Agg')
 import matplotlib.pyplot as plt
-import math
 import numpy
 from sklearn.cluster import KMeans
 from sklearn import metrics
 import sys
 
 # our imports
-from emission.core.wrapper.trip_old import Trip, Coordinate
 from kmedoid import kmedoids
-import emission.storage.decorations.trip_queries as esdtq
 
 
 """
@@ -25,9 +20,8 @@
 """
 class featurization:
 
-    def __init__(self, data, old=True):
+    def __init__(self, data):
         self.data = data
-        self.is_old = old
         if not self.data:
             self.data = []
         self.calculate_points()
@@ -41,21 +35,14 @@ def calculate_points(self):
         if not self.data:
             return
         for trip in self.data:
-            if self.is_old:
-                start = trip.trip_start_location
-                end = trip.trip_end_location
-            else:
-                try:
-                    start = trip.data.start_loc["coordinates"]
-                    end = trip.data.end_loc["coordinates"]
-                except:
-                    continue
+            try:
+                start = trip.data.start_loc["coordinates"]
+                end = trip.data.end_loc["coordinates"]
+            except:
+                continue
             if not (start and end):
                 raise AttributeError('each trip must have valid start and end locations')
-            if self.is_old:
-                self.points.append([start.lon, start.lat, end.lon, end.lat])
-            else:
-                self.points.append([start[0], start[1], end[0], end[1]])
+            self.points.append([start[0], start[1], end[0], end[1]])
 
     #cluster the data. input options:
     # - name (optional): the clustering algorithm to use. Options are 'kmeans' or 'kmedoids'. Default is kmeans.
@@ -68,7 +55,7 @@ def cluster(self, name='kmeans', min_clusters=2, max_clusters=None):
             logging.debug("min_clusters < 2, setting min_clusters = 2")
             min_clusters = 2
         if min_clusters > len(self.points):
-            sys.stderr.write('Maximum number of clusters is the number of data points.\n')
+            sys.stderr.write('Minimum number of clusters %d is greater than the number of data points %d.\n' % (min_clusters, len(self.points)))
             min_clusters = len(self.points)-1
         if max_clusters == None:
             logging.debug("max_clusters is None, setting max_clusters = %d" % (len(self.points) - 1))
@@ -138,8 +125,8 @@ def check_clusters(self):
         if not self.labels:
             logging.debug('Please cluster before analyzing clusters.')
             return
-        logging.debug('number of clusters is %d' % str(self.clusters))
-        logging.debug('silhouette score is %d' % str(self.sil))
+        logging.debug('number of clusters is %d' % self.clusters)
+        logging.debug('silhouette score is %s' % self.sil)
 
     #map the clusters
     #TODO - move this to a file in emission.analysis.plotting to map clusters from the database

diff --git a/emission/analysis/modelling/tour_model/representatives.py b/emission/analysis/modelling/tour_model/representatives.py
@@ -3,9 +3,11 @@
 import numpy
 import math
 import copy
+import geojson as gj
 
 # our imports
-from emission.core.wrapper.trip_old import Trip, Coordinate
+import emission.core.wrapper.trip as ecwt
+import emission.core.wrapper.entry as ecwe
 import emission.storage.decorations.analysis_timeseries_queries as esda
 
 
@@ -26,9 +28,8 @@
 
 class representatives:
 
-    def __init__(self, data, labels, old=True):
+    def __init__(self, data, labels):
         self.data = data
-        self.is_old = old
         if not self.data:
             self.data = []
         self.labels = labels
@@ -52,37 +53,46 @@ def list_clusters(self):
             self.clusters[a].append(self.data[i])
 
     #get the representatives for each cluster
+    #I don't understand wtf this does
+    # Why are we getting the mean of the start and end points in the cluster and
+    # creating a fake trip from it? Why not just pick a real representative of
+    # of the trips? Alternatively, why not create a new data structure to represent
+    # that this is a reconstructed trip that has no bearing in reality? What does
+    # it even mean that we have a trip with only a start and end point and no
+    # actual start or end times?
     def get_reps(self):
         self.reps = []
         if not self.data:
             return
-        for cluster in self.clusters:
+        for i, cluster in enumerate(self.clusters):
+            logging.debug("Considering cluster %d = %s" % (i, cluster))
             points = [[], [], [], []]
-            for c in cluster:
-                if self.is_old:
-                    points[0].append(c.trip_start_location.lat)
-                    points[1].append(c.trip_start_location.lon)
-                    points[2].append(c.trip_end_location.lat)
-                    points[3].append(c.trip_end_location.lon)
-                else:
-                    # We want (lat, lon) to be consistent with old above.
-                    # But in the new, our data is in geojson so it is (lon, lat).
-                    # Fix it by flipping the order of the indices
-                    # Note also that we want to use the locations of the start
-                    # and end places, not of the start point of the trip, which
-                    # may be some distance away due to geofencing.
-                    start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
-                                                 c.data.start_place)
-                    end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
-                                                 c.data.end_place)
-                    points[0].append(start_place.data.location["coordinates"][1])
-                    points[1].append(start_place.data.location["coordinates"][0])
-                    points[2].append(end_place.data.location["coordinates"][1])
-                    points[3].append(end_place.data.location["coordinates"][0])
-                    logging.debug("in representatives, endpoints have len = %s" %
-                                  len(points))
+
+            # If this cluster has no points, we skip it
+            if len(cluster) == 0:
+                logging.info("Cluster %d = %s, has length %d, skipping" %
+                             (i, cluster, len(cluster)))
+                continue
+
+            for j, c in enumerate(cluster):
+                logging.debug("Consider point %d = %s" % (j, c))
+                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
+                                             c.data.start_place)
+                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
+                                             c.data.end_place)
+                points[0].append(start_place.data.location["coordinates"][1]) # lat
+                points[1].append(start_place.data.location["coordinates"][0]) # lng
+                points[2].append(end_place.data.location["coordinates"][1]) # lat
+                points[3].append(end_place.data.location["coordinates"][0]) # lng
+                logging.debug("in representatives, endpoints have len = %s" %
+                              len(points))
             centers = numpy.mean(points, axis=1)
-            a = Trip(None, None, None, None, None, None, Coordinate(centers[0], centers[1]), Coordinate(centers[2], centers[3]))
+            logging.debug("For cluster %d, centers are %s" % (i, centers))
+            t = ecwt.Trip({
+                "start_loc": gj.Point([centers[1], centers[0]]),
+                "end_loc": gj.Point([centers[3], centers[2]])
+            })
+            a = ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip", t)
             self.reps.append(a)
 
     #map the representatives
@@ -134,12 +144,12 @@ def locations(self):
             locs = []
             for b in bin:
                 if b[0] == 'start':
-                    point = self.reps[b[1]].trip_start_location
+                    point = self.reps[b[1]].data.start_loc
                 if b[0] == 'end':
-                    point = self.reps[b[1]].trip_end_location
-                locs.append([point.lat, point.lon])
+                    point = self.reps[b[1]].data.end_loc
+                locs.append(point.coordinates)
             locs = numpy.mean(locs, axis=0)
-            coord = Coordinate(locs[0], locs[1])
+            coord = [locs[0], locs[1]]
             self.locs.append(coord)
 
     #create the input to the tour graph
@@ -198,15 +208,16 @@ def cluster_dict(self):
     #check whether a point is close to all points in a bin
     def match(self, label, a, bin):
         if label == 'start':
-            pointa = self.reps[a].trip_start_location
+            pointa = self.reps[a].data.start_loc
         elif label == 'end':
-            pointa = self.reps[a].trip_end_location
+            pointa = self.reps[a].data.end_loc
         for b in bin:
             if b[0] == 'start':
-                pointb = self.reps[b[1]].trip_start_location
+                pointb = self.reps[b[1]].data.start_loc
             elif b[0] == 'end':
-                pointb = self.reps[b[1]].trip_end_location
-            if self.distance(pointa.lat, pointa.lon, pointb.lat, pointb.lon) > 300:
+                pointb = self.reps[b[1]].data.end_loc
+            if self.distance(pointa.coordinates[1], pointa.coordinates[0],
+                             pointb.coordinates[1], pointb.coordinates[0]) > 300:
                 return False
         return True