diff --git a/emission/analysis/modelling/tour_model/K_medoid.py b/emission/analysis/modelling/tour_model/K_medoid.py index 7435a2224..602761c71 100644 --- a/emission/analysis/modelling/tour_model/K_medoid.py +++ b/emission/analysis/modelling/tour_model/K_medoid.py @@ -1,16 +1,10 @@ # Standard imports from __future__ import division -import logging -import numpy as np -import math import random -import time # Our imports -from emission.core.get_database import get_routeDistanceMatrix_db,get_routeCluster_db,get_section_db -from emission.core.common import calDistance, getDisplayModes +from emission.core.get_database import get_routeDistanceMatrix_db,get_section_db from emission.analysis.modelling.tour_model.trajectory_matching.route_matching import fullMatchDistance,getRoute -from emission.analysis.modelling.tour_model.trajectory_matching.LCS import lcsScore Sections=get_section_db() diff --git a/emission/analysis/modelling/tour_model/cluster_pipeline.py b/emission/analysis/modelling/tour_model/cluster_pipeline.py index 2489ca8cb..a5bfd8a6a 100644 --- a/emission/analysis/modelling/tour_model/cluster_pipeline.py +++ b/emission/analysis/modelling/tour_model/cluster_pipeline.py @@ -1,6 +1,5 @@ # Standard imports import math -import datetime import uuid as uu import sys import logging @@ -10,13 +9,6 @@ import emission.analysis.modelling.tour_model.similarity as similarity import emission.analysis.modelling.tour_model.featurization as featurization import emission.analysis.modelling.tour_model.representatives as representatives - -from emission.core.wrapper.trip_old import Trip, Section, Fake_Trip - -import emission.core.wrapper.trip as ecwt -import emission.core.wrapper.section as ecws -import emission.storage.decorations.trip_queries as ecsdtq -import emission.storage.decorations.section_queries as ecsdsq import emission.storage.decorations.analysis_timeseries_queries as esda """ @@ -41,39 +33,18 @@ """ #read the data from the database. -def read_data(uuid=None, size=None, old=True): +def read_data(uuid=None): db = edb.get_trip_db() - if not old: - logging.debug("not old") - trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid, - time_query=None, geo_query=None) - return trips - - if old: - data = [] - trip_db = db - if uuid: - trips = trip_db.find({'user_id' : uuid, 'type' : 'move'}) - else: - trips = trip_db.find({'type' : 'move'}) - for t in trips: - try: - trip = Trip.trip_from_json(t) - except: - continue - if not (trip.trip_start_location and trip.trip_end_location and trip.start_time): - continue - data.append(trip) - if size: - if len(data) == size: - break - return data + trips = esda.get_entries(esda.CLEANED_TRIP_KEY, uuid, + time_query=None, geo_query=None) + logging.info("After reading data, returning %s trips" % len(trips)) + return trips #put the data into bins and cut off the lower portion of the bins -def remove_noise(data, radius, old=True): +def remove_noise(data, radius): if not data: return [], [] - sim = similarity.similarity(data, radius, old) + sim = similarity.similarity(data, radius) sim.bin_data() logging.debug('number of bins before filtering: %d' % len(sim.bins)) sim.delete_bins() @@ -81,10 +52,10 @@ def remove_noise(data, radius, old=True): return sim.newdata, sim.bins #cluster the data using k-means -def cluster(data, bins, old=True): +def cluster(data, bins): if not data: return 0, [], [] - feat = featurization.featurization(data, old=old) + feat = featurization.featurization(data) min = bins max = int(math.ceil(1.5 * bins)) feat.cluster(min_clusters=min, max_clusters=max) @@ -92,10 +63,10 @@ def cluster(data, bins, old=True): return feat.clusters, feat.labels, feat.data #prepare the data for the tour model -def cluster_to_tour_model(data, labels, old=True): +def cluster_to_tour_model(data, labels): if not data: return [] - repy = representatives.representatives(data, labels, old=old) + repy = representatives.representatives(data, labels) repy.list_clusters() repy.get_reps() repy.locations() @@ -103,12 +74,12 @@ def cluster_to_tour_model(data, labels, old=True): repy.cluster_dict() return repy.tour_dict -def main(uuid=None, old=True): - data = read_data(uuid, old=old) +def main(uuid=None): + data = read_data(uuid) logging.debug("len(data) is %d" % len(data)) - data, bins = remove_noise(data, 300, old=old) - n, labels, data = cluster(data, len(bins), old=old) - tour_dict = cluster_to_tour_model(data, labels, old=old) + data, bins = remove_noise(data, 300) + n, labels, data = cluster(data, len(bins)) + tour_dict = cluster_to_tour_model(data, labels) return tour_dict if __name__=='__main__': diff --git a/emission/analysis/modelling/tour_model/create_tour_model_matrix.py b/emission/analysis/modelling/tour_model/create_tour_model_matrix.py index 64bacf6ec..4af6c47eb 100644 --- a/emission/analysis/modelling/tour_model/create_tour_model_matrix.py +++ b/emission/analysis/modelling/tour_model/create_tour_model_matrix.py @@ -1,8 +1,6 @@ import logging -import emission.analysis.modelling.tour_model.tour_model_matrix as tm ##here -import emission.core.get_database as edb -import emission.core.wrapper.trip_old as trip +import emission.analysis.modelling.tour_model.tour_model_matrix as tm import emission.analysis.modelling.tour_model.cluster_pipeline as eamtcp from uuid import UUID import random, datetime, sys diff --git a/emission/analysis/modelling/tour_model/featurization.py b/emission/analysis/modelling/tour_model/featurization.py index ba5ae3167..d4ad6783b 100644 --- a/emission/analysis/modelling/tour_model/featurization.py +++ b/emission/analysis/modelling/tour_model/featurization.py @@ -1,18 +1,13 @@ # Standard imports import logging -import matplotlib -# matplotlib.use('Agg') import matplotlib.pyplot as plt -import math import numpy from sklearn.cluster import KMeans from sklearn import metrics import sys # our imports -from emission.core.wrapper.trip_old import Trip, Coordinate from kmedoid import kmedoids -import emission.storage.decorations.trip_queries as esdtq """ @@ -25,9 +20,8 @@ """ class featurization: - def __init__(self, data, old=True): + def __init__(self, data): self.data = data - self.is_old = old if not self.data: self.data = [] self.calculate_points() @@ -41,21 +35,14 @@ def calculate_points(self): if not self.data: return for trip in self.data: - if self.is_old: - start = trip.trip_start_location - end = trip.trip_end_location - else: - try: - start = trip.data.start_loc["coordinates"] - end = trip.data.end_loc["coordinates"] - except: - continue + try: + start = trip.data.start_loc["coordinates"] + end = trip.data.end_loc["coordinates"] + except: + continue if not (start and end): raise AttributeError('each trip must have valid start and end locations') - if self.is_old: - self.points.append([start.lon, start.lat, end.lon, end.lat]) - else: - self.points.append([start[0], start[1], end[0], end[1]]) + self.points.append([start[0], start[1], end[0], end[1]]) #cluster the data. input options: # - name (optional): the clustering algorithm to use. Options are 'kmeans' or 'kmedoids'. Default is kmeans. @@ -68,7 +55,7 @@ def cluster(self, name='kmeans', min_clusters=2, max_clusters=None): logging.debug("min_clusters < 2, setting min_clusters = 2") min_clusters = 2 if min_clusters > len(self.points): - sys.stderr.write('Maximum number of clusters is the number of data points.\n') + sys.stderr.write('Minimum number of clusters %d is greater than the number of data points %d.\n' % (min_clusters, len(self.points))) min_clusters = len(self.points)-1 if max_clusters == None: logging.debug("max_clusters is None, setting max_clusters = %d" % (len(self.points) - 1)) @@ -138,8 +125,8 @@ def check_clusters(self): if not self.labels: logging.debug('Please cluster before analyzing clusters.') return - logging.debug('number of clusters is %d' % str(self.clusters)) - logging.debug('silhouette score is %d' % str(self.sil)) + logging.debug('number of clusters is %d' % self.clusters) + logging.debug('silhouette score is %s' % self.sil) #map the clusters #TODO - move this to a file in emission.analysis.plotting to map clusters from the database diff --git a/emission/analysis/modelling/tour_model/representatives.py b/emission/analysis/modelling/tour_model/representatives.py index 88e71dcf1..31a16c5e3 100755 --- a/emission/analysis/modelling/tour_model/representatives.py +++ b/emission/analysis/modelling/tour_model/representatives.py @@ -3,9 +3,11 @@ import numpy import math import copy +import geojson as gj # our imports -from emission.core.wrapper.trip_old import Trip, Coordinate +import emission.core.wrapper.trip as ecwt +import emission.core.wrapper.entry as ecwe import emission.storage.decorations.analysis_timeseries_queries as esda @@ -26,9 +28,8 @@ class representatives: - def __init__(self, data, labels, old=True): + def __init__(self, data, labels): self.data = data - self.is_old = old if not self.data: self.data = [] self.labels = labels @@ -52,37 +53,46 @@ def list_clusters(self): self.clusters[a].append(self.data[i]) #get the representatives for each cluster + #I don't understand wtf this does + # Why are we getting the mean of the start and end points in the cluster and + # creating a fake trip from it? Why not just pick a real representative of + # of the trips? Alternatively, why not create a new data structure to represent + # that this is a reconstructed trip that has no bearing in reality? What does + # it even mean that we have a trip with only a start and end point and no + # actual start or end times? def get_reps(self): self.reps = [] if not self.data: return - for cluster in self.clusters: + for i, cluster in enumerate(self.clusters): + logging.debug("Considering cluster %d = %s" % (i, cluster)) points = [[], [], [], []] - for c in cluster: - if self.is_old: - points[0].append(c.trip_start_location.lat) - points[1].append(c.trip_start_location.lon) - points[2].append(c.trip_end_location.lat) - points[3].append(c.trip_end_location.lon) - else: - # We want (lat, lon) to be consistent with old above. - # But in the new, our data is in geojson so it is (lon, lat). - # Fix it by flipping the order of the indices - # Note also that we want to use the locations of the start - # and end places, not of the start point of the trip, which - # may be some distance away due to geofencing. - start_place = esda.get_entry(esda.CLEANED_PLACE_KEY, - c.data.start_place) - end_place = esda.get_entry(esda.CLEANED_PLACE_KEY, - c.data.end_place) - points[0].append(start_place.data.location["coordinates"][1]) - points[1].append(start_place.data.location["coordinates"][0]) - points[2].append(end_place.data.location["coordinates"][1]) - points[3].append(end_place.data.location["coordinates"][0]) - logging.debug("in representatives, endpoints have len = %s" % - len(points)) + + # If this cluster has no points, we skip it + if len(cluster) == 0: + logging.info("Cluster %d = %s, has length %d, skipping" % + (i, cluster, len(cluster))) + continue + + for j, c in enumerate(cluster): + logging.debug("Consider point %d = %s" % (j, c)) + start_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + c.data.start_place) + end_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + c.data.end_place) + points[0].append(start_place.data.location["coordinates"][1]) # lat + points[1].append(start_place.data.location["coordinates"][0]) # lng + points[2].append(end_place.data.location["coordinates"][1]) # lat + points[3].append(end_place.data.location["coordinates"][0]) # lng + logging.debug("in representatives, endpoints have len = %s" % + len(points)) centers = numpy.mean(points, axis=1) - a = Trip(None, None, None, None, None, None, Coordinate(centers[0], centers[1]), Coordinate(centers[2], centers[3])) + logging.debug("For cluster %d, centers are %s" % (i, centers)) + t = ecwt.Trip({ + "start_loc": gj.Point([centers[1], centers[0]]), + "end_loc": gj.Point([centers[3], centers[2]]) + }) + a = ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip", t) self.reps.append(a) #map the representatives @@ -134,12 +144,12 @@ def locations(self): locs = [] for b in bin: if b[0] == 'start': - point = self.reps[b[1]].trip_start_location + point = self.reps[b[1]].data.start_loc if b[0] == 'end': - point = self.reps[b[1]].trip_end_location - locs.append([point.lat, point.lon]) + point = self.reps[b[1]].data.end_loc + locs.append(point.coordinates) locs = numpy.mean(locs, axis=0) - coord = Coordinate(locs[0], locs[1]) + coord = [locs[0], locs[1]] self.locs.append(coord) #create the input to the tour graph @@ -198,15 +208,16 @@ def cluster_dict(self): #check whether a point is close to all points in a bin def match(self, label, a, bin): if label == 'start': - pointa = self.reps[a].trip_start_location + pointa = self.reps[a].data.start_loc elif label == 'end': - pointa = self.reps[a].trip_end_location + pointa = self.reps[a].data.end_loc for b in bin: if b[0] == 'start': - pointb = self.reps[b[1]].trip_start_location + pointb = self.reps[b[1]].data.start_loc elif b[0] == 'end': - pointb = self.reps[b[1]].trip_end_location - if self.distance(pointa.lat, pointa.lon, pointb.lat, pointb.lon) > 300: + pointb = self.reps[b[1]].data.end_loc + if self.distance(pointa.coordinates[1], pointa.coordinates[0], + pointb.coordinates[1], pointb.coordinates[0]) > 300: return False return True diff --git a/emission/analysis/modelling/tour_model/similarity.py b/emission/analysis/modelling/tour_model/similarity.py index 77d1a83ba..c806d512d 100644 --- a/emission/analysis/modelling/tour_model/similarity.py +++ b/emission/analysis/modelling/tour_model/similarity.py @@ -6,11 +6,7 @@ import matplotlib.pyplot as plt import numpy from sklearn import metrics -import sys -from numpy import cross from numpy.linalg import norm -import emission.storage.decorations.trip_queries as esdtq -import emission.storage.decorations.section_queries as esdsq import emission.storage.decorations.analysis_timeseries_queries as esda """ @@ -30,41 +26,30 @@ """ class similarity: - def __init__(self, data, radius, old=True): + def __init__(self, data, radius): self.data = data if not data: self.data = [] self.bins = [] self.radius = float(radius) - self.old = old - if not old: - for a in self.data: - # print "a is %s" % a - t = a - try: - start_place = esda.get_entry(esda.CLEANED_PLACE_KEY, - t.data.start_place) - end_place = esda.get_entry(esda.CLEANED_PLACE_KEY, - t.data.end_place) - start_lon = start_place.data.location["coordinates"][0] - start_lat = start_place.data.location["coordinates"][1] - end_lon = end_place.data.location["coordinates"][0] - end_lat = end_place.data.location["coordinates"][1] - logging.debug("endpoints are = (%s, %s) and (%s, %s)" % - (start_lon, start_lat, end_lon, end_lat)) - if self.distance(start_lat, start_lon, end_lat, end_lon): - self.data.remove(a) - except: - logging.exception("exception while getting start and end places for %s" % t) - self.data.remove(a) - else: - for a in range(len(self.data)-1, -1, -1): - start_lat = self.data[a].trip_start_location.lat - start_lon = self.data[a].trip_start_location.lon - end_lat = self.data[a].trip_end_location.lat - end_lon = self.data[a].trip_end_location.lon + for t in self.data: + logging.debug("Considering trip %s" % t) + try: + start_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + t.data.start_place) + end_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + t.data.end_place) + start_lon = start_place.data.location["coordinates"][0] + start_lat = start_place.data.location["coordinates"][1] + end_lon = end_place.data.location["coordinates"][0] + end_lat = end_place.data.location["coordinates"][1] + logging.debug("endpoints are = (%s, %s) and (%s, %s)" % + (start_lon, start_lat, end_lon, end_lat)) if self.distance(start_lat, start_lon, end_lat, end_lon): - self.data.pop(a) + self.data.remove(t) + except: + logging.exception("exception while getting start and end places for %s" % t) + self.data.remove(t) logging.debug('After removing trips that are points, there are %s data points' % len(self.data)) self.size = len(self.data) @@ -146,12 +131,8 @@ def elbow_distance(self): #check if two trips match def match(self,a,bin): for b in bin: - if not self.old: - if not self.distance_helper_new(a,b): - return False - else: - if not self.distance_helper(a,b): - return False + if not self.distance_helper(a, b): + return False return True #create the histogram @@ -187,12 +168,19 @@ def evaluate_bins(self): points = [] for bin in self.bins: for b in bin: - start_lat = self.data[b].trip_start_location.lat - start_lon = self.data[b].trip_start_location.lon - end_lat = self.data[b].trip_end_location.lat - end_lon = self.data[b].trip_end_location.lon + tb = self.data[b] + start_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + tb.data.start_place) + end_place = esda.get_entry(esda.CLEANED_PLACE_KEY, + tb.data.end_place) + start_lon = start_place.data.location["coordinates"][0] + start_lat = start_place.data.location["coordinates"][1] + end_lon = end_place.data.location["coordinates"][0] + end_lat = end_place.data.location["coordinates"][1] path = [start_lat, start_lon, end_lat, end_lon] points.append(path) + logging.debug("number of labels are %d, number of points are = %d" % + (len(labels), len(points))) a = metrics.silhouette_score(numpy.array(points), labels) logging.debug('number of bins is %d' % len(self.bins)) logging.debug('silhouette score is %d' % a) @@ -200,20 +188,8 @@ def evaluate_bins(self): #calculate the distance between two trips def distance_helper(self, a, b): - starta = self.data[a].trip_start_location - startb = self.data[b].trip_start_location - enda = self.data[a].trip_end_location - endb = self.data[b].trip_end_location - - start = self.distance(starta.lat, starta.lon, startb.lat, startb.lon) - end = self.distance(enda.lat, enda.lon, endb.lat, endb.lon) - if start and end: - return True - return False - - def distance_helper_new(self, a, b): - tripa = self.data[a] - tripb = self.data[b] + tripa = self.data[a].data + tripb = self.data[b].data starta = tripa.start_loc["coordinates"] startb = tripb.start_loc["coordinates"] diff --git a/emission/analysis/modelling/tour_model/tour_model_matrix.py b/emission/analysis/modelling/tour_model/tour_model_matrix.py index 3b5f6af13..59b173b1f 100644 --- a/emission/analysis/modelling/tour_model/tour_model_matrix.py +++ b/emission/analysis/modelling/tour_model/tour_model_matrix.py @@ -5,7 +5,7 @@ # Standard imports import numpy as np -import math, datetime, heapq +import datetime, heapq import networkx as nx import matplotlib.pyplot as plt diff --git a/emission/storage/decorations/common_place_queries.py b/emission/storage/decorations/common_place_queries.py index b0798633c..95736e109 100644 --- a/emission/storage/decorations/common_place_queries.py +++ b/emission/storage/decorations/common_place_queries.py @@ -42,17 +42,6 @@ def clear_existing_places(user_id): db = edb.get_common_place_db() db.remove({'user_id': user_id}) -def get_all_place_objs(common_place): - trip.trips = [unc_trip.get_id() for unc_trip in dct["sections"]] - place_db = edb.get_place_db() - start_places = [] - end_places = [] - for t in trip.trips: - start = place_db.find_one({"_id" : t.start_place}) - end = place_db.find_one({"_id" : t.end_place}) - start_places.append(start) - end_places.append(end) - ################################################################################ def create_places(list_of_cluster_data, user_id): @@ -60,10 +49,11 @@ def create_places(list_of_cluster_data, user_id): places_dct = {} logging.debug("About to create places for %d clusters" % len(list_of_cluster_data)) for dct in list_of_cluster_data: + logging.debug("Current coords = %s" % dct) start_name = dct['start'] end_name = dct['end'] - start_loc = gj.Point(dct['start_coords'].coordinate_list()) - end_loc = gj.Point(dct['end_coords'].coordinate_list()) + start_loc = gj.Point(dct['start_coords']) + end_loc = gj.Point(dct['end_coords']) start_loc_str = gj.dumps(start_loc, sort_keys=True) end_loc_str = gj.dumps(end_loc, sort_keys=True) if start_loc_str not in places_to_successors: diff --git a/emission/storage/decorations/common_trip_queries.py b/emission/storage/decorations/common_trip_queries.py index 32eb3f56a..6b5d08ec0 100644 --- a/emission/storage/decorations/common_trip_queries.py +++ b/emission/storage/decorations/common_trip_queries.py @@ -103,8 +103,8 @@ def set_up_trips(list_of_cluster_data, user_id): for dct in list_of_cluster_data: start_times = [] durations = [] - start_loc = gj.Point(dct['start_coords'].coordinate_list()) - end_loc = gj.Point(dct['end_coords'].coordinate_list()) + start_loc = gj.Point(dct['start_coords']) + end_loc = gj.Point(dct['end_coords']) start_place_id = esdcpq.get_common_place_at_location(start_loc).get_id() end_place_id = esdcpq.get_common_place_at_location(end_loc).get_id() #print 'dct["sections"].trip_id %s is' % dct["sections"][0] diff --git a/emission/storage/decorations/tour_model_queries.py b/emission/storage/decorations/tour_model_queries.py index 9890b0d58..6f169c6b0 100644 --- a/emission/storage/decorations/tour_model_queries.py +++ b/emission/storage/decorations/tour_model_queries.py @@ -47,7 +47,7 @@ def get_common_trips(user_id): def make_tour_model_from_raw_user_data(user_id): try: - list_of_cluster_data = eamtmcp.main(user_id, False) + list_of_cluster_data = eamtmcp.main(user_id) esdcpq.create_places(list_of_cluster_data, user_id) esdctq.set_up_trips(list_of_cluster_data, user_id) except ValueError as e: diff --git a/emission/tests/analysisTests/TestClusterPipeline.py b/emission/tests/analysisTests/TestClusterPipeline.py index 6bd00146f..fecdd9cb3 100644 --- a/emission/tests/analysisTests/TestClusterPipeline.py +++ b/emission/tests/analysisTests/TestClusterPipeline.py @@ -32,59 +32,59 @@ def import_test_info(self): eaicr.clean_and_resample(self.testUUID) def testSanity(self): - cp.main(self.testUUID, False) + cp.main(self.testUUID) def testReadData(self): - data = cp.read_data(uuid=self.testUUID, old=False) + data = cp.read_data(uuid=self.testUUID) # Test to make sure something is happening self.assertTrue(len(data) > 5) # Test to make sure that the trips are mapped to the correct uuid - bad_data = cp.read_data(uuid="FakeUUID", old=False) + bad_data = cp.read_data(uuid="FakeUUID") self.assertEqual(len(bad_data), 0) def testRemoveNoise(self): - data = cp.read_data(uuid=self.testUUID, old=False) + data = cp.read_data(uuid=self.testUUID) # Test to make sure the code doesn't break on an empty dataset - new_data, bins = cp.remove_noise(None, self.RADIUS, False) + new_data, bins = cp.remove_noise(None, self.RADIUS) self.assertTrue(len(new_data) == len(bins) == 0) #Test to make sure some or no data was filtered out, but that nothing was added after filtering - new_data, bins = cp.remove_noise(None, self.RADIUS, False) + new_data, bins = cp.remove_noise(None, self.RADIUS) self.assertTrue(len(new_data) <= len(data)) # Make sure there are not more bins than data; that wouldnt make sense self.assertTrue(len(bins) <= len(data)) def testCluster(self): - data = cp.read_data(uuid=self.testUUID, old=False) + data = cp.read_data(uuid=self.testUUID) # Test to make sure empty dataset doesn't crash the program - clusters, labels, new_data = cp.cluster([], 10, False) + clusters, labels, new_data = cp.cluster([], 10) self.assertTrue(len(new_data) == clusters == len(labels) == 0) # Test to make sure clustering with noise works - clusters, labels, new_data = cp.cluster(data, 10, False) + clusters, labels, new_data = cp.cluster(data, 10) self.assertEqual(len(labels), len(new_data)) self.assertEqual(cmp(new_data, data), 0) # Test to make sure clustering without noise works - data, bins = cp.remove_noise(data, self.RADIUS, False) - clusters, labels, new_data = cp.cluster(data, len(bins), False) + data, bins = cp.remove_noise(data, self.RADIUS) + clusters, labels, new_data = cp.cluster(data, len(bins)) self.assertTrue(clusters == 0 or len(bins) <= clusters <= len(bins) + 10) def testClusterToTourModel(self): # Test to make sure it doesn't crash on a empty dataset - data = cp.cluster_to_tour_model(None, None, False) + data = cp.cluster_to_tour_model(None, None) self.assertFalse(data) # Test with the real dataset - data = cp.read_data(uuid=self.testUUID, old=False) - data, bins = cp.remove_noise(data, self.RADIUS, False) - n, labels, data = cp.cluster(data, len(bins), False) - tour_dict = cp.main(uuid=self.testUUID, old=False) + data = cp.read_data(uuid=self.testUUID) + data, bins = cp.remove_noise(data, self.RADIUS) + n, labels, data = cp.cluster(data, len(bins)) + tour_dict = cp.main(uuid=self.testUUID) self.assertTrue(len(tour_dict) <= n) diff --git a/emission/tests/analysisTests/TestFeaturization.py b/emission/tests/analysisTests/TestFeaturization.py index b6ff53240..bc14ce654 100644 --- a/emission/tests/analysisTests/TestFeaturization.py +++ b/emission/tests/analysisTests/TestFeaturization.py @@ -1,35 +1,36 @@ import unittest +import time +import uuid +import logging + import emission.core.get_database as edb -import sys import emission.analysis.modelling.tour_model.featurization as featurization -from emission.core.wrapper.trip_old import Trip, Coordinate import emission.analysis.modelling.tour_model.cluster_pipeline as cp -import emission.simulation.trip_gen as tg -import datetime -import os, os.path +import emission.storage.timeseries.abstract_timeseries as esta + +import emission.tests.analysisTests.tourModelTests.common as etatc class FeaturizationTests(unittest.TestCase): def __init__(self, *args, **kwargs): super(FeaturizationTests, self).__init__(*args, **kwargs) - self.data = cp.read_data(size=100) - print 'there are ' + str(len(self.data)) - #if len(self.data) == 0: - # tg.create_fake_trips() - # self.data = cp.read_data(size=100) def setUp(self): - pass + self.data = cp.read_data() + self.testUUID = uuid.uuid4() + self.ts = esta.TimeSeries.get_time_series(self.testUUID) + print 'there are ' + str(len(self.data)) def tearDown(self): - pass + edb.get_timeseries_db().remove({'user_id': self.testUUID}) + edb.get_analysis_timeseries_db().remove({'user_id': self.testUUID}) def testCalculatePoints(self): feat = featurization.featurization([]) self.assertTrue(not feat.data) feat = featurization.featurization(None) self.assertTrue(not feat.data) - trip = Trip(None, None, None, None, None, None, None, None) + trip = etatc._createTripEntry(self, None, None, None, None) data = [trip] try: feat = featurization.featurization(data) @@ -64,17 +65,16 @@ def testCluster(self): self.assertTrue(False) data = [] - start = Coordinate(47,-122) - end = Coordinate(47,-123) + start = [-122, 47] + end = [-123,47] + now = time.time() for i in range(10): - now = datetime.datetime.now() - a = Trip(None, None, None, None, now, now, start, end) + a = etatc._createTripEntry(self, now, now, start, end) data.append(a) - start = Coordinate(41,-74) - end = Coordinate(42, -74) + start = [-74, 41] + end = [-74, 42] for i in range(10): - now = datetime.datetime.now() - a = Trip(None, None, None, None, now, now, start, end) + a = etatc._createTripEntry(self, now, now, start, end) data.append(a) feat = featurization.featurization(data) feat.cluster() @@ -87,8 +87,10 @@ def testCheckClusters(self): feat.cluster(min_clusters=2, max_clusters=10) try: feat.check_clusters() - except Exception: + except Exception, e: + logging.exception(e.message) self.assertTrue(False) if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) unittest.main() diff --git a/emission/tests/analysisTests/TestRepresentatives.py b/emission/tests/analysisTests/TestRepresentatives.py index 905b9b028..0c0e463e4 100644 --- a/emission/tests/analysisTests/TestRepresentatives.py +++ b/emission/tests/analysisTests/TestRepresentatives.py @@ -1,25 +1,24 @@ import unittest -import emission.core.get_database as edb +import time +import logging + import emission.analysis.modelling.tour_model.representatives as rep -import emission.simulation.trip_gen as tg import emission.analysis.modelling.tour_model.featurization as feat -import emission.analysis.modelling.tour_model.cluster_pipeline as cp -from emission.core.wrapper.trip_old import Trip, Coordinate + +import emission.tests.analysisTests.tourModelTests.common as etatc class RepresentativesTests(unittest.TestCase): def __init__(self, *args, **kwargs): super(RepresentativesTests, self).__init__(*args, **kwargs) - self.data = cp.read_data(size=100) - #if len(self.data) == 0: - # tg.create_fake_trips() - # self.data = cp.read_data(size=100) - print 'there are ' + str(len(self.data)) + + def setUp(self): + etatc._setup(self) n = len(self.data)/5 - self.labels = feat.featurization(self.data).cluster(min_clusters=n, max_clusters=n) + self.labels = feat.featurization(self.data).cluster(min_clusters=n, max_clusters=n) def tearDown(self): - pass + etatc._tearDown(self) def testInit(self): repy = rep.representatives(None, None) @@ -55,18 +54,20 @@ def testReps(self): repy.get_reps() self.assertTrue(len(repy.reps) == len(repy.clusters)) clusters = [0] - tripa = Trip(None, None, None, None, None, None, Coordinate(1,2), Coordinate(3,4)) - tripb = Trip(None, None, None, None, None, None, Coordinate(9,10), Coordinate(5,8)) - tripc = Trip(None, None, None, None, None, None, Coordinate(5,6), Coordinate(4,6)) + now = time.time() + tripa = etatc._createTripEntry(self, now, now, [1,2], [3,4]) + tripb = etatc._createTripEntry(self, now, now, [9,10], [5,8]) + tripc = etatc._createTripEntry(self, now, now, [5,6], [4,6]) data = [tripa, tripb, tripc] labels = [0,0,0] repy = rep.representatives(data, labels) repy.list_clusters() repy.get_reps() - self.assertTrue(repy.reps[0].trip_start_location.lat == 5) - self.assertTrue(repy.reps[0].trip_start_location.lon == 6) - self.assertTrue(repy.reps[0].trip_end_location.lat == 4) - self.assertTrue(repy.reps[0].trip_end_location.lon == 6) + logging.debug("repy.reps[0].data.start_loc = %s" % repy.reps[0].data.start_loc) + self.assertEqual(repy.reps[0].data.start_loc.coordinates[0], 5) + self.assertEqual(repy.reps[0].data.start_loc.coordinates[1], 6) + self.assertEqual(repy.reps[0].data.end_loc.coordinates[0], 4) + self.assertEqual(repy.reps[0].data.end_loc.coordinates[1], 6) def testLocations(self): repy = rep.representatives(self.data, self.labels) @@ -78,16 +79,17 @@ def testLocations(self): for i in range(len(bin)): b = bin[i] if b[0] == 'start': - a = repy.reps[b[1]].trip_start_location + a = repy.reps[b[1]].data.start_loc if b[0] == 'end': - a = repy.reps[b[1]].trip_end_location + a = repy.reps[b[1]].data.end_loc for j in range(i): c = bin[j] if c[0] == 'start': - d = repy.reps[c[1]].trip_start_location + d = repy.reps[c[1]].data.start_loc if c[0] == 'end': - d = repy.reps[c[1]].trip_end_location - self.assertTrue(repy.distance(a.lat, a.lon, d.lat, d.lon) < 300) + d = repy.reps[c[1]].data.end_loc + self.assertTrue(repy.distance(a.coordinates[1], a.coordinates[0], + d.coordinates[1], d.coordinates[0]) < 300) total += len(bin) self.assertTrue(total == 2 * repy.num_clusters) for i in range(repy.num_clusters): @@ -95,24 +97,25 @@ def testLocations(self): self.assertTrue(sum(bin.count(('end',i)) for bin in repy.bins) == 1) self.assertTrue(len(repy.locs) == len(repy.bins)) - tripa = Trip(None, None, None, None, None, None, Coordinate(1,2), Coordinate(30,40)) - tripb = Trip(None, None, None, None, None, None, Coordinate(1.0000002,2.0000002), Coordinate(55.0000002,85.0000002)) - tripc = Trip(None, None, None, None, None, None, Coordinate(30.0000002,40.0000002), Coordinate(55,85)) + now = time.time() + tripa = etatc._createTripEntry(self, now, now, [1,2], [30,40]) + tripb = etatc._createTripEntry(self, now, now, [1.0000002,2.0000002], [55.0000002,85.0000002]) + tripc = etatc._createTripEntry(self, now, now, [30.0000002,40.0000002], [55,85]) data = [tripa, tripb, tripc] labels = [0,1,2] repy = rep.representatives(data, labels) repy.list_clusters() repy.get_reps() repy.locations() - self.assertTrue(repy.bins[0] == [('start', 0), ('start', 1)]) - self.assertTrue(repy.bins[1] == [('end', 0), ('start', 2)]) - self.assertTrue(repy.bins[2] == [('end', 1), ('end', 2)]) - self.assertTrue(round(repy.locs[0].lat,7) == 1.0000001) - self.assertTrue(round(repy.locs[0].lon,7) == 2.0000001) - self.assertTrue(round(repy.locs[1].lat,7) == 30.0000001) - self.assertTrue(round(repy.locs[1].lon,7) == 40.0000001) - self.assertTrue(round(repy.locs[2].lat,7) == 55.0000001) - self.assertTrue(round(repy.locs[2].lon,7) == 85.0000001) + self.assertEqual(repy.bins[0], [('start', 0), ('start', 1)]) + self.assertEqual(repy.bins[1], [('end', 0), ('start', 2)]) + self.assertEqual(repy.bins[2], [('end', 1), ('end', 2)]) + self.assertAlmostEqual(repy.locs[0][0], 1.0000001, places=7) + self.assertAlmostEqual(repy.locs[0][1], 2.0000001, places=7) + self.assertAlmostEqual(repy.locs[1][0], 30.0000001, places=7) + self.assertAlmostEqual(repy.locs[1][1], 40.0000001, places=7) + self.assertAlmostEqual(repy.locs[2][0], 55.0000001, places=7) + self.assertAlmostEqual(repy.locs[2][1], 85.0000001, places=7) def testClusterDict(self): repy = rep.representatives(self.data, self.labels) @@ -127,19 +130,20 @@ def testClusterDict(self): self.assertTrue(('start', i) in repy.bins[cluster['start']]) self.assertTrue(('end', i) in repy.bins[cluster['end']]) for d in repy.clusters[i]: - tripid = d.trip_id - tripy = next((x for x in cluster['sections'] if x.trip_id == tripid), None) + tripid = d.get_id() + tripy = next((x for x in cluster['sections'] if x.get_id() == tripid), None) self.assertTrue(tripy) - self.assertTrue(sum(sum(t.trip_id == tripid for t in cluster['sections']) for cluster in repy.self_loops_tour_dict) == 1) + self.assertTrue(sum(sum(t.get_id() == tripid for t in cluster['sections']) for cluster in repy.self_loops_tour_dict) == 1) for c in repy.tour_dict: self.assertTrue(c['start'] != c['end']) def testMatch(self): - tripa = Trip(None, None, None, None, None, None, Coordinate(1,2), Coordinate(3,4)) - tripb = Trip(None, None, None, None, None, None, Coordinate(3,4), Coordinate(1,2)) - tripc = Trip(None, None, None, None, None, None, Coordinate(1,2), Coordinate(9,10)) + now = time.time() + tripa = etatc._createTripEntry(self, now, now, [1,2], [3,4]) + tripb = etatc._createTripEntry(self, now, now, [3,4], [1,2]) + tripc = etatc._createTripEntry(self, now, now, [1,2], [9,10]) data = [tripa, tripb, tripc] labels = [0,1,2] @@ -155,4 +159,5 @@ def testMatch(self): self.assertTrue(not repy.match('end', 2, bin)) if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) unittest.main() diff --git a/emission/tests/analysisTests/TestSimilarity.py b/emission/tests/analysisTests/TestSimilarity.py index 4c7400a29..37c42bbac 100644 --- a/emission/tests/analysisTests/TestSimilarity.py +++ b/emission/tests/analysisTests/TestSimilarity.py @@ -1,29 +1,36 @@ +import logging import unittest +import uuid +import time +import datetime +import os, os.path + +import emission.tests.analysisTests.tourModelTests.common as etatc + import emission.core.get_database as edb + import emission.analysis.modelling.tour_model.similarity as similarity -import emission.simulation.trip_gen as tg -import math -from emission.core.wrapper.trip_old import Trip, Coordinate import emission.analysis.modelling.tour_model.cluster_pipeline as cp -import datetime -import os, os.path + +import emission.storage.timeseries.abstract_timeseries as esta class SimilarityTests(unittest.TestCase): - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): super(SimilarityTests, self).__init__(*args, **kwargs) - self.data = cp.read_data(size=100) + + def setUp(self): + self.testUUID = uuid.uuid4() + self.data = cp.read_data() #if len(self.data) == 0: # tg.create_fake_trips() # self.data = cp.read_data(size=100) - print 'there are ' + str(len(self.data)) - - def setUp(self): - pass + logging.info("Found %s trips" % len(self.data)) + self.ts = esta.TimeSeries.get_time_series(self.testUUID) def tearDown(self): - return - + edb.get_timeseries_db().remove({'user_id': self.testUUID}) + edb.get_analysis_timeseries_db().remove({'user_id': self.testUUID}) def testInit(self): try: @@ -33,15 +40,18 @@ def testInit(self): except Exception: self.assertTrue(False) + logging.debug("STARTING init test") sim = similarity.similarity([], 100) self.assertTrue(len(sim.data) == 0) - now = datetime.datetime.now() - start = Coordinate(47,-122) - end = Coordinate(47,-123) - t1 = Trip(None, None, None, None, now, now, start, start) - t2 = Trip(None, None, None, None, now, now, start, end) + now = time.time() + start = [-122,47] + end = [-123,47] + t1 = etatc._createTripEntry(self, now, now, start, start) + t2 = etatc._createTripEntry(self, now, now, start, end) sim = similarity.similarity([t1, t2], 100) + logging.debug("sim.data = %s" % sim.data) simmy = similarity.similarity([t2], 100) + logging.debug("simmy.data = %s" % simmy.data) self.assertTrue(sim.data == simmy.data) def testBinData(self): @@ -62,17 +72,15 @@ def testBinData(self): self.assertTrue(len(sim.bins[i]) >= len(sim.bins[i+1])) data = [] - now = datetime.datetime.now() - start = Coordinate(47,-122) - end = Coordinate(47,-123) + now = time.time() + start = [-122, 47] + end = [-123, 47] for i in range(10): - a = Trip(None, None, None, None, now, now, start, end) - data.append(a) - start = Coordinate(41,-74) - end = Coordinate(42, -74) + data.append(etatc._createTripEntry(self, now, now, start, end)) + start = [-74, 41] + end = [-74, 42] for i in range(10): - a = Trip(None, None, None, None, now, now, start, end) - data.append(a) + data.append(etatc._createTripEntry(self, now, now, start, end)) sim = similarity.similarity(data, 300) sim.bin_data() self.assertTrue(len(sim.bins) == 2) @@ -86,10 +94,10 @@ def testDeleteBins(self): self.assertTrue(b == sim.num) def testElbowDistance(self): - start = Coordinate(47,-122) - end = Coordinate(47,-123) - now = datetime.datetime.now() - t = Trip(None, None, None, None, now, now, start, end) + start = [-122,47] + end = [-123,47] + now = time.time() + t = etatc._createTripEntry(self, now, now, start, end) data = [t] * 11 bins = [[1,2,3,4], [5,6,10], [7], [8], [9], [0]] sim = similarity.similarity(data, 300) @@ -106,15 +114,15 @@ def testMatch(self): self.assertTrue(sim.distance_helper(b,c)) def testDistance(self): - start = Coordinate(-122.259447, 37.875174) - end1 = Coordinate(-122.259279, 37.875479) - end2 = Coordinate(-122.252287, 37.869569) - now = datetime.datetime.now() - t1 = Trip(None, None, None, None, now, now, start, end1) - t2 = Trip(None, None, None, None, now, now, start, end2) + start = [-122.259447, 37.875174] + end1 = [-122.259279, 37.875479] + end2 = [-122.252287, 37.869569] + now = time.time() + t1 = etatc._createTripEntry(self, now, now, start, end1) + t2 = etatc._createTripEntry(self, now, now, start, end2) sim = similarity.similarity(self.data, 300) - self.assertTrue(sim.distance(start.lat, start.lon, end1.lat, end1.lon)) - self.assertTrue(not sim.distance(start.lat, start.lon, end2.lat, end2.lon)) + self.assertTrue(sim.distance(start[1], start[0], end1[1], end1[0])) + self.assertTrue(not sim.distance(start[1], start[0], end2[1], end2[0])) def testGraph(self): if os.path.isfile('./histogram.png'): @@ -136,12 +144,11 @@ def testEvaluateBins(self): a = sim.evaluate_bins() self.assertTrue(not a) sim = similarity.similarity(self.data, 300) - b = sim.evaluate_bins() - self.assertTrue(not b) sim.bin_data() c = sim.evaluate_bins() if sim.data: self.assertTrue(c) if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) unittest.main() diff --git a/emission/tests/analysisTests/TestUserModel.py b/emission/tests/analysisTests/TestUserModel.py index 154d71d3c..30c135d61 100644 --- a/emission/tests/analysisTests/TestUserModel.py +++ b/emission/tests/analysisTests/TestUserModel.py @@ -1,10 +1,6 @@ import unittest import emission.user_model_josh.utility_model as eum -import googlemaps import emission.net.ext_service.otp.otp as otp -import emission.net.ext_service.gmaps.googlemaps as gmaps -import emission.net.ext_service.gmaps.common as gmcommon -import emission.net.api.utility_model_api as umapi import datetime diff --git a/emission/tests/analysisTests/tourModelTests/__init__.py b/emission/tests/analysisTests/tourModelTests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/emission/tests/analysisTests/tourModelTests/common.py b/emission/tests/analysisTests/tourModelTests/common.py new file mode 100644 index 000000000..47aabfae8 --- /dev/null +++ b/emission/tests/analysisTests/tourModelTests/common.py @@ -0,0 +1,46 @@ +import logging +import geojson as gj +import uuid + +import emission.core.wrapper.cleanedtrip as ecwct +import emission.core.wrapper.entry as ecwe +import emission.core.wrapper.cleanedplace as ecwcp +import emission.core.get_database as edb + +import emission.analysis.modelling.tour_model.cluster_pipeline as cp +import emission.storage.timeseries.abstract_timeseries as esta + +def _createTripEntry(self, start_ts, end_ts, start_loc, end_loc): + t = ecwct.Cleanedtrip() + t.start_ts = start_ts + t.end_ts = end_ts + t.start_loc = gj.Point(start_loc) + t.end_loc = gj.Point(end_loc) + sp = ecwcp.Cleanedplace() + sp.location = t.start_loc + sp.exit_ts = start_ts + ep = ecwcp.Cleanedplace() + ep.location = t.end_loc + ep.enter_ts = end_ts + spe = ecwe.Entry.create_entry(self.testUUID, "analysis/cleaned_place", sp, create_id=True) + epe = ecwe.Entry.create_entry(self.testUUID, "analysis/cleaned_place", ep, create_id=True) + t.start_place = spe.get_id() + t.end_place = epe.get_id() + te = ecwe.Entry.create_entry(self.testUUID, "analysis/cleaned_trip", t, create_id=True) + self.ts.insert(spe) + self.ts.insert(epe) + self.ts.insert(te) + return te + +def _setup(self): + self.data = cp.read_data() + #if len(self.data) == 0: + # tg.create_fake_trips() + # self.data = cp.read_data(size=100) + print 'there are ' + str(len(self.data)) + self.testUUID = uuid.uuid4() + self.ts = esta.TimeSeries.get_time_series(self.testUUID) + +def _tearDown(self): + edb.get_timeseries_db().remove({'user_id': self.testUUID}) + edb.get_analysis_timeseries_db().remove({'user_id': self.testUUID}) diff --git a/emission/tests/storageTests/TestCommonPlaceQueries.py b/emission/tests/storageTests/TestCommonPlaceQueries.py index e330bd184..c0563ae94 100644 --- a/emission/tests/storageTests/TestCommonPlaceQueries.py +++ b/emission/tests/storageTests/TestCommonPlaceQueries.py @@ -52,7 +52,7 @@ def testCreatePlace(self): estfm.move_all_filters_to_data() eaist.segment_current_trips(self.testUUID) eaiss.segment_current_sections(self.testUUID) - data = eamtcp.main(self.testUUID, False) + data = eamtcp.main(self.testUUID) esdcpq.create_places(data, self.testUUID) places = esdcpq.get_all_common_places_for_user(self.testUUID) places_list = [] diff --git a/emission/tests/storageTests/TestCommonTripQueries.py b/emission/tests/storageTests/TestCommonTripQueries.py index de7c37267..273321f5c 100644 --- a/emission/tests/storageTests/TestCommonTripQueries.py +++ b/emission/tests/storageTests/TestCommonTripQueries.py @@ -106,7 +106,7 @@ def testCreateFromData(self): def get_fake_data(user_name): # Call with a username unique to your database tg.create_fake_trips(user_name, True) - return eamtcp.main(user_name, old=False) + return eamtcp.main(user_name) if __name__ == "__main__": diff --git a/emission/user_model_josh/utility_model.py b/emission/user_model_josh/utility_model.py index a8184f6f9..685c7ffc1 100644 --- a/emission/user_model_josh/utility_model.py +++ b/emission/user_model_josh/utility_model.py @@ -13,7 +13,6 @@ import json import heapq import time -import googlemaps import requests import random @@ -127,7 +126,7 @@ def get_top_choice_places(self, start_place, end_place): return self.get_top_choices_lat_lng(start, end) def get_all_trips(self, start, end, curr_time=None): - c = googlemaps.client.Client(GOOGLE_MAPS_KEY) + c = gmaps.client.Client(GOOGLE_MAPS_KEY) if curr_time is None: curr_time = datetime.datetime.now() curr_month = curr_time.month @@ -505,9 +504,9 @@ def get_elevation_change(trip, testing=False): down = random.randint(1, 100) return (up, down) time.sleep(1) # so we dont run out calls - c = googlemaps.client.Client(GOOGLE_MAPS_KEY) + c = gmaps.client.Client(GOOGLE_MAPS_KEY) print get_route(trip) - jsn = googlemaps.elevation.elevation_along_path(c, get_route(trip), 200) + jsn = gmaps.elevation.elevation_along_path(c, get_route(trip), 200) up, down = 0, 0 prev = None for item in jsn: