From 6dc499512191354fa0b55343e1d6126d65832551 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 17 Nov 2022 15:21:54 -0800 Subject: [PATCH 01/12] Blueprint for GBDT, work in progress --- .../gradient_boosted_decision_tree.py | 122 ++++++++++++++++++ .../TestGradientBoostedDecisionTree.py | 76 +++++++++++ 2 files changed, 198 insertions(+) create mode 100644 emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py create mode 100644 emission/tests/modellingTests/TestGradientBoostedDecisionTree.py diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py new file mode 100644 index 000000000..646ba0a1b --- /dev/null +++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py @@ -0,0 +1,122 @@ +import logging +from tokenize import group +from typing import Dict, List, Optional, Tuple + +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import GradientBoostingClassifier +import sklearn.metrics as sm + +import emission.storage.timeseries.abstract_timeseries as esta +import emission.analysis.modelling.tour_model.label_processing as lp +import emission.analysis.modelling.trip_model.trip_model as eamuu +import emission.analysis.modelling.trip_model.util as util +import emission.analysis.modelling.trip_model.config as eamtc +import emission.core.wrapper.confirmedtrip as ecwc + + +class GradientBoostedDecisionTree(eamuu.TripModel): + + is_incremental: bool = False # overwritten during __init__ + class_map: dict = {} # overwritten during fit + + def __init__(self, config=None): + """ + Instantiate a gradient boosted decision tree for all users. + + This uses the sklearn implementation of a gradient boosted + decision tree to classify unlabeled replacement modes. + + https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html + + Replacement modes are considered to be the second-best choice for + a given trip (i.e., what mode would have been chosen if the actual + choice wasn't available). + + The model is currently trained on data from all users. + """ + if config is None: + config = eamtc.get_config_value_or_raise('model_parameters.gbdt') + logging.debug(f'GradientBoostedDecisionTree loaded model config from file') + else: + logging.debug(f'GradientBoostedDecisionTree using model config argument') + expected_keys = [ + 'incremental_evaluation', + 'feature_list', + 'dependent_var' + ] + for k in expected_keys: + if config.get(k) is None: + msg = f"gbdt trip model config missing expected key {k}" + raise KeyError(msg) + self.is_incremental = config['incremental_evaluation'] + # Use the sklearn implementation of a GBDT + self.gbdt = GradientBoostingClassifier(n_estimators=50) + # Which features to use in the fit/prediction + self.feature_list = config['feature_list'] + self.dependent_var = config['dependent_var'] + + def fit(self, trips: List[ecwc.Confirmedtrip]): + """train the model by passing data, where each row in the data + corresponds to a label at the matching index of the label input + + :param trips: 2D array of features to train from + """ + logging.debug(f'fit called with {len(trips)} trips') + unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) + if len(unlabeled) > 0: + msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' + raise Exception(msg) + X_train, y_train = self.extract_features(trips) + self.gbdt.fit(X_train, y_train) + logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data") + + def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: + logging.debug(f"running gradient boosted mode prediction") + X_train, y_train = self.extract_features(trip) + y_pred = self.gbdt.predict(X_train) + if y_pred is None: + logging.debug(f"unable to predict bin for trip {trip}") + return [] + else: + logging.debug(f"made predictions {y_pred}") + return y_pred + + def to_dict(self) -> Dict: + return self.gbdt + + def from_dict(self, model: Dict): + self.gbdt = model + + def extract_features(self, trips: ecwc.Confirmedtrip) -> List[float]: + # TODO: need to enable generic paths other than just user input for features + X = pd.DataFrame( + [[trip['data']['user_input'][x] for x in self.feature_list] for trip in trips], + columns=self.feature_list + ) + y = pd.DataFrame( + [trip['data']['user_input'][self.dependent_var] for trip in trips], + columns=[self.dependent_var] + ) + # Clean up and recode the feature columns for training/prediction + X_processed, y_processed = self._process_data(X, y) + return X_processed, y_processed + + def _process_data(self, X, y): + """ + helper function to transform binned features and labels. + """ + # Any non-numeric dtype must be one-hot encoded (if unordered) or numerically coded (if ordered) + dummies = [] + for col in X: + if X[col].dtype=='object': + dummies.append(pd.get_dummies(X[col], prefix=col)) + X = pd.concat(dummies, axis=1) + # The outcome must be a single categorical column; recode to numeric + for col in y: + cat_list = list(pd.unique(y[col])).sort() + if y[col].dtype=='object': + y[col] = pd.Categorical(y[col], ordered=True, categories=cat_list) + y[col] = y[col].cat.codes + return X, y diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py new file mode 100644 index 000000000..f49b6c389 --- /dev/null +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -0,0 +1,76 @@ +import unittest +import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg +import emission.tests.modellingTests.modellingTestAssets as etmm +import logging + + +class TestGradientBoostedDecisionTree(unittest.TestCase): + + def setUp(self) -> None: + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + def testSmoke(self): + """ + the model should fit and predict on normal data without errors + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": ['mode_confirm','purpose_confirm'], + "dependent_var": 'replaced_mode' + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(trips) + model.predict(trips) + + def testUnseenTrainingClasses(self): + """ + if a new class is added the model should re-train + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": ['mode_confirm','purpose_confirm'], + "dependent_var": 'replaced_mode' + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(trips) + model.predict(trips) + + # any predicted values must + # self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") From 8a35ff05c5337fa1700b9e47791ef401f25a1815 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Wed, 23 Nov 2022 12:27:51 -0800 Subject: [PATCH 02/12] GBDT with all feature types --- .../gradient_boosted_decision_tree.py | 82 ++++++++----- .../TestGradientBoostedDecisionTree.py | 111 ++++++++++++++++-- .../modellingTests/modellingTestAssets.py | 12 ++ 3 files changed, 167 insertions(+), 38 deletions(-) diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py index 646ba0a1b..cf2e33dac 100644 --- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py +++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py @@ -2,6 +2,7 @@ from tokenize import group from typing import Dict, List, Optional, Tuple +from math import radians, cos, sin, asin, sqrt import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier @@ -13,13 +14,13 @@ import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.util as util import emission.analysis.modelling.trip_model.config as eamtc +import emission.core.get_database as edb import emission.core.wrapper.confirmedtrip as ecwc class GradientBoostedDecisionTree(eamuu.TripModel): is_incremental: bool = False # overwritten during __init__ - class_map: dict = {} # overwritten during fit def __init__(self, config=None): """ @@ -71,11 +72,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): X_train, y_train = self.extract_features(trips) self.gbdt.fit(X_train, y_train) logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data") + logging.info(f"training features were {X_train.columns}") - def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: + def predict(self, trip: ecwc.Confirmedtrip) -> List[int]: logging.debug(f"running gradient boosted mode prediction") - X_train, y_train = self.extract_features(trip) - y_pred = self.gbdt.predict(X_train) + X_test, y_pred = self.extract_features(trip, is_prediction=True) + y_pred = self.gbdt.predict(X_test) if y_pred is None: logging.debug(f"unable to predict bin for trip {trip}") return [] @@ -84,39 +86,59 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]: return y_pred def to_dict(self) -> Dict: - return self.gbdt + return dict(self.gbdt) def from_dict(self, model: Dict): self.gbdt = model - def extract_features(self, trips: ecwc.Confirmedtrip) -> List[float]: - # TODO: need to enable generic paths other than just user input for features - X = pd.DataFrame( - [[trip['data']['user_input'][x] for x in self.feature_list] for trip in trips], - columns=self.feature_list - ) - y = pd.DataFrame( - [trip['data']['user_input'][self.dependent_var] for trip in trips], - columns=[self.dependent_var] - ) - # Clean up and recode the feature columns for training/prediction - X_processed, y_processed = self._process_data(X, y) - return X_processed, y_processed - - def _process_data(self, X, y): - """ - helper function to transform binned features and labels. - """ - # Any non-numeric dtype must be one-hot encoded (if unordered) or numerically coded (if ordered) + def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: + # Get dataframe from json trips; fill in calculated columns + trips_df = pd.json_normalize(trips) + # distance + trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] + trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1) + # collect all features + X = trips_df[self.feature_list] + # Any object/categorical dtype features must be one-hot encoded if unordered dummies = [] for col in X: if X[col].dtype=='object': dummies.append(pd.get_dummies(X[col], prefix=col)) X = pd.concat(dummies, axis=1) - # The outcome must be a single categorical column; recode to numeric - for col in y: - cat_list = list(pd.unique(y[col])).sort() - if y[col].dtype=='object': - y[col] = pd.Categorical(y[col], ordered=True, categories=cat_list) - y[col] = y[col].cat.codes + # Only extract dependent var if fitting a new model + if is_prediction: + y = None + else: + y = trips_df[self.dependent_var].values return X, y + + # If the non-mock trips have distance calculated then this can be removed + # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points + def haversine(self, coord1, coord2): + """ + Calculate the great circle distance in kilometers between two points + on the earth (specified in decimal degrees) + """ + lon1 = coord1[0] + lat1 = coord1[1] + lon2 = coord2[0] + lat2 = coord2[1] + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + r = 3956 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units. + return c * r + + def export_demographic_table(self, uuid_list): + print("Looking up details for %s" % uuid_list) + all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"})) + for s in all_survey_results: + s["data"]["user_id"] = s["user_id"] + all_survey_results_df = pd.json_normalize([s["data"] for s in all_survey_results]) + all_survey_results_df.drop(columns=['xmlResponse', 'name', 'version', 'label'], axis=1, inplace=True) + return all_survey_results_df \ No newline at end of file diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py index f49b6c389..f1a12ea6d 100644 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -10,6 +10,7 @@ def setUp(self) -> None: logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.DEBUG) + def testSmoke(self): """ the model should fit and predict on normal data without errors @@ -34,16 +35,71 @@ def testSmoke(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": ['mode_confirm','purpose_confirm'], - "dependent_var": 'replaced_mode' + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) model.predict(trips) - def testUnseenTrainingClasses(self): + + def testUnseenFeatures(self): + """ + if the input classes for a feature change throw sklearn error + """ + train_label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + test_label_data = { + "mode_confirm": ['drive'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + train_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=train_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + test_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=test_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(train_trips) + + with self.assertRaises(ValueError): + model.predict(test_trips) + + + def testNumeric(self): """ - if a new class is added the model should re-train + the model should handle numeric and categorical variable types """ label_data = { "mode_confirm": ['walk', 'bike', 'transit'], @@ -65,12 +121,51 @@ def testUnseenTrainingClasses(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": ['mode_confirm','purpose_confirm'], - "dependent_var": 'replaced_mode' + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm', + 'distance_miles' + ], + "dependent_var": 'data.user_input.replaced_mode' } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) model.predict(trips) - # any predicted values must - # self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + + def testFull(self): + """ + the model should handle survey, trip, and user input features + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk','bike','transit'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm', + 'data.survey.age', + 'data.survey.hhinc', + 'distance_miles' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(trips) + model.predict(trips) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 879a3a2ca..8b7ab7c6a 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -187,12 +187,24 @@ def generate_mock_trips( purpose_weights=label_data.get('purpose_weights') ) trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts) + trip = add_trip_demographics(trip) result.append(trip) random.shuffle(result) return result +def add_trip_demographics(trip): + trip['data']['survey'] = {} + survey_features = { + 'hhinc':['0-24999','25000-49000','50000-99999','100000+'], + 'age':[x for x in range(0, 70)], + 'veh':['0','1','2','3','4+'] + } + for feature in survey_features: + trip['data']['survey'][feature] = random.choice(survey_features[feature]) + return trip + if __name__ == '__main__': label_data = { "mode_confirm": ['walk', 'bike', 'drive'], From 8fa921ddf456d31d968b2008594ed5a7e9cb2847 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Wed, 23 Nov 2022 13:06:38 -0800 Subject: [PATCH 03/12] Decent baseline GBDT --- .../gradient_boosted_decision_tree.py | 24 ++++++------------- .../TestGradientBoostedDecisionTree.py | 7 +++++- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py index cf2e33dac..12e350092 100644 --- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py +++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py @@ -52,9 +52,8 @@ def __init__(self, config=None): msg = f"gbdt trip model config missing expected key {k}" raise KeyError(msg) self.is_incremental = config['incremental_evaluation'] - # Use the sklearn implementation of a GBDT + # use the sklearn implementation of a GBDT self.gbdt = GradientBoostingClassifier(n_estimators=50) - # Which features to use in the fit/prediction self.feature_list = config['feature_list'] self.dependent_var = config['dependent_var'] @@ -86,20 +85,20 @@ def predict(self, trip: ecwc.Confirmedtrip) -> List[int]: return y_pred def to_dict(self) -> Dict: - return dict(self.gbdt) + return self.gbdt.get_params() def from_dict(self, model: Dict): - self.gbdt = model + self.gbdt.set_params(model) def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: - # Get dataframe from json trips; fill in calculated columns + # get dataframe from json trips; fill in calculated columns trips_df = pd.json_normalize(trips) # distance trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1) # collect all features X = trips_df[self.feature_list] - # Any object/categorical dtype features must be one-hot encoded if unordered + # any object/categorical dtype features must be one-hot encoded if unordered dummies = [] for col in X: if X[col].dtype=='object': @@ -112,7 +111,7 @@ def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> Li y = trips_df[self.dependent_var].values return X, y - # If the non-mock trips have distance calculated then this can be removed + # if the non-mock trips have distance calculated then this can be removed # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points def haversine(self, coord1, coord2): """ @@ -131,14 +130,5 @@ def haversine(self, coord1, coord2): dlat = lat2 - lat1 a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 c = 2 * asin(sqrt(a)) - r = 3956 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units. + r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. return c * r - - def export_demographic_table(self, uuid_list): - print("Looking up details for %s" % uuid_list) - all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"})) - for s in all_survey_results: - s["data"]["user_id"] = s["user_id"] - all_survey_results_df = pd.json_normalize([s["data"] for s in all_survey_results]) - all_survey_results_df.drop(columns=['xmlResponse', 'name', 'version', 'label'], axis=1, inplace=True) - return all_survey_results_df \ No newline at end of file diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py index f1a12ea6d..b506307f6 100644 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -2,6 +2,7 @@ import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg import emission.tests.modellingTests.modellingTestAssets as etmm import logging +import pandas as pd class TestGradientBoostedDecisionTree(unittest.TestCase): @@ -168,4 +169,8 @@ def testFull(self): } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) - model.predict(trips) + y = model.predict(trips) + + # No class in predictions that's not in training data + for predicted_class in pd.unique(y): + self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) From b99c49929b7e619e536f6de1e267323b3f8a443a Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Wed, 23 Nov 2022 13:42:03 -0800 Subject: [PATCH 04/12] Add basic SVM --- .../trip_model/support_vector_machine.py | 132 +++++++++++++ .../TestSupportVectorMachine.py | 176 ++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100644 emission/analysis/modelling/trip_model/support_vector_machine.py create mode 100644 emission/tests/modellingTests/TestSupportVectorMachine.py diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py new file mode 100644 index 000000000..db7c79ea3 --- /dev/null +++ b/emission/analysis/modelling/trip_model/support_vector_machine.py @@ -0,0 +1,132 @@ +import logging +from tokenize import group +from typing import Dict, List, Optional, Tuple + +from math import radians, cos, sin, asin, sqrt +import numpy as np +import pandas as pd +from sklearn.svm import SVC + +import emission.storage.timeseries.abstract_timeseries as esta +import emission.analysis.modelling.tour_model.label_processing as lp +import emission.analysis.modelling.trip_model.trip_model as eamuu +import emission.analysis.modelling.trip_model.util as util +import emission.analysis.modelling.trip_model.config as eamtc +import emission.core.get_database as edb +import emission.core.wrapper.confirmedtrip as ecwc + + +class SupportVectorMachine(eamuu.TripModel): + + is_incremental: bool = False # overwritten during __init__ + + def __init__(self, config=None): + """ + Instantiate a support vector machine for all users. + + This uses the sklearn implementation of a support vector machine + to classify unlabeled replacement modes. + + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html + + Replacement modes are considered to be the second-best choice for + a given trip (i.e., what mode would have been chosen if the actual + choice wasn't available). + + The model is currently trained on data from all users. + """ + if config is None: + config = eamtc.get_config_value_or_raise('model_parameters.svm') + logging.debug(f'SupportVectorMachine loaded model config from file') + else: + logging.debug(f'SupportVectorMachine using model config argument') + expected_keys = [ + 'incremental_evaluation', + 'feature_list', + 'dependent_var' + ] + for k in expected_keys: + if config.get(k) is None: + msg = f"svm trip model config missing expected key {k}" + raise KeyError(msg) + self.is_incremental = config['incremental_evaluation'] + # use the sklearn implementation of a svm + self.svm = SVC() + self.feature_list = config['feature_list'] + self.dependent_var = config['dependent_var'] + + def fit(self, trips: List[ecwc.Confirmedtrip]): + """train the model by passing data, where each row in the data + corresponds to a label at the matching index of the label input + + :param trips: 2D array of features to train from + """ + logging.debug(f'fit called with {len(trips)} trips') + unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) + if len(unlabeled) > 0: + msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' + raise Exception(msg) + X_train, y_train = self.extract_features(trips) + self.svm.fit(X_train, y_train) + logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data") + logging.info(f"training features were {X_train.columns}") + + def predict(self, trip: ecwc.Confirmedtrip) -> List[int]: + logging.debug(f"running support vector mode prediction") + X_test, y_pred = self.extract_features(trip, is_prediction=True) + y_pred = self.svm.predict(X_test) + if y_pred is None: + logging.debug(f"unable to predict bin for trip {trip}") + return [] + else: + logging.debug(f"made predictions {y_pred}") + return y_pred + + def to_dict(self) -> Dict: + return self.svm.get_params() + + def from_dict(self, model: Dict): + self.svm.set_params(model) + + def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: + # get dataframe from json trips; fill in calculated columns + trips_df = pd.json_normalize(trips) + # distance + trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] + trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1) + # collect all features + X = trips_df[self.feature_list] + # any object/categorical dtype features must be one-hot encoded if unordered + dummies = [] + for col in X: + if X[col].dtype=='object': + dummies.append(pd.get_dummies(X[col], prefix=col)) + X = pd.concat(dummies, axis=1) + # Only extract dependent var if fitting a new model + if is_prediction: + y = None + else: + y = trips_df[self.dependent_var].values + return X, y + + # if the non-mock trips have distance calculated then this can be removed + # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points + def haversine(self, coord1, coord2): + """ + Calculate the great circle distance in kilometers between two points + on the earth (specified in decimal degrees) + """ + lon1 = coord1[0] + lat1 = coord1[1] + lon2 = coord2[0] + lat2 = coord2[1] + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. + return c * r diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py new file mode 100644 index 000000000..eff98d939 --- /dev/null +++ b/emission/tests/modellingTests/TestSupportVectorMachine.py @@ -0,0 +1,176 @@ +import unittest +import emission.analysis.modelling.trip_model.support_vector_machine as eamts +import emission.tests.modellingTests.modellingTestAssets as etmm +import logging +import pandas as pd + + +class TestSupportVectorMachine(unittest.TestCase): + + def setUp(self) -> None: + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + + def testSmoke(self): + """ + the model should fit and predict on normal data without errors + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + model.fit(trips) + model.predict(trips) + + + def testUnseenFeatures(self): + """ + if the input classes for a feature change throw sklearn error + """ + train_label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + test_label_data = { + "mode_confirm": ['drive'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + train_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=train_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + test_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=test_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + model.fit(train_trips) + + with self.assertRaises(ValueError): + model.predict(test_trips) + + + def testNumeric(self): + """ + the model should handle numeric and categorical variable types + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm', + 'distance_miles' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + model.fit(trips) + model.predict(trips) + + + def testFull(self): + """ + the model should handle survey, trip, and user input features + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk','bike','transit'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm', + 'data.survey.age', + 'data.survey.hhinc', + 'distance_miles' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + model.fit(trips) + y = model.predict(trips) + + # No class in predictions that's not in training data + for predicted_class in pd.unique(y): + self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) From f3c7160a53cfb2faade6b456bab3686e246b725a Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Wed, 23 Nov 2022 14:55:03 -0800 Subject: [PATCH 05/12] Clean up some shared code --- .../gradient_boosted_decision_tree.py | 54 +-------------- .../trip_model/support_vector_machine.py | 50 +------------- .../analysis/modelling/trip_model/util.py | 65 +++++++++++++++++++ 3 files changed, 70 insertions(+), 99 deletions(-) diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py index 12e350092..ec354d5e4 100644 --- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py +++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py @@ -2,19 +2,11 @@ from tokenize import group from typing import Dict, List, Optional, Tuple -from math import radians, cos, sin, asin, sqrt -import numpy as np -import pandas as pd -from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier -import sklearn.metrics as sm -import emission.storage.timeseries.abstract_timeseries as esta -import emission.analysis.modelling.tour_model.label_processing as lp import emission.analysis.modelling.trip_model.trip_model as eamuu -import emission.analysis.modelling.trip_model.util as util +import emission.analysis.modelling.trip_model.util as eamtu import emission.analysis.modelling.trip_model.config as eamtc -import emission.core.get_database as edb import emission.core.wrapper.confirmedtrip as ecwc @@ -73,7 +65,7 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data") logging.info(f"training features were {X_train.columns}") - def predict(self, trip: ecwc.Confirmedtrip) -> List[int]: + def predict(self, trip: ecwc.Confirmedtrip) -> List: logging.debug(f"running gradient boosted mode prediction") X_test, y_pred = self.extract_features(trip, is_prediction=True) y_pred = self.gbdt.predict(X_test) @@ -91,44 +83,4 @@ def from_dict(self, model: Dict): self.gbdt.set_params(model) def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: - # get dataframe from json trips; fill in calculated columns - trips_df = pd.json_normalize(trips) - # distance - trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] - trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1) - # collect all features - X = trips_df[self.feature_list] - # any object/categorical dtype features must be one-hot encoded if unordered - dummies = [] - for col in X: - if X[col].dtype=='object': - dummies.append(pd.get_dummies(X[col], prefix=col)) - X = pd.concat(dummies, axis=1) - # Only extract dependent var if fitting a new model - if is_prediction: - y = None - else: - y = trips_df[self.dependent_var].values - return X, y - - # if the non-mock trips have distance calculated then this can be removed - # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points - def haversine(self, coord1, coord2): - """ - Calculate the great circle distance in kilometers between two points - on the earth (specified in decimal degrees) - """ - lon1 = coord1[0] - lat1 = coord1[1] - lon2 = coord2[0] - lat2 = coord2[1] - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - - # haversine formula - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 - c = 2 * asin(sqrt(a)) - r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. - return c * r + return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips) diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py index db7c79ea3..6ef311ea5 100644 --- a/emission/analysis/modelling/trip_model/support_vector_machine.py +++ b/emission/analysis/modelling/trip_model/support_vector_machine.py @@ -2,18 +2,12 @@ from tokenize import group from typing import Dict, List, Optional, Tuple -from math import radians, cos, sin, asin, sqrt -import numpy as np -import pandas as pd from sklearn.svm import SVC -import emission.storage.timeseries.abstract_timeseries as esta -import emission.analysis.modelling.tour_model.label_processing as lp import emission.analysis.modelling.trip_model.trip_model as eamuu -import emission.analysis.modelling.trip_model.util as util import emission.analysis.modelling.trip_model.config as eamtc -import emission.core.get_database as edb import emission.core.wrapper.confirmedtrip as ecwc +import emission.analysis.modelling.trip_model.util as eamtu class SupportVectorMachine(eamuu.TripModel): @@ -89,44 +83,4 @@ def from_dict(self, model: Dict): self.svm.set_params(model) def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: - # get dataframe from json trips; fill in calculated columns - trips_df = pd.json_normalize(trips) - # distance - trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] - trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1) - # collect all features - X = trips_df[self.feature_list] - # any object/categorical dtype features must be one-hot encoded if unordered - dummies = [] - for col in X: - if X[col].dtype=='object': - dummies.append(pd.get_dummies(X[col], prefix=col)) - X = pd.concat(dummies, axis=1) - # Only extract dependent var if fitting a new model - if is_prediction: - y = None - else: - y = trips_df[self.dependent_var].values - return X, y - - # if the non-mock trips have distance calculated then this can be removed - # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points - def haversine(self, coord1, coord2): - """ - Calculate the great circle distance in kilometers between two points - on the earth (specified in decimal degrees) - """ - lon1 = coord1[0] - lat1 = coord1[1] - lon2 = coord2[0] - lat2 = coord2[1] - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - - # haversine formula - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 - c = 2 * asin(sqrt(a)) - r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. - return c * r + return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 7d22b5d22..d70a8c625 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -1,7 +1,49 @@ from typing import List, Tuple from past.utils import old_div +from math import radians, cos, sin, asin, sqrt import numpy from numpy.linalg import norm +import pandas as pd + +import emission.core.wrapper.confirmedtrip as ecwc + + +def get_replacement_mode_features(feature_list, dependent_var, is_prediction, trips: ecwc.Confirmedtrip) -> List[float]: + """extract the features needed to perform replacement mode modeling from a set of + trips. + + recodes variables that are categorical, and (TODO: scales numeric variables 0-1). + + :param feature_list: features to gather from each trip + :type feature_list: List[string] + :param dependent_var: the feature to predict for each trip + :type dependent_var: string + :param is_prediction: whether or not to extract the dependent var + :type is_prediction: bool + :param trips: all trips to extract features from + :type trips: List[ecwc.Confirmedtrip] + :return: the training X features and y for the replacement mode model + :rtype: Tuple[List[List[float]], List[]] + """ + # get dataframe from json trips; fill in calculated columns + trips_df = pd.json_normalize(trips) + # distance + trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] + trips_df['distance_miles'] = trips_coords.apply(lambda row : haversine(row[0],row[1]), axis=1) + # collect all features + X = trips_df[feature_list] + # any object/categorical dtype features must be one-hot encoded if unordered + dummies = [] + for col in X: + if X[col].dtype=='object': + dummies.append(pd.get_dummies(X[col], prefix=col)) + X = pd.concat(dummies, axis=1) + # Only extract dependent var if fitting a new model + if is_prediction: + y = None + else: + y = trips_df[dependent_var].values + return X, y def find_knee_point(values: List[float]) -> Tuple[float, int]: @@ -39,3 +81,26 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: index = i value = values[index] return [index, value] + + +# if the non-mock trips have distance calculated then this can be removed +# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points +def haversine(coord1, coord2): + """ + Calculate the great circle distance in kilometers between two points + on the earth (specified in decimal degrees) + """ + lon1 = coord1[0] + lat1 = coord1[1] + lon2 = coord2[0] + lat2 = coord2[1] + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. + return c * r \ No newline at end of file From a4cb4f3b80a1a6dad6ad3f7a68850c6a5e8d5120 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 1 Dec 2022 11:13:41 -0800 Subject: [PATCH 06/12] Make SVM incremental --- .../trip_model/support_vector_machine.py | 47 ++++++-- .../TestSupportVectorMachine.py | 113 ++++++++++++++++++ 2 files changed, 152 insertions(+), 8 deletions(-) diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py index 6ef311ea5..c2b780e6a 100644 --- a/emission/analysis/modelling/trip_model/support_vector_machine.py +++ b/emission/analysis/modelling/trip_model/support_vector_machine.py @@ -2,7 +2,8 @@ from tokenize import group from typing import Dict, List, Optional, Tuple -from sklearn.svm import SVC +import numpy as np +from sklearn.linear_model import SGDClassifier import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.config as eamtc @@ -13,15 +14,25 @@ class SupportVectorMachine(eamuu.TripModel): is_incremental: bool = False # overwritten during __init__ + incremental_classes: list = None # overwritten when fit is initialized def __init__(self, config=None): """ - Instantiate a support vector machine for all users. + Instantiate a linear support vector machine for all users. This uses the sklearn implementation of a support vector machine - to classify unlabeled replacement modes. - - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html + to classify unlabeled replacement modes. The SVM is linear, and is fit + with the more general SGDClassifier class which can accommodate online + learning: + + https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html + + For anyone looking to implement a differnt online learning model in the + future, here is a list of sklearn models that implement "partial_fit" + and would be candidates for the online learning approach implemented + here: + + https://scikit-learn.org/0.15/modules/scaling_strategies.html Replacement modes are considered to be the second-best choice for a given trip (i.e., what mode would have been chosen if the actual @@ -45,13 +56,22 @@ def __init__(self, config=None): raise KeyError(msg) self.is_incremental = config['incremental_evaluation'] # use the sklearn implementation of a svm - self.svm = SVC() + self.svm = SGDClassifier() self.feature_list = config['feature_list'] self.dependent_var = config['dependent_var'] def fit(self, trips: List[ecwc.Confirmedtrip]): """train the model by passing data, where each row in the data - corresponds to a label at the matching index of the label input + corresponds to a label at the matching index of the label input. + + If using an incremental model, the initial call to fit will store + the list of unique classes in y. If additional classes are added in the + future, the model will need to be re-initialized on data that + contains all of those classes, otherwise sklearn will throw an + error. Additionally, any categorical variables used as features + must have all classes present in the initial training data. If not, + they will throw an sklearn error due to differences in the number of + OHE variables in the data. :param trips: 2D array of features to train from """ @@ -61,7 +81,18 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' raise Exception(msg) X_train, y_train = self.extract_features(trips) - self.svm.fit(X_train, y_train) + # The first time partial_fit is called, it must include a list of the unique classes in y + if self.is_incremental and self.incremental_classes is None: + logging.debug(f'initializing incremental model fit') + self.incremental_classes = np.unique(y_train) + self.svm.partial_fit(X_train, y_train, self.incremental_classes) + # For all future partial fits, there is no need to pass the classes again + elif self.is_incremental and self.incremental_classes is not None: + logging.debug(f'updating incremental model fit') + self.svm.partial_fit(X_train, y_train) + # If not incremental, train regularly + else: + self.svm.fit(X_train, y_train) logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data") logging.info(f"training features were {X_train.columns}") diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py index eff98d939..8cc5d354e 100644 --- a/emission/tests/modellingTests/TestSupportVectorMachine.py +++ b/emission/tests/modellingTests/TestSupportVectorMachine.py @@ -174,3 +174,116 @@ def testFull(self): # No class in predictions that's not in training data for predicted_class in pd.unique(y): self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) + + + def testIncremental(self): + """ + the model should fit and predict incrementally on normal data without errors + """ + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + initial_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + additional_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n*5, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": True, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + # Start with some initialization data + model.fit(initial_trips) + # Train on additional sets of data and predict for initial data + for i in range(0, 5): + model.fit(additional_trips[i:(i+1)*n]) + model.predict(initial_trips) + + + def testUnseenClassesIncremental(self): + """ + if the input classes for a feature change throw sklearn error + """ + train_label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + test_label_data = { + "mode_confirm": ['drive'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk'] + } + # generate $n trips. + n = 20 + m = 5 + initial_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=train_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + additional_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n*5, + origin=(0, 0), + destination=(1, 1), + label_data=train_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + test_trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=test_label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": [ + 'data.user_input.mode_confirm', + 'data.user_input.purpose_confirm' + ], + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamts.SupportVectorMachine(model_config) + # Start with some initialization data + model.fit(initial_trips) + # Train on additional sets of data + for i in range(0, 5): + model.fit(additional_trips[i:(i+1)*n]) + + # If an unseen class is introduced, allow sklearn to throw error + with self.assertRaises(ValueError): + model.predict(test_trips) From 10e5c7404e565df6098d21bc16b9a448a0872771 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 1 Dec 2022 11:34:20 -0800 Subject: [PATCH 07/12] Add gbdt to model types --- emission/analysis/modelling/trip_model/model_type.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index b5e761fb0..fcb5b552f 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ b/emission/analysis/modelling/trip_model/model_type.py @@ -3,6 +3,7 @@ import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.similarity.od_similarity as eamso import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug +import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg SIMILARITY_THRESHOLD_METERS=500 @@ -11,6 +12,7 @@ class ModelType(Enum): # ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS' GREEDY_SIMILARITY_BINNING = 'GREEDY' + GRADIENT_BOOSTED_DECISION_TREE = 'GBDT' def build(self, config=None) -> eamuu.TripModel: """ @@ -25,7 +27,8 @@ def build(self, config=None) -> eamuu.TripModel: """ # Dict[ModelType, TripModel] MODELS = { - ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config) + ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config), + ModelType.GRADIENT_BOOSTED_DECISION_TREE: eamtg.GradientBoostedDecisionTree(config) } model = MODELS.get(self) if model is None: From 063d9abd37824c813d60452b69bada7b45a000b9 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 8 Dec 2022 15:11:04 -0800 Subject: [PATCH 08/12] Addressing comments, move classes to config --- conf/analysis/trip_model.conf.json.sample | 34 +++++ .../gradient_boosted_decision_tree.py | 13 +- .../analysis/modelling/trip_model/util.py | 57 +++---- .../TestGradientBoostedDecisionTree.py | 143 +++++++++++++++--- .../modellingTests/modellingTestAssets.py | 2 + 5 files changed, 184 insertions(+), 65 deletions(-) diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample index 845e67a6a..ac9208eaf 100644 --- a/conf/analysis/trip_model.conf.json.sample +++ b/conf/analysis/trip_model.conf.json.sample @@ -8,6 +8,40 @@ "similarity_threshold_meters": 500, "apply_cutoff": false, "incremental_evaluation": false + }, + "gbdt": { + "feature_list": { + "data.user_input.mode_confirm": [ + "drive", + "bike", + "walk", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": "data.user_input.replaced_mode", + "incremental_evaluation": false + }, + "svm": { + "feature_list": { + "data.user_input.mode_confirm": [ + "drive", + "bike", + "walk", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": "data.user_input.replaced_mode", + "incremental_evaluation": true } } } \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py index ec354d5e4..5b2968318 100644 --- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py +++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py @@ -2,7 +2,7 @@ from tokenize import group from typing import Dict, List, Optional, Tuple -from sklearn.ensemble import GradientBoostingClassifier +import sklearn.ensemble as ske import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.util as eamtu @@ -25,7 +25,8 @@ def __init__(self, config=None): Replacement modes are considered to be the second-best choice for a given trip (i.e., what mode would have been chosen if the actual - choice wasn't available). + choice wasn't available). These labels are gathered from the user + along with the chosen mode and trip purpose after the trip takes place. The model is currently trained on data from all users. """ @@ -45,7 +46,7 @@ def __init__(self, config=None): raise KeyError(msg) self.is_incremental = config['incremental_evaluation'] # use the sklearn implementation of a GBDT - self.gbdt = GradientBoostingClassifier(n_estimators=50) + self.gbdt = ske.GradientBoostingClassifier(n_estimators=50) self.feature_list = config['feature_list'] self.dependent_var = config['dependent_var'] @@ -65,12 +66,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data") logging.info(f"training features were {X_train.columns}") - def predict(self, trip: ecwc.Confirmedtrip) -> List: + def predict(self, trip: ecwc.Confirmedtrip) -> List[str]: logging.debug(f"running gradient boosted mode prediction") X_test, y_pred = self.extract_features(trip, is_prediction=True) y_pred = self.gbdt.predict(X_test) if y_pred is None: - logging.debug(f"unable to predict bin for trip {trip}") + logging.debug(f"unable to predict mode for trip {trip}") return [] else: logging.debug(f"made predictions {y_pred}") @@ -83,4 +84,4 @@ def from_dict(self, model: Dict): self.gbdt.set_params(model) def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: - return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips) + return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, trips, is_prediction) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index d70a8c625..b42e88bef 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -8,7 +8,7 @@ import emission.core.wrapper.confirmedtrip as ecwc -def get_replacement_mode_features(feature_list, dependent_var, is_prediction, trips: ecwc.Confirmedtrip) -> List[float]: +def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: """extract the features needed to perform replacement mode modeling from a set of trips. @@ -25,24 +25,34 @@ def get_replacement_mode_features(feature_list, dependent_var, is_prediction, tr :return: the training X features and y for the replacement mode model :rtype: Tuple[List[List[float]], List[]] """ - # get dataframe from json trips; fill in calculated columns + # get dataframe from json trips trips_df = pd.json_normalize(trips) - # distance - trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']] - trips_df['distance_miles'] = trips_coords.apply(lambda row : haversine(row[0],row[1]), axis=1) - # collect all features - X = trips_df[feature_list] - # any object/categorical dtype features must be one-hot encoded if unordered + X = trips_df[list(feature_list.keys())] + # any features that are strings must be encoded as numeric variables + # we use one-hot encoding for categorical variables + # https://pbpython.com/pandas_dtypes.html dummies = [] + numeric = [] for col in X: + # object column == string or mixed variable if X[col].dtype=='object': - dummies.append(pd.get_dummies(X[col], prefix=col)) - X = pd.concat(dummies, axis=1) - # Only extract dependent var if fitting a new model + cat_col = pd.Categorical(X[col], categories=feature_list[col]) + # If new features are present in X_test, throw value error + if cat_col.isnull().any(): + raise ValueError(f"Cannot predict on unseen classes in: {col}") + dummies.append(pd.get_dummies(cat_col, prefix=col)) + else: + numeric.append(X[col]) + numeric.extend(dummies) + X = pd.concat(numeric, axis=1) + # only extract dependent var if fitting a new model + # for the dependent variable of a classification model, sklearn will accept strings + # so no need to recode these to numeric and deal with complications of storing labels + # and decoding them later if is_prediction: y = None else: - y = trips_df[dependent_var].values + y = trips_df[dependent_var] return X, y @@ -81,26 +91,3 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: index = i value = values[index] return [index, value] - - -# if the non-mock trips have distance calculated then this can be removed -# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points -def haversine(coord1, coord2): - """ - Calculate the great circle distance in kilometers between two points - on the earth (specified in decimal degrees) - """ - lon1 = coord1[0] - lat1 = coord1[1] - lon2 = coord2[0] - lat2 = coord2[1] - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - - # haversine formula - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 - c = 2 * asin(sqrt(a)) - r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units. - return c * r \ No newline at end of file diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py index b506307f6..864d3bf29 100644 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -3,6 +3,7 @@ import emission.tests.modellingTests.modellingTestAssets as etmm import logging import pandas as pd +import random class TestGradientBoostedDecisionTree(unittest.TestCase): @@ -36,11 +37,19 @@ def testSmoke(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": "data.user_input.replaced_mode" } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) @@ -50,6 +59,7 @@ def testSmoke(self): def testUnseenFeatures(self): """ if the input classes for a feature change throw sklearn error + the test mode_confirm includes 'drive' which is not in the training set nor config """ train_label_data = { "mode_confirm": ['walk', 'bike', 'transit'], @@ -85,17 +95,25 @@ def testUnseenFeatures(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], + "feature_list": { + "data.user_input.mode_confirm": [ + 'walk', + 'bike', + 'transit' + ], + "data.user_input.purpose_confirm": [ + 'work', + 'home', + 'school' + ] + }, "dependent_var": 'data.user_input.replaced_mode' } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(train_trips) with self.assertRaises(ValueError): - model.predict(test_trips) + y = model.predict(test_trips) def testNumeric(self): @@ -122,16 +140,27 @@ def testNumeric(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm', - 'distance_miles' - ], + "feature_list": { + "data.user_input.mode_confirm": [ + 'walk', + 'bike', + 'transit' + ], + "data.user_input.purpose_confirm": [ + 'work', + 'home', + 'school' + ], + "data.distance": None + }, "dependent_var": 'data.user_input.replaced_mode' } model = eamtg.GradientBoostedDecisionTree(model_config) - model.fit(trips) - model.predict(trips) + X_train, y_train = model.extract_features(trips) + # 3 features for mode confirm, 3 for trip purpose, 1 for distance + self.assertEqual(len(X_train.columns), 7) + # all feature columns should be strictly numeric + self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all()) def testFull(self): @@ -158,13 +187,26 @@ def testFull(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm', - 'data.survey.age', - 'data.survey.hhinc', - 'distance_miles' - ], + "feature_list": { + "data.user_input.mode_confirm": [ + 'walk', + 'bike', + 'transit' + ], + "data.user_input.purpose_confirm": [ + 'work', + 'home', + 'school' + ], + "data.distance": None, + "data.survey.age": None, + "data.survey.hhinc": [ + '0-24999', + '25000-49000', + '50000-99999', + '100000+' + ] + }, "dependent_var": 'data.user_input.replaced_mode' } model = eamtg.GradientBoostedDecisionTree(model_config) @@ -174,3 +216,56 @@ def testFull(self): # No class in predictions that's not in training data for predicted_class in pd.unique(y): self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) + + + def testPredictions(self): + """ + with a fixed seed, the model should make consistent predictions + """ + random.seed(42) + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk','bike','transit'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": { + "data.user_input.mode_confirm": [ + 'walk', + 'bike', + 'transit' + ], + "data.user_input.purpose_confirm": [ + 'work', + 'home', + 'school' + ] + }, + "dependent_var": 'data.user_input.replaced_mode' + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(trips) + y = model.predict(trips) + + # Test that predicted == expected + expected_result = [ + 'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit', + 'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive', + 'transit', 'transit', 'walk', 'walk' + ] + for i, prediction in enumerate(y): + self.assertEqual(prediction, expected_result[i]) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 8b7ab7c6a..c3cef0f42 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -2,6 +2,7 @@ from typing import Optional, Tuple, List, Dict from uuid import UUID import emission.analysis.modelling.trip_model.trip_model as eamtm +import emission.core.common as ecc import emission.core.wrapper.confirmedtrip as ecwc import emission.core.wrapper.entry as ecwe @@ -120,6 +121,7 @@ def build_mock_trip( "type": "Point", "coordinates": destination }, + "distance": ecc.calDistance(origin, destination), "user_input": labels } From 2e1822620911b9a0fe671317924b4ec62a7c1f26 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 8 Dec 2022 19:43:12 -0800 Subject: [PATCH 09/12] Incremental SVM, more testing, add classes to configs --- conf/analysis/trip_model.conf.json.sample | 20 +- .../trip_model/support_vector_machine.py | 47 ++-- .../analysis/modelling/trip_model/util.py | 6 +- .../TestGradientBoostedDecisionTree.py | 53 +++- .../TestSupportVectorMachine.py | 247 +++++++++++++++--- 5 files changed, 302 insertions(+), 71 deletions(-) diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample index ac9208eaf..b18da6a50 100644 --- a/conf/analysis/trip_model.conf.json.sample +++ b/conf/analysis/trip_model.conf.json.sample @@ -23,7 +23,15 @@ "school" ] }, - "dependent_var": "data.user_input.replaced_mode", + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + }, "incremental_evaluation": false }, "svm": { @@ -40,7 +48,15 @@ "school" ] }, - "dependent_var": "data.user_input.replaced_mode", + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + }, "incremental_evaluation": true } } diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py index c2b780e6a..f615a66c7 100644 --- a/emission/analysis/modelling/trip_model/support_vector_machine.py +++ b/emission/analysis/modelling/trip_model/support_vector_machine.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple import numpy as np -from sklearn.linear_model import SGDClassifier +import sklearn as ske import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.config as eamtc @@ -14,7 +14,7 @@ class SupportVectorMachine(eamuu.TripModel): is_incremental: bool = False # overwritten during __init__ - incremental_classes: list = None # overwritten when fit is initialized + is_initialized: bool = False # overwritten during first fit() def __init__(self, config=None): """ @@ -36,7 +36,8 @@ def __init__(self, config=None): Replacement modes are considered to be the second-best choice for a given trip (i.e., what mode would have been chosen if the actual - choice wasn't available). + choice wasn't available). These labels are gathered from the user + along with the chosen mode and trip purpose after the trip takes place. The model is currently trained on data from all users. """ @@ -56,7 +57,7 @@ def __init__(self, config=None): raise KeyError(msg) self.is_incremental = config['incremental_evaluation'] # use the sklearn implementation of a svm - self.svm = SGDClassifier() + self.svm = ske.linear_model.SGDClassifier() self.feature_list = config['feature_list'] self.dependent_var = config['dependent_var'] @@ -65,13 +66,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): corresponds to a label at the matching index of the label input. If using an incremental model, the initial call to fit will store - the list of unique classes in y. If additional classes are added in the - future, the model will need to be re-initialized on data that - contains all of those classes, otherwise sklearn will throw an - error. Additionally, any categorical variables used as features - must have all classes present in the initial training data. If not, - they will throw an sklearn error due to differences in the number of - OHE variables in the data. + the list of unique classes in y. The config file is used to store + a lookup for known classes for each categorical feature. This prevents + the need to store a lookup in the model itself, which must be updated + every time the model sees a new class or feature OR when it is given + an incremental training request that does not contain every feature class + etc. :param trips: 2D array of features to train from """ @@ -81,27 +81,30 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' raise Exception(msg) X_train, y_train = self.extract_features(trips) - # The first time partial_fit is called, it must include a list of the unique classes in y - if self.is_incremental and self.incremental_classes is None: + # the first time partial_fit is called, the incremental classes are initialized to the unique y values + if self.is_incremental and not self.is_initialized: logging.debug(f'initializing incremental model fit') - self.incremental_classes = np.unique(y_train) - self.svm.partial_fit(X_train, y_train, self.incremental_classes) - # For all future partial fits, there is no need to pass the classes again - elif self.is_incremental and self.incremental_classes is not None: + self.svm.partial_fit(X_train, y_train, self.dependent_var['classes']) + self.is_initialized = True + # for all future partial fits, there is no need to pass the classes again + elif self.is_incremental and self.is_initialized: logging.debug(f'updating incremental model fit') - self.svm.partial_fit(X_train, y_train) - # If not incremental, train regularly + try: + self.svm.partial_fit(X_train, y_train) + except ValueError: + raise ValueError("Error in incremental fit: Likely an unseen feature or dependent class was found") + # if not incremental, just train regularly else: self.svm.fit(X_train, y_train) logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data") logging.info(f"training features were {X_train.columns}") - def predict(self, trip: ecwc.Confirmedtrip) -> List[int]: + def predict(self, trip: ecwc.Confirmedtrip) -> List[str]: logging.debug(f"running support vector mode prediction") X_test, y_pred = self.extract_features(trip, is_prediction=True) y_pred = self.svm.predict(X_test) if y_pred is None: - logging.debug(f"unable to predict bin for trip {trip}") + logging.debug(f"unable to predict mode for trip {trip}") return [] else: logging.debug(f"made predictions {y_pred}") @@ -114,4 +117,4 @@ def from_dict(self, model: Dict): self.svm.set_params(model) def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: - return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips) + return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, trips, is_prediction) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index b42e88bef..996dec1bf 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -12,7 +12,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi """extract the features needed to perform replacement mode modeling from a set of trips. - recodes variables that are categorical, and (TODO: scales numeric variables 0-1). + recodes variables that are categorical. :param feature_list: features to gather from each trip :type feature_list: List[string] @@ -27,7 +27,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi """ # get dataframe from json trips trips_df = pd.json_normalize(trips) - X = trips_df[list(feature_list.keys())] + X = trips_df[feature_list.keys()] # any features that are strings must be encoded as numeric variables # we use one-hot encoding for categorical variables # https://pbpython.com/pandas_dtypes.html @@ -52,7 +52,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi if is_prediction: y = None else: - y = trips_df[dependent_var] + y = trips_df[dependent_var['name']] return X, y diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py index 864d3bf29..9b928fbcc 100644 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -49,7 +49,15 @@ def testSmoke(self): "school" ] }, - "dependent_var": "data.user_input.replaced_mode" + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) @@ -107,7 +115,15 @@ def testUnseenFeatures(self): 'school' ] }, - "dependent_var": 'data.user_input.replaced_mode' + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(train_trips) @@ -153,7 +169,15 @@ def testNumeric(self): ], "data.distance": None }, - "dependent_var": 'data.user_input.replaced_mode' + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamtg.GradientBoostedDecisionTree(model_config) X_train, y_train = model.extract_features(trips) @@ -207,7 +231,15 @@ def testFull(self): '100000+' ] }, - "dependent_var": 'data.user_input.replaced_mode' + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) @@ -215,7 +247,8 @@ def testFull(self): # No class in predictions that's not in training data for predicted_class in pd.unique(y): - self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) + + self.assertIn(predicted_class, model_config['dependent_var']['classes']) def testPredictions(self): @@ -255,7 +288,15 @@ def testPredictions(self): 'school' ] }, - "dependent_var": 'data.user_input.replaced_mode' + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamtg.GradientBoostedDecisionTree(model_config) model.fit(trips) diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py index 8cc5d354e..96df61a64 100644 --- a/emission/tests/modellingTests/TestSupportVectorMachine.py +++ b/emission/tests/modellingTests/TestSupportVectorMachine.py @@ -3,6 +3,7 @@ import emission.tests.modellingTests.modellingTestAssets as etmm import logging import pandas as pd +import random class TestSupportVectorMachine(unittest.TestCase): @@ -36,11 +37,27 @@ def testSmoke(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) model.fit(trips) @@ -85,11 +102,27 @@ def testUnseenFeatures(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) model.fit(train_trips) @@ -122,16 +155,35 @@ def testNumeric(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm', - 'distance_miles' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ], + "data.distance": None + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) - model.fit(trips) - model.predict(trips) + X_train, y_train = model.extract_features(trips) + # 3 features for mode confirm, 3 for trip purpose, 1 for distance + self.assertEqual(len(X_train.columns), 7) + # all feature columns should be strictly numeric + self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all()) def testFull(self): @@ -158,14 +210,35 @@ def testFull(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm', - 'data.survey.age', - 'data.survey.hhinc', - 'distance_miles' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ], + "data.survey.age": None, + "data.survey.hhinc": [ + '0-24999', + '25000-49000', + '50000-99999', + '100000+' + ], + "data.distance": None + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) model.fit(trips) @@ -173,7 +246,7 @@ def testFull(self): # No class in predictions that's not in training data for predicted_class in pd.unique(y): - self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']])) + self.assertIn(predicted_class, model_config['dependent_var']['classes']) def testIncremental(self): @@ -209,11 +282,27 @@ def testIncremental(self): # pass in a test configuration model_config = { "incremental_evaluation": True, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) # Start with some initialization data @@ -271,11 +360,27 @@ def testUnseenClassesIncremental(self): # pass in a test configuration model_config = { "incremental_evaluation": False, - "feature_list": [ - 'data.user_input.mode_confirm', - 'data.user_input.purpose_confirm' - ], - "dependent_var": 'data.user_input.replaced_mode' + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } } model = eamts.SupportVectorMachine(model_config) # Start with some initialization data @@ -286,4 +391,70 @@ def testUnseenClassesIncremental(self): # If an unseen class is introduced, allow sklearn to throw error with self.assertRaises(ValueError): - model.predict(test_trips) + model.predict(test_trips) + + + def testPredictions(self): + """ + with a fixed seed, the model should make consistent predictions + """ + random.seed(42) + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive','walk','bike','transit'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": { + "data.user_input.mode_confirm": [ + "walk", + "bike", + "transit" + ], + "data.user_input.purpose_confirm": [ + "work", + "home", + "school" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "drive", + "walk", + "bike", + "transit" + ] + } + } + model = eamts.SupportVectorMachine(model_config) + # there is a separate random number generator in SGDClassifier that + # must be fixed to get consistent predictions + model.svm.random_state = (3) + model.fit(trips) + y = model.predict(trips) + + # Test that predicted == expected + # note that it seems with a small dataset the svm tends to predict a single category + expected_result = [ + 'transit', 'transit', 'bike', 'transit', 'transit', 'bike', 'transit', 'transit', + 'transit', 'transit', 'bike', 'transit', 'transit', 'transit', 'transit', + 'transit', 'transit', 'transit', 'bike', 'bike' + ] + print(y) + for i, prediction in enumerate(y): + self.assertEqual(prediction, expected_result[i]) From d194d2b94fa6c53d475f1b565190b33a18b8551b Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Wed, 21 Dec 2022 17:45:37 -0800 Subject: [PATCH 10/12] Demographic data format in replacement modeling --- .../analysis/modelling/trip_model/util.py | 34 ++++- .../TestGradientBoostedDecisionTree.py | 142 ++++++++++-------- .../TestSupportVectorMachine.py | 27 +++- .../modellingTests/modellingTestAssets.py | 32 ++-- 4 files changed, 152 insertions(+), 83 deletions(-) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 996dec1bf..97bc4e5fe 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -8,6 +8,19 @@ import emission.core.wrapper.confirmedtrip as ecwc +def get_survey_df(trips_df, survey_features, response_ids): + survey_data = [] + for feature in survey_features: + feature_data = [] + for i, response_id in enumerate(response_ids): + feature_string = feature.split(".") + feature_string.insert(2, response_id) + feature_string = ".".join(feature_string) + feature_data.append(trips_df.iloc[i,][feature_string]) + survey_data.append(feature_data) + return pd.DataFrame(numpy.column_stack(survey_data), columns=survey_features) + + def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: """extract the features needed to perform replacement mode modeling from a set of trips. @@ -27,7 +40,24 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi """ # get dataframe from json trips trips_df = pd.json_normalize(trips) - X = trips_df[feature_list.keys()] + # any features that are part of the demographic survey require special attention + # the first nested value of the survey data responses changes depending on the user/response + feature_names = list(feature_list.keys()) + survey_features = [] + nonsurvey_features = [] + for x in feature_names: + if 'jsonDocResponse' in x: + survey_features.append(x) + else: + nonsurvey_features.append(x) + # make sure no features are being lost during separation + assert(len(survey_features) + len(nonsurvey_features) == len(feature_names)) + # need unique response id for every trip to identify survey features in the trip dataframe (key below jsonDocResponse) + if len(survey_features) > 0: + response_ids = [list(trip['data']['jsonDocResponse'].keys())[0] for trip in trips] + X = pd.concat([trips_df[nonsurvey_features], get_survey_df(trips_df, survey_features, response_ids)], axis=1) + else: + X = trips_df[nonsurvey_features] # any features that are strings must be encoded as numeric variables # we use one-hot encoding for categorical variables # https://pbpython.com/pandas_dtypes.html @@ -37,7 +67,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi # object column == string or mixed variable if X[col].dtype=='object': cat_col = pd.Categorical(X[col], categories=feature_list[col]) - # If new features are present in X_test, throw value error + # if new features are present in X_test, throw value error if cat_col.isnull().any(): raise ValueError(f"Cannot predict on unseen classes in: {col}") dummies.append(pd.get_dummies(cat_col, prefix=col)) diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py index 9b928fbcc..c85f155bb 100644 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py @@ -34,6 +34,7 @@ def testSmoke(self): within_threshold=m, threshold=0.001, # ~ 111 meters in degrees WGS84 ) + print(trips[1]) # pass in a test configuration model_config = { "incremental_evaluation": False, @@ -196,6 +197,11 @@ def testFull(self): "purpose_confirm": ['work', 'home', 'school'], "replaced_mode": ['drive','walk','bike','transit'] } + survey_data = { + "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], + "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], + "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'] + } # generate $n trips. n = 20 m = 5 @@ -205,6 +211,7 @@ def testFull(self): origin=(0, 0), destination=(1, 1), label_data=label_data, + survey_data=survey_data, within_threshold=m, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -223,12 +230,19 @@ def testFull(self): 'school' ], "data.distance": None, - "data.survey.age": None, - "data.survey.hhinc": [ - '0-24999', - '25000-49000', - '50000-99999', - '100000+' + "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [ + '0___25_years_old', + '26___55_years_old', + '56___70_years_old' + ], + "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [ + 'not_a_student', + 'yes' + ], + "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [ + '0_to__49_999', + '_50_000_to__99_999', + '100_000_or_more' ] }, "dependent_var": { @@ -251,62 +265,62 @@ def testFull(self): self.assertIn(predicted_class, model_config['dependent_var']['classes']) - def testPredictions(self): - """ - with a fixed seed, the model should make consistent predictions - """ - random.seed(42) - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk','bike','transit'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - 'walk', - 'bike', - 'transit' - ], - "data.user_input.purpose_confirm": [ - 'work', - 'home', - 'school' - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamtg.GradientBoostedDecisionTree(model_config) - model.fit(trips) - y = model.predict(trips) + # def testPredictions(self): + # """ + # with a fixed seed, the model should make consistent predictions + # """ + # random.seed(42) + # label_data = { + # "mode_confirm": ['walk', 'bike', 'transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk','bike','transit'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": False, + # "feature_list": { + # "data.user_input.mode_confirm": [ + # 'walk', + # 'bike', + # 'transit' + # ], + # "data.user_input.purpose_confirm": [ + # 'work', + # 'home', + # 'school' + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk", + # "bike", + # "transit" + # ] + # } + # } + # model = eamtg.GradientBoostedDecisionTree(model_config) + # model.fit(trips) + # y = model.predict(trips) - # Test that predicted == expected - expected_result = [ - 'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit', - 'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive', - 'transit', 'transit', 'walk', 'walk' - ] - for i, prediction in enumerate(y): - self.assertEqual(prediction, expected_result[i]) + # # Test that predicted == expected + # expected_result = [ + # 'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit', + # 'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive', + # 'transit', 'transit', 'walk', 'walk' + # ] + # for i, prediction in enumerate(y): + # self.assertEqual(prediction, expected_result[i]) diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py index 96df61a64..1d1e9ae41 100644 --- a/emission/tests/modellingTests/TestSupportVectorMachine.py +++ b/emission/tests/modellingTests/TestSupportVectorMachine.py @@ -195,6 +195,11 @@ def testFull(self): "purpose_confirm": ['work', 'home', 'school'], "replaced_mode": ['drive','walk','bike','transit'] } + survey_data = { + "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], + "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], + "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'] + } # generate $n trips. n = 20 m = 5 @@ -204,6 +209,7 @@ def testFull(self): origin=(0, 0), destination=(1, 1), label_data=label_data, + survey_data=survey_data, within_threshold=m, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -221,14 +227,21 @@ def testFull(self): "home", "school" ], - "data.survey.age": None, - "data.survey.hhinc": [ - '0-24999', - '25000-49000', - '50000-99999', - '100000+' + "data.distance": None, + "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [ + '0___25_years_old', + '26___55_years_old', + '56___70_years_old' ], - "data.distance": None + "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [ + 'not_a_student', + 'yes' + ], + "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [ + '0_to__49_999', + '_50_000_to__99_999', + '100_000_or_more' + ] }, "dependent_var": { "name": "data.user_input.replaced_mode", diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index c3cef0f42..4fcfeff24 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -1,4 +1,5 @@ import random +import string from typing import Optional, Tuple, List, Dict from uuid import UUID import emission.analysis.modelling.trip_model.trip_model as eamtm @@ -134,6 +135,7 @@ def generate_mock_trips( origin, destination, label_data = None, + survey_data = None, within_threshold = None, start_ts: None = None, end_ts: None = None, @@ -145,7 +147,7 @@ def generate_mock_trips( within a threshold from the provided o/d pair, and some have labels. some other ones can be sampled to appear outside of the threshold of the o/d locations. - label_data is an optional dictionary with labels and sample weights, for example: + label_data and survey_data are optional dictionaries with labels and sample weights, for example: { "mode_confirm": ['walk', 'bike'], "replaced_mode": ['drive', 'tnc'], @@ -154,6 +156,14 @@ def generate_mock_trips( "replaced_mode_weights": [0.4, 0.6], "purpose_weights": [0.1, 0.9] } + { + "group_hg4zz25.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'], + "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], + "data.jsonDocResponse.group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], + "group_hg4zz25.Please_identify_which_category_weights": [0.8, 0.1, 0.1], + "group_hg4zz25.Are_you_a_student": [0.9, 0.1], + "data.jsonDocResponse.group_hg4zz25.How_old_are_you_weights": [0.4, 0.4, 0.2] + } weights entries are optional and result in uniform sampling. @@ -162,6 +172,7 @@ def generate_mock_trips( :param origin: origin coordinates :param destination: destination coordinates :param label_data: dictionary of label data, see above, defaults to None + :param survey_data: dictionary of survey data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided distance threshold in degrees WGS84, defaults to None :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01 @@ -189,22 +200,23 @@ def generate_mock_trips( purpose_weights=label_data.get('purpose_weights') ) trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts) - trip = add_trip_demographics(trip) + if survey_data is not None: + trip = add_trip_demographics(trip, survey_data) result.append(trip) random.shuffle(result) return result -def add_trip_demographics(trip): - trip['data']['survey'] = {} - survey_features = { - 'hhinc':['0-24999','25000-49000','50000-99999','100000+'], - 'age':[x for x in range(0, 70)], - 'veh':['0','1','2','3','4+'] - } +def add_trip_demographics(trip, survey_features): + response_id = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=22)) + trip['data']['jsonDocResponse'] = {response_id: {'group_yk8eb99': {}, 'group_hg4zz25': {}, 'group_pa5ah98': {}}} for feature in survey_features: - trip['data']['survey'][feature] = random.choice(survey_features[feature]) + feature_labels = feature.split(".") + feature_group = feature_labels[0] + feature_name = feature_labels[1] + feature_value = random.choice(survey_features[feature]) + trip['data']['jsonDocResponse'][response_id][feature_group].update({feature_name: feature_value}) return trip if __name__ == '__main__': From 693b8e0978e5b81d013dad8c674dacdcb9567ae6 Mon Sep 17 00:00:00 2001 From: zackAemmer Date: Thu, 22 Dec 2022 17:32:49 -0800 Subject: [PATCH 11/12] Partial switch to inferred labels for replacement mode, demographics --- .../analysis/modelling/trip_model/util.py | 39 +- .../TestGradientBoostedDecisionTree.py | 326 ------------ .../TestReplacementTripModels.py | 417 +++++++++++++++ .../TestSupportVectorMachine.py | 473 ------------------ .../modellingTests/modellingTestAssets.py | 32 +- 5 files changed, 452 insertions(+), 835 deletions(-) delete mode 100644 emission/tests/modellingTests/TestGradientBoostedDecisionTree.py create mode 100644 emission/tests/modellingTests/TestReplacementTripModels.py delete mode 100644 emission/tests/modellingTests/TestSupportVectorMachine.py diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 97bc4e5fe..31152d262 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -5,20 +5,30 @@ from numpy.linalg import norm import pandas as pd +import emission.core.get_database as edb import emission.core.wrapper.confirmedtrip as ecwc -def get_survey_df(trips_df, survey_features, response_ids): - survey_data = [] - for feature in survey_features: - feature_data = [] - for i, response_id in enumerate(response_ids): - feature_string = feature.split(".") - feature_string.insert(2, response_id) - feature_string = ".".join(feature_string) - feature_data.append(trips_df.iloc[i,][feature_string]) - survey_data.append(feature_data) - return pd.DataFrame(numpy.column_stack(survey_data), columns=survey_features) +# TODO: Currently fails to match the uuid_list to what is in the database. Needs to be split into 1 function which reads demographic data, 1 function which formats features +def get_survey_df(uuid_list, survey_features): + # we use the "survey." identifier to separate out the features which require survey attention in the config, but do not need it in actual key + survey_features_response = [".".join(x.split(".")[1:]) for x in survey_features] + # retrieve survey response records for any user that has supplied a trip which is being trained on + all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"})) + # these are the uuids that were able to be retrieved from the database; if they don't match the requested ones throw an error since we cannot train/test + survey_result_uuids = [s["user_id"] for s in all_survey_results] + # the challenge is that one of the first dictionary keys changes across users, so we cannot apply a single json_normalize and take feature values + # each unique key will end up as its own column, even if it is really the same feature as others, and will be NaN for all other users + # this is the id that changes across users, we keep a list here to index later when summarizing all of the survey response results to a single df + survey_response_ids = [list(s['data']['jsonDocResponse'].keys())[0] for s in all_survey_results] + result = [] + for i, s in enumerate(all_survey_results): + response = pd.json_normalize(s['data']['jsonDocResponse'][survey_response_ids[i]]) + response = response[survey_features_response] + response.columns = survey_features + response['user_id'] = survey_result_uuids[i] + result.append(response) + return pd.concat(result, axis=0).reset_index(inplace=True, drop=True) def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]: @@ -46,7 +56,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi survey_features = [] nonsurvey_features = [] for x in feature_names: - if 'jsonDocResponse' in x: + if 'survey' in x: survey_features.append(x) else: nonsurvey_features.append(x) @@ -54,8 +64,9 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi assert(len(survey_features) + len(nonsurvey_features) == len(feature_names)) # need unique response id for every trip to identify survey features in the trip dataframe (key below jsonDocResponse) if len(survey_features) > 0: - response_ids = [list(trip['data']['jsonDocResponse'].keys())[0] for trip in trips] - X = pd.concat([trips_df[nonsurvey_features], get_survey_df(trips_df, survey_features, response_ids)], axis=1) + uuid_list = [list(trip['user_id'] for trip in trips)] + survey_df = get_survey_df(uuid_list, survey_features) + X = pd.concat([trips_df[nonsurvey_features], survey_df], axis=1) else: X = trips_df[nonsurvey_features] # any features that are strings must be encoded as numeric variables diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py deleted file mode 100644 index c85f155bb..000000000 --- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py +++ /dev/null @@ -1,326 +0,0 @@ -import unittest -import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg -import emission.tests.modellingTests.modellingTestAssets as etmm -import logging -import pandas as pd -import random - - -class TestGradientBoostedDecisionTree(unittest.TestCase): - - def setUp(self) -> None: - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', - level=logging.DEBUG) - - - def testSmoke(self): - """ - the model should fit and predict on normal data without errors - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - print(trips[1]) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamtg.GradientBoostedDecisionTree(model_config) - model.fit(trips) - model.predict(trips) - - - def testUnseenFeatures(self): - """ - if the input classes for a feature change throw sklearn error - the test mode_confirm includes 'drive' which is not in the training set nor config - """ - train_label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - test_label_data = { - "mode_confirm": ['drive'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - train_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=train_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - test_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=test_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - 'walk', - 'bike', - 'transit' - ], - "data.user_input.purpose_confirm": [ - 'work', - 'home', - 'school' - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamtg.GradientBoostedDecisionTree(model_config) - model.fit(train_trips) - - with self.assertRaises(ValueError): - y = model.predict(test_trips) - - - def testNumeric(self): - """ - the model should handle numeric and categorical variable types - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - 'walk', - 'bike', - 'transit' - ], - "data.user_input.purpose_confirm": [ - 'work', - 'home', - 'school' - ], - "data.distance": None - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamtg.GradientBoostedDecisionTree(model_config) - X_train, y_train = model.extract_features(trips) - # 3 features for mode confirm, 3 for trip purpose, 1 for distance - self.assertEqual(len(X_train.columns), 7) - # all feature columns should be strictly numeric - self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all()) - - - def testFull(self): - """ - the model should handle survey, trip, and user input features - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk','bike','transit'] - } - survey_data = { - "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], - "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], - "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - survey_data=survey_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - 'walk', - 'bike', - 'transit' - ], - "data.user_input.purpose_confirm": [ - 'work', - 'home', - 'school' - ], - "data.distance": None, - "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [ - '0___25_years_old', - '26___55_years_old', - '56___70_years_old' - ], - "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [ - 'not_a_student', - 'yes' - ], - "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [ - '0_to__49_999', - '_50_000_to__99_999', - '100_000_or_more' - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamtg.GradientBoostedDecisionTree(model_config) - model.fit(trips) - y = model.predict(trips) - - # No class in predictions that's not in training data - for predicted_class in pd.unique(y): - - self.assertIn(predicted_class, model_config['dependent_var']['classes']) - - - # def testPredictions(self): - # """ - # with a fixed seed, the model should make consistent predictions - # """ - # random.seed(42) - # label_data = { - # "mode_confirm": ['walk', 'bike', 'transit'], - # "purpose_confirm": ['work', 'home', 'school'], - # "replaced_mode": ['drive','walk','bike','transit'] - # } - # # generate $n trips. - # n = 20 - # m = 5 - # trips = etmm.generate_mock_trips( - # user_id="joe", - # trips=n, - # origin=(0, 0), - # destination=(1, 1), - # label_data=label_data, - # within_threshold=m, - # threshold=0.001, # ~ 111 meters in degrees WGS84 - # ) - # # pass in a test configuration - # model_config = { - # "incremental_evaluation": False, - # "feature_list": { - # "data.user_input.mode_confirm": [ - # 'walk', - # 'bike', - # 'transit' - # ], - # "data.user_input.purpose_confirm": [ - # 'work', - # 'home', - # 'school' - # ] - # }, - # "dependent_var": { - # "name": "data.user_input.replaced_mode", - # "classes": [ - # "drive", - # "walk", - # "bike", - # "transit" - # ] - # } - # } - # model = eamtg.GradientBoostedDecisionTree(model_config) - # model.fit(trips) - # y = model.predict(trips) - - # # Test that predicted == expected - # expected_result = [ - # 'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit', - # 'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive', - # 'transit', 'transit', 'walk', 'walk' - # ] - # for i, prediction in enumerate(y): - # self.assertEqual(prediction, expected_result[i]) diff --git a/emission/tests/modellingTests/TestReplacementTripModels.py b/emission/tests/modellingTests/TestReplacementTripModels.py new file mode 100644 index 000000000..e4fe4b2a7 --- /dev/null +++ b/emission/tests/modellingTests/TestReplacementTripModels.py @@ -0,0 +1,417 @@ +import unittest +import emission.core.get_database as edb +import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg +import emission.analysis.modelling.trip_model.support_vector_machine as eamts +import emission.tests.modellingTests.modellingTestAssets as etmm +import logging +import pandas as pd +import random + + +class TestReplacementTripModels(unittest.TestCase): + + def setUp(self) -> None: + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + + def testSmoke(self): + """ + the model should fit and predict on normal data without errors + """ + # though we cannot use mode_confirm or purpose_confirm to predict, they are required for mock trip generation + # for now, just pass it to user-label and sensed-label data + label_data = { + "mode_confirm": ['drive'], + "replaced_mode": ['drive','walk'], + "purpose_confirm": ['walk'] + } + # generate $n trips. + n = 20 + m = 5 + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + label_data=label_data, + sensed_label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + # pass in a test configuration + model_config = { + "incremental_evaluation": False, + "feature_list": { + "data.inferred_labels.mode_confirm": [ + "walk", + "bike", + "drive" + ] + }, + "dependent_var": { + "name": "data.user_input.replaced_mode", + "classes": [ + "walk", + "bike", + "drive", + ] + } + } + model = eamtg.GradientBoostedDecisionTree(model_config) + model.fit(trips) + model.predict(trips) + model = eamts.SupportVectorMachine(model_config) + model.fit(trips) + model.predict(trips) + + +#TODO: These tests were written using a prior version of the model which did not include the sensed data. +# So sensed features were actually being read from the 'user_input' key rather than 'inferred_labels' +# The above test shows how these can be set up to read from inferred labels, however they are dependent on the +# 'add_sensed_labels()' function in modellingTestAssets.py. Once that function correctly samples the labels +# (instead of just using drive for every label) these tests should work again. The exception is 'testFull' which +# will rely on a new function which adds mock-demographic-data to the database. See todo in get_survey_df() under util.py +# for more info on that. + + # def testUnseenFeatures(self): + # """ + # if the input classes for a feature change throw sklearn error + # """ + # train_label_data = { + # "mode_confirm": ['drive'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk'] + # } + # test_label_data = { + # "mode_confirm": ['walk','transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # train_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=train_label_data, + # sensed_label_data=train_label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # test_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=test_label_data, + # sensed_label_data=test_label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": False, + # "feature_list": { + # "data.inferred_labels.mode_confirm": [ + # "drive" + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk" + # ] + # } + # } + # model = eamtg.GradientBoostedDecisionTree(model_config) + # model.fit(train_trips) + # with self.assertRaises(ValueError): + # model.predict(test_trips) + # model = eamts.SupportVectorMachine(model_config) + # model.fit(train_trips) + # with self.assertRaises(ValueError): + # model.predict(test_trips) + + + # def testFull(self): + # """ + # the model should handle survey, trip, and user input features + # """ + # label_data = { + # "mode_confirm": ['walk', 'bike', 'transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk','bike','transit'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # # for the sake of testing, need a UUID that correlates to a survey response in the database; use the first one + # all_survey_results = list(edb.get_timeseries_db().find({"metadata.key": "manual/demographic_survey"})) + # sample_uuid = all_survey_results[0]['user_id'] + # trips = etmm.generate_mock_trips( + # user_id=sample_uuid, + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # print(trips[0]) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": False, + # "feature_list": { + # "data.inferred_labels.mode_confirm": [ + # "walk", + # "bike", + # "transit" + # ], + # "data.distance": None, + # "survey.group_hg4zz25.How_old_are_you": [ + # '0___25_years_old', + # '26___55_years_old', + # '56___70_years_old' + # ], + # "survey.group_hg4zz25.Are_you_a_student": [ + # 'not_a_student', + # 'yes' + # ], + # "survey.group_pa5ah98.Please_identify_which_category": [ + # '0_to__49_999', + # '_50_000_to__99_999', + # '100_000_or_more' + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk", + # "bike", + # "transit" + # ] + # } + # } + # model = eamtg.GradientBoostedDecisionTree(model_config) + # model.fit(trips) + # y = model.predict(trips) + # # No class in predictions that's not in training data + # for predicted_class in pd.unique(y): + # self.assertIn(predicted_class, model_config['dependent_var']['classes']) + + # model = eamts.SupportVectorMachine(model_config) + # model.fit(trips) + # y = model.predict(trips) + # # No class in predictions that's not in training data + # for predicted_class in pd.unique(y): + # self.assertIn(predicted_class, model_config['dependent_var']['classes']) + + + # def testIncremental(self): + # """ + # the model should fit and predict incrementally on normal data without errors + # """ + # label_data = { + # "mode_confirm": ['walk', 'bike', 'transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # initial_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # additional_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n*5, + # origin=(0, 0), + # destination=(1, 1), + # label_data=label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": True, + # "feature_list": { + # "data.inferred_labels.mode_confirm": [ + # "walk", + # "bike", + # "transit" + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk", + # "bike", + # "transit" + # ] + # } + # } + # model = eamts.SupportVectorMachine(model_config) + # # Start with some initialization data + # model.fit(initial_trips) + # # Train on additional sets of data and predict for initial data + # for i in range(0, 5): + # model.fit(additional_trips[i:(i+1)*n]) + # model.predict(initial_trips) + + + # def testUnseenClassesIncremental(self): + # """ + # if the input classes for a feature change throw sklearn error + # """ + # train_label_data = { + # "mode_confirm": ['walk', 'bike', 'transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk'] + # } + # test_label_data = { + # "mode_confirm": ['drive'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # initial_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=train_label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # additional_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n*5, + # origin=(0, 0), + # destination=(1, 1), + # label_data=train_label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # test_trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=test_label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": False, + # "feature_list": { + # "data.inferred_labels.mode_confirm": [ + # "walk", + # "bike", + # "transit" + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk", + # "bike", + # "transit" + # ] + # } + # } + # model = eamts.SupportVectorMachine(model_config) + # # Start with some initialization data + # model.fit(initial_trips) + # # Train on additional sets of data + # for i in range(0, 5): + # model.fit(additional_trips[i:(i+1)*n]) + # # If an unseen class is introduced, allow sklearn to throw error + # with self.assertRaises(ValueError): + # model.predict(test_trips) + + + # def testPredictions(self): + # """ + # with a fixed seed, the model should make consistent predictions + # """ + # random.seed(42) + # label_data = { + # "mode_confirm": ['walk', 'bike', 'transit'], + # "purpose_confirm": ['work', 'home', 'school'], + # "replaced_mode": ['drive','walk','bike','transit'] + # } + # # generate $n trips. + # n = 20 + # m = 5 + # trips = etmm.generate_mock_trips( + # user_id="joe", + # trips=n, + # origin=(0, 0), + # destination=(1, 1), + # label_data=label_data, + # within_threshold=m, + # threshold=0.001, # ~ 111 meters in degrees WGS84 + # ) + # # pass in a test configuration + # model_config = { + # "incremental_evaluation": False, + # "feature_list": { + # "data.inferred_labels.mode_confirm": [ + # "walk", + # "bike", + # "transit" + # ] + # }, + # "dependent_var": { + # "name": "data.user_input.replaced_mode", + # "classes": [ + # "drive", + # "walk", + # "bike", + # "transit" + # ] + # } + # } + # model = eamtg.GradientBoostedDecisionTree(model_config) + # model.fit(trips) + # y = model.predict(trips) + # # Test that predicted == expected + # expected_result = [ + # 'transit', 'bike', 'transit', 'bike', 'transit', 'bike', 'drive', 'transit', + # 'transit', 'drive', 'transit', 'transit', 'bike', 'bike', 'bike', 'transit', + # 'transit', 'transit', 'bike', 'drive' + # ] + # for i, prediction in enumerate(y): + # self.assertEqual(prediction, expected_result[i]) + + # model = eamts.SupportVectorMachine(model_config) + # # there is a separate random number generator in SGDClassifier that + # # must be fixed to get consistent predictions + # model.svm.random_state = (3) + # model.fit(trips) + # y = model.predict(trips) + # # Test that predicted == expected + # # note that it seems with a small dataset the svm tends to predict a single category + # expected_result = [ + # 'drive', 'drive', 'transit', 'drive', 'drive', 'drive', 'drive', 'transit', + # 'transit', 'drive', 'drive', 'transit', 'drive', 'drive', 'drive', 'transit', + # 'drive', 'transit', 'drive', 'drive' + # ] + # for i, prediction in enumerate(y): + # self.assertEqual(prediction, expected_result[i]) diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py deleted file mode 100644 index 1d1e9ae41..000000000 --- a/emission/tests/modellingTests/TestSupportVectorMachine.py +++ /dev/null @@ -1,473 +0,0 @@ -import unittest -import emission.analysis.modelling.trip_model.support_vector_machine as eamts -import emission.tests.modellingTests.modellingTestAssets as etmm -import logging -import pandas as pd -import random - - -class TestSupportVectorMachine(unittest.TestCase): - - def setUp(self) -> None: - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', - level=logging.DEBUG) - - - def testSmoke(self): - """ - the model should fit and predict on normal data without errors - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - model.fit(trips) - model.predict(trips) - - - def testUnseenFeatures(self): - """ - if the input classes for a feature change throw sklearn error - """ - train_label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - test_label_data = { - "mode_confirm": ['drive'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - train_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=train_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - test_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=test_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - model.fit(train_trips) - - with self.assertRaises(ValueError): - model.predict(test_trips) - - - def testNumeric(self): - """ - the model should handle numeric and categorical variable types - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ], - "data.distance": None - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - X_train, y_train = model.extract_features(trips) - # 3 features for mode confirm, 3 for trip purpose, 1 for distance - self.assertEqual(len(X_train.columns), 7) - # all feature columns should be strictly numeric - self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all()) - - - def testFull(self): - """ - the model should handle survey, trip, and user input features - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk','bike','transit'] - } - survey_data = { - "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], - "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], - "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - survey_data=survey_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ], - "data.distance": None, - "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [ - '0___25_years_old', - '26___55_years_old', - '56___70_years_old' - ], - "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [ - 'not_a_student', - 'yes' - ], - "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [ - '0_to__49_999', - '_50_000_to__99_999', - '100_000_or_more' - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - model.fit(trips) - y = model.predict(trips) - - # No class in predictions that's not in training data - for predicted_class in pd.unique(y): - self.assertIn(predicted_class, model_config['dependent_var']['classes']) - - - def testIncremental(self): - """ - the model should fit and predict incrementally on normal data without errors - """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - initial_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - additional_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n*5, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": True, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - # Start with some initialization data - model.fit(initial_trips) - # Train on additional sets of data and predict for initial data - for i in range(0, 5): - model.fit(additional_trips[i:(i+1)*n]) - model.predict(initial_trips) - - - def testUnseenClassesIncremental(self): - """ - if the input classes for a feature change throw sklearn error - """ - train_label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - test_label_data = { - "mode_confirm": ['drive'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk'] - } - # generate $n trips. - n = 20 - m = 5 - initial_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=train_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - additional_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n*5, - origin=(0, 0), - destination=(1, 1), - label_data=train_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - test_trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=test_label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - # Start with some initialization data - model.fit(initial_trips) - # Train on additional sets of data - for i in range(0, 5): - model.fit(additional_trips[i:(i+1)*n]) - - # If an unseen class is introduced, allow sklearn to throw error - with self.assertRaises(ValueError): - model.predict(test_trips) - - - def testPredictions(self): - """ - with a fixed seed, the model should make consistent predictions - """ - random.seed(42) - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive','walk','bike','transit'] - } - # generate $n trips. - n = 20 - m = 5 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - # pass in a test configuration - model_config = { - "incremental_evaluation": False, - "feature_list": { - "data.user_input.mode_confirm": [ - "walk", - "bike", - "transit" - ], - "data.user_input.purpose_confirm": [ - "work", - "home", - "school" - ] - }, - "dependent_var": { - "name": "data.user_input.replaced_mode", - "classes": [ - "drive", - "walk", - "bike", - "transit" - ] - } - } - model = eamts.SupportVectorMachine(model_config) - # there is a separate random number generator in SGDClassifier that - # must be fixed to get consistent predictions - model.svm.random_state = (3) - model.fit(trips) - y = model.predict(trips) - - # Test that predicted == expected - # note that it seems with a small dataset the svm tends to predict a single category - expected_result = [ - 'transit', 'transit', 'bike', 'transit', 'transit', 'bike', 'transit', 'transit', - 'transit', 'transit', 'bike', 'transit', 'transit', 'transit', 'transit', - 'transit', 'transit', 'transit', 'bike', 'bike' - ] - print(y) - for i, prediction in enumerate(y): - self.assertEqual(prediction, expected_result[i]) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 4fcfeff24..674eb6f70 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -135,7 +135,7 @@ def generate_mock_trips( origin, destination, label_data = None, - survey_data = None, + sensed_label_data = None, within_threshold = None, start_ts: None = None, end_ts: None = None, @@ -147,7 +147,7 @@ def generate_mock_trips( within a threshold from the provided o/d pair, and some have labels. some other ones can be sampled to appear outside of the threshold of the o/d locations. - label_data and survey_data are optional dictionaries with labels and sample weights, for example: + label_data/sensed_label-data is optional dictionary with labels and sample weights, for example: { "mode_confirm": ['walk', 'bike'], "replaced_mode": ['drive', 'tnc'], @@ -156,14 +156,6 @@ def generate_mock_trips( "replaced_mode_weights": [0.4, 0.6], "purpose_weights": [0.1, 0.9] } - { - "group_hg4zz25.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'], - "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'], - "data.jsonDocResponse.group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'], - "group_hg4zz25.Please_identify_which_category_weights": [0.8, 0.1, 0.1], - "group_hg4zz25.Are_you_a_student": [0.9, 0.1], - "data.jsonDocResponse.group_hg4zz25.How_old_are_you_weights": [0.4, 0.4, 0.2] - } weights entries are optional and result in uniform sampling. @@ -172,7 +164,7 @@ def generate_mock_trips( :param origin: origin coordinates :param destination: destination coordinates :param label_data: dictionary of label data, see above, defaults to None - :param survey_data: dictionary of survey data, see above, defaults to None + :param sensed_label_data: dictionary of sensed data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided distance threshold in degrees WGS84, defaults to None :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01 @@ -200,25 +192,21 @@ def generate_mock_trips( purpose_weights=label_data.get('purpose_weights') ) trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts) - if survey_data is not None: - trip = add_trip_demographics(trip, survey_data) + if sensed_label_data is not None: + trip = add_sensed_labels(trip, sensed_label_data) result.append(trip) random.shuffle(result) return result -def add_trip_demographics(trip, survey_features): - response_id = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=22)) - trip['data']['jsonDocResponse'] = {response_id: {'group_yk8eb99': {}, 'group_hg4zz25': {}, 'group_pa5ah98': {}}} - for feature in survey_features: - feature_labels = feature.split(".") - feature_group = feature_labels[0] - feature_name = feature_labels[1] - feature_value = random.choice(survey_features[feature]) - trip['data']['jsonDocResponse'][response_id][feature_group].update({feature_name: feature_value}) +def add_sensed_labels(trip, sensed_label_data): + for label in sensed_label_data: + # TODO: currently just makes one inferred label 'drive'; should be random option from sensed_label_data + trip['data']['inferred_labels'] = {'mode_confirm': sensed_label_data[label][0]} return trip + if __name__ == '__main__': label_data = { "mode_confirm": ['walk', 'bike', 'drive'], From 94673568d55edcc668381ebf17c791e866c98167 Mon Sep 17 00:00:00 2001 From: aGuttman Date: Fri, 23 Dec 2022 13:39:21 -0700 Subject: [PATCH 12/12] Integrating Replace Mode Model Building out pipeline infrastructure to run replace mode --- .../inference/labels/inferrers.py | 16 +++ .../inference/labels/pipeline_replace_mode.py | 102 ++++++++++++++++++ .../modelling/trip_model/model_storage.py | 51 +++++++++ .../modelling/trip_model/run_model.py | 40 +++++++ emission/core/wrapper/labelprediction.py | 1 + .../analysis_timeseries_queries.py | 1 + 6 files changed, 211 insertions(+) create mode 100644 emission/analysis/classification/inference/labels/pipeline_replace_mode.py diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py index c6b939671..16d5cf3c3 100644 --- a/emission/analysis/classification/inference/labels/inferrers.py +++ b/emission/analysis/classification/inference/labels/inferrers.py @@ -156,3 +156,19 @@ def predict_cluster_confidence_discounting(trip, max_confidence=None, first_conf labels = copy.deepcopy(labels) for l in labels: l["p"] *= confidence_coeff return labels + +def predict_gradient_boosted_decision_tree(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None): + # load application config + model_type = eamtc.get_model_type() + model_storage = eamtc.get_model_storage() + labels, n = eamur.predict_labels_with_gbdt(trip, model_type, model_storage) + if n <= 0: # No model data or trip didn't match a cluster + logging.debug(f"In predict_gradient_boosted_decision_tree: n={n}; returning as-is") + return labels + + # confidence_coeff = n_to_confidence_coeff(n, max_confidence, first_confidence, confidence_multiplier) + # logging.debug(f"In predict_cluster_confidence_discounting: n={n}; discounting with coefficient {confidence_coeff}") + + labels = copy.deepcopy(labels) + for l in labels: l["p"] *= confidence_coeff + return labels \ No newline at end of file diff --git a/emission/analysis/classification/inference/labels/pipeline_replace_mode.py b/emission/analysis/classification/inference/labels/pipeline_replace_mode.py new file mode 100644 index 000000000..3c8656f00 --- /dev/null +++ b/emission/analysis/classification/inference/labels/pipeline_replace_mode.py @@ -0,0 +1,102 @@ +# Standard imports +import logging +import random +import copy + +# Our imports +import emission.storage.pipeline_queries as epq +import emission.storage.timeseries.abstract_timeseries as esta +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.core.wrapper.labelprediction as ecwl +import emission.core.wrapper.entry as ecwe +import emission.analysis.classification.inference.labels.inferrers as eacili +import emission.analysis.classification.inference.labels.ensembles as eacile + + +# For each algorithm in ecwl.AlgorithmTypes that runs on a trip (e.g., not the ensemble, which +# runs on the results of other algorithms), primary_algorithms specifies a corresponding +# function in eacili to run. This makes it easy to plug in additional algorithms later. +primary_algorithms = { + ecwl.AlgorithmTypes.GRADIENT_BOOSTED_DECISION_TREE: eacili.predict_gradient_boosted_decision_tree +} + +# ensemble specifies which algorithm in eacile to run. +# This makes it easy to test various ways of combining various algorithms. +ensemble = eacile.ensemble_first_prediction + + +# Does all the work necessary for a given user +def infer_labels(user_id): + time_query = epq.get_time_range_for_label_inference(user_id) + try: + lip = LabelInferencePipeline() + lip.user_id = user_id + lip.run_prediction_pipeline(user_id, time_query) + if lip.last_trip_done is None: + logging.debug("After run, last_trip_done == None, must be early return") + epq.mark_label_inference_done(user_id, lip.last_trip_done) + except: + logging.exception("Error while inferring labels, timestamp is unchanged") + epq.mark_label_inference_failed(user_id) + +# Code structure based on emission.analysis.classification.inference.mode.pipeline +# and emission.analysis.classification.inference.mode.rule_engine +class LabelInferencePipeline: + def __init__(self): + self._last_trip_done = None + + @property + def last_trip_done(self): + return self._last_trip_done + + # For a given user and time range, runs all the primary algorithms and ensemble, saves results + # to the database, and records progress + def run_prediction_pipeline(self, user_id, time_range): + self.ts = esta.TimeSeries.get_time_series(user_id) + self.toPredictTrips = esda.get_entries( + esda.CLEANED_TRIP_KEY, user_id, time_query=time_range) + for cleaned_trip in self.toPredictTrips: + # Create an inferred trip + cleaned_trip_dict = copy.copy(cleaned_trip)["data"] + inferred_trip = ecwe.Entry.create_entry(user_id, "analysis/inferred_trip", cleaned_trip_dict) + + # Run the algorithms and the ensemble, store results + results = self.compute_and_save_algorithms(inferred_trip) + ensemble = self.compute_and_save_ensemble(inferred_trip, results) + + # Put final results into the inferred trip and store it + inferred_trip["data"]["cleaned_trip"] = cleaned_trip.get_id() + inferred_trip["data"]["inferred_labels"] = ensemble["prediction"] + self.ts.insert(inferred_trip) + + if self._last_trip_done is None or self._last_trip_done["data"]["end_ts"] < cleaned_trip["data"]["end_ts"]: + self._last_trip_done = cleaned_trip + + # This is where the labels for a given trip are actually predicted. + # Though the only information passed in is the trip object, the trip object can provide the + # user_id and other potentially useful information. + def compute_and_save_algorithms(self, trip): + predictions = [] + for algorithm_id, algorithm_fn in primary_algorithms.items(): + prediction = algorithm_fn(trip) + lp = ecwl.Labelprediction() + lp.trip_id = trip.get_id() + lp.algorithm_id = algorithm_id + lp.prediction = prediction + lp.start_ts = trip["data"]["start_ts"] + lp.end_ts = trip["data"]["end_ts"] + self.ts.insert_data(self.user_id, "inference/labels", lp) + predictions.append(lp) + return predictions + + # Combine all our predictions into a single ensemble prediction. + # As a placeholder, we just take the first prediction. + # TODO: implement a real combination algorithm. + def compute_and_save_ensemble(self, trip, predictions): + il = ecwl.Labelprediction() + il.trip_id = trip.get_id() + il.start_ts = trip["data"]["start_ts"] + il.end_ts = trip["data"]["end_ts"] + (il.algorithm_id, il.prediction) = ensemble(trip, predictions) + self.ts.insert_data(self.user_id, "analysis/inferred_labels", il) + return il \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py index 7e94d1217..6cce5db63 100644 --- a/emission/analysis/modelling/trip_model/model_storage.py +++ b/emission/analysis/modelling/trip_model/model_storage.py @@ -35,6 +35,57 @@ def from_str(cls, str): msg = f"{str} is not a known ModelStorage, must be one of {names}" raise KeyError(msg) +def load_model_all_users(model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]: + """load a user label model from a model storage location + + :param user_id: the user to request a model for + :param model_type: expected type of model stored + :param model_storage: storage format + :return: the model representation as a Python Dict or None + :raises: TypeError if loaded model has different type than expected type + KeyError if the ModelType is not known + """ + if model_storage == ModelStorage.DOCUMENT_DATABASE: + + # retrieve stored model with timestamp that matches/exceeds the most + # recent PipelineState.TRIP_MODEL entry + ms = esma.ModelStorage.get_model_storage(0) + latest_model_entry = ms.get_current_model(key=esda.REPLACE_MODEL_STORE_KEY) + + if latest_model_entry is None: + logging.debug(f'no {model_type.name} model found') + return None + + write_ts = latest_model_entry['metadata']['write_ts'] + logging.debug(f'retrieved latest trip model recorded at timestamp {write_ts}') + logging.debug(latest_model_entry) + + # parse str to enum for ModelType + latest_model_type_str = latest_model_entry.get('data', {}).get('model_type') + if latest_model_type_str is None: + raise TypeError('stored model does not have a model type') + latest_model_type = eamum.ModelType.from_str(latest_model_type_str) + + # validate and return + if latest_model_entry is None: + return None + elif latest_model_type != model_type: + msg = ( + f"loading model has model type '{latest_model_type.name}' " + f"but was expected to have model type {model_type.name}" + ) + raise TypeError(msg) + else: + return latest_model_entry['data']['model'] + + else: + storage_types_str = ",".join(ModelStorage.names()) + msg = ( + f"unknown model storage type {model_storage}, must be one of " + f"{{{storage_types_str}}}" + ) + raise TypeError(msg) + def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]: """load a user label model from a model storage location diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index e3e2b1c4e..094eacef1 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -118,6 +118,27 @@ def predict_labels_with_n( predictions, n = model.predict(trip) return predictions, n +def predict_labels_with_gbdt( + trip: ecwc.Confirmedtrip, + model_type = eamumt.ModelType.GRADIENT_BOOSTED_DECISION_TREE, + model_storage = eamums.ModelStorage.DOCUMENT_DATABASE, + model_config = None): + """ + invoke the user label prediction model to predict labels for a trip. + + :param trip: the trip to predict labels for + :param model_type: type of prediction model to run + :param model_storage: location to read/write models + :param model_config: optional configuration for model, for debugging purposes + :return: a list of predictions + """ + user_id = trip['user_id'] + model = _load_stored_trip_model_all_users(model_type, model_storage, model_config) + if model is None: + return [], -1 + else: + predictions, n = model.predict(trip) + return predictions, n def _get_training_data(user_id: UUID, time_query: Optional[estt.TimeQuery]): """ @@ -159,6 +180,25 @@ def _load_stored_trip_model( model.from_dict(model_dict) return model +def _load_stored_trip_model_all_users( + model_type: eamumt.ModelType, + model_storage: eamums.ModelStorage, + model_config = None) -> Optional[eamuu.TripModel]: + """helper to build a user label prediction model class with the + contents of a stored model shared across all users. + + :param model_type: TripModel type configured for this OpenPATH server + :param model_storage: storage type + :param model_config: optional configuration for model, for debugging purposes + :return: model, or None if no model is stored for this user + """ + model_dict = eamums.load_model_all_users(model_type, model_storage) + if model_dict is None: + return None + else: + model = model_type.build(model_config) + model.from_dict(model_dict) + return model def _latest_timestamp(trips: List[ecwc.Confirmedtrip]) -> float: """extract the latest timestamp observed from a list of trips diff --git a/emission/core/wrapper/labelprediction.py b/emission/core/wrapper/labelprediction.py index c74609e5a..9725e95ee 100644 --- a/emission/core/wrapper/labelprediction.py +++ b/emission/core/wrapper/labelprediction.py @@ -19,6 +19,7 @@ class AlgorithmTypes(enum.Enum): TWO_STAGE_BIN_CLUSTER = 5 PLACEHOLDER_PREDICTOR_DEMO = 6 CONFIDENCE_DISCOUNTED_CLUSTER = 7 + GRADIENT_BOOSTED_DECISION_TREE = 8 class Labelprediction(ecwb.WrapperBase): diff --git a/emission/storage/decorations/analysis_timeseries_queries.py b/emission/storage/decorations/analysis_timeseries_queries.py index 9f8ab6a70..c684dc127 100644 --- a/emission/storage/decorations/analysis_timeseries_queries.py +++ b/emission/storage/decorations/analysis_timeseries_queries.py @@ -38,6 +38,7 @@ METRICS_DAILY_MEAN_MEDIAN_SPEED = "metrics/daily_mean_median_speed" INFERRED_LABELS_KEY = "inference/labels" TRIP_MODEL_STORE_KEY = "inference/trip_model" +REPLACE_MODEL_STORE_KEY = "inference/replace_model" # General methods