From 6dc499512191354fa0b55343e1d6126d65832551 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 17 Nov 2022 15:21:54 -0800
Subject: [PATCH 01/12] Blueprint for GBDT, work in progress

---
 .../gradient_boosted_decision_tree.py         | 122 ++++++++++++++++++
 .../TestGradientBoostedDecisionTree.py        |  76 +++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
 create mode 100644 emission/tests/modellingTests/TestGradientBoostedDecisionTree.py

diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
new file mode 100644
index 000000000..646ba0a1b
--- /dev/null
+++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
@@ -0,0 +1,122 @@
+import logging
+from tokenize import group
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+import sklearn.metrics as sm
+
+import emission.storage.timeseries.abstract_timeseries as esta
+import emission.analysis.modelling.tour_model.label_processing as lp
+import emission.analysis.modelling.trip_model.trip_model as eamuu
+import emission.analysis.modelling.trip_model.util as util
+import emission.analysis.modelling.trip_model.config as eamtc
+import emission.core.wrapper.confirmedtrip as ecwc
+
+
+class GradientBoostedDecisionTree(eamuu.TripModel):
+
+    is_incremental: bool = False  # overwritten during __init__
+    class_map: dict = {}  # overwritten during fit
+
+    def __init__(self, config=None):
+        """
+        Instantiate a gradient boosted decision tree for all users.
+
+        This uses the sklearn implementation of a gradient boosted
+        decision tree to classify unlabeled replacement modes.
+        
+        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
+
+        Replacement modes are considered to be the second-best choice for
+        a given trip (i.e., what mode would have been chosen if the actual
+        choice wasn't available).
+
+        The model is currently trained on data from all users.
+        """
+        if config is None:
+            config = eamtc.get_config_value_or_raise('model_parameters.gbdt')
+            logging.debug(f'GradientBoostedDecisionTree loaded model config from file')
+        else:
+            logging.debug(f'GradientBoostedDecisionTree using model config argument')
+        expected_keys = [
+            'incremental_evaluation',
+            'feature_list',
+            'dependent_var'
+        ]
+        for k in expected_keys:
+            if config.get(k) is None:
+                msg = f"gbdt trip model config missing expected key {k}"
+                raise KeyError(msg)
+        self.is_incremental = config['incremental_evaluation']
+        # Use the sklearn implementation of a GBDT
+        self.gbdt = GradientBoostingClassifier(n_estimators=50)
+        # Which features to use in the fit/prediction
+        self.feature_list = config['feature_list']
+        self.dependent_var = config['dependent_var']
+
+    def fit(self, trips: List[ecwc.Confirmedtrip]):
+        """train the model by passing data, where each row in the data
+        corresponds to a label at the matching index of the label input
+
+        :param trips: 2D array of features to train from
+        """
+        logging.debug(f'fit called with {len(trips)} trips')
+        unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips))
+        if len(unlabeled) > 0:
+            msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
+            raise Exception(msg)
+        X_train, y_train = self.extract_features(trips)
+        self.gbdt.fit(X_train, y_train)
+        logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data")
+
+    def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]:
+        logging.debug(f"running gradient boosted mode prediction")
+        X_train, y_train = self.extract_features(trip)
+        y_pred = self.gbdt.predict(X_train)
+        if y_pred is None:
+            logging.debug(f"unable to predict bin for trip {trip}")
+            return []
+        else:
+            logging.debug(f"made predictions {y_pred}")
+            return y_pred
+
+    def to_dict(self) -> Dict:
+        return self.gbdt
+
+    def from_dict(self, model: Dict):
+        self.gbdt = model
+
+    def extract_features(self, trips: ecwc.Confirmedtrip) -> List[float]:
+        # TODO: need to enable generic paths other than just user input for features
+        X = pd.DataFrame(
+            [[trip['data']['user_input'][x] for x in self.feature_list] for trip in trips],
+            columns=self.feature_list
+        )
+        y = pd.DataFrame(
+            [trip['data']['user_input'][self.dependent_var] for trip in trips],
+            columns=[self.dependent_var]
+        )
+        # Clean up and recode the feature columns for training/prediction
+        X_processed, y_processed = self._process_data(X, y)
+        return X_processed, y_processed
+
+    def _process_data(self, X, y):
+        """
+        helper function to transform binned features and labels.
+        """
+        # Any non-numeric dtype must be one-hot encoded (if unordered) or numerically coded (if ordered)
+        dummies = []
+        for col in X:
+            if X[col].dtype=='object':
+                dummies.append(pd.get_dummies(X[col], prefix=col))
+        X = pd.concat(dummies, axis=1)
+        # The outcome must be a single categorical column; recode to numeric
+        for col in y:
+            cat_list = list(pd.unique(y[col])).sort()
+            if y[col].dtype=='object':
+                y[col] = pd.Categorical(y[col], ordered=True, categories=cat_list)
+                y[col] = y[col].cat.codes
+        return X, y
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
new file mode 100644
index 000000000..f49b6c389
--- /dev/null
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -0,0 +1,76 @@
+import unittest
+import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg
+import emission.tests.modellingTests.modellingTestAssets as etmm
+import logging
+
+
+class TestGradientBoostedDecisionTree(unittest.TestCase):
+
+    def setUp(self) -> None:
+        logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
+        level=logging.DEBUG)
+
+    def testSmoke(self):
+        """
+        the model should fit and predict on normal data without errors
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": ['mode_confirm','purpose_confirm'],
+            "dependent_var": 'replaced_mode'
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(trips)
+        model.predict(trips)
+
+    def testUnseenTrainingClasses(self):
+        """
+        if a new class is added the model should re-train
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": ['mode_confirm','purpose_confirm'],
+            "dependent_var": 'replaced_mode'
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(trips)
+        model.predict(trips)
+
+        # any predicted values must 
+        # self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")

From 8a35ff05c5337fa1700b9e47791ef401f25a1815 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Wed, 23 Nov 2022 12:27:51 -0800
Subject: [PATCH 02/12] GBDT with all feature types

---
 .../gradient_boosted_decision_tree.py         |  82 ++++++++-----
 .../TestGradientBoostedDecisionTree.py        | 111 ++++++++++++++++--
 .../modellingTests/modellingTestAssets.py     |  12 ++
 3 files changed, 167 insertions(+), 38 deletions(-)

diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
index 646ba0a1b..cf2e33dac 100644
--- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
+++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
@@ -2,6 +2,7 @@
 from tokenize import group
 from typing import Dict, List, Optional, Tuple
 
+from math import radians, cos, sin, asin, sqrt
 import numpy as np
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
@@ -13,13 +14,13 @@
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.trip_model.util as util
 import emission.analysis.modelling.trip_model.config as eamtc
+import emission.core.get_database as edb
 import emission.core.wrapper.confirmedtrip as ecwc
 
 
 class GradientBoostedDecisionTree(eamuu.TripModel):
 
     is_incremental: bool = False  # overwritten during __init__
-    class_map: dict = {}  # overwritten during fit
 
     def __init__(self, config=None):
         """
@@ -71,11 +72,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
         X_train, y_train = self.extract_features(trips)
         self.gbdt.fit(X_train, y_train)
         logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data")
+        logging.info(f"training features were {X_train.columns}")
 
-    def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]:
+    def predict(self, trip: ecwc.Confirmedtrip) -> List[int]:
         logging.debug(f"running gradient boosted mode prediction")
-        X_train, y_train = self.extract_features(trip)
-        y_pred = self.gbdt.predict(X_train)
+        X_test, y_pred = self.extract_features(trip, is_prediction=True)
+        y_pred = self.gbdt.predict(X_test)
         if y_pred is None:
             logging.debug(f"unable to predict bin for trip {trip}")
             return []
@@ -84,39 +86,59 @@ def predict(self, trip: ecwc.Confirmedtrip) -> Tuple[List[Dict], int]:
             return y_pred
 
     def to_dict(self) -> Dict:
-        return self.gbdt
+        return dict(self.gbdt)
 
     def from_dict(self, model: Dict):
         self.gbdt = model
 
-    def extract_features(self, trips: ecwc.Confirmedtrip) -> List[float]:
-        # TODO: need to enable generic paths other than just user input for features
-        X = pd.DataFrame(
-            [[trip['data']['user_input'][x] for x in self.feature_list] for trip in trips],
-            columns=self.feature_list
-        )
-        y = pd.DataFrame(
-            [trip['data']['user_input'][self.dependent_var] for trip in trips],
-            columns=[self.dependent_var]
-        )
-        # Clean up and recode the feature columns for training/prediction
-        X_processed, y_processed = self._process_data(X, y)
-        return X_processed, y_processed
-
-    def _process_data(self, X, y):
-        """
-        helper function to transform binned features and labels.
-        """
-        # Any non-numeric dtype must be one-hot encoded (if unordered) or numerically coded (if ordered)
+    def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
+        # Get dataframe from json trips; fill in calculated columns
+        trips_df = pd.json_normalize(trips)
+        # distance
+        trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
+        trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1)
+        # collect all features
+        X = trips_df[self.feature_list]
+        # Any object/categorical dtype features must be one-hot encoded if unordered
         dummies = []
         for col in X:
             if X[col].dtype=='object':
                 dummies.append(pd.get_dummies(X[col], prefix=col))
         X = pd.concat(dummies, axis=1)
-        # The outcome must be a single categorical column; recode to numeric
-        for col in y:
-            cat_list = list(pd.unique(y[col])).sort()
-            if y[col].dtype=='object':
-                y[col] = pd.Categorical(y[col], ordered=True, categories=cat_list)
-                y[col] = y[col].cat.codes
+        # Only extract dependent var if fitting a new model
+        if is_prediction:
+            y = None
+        else:
+            y = trips_df[self.dependent_var].values
         return X, y
+
+    # If the non-mock trips have distance calculated then this can be removed
+    # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
+    def haversine(self, coord1, coord2):
+        """
+        Calculate the great circle distance in kilometers between two points 
+        on the earth (specified in decimal degrees)
+        """
+        lon1 = coord1[0]
+        lat1 = coord1[1]
+        lon2 = coord2[0]
+        lat2 = coord2[1]
+        # convert decimal degrees to radians 
+        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
+
+        # haversine formula 
+        dlon = lon2 - lon1 
+        dlat = lat2 - lat1 
+        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
+        c = 2 * asin(sqrt(a)) 
+        r = 3956 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
+        return c * r
+
+    def export_demographic_table(self, uuid_list):
+        print("Looking up details for %s" % uuid_list)
+        all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"}))
+        for s in all_survey_results:
+            s["data"]["user_id"] = s["user_id"]
+        all_survey_results_df = pd.json_normalize([s["data"] for s in all_survey_results])
+        all_survey_results_df.drop(columns=['xmlResponse', 'name', 'version', 'label'], axis=1, inplace=True)
+        return all_survey_results_df
\ No newline at end of file
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
index f49b6c389..f1a12ea6d 100644
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -10,6 +10,7 @@ def setUp(self) -> None:
         logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
         level=logging.DEBUG)
 
+
     def testSmoke(self):
         """
         the model should fit and predict on normal data without errors
@@ -34,16 +35,71 @@ def testSmoke(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": ['mode_confirm','purpose_confirm'],
-            "dependent_var": 'replaced_mode'
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
         model.predict(trips)
 
-    def testUnseenTrainingClasses(self):
+
+    def testUnseenFeatures(self):
+        """
+        if the input classes for a feature change throw sklearn error
+        """
+        train_label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        test_label_data = {
+            "mode_confirm": ['drive'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        train_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=train_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        test_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=test_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(train_trips)
+
+        with self.assertRaises(ValueError):
+            model.predict(test_trips)
+
+
+    def testNumeric(self):
         """
-        if a new class is added the model should re-train
+        the model should handle numeric and categorical variable types
         """
         label_data = {
             "mode_confirm": ['walk', 'bike', 'transit'],
@@ -65,12 +121,51 @@ def testUnseenTrainingClasses(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": ['mode_confirm','purpose_confirm'],
-            "dependent_var": 'replaced_mode'
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm',
+                'distance_miles'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
         model.predict(trips)
 
-        # any predicted values must 
-        # self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")
+
+    def testFull(self):
+        """
+        the model should handle survey, trip, and user input features
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk','bike','transit']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm',
+                'data.survey.age',
+                'data.survey.hhinc',
+                'distance_miles'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(trips)
+        model.predict(trips)
diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py
index 879a3a2ca..8b7ab7c6a 100644
--- a/emission/tests/modellingTests/modellingTestAssets.py
+++ b/emission/tests/modellingTests/modellingTestAssets.py
@@ -187,12 +187,24 @@ def generate_mock_trips(
             purpose_weights=label_data.get('purpose_weights')
         )
         trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts)
+        trip = add_trip_demographics(trip)
         result.append(trip)
         
     random.shuffle(result) 
     return result
 
 
+def add_trip_demographics(trip):
+    trip['data']['survey'] = {}
+    survey_features = {
+        'hhinc':['0-24999','25000-49000','50000-99999','100000+'],
+        'age':[x for x in range(0, 70)],
+        'veh':['0','1','2','3','4+']
+    }
+    for feature in survey_features:
+        trip['data']['survey'][feature] = random.choice(survey_features[feature])
+    return trip
+
 if __name__ == '__main__':
     label_data = {
         "mode_confirm": ['walk', 'bike', 'drive'],

From 8fa921ddf456d31d968b2008594ed5a7e9cb2847 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Wed, 23 Nov 2022 13:06:38 -0800
Subject: [PATCH 03/12] Decent baseline GBDT

---
 .../gradient_boosted_decision_tree.py         | 24 ++++++-------------
 .../TestGradientBoostedDecisionTree.py        |  7 +++++-
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
index cf2e33dac..12e350092 100644
--- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
+++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
@@ -52,9 +52,8 @@ def __init__(self, config=None):
                 msg = f"gbdt trip model config missing expected key {k}"
                 raise KeyError(msg)
         self.is_incremental = config['incremental_evaluation']
-        # Use the sklearn implementation of a GBDT
+        # use the sklearn implementation of a GBDT
         self.gbdt = GradientBoostingClassifier(n_estimators=50)
-        # Which features to use in the fit/prediction
         self.feature_list = config['feature_list']
         self.dependent_var = config['dependent_var']
 
@@ -86,20 +85,20 @@ def predict(self, trip: ecwc.Confirmedtrip) -> List[int]:
             return y_pred
 
     def to_dict(self) -> Dict:
-        return dict(self.gbdt)
+        return self.gbdt.get_params()
 
     def from_dict(self, model: Dict):
-        self.gbdt = model
+        self.gbdt.set_params(model)
 
     def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
-        # Get dataframe from json trips; fill in calculated columns
+        # get dataframe from json trips; fill in calculated columns
         trips_df = pd.json_normalize(trips)
         # distance
         trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
         trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1)
         # collect all features
         X = trips_df[self.feature_list]
-        # Any object/categorical dtype features must be one-hot encoded if unordered
+        # any object/categorical dtype features must be one-hot encoded if unordered
         dummies = []
         for col in X:
             if X[col].dtype=='object':
@@ -112,7 +111,7 @@ def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> Li
             y = trips_df[self.dependent_var].values
         return X, y
 
-    # If the non-mock trips have distance calculated then this can be removed
+    # if the non-mock trips have distance calculated then this can be removed
     # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
     def haversine(self, coord1, coord2):
         """
@@ -131,14 +130,5 @@ def haversine(self, coord1, coord2):
         dlat = lat2 - lat1 
         a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
         c = 2 * asin(sqrt(a)) 
-        r = 3956 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
+        r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
         return c * r
-
-    def export_demographic_table(self, uuid_list):
-        print("Looking up details for %s" % uuid_list)
-        all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"}))
-        for s in all_survey_results:
-            s["data"]["user_id"] = s["user_id"]
-        all_survey_results_df = pd.json_normalize([s["data"] for s in all_survey_results])
-        all_survey_results_df.drop(columns=['xmlResponse', 'name', 'version', 'label'], axis=1, inplace=True)
-        return all_survey_results_df
\ No newline at end of file
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
index f1a12ea6d..b506307f6 100644
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -2,6 +2,7 @@
 import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg
 import emission.tests.modellingTests.modellingTestAssets as etmm
 import logging
+import pandas as pd
 
 
 class TestGradientBoostedDecisionTree(unittest.TestCase):
@@ -168,4 +169,8 @@ def testFull(self):
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
-        model.predict(trips)
+        y = model.predict(trips)
+
+        # No class in predictions that's not in training data
+        for predicted_class in pd.unique(y):
+            self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))

From b99c49929b7e619e536f6de1e267323b3f8a443a Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Wed, 23 Nov 2022 13:42:03 -0800
Subject: [PATCH 04/12] Add basic SVM

---
 .../trip_model/support_vector_machine.py      | 132 +++++++++++++
 .../TestSupportVectorMachine.py               | 176 ++++++++++++++++++
 2 files changed, 308 insertions(+)
 create mode 100644 emission/analysis/modelling/trip_model/support_vector_machine.py
 create mode 100644 emission/tests/modellingTests/TestSupportVectorMachine.py

diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py
new file mode 100644
index 000000000..db7c79ea3
--- /dev/null
+++ b/emission/analysis/modelling/trip_model/support_vector_machine.py
@@ -0,0 +1,132 @@
+import logging
+from tokenize import group
+from typing import Dict, List, Optional, Tuple
+
+from math import radians, cos, sin, asin, sqrt
+import numpy as np
+import pandas as pd
+from sklearn.svm import SVC
+
+import emission.storage.timeseries.abstract_timeseries as esta
+import emission.analysis.modelling.tour_model.label_processing as lp
+import emission.analysis.modelling.trip_model.trip_model as eamuu
+import emission.analysis.modelling.trip_model.util as util
+import emission.analysis.modelling.trip_model.config as eamtc
+import emission.core.get_database as edb
+import emission.core.wrapper.confirmedtrip as ecwc
+
+
+class SupportVectorMachine(eamuu.TripModel):
+
+    is_incremental: bool = False  # overwritten during __init__
+
+    def __init__(self, config=None):
+        """
+        Instantiate a support vector machine for all users.
+
+        This uses the sklearn implementation of a support vector machine 
+        to classify unlabeled replacement modes.
+        
+        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
+
+        Replacement modes are considered to be the second-best choice for
+        a given trip (i.e., what mode would have been chosen if the actual
+        choice wasn't available).
+
+        The model is currently trained on data from all users.
+        """
+        if config is None:
+            config = eamtc.get_config_value_or_raise('model_parameters.svm')
+            logging.debug(f'SupportVectorMachine loaded model config from file')
+        else:
+            logging.debug(f'SupportVectorMachine using model config argument')
+        expected_keys = [
+            'incremental_evaluation',
+            'feature_list',
+            'dependent_var'
+        ]
+        for k in expected_keys:
+            if config.get(k) is None:
+                msg = f"svm trip model config missing expected key {k}"
+                raise KeyError(msg)
+        self.is_incremental = config['incremental_evaluation']
+        # use the sklearn implementation of a svm
+        self.svm = SVC()
+        self.feature_list = config['feature_list']
+        self.dependent_var = config['dependent_var']
+
+    def fit(self, trips: List[ecwc.Confirmedtrip]):
+        """train the model by passing data, where each row in the data
+        corresponds to a label at the matching index of the label input
+
+        :param trips: 2D array of features to train from
+        """
+        logging.debug(f'fit called with {len(trips)} trips')
+        unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips))
+        if len(unlabeled) > 0:
+            msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
+            raise Exception(msg)
+        X_train, y_train = self.extract_features(trips)
+        self.svm.fit(X_train, y_train)
+        logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data")
+        logging.info(f"training features were {X_train.columns}")
+
+    def predict(self, trip: ecwc.Confirmedtrip) -> List[int]:
+        logging.debug(f"running support vector mode prediction")
+        X_test, y_pred = self.extract_features(trip, is_prediction=True)
+        y_pred = self.svm.predict(X_test)
+        if y_pred is None:
+            logging.debug(f"unable to predict bin for trip {trip}")
+            return []
+        else:
+            logging.debug(f"made predictions {y_pred}")
+            return y_pred
+
+    def to_dict(self) -> Dict:
+        return self.svm.get_params()
+
+    def from_dict(self, model: Dict):
+        self.svm.set_params(model)
+
+    def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
+        # get dataframe from json trips; fill in calculated columns
+        trips_df = pd.json_normalize(trips)
+        # distance
+        trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
+        trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1)
+        # collect all features
+        X = trips_df[self.feature_list]
+        # any object/categorical dtype features must be one-hot encoded if unordered
+        dummies = []
+        for col in X:
+            if X[col].dtype=='object':
+                dummies.append(pd.get_dummies(X[col], prefix=col))
+        X = pd.concat(dummies, axis=1)
+        # Only extract dependent var if fitting a new model
+        if is_prediction:
+            y = None
+        else:
+            y = trips_df[self.dependent_var].values
+        return X, y
+
+    # if the non-mock trips have distance calculated then this can be removed
+    # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
+    def haversine(self, coord1, coord2):
+        """
+        Calculate the great circle distance in kilometers between two points 
+        on the earth (specified in decimal degrees)
+        """
+        lon1 = coord1[0]
+        lat1 = coord1[1]
+        lon2 = coord2[0]
+        lat2 = coord2[1]
+        # convert decimal degrees to radians 
+        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
+
+        # haversine formula 
+        dlon = lon2 - lon1 
+        dlat = lat2 - lat1 
+        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
+        c = 2 * asin(sqrt(a)) 
+        r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
+        return c * r
diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py
new file mode 100644
index 000000000..eff98d939
--- /dev/null
+++ b/emission/tests/modellingTests/TestSupportVectorMachine.py
@@ -0,0 +1,176 @@
+import unittest
+import emission.analysis.modelling.trip_model.support_vector_machine as eamts
+import emission.tests.modellingTests.modellingTestAssets as etmm
+import logging
+import pandas as pd
+
+
+class TestSupportVectorMachine(unittest.TestCase):
+
+    def setUp(self) -> None:
+        logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
+        level=logging.DEBUG)
+
+
+    def testSmoke(self):
+        """
+        the model should fit and predict on normal data without errors
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        model.fit(trips)
+        model.predict(trips)
+
+
+    def testUnseenFeatures(self):
+        """
+        if the input classes for a feature change throw sklearn error
+        """
+        train_label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        test_label_data = {
+            "mode_confirm": ['drive'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        train_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=train_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        test_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=test_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        model.fit(train_trips)
+
+        with self.assertRaises(ValueError):
+            model.predict(test_trips)
+
+
+    def testNumeric(self):
+        """
+        the model should handle numeric and categorical variable types
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm',
+                'distance_miles'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        model.fit(trips)
+        model.predict(trips)
+
+
+    def testFull(self):
+        """
+        the model should handle survey, trip, and user input features
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk','bike','transit']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm',
+                'data.survey.age',
+                'data.survey.hhinc',
+                'distance_miles'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        model.fit(trips)
+        y = model.predict(trips)
+
+        # No class in predictions that's not in training data
+        for predicted_class in pd.unique(y):
+            self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))

From f3c7160a53cfb2faade6b456bab3686e246b725a Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Wed, 23 Nov 2022 14:55:03 -0800
Subject: [PATCH 05/12] Clean up some shared code

---
 .../gradient_boosted_decision_tree.py         | 54 +--------------
 .../trip_model/support_vector_machine.py      | 50 +-------------
 .../analysis/modelling/trip_model/util.py     | 65 +++++++++++++++++++
 3 files changed, 70 insertions(+), 99 deletions(-)

diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
index 12e350092..ec354d5e4 100644
--- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
+++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
@@ -2,19 +2,11 @@
 from tokenize import group
 from typing import Dict, List, Optional, Tuple
 
-from math import radians, cos, sin, asin, sqrt
-import numpy as np
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import GradientBoostingClassifier
-import sklearn.metrics as sm
 
-import emission.storage.timeseries.abstract_timeseries as esta
-import emission.analysis.modelling.tour_model.label_processing as lp
 import emission.analysis.modelling.trip_model.trip_model as eamuu
-import emission.analysis.modelling.trip_model.util as util
+import emission.analysis.modelling.trip_model.util as eamtu
 import emission.analysis.modelling.trip_model.config as eamtc
-import emission.core.get_database as edb
 import emission.core.wrapper.confirmedtrip as ecwc
 
 
@@ -73,7 +65,7 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
         logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data")
         logging.info(f"training features were {X_train.columns}")
 
-    def predict(self, trip: ecwc.Confirmedtrip) -> List[int]:
+    def predict(self, trip: ecwc.Confirmedtrip) -> List:
         logging.debug(f"running gradient boosted mode prediction")
         X_test, y_pred = self.extract_features(trip, is_prediction=True)
         y_pred = self.gbdt.predict(X_test)
@@ -91,44 +83,4 @@ def from_dict(self, model: Dict):
         self.gbdt.set_params(model)
 
     def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
-        # get dataframe from json trips; fill in calculated columns
-        trips_df = pd.json_normalize(trips)
-        # distance
-        trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
-        trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1)
-        # collect all features
-        X = trips_df[self.feature_list]
-        # any object/categorical dtype features must be one-hot encoded if unordered
-        dummies = []
-        for col in X:
-            if X[col].dtype=='object':
-                dummies.append(pd.get_dummies(X[col], prefix=col))
-        X = pd.concat(dummies, axis=1)
-        # Only extract dependent var if fitting a new model
-        if is_prediction:
-            y = None
-        else:
-            y = trips_df[self.dependent_var].values
-        return X, y
-
-    # if the non-mock trips have distance calculated then this can be removed
-    # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
-    def haversine(self, coord1, coord2):
-        """
-        Calculate the great circle distance in kilometers between two points 
-        on the earth (specified in decimal degrees)
-        """
-        lon1 = coord1[0]
-        lat1 = coord1[1]
-        lon2 = coord2[0]
-        lat2 = coord2[1]
-        # convert decimal degrees to radians 
-        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
-
-        # haversine formula 
-        dlon = lon2 - lon1 
-        dlat = lat2 - lat1 
-        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
-        c = 2 * asin(sqrt(a)) 
-        r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
-        return c * r
+        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips)
diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py
index db7c79ea3..6ef311ea5 100644
--- a/emission/analysis/modelling/trip_model/support_vector_machine.py
+++ b/emission/analysis/modelling/trip_model/support_vector_machine.py
@@ -2,18 +2,12 @@
 from tokenize import group
 from typing import Dict, List, Optional, Tuple
 
-from math import radians, cos, sin, asin, sqrt
-import numpy as np
-import pandas as pd
 from sklearn.svm import SVC
 
-import emission.storage.timeseries.abstract_timeseries as esta
-import emission.analysis.modelling.tour_model.label_processing as lp
 import emission.analysis.modelling.trip_model.trip_model as eamuu
-import emission.analysis.modelling.trip_model.util as util
 import emission.analysis.modelling.trip_model.config as eamtc
-import emission.core.get_database as edb
 import emission.core.wrapper.confirmedtrip as ecwc
+import emission.analysis.modelling.trip_model.util as eamtu
 
 
 class SupportVectorMachine(eamuu.TripModel):
@@ -89,44 +83,4 @@ def from_dict(self, model: Dict):
         self.svm.set_params(model)
 
     def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
-        # get dataframe from json trips; fill in calculated columns
-        trips_df = pd.json_normalize(trips)
-        # distance
-        trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
-        trips_df['distance_miles'] = trips_coords.apply(lambda row : self.haversine(row[0],row[1]), axis=1)
-        # collect all features
-        X = trips_df[self.feature_list]
-        # any object/categorical dtype features must be one-hot encoded if unordered
-        dummies = []
-        for col in X:
-            if X[col].dtype=='object':
-                dummies.append(pd.get_dummies(X[col], prefix=col))
-        X = pd.concat(dummies, axis=1)
-        # Only extract dependent var if fitting a new model
-        if is_prediction:
-            y = None
-        else:
-            y = trips_df[self.dependent_var].values
-        return X, y
-
-    # if the non-mock trips have distance calculated then this can be removed
-    # https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
-    def haversine(self, coord1, coord2):
-        """
-        Calculate the great circle distance in kilometers between two points 
-        on the earth (specified in decimal degrees)
-        """
-        lon1 = coord1[0]
-        lat1 = coord1[1]
-        lon2 = coord2[0]
-        lat2 = coord2[1]
-        # convert decimal degrees to radians 
-        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
-
-        # haversine formula 
-        dlon = lon2 - lon1 
-        dlat = lat2 - lat1 
-        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
-        c = 2 * asin(sqrt(a)) 
-        r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
-        return c * r
+        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips)
diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
index 7d22b5d22..d70a8c625 100644
--- a/emission/analysis/modelling/trip_model/util.py
+++ b/emission/analysis/modelling/trip_model/util.py
@@ -1,7 +1,49 @@
 from typing import List, Tuple
 from past.utils import old_div
+from math import radians, cos, sin, asin, sqrt
 import numpy
 from numpy.linalg import norm
+import pandas as pd
+
+import emission.core.wrapper.confirmedtrip as ecwc
+
+
+def get_replacement_mode_features(feature_list, dependent_var, is_prediction, trips: ecwc.Confirmedtrip) -> List[float]:
+    """extract the features needed to perform replacement mode modeling from a set of
+    trips.
+
+    recodes variables that are categorical, and (TODO: scales numeric variables 0-1).
+
+    :param feature_list: features to gather from each trip
+    :type feature_list: List[string]
+    :param dependent_var: the feature to predict for each trip
+    :type dependent_var: string
+    :param is_prediction: whether or not to extract the dependent var
+    :type is_prediction: bool
+    :param trips: all trips to extract features from
+    :type trips: List[ecwc.Confirmedtrip]
+    :return: the training X features and y for the replacement mode model
+    :rtype: Tuple[List[List[float]], List[]]
+    """
+    # get dataframe from json trips; fill in calculated columns
+    trips_df = pd.json_normalize(trips)
+    # distance
+    trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
+    trips_df['distance_miles'] = trips_coords.apply(lambda row : haversine(row[0],row[1]), axis=1)
+    # collect all features
+    X = trips_df[feature_list]
+    # any object/categorical dtype features must be one-hot encoded if unordered
+    dummies = []
+    for col in X:
+        if X[col].dtype=='object':
+            dummies.append(pd.get_dummies(X[col], prefix=col))
+    X = pd.concat(dummies, axis=1)
+    # Only extract dependent var if fitting a new model
+    if is_prediction:
+        y = None
+    else:
+        y = trips_df[dependent_var].values
+    return X, y
 
 
 def find_knee_point(values: List[float]) -> Tuple[float, int]:
@@ -39,3 +81,26 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]:
             index = i
     value = values[index]
     return [index, value]
+
+
+# if the non-mock trips have distance calculated then this can be removed
+# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
+def haversine(coord1, coord2):
+    """
+    Calculate the great circle distance in kilometers between two points 
+    on the earth (specified in decimal degrees)
+    """
+    lon1 = coord1[0]
+    lat1 = coord1[1]
+    lon2 = coord2[0]
+    lat2 = coord2[1]
+    # convert decimal degrees to radians 
+    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
+
+    # haversine formula 
+    dlon = lon2 - lon1 
+    dlat = lat2 - lat1 
+    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
+    c = 2 * asin(sqrt(a)) 
+    r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
+    return c * r
\ No newline at end of file

From a4cb4f3b80a1a6dad6ad3f7a68850c6a5e8d5120 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 1 Dec 2022 11:13:41 -0800
Subject: [PATCH 06/12] Make SVM incremental

---
 .../trip_model/support_vector_machine.py      |  47 ++++++--
 .../TestSupportVectorMachine.py               | 113 ++++++++++++++++++
 2 files changed, 152 insertions(+), 8 deletions(-)

diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py
index 6ef311ea5..c2b780e6a 100644
--- a/emission/analysis/modelling/trip_model/support_vector_machine.py
+++ b/emission/analysis/modelling/trip_model/support_vector_machine.py
@@ -2,7 +2,8 @@
 from tokenize import group
 from typing import Dict, List, Optional, Tuple
 
-from sklearn.svm import SVC
+import numpy as np
+from sklearn.linear_model import SGDClassifier
 
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.trip_model.config as eamtc
@@ -13,15 +14,25 @@
 class SupportVectorMachine(eamuu.TripModel):
 
     is_incremental: bool = False  # overwritten during __init__
+    incremental_classes: list = None # overwritten when fit is initialized
 
     def __init__(self, config=None):
         """
-        Instantiate a support vector machine for all users.
+        Instantiate a linear support vector machine for all users.
 
         This uses the sklearn implementation of a support vector machine 
-        to classify unlabeled replacement modes.
-        
-        https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
+        to classify unlabeled replacement modes. The SVM is linear, and is fit 
+        with the more general SGDClassifier class which can accommodate online 
+        learning:
+
+        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
+
+        For anyone looking to implement a differnt online learning model in the 
+        future, here is a list of sklearn models that implement "partial_fit" 
+        and would be candidates for the online learning approach implemented 
+        here:
+
+        https://scikit-learn.org/0.15/modules/scaling_strategies.html
 
         Replacement modes are considered to be the second-best choice for
         a given trip (i.e., what mode would have been chosen if the actual
@@ -45,13 +56,22 @@ def __init__(self, config=None):
                 raise KeyError(msg)
         self.is_incremental = config['incremental_evaluation']
         # use the sklearn implementation of a svm
-        self.svm = SVC()
+        self.svm = SGDClassifier()
         self.feature_list = config['feature_list']
         self.dependent_var = config['dependent_var']
 
     def fit(self, trips: List[ecwc.Confirmedtrip]):
         """train the model by passing data, where each row in the data
-        corresponds to a label at the matching index of the label input
+        corresponds to a label at the matching index of the label input.
+
+        If using an incremental model, the initial call to fit will store 
+        the list of unique classes in y. If additional classes are added in the 
+        future, the model will need to be re-initialized on data that 
+        contains all of those classes, otherwise sklearn will throw an 
+        error. Additionally, any categorical variables used as features 
+        must have all classes present in the initial training data. If not, 
+        they will throw an sklearn error due to differences in the number of 
+        OHE variables in the data.
 
         :param trips: 2D array of features to train from
         """
@@ -61,7 +81,18 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
             msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
             raise Exception(msg)
         X_train, y_train = self.extract_features(trips)
-        self.svm.fit(X_train, y_train)
+        # The first time partial_fit is called, it must include a list of the unique classes in y
+        if self.is_incremental and self.incremental_classes is None:
+            logging.debug(f'initializing incremental model fit')
+            self.incremental_classes = np.unique(y_train)
+            self.svm.partial_fit(X_train, y_train, self.incremental_classes)
+        # For all future partial fits, there is no need to pass the classes again
+        elif self.is_incremental and self.incremental_classes is not None:
+            logging.debug(f'updating incremental model fit')
+            self.svm.partial_fit(X_train, y_train)
+        # If not incremental, train regularly
+        else:
+            self.svm.fit(X_train, y_train)
         logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data")
         logging.info(f"training features were {X_train.columns}")
 
diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py
index eff98d939..8cc5d354e 100644
--- a/emission/tests/modellingTests/TestSupportVectorMachine.py
+++ b/emission/tests/modellingTests/TestSupportVectorMachine.py
@@ -174,3 +174,116 @@ def testFull(self):
         # No class in predictions that's not in training data
         for predicted_class in pd.unique(y):
             self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))
+
+
+    def testIncremental(self):
+        """
+        the model should fit and predict incrementally on normal data without errors
+        """
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        initial_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        additional_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n*5, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": True,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        # Start with some initialization data
+        model.fit(initial_trips)
+        # Train on additional sets of data and predict for initial data
+        for i in range(0, 5):
+            model.fit(additional_trips[i:(i+1)*n])
+            model.predict(initial_trips)
+
+
+    def testUnseenClassesIncremental(self):
+        """
+        if the input classes for a feature change throw sklearn error
+        """
+        train_label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        test_label_data = {
+            "mode_confirm": ['drive'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        initial_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=train_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        additional_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n*5, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=train_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        test_trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=test_label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": [
+                'data.user_input.mode_confirm',
+                'data.user_input.purpose_confirm'
+            ],
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        # Start with some initialization data
+        model.fit(initial_trips)
+        # Train on additional sets of data
+        for i in range(0, 5):
+            model.fit(additional_trips[i:(i+1)*n])
+
+        # If an unseen class is introduced, allow sklearn to throw error
+        with self.assertRaises(ValueError):
+            model.predict(test_trips)

From 10e5c7404e565df6098d21bc16b9a448a0872771 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 1 Dec 2022 11:34:20 -0800
Subject: [PATCH 07/12] Add gbdt to model types

---
 emission/analysis/modelling/trip_model/model_type.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py
index b5e761fb0..fcb5b552f 100644
--- a/emission/analysis/modelling/trip_model/model_type.py
+++ b/emission/analysis/modelling/trip_model/model_type.py
@@ -3,6 +3,7 @@
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.similarity.od_similarity as eamso
 import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug
+import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg
 
 
 SIMILARITY_THRESHOLD_METERS=500
@@ -11,6 +12,7 @@
 class ModelType(Enum):
     # ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS'
     GREEDY_SIMILARITY_BINNING = 'GREEDY'
+    GRADIENT_BOOSTED_DECISION_TREE = 'GBDT'
     
     def build(self, config=None) -> eamuu.TripModel:
         """
@@ -25,7 +27,8 @@ def build(self, config=None) -> eamuu.TripModel:
         """
         # Dict[ModelType, TripModel]
         MODELS = {
-                ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config)
+                ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config),
+                ModelType.GRADIENT_BOOSTED_DECISION_TREE: eamtg.GradientBoostedDecisionTree(config)
         }
         model = MODELS.get(self)
         if model is None:

From 063d9abd37824c813d60452b69bada7b45a000b9 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 8 Dec 2022 15:11:04 -0800
Subject: [PATCH 08/12] Addressing comments, move classes to config

---
 conf/analysis/trip_model.conf.json.sample     |  34 +++++
 .../gradient_boosted_decision_tree.py         |  13 +-
 .../analysis/modelling/trip_model/util.py     |  57 +++----
 .../TestGradientBoostedDecisionTree.py        | 143 +++++++++++++++---
 .../modellingTests/modellingTestAssets.py     |   2 +
 5 files changed, 184 insertions(+), 65 deletions(-)

diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample
index 845e67a6a..ac9208eaf 100644
--- a/conf/analysis/trip_model.conf.json.sample
+++ b/conf/analysis/trip_model.conf.json.sample
@@ -8,6 +8,40 @@
             "similarity_threshold_meters": 500,
             "apply_cutoff": false,
             "incremental_evaluation": false
+        },
+        "gbdt": {
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "drive",
+                    "bike",
+                    "walk",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": "data.user_input.replaced_mode",
+            "incremental_evaluation": false
+        },
+        "svm": {
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "drive",
+                    "bike",
+                    "walk",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": "data.user_input.replaced_mode",
+            "incremental_evaluation": true
         }
     }
 }
\ No newline at end of file
diff --git a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
index ec354d5e4..5b2968318 100644
--- a/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
+++ b/emission/analysis/modelling/trip_model/gradient_boosted_decision_tree.py
@@ -2,7 +2,7 @@
 from tokenize import group
 from typing import Dict, List, Optional, Tuple
 
-from sklearn.ensemble import GradientBoostingClassifier
+import sklearn.ensemble as ske
 
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.trip_model.util as eamtu
@@ -25,7 +25,8 @@ def __init__(self, config=None):
 
         Replacement modes are considered to be the second-best choice for
         a given trip (i.e., what mode would have been chosen if the actual
-        choice wasn't available).
+        choice wasn't available). These labels are gathered from the user 
+        along with the chosen mode and trip purpose after the trip takes place.
 
         The model is currently trained on data from all users.
         """
@@ -45,7 +46,7 @@ def __init__(self, config=None):
                 raise KeyError(msg)
         self.is_incremental = config['incremental_evaluation']
         # use the sklearn implementation of a GBDT
-        self.gbdt = GradientBoostingClassifier(n_estimators=50)
+        self.gbdt = ske.GradientBoostingClassifier(n_estimators=50)
         self.feature_list = config['feature_list']
         self.dependent_var = config['dependent_var']
 
@@ -65,12 +66,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
         logging.info(f"gradient boosted decision tree model fit to {len(X_train)} rows of trip data")
         logging.info(f"training features were {X_train.columns}")
 
-    def predict(self, trip: ecwc.Confirmedtrip) -> List:
+    def predict(self, trip: ecwc.Confirmedtrip) -> List[str]:
         logging.debug(f"running gradient boosted mode prediction")
         X_test, y_pred = self.extract_features(trip, is_prediction=True)
         y_pred = self.gbdt.predict(X_test)
         if y_pred is None:
-            logging.debug(f"unable to predict bin for trip {trip}")
+            logging.debug(f"unable to predict mode for trip {trip}")
             return []
         else:
             logging.debug(f"made predictions {y_pred}")
@@ -83,4 +84,4 @@ def from_dict(self, model: Dict):
         self.gbdt.set_params(model)
 
     def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
-        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips)
+        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, trips, is_prediction)
diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
index d70a8c625..b42e88bef 100644
--- a/emission/analysis/modelling/trip_model/util.py
+++ b/emission/analysis/modelling/trip_model/util.py
@@ -8,7 +8,7 @@
 import emission.core.wrapper.confirmedtrip as ecwc
 
 
-def get_replacement_mode_features(feature_list, dependent_var, is_prediction, trips: ecwc.Confirmedtrip) -> List[float]:
+def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
     """extract the features needed to perform replacement mode modeling from a set of
     trips.
 
@@ -25,24 +25,34 @@ def get_replacement_mode_features(feature_list, dependent_var, is_prediction, tr
     :return: the training X features and y for the replacement mode model
     :rtype: Tuple[List[List[float]], List[]]
     """
-    # get dataframe from json trips; fill in calculated columns
+    # get dataframe from json trips
     trips_df = pd.json_normalize(trips)
-    # distance
-    trips_coords = trips_df[['data.start_loc.coordinates','data.end_loc.coordinates']]
-    trips_df['distance_miles'] = trips_coords.apply(lambda row : haversine(row[0],row[1]), axis=1)
-    # collect all features
-    X = trips_df[feature_list]
-    # any object/categorical dtype features must be one-hot encoded if unordered
+    X = trips_df[list(feature_list.keys())]
+    # any features that are strings must be encoded as numeric variables
+    # we use one-hot encoding for categorical variables
+    # https://pbpython.com/pandas_dtypes.html
     dummies = []
+    numeric = []
     for col in X:
+        # object column == string or mixed variable
         if X[col].dtype=='object':
-            dummies.append(pd.get_dummies(X[col], prefix=col))
-    X = pd.concat(dummies, axis=1)
-    # Only extract dependent var if fitting a new model
+            cat_col = pd.Categorical(X[col], categories=feature_list[col])
+            # If new features are present in X_test, throw value error
+            if cat_col.isnull().any():
+                raise ValueError(f"Cannot predict on unseen classes in: {col}")
+            dummies.append(pd.get_dummies(cat_col, prefix=col))
+        else:
+            numeric.append(X[col])
+    numeric.extend(dummies)
+    X = pd.concat(numeric, axis=1)
+    # only extract dependent var if fitting a new model
+    # for the dependent variable of a classification model, sklearn will accept strings
+    # so no need to recode these to numeric and deal with complications of storing labels
+    # and decoding them later
     if is_prediction:
         y = None
     else:
-        y = trips_df[dependent_var].values
+        y = trips_df[dependent_var]
     return X, y
 
 
@@ -81,26 +91,3 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]:
             index = i
     value = values[index]
     return [index, value]
-
-
-# if the non-mock trips have distance calculated then this can be removed
-# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
-def haversine(coord1, coord2):
-    """
-    Calculate the great circle distance in kilometers between two points 
-    on the earth (specified in decimal degrees)
-    """
-    lon1 = coord1[0]
-    lat1 = coord1[1]
-    lon2 = coord2[0]
-    lat2 = coord2[1]
-    # convert decimal degrees to radians 
-    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
-
-    # haversine formula 
-    dlon = lon2 - lon1 
-    dlat = lat2 - lat1 
-    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
-    c = 2 * asin(sqrt(a)) 
-    r = 3956 # radius of earth in kilometers. Use 3956 for miles. Determines return value units.
-    return c * r
\ No newline at end of file
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
index b506307f6..864d3bf29 100644
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -3,6 +3,7 @@
 import emission.tests.modellingTests.modellingTestAssets as etmm
 import logging
 import pandas as pd
+import random
 
 
 class TestGradientBoostedDecisionTree(unittest.TestCase):
@@ -36,11 +37,19 @@ def testSmoke(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": "data.user_input.replaced_mode"
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
@@ -50,6 +59,7 @@ def testSmoke(self):
     def testUnseenFeatures(self):
         """
         if the input classes for a feature change throw sklearn error
+        the test mode_confirm includes 'drive' which is not in the training set nor config
         """
         train_label_data = {
             "mode_confirm": ['walk', 'bike', 'transit'],
@@ -85,17 +95,25 @@ def testUnseenFeatures(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    'walk',
+                    'bike',
+                    'transit'
+                ],
+                "data.user_input.purpose_confirm": [
+                    'work',
+                    'home',
+                    'school'
+                ]
+            },
             "dependent_var": 'data.user_input.replaced_mode'
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(train_trips)
 
         with self.assertRaises(ValueError):
-            model.predict(test_trips)
+            y = model.predict(test_trips)
 
 
     def testNumeric(self):
@@ -122,16 +140,27 @@ def testNumeric(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm',
-                'distance_miles'
-            ],
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    'walk',
+                    'bike',
+                    'transit'
+                ],
+                "data.user_input.purpose_confirm": [
+                    'work',
+                    'home',
+                    'school'
+                ],
+                "data.distance": None
+            },
             "dependent_var": 'data.user_input.replaced_mode'
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
-        model.fit(trips)
-        model.predict(trips)
+        X_train, y_train = model.extract_features(trips)
+        # 3 features for mode confirm, 3 for trip purpose, 1 for distance
+        self.assertEqual(len(X_train.columns), 7)
+        # all feature columns should be strictly numeric
+        self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all())
 
 
     def testFull(self):
@@ -158,13 +187,26 @@ def testFull(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm',
-                'data.survey.age',
-                'data.survey.hhinc',
-                'distance_miles'
-            ],
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    'walk',
+                    'bike',
+                    'transit'
+                ],
+                "data.user_input.purpose_confirm": [
+                    'work',
+                    'home',
+                    'school'
+                ],
+                "data.distance": None,
+                "data.survey.age": None,
+                "data.survey.hhinc": [
+                    '0-24999',
+                    '25000-49000',
+                    '50000-99999',
+                    '100000+'
+                ]
+            },
             "dependent_var": 'data.user_input.replaced_mode'
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
@@ -174,3 +216,56 @@ def testFull(self):
         # No class in predictions that's not in training data
         for predicted_class in pd.unique(y):
             self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))
+
+
+    def testPredictions(self):
+        """
+        with a fixed seed, the model should make consistent predictions
+        """
+        random.seed(42)
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk','bike','transit']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    'walk',
+                    'bike',
+                    'transit'
+                ],
+                "data.user_input.purpose_confirm": [
+                    'work',
+                    'home',
+                    'school'
+                ]
+            },
+            "dependent_var": 'data.user_input.replaced_mode'
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(trips)
+        y = model.predict(trips)
+
+        # Test that predicted == expected
+        expected_result = [
+            'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit',
+            'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive',
+            'transit', 'transit', 'walk', 'walk'
+        ]
+        for i, prediction in enumerate(y):
+            self.assertEqual(prediction, expected_result[i])
diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py
index 8b7ab7c6a..c3cef0f42 100644
--- a/emission/tests/modellingTests/modellingTestAssets.py
+++ b/emission/tests/modellingTests/modellingTestAssets.py
@@ -2,6 +2,7 @@
 from typing import Optional, Tuple, List, Dict
 from uuid import UUID
 import emission.analysis.modelling.trip_model.trip_model as eamtm
+import emission.core.common as ecc
 import emission.core.wrapper.confirmedtrip as ecwc
 
 import emission.core.wrapper.entry as ecwe
@@ -120,6 +121,7 @@ def build_mock_trip(
             "type": "Point",
             "coordinates": destination
         },
+        "distance": ecc.calDistance(origin, destination),
         "user_input": labels
     }
 

From 2e1822620911b9a0fe671317924b4ec62a7c1f26 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 8 Dec 2022 19:43:12 -0800
Subject: [PATCH 09/12] Incremental SVM, more testing, add classes to configs

---
 conf/analysis/trip_model.conf.json.sample     |  20 +-
 .../trip_model/support_vector_machine.py      |  47 ++--
 .../analysis/modelling/trip_model/util.py     |   6 +-
 .../TestGradientBoostedDecisionTree.py        |  53 +++-
 .../TestSupportVectorMachine.py               | 247 +++++++++++++++---
 5 files changed, 302 insertions(+), 71 deletions(-)

diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample
index ac9208eaf..b18da6a50 100644
--- a/conf/analysis/trip_model.conf.json.sample
+++ b/conf/analysis/trip_model.conf.json.sample
@@ -23,7 +23,15 @@
                     "school"
                 ]
             },
-            "dependent_var": "data.user_input.replaced_mode",
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            },
             "incremental_evaluation": false
         },
         "svm": {
@@ -40,7 +48,15 @@
                     "school"
                 ]
             },
-            "dependent_var": "data.user_input.replaced_mode",
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            },
             "incremental_evaluation": true
         }
     }
diff --git a/emission/analysis/modelling/trip_model/support_vector_machine.py b/emission/analysis/modelling/trip_model/support_vector_machine.py
index c2b780e6a..f615a66c7 100644
--- a/emission/analysis/modelling/trip_model/support_vector_machine.py
+++ b/emission/analysis/modelling/trip_model/support_vector_machine.py
@@ -3,7 +3,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-from sklearn.linear_model import SGDClassifier
+import sklearn as ske
 
 import emission.analysis.modelling.trip_model.trip_model as eamuu
 import emission.analysis.modelling.trip_model.config as eamtc
@@ -14,7 +14,7 @@
 class SupportVectorMachine(eamuu.TripModel):
 
     is_incremental: bool = False  # overwritten during __init__
-    incremental_classes: list = None # overwritten when fit is initialized
+    is_initialized: bool = False  # overwritten during first fit()
 
     def __init__(self, config=None):
         """
@@ -36,7 +36,8 @@ def __init__(self, config=None):
 
         Replacement modes are considered to be the second-best choice for
         a given trip (i.e., what mode would have been chosen if the actual
-        choice wasn't available).
+        choice wasn't available). These labels are gathered from the user 
+        along with the chosen mode and trip purpose after the trip takes place.
 
         The model is currently trained on data from all users.
         """
@@ -56,7 +57,7 @@ def __init__(self, config=None):
                 raise KeyError(msg)
         self.is_incremental = config['incremental_evaluation']
         # use the sklearn implementation of a svm
-        self.svm = SGDClassifier()
+        self.svm = ske.linear_model.SGDClassifier()
         self.feature_list = config['feature_list']
         self.dependent_var = config['dependent_var']
 
@@ -65,13 +66,12 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
         corresponds to a label at the matching index of the label input.
 
         If using an incremental model, the initial call to fit will store 
-        the list of unique classes in y. If additional classes are added in the 
-        future, the model will need to be re-initialized on data that 
-        contains all of those classes, otherwise sklearn will throw an 
-        error. Additionally, any categorical variables used as features 
-        must have all classes present in the initial training data. If not, 
-        they will throw an sklearn error due to differences in the number of 
-        OHE variables in the data.
+        the list of unique classes in y. The config file is used to store 
+        a lookup for known classes for each categorical feature. This prevents 
+        the need to store a lookup in the model itself, which must be updated 
+        every time the model sees a new class or feature OR when it is given 
+        an incremental training request that does not contain every feature class
+        etc.
 
         :param trips: 2D array of features to train from
         """
@@ -81,27 +81,30 @@ def fit(self, trips: List[ecwc.Confirmedtrip]):
             msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}'
             raise Exception(msg)
         X_train, y_train = self.extract_features(trips)
-        # The first time partial_fit is called, it must include a list of the unique classes in y
-        if self.is_incremental and self.incremental_classes is None:
+        # the first time partial_fit is called, the incremental classes are initialized to the unique y values
+        if self.is_incremental and not self.is_initialized:
             logging.debug(f'initializing incremental model fit')
-            self.incremental_classes = np.unique(y_train)
-            self.svm.partial_fit(X_train, y_train, self.incremental_classes)
-        # For all future partial fits, there is no need to pass the classes again
-        elif self.is_incremental and self.incremental_classes is not None:
+            self.svm.partial_fit(X_train, y_train, self.dependent_var['classes'])
+            self.is_initialized = True
+        # for all future partial fits, there is no need to pass the classes again
+        elif self.is_incremental and self.is_initialized:
             logging.debug(f'updating incremental model fit')
-            self.svm.partial_fit(X_train, y_train)
-        # If not incremental, train regularly
+            try:
+                self.svm.partial_fit(X_train, y_train)
+            except ValueError:
+                raise ValueError("Error in incremental fit: Likely an unseen feature or dependent class was found")
+        # if not incremental, just train regularly
         else:
             self.svm.fit(X_train, y_train)
         logging.info(f"support vector machine model fit to {len(X_train)} rows of trip data")
         logging.info(f"training features were {X_train.columns}")
 
-    def predict(self, trip: ecwc.Confirmedtrip) -> List[int]:
+    def predict(self, trip: ecwc.Confirmedtrip) -> List[str]:
         logging.debug(f"running support vector mode prediction")
         X_test, y_pred = self.extract_features(trip, is_prediction=True)
         y_pred = self.svm.predict(X_test)
         if y_pred is None:
-            logging.debug(f"unable to predict bin for trip {trip}")
+            logging.debug(f"unable to predict mode for trip {trip}")
             return []
         else:
             logging.debug(f"made predictions {y_pred}")
@@ -114,4 +117,4 @@ def from_dict(self, model: Dict):
         self.svm.set_params(model)
 
     def extract_features(self, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
-        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, is_prediction, trips)
+        return eamtu.get_replacement_mode_features(self.feature_list, self.dependent_var, trips, is_prediction)
diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
index b42e88bef..996dec1bf 100644
--- a/emission/analysis/modelling/trip_model/util.py
+++ b/emission/analysis/modelling/trip_model/util.py
@@ -12,7 +12,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     """extract the features needed to perform replacement mode modeling from a set of
     trips.
 
-    recodes variables that are categorical, and (TODO: scales numeric variables 0-1).
+    recodes variables that are categorical.
 
     :param feature_list: features to gather from each trip
     :type feature_list: List[string]
@@ -27,7 +27,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     """
     # get dataframe from json trips
     trips_df = pd.json_normalize(trips)
-    X = trips_df[list(feature_list.keys())]
+    X = trips_df[feature_list.keys()]
     # any features that are strings must be encoded as numeric variables
     # we use one-hot encoding for categorical variables
     # https://pbpython.com/pandas_dtypes.html
@@ -52,7 +52,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     if is_prediction:
         y = None
     else:
-        y = trips_df[dependent_var]
+        y = trips_df[dependent_var['name']]
     return X, y
 
 
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
index 864d3bf29..9b928fbcc 100644
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -49,7 +49,15 @@ def testSmoke(self):
                     "school"
                 ]
             },
-            "dependent_var": "data.user_input.replaced_mode"
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
@@ -107,7 +115,15 @@ def testUnseenFeatures(self):
                     'school'
                 ]
             },
-            "dependent_var": 'data.user_input.replaced_mode'
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(train_trips)
@@ -153,7 +169,15 @@ def testNumeric(self):
                 ],
                 "data.distance": None
             },
-            "dependent_var": 'data.user_input.replaced_mode'
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         X_train, y_train = model.extract_features(trips)
@@ -207,7 +231,15 @@ def testFull(self):
                     '100000+'
                 ]
             },
-            "dependent_var": 'data.user_input.replaced_mode'
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
@@ -215,7 +247,8 @@ def testFull(self):
 
         # No class in predictions that's not in training data
         for predicted_class in pd.unique(y):
-            self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))
+            
+            self.assertIn(predicted_class, model_config['dependent_var']['classes'])
 
 
     def testPredictions(self):
@@ -255,7 +288,15 @@ def testPredictions(self):
                     'school'
                 ]
             },
-            "dependent_var": 'data.user_input.replaced_mode'
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamtg.GradientBoostedDecisionTree(model_config)
         model.fit(trips)
diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py
index 8cc5d354e..96df61a64 100644
--- a/emission/tests/modellingTests/TestSupportVectorMachine.py
+++ b/emission/tests/modellingTests/TestSupportVectorMachine.py
@@ -3,6 +3,7 @@
 import emission.tests.modellingTests.modellingTestAssets as etmm
 import logging
 import pandas as pd
+import random
 
 
 class TestSupportVectorMachine(unittest.TestCase):
@@ -36,11 +37,27 @@ def testSmoke(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
         model.fit(trips)
@@ -85,11 +102,27 @@ def testUnseenFeatures(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
         model.fit(train_trips)
@@ -122,16 +155,35 @@ def testNumeric(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm',
-                'distance_miles'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ],
+                "data.distance": None
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
-        model.fit(trips)
-        model.predict(trips)
+        X_train, y_train = model.extract_features(trips)
+        # 3 features for mode confirm, 3 for trip purpose, 1 for distance
+        self.assertEqual(len(X_train.columns), 7)
+        # all feature columns should be strictly numeric
+        self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all())
 
 
     def testFull(self):
@@ -158,14 +210,35 @@ def testFull(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm',
-                'data.survey.age',
-                'data.survey.hhinc',
-                'distance_miles'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ],
+                "data.survey.age": None,
+                "data.survey.hhinc": [
+                    '0-24999',
+                    '25000-49000',
+                    '50000-99999',
+                    '100000+'
+                ],
+                "data.distance": None
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
         model.fit(trips)
@@ -173,7 +246,7 @@ def testFull(self):
 
         # No class in predictions that's not in training data
         for predicted_class in pd.unique(y):
-            self.assertIn(predicted_class, pd.unique(pd.json_normalize(trips)[model_config['dependent_var']]))
+            self.assertIn(predicted_class, model_config['dependent_var']['classes'])
 
 
     def testIncremental(self):
@@ -209,11 +282,27 @@ def testIncremental(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": True,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
         # Start with some initialization data
@@ -271,11 +360,27 @@ def testUnseenClassesIncremental(self):
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
-            "feature_list": [
-                'data.user_input.mode_confirm',
-                'data.user_input.purpose_confirm'
-            ],
-            "dependent_var": 'data.user_input.replaced_mode'
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
         }
         model = eamts.SupportVectorMachine(model_config)
         # Start with some initialization data
@@ -286,4 +391,70 @@ def testUnseenClassesIncremental(self):
 
         # If an unseen class is introduced, allow sklearn to throw error
         with self.assertRaises(ValueError):
-            model.predict(test_trips)
+            model.predict(test_trips)        
+
+
+    def testPredictions(self):
+        """
+        with a fixed seed, the model should make consistent predictions
+        """
+        random.seed(42)
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive','walk','bike','transit']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": {
+                "data.user_input.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "transit"
+                ],
+                "data.user_input.purpose_confirm": [
+                    "work",
+                    "home",
+                    "school"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "drive",
+                    "walk",
+                    "bike",
+                    "transit"
+                ]
+            }
+        }
+        model = eamts.SupportVectorMachine(model_config)
+        # there is a separate random number generator in SGDClassifier that 
+        # must be fixed to get consistent predictions
+        model.svm.random_state = (3)
+        model.fit(trips)
+        y = model.predict(trips)
+
+        # Test that predicted == expected
+        # note that it seems with a small dataset the svm tends to predict a single category
+        expected_result = [
+            'transit', 'transit', 'bike', 'transit', 'transit', 'bike', 'transit', 'transit',
+            'transit', 'transit', 'bike', 'transit', 'transit', 'transit', 'transit',
+            'transit', 'transit', 'transit', 'bike', 'bike'
+        ]
+        print(y)
+        for i, prediction in enumerate(y):
+            self.assertEqual(prediction, expected_result[i])

From d194d2b94fa6c53d475f1b565190b33a18b8551b Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Wed, 21 Dec 2022 17:45:37 -0800
Subject: [PATCH 10/12] Demographic data format in replacement modeling

---
 .../analysis/modelling/trip_model/util.py     |  34 ++++-
 .../TestGradientBoostedDecisionTree.py        | 142 ++++++++++--------
 .../TestSupportVectorMachine.py               |  27 +++-
 .../modellingTests/modellingTestAssets.py     |  32 ++--
 4 files changed, 152 insertions(+), 83 deletions(-)

diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
index 996dec1bf..97bc4e5fe 100644
--- a/emission/analysis/modelling/trip_model/util.py
+++ b/emission/analysis/modelling/trip_model/util.py
@@ -8,6 +8,19 @@
 import emission.core.wrapper.confirmedtrip as ecwc
 
 
+def get_survey_df(trips_df, survey_features, response_ids):
+    survey_data = []
+    for feature in survey_features:
+        feature_data = []
+        for i, response_id in enumerate(response_ids):
+            feature_string = feature.split(".")
+            feature_string.insert(2, response_id)
+            feature_string = ".".join(feature_string)
+            feature_data.append(trips_df.iloc[i,][feature_string])
+        survey_data.append(feature_data)
+    return pd.DataFrame(numpy.column_stack(survey_data), columns=survey_features)
+
+
 def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
     """extract the features needed to perform replacement mode modeling from a set of
     trips.
@@ -27,7 +40,24 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     """
     # get dataframe from json trips
     trips_df = pd.json_normalize(trips)
-    X = trips_df[feature_list.keys()]
+    # any features that are part of the demographic survey require special attention
+    # the first nested value of the survey data responses changes depending on the user/response
+    feature_names = list(feature_list.keys())
+    survey_features = []
+    nonsurvey_features = []
+    for x in feature_names:
+        if 'jsonDocResponse' in x:
+            survey_features.append(x)
+        else:
+            nonsurvey_features.append(x)
+    # make sure no features are being lost during separation
+    assert(len(survey_features) + len(nonsurvey_features) == len(feature_names))
+    # need unique response id for every trip to identify survey features in the trip dataframe (key below jsonDocResponse)
+    if len(survey_features) > 0:
+        response_ids = [list(trip['data']['jsonDocResponse'].keys())[0] for trip in trips]
+        X = pd.concat([trips_df[nonsurvey_features], get_survey_df(trips_df, survey_features, response_ids)], axis=1)
+    else:
+        X = trips_df[nonsurvey_features]
     # any features that are strings must be encoded as numeric variables
     # we use one-hot encoding for categorical variables
     # https://pbpython.com/pandas_dtypes.html
@@ -37,7 +67,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
         # object column == string or mixed variable
         if X[col].dtype=='object':
             cat_col = pd.Categorical(X[col], categories=feature_list[col])
-            # If new features are present in X_test, throw value error
+            # if new features are present in X_test, throw value error
             if cat_col.isnull().any():
                 raise ValueError(f"Cannot predict on unseen classes in: {col}")
             dummies.append(pd.get_dummies(cat_col, prefix=col))
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
index 9b928fbcc..c85f155bb 100644
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
@@ -34,6 +34,7 @@ def testSmoke(self):
             within_threshold=m, 
             threshold=0.001,  # ~ 111 meters in degrees WGS84
         )
+        print(trips[1])
         # pass in a test configuration
         model_config = {
             "incremental_evaluation": False,
@@ -196,6 +197,11 @@ def testFull(self):
             "purpose_confirm": ['work', 'home', 'school'],
             "replaced_mode": ['drive','walk','bike','transit']
         }
+        survey_data = {
+           "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
+           "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
+           "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more']
+        }
         # generate $n trips.
         n = 20
         m = 5
@@ -205,6 +211,7 @@ def testFull(self):
             origin=(0, 0), 
             destination=(1, 1), 
             label_data=label_data, 
+            survey_data=survey_data,
             within_threshold=m, 
             threshold=0.001,  # ~ 111 meters in degrees WGS84
         )
@@ -223,12 +230,19 @@ def testFull(self):
                     'school'
                 ],
                 "data.distance": None,
-                "data.survey.age": None,
-                "data.survey.hhinc": [
-                    '0-24999',
-                    '25000-49000',
-                    '50000-99999',
-                    '100000+'
+                "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [
+                    '0___25_years_old',
+                    '26___55_years_old',
+                    '56___70_years_old'
+                ],
+                "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [
+                    'not_a_student',
+                    'yes'
+                ],
+                "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [
+                    '0_to__49_999',
+                    '_50_000_to__99_999',
+                    '100_000_or_more'
                 ]
             },
             "dependent_var": {
@@ -251,62 +265,62 @@ def testFull(self):
             self.assertIn(predicted_class, model_config['dependent_var']['classes'])
 
 
-    def testPredictions(self):
-        """
-        with a fixed seed, the model should make consistent predictions
-        """
-        random.seed(42)
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk','bike','transit']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    'walk',
-                    'bike',
-                    'transit'
-                ],
-                "data.user_input.purpose_confirm": [
-                    'work',
-                    'home',
-                    'school'
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamtg.GradientBoostedDecisionTree(model_config)
-        model.fit(trips)
-        y = model.predict(trips)
+    # def testPredictions(self):
+    #     """
+    #     with a fixed seed, the model should make consistent predictions
+    #     """
+    #     random.seed(42)
+    #     label_data = {
+    #         "mode_confirm": ['walk', 'bike', 'transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk','bike','transit']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": False,
+    #         "feature_list": {
+    #             "data.user_input.mode_confirm": [
+    #                 'walk',
+    #                 'bike',
+    #                 'transit'
+    #             ],
+    #             "data.user_input.purpose_confirm": [
+    #                 'work',
+    #                 'home',
+    #                 'school'
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         }
+    #     }
+    #     model = eamtg.GradientBoostedDecisionTree(model_config)
+    #     model.fit(trips)
+    #     y = model.predict(trips)
 
-        # Test that predicted == expected
-        expected_result = [
-            'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit',
-            'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive',
-            'transit', 'transit', 'walk', 'walk'
-        ]
-        for i, prediction in enumerate(y):
-            self.assertEqual(prediction, expected_result[i])
+    #     # Test that predicted == expected
+    #     expected_result = [
+    #         'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit',
+    #         'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive',
+    #         'transit', 'transit', 'walk', 'walk'
+    #     ]
+    #     for i, prediction in enumerate(y):
+    #         self.assertEqual(prediction, expected_result[i])
diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py
index 96df61a64..1d1e9ae41 100644
--- a/emission/tests/modellingTests/TestSupportVectorMachine.py
+++ b/emission/tests/modellingTests/TestSupportVectorMachine.py
@@ -195,6 +195,11 @@ def testFull(self):
             "purpose_confirm": ['work', 'home', 'school'],
             "replaced_mode": ['drive','walk','bike','transit']
         }
+        survey_data = {
+           "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
+           "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
+           "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more']
+        }
         # generate $n trips.
         n = 20
         m = 5
@@ -204,6 +209,7 @@ def testFull(self):
             origin=(0, 0), 
             destination=(1, 1), 
             label_data=label_data, 
+            survey_data=survey_data, 
             within_threshold=m, 
             threshold=0.001,  # ~ 111 meters in degrees WGS84
         )
@@ -221,14 +227,21 @@ def testFull(self):
                     "home",
                     "school"
                 ],
-                "data.survey.age": None,
-                "data.survey.hhinc": [
-                    '0-24999',
-                    '25000-49000',
-                    '50000-99999',
-                    '100000+'
+                "data.distance": None,
+                "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [
+                    '0___25_years_old',
+                    '26___55_years_old',
+                    '56___70_years_old'
                 ],
-                "data.distance": None
+                "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [
+                    'not_a_student',
+                    'yes'
+                ],
+                "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [
+                    '0_to__49_999',
+                    '_50_000_to__99_999',
+                    '100_000_or_more'
+                ]
             },
             "dependent_var": {
                 "name": "data.user_input.replaced_mode",
diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py
index c3cef0f42..4fcfeff24 100644
--- a/emission/tests/modellingTests/modellingTestAssets.py
+++ b/emission/tests/modellingTests/modellingTestAssets.py
@@ -1,4 +1,5 @@
 import random
+import string
 from typing import Optional, Tuple, List, Dict
 from uuid import UUID
 import emission.analysis.modelling.trip_model.trip_model as eamtm
@@ -134,6 +135,7 @@ def generate_mock_trips(
     origin, 
     destination, 
     label_data = None, 
+    survey_data = None,
     within_threshold = None,
     start_ts: None = None,
     end_ts: None = None,
@@ -145,7 +147,7 @@ def generate_mock_trips(
     within a threshold from the provided o/d pair, and some have labels. some other
     ones can be sampled to appear outside of the threshold of the o/d locations.
 
-    label_data is an optional dictionary with labels and sample weights, for example:
+    label_data and survey_data are optional dictionaries with labels and sample weights, for example:
     {
         "mode_confirm": ['walk', 'bike'],
         "replaced_mode": ['drive', 'tnc'],
@@ -154,6 +156,14 @@ def generate_mock_trips(
         "replaced_mode_weights": [0.4, 0.6],
         "purpose_weights": [0.1, 0.9]
     }
+    {
+        "group_hg4zz25.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'],
+        "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
+        "data.jsonDocResponse.group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
+        "group_hg4zz25.Please_identify_which_category_weights": [0.8, 0.1, 0.1],
+        "group_hg4zz25.Are_you_a_student": [0.9, 0.1],
+        "data.jsonDocResponse.group_hg4zz25.How_old_are_you_weights": [0.4, 0.4, 0.2]
+    }
 
     weights entries are optional and result in uniform sampling.
 
@@ -162,6 +172,7 @@ def generate_mock_trips(
     :param origin: origin coordinates
     :param destination: destination coordinates
     :param label_data: dictionary of label data, see above, defaults to None
+    :param survey_data: dictionary of survey data, see above, defaults to None
     :param within_threshold: number of trips that should fall within the provided
            distance threshold in degrees WGS84, defaults to None
     :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01
@@ -189,22 +200,23 @@ def generate_mock_trips(
             purpose_weights=label_data.get('purpose_weights')
         )
         trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts)
-        trip = add_trip_demographics(trip)
+        if survey_data is not None:
+            trip = add_trip_demographics(trip, survey_data)
         result.append(trip)
         
     random.shuffle(result) 
     return result
 
 
-def add_trip_demographics(trip):
-    trip['data']['survey'] = {}
-    survey_features = {
-        'hhinc':['0-24999','25000-49000','50000-99999','100000+'],
-        'age':[x for x in range(0, 70)],
-        'veh':['0','1','2','3','4+']
-    }
+def add_trip_demographics(trip, survey_features):
+    response_id = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=22))
+    trip['data']['jsonDocResponse'] = {response_id: {'group_yk8eb99': {}, 'group_hg4zz25': {}, 'group_pa5ah98': {}}}
     for feature in survey_features:
-        trip['data']['survey'][feature] = random.choice(survey_features[feature])
+        feature_labels = feature.split(".")
+        feature_group = feature_labels[0]
+        feature_name = feature_labels[1]
+        feature_value = random.choice(survey_features[feature])
+        trip['data']['jsonDocResponse'][response_id][feature_group].update({feature_name: feature_value})
     return trip
 
 if __name__ == '__main__':

From 693b8e0978e5b81d013dad8c674dacdcb9567ae6 Mon Sep 17 00:00:00 2001
From: zackAemmer <zae5op@gmail.com>
Date: Thu, 22 Dec 2022 17:32:49 -0800
Subject: [PATCH 11/12] Partial switch to inferred labels for replacement mode,
 demographics

---
 .../analysis/modelling/trip_model/util.py     |  39 +-
 .../TestGradientBoostedDecisionTree.py        | 326 ------------
 .../TestReplacementTripModels.py              | 417 +++++++++++++++
 .../TestSupportVectorMachine.py               | 473 ------------------
 .../modellingTests/modellingTestAssets.py     |  32 +-
 5 files changed, 452 insertions(+), 835 deletions(-)
 delete mode 100644 emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
 create mode 100644 emission/tests/modellingTests/TestReplacementTripModels.py
 delete mode 100644 emission/tests/modellingTests/TestSupportVectorMachine.py

diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py
index 97bc4e5fe..31152d262 100644
--- a/emission/analysis/modelling/trip_model/util.py
+++ b/emission/analysis/modelling/trip_model/util.py
@@ -5,20 +5,30 @@
 from numpy.linalg import norm
 import pandas as pd
 
+import emission.core.get_database as edb
 import emission.core.wrapper.confirmedtrip as ecwc
 
 
-def get_survey_df(trips_df, survey_features, response_ids):
-    survey_data = []
-    for feature in survey_features:
-        feature_data = []
-        for i, response_id in enumerate(response_ids):
-            feature_string = feature.split(".")
-            feature_string.insert(2, response_id)
-            feature_string = ".".join(feature_string)
-            feature_data.append(trips_df.iloc[i,][feature_string])
-        survey_data.append(feature_data)
-    return pd.DataFrame(numpy.column_stack(survey_data), columns=survey_features)
+# TODO: Currently fails to match the uuid_list to what is in the database. Needs to be split into 1 function which reads demographic data, 1 function which formats features
+def get_survey_df(uuid_list, survey_features):
+    # we use the "survey." identifier to separate out the features which require survey attention in the config, but do not need it in actual key
+    survey_features_response = [".".join(x.split(".")[1:]) for x in survey_features]
+    # retrieve survey response records for any user that has supplied a trip which is being trained on
+    all_survey_results = list(edb.get_timeseries_db().find({"user_id": {"$in": uuid_list}, "metadata.key": "manual/demographic_survey"}))
+    # these are the uuids that were able to be retrieved from the database; if they don't match the requested ones throw an error since we cannot train/test
+    survey_result_uuids = [s["user_id"] for s in all_survey_results]
+    # the challenge is that one of the first dictionary keys changes across users, so we cannot apply a single json_normalize and take feature values
+    # each unique key will end up as its own column, even if it is really the same feature as others, and will be NaN for all other users
+    # this is the id that changes across users, we keep a list here to index later when summarizing all of the survey response results to a single df
+    survey_response_ids = [list(s['data']['jsonDocResponse'].keys())[0] for s in all_survey_results]
+    result = []
+    for i, s in enumerate(all_survey_results):
+        response = pd.json_normalize(s['data']['jsonDocResponse'][survey_response_ids[i]])
+        response = response[survey_features_response]
+        response.columns = survey_features
+        response['user_id'] = survey_result_uuids[i]
+        result.append(response)
+    return pd.concat(result, axis=0).reset_index(inplace=True, drop=True)
 
 
 def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confirmedtrip, is_prediction=False) -> List[float]:
@@ -46,7 +56,7 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     survey_features = []
     nonsurvey_features = []
     for x in feature_names:
-        if 'jsonDocResponse' in x:
+        if 'survey' in x:
             survey_features.append(x)
         else:
             nonsurvey_features.append(x)
@@ -54,8 +64,9 @@ def get_replacement_mode_features(feature_list, dependent_var, trips: ecwc.Confi
     assert(len(survey_features) + len(nonsurvey_features) == len(feature_names))
     # need unique response id for every trip to identify survey features in the trip dataframe (key below jsonDocResponse)
     if len(survey_features) > 0:
-        response_ids = [list(trip['data']['jsonDocResponse'].keys())[0] for trip in trips]
-        X = pd.concat([trips_df[nonsurvey_features], get_survey_df(trips_df, survey_features, response_ids)], axis=1)
+        uuid_list = [list(trip['user_id'] for trip in trips)]
+        survey_df = get_survey_df(uuid_list, survey_features)
+        X = pd.concat([trips_df[nonsurvey_features], survey_df], axis=1)
     else:
         X = trips_df[nonsurvey_features]
     # any features that are strings must be encoded as numeric variables
diff --git a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py b/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
deleted file mode 100644
index c85f155bb..000000000
--- a/emission/tests/modellingTests/TestGradientBoostedDecisionTree.py
+++ /dev/null
@@ -1,326 +0,0 @@
-import unittest
-import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg
-import emission.tests.modellingTests.modellingTestAssets as etmm
-import logging
-import pandas as pd
-import random
-
-
-class TestGradientBoostedDecisionTree(unittest.TestCase):
-
-    def setUp(self) -> None:
-        logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
-        level=logging.DEBUG)
-
-
-    def testSmoke(self):
-        """
-        the model should fit and predict on normal data without errors
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        print(trips[1])
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamtg.GradientBoostedDecisionTree(model_config)
-        model.fit(trips)
-        model.predict(trips)
-
-
-    def testUnseenFeatures(self):
-        """
-        if the input classes for a feature change throw sklearn error
-        the test mode_confirm includes 'drive' which is not in the training set nor config
-        """
-        train_label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        test_label_data = {
-            "mode_confirm": ['drive'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        train_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=train_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        test_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=test_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    'walk',
-                    'bike',
-                    'transit'
-                ],
-                "data.user_input.purpose_confirm": [
-                    'work',
-                    'home',
-                    'school'
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamtg.GradientBoostedDecisionTree(model_config)
-        model.fit(train_trips)
-
-        with self.assertRaises(ValueError):
-            y = model.predict(test_trips)
-
-
-    def testNumeric(self):
-        """
-        the model should handle numeric and categorical variable types
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    'walk',
-                    'bike',
-                    'transit'
-                ],
-                "data.user_input.purpose_confirm": [
-                    'work',
-                    'home',
-                    'school'
-                ],
-                "data.distance": None
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamtg.GradientBoostedDecisionTree(model_config)
-        X_train, y_train = model.extract_features(trips)
-        # 3 features for mode confirm, 3 for trip purpose, 1 for distance
-        self.assertEqual(len(X_train.columns), 7)
-        # all feature columns should be strictly numeric
-        self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all())
-
-
-    def testFull(self):
-        """
-        the model should handle survey, trip, and user input features
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk','bike','transit']
-        }
-        survey_data = {
-           "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
-           "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
-           "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            survey_data=survey_data,
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    'walk',
-                    'bike',
-                    'transit'
-                ],
-                "data.user_input.purpose_confirm": [
-                    'work',
-                    'home',
-                    'school'
-                ],
-                "data.distance": None,
-                "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [
-                    '0___25_years_old',
-                    '26___55_years_old',
-                    '56___70_years_old'
-                ],
-                "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [
-                    'not_a_student',
-                    'yes'
-                ],
-                "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [
-                    '0_to__49_999',
-                    '_50_000_to__99_999',
-                    '100_000_or_more'
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamtg.GradientBoostedDecisionTree(model_config)
-        model.fit(trips)
-        y = model.predict(trips)
-
-        # No class in predictions that's not in training data
-        for predicted_class in pd.unique(y):
-            
-            self.assertIn(predicted_class, model_config['dependent_var']['classes'])
-
-
-    # def testPredictions(self):
-    #     """
-    #     with a fixed seed, the model should make consistent predictions
-    #     """
-    #     random.seed(42)
-    #     label_data = {
-    #         "mode_confirm": ['walk', 'bike', 'transit'],
-    #         "purpose_confirm": ['work', 'home', 'school'],
-    #         "replaced_mode": ['drive','walk','bike','transit']
-    #     }
-    #     # generate $n trips.
-    #     n = 20
-    #     m = 5
-    #     trips = etmm.generate_mock_trips(
-    #         user_id="joe", 
-    #         trips=n, 
-    #         origin=(0, 0), 
-    #         destination=(1, 1), 
-    #         label_data=label_data, 
-    #         within_threshold=m, 
-    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
-    #     )
-    #     # pass in a test configuration
-    #     model_config = {
-    #         "incremental_evaluation": False,
-    #         "feature_list": {
-    #             "data.user_input.mode_confirm": [
-    #                 'walk',
-    #                 'bike',
-    #                 'transit'
-    #             ],
-    #             "data.user_input.purpose_confirm": [
-    #                 'work',
-    #                 'home',
-    #                 'school'
-    #             ]
-    #         },
-    #         "dependent_var": {
-    #             "name": "data.user_input.replaced_mode",
-    #             "classes": [
-    #                 "drive",
-    #                 "walk",
-    #                 "bike",
-    #                 "transit"
-    #             ]
-    #         }
-    #     }
-    #     model = eamtg.GradientBoostedDecisionTree(model_config)
-    #     model.fit(trips)
-    #     y = model.predict(trips)
-
-    #     # Test that predicted == expected
-    #     expected_result = [
-    #         'transit', 'transit', 'walk', 'transit', 'drive', 'walk', 'bike', 'transit',
-    #         'transit', 'transit', 'walk', 'drive', 'drive', 'drive', 'drive', 'drive',
-    #         'transit', 'transit', 'walk', 'walk'
-    #     ]
-    #     for i, prediction in enumerate(y):
-    #         self.assertEqual(prediction, expected_result[i])
diff --git a/emission/tests/modellingTests/TestReplacementTripModels.py b/emission/tests/modellingTests/TestReplacementTripModels.py
new file mode 100644
index 000000000..e4fe4b2a7
--- /dev/null
+++ b/emission/tests/modellingTests/TestReplacementTripModels.py
@@ -0,0 +1,417 @@
+import unittest
+import emission.core.get_database as edb
+import emission.analysis.modelling.trip_model.gradient_boosted_decision_tree as eamtg
+import emission.analysis.modelling.trip_model.support_vector_machine as eamts
+import emission.tests.modellingTests.modellingTestAssets as etmm
+import logging
+import pandas as pd
+import random
+
+
+class TestReplacementTripModels(unittest.TestCase):
+
+    def setUp(self) -> None:
+        logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
+        level=logging.DEBUG)
+
+
+    def testSmoke(self):
+        """
+        the model should fit and predict on normal data without errors
+        """
+        # though we cannot use mode_confirm or purpose_confirm to predict, they are required for mock trip generation
+        # for now, just pass it to user-label and sensed-label data
+        label_data = {
+            "mode_confirm": ['drive'],
+            "replaced_mode": ['drive','walk'],
+            "purpose_confirm": ['walk']
+        }
+        # generate $n trips.
+        n = 20
+        m = 5
+        trips = etmm.generate_mock_trips(
+            user_id="joe", 
+            trips=n, 
+            origin=(0, 0), 
+            destination=(1, 1), 
+            label_data=label_data, 
+            sensed_label_data=label_data, 
+            within_threshold=m, 
+            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        )
+        # pass in a test configuration
+        model_config = {
+            "incremental_evaluation": False,
+            "feature_list": {
+                "data.inferred_labels.mode_confirm": [
+                    "walk",
+                    "bike",
+                    "drive"
+                ]
+            },
+            "dependent_var": {
+                "name": "data.user_input.replaced_mode",
+                "classes": [
+                    "walk",
+                    "bike",
+                    "drive",
+                ]
+            }
+        }
+        model = eamtg.GradientBoostedDecisionTree(model_config)
+        model.fit(trips)
+        model.predict(trips)
+        model = eamts.SupportVectorMachine(model_config)
+        model.fit(trips)
+        model.predict(trips)
+
+
+#TODO: These tests were written using a prior version of the model which did not include the sensed data.
+# So sensed features were actually being read from the 'user_input' key rather than 'inferred_labels'
+# The above test shows how these can be set up to read from inferred labels, however they are dependent on the 
+# 'add_sensed_labels()' function in modellingTestAssets.py. Once that function correctly samples the labels
+# (instead of just using drive for every label) these tests should work again. The exception is 'testFull' which
+# will rely on a new function which adds mock-demographic-data to the database. See todo in get_survey_df() under util.py
+# for more info on that.
+
+    # def testUnseenFeatures(self):
+    #     """
+    #     if the input classes for a feature change throw sklearn error
+    #     """
+    #     train_label_data = {
+    #         "mode_confirm": ['drive'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk']
+    #     }
+    #     test_label_data = {
+    #         "mode_confirm": ['walk','transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     train_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=train_label_data, 
+    #         sensed_label_data=train_label_data,
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     test_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=test_label_data, 
+    #         sensed_label_data=test_label_data,
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": False,
+    #         "feature_list": {
+    #             "data.inferred_labels.mode_confirm": [
+    #                 "drive"
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk"
+    #             ]
+    #         }
+    #     }
+    #     model = eamtg.GradientBoostedDecisionTree(model_config)
+    #     model.fit(train_trips)
+    #     with self.assertRaises(ValueError):
+    #         model.predict(test_trips)
+    #     model = eamts.SupportVectorMachine(model_config)
+    #     model.fit(train_trips)
+    #     with self.assertRaises(ValueError):
+    #         model.predict(test_trips)
+
+
+    # def testFull(self):
+    #     """
+    #     the model should handle survey, trip, and user input features
+    #     """
+    #     label_data = {
+    #         "mode_confirm": ['walk', 'bike', 'transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk','bike','transit']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     # for the sake of testing, need a UUID that correlates to a survey response in the database; use the first one
+    #     all_survey_results = list(edb.get_timeseries_db().find({"metadata.key": "manual/demographic_survey"}))
+    #     sample_uuid = all_survey_results[0]['user_id']
+    #     trips = etmm.generate_mock_trips(
+    #         user_id=sample_uuid, 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     print(trips[0])
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": False,
+    #         "feature_list": {
+    #             "data.inferred_labels.mode_confirm": [
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ],
+    #             "data.distance": None,
+    #             "survey.group_hg4zz25.How_old_are_you": [
+    #                 '0___25_years_old',
+    #                 '26___55_years_old',
+    #                 '56___70_years_old'
+    #             ],
+    #             "survey.group_hg4zz25.Are_you_a_student": [
+    #                 'not_a_student',
+    #                 'yes'
+    #             ],
+    #             "survey.group_pa5ah98.Please_identify_which_category": [
+    #                 '0_to__49_999',
+    #                 '_50_000_to__99_999',
+    #                 '100_000_or_more'
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         }
+    #     }
+    #     model = eamtg.GradientBoostedDecisionTree(model_config)
+    #     model.fit(trips)
+    #     y = model.predict(trips)
+    #     # No class in predictions that's not in training data
+    #     for predicted_class in pd.unique(y):
+    #         self.assertIn(predicted_class, model_config['dependent_var']['classes'])
+
+    #     model = eamts.SupportVectorMachine(model_config)
+    #     model.fit(trips)
+    #     y = model.predict(trips)
+    #     # No class in predictions that's not in training data
+    #     for predicted_class in pd.unique(y):
+    #         self.assertIn(predicted_class, model_config['dependent_var']['classes'])
+
+
+    # def testIncremental(self):
+    #     """
+    #     the model should fit and predict incrementally on normal data without errors
+    #     """
+    #     label_data = {
+    #         "mode_confirm": ['walk', 'bike', 'transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     initial_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     additional_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n*5, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": True,
+    #         "feature_list": {
+    #             "data.inferred_labels.mode_confirm": [
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         }
+    #     }
+    #     model = eamts.SupportVectorMachine(model_config)
+    #     # Start with some initialization data
+    #     model.fit(initial_trips)
+    #     # Train on additional sets of data and predict for initial data
+    #     for i in range(0, 5):
+    #         model.fit(additional_trips[i:(i+1)*n])
+    #         model.predict(initial_trips)
+
+
+    # def testUnseenClassesIncremental(self):
+    #     """
+    #     if the input classes for a feature change throw sklearn error
+    #     """
+    #     train_label_data = {
+    #         "mode_confirm": ['walk', 'bike', 'transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk']
+    #     }
+    #     test_label_data = {
+    #         "mode_confirm": ['drive'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     initial_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=train_label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     additional_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n*5, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=train_label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     test_trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=test_label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": False,
+    #         "feature_list": {
+    #             "data.inferred_labels.mode_confirm": [
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         }
+    #     }
+    #     model = eamts.SupportVectorMachine(model_config)
+    #     # Start with some initialization data
+    #     model.fit(initial_trips)
+    #     # Train on additional sets of data
+    #     for i in range(0, 5):
+    #         model.fit(additional_trips[i:(i+1)*n])
+    #     # If an unseen class is introduced, allow sklearn to throw error
+    #     with self.assertRaises(ValueError):
+    #         model.predict(test_trips)        
+
+
+    # def testPredictions(self):
+    #     """
+    #     with a fixed seed, the model should make consistent predictions
+    #     """
+    #     random.seed(42)
+    #     label_data = {
+    #         "mode_confirm": ['walk', 'bike', 'transit'],
+    #         "purpose_confirm": ['work', 'home', 'school'],
+    #         "replaced_mode": ['drive','walk','bike','transit']
+    #     }
+    #     # generate $n trips.
+    #     n = 20
+    #     m = 5
+    #     trips = etmm.generate_mock_trips(
+    #         user_id="joe", 
+    #         trips=n, 
+    #         origin=(0, 0), 
+    #         destination=(1, 1), 
+    #         label_data=label_data, 
+    #         within_threshold=m, 
+    #         threshold=0.001,  # ~ 111 meters in degrees WGS84
+    #     )
+    #     # pass in a test configuration
+    #     model_config = {
+    #         "incremental_evaluation": False,
+    #         "feature_list": {
+    #             "data.inferred_labels.mode_confirm": [
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         },
+    #         "dependent_var": {
+    #             "name": "data.user_input.replaced_mode",
+    #             "classes": [
+    #                 "drive",
+    #                 "walk",
+    #                 "bike",
+    #                 "transit"
+    #             ]
+    #         }
+    #     }
+    #     model = eamtg.GradientBoostedDecisionTree(model_config)
+    #     model.fit(trips)
+    #     y = model.predict(trips)
+    #     # Test that predicted == expected
+    #     expected_result = [
+    #         'transit', 'bike', 'transit', 'bike', 'transit', 'bike', 'drive', 'transit',
+    #         'transit', 'drive', 'transit', 'transit', 'bike', 'bike', 'bike', 'transit',
+    #         'transit', 'transit', 'bike', 'drive'
+    #     ]
+    #     for i, prediction in enumerate(y):
+    #         self.assertEqual(prediction, expected_result[i])
+
+    #     model = eamts.SupportVectorMachine(model_config)
+    #     # there is a separate random number generator in SGDClassifier that 
+    #     # must be fixed to get consistent predictions
+    #     model.svm.random_state = (3)
+    #     model.fit(trips)
+    #     y = model.predict(trips)
+    #     # Test that predicted == expected
+    #     # note that it seems with a small dataset the svm tends to predict a single category
+    #     expected_result = [
+    #         'drive', 'drive', 'transit', 'drive', 'drive', 'drive', 'drive', 'transit',
+    #         'transit', 'drive', 'drive', 'transit', 'drive', 'drive', 'drive', 'transit',
+    #         'drive', 'transit', 'drive', 'drive'
+    #     ]
+    #     for i, prediction in enumerate(y):
+    #         self.assertEqual(prediction, expected_result[i])
diff --git a/emission/tests/modellingTests/TestSupportVectorMachine.py b/emission/tests/modellingTests/TestSupportVectorMachine.py
deleted file mode 100644
index 1d1e9ae41..000000000
--- a/emission/tests/modellingTests/TestSupportVectorMachine.py
+++ /dev/null
@@ -1,473 +0,0 @@
-import unittest
-import emission.analysis.modelling.trip_model.support_vector_machine as eamts
-import emission.tests.modellingTests.modellingTestAssets as etmm
-import logging
-import pandas as pd
-import random
-
-
-class TestSupportVectorMachine(unittest.TestCase):
-
-    def setUp(self) -> None:
-        logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
-        level=logging.DEBUG)
-
-
-    def testSmoke(self):
-        """
-        the model should fit and predict on normal data without errors
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        model.fit(trips)
-        model.predict(trips)
-
-
-    def testUnseenFeatures(self):
-        """
-        if the input classes for a feature change throw sklearn error
-        """
-        train_label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        test_label_data = {
-            "mode_confirm": ['drive'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        train_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=train_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        test_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=test_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        model.fit(train_trips)
-
-        with self.assertRaises(ValueError):
-            model.predict(test_trips)
-
-
-    def testNumeric(self):
-        """
-        the model should handle numeric and categorical variable types
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ],
-                "data.distance": None
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        X_train, y_train = model.extract_features(trips)
-        # 3 features for mode confirm, 3 for trip purpose, 1 for distance
-        self.assertEqual(len(X_train.columns), 7)
-        # all feature columns should be strictly numeric
-        self.assertTrue(X_train.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all())
-
-
-    def testFull(self):
-        """
-        the model should handle survey, trip, and user input features
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk','bike','transit']
-        }
-        survey_data = {
-           "group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
-           "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
-           "group_pa5ah98.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            survey_data=survey_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ],
-                "data.distance": None,
-                "data.jsonDocResponse.group_hg4zz25.How_old_are_you": [
-                    '0___25_years_old',
-                    '26___55_years_old',
-                    '56___70_years_old'
-                ],
-                "data.jsonDocResponse.group_hg4zz25.Are_you_a_student": [
-                    'not_a_student',
-                    'yes'
-                ],
-                "data.jsonDocResponse.group_pa5ah98.Please_identify_which_category": [
-                    '0_to__49_999',
-                    '_50_000_to__99_999',
-                    '100_000_or_more'
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        model.fit(trips)
-        y = model.predict(trips)
-
-        # No class in predictions that's not in training data
-        for predicted_class in pd.unique(y):
-            self.assertIn(predicted_class, model_config['dependent_var']['classes'])
-
-
-    def testIncremental(self):
-        """
-        the model should fit and predict incrementally on normal data without errors
-        """
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        initial_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        additional_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n*5, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": True,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        # Start with some initialization data
-        model.fit(initial_trips)
-        # Train on additional sets of data and predict for initial data
-        for i in range(0, 5):
-            model.fit(additional_trips[i:(i+1)*n])
-            model.predict(initial_trips)
-
-
-    def testUnseenClassesIncremental(self):
-        """
-        if the input classes for a feature change throw sklearn error
-        """
-        train_label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        test_label_data = {
-            "mode_confirm": ['drive'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        initial_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=train_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        additional_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n*5, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=train_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        test_trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=test_label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        # Start with some initialization data
-        model.fit(initial_trips)
-        # Train on additional sets of data
-        for i in range(0, 5):
-            model.fit(additional_trips[i:(i+1)*n])
-
-        # If an unseen class is introduced, allow sklearn to throw error
-        with self.assertRaises(ValueError):
-            model.predict(test_trips)        
-
-
-    def testPredictions(self):
-        """
-        with a fixed seed, the model should make consistent predictions
-        """
-        random.seed(42)
-        label_data = {
-            "mode_confirm": ['walk', 'bike', 'transit'],
-            "purpose_confirm": ['work', 'home', 'school'],
-            "replaced_mode": ['drive','walk','bike','transit']
-        }
-        # generate $n trips.
-        n = 20
-        m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-        # pass in a test configuration
-        model_config = {
-            "incremental_evaluation": False,
-            "feature_list": {
-                "data.user_input.mode_confirm": [
-                    "walk",
-                    "bike",
-                    "transit"
-                ],
-                "data.user_input.purpose_confirm": [
-                    "work",
-                    "home",
-                    "school"
-                ]
-            },
-            "dependent_var": {
-                "name": "data.user_input.replaced_mode",
-                "classes": [
-                    "drive",
-                    "walk",
-                    "bike",
-                    "transit"
-                ]
-            }
-        }
-        model = eamts.SupportVectorMachine(model_config)
-        # there is a separate random number generator in SGDClassifier that 
-        # must be fixed to get consistent predictions
-        model.svm.random_state = (3)
-        model.fit(trips)
-        y = model.predict(trips)
-
-        # Test that predicted == expected
-        # note that it seems with a small dataset the svm tends to predict a single category
-        expected_result = [
-            'transit', 'transit', 'bike', 'transit', 'transit', 'bike', 'transit', 'transit',
-            'transit', 'transit', 'bike', 'transit', 'transit', 'transit', 'transit',
-            'transit', 'transit', 'transit', 'bike', 'bike'
-        ]
-        print(y)
-        for i, prediction in enumerate(y):
-            self.assertEqual(prediction, expected_result[i])
diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py
index 4fcfeff24..674eb6f70 100644
--- a/emission/tests/modellingTests/modellingTestAssets.py
+++ b/emission/tests/modellingTests/modellingTestAssets.py
@@ -135,7 +135,7 @@ def generate_mock_trips(
     origin, 
     destination, 
     label_data = None, 
-    survey_data = None,
+    sensed_label_data = None,
     within_threshold = None,
     start_ts: None = None,
     end_ts: None = None,
@@ -147,7 +147,7 @@ def generate_mock_trips(
     within a threshold from the provided o/d pair, and some have labels. some other
     ones can be sampled to appear outside of the threshold of the o/d locations.
 
-    label_data and survey_data are optional dictionaries with labels and sample weights, for example:
+    label_data/sensed_label-data is optional dictionary with labels and sample weights, for example:
     {
         "mode_confirm": ['walk', 'bike'],
         "replaced_mode": ['drive', 'tnc'],
@@ -156,14 +156,6 @@ def generate_mock_trips(
         "replaced_mode_weights": [0.4, 0.6],
         "purpose_weights": [0.1, 0.9]
     }
-    {
-        "group_hg4zz25.Please_identify_which_category": ['0_to__49_999', '_50_000_to__99_999', '100_000_or_more'],
-        "group_hg4zz25.Are_you_a_student": ['not_a_student', 'yes'],
-        "data.jsonDocResponse.group_hg4zz25.How_old_are_you": ['0___25_years_old', '26___55_years_old', '56___70_years_old'],
-        "group_hg4zz25.Please_identify_which_category_weights": [0.8, 0.1, 0.1],
-        "group_hg4zz25.Are_you_a_student": [0.9, 0.1],
-        "data.jsonDocResponse.group_hg4zz25.How_old_are_you_weights": [0.4, 0.4, 0.2]
-    }
 
     weights entries are optional and result in uniform sampling.
 
@@ -172,7 +164,7 @@ def generate_mock_trips(
     :param origin: origin coordinates
     :param destination: destination coordinates
     :param label_data: dictionary of label data, see above, defaults to None
-    :param survey_data: dictionary of survey data, see above, defaults to None
+    :param sensed_label_data: dictionary of sensed data, see above, defaults to None
     :param within_threshold: number of trips that should fall within the provided
            distance threshold in degrees WGS84, defaults to None
     :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01
@@ -200,25 +192,21 @@ def generate_mock_trips(
             purpose_weights=label_data.get('purpose_weights')
         )
         trip = build_mock_trip(user_id, o, d, labels, start_ts, end_ts)
-        if survey_data is not None:
-            trip = add_trip_demographics(trip, survey_data)
+        if sensed_label_data is not None:
+            trip = add_sensed_labels(trip, sensed_label_data)
         result.append(trip)
         
     random.shuffle(result) 
     return result
 
 
-def add_trip_demographics(trip, survey_features):
-    response_id = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=22))
-    trip['data']['jsonDocResponse'] = {response_id: {'group_yk8eb99': {}, 'group_hg4zz25': {}, 'group_pa5ah98': {}}}
-    for feature in survey_features:
-        feature_labels = feature.split(".")
-        feature_group = feature_labels[0]
-        feature_name = feature_labels[1]
-        feature_value = random.choice(survey_features[feature])
-        trip['data']['jsonDocResponse'][response_id][feature_group].update({feature_name: feature_value})
+def add_sensed_labels(trip, sensed_label_data):
+    for label in sensed_label_data:
+        # TODO: currently just makes one inferred label 'drive'; should be random option from sensed_label_data
+        trip['data']['inferred_labels'] = {'mode_confirm': sensed_label_data[label][0]}
     return trip
 
+
 if __name__ == '__main__':
     label_data = {
         "mode_confirm": ['walk', 'bike', 'drive'],

From 94673568d55edcc668381ebf17c791e866c98167 Mon Sep 17 00:00:00 2001
From: aGuttman <slaker117@gmail.com>
Date: Fri, 23 Dec 2022 13:39:21 -0700
Subject: [PATCH 12/12] Integrating Replace Mode Model

Building out pipeline infrastructure to run replace mode
---
 .../inference/labels/inferrers.py             |  16 +++
 .../inference/labels/pipeline_replace_mode.py | 102 ++++++++++++++++++
 .../modelling/trip_model/model_storage.py     |  51 +++++++++
 .../modelling/trip_model/run_model.py         |  40 +++++++
 emission/core/wrapper/labelprediction.py      |   1 +
 .../analysis_timeseries_queries.py            |   1 +
 6 files changed, 211 insertions(+)
 create mode 100644 emission/analysis/classification/inference/labels/pipeline_replace_mode.py

diff --git a/emission/analysis/classification/inference/labels/inferrers.py b/emission/analysis/classification/inference/labels/inferrers.py
index c6b939671..16d5cf3c3 100644
--- a/emission/analysis/classification/inference/labels/inferrers.py
+++ b/emission/analysis/classification/inference/labels/inferrers.py
@@ -156,3 +156,19 @@ def predict_cluster_confidence_discounting(trip, max_confidence=None, first_conf
     labels = copy.deepcopy(labels)
     for l in labels: l["p"] *= confidence_coeff
     return labels
+
+def predict_gradient_boosted_decision_tree(trip, max_confidence=None, first_confidence=None, confidence_multiplier=None):
+    # load application config 
+    model_type = eamtc.get_model_type()
+    model_storage = eamtc.get_model_storage()
+    labels, n = eamur.predict_labels_with_gbdt(trip, model_type, model_storage)
+    if n <= 0:  # No model data or trip didn't match a cluster
+        logging.debug(f"In predict_gradient_boosted_decision_tree: n={n}; returning as-is")
+        return labels
+
+    # confidence_coeff = n_to_confidence_coeff(n, max_confidence, first_confidence, confidence_multiplier)
+    # logging.debug(f"In predict_cluster_confidence_discounting: n={n}; discounting with coefficient {confidence_coeff}")
+
+    labels = copy.deepcopy(labels)
+    for l in labels: l["p"] *= confidence_coeff
+    return labels
\ No newline at end of file
diff --git a/emission/analysis/classification/inference/labels/pipeline_replace_mode.py b/emission/analysis/classification/inference/labels/pipeline_replace_mode.py
new file mode 100644
index 000000000..3c8656f00
--- /dev/null
+++ b/emission/analysis/classification/inference/labels/pipeline_replace_mode.py
@@ -0,0 +1,102 @@
+# Standard imports
+import logging
+import random
+import copy
+
+# Our imports
+import emission.storage.pipeline_queries as epq
+import emission.storage.timeseries.abstract_timeseries as esta
+import emission.storage.decorations.analysis_timeseries_queries as esda
+import emission.core.wrapper.labelprediction as ecwl
+import emission.core.wrapper.entry as ecwe
+import emission.analysis.classification.inference.labels.inferrers as eacili
+import emission.analysis.classification.inference.labels.ensembles as eacile
+
+
+# For each algorithm in ecwl.AlgorithmTypes that runs on a trip (e.g., not the ensemble, which
+# runs on the results of other algorithms), primary_algorithms specifies a corresponding
+# function in eacili to run. This makes it easy to plug in additional algorithms later.
+primary_algorithms = {
+    ecwl.AlgorithmTypes.GRADIENT_BOOSTED_DECISION_TREE: eacili.predict_gradient_boosted_decision_tree
+}
+
+# ensemble specifies which algorithm in eacile to run.
+# This makes it easy to test various ways of combining various algorithms.
+ensemble = eacile.ensemble_first_prediction
+
+
+# Does all the work necessary for a given user
+def infer_labels(user_id):
+    time_query = epq.get_time_range_for_label_inference(user_id)
+    try:
+        lip = LabelInferencePipeline()
+        lip.user_id = user_id
+        lip.run_prediction_pipeline(user_id, time_query)
+        if lip.last_trip_done is None:
+            logging.debug("After run, last_trip_done == None, must be early return")
+        epq.mark_label_inference_done(user_id, lip.last_trip_done)
+    except:
+        logging.exception("Error while inferring labels, timestamp is unchanged")
+        epq.mark_label_inference_failed(user_id)
+
+# Code structure based on emission.analysis.classification.inference.mode.pipeline
+# and emission.analysis.classification.inference.mode.rule_engine
+class LabelInferencePipeline:
+    def __init__(self):
+        self._last_trip_done = None
+    
+    @property
+    def last_trip_done(self):
+        return self._last_trip_done
+
+    # For a given user and time range, runs all the primary algorithms and ensemble, saves results
+    # to the database, and records progress
+    def run_prediction_pipeline(self, user_id, time_range):
+        self.ts = esta.TimeSeries.get_time_series(user_id)
+        self.toPredictTrips = esda.get_entries(
+            esda.CLEANED_TRIP_KEY, user_id, time_query=time_range)
+        for cleaned_trip in self.toPredictTrips:
+            # Create an inferred trip
+            cleaned_trip_dict = copy.copy(cleaned_trip)["data"]
+            inferred_trip = ecwe.Entry.create_entry(user_id, "analysis/inferred_trip", cleaned_trip_dict)
+            
+            # Run the algorithms and the ensemble, store results
+            results = self.compute_and_save_algorithms(inferred_trip)
+            ensemble = self.compute_and_save_ensemble(inferred_trip, results)
+
+            # Put final results into the inferred trip and store it
+            inferred_trip["data"]["cleaned_trip"] = cleaned_trip.get_id()
+            inferred_trip["data"]["inferred_labels"] = ensemble["prediction"]
+            self.ts.insert(inferred_trip)
+
+            if self._last_trip_done is None or self._last_trip_done["data"]["end_ts"] < cleaned_trip["data"]["end_ts"]:
+                self._last_trip_done = cleaned_trip
+    
+    # This is where the labels for a given trip are actually predicted.
+    # Though the only information passed in is the trip object, the trip object can provide the
+    # user_id and other potentially useful information.
+    def compute_and_save_algorithms(self, trip):
+        predictions = []
+        for algorithm_id, algorithm_fn in primary_algorithms.items():
+            prediction = algorithm_fn(trip)
+            lp = ecwl.Labelprediction()
+            lp.trip_id = trip.get_id()
+            lp.algorithm_id = algorithm_id
+            lp.prediction = prediction
+            lp.start_ts = trip["data"]["start_ts"]
+            lp.end_ts = trip["data"]["end_ts"]
+            self.ts.insert_data(self.user_id, "inference/labels", lp)
+            predictions.append(lp)
+        return predictions
+
+    # Combine all our predictions into a single ensemble prediction.
+    # As a placeholder, we just take the first prediction.
+    # TODO: implement a real combination algorithm.
+    def compute_and_save_ensemble(self, trip, predictions):
+        il = ecwl.Labelprediction()
+        il.trip_id = trip.get_id()
+        il.start_ts = trip["data"]["start_ts"]
+        il.end_ts = trip["data"]["end_ts"]
+        (il.algorithm_id, il.prediction) = ensemble(trip, predictions)
+        self.ts.insert_data(self.user_id, "analysis/inferred_labels", il)
+        return il
\ No newline at end of file
diff --git a/emission/analysis/modelling/trip_model/model_storage.py b/emission/analysis/modelling/trip_model/model_storage.py
index 7e94d1217..6cce5db63 100644
--- a/emission/analysis/modelling/trip_model/model_storage.py
+++ b/emission/analysis/modelling/trip_model/model_storage.py
@@ -35,6 +35,57 @@ def from_str(cls, str):
             msg = f"{str} is not a known ModelStorage, must be one of {names}"
             raise KeyError(msg)
 
+def load_model_all_users(model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]:
+    """load a user label model from a model storage location
+
+    :param user_id: the user to request a model for
+    :param model_type: expected type of model stored
+    :param model_storage: storage format 
+    :return: the model representation as a Python Dict or None
+    :raises: TypeError if loaded model has different type than expected type
+             KeyError if the ModelType is not known
+    """
+    if model_storage == ModelStorage.DOCUMENT_DATABASE:
+        
+        # retrieve stored model with timestamp that matches/exceeds the most
+        # recent PipelineState.TRIP_MODEL entry        
+        ms = esma.ModelStorage.get_model_storage(0)
+        latest_model_entry = ms.get_current_model(key=esda.REPLACE_MODEL_STORE_KEY)
+
+        if latest_model_entry is None:
+            logging.debug(f'no {model_type.name} model found')
+            return None
+
+        write_ts = latest_model_entry['metadata']['write_ts']
+        logging.debug(f'retrieved latest trip model recorded at timestamp {write_ts}')
+        logging.debug(latest_model_entry)
+
+        # parse str to enum for ModelType
+        latest_model_type_str = latest_model_entry.get('data', {}).get('model_type')
+        if latest_model_type_str is None:
+            raise TypeError('stored model does not have a model type')
+        latest_model_type = eamum.ModelType.from_str(latest_model_type_str)
+        
+        # validate and return
+        if latest_model_entry is None:
+            return None
+        elif latest_model_type != model_type:
+            msg = (
+                f"loading model has model type '{latest_model_type.name}' " 
+                f"but was expected to have model type {model_type.name}"
+            )
+            raise TypeError(msg)
+        else:
+            return latest_model_entry['data']['model']
+
+    else:
+        storage_types_str = ",".join(ModelStorage.names())
+        msg = (
+            f"unknown model storage type {model_storage}, must be one of "
+            f"{{{storage_types_str}}}"
+        )
+        raise TypeError(msg)
+
 def load_model(user_id, model_type: eamum.ModelType, model_storage: ModelStorage) -> Optional[Dict]:
     """load a user label model from a model storage location
 
diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py
index e3e2b1c4e..094eacef1 100644
--- a/emission/analysis/modelling/trip_model/run_model.py
+++ b/emission/analysis/modelling/trip_model/run_model.py
@@ -118,6 +118,27 @@ def predict_labels_with_n(
         predictions, n = model.predict(trip)
         return predictions, n
 
+def predict_labels_with_gbdt(
+    trip: ecwc.Confirmedtrip,
+    model_type = eamumt.ModelType.GRADIENT_BOOSTED_DECISION_TREE,
+    model_storage = eamums.ModelStorage.DOCUMENT_DATABASE,
+    model_config = None):
+    """
+    invoke the user label prediction model to predict labels for a trip.
+
+    :param trip: the trip to predict labels for
+    :param model_type: type of prediction model to run
+    :param model_storage: location to read/write models
+    :param model_config: optional configuration for model, for debugging purposes
+    :return: a list of predictions
+    """
+    user_id = trip['user_id']
+    model = _load_stored_trip_model_all_users(model_type, model_storage, model_config)
+    if model is None:
+        return [], -1
+    else:
+        predictions, n = model.predict(trip)
+        return predictions, n
 
 def _get_training_data(user_id: UUID, time_query: Optional[estt.TimeQuery]):
     """
@@ -159,6 +180,25 @@ def _load_stored_trip_model(
         model.from_dict(model_dict)
         return model
     
+def _load_stored_trip_model_all_users(
+    model_type: eamumt.ModelType, 
+    model_storage: eamums.ModelStorage,
+    model_config = None) -> Optional[eamuu.TripModel]:
+    """helper to build a user label prediction model class with the 
+    contents of a stored model shared across all users.
+
+    :param model_type: TripModel type configured for this OpenPATH server
+    :param model_storage: storage type
+    :param model_config: optional configuration for model, for debugging purposes
+    :return: model, or None if no model is stored for this user
+    """
+    model_dict = eamums.load_model_all_users(model_type, model_storage)
+    if model_dict is None:
+        return None
+    else:    
+        model = model_type.build(model_config)
+        model.from_dict(model_dict)
+        return model
 
 def _latest_timestamp(trips: List[ecwc.Confirmedtrip]) -> float:
     """extract the latest timestamp observed from a list of trips
diff --git a/emission/core/wrapper/labelprediction.py b/emission/core/wrapper/labelprediction.py
index c74609e5a..9725e95ee 100644
--- a/emission/core/wrapper/labelprediction.py
+++ b/emission/core/wrapper/labelprediction.py
@@ -19,6 +19,7 @@ class AlgorithmTypes(enum.Enum):
     TWO_STAGE_BIN_CLUSTER = 5
     PLACEHOLDER_PREDICTOR_DEMO = 6
     CONFIDENCE_DISCOUNTED_CLUSTER = 7
+    GRADIENT_BOOSTED_DECISION_TREE = 8
 
 
 class Labelprediction(ecwb.WrapperBase):
diff --git a/emission/storage/decorations/analysis_timeseries_queries.py b/emission/storage/decorations/analysis_timeseries_queries.py
index 9f8ab6a70..c684dc127 100644
--- a/emission/storage/decorations/analysis_timeseries_queries.py
+++ b/emission/storage/decorations/analysis_timeseries_queries.py
@@ -38,6 +38,7 @@
 METRICS_DAILY_MEAN_MEDIAN_SPEED = "metrics/daily_mean_median_speed"
 INFERRED_LABELS_KEY = "inference/labels"
 TRIP_MODEL_STORE_KEY = "inference/trip_model"
+REPLACE_MODEL_STORE_KEY = "inference/replace_model"
 
 # General methods