-
Notifications
You must be signed in to change notification settings - Fork 119
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Scalability fixes - Load model once per user #944
Changes from all commits
408cad3
7d5b5d2
f924776
a7668f5
d973da3
d042e1b
fb278ff
30c02a4
b15a0e0
54a42a8
7f70be9
6c2c9c0
433b40a
9032ce0
7950981
425966a
64232ab
3372604
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,8 @@ | |
import logging | ||
import random | ||
import copy | ||
import time | ||
import arrow | ||
|
||
# Our imports | ||
import emission.storage.pipeline_queries as epq | ||
|
@@ -55,48 +57,68 @@ def run_prediction_pipeline(self, user_id, time_range): | |
self.ts = esta.TimeSeries.get_time_series(user_id) | ||
self.toPredictTrips = esda.get_entries( | ||
esda.CLEANED_TRIP_KEY, user_id, time_query=time_range) | ||
for cleaned_trip in self.toPredictTrips: | ||
# Create an inferred trip | ||
|
||
cleaned_trip_list = self.toPredictTrips | ||
inferred_trip_list = [] | ||
results_dict = {} | ||
ensemble_list = [] | ||
|
||
# Create list of inferred trips | ||
for cleaned_trip in cleaned_trip_list: | ||
cleaned_trip_dict = copy.copy(cleaned_trip)["data"] | ||
inferred_trip = ecwe.Entry.create_entry(user_id, "analysis/inferred_trip", cleaned_trip_dict) | ||
|
||
inferred_trip_list.append(inferred_trip) | ||
|
||
if inferred_trip_list: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't this always going to be true, given that you define `inferred_trip_list` above? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I did test this out, and Python evaluates an empty list as False when it is used in an `if` condition. |
||
# Computing outside loop by passing trip_list to ensure model loads once | ||
# Run the algorithms and the ensemble, store results | ||
results = self.compute_and_save_algorithms(inferred_trip) | ||
ensemble = self.compute_and_save_ensemble(inferred_trip, results) | ||
results_dict = self.compute_and_save_algorithms(inferred_trip_list) | ||
ensemble_list = self.compute_and_save_ensemble(inferred_trip_list, results_dict) | ||
|
||
start_insert_inferred_trip_time = time.process_time() | ||
for cleaned_trip, inferred_trip, ensemble in zip(cleaned_trip_list, inferred_trip_list, ensemble_list): | ||
# Put final results into the inferred trip and store it | ||
inferred_trip["data"]["cleaned_trip"] = cleaned_trip.get_id() | ||
inferred_trip["data"]["inferred_labels"] = ensemble["prediction"] | ||
self.ts.insert(inferred_trip) | ||
|
||
if self._last_trip_done is None or self._last_trip_done["data"]["end_ts"] < cleaned_trip["data"]["end_ts"]: | ||
self._last_trip_done = cleaned_trip | ||
# print(f"{arrow.now()} Inside run_prediction_pipeline: Saving inferred_trip total time = {time.process_time() - start_insert_inferred_trip_time}") | ||
|
||
# This is where the labels for a given trip are actually predicted. | ||
# Though the only information passed in is the trip object, the trip object can provide the | ||
# user_id and other potentially useful information. | ||
def compute_and_save_algorithms(self, trip): | ||
predictions = [] | ||
def compute_and_save_algorithms(self, trip_list): | ||
predictions_dict = {trip.get_id(): [] for trip in trip_list} | ||
for algorithm_id, algorithm_fn in primary_algorithms.items(): | ||
prediction = algorithm_fn(trip) | ||
lp = ecwl.Labelprediction() | ||
lp.trip_id = trip.get_id() | ||
lp.algorithm_id = algorithm_id | ||
lp.prediction = prediction | ||
lp.start_ts = trip["data"]["start_ts"] | ||
lp.end_ts = trip["data"]["end_ts"] | ||
self.ts.insert_data(self.user_id, "inference/labels", lp) | ||
predictions.append(lp) | ||
return predictions | ||
prediction_list = algorithm_fn(trip_list) | ||
start_insert_inference_labels_time = time.process_time() | ||
for trip, prediction in zip(trip_list, prediction_list): | ||
lp = ecwl.Labelprediction() | ||
lp.algorithm_id = algorithm_id | ||
lp.trip_id = trip.get_id() | ||
lp.prediction = prediction | ||
lp.start_ts = trip["data"]["start_ts"] | ||
lp.end_ts = trip["data"]["end_ts"] | ||
self.ts.insert_data(self.user_id, "inference/labels", lp) | ||
predictions_dict[trip.get_id()].append(lp) | ||
# print(f"{arrow.now()} Inside compute_and_save_algorithms: Saving inference/labels total time = {time.process_time() - start_insert_inference_labels_time}") | ||
return predictions_dict | ||
|
||
# Combine all our predictions into a single ensemble prediction. | ||
# As a placeholder, we just take the first prediction. | ||
# TODO: implement a real combination algorithm. | ||
def compute_and_save_ensemble(self, trip, predictions): | ||
il = ecwl.Labelprediction() | ||
il.trip_id = trip.get_id() | ||
il.start_ts = trip["data"]["start_ts"] | ||
il.end_ts = trip["data"]["end_ts"] | ||
(il.algorithm_id, il.prediction) = ensemble(trip, predictions) | ||
self.ts.insert_data(self.user_id, "analysis/inferred_labels", il) | ||
return il | ||
def compute_and_save_ensemble(self, trip_list, predictions_dict):
    """Build and store one ensemble label prediction per trip.

    For every trip in ``trip_list`` this creates a ``Labelprediction``,
    fills it with the result of ``ensemble(trip, predictions)``, and saves
    it under the "analysis/inferred_labels" key.

    :param trip_list: the trips to compute ensemble predictions for.
    :param predictions_dict: per-algorithm predictions keyed by
        ``trip.get_id()``, as built by ``compute_and_save_algorithms``.
    :return: the stored ``Labelprediction`` objects, in the same order as
        ``trip_list``.
    :raises KeyError: if ``predictions_dict`` has no entry for a trip's id.
    """
    il_list = []
    for trip in trip_list:
        trip_id = trip.get_id()
        il = ecwl.Labelprediction()
        il.trip_id = trip_id
        il.start_ts = trip["data"]["start_ts"]
        il.end_ts = trip["data"]["end_ts"]
        # Look the predictions up by trip id instead of zipping the trip list
        # against the dict's keys: positional pairing silently depends on the
        # dict's insertion order matching trip_list and truncates on a
        # length mismatch.
        (il.algorithm_id, il.prediction) = ensemble(trip, predictions_dict[trip_id])
        self.ts.insert_data(self.user_id, "analysis/inferred_labels", il)
        il_list.append(il)
    return il_list
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, future fix: you should add some additional details to the log - what were the unique user IDs, and how many is "multiple"? Otherwise, if I see this log, I am not sure how to debug it.