Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

otp-runner deployment #316

Merged
merged 6 commits into from
Jul 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 140 additions & 151 deletions src/main/java/com/conveyal/datatools/manager/jobs/DeployJob.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.conveyal.datatools.manager.auth.Auth0UserProfile;
import com.conveyal.datatools.manager.models.Deployment;
import com.conveyal.datatools.manager.models.OtpServer;
import com.conveyal.datatools.manager.utils.json.JsonUtil;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
Expand All @@ -35,8 +36,7 @@
import java.io.IOException;
import java.util.Collections;

import static com.conveyal.datatools.manager.jobs.DeployJob.BUNDLE_DOWNLOAD_COMPLETE_FILE;
import static com.conveyal.datatools.manager.jobs.DeployJob.GRAPH_STATUS_FILE;
import static com.conveyal.datatools.manager.jobs.DeployJob.OTP_RUNNER_STATUS_FILE;

/**
* Job that is dispatched during a {@link DeployJob} that spins up EC2 instances. This handles waiting for the server to
Expand All @@ -53,9 +53,9 @@ public class MonitorServerStatusJob extends MonitorableJob {
private final AmazonEC2 ec2;
private final AmazonElasticLoadBalancing elbClient;
private final CloseableHttpClient httpClient = HttpClients.createDefault();
// Delay checks by twenty seconds to give user-data script time to upload the instance's user data log if part of the
// Delay checks by four seconds to give user-data script time to upload the instance's user data log if part of the
// script fails (e.g., uploading or downloading a file).
private static final int DELAY_SECONDS = 20;
private static final int DELAY_SECONDS = 4;
evansiroky marked this conversation as resolved.
Show resolved Hide resolved
public long graphTaskSeconds;

public MonitorServerStatusJob(Auth0UserProfile owner, DeployJob deployJob, Instance instance, boolean graphAlreadyBuilt) {
Expand Down Expand Up @@ -93,86 +93,61 @@ public String getDeploymentId () {

@Override
public void jobLogic() {
String ipUrl = "http://" + instance.getPublicIpAddress();
// Get OTP URL for instance to check for availability.
boolean routerIsAvailable = false, graphIsAvailable = false;
String ipUrl = "http://" + instance.getPublicIpAddress();
if (otpServer.ec2Info == null || otpServer.ec2Info.targetGroupArn == null) {
// Fail the job from the outset if there is no target group defined.
failJob("There is no load balancer under which to register ec2 instance.");
}
try {
if (graphAlreadyBuilt) {
// If graph already build, instance's user data will download Graph.obj automatically instead of bundle.
status.update("Loading graph...", 40);
} else {
// Otherwise, we need to verify that the bundle downloaded successfully.
boolean bundleIsDownloaded = false;
// Progressively check status of OTP server
if (deployment.buildGraphOnly) {
// No need to check that OTP is running. Just check to see that the graph is built.
routerIsAvailable = true;
}
// First, check that OTP has started up.
status.update("Prepping for graph build...", 20);
String bundleUrl = String.join("/", ipUrl, BUNDLE_DOWNLOAD_COMPLETE_FILE);
long bundleDownloadStartTime = System.currentTimeMillis();
while (!bundleIsDownloaded) {
// If the request is successful, the OTP instance has started.
waitAndCheckInstanceHealth("bundle download check: " + bundleUrl);
bundleIsDownloaded = checkForSuccessfulRequest(bundleUrl);
// wait 20 minutes max for the bundle to download
long maxBundleDownloadTimeMillis = 20 * 60 * 1000;
if (taskHasTimedOut(bundleDownloadStartTime, maxBundleDownloadTimeMillis)) {
failJob("Job timed out while checking for server bundle download status.");
return;
}
}
// Check status of bundle download and fail job if there was a failure.
String bundleStatus = getUrlAsString(bundleUrl);
if (bundleStatus == null || !bundleStatus.contains("SUCCESS")) {
failJob("Failure encountered while downloading transit bundle.");
// Wait for otp-runner to produce first status file
long statusCheckStartTime = System.currentTimeMillis();
String statusUrl = String.join("/", ipUrl, OTP_RUNNER_STATUS_FILE);
boolean otpRunnerStatusAvailable = false;
while (!otpRunnerStatusAvailable) {
// If the request is successful, the OTP instance has started.
waitAndCheckInstanceHealth("otp-runner status file availability check: " + statusUrl);
otpRunnerStatusAvailable = checkForSuccessfulRequest(statusUrl);
long maxOtpRunnerStartupTimeMillis = 5 * 60 * 1000;
if (taskHasTimedOut(statusCheckStartTime, maxOtpRunnerStartupTimeMillis)) {
failJob("Job timed out while waiting for otp-runner to produce a status file!");
return;
}
long bundleDownloadSeconds = (System.currentTimeMillis() - bundleDownloadStartTime) / 1000;
LOG.info("Bundle downloaded in {} seconds!", bundleDownloadSeconds);
status.update("Building graph...", 30);
}
// Once bundle is downloaded, we await the build (or download if graph already built) of the graph.
long graphCheckStartTime = System.currentTimeMillis();
String graphStatusUrl = String.join("/", ipUrl, GRAPH_STATUS_FILE);
while (!graphIsAvailable) {
// Wait for otp-runner to write a status that fulfills expectations of this job
statusCheckStartTime = System.currentTimeMillis();
boolean otpRunnerCompleted = false;
while (!otpRunnerCompleted) {
// If the request is successful, the OTP instance has started.
waitAndCheckInstanceHealth("graph build/download check: " + graphStatusUrl);
graphIsAvailable = checkForSuccessfulRequest(graphStatusUrl);
// wait a maximum of 4 hours if building the graph, or 20 minutes if downloading a graph
long maxGraphBuildOrDownloadWaitTimeMillis = graphAlreadyBuilt ? 20 * 60 * 1000 : 4 * 60 * 60 * 1000;
if (taskHasTimedOut(graphCheckStartTime, maxGraphBuildOrDownloadWaitTimeMillis)) {
failJob("Job timed out while waiting for graph build/download. If this was a graph building machine, it may have run out of memory.");
waitAndCheckInstanceHealth("otp-runner completion check: " + statusUrl);
otpRunnerCompleted = checkForOtpRunnerCompletion(statusUrl);
// Check if an otp-runner status file check has already failed this job.
if (status.error) {
return;
}
// wait a maximum of 5 hours if building a graph, or 1 hour if just starting a server
long maxOtpRunnerWaitTimeMillis = (graphAlreadyBuilt ? 5 : 1) * 60 * 60 * 1000;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did we bump up the wait time for graph build? We should justify this amount of time in the comments.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe I bumped it up to 5 hours because I rounded up from the previous scheme of 20 min bundle download timeout + 4 hour graph build timeout.

if (taskHasTimedOut(statusCheckStartTime, maxOtpRunnerWaitTimeMillis)) {
failJob("Job timed out while waiting for otp-runner to finish!");
return;
}
}
// Check graph status and fail job if there was a failure.
String graphStatus = getUrlAsString(graphStatusUrl);
if (graphStatus == null || !graphStatus.contains("SUCCESS")) {
failJob("Failure encountered while building/downloading graph.");
return;
}
graphTaskSeconds = (System.currentTimeMillis() - graphCheckStartTime) / 1000;
graphTaskSeconds = (System.currentTimeMillis() - statusCheckStartTime) / 1000;
String message = String.format("Graph build/download completed in %d seconds!", graphTaskSeconds);
LOG.info(message);
// If only task for this instance is to build the graph (either because that is the deployment purpose or
// because this instance type/image is for graph building only), this machine's job is complete and we can
// consider this job done.
if (deployment.buildGraphOnly || (!graphAlreadyBuilt && otpServer.ec2Info.hasSeparateGraphBuildConfig())) {
if (isBuildOnlyServer()) {
status.completeSuccessfully(message);
LOG.info("View logs at {}", getUserDataLogS3Path());
LOG.info("View logs at {}", getOtpRunnerLogS3Path());
return;
}
status.update("Loading graph...", 70);
// Once this is confirmed, check for the availability of the router, which will indicate that the graph
// load has completed successfully.
String routerUrl = String.join("/", ipUrl, "otp/routers/default");
long routerCheckStartTime = System.currentTimeMillis();
boolean routerIsAvailable = false;
while (!routerIsAvailable) {
// If the request was successful, the graph build is complete!
// TODO: Substitute in specific router ID? Or just default to... "default".
Expand Down Expand Up @@ -220,7 +195,7 @@ public void jobLogic() {
routerUrl
)
);
LOG.info("View logs at {}", getUserDataLogS3Path());
LOG.info("View logs at {}", getOtpRunnerLogS3Path());
deployJob.incrementCompletedServers();
} catch (InstanceHealthException e) {
// If at any point during the job, an instance health check indicates that the EC2 instance being monitored
Expand All @@ -232,19 +207,23 @@ public void jobLogic() {
}
}

/**
 * Whether the only task for this instance is to build the graph: either the deployment itself is build-only, or the
 * graph has not been built yet and the server defines a separate graph build config.
 */
private boolean isBuildOnlyServer() {
    // A build-only deployment never needs a running OTP server.
    if (deployment.buildGraphOnly) {
        return true;
    }
    // With a graph already available there is nothing left to build on this instance.
    if (graphAlreadyBuilt) {
        return false;
    }
    return otpServer.ec2Info.hasSeparateGraphBuildConfig();
}

/**
* Gets the expected path to the user data logs that get uploaded to s3
* Gets the expected path to the otp-runner logs that get uploaded to s3
*/
private String getUserDataLogS3Path() {
return String.format("%s/%s.log", deployJob.getS3FolderURI(), instance.getInstanceId());
private String getOtpRunnerLogS3Path() {
return String.format("%s/%s-otp-runner.log", deployJob.getS3FolderURI(), instance.getInstanceId());
}

/**
* Helper that fails with a helpful message about where to find uploaded logs.
*/
private void failJob(String message) {
LOG.error(message);
status.fail(String.format("%s Check logs at: %s", message, getUserDataLogS3Path()));
status.fail(String.format("%s Check logs at: %s", message, getOtpRunnerLogS3Path()));
}

/** Determine if a specific task has passed time limit for its run time. */
Expand Down Expand Up @@ -299,15 +278,27 @@ public InstanceHealthException(String instanceStateName) {
}
}

/** Make HTTP request to URL and return the string response. */
private String getUrlAsString(String url) {
private boolean checkForOtpRunnerCompletion(String url) {
HttpGet httpGet = new HttpGet(url);
OtpRunnerStatus otpRunnerStatus;
try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
return EntityUtils.toString(response.getEntity());
otpRunnerStatus = JsonUtil.objectMapper.readValue(response.getEntity().getContent(), OtpRunnerStatus.class);
} catch (IOException e) {
LOG.error("Could not complete request to {}", url);
LOG.error("Could not get otp-runner status from {}", url);
e.printStackTrace();
return null;
return false;
}
if (otpRunnerStatus.error) {
failJob(otpRunnerStatus.message);
return false;
}
status.update(otpRunnerStatus.message, otpRunnerStatus.pctProgress);
if (graphAlreadyBuilt || !isBuildOnlyServer()) {
// server that finishes after OTP server is successfully started
return otpRunnerStatus.serverStarted;
} else {
// server that finishes after graph is uploaded
return otpRunnerStatus.graphUploaded;
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package com.conveyal.datatools.manager.jobs;

import java.util.List;

/**
* A mapping of the possible values of an otp-runner manifest. For further documentation please see otp-runner docs at
* https://github.com/ibi-group/otp-runner#manifestjson-values
*
* NOTE(review): every field is public and has no accessor; presumably the field names double as the manifest's JSON
* keys so that a JSON mapper can write this class directly. The code that produces the manifest is not visible here,
* so confirm before renaming any field.
*/
public class OtpRunnerManifest {
evansiroky marked this conversation as resolved.
Show resolved Hide resolved
public String buildConfigJSON;
public boolean buildGraph;
public String buildLogFile;
public String graphObjUrl;
public String graphsFolder;
public List<String> gtfsAndOsmUrls;
public String jarFile;
public String jarUrl;
public String otpRunnerLogFile;
public boolean prefixLogUploadsWithInstanceId;
public String routerConfigJSON;
public String routerName;
public boolean runServer;
public String s3UploadPath;
public String serverLogFile;
public int serverStartupTimeoutSeconds;
public String statusFileLocation;
public boolean uploadGraphBuildLogs;
public boolean uploadGraphBuildReport;
public boolean uploadGraph;
public boolean uploadOtpRunnerLogs;
public boolean uploadServerStartupLogs;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package com.conveyal.datatools.manager.jobs;

/**
* A mapping of the fields and values that otp-runner writes to a status file. See otp-runner documentation for more
* info: https://github.com/ibi-group/otp-runner#statusjson
*
* MonitorServerStatusJob#checkForOtpRunnerCompletion fetches the status file over HTTP and deserializes the response
* body into this class with JsonUtil.objectMapper, so the public field names here must match the keys in that file.
*/
public class OtpRunnerStatus {
evansiroky marked this conversation as resolved.
Show resolved Hide resolved
// When true, the monitoring job is failed using `message` as the failure text.
public boolean error;
// Not read by the monitoring code visible in MonitorServerStatusJob.
public boolean graphBuilt;
// Completion signal for a build-only server whose graph was not already built.
public boolean graphUploaded;
// Completion signal for every other case (graph already built, or the server is not build-only).
public boolean serverStarted;
// Forwarded to the job status as the progress message, or as the failure text when `error` is set.
public String message;
// Not read by the monitoring code visible in MonitorServerStatusJob.
public int numFilesDownloaded;
// Forwarded to the job status as the progress percentage.
public double pctProgress;
// Not read by the monitoring code visible in MonitorServerStatusJob.
public int totalFilesToDownload;
}
Loading