From 7112ec870ce297c01107966c1189a93e86d623e6 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 18 May 2023 18:31:58 +0100 Subject: [PATCH 1/4] Add initial docs on k8s healthchecks Signed-off-by: Nigel Jones --- site/docs/guides/admin/healthchecks.md | 215 +++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 site/docs/guides/admin/healthchecks.md diff --git a/site/docs/guides/admin/healthchecks.md b/site/docs/guides/admin/healthchecks.md new file mode 100644 index 0000000000..31b24d3117 --- /dev/null +++ b/site/docs/guides/admin/healthchecks.md @@ -0,0 +1,215 @@ + + + +There are various API calls that will check the status of Egeria. + +These may be typically used in a Kubernetes environment to check if Egeria is ready to service requests. Here we summarize what is available + +## Example API calls + +In these examples the [httpie](https://httpie.io) tool will be used as it will print both the response code, and pretty-formatted body by default. Other tools like curl may also be used, but more parsing may be required of the responses. + +The examples here were run against the lab charts, using user 'garygeeke'. A simple security plugin is active which restricts user access to api calls. + +### Platform +This checks if the *platform* is available. 
+ +#### Platform is not running +➜ ~ curl -k -X GET --connect-timeout 5 --max-time 5 "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/admin/server-origin" +Egeria OMAG Server Platform (version 4.1-SNAPSHOT) + +#### Platform is running +➜ ~ http --verify=no --pretty=format GET "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/admin/server-origin" +HTTP/1.1 200 +Connection: keep-alive +Content-Length: 42 +Content-Type: text/plain;charset=UTF-8 +Date: Thu, 18 May 2023 17:10:27 GMT +Keep-Alive: timeout=60 + +Egeria OMAG Server Platform (version 4.0) + + + +### Server + +#### Server is not known +``` +➜ ~ http --verify=no --pretty=format GET "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/admin/servers/cocoMDS99/instance/status" +HTTP/1.1 200 +Connection: keep-alive +Content-Type: application/json +Date: Thu, 18 May 2023 17:08:15 GMT +Keep-Alive: timeout=60 +Transfer-Encoding: chunked + +{ + "actionDescription": "getActiveServerStatus", + "class": "OMAGServerStatusResponse", + "exceptionClassName": "org.odpi.openmetadata.frameworks.connectors.ffdc.InvalidParameterException", + "exceptionErrorMessage": "OMAG-MULTI-TENANT-404-001 The OMAG Server cocoMDS99 is not available to service a request from user admin", + "exceptionErrorMessageId": "OMAG-MULTI-TENANT-404-001", + "exceptionErrorMessageParameters": [ + "cocoMDS99", + "admin" + ], + "exceptionProperties": { + "parameterName": "serverName", + "serverName": "cocoMDS99" + }, + "exceptionSystemAction": "The system is unable to process the request because the server is not running on the called platform.", + "exceptionUserAction": "Verify that the correct server is being called on the correct platform and that this server is running. 
Retry the request when the server is available.", + "relatedHTTPCode": 404 +} + +``` +#### No permission for api call +``` +➜ ~ http --verify=no --pretty=format GET "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/admin/servers/cocoMDS5/instance/status" +HTTP/1.1 200 +Connection: keep-alive +Content-Type: application/json +Date: Thu, 18 May 2023 17:07:28 GMT +Keep-Alive: timeout=60 +Transfer-Encoding: chunked + +{ +"actionDescription": "validateUserForServer", +"class": "OMAGServerStatusResponse", +"exceptionClassName": "org.odpi.openmetadata.commonservices.ffdc.exceptions.UserNotAuthorizedException", +"exceptionErrorMessage": "OMAG-PLATFORM-SECURITY-403-002 User admin is not authorized to issue a request to server cocoMDS5", +"exceptionSystemAction": "The system is unable to process a request from the user because they do not have access to the requested OMAG server. The request fails with a UserNotAuthorizedException exception.", +"exceptionUserAction": "Determine whether the user should have access to the server. If they should have, take steps to add them to the authorized list of users. 
If this user should not have access, investigate where the request came from to determine if the system is under attack, or it was a mistake, or the user's tool is not configured to connect to the correct server.", +"relatedHTTPCode": 403 +} +``` +### Server is available +``` +➜ ~ http --verify=no --pretty=format GET "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/garygeeke/servers/cocoMDS2/instance/status" +HTTP/1.1 200 +Connection: keep-alive +Content-Type: application/json +Date: Thu, 18 May 2023 17:06:46 GMT +Keep-Alive: timeout=60 +Transfer-Encoding: chunked + +{ + "class": "OMAGServerStatusResponse", + "relatedHTTPCode": 200, + "serverStatus": { + "serverActiveStatus": "RUNNING", + "serverName": "cocoMDS2", + "serverType": "Metadata Access Store", + "services": [ + { + "serviceName": "Subject Area OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Security Officer OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Open Metadata Repository Services (OMRS)", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Data Privacy OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Community Profile OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Asset Consumer OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Asset Lineage OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Open Metadata Store Services", + "serviceStatus": "STARTING" + }, + { + "serviceName": "Asset Catalog OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "IT Infrastructure OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Asset Owner OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Connected Asset Services", + "serviceStatus": "STARTING" + }, + { + "serviceName": "Digital Architecture OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Glossary View OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Governance Program OMAS", + 
"serviceStatus": "RUNNING" + }, + { + "serviceName": "Project Management OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Governance Engine OMAS", + "serviceStatus": "RUNNING" + }, + { + "serviceName": "Open Integration Service", + "serviceStatus": "STARTING" + } + ] + } +} +``` + + +## Interpreting the API calls + +A timeout will occur if the platform is not running. +In all other cases a HTTP 200 will be returned. + +Additionally the server status call returns fine-grained information about all the services configured in a server. + +In the simplest case it would be reasonable to define that the server is not available until all services are running. + +## what about connectors? + +Each service running on the platform may have a dependence on connectors, such as for topics/kafka, or in the case of integration, technology connectors such as to a database. Each connector may behave differently, and in many cases not report any issue for a transient error. + +## Defining a Kubernetes health check + +See also [Kubernetes docs](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) + +Kubernetes has 3 types of health checks + - startup -- to confirm a pod has started + - readiness -- to confirm a pod is ready. Typically this will then allow requests to be routed here + - liveness -- to check the pod is still responding to requests in a timely fashion. + +Typically pods will be restarted if these health checks do not pass in a specified time period. + +Each of these checks can be of several types + - tcpSocket -- this just checks for an open port. + - grpc -- issues grpc call (we do no use grpc in egeria) + - httpGet -- a simple GET. If return is >=200 and <400 it is successful + - exec -- issues a specified command within the container + +Looking at the checks above, since all return 200 - if anything - they will always succeed. +Therefore a simple httpGet check cannot be used. 
+ +Instead an 'exec' From 70b20bda47655eb5e43f00dd0c4730ebfd8a7c27 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 19 May 2023 10:45:39 +0100 Subject: [PATCH 2/4] Add k8s healthchecks into navigation Signed-off-by: Nigel Jones --- .../docs/guides/{admin => operations/kubernetes}/healthchecks.md | 0 site/docs/guides/operations/kubernetes/index.md | 1 + site/mkdocs.yml | 1 + 3 files changed, 2 insertions(+) rename site/docs/guides/{admin => operations/kubernetes}/healthchecks.md (100%) diff --git a/site/docs/guides/admin/healthchecks.md b/site/docs/guides/operations/kubernetes/healthchecks.md similarity index 100% rename from site/docs/guides/admin/healthchecks.md rename to site/docs/guides/operations/kubernetes/healthchecks.md diff --git a/site/docs/guides/operations/kubernetes/index.md b/site/docs/guides/operations/kubernetes/index.md index 805dcf1a6e..5aff37bd0e 100644 --- a/site/docs/guides/operations/kubernetes/index.md +++ b/site/docs/guides/operations/kubernetes/index.md @@ -10,6 +10,7 @@ Kubernetes offers one standard way of deploying the Egeria platform into a varie - [Egeria charts](charts/overview) - [Container images](container-images.md) - [Developing a custom deployment](custom-deployment.md) +- [Health Checks](healthchecks.md) - [Egeria Operator](operator.md) ---8<-- "snippets/abbr.md" diff --git a/site/mkdocs.yml b/site/mkdocs.yml index 2ccd2a2986..9e2bd2f51e 100644 --- a/site/mkdocs.yml +++ b/site/mkdocs.yml @@ -159,6 +159,7 @@ nav: - PTS Chart: guides/operations/kubernetes/charts/pts.md - Container Images: guides/operations/kubernetes/container-images.md - Custom Deployment: guides/operations/kubernetes/custom-deployment.md + - Healthchecks: guides/operations/kubernetes/healthchecks.md - Egeria Operator: guides/operations/kubernetes/operator.md - Diagnostic Guide: - Diagnostic Process: guides/diagnostic/overview.md From 0284794ca780e81797cca55d0c5035c90f41af92 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 19 May 2023 10:58:40 +0100 
Subject: [PATCH 3/4] Add further k8s healthcheck info including example for checking server status Signed-off-by: Nigel Jones --- .../operations/kubernetes/healthchecks.md | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/site/docs/guides/operations/kubernetes/healthchecks.md b/site/docs/guides/operations/kubernetes/healthchecks.md index 31b24d3117..a0f98b9960 100644 --- a/site/docs/guides/operations/kubernetes/healthchecks.md +++ b/site/docs/guides/operations/kubernetes/healthchecks.md @@ -1,11 +1,30 @@ +# Healthchecks when running Egeria in Kubernetes + There are various API calls that will check the status of Egeria. -These may be typically used in a Kubernetes environment to check if Egeria is ready to service requests. Here we summarize what is available +These may be typically used in a Kubernetes environment to check if Egeria is ready to service requests. Here we summarize what is available. + +## Defining a Kubernetes health check + +See also [Kubernetes docs](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) + +Kubernetes has 3 types of health checks: +- startup -- to confirm a pod has started +- readiness -- to confirm a pod is ready. Typically this will then allow requests to be routed here +- liveness -- to check the pod is still responding to requests in a timely fashion. + +Typically pods will be restarted if these health checks do not pass in a specified time period. -## Example API calls +Each of these checks can be of several types: +- tcpSocket -- this just checks for an open port. +- grpc -- issues a gRPC call (we do not use gRPC in Egeria) +- httpGet -- a simple GET. If return is >=200 and <400 it is successful +- exec -- issues a specified command within the container + +## Example Egeria API calls In these examples the [httpie](https://httpie.io) tool will be used as it will print both the response code, and pretty-formatted body by default. 
Other tools like curl may also be used, but more parsing may be required of the responses. @@ -179,11 +198,13 @@ Transfer-Encoding: chunked ``` -## Interpreting the API calls +## Interpreting the Egeria API calls A timeout will occur if the platform is not running. In all other cases a HTTP 200 will be returned. +Finer-grained detail is provided by the 'relatedHTTPCode' field within the response body -- this is closer to what might be expected from a typical status check. For example, looking at the checks above, all return 200 as the HTTP status code, but vary in terms of the relatedHTTPCode. + Additionally the server status call returns fine-grained information about all the services configured in a server. In the simplest case it would be reasonable to define that the server is not available until all services are running. @@ -192,24 +213,22 @@ In the simplest case it would be reasonable to define that the server is not ava Each service running on the platform may have a dependence on connectors, such as for topics/kafka, or in the case of integration, technology connectors such as to a database. Each connector may behave differently, and in many cases not report any issue for a transient error. -## Defining a Kubernetes health check +## Formulating a k8s health check -See also [Kubernetes docs](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) +We cannot use a k8s HTTP status check since this returns 200 in most cases. Instead we need to use the 'exec' method. This will run a curl command within the container, and parse out the relatedHTTPCode, returning 0 if we think all is ok. -Kubernetes has 3 types of health checks - - startup -- to confirm a pod has started - - readiness -- to confirm a pod is ready. Typically this will then allow requests to be routed here - - liveness -- to check the pod is still responding to requests in a timely fashion. 
+For example +``` +curl -k -o - -X GET --connect-timeout 5 --max-time 5 "https://44623abc-eu-gb.lb.appdomain.cloud:9443/open-metadata/admin-services/users/admin/servers/cocoMDS5/instance/status" | grep '.*\"relatedHTTPCode\": 200' +``` -Typically pods will be restarted if these health checks do not pass in a specified time period. +This will return 0 (healthcheck passes) if we see an embedded 200 response, and 1 otherwise, therefore satisfying the requirement for a server specific Healthcheck + +## Status aggregation + +The above example requires that the healthcheck is coded against a specific server. Future work will investigate the answers to: -Each of these checks can be of several types - - tcpSocket -- this just checks for an open port. - - grpc -- issues grpc call (we do no use grpc in egeria) - - httpGet -- a simple GET. If return is >=200 and <400 it is successful - - exec -- issues a specified command within the container +#### Are all servers on the platform available? -Looking at the checks above, since all return 200 - if anything - they will always succeed. -Therefore a simple httpGet check cannot be used. -Instead an 'exec' +#### Is everything a server provides available? 
From 02859113bce987c33b68fd3c5f7e2dac49c4a466 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 19 May 2023 11:10:04 +0100 Subject: [PATCH 4/4] Add info on server starting up, and refer to jq tool for status parsing Signed-off-by: Nigel Jones --- .../operations/kubernetes/healthchecks.md | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/site/docs/guides/operations/kubernetes/healthchecks.md b/site/docs/guides/operations/kubernetes/healthchecks.md index a0f98b9960..43cf02979a 100644 --- a/site/docs/guides/operations/kubernetes/healthchecks.md +++ b/site/docs/guides/operations/kubernetes/healthchecks.md @@ -51,6 +51,7 @@ Egeria OMAG Server Platform (version 4.0) ### Server +This API call was introduced in [release 3.4](/release-notes/3-4.md). #### Server is not known ``` @@ -196,7 +197,28 @@ Transfer-Encoding: chunked } } ``` +### Server is starting +This is similar to the previous example, but the response body is: +```json + { + "class": "OMAGServerStatusResponse", + "relatedHTTPCode": 200, + "serverStatus": { + "serverName": "cocoMDS2", + "serverType": "Metadata Server", + "serverActiveStatus": "STARTING", + "services": [ + { + "serviceName": "Open Metadata Repository Services (OMRS)", + "serviceStatus": "STARTING" + } + ] + } +``` +Other statuses are referenced in the linked release note. + +This demonstrates a further challenge -- serviceStatus also needs to be checked. ## Interpreting the Egeria API calls @@ -224,11 +246,12 @@ curl -k -o - -X GET --connect-timeout 5 --max-time 5 "https://44623abc-eu-gb.lb. This will return 0 (healthcheck passes) if we see an embedded 200 response, and 1 otherwise, therefore satisfying the requirement for a server specific Healthcheck -## Status aggregation +However, this is insufficient, as we see from the 'starting' example above -- a server would show as ready even if in the process of starting up or shutting down, yet we would not want to direct requests in those cases. 
-The above example requires that the healthcheck is coded against a specific server. Future work will investigate the answers to: +## Status aggregation & finer-grained state -#### Are all servers on the platform available? +We've seen above that we need to go further, look at finer-grained state of the server, and potentially individual services. +We will now make use of the [jq](https://stedolan.github.io/jq/) tool to perform improved queries to check status. -#### Is everything a server provides available? +... content to be added