Skip to content

Commit

Permalink
Define gRPC server for liveness probe
Browse files Browse the repository at this point in the history
[upstream commit 6e17c79]

Currently, we use the tetra status command to report the status of the
tetragon agent. This comes with some overhead, as the tetra binary has a lot
of additional functionality, and it seems like overkill to use it just for
status reporting.

On the other hand, k8s supports liveness probes that use a gRPC
endpoint (see
https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-grpc-liveness-probe).
As a first step, this patch creates a dedicated gRPC server that reports the
agent status and can be used for the liveness probe.

Signed-off-by: Anastasios Papagiannis <[email protected]>
  • Loading branch information
tpapagian committed May 29, 2024
1 parent 69e0cbb commit 331f51a
Show file tree
Hide file tree
Showing 15 changed files with 954 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cmd/tetragon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/cilium/tetragon/pkg/fileutils"
"github.com/cilium/tetragon/pkg/filters"
tetragonGrpc "github.com/cilium/tetragon/pkg/grpc"
"github.com/cilium/tetragon/pkg/health"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/metricsconfig"
Expand Down Expand Up @@ -443,6 +444,10 @@ func tetragonExecute() error {
}
}

if option.Config.HealthServerAddress != "" {
health.StartHealthServer(ctx, option.Config.HealthServerAddress, option.Config.HealthServerInterval)
}

log.WithField("enabled", option.Config.ExportFilename != "").WithField("fileName", option.Config.ExportFilename).Info("Exporter configuration")
obs.AddListener(pm)
saveInitInfo()
Expand Down
2 changes: 2 additions & 0 deletions docs/content/en/docs/reference/daemon-flags.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Flags:
--force-large-progs Force loading large programs, even in kernels with < 5.3 versions
--force-small-progs Force loading small programs, even in kernels with >= 5.3 versions
--gops-address string gops server address (e.g. 'localhost:8118'). Disabled by default
--health-server-address string Health server address (e.g. ':6789') (use '' to disable it) (default ":6789")
--health-server-interval int Health server interval in seconds (default 10)
-h, --help help for tetragon
--k8s-kubeconfig-path string Absolute path of the kubernetes kubeconfig file
--kernel string Kernel version
Expand Down
3 changes: 3 additions & 0 deletions docs/content/en/docs/reference/helm-chart.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ To use [the values available](#values), with `helm install` or `helm upgrade`, u
| tetragon.gops.port | int | `8118` | The port at which to expose gops. |
| tetragon.grpc.address | string | `"localhost:54321"` | The address at which to expose gRPC. Examples: localhost:54321, unix:///var/run/tetragon/tetragon.sock |
| tetragon.grpc.enabled | bool | `true` | Whether to enable exposing Tetragon gRPC. |
| tetragon.healthGrpc.enabled | bool | `true` | Whether to enable health gRPC server. |
| tetragon.healthGrpc.interval | int | `10` | The interval at which to check the health of the agent. |
| tetragon.healthGrpc.port | int | `6789` | The port at which to expose health gRPC. |
| tetragon.hostProcPath | string | `"/proc"` | Location of the host proc filesystem in the runtime environment. If the runtime runs in the host, the path is /proc. Exceptions to this are environments like kind, where the runtime itself does not run on the host. |
| tetragon.image.override | string | `nil` | |
| tetragon.image.repository | string | `"quay.io/cilium/tetragon"` | |
Expand Down
3 changes: 3 additions & 0 deletions install/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ Helm chart for Tetragon
| tetragon.gops.port | int | `8118` | The port at which to expose gops. |
| tetragon.grpc.address | string | `"localhost:54321"` | The address at which to expose gRPC. Examples: localhost:54321, unix:///var/run/tetragon/tetragon.sock |
| tetragon.grpc.enabled | bool | `true` | Whether to enable exposing Tetragon gRPC. |
| tetragon.healthGrpc.enabled | bool | `true` | Whether to enable health gRPC server. |
| tetragon.healthGrpc.interval | int | `10` | The interval at which to check the health of the agent. |
| tetragon.healthGrpc.port | int | `6789` | The port at which to expose health gRPC. |
| tetragon.hostProcPath | string | `"/proc"` | Location of the host proc filesystem in the runtime environment. If the runtime runs in the host, the path is /proc. Exceptions to this are environments like kind, where the runtime itself does not run on the host. |
| tetragon.image.override | string | `nil` | |
| tetragon.image.repository | string | `"quay.io/cilium/tetragon"` | |
Expand Down
6 changes: 6 additions & 0 deletions install/kubernetes/templates/tetragon_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ data:
server-address: {{ .Values.tetragon.grpc.address }}
{{- else }}
{{- end }}
{{- if .Values.tetragon.healthGrpc.enabled }}
health-server-address: :{{ .Values.tetragon.healthGrpc.port }}
health-server-interval: {{ .Values.tetragon.healthGrpc.interval | quote }}
{{- else }}
health-server-address: ""
{{- end }}
{{- if .Values.tetragon.tcpStatsSampleSegs }}
tcp-stats-sample-segs: {{ .Values.tetragon.tcpStatsSampleSegs | quote }}
{{- end }}
Expand Down
7 changes: 7 additions & 0 deletions install/kubernetes/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,13 @@ tetragon:
enablePolicyFilterDebug: false
# Enable latency monitoring in message handling
enableMsgHandlingLatency: false
healthGrpc:
# -- Whether to enable health gRPC server.
enabled: true
# -- The port at which to expose health gRPC.
port: 6789
# -- The interval at which to check the health of the agent.
interval: 10
# -- Location of the host proc filesystem in the runtime environment. If the runtime runs in the
# host, the path is /proc. Exceptions to this are environments like kind, where the runtime itself
# does not run on the host.
Expand Down
67 changes: 67 additions & 0 deletions pkg/health/server.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package health

import (
"context"
"net"
"time"

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/logger"
"google.golang.org/grpc"
gh "google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
)

var (
log = logger.GetLogger()
)

func StartHealthServer(ctx context.Context, address string, interval int) {
// Create a new health server and mark it as serving.
healthServer := gh.NewServer()
healthServer.SetServingStatus("liveness", grpc_health_v1.HealthCheckResponse_SERVING)

// Create a new gRPC server for health checks and register the healthServer.
grpcHealthServer := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(grpcHealthServer, healthServer)

// Start the gRPC server for the health checks.
go func() {
// the gRPC server for the health checks listens on port 6789
listener, err := net.Listen("tcp", address)
if err != nil {
log.WithError(err).Fatal("Failed to listen for gRPC healthserver")
}

log.WithField("address", address).WithField("interval", interval).Info("Starting gRPC health server")
if err = grpcHealthServer.Serve(listener); err != nil {
log.WithError(err).Fatal("Failed to start gRPC healthserver")
}
}()

// Check the agent health periodically. To check if our agent is health we call
// health.GetHealth() and we report the status to the healthServer.
go func() {
ticker := time.NewTicker(time.Duration(interval) * time.Second)
for {
select {
case <-ticker.C:
servingStatus := grpc_health_v1.HealthCheckResponse_NOT_SERVING
if response, err := GetHealth(); err == nil {
if st := response.GetHealthStatus(); len(st) > 0 && st[0].Status == tetragon.HealthStatusResult_HEALTH_STATUS_RUNNING {
servingStatus = grpc_health_v1.HealthCheckResponse_SERVING
}
}
healthServer.SetServingStatus("liveness", servingStatus)
case <-ctx.Done():
ticker.Stop()
healthServer.Shutdown() // set all services to NOT_SERVING
grpcHealthServer.Stop()
return
}
}
}()
}
3 changes: 3 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ type config struct {
EnablePodInfo bool

ExposeKernelAddresses bool

HealthServerAddress string
HealthServerInterval int
}

var (
Expand Down
8 changes: 8 additions & 0 deletions pkg/option/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ const (
KeyEnablePodInfo = "enable-pod-info"

KeyExposeKernelAddresses = "expose-kernel-addresses"

KeyHealthServerAddress = "health-server-address"
KeyHealthTimeInterval = "health-server-interval"
)

func ReadAndSetFlags() error {
Expand Down Expand Up @@ -169,6 +172,8 @@ func ReadAndSetFlags() error {

Config.ExposeKernelAddresses = viper.GetBool(KeyExposeKernelAddresses)

Config.HealthServerAddress = viper.GetString(KeyHealthServerAddress)
Config.HealthServerInterval = viper.GetInt(KeyHealthTimeInterval)
return nil
}

Expand Down Expand Up @@ -274,4 +279,7 @@ func AddFlags(flags *pflag.FlagSet) {
flags.Bool(KeyEnablePodInfo, false, "Enable PodInfo custom resource")

flags.Bool(KeyExposeKernelAddresses, false, "Expose real kernel addresses in events stack traces")

flags.String(KeyHealthServerAddress, ":6789", "Health server address (e.g. ':6789')(use '' to disabled it)")
flags.Int(KeyHealthTimeInterval, 10, "Health server interval in seconds")
}
117 changes: 117 additions & 0 deletions vendor/google.golang.org/grpc/health/client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 331f51a

Please sign in to comment.