Skip to content

Commit

Permalink
Define gRPC server for liveness probe
Browse files Browse the repository at this point in the history
Currently, we use the tetra status command to report the status of the
tetragon agent. This comes with some overhead, as the tetra binary has a lot
of additional functionality, and it seems like overkill to use it for
status reporting.

On the other hand, k8s supports liveness probes by using a gRPC
endpoint (i.e.
https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-grpc-liveness-probe).
This patch first creates a dedicated gRPC server to report agent status that
can be used for the liveness probe.

Signed-off-by: Anastasios Papagiannis <[email protected]>
  • Loading branch information
tpapagian committed May 29, 2024
1 parent 4c47837 commit f39587d
Show file tree
Hide file tree
Showing 15 changed files with 958 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cmd/tetragon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/cilium/tetragon/pkg/fileutils"
"github.com/cilium/tetragon/pkg/filters"
tetragonGrpc "github.com/cilium/tetragon/pkg/grpc"
"github.com/cilium/tetragon/pkg/health"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/metricsconfig"
Expand Down Expand Up @@ -440,6 +441,10 @@ func tetragonExecute() error {
}
}

if option.Config.HealthServerAddress != "" {
health.StartHealthServer(ctx, option.Config.HealthServerAddress, option.Config.HealthServerInterval)
}

log.WithField("enabled", option.Config.ExportFilename != "").WithField("fileName", option.Config.ExportFilename).Info("Exporter configuration")
obs.AddListener(pm)
saveInitInfo()
Expand Down
3 changes: 3 additions & 0 deletions docs/content/en/docs/reference/helm-chart.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions docs/data/tetragon_flags.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions install/kubernetes/tetragon/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions install/kubernetes/tetragon/templates/tetragon_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ data:
server-address: {{ .Values.tetragon.grpc.address }}
{{- else }}
{{- end }}
{{- if .Values.tetragon.healthGrpc.enabled }}
health-server-address: :{{ .Values.tetragon.healthGrpc.port }}
health-server-interval: {{ .Values.tetragon.healthGrpc.interval | quote }}
{{- else }}
health-server-address: ""
{{- end }}
{{- if .Values.tetragon.tcpStatsSampleSegs }}
tcp-stats-sample-segs: {{ .Values.tetragon.tcpStatsSampleSegs | quote }}
{{- end }}
Expand Down
7 changes: 7 additions & 0 deletions install/kubernetes/tetragon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ tetragon:
enablePolicyFilterDebug: false
# -- Enable latency monitoring in message handling
enableMsgHandlingLatency: false
healthGrpc:
# -- Whether to enable health gRPC server.
enabled: true
# -- The port at which to expose health gRPC.
port: 6789
# -- The interval at which to check the health of the agent.
interval: 10
# -- Location of the host proc filesystem in the runtime environment. If the runtime runs in the
# host, the path is /proc. Exceptions to this are environments like kind, where the runtime itself
# does not run on the host.
Expand Down
67 changes: 67 additions & 0 deletions pkg/health/server.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package health

import (
"context"
"net"
"time"

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/logger"
"google.golang.org/grpc"
gh "google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
)

var (
// log is the package-level logger, obtained from logger.GetLogger().
log = logger.GetLogger()
)

func StartHealthServer(ctx context.Context, address string, interval int) {
// Create a new health server and mark it as serving.
healthServer := gh.NewServer()
healthServer.SetServingStatus("liveness", grpc_health_v1.HealthCheckResponse_SERVING)

// Create a new gRPC server for health checks and register the healthServer.
grpcHealthServer := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(grpcHealthServer, healthServer)

// Start the gRPC server for the health checks.
go func() {
// the gRPC server for the health checks listens on port 6789
listener, err := net.Listen("tcp", address)
if err != nil {
log.WithError(err).Fatal("Failed to listen for gRPC healthserver")
}

log.WithField("address", address).WithField("interval", interval).Info("Starting gRPC health server")
if err = grpcHealthServer.Serve(listener); err != nil {
log.WithError(err).Fatal("Failed to start gRPC healthserver")
}
}()

// Every 10 seconds check the agent health. To check if our agent is health we call
// health.GetHealth() and we report the status to the healthServer.
go func() {
ticker := time.NewTicker(time.Duration(interval) * time.Second)
for {
select {
case <-ticker.C:
servingStatus := grpc_health_v1.HealthCheckResponse_NOT_SERVING
if response, err := GetHealth(); err == nil {
if st := response.GetHealthStatus(); len(st) > 0 && st[0].Status == tetragon.HealthStatusResult_HEALTH_STATUS_RUNNING {
servingStatus = grpc_health_v1.HealthCheckResponse_SERVING
}
}
healthServer.SetServingStatus("liveness", servingStatus)
case <-ctx.Done():
ticker.Stop()
healthServer.Shutdown() // set all services to NOT_SERVING
grpcHealthServer.Stop()
return
}
}
}()
}
3 changes: 3 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ type config struct {
CgroupRate CgroupRate

UsernameMetadata int

HealthServerAddress string
HealthServerInterval int
}

var (
Expand Down
8 changes: 8 additions & 0 deletions pkg/option/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ const (
KeyCgroupRate = "cgroup-rate"

KeyUsernameMetadata = "username-metadata"

KeyHealthServerAddress = "health-server-address"
KeyHealthTimeInterval = "health-server-interval"
)

type UsernameMetadaCode int
Expand Down Expand Up @@ -211,6 +214,8 @@ func ReadAndSetFlags() error {
}

Config.CgroupRate = ParseCgroupRate(viper.GetString(KeyCgroupRate))
Config.HealthServerAddress = viper.GetString(KeyHealthServerAddress)
Config.HealthServerInterval = viper.GetInt(KeyHealthTimeInterval)
return nil
}

Expand Down Expand Up @@ -362,4 +367,7 @@ func AddFlags(flags *pflag.FlagSet) {
flags.String(KeyUsernameMetadata, "disabled", "Resolve UIDs to user names for processes running in host namespace")

flags.String(KeyCgroupRate, "", "Base sensor events cgroup rate <events,interval> disabled by default ('1000/1s' means rate 1000 events per second")

flags.String(KeyHealthServerAddress, ":6789", "Health server address (e.g. ':6789')(use '' to disabled it)")
flags.Int(KeyHealthTimeInterval, 10, "Health server interval in seconds")
}
117 changes: 117 additions & 0 deletions vendor/google.golang.org/grpc/health/client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f39587d

Please sign in to comment.