Skip to content

Commit

Permalink
Allow for configuring the chaos-handler (#840)
Browse files Browse the repository at this point in the history
* allow for configuring the handler timeout

* make timeout error configurable

* control the max timeout

* Specify namespace in on_init example

* check for init config in annotations

* edit docs and defaults

* log a warning when timeout annotation is bad
  • Loading branch information
ptnapoleon authored Feb 26, 2024
1 parent bacc083 commit e7f2698
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 18 deletions.
1 change: 1 addition & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,4 @@ data:
image: {{ template "chaos-controller.format-image" deepCopy .Values.global.chaos.defaultImage | merge .Values.global.oci | merge .Values.handler.image }}
enabled: {{ .Values.handler.enabled }}
timeout: {{ .Values.handler.timeout | quote }}
maxTimeout: {{ .Values.handler.maxTimeout | quote }}
3 changes: 2 additions & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ handler:
image:
repo: chaos-handler
enabled: true # enable the chaos handler (required to use the onInit disruption feature)
timeout: 1m # time the handler init container will wait before exiting if no signal is received
timeout: 10m # time the handler init container will wait before exiting if no signal is received
maxTimeout: 2h # maximum amount of time to allow users to configure for their handler timeout

proxy:
image:
Expand Down
12 changes: 9 additions & 3 deletions cli/handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ import (
)

var (
logger *zap.SugaredLogger
timeout time.Duration
logger *zap.SugaredLogger
timeout time.Duration
succeedOnTimeout bool

rootCmd = &cobra.Command{
Use: "chaos-handler",
Expand All @@ -40,14 +41,19 @@ var (
os.Exit(0)
case <-timer.C:
logger.Info("timed out, SIGUSR1 was never received, exiting")
os.Exit(1)
if succeedOnTimeout {
os.Exit(0)
} else {
os.Exit(1)
}
}
},
}
)

// init registers the chaos-handler command-line flags on the root command.
func init() {
	flags := rootCmd.PersistentFlags()

	// --timeout: how long to wait for the signal before exiting (defaults to one minute)
	flags.DurationVar(&timeout, "timeout", time.Minute, "Time to wait for the signal before the handler exits by itself")

	// --succeed-on-timeout: off by default
	flags.BoolVar(&succeedOnTimeout, "succeed-on-timeout", false, "If set to true, this container will exit 0 after a timeout, otherwise will exit 1")
}

func main() {
Expand Down
13 changes: 10 additions & 3 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,10 @@ type injectorNetworkDisruptionConfig struct {
}

// handlerConfig holds the chaos handler settings read from the "handler"
// section of the configuration (enabled, image, timeout, maxTimeout).
type handlerConfig struct {
Enabled bool `json:"enabled"`
Image string `json:"image"`
Timeout time.Duration `json:"timeout"`
Enabled bool `json:"enabled"`
Image string `json:"image"`
Timeout time.Duration `json:"timeout"`
// MaxTimeout is bound to the handler-max-timeout flag ("Handler init container maximum timeout").
MaxTimeout time.Duration `json:"maxTimeout"`
}

const DefaultDisruptionDeletionTimeout = time.Minute * 15
Expand Down Expand Up @@ -291,6 +292,12 @@ func New(logger *zap.SugaredLogger, osArgs []string) (config, error) {
return cfg, err
}

mainFS.DurationVar(&cfg.Handler.MaxTimeout, "handler-max-timeout", time.Hour, "Handler init container maximum timeout")

if err := viper.BindPFlag("handler.maxTimeout", mainFS.Lookup("handler-max-timeout")); err != nil {
return cfg, err
}

mainFS.StringVar(&cfg.Controller.Webhook.CertDir, "admission-webhook-cert-dir", "", "Admission webhook certificate directory to search for tls.crt and tls.key files")

if err := viper.BindPFlag("controller.webhook.certDir", mainFS.Lookup("admission-webhook-cert-dir")); err != nil {
Expand Down
5 changes: 4 additions & 1 deletion docs/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,13 @@ It can be handy to disrupt packets on pod initialization, meaning before contain
- the chaos-controller will inject an init container named `chaos-handler` as the first init container in your pod
- this init container is lightweight and does nothing but wait for a `SIGUSR1` signal to complete successfully
- thus, until a disruption targets the pod with the init container, it will do nothing but wait until it times out. The init container has no k8s api access, and does no proactive searching for existing disruption resources.
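- you can configure how long the init container will wait for the `SIGUSR1` signal by setting an annotation with the key `chaos.datadoghq.com/disrupt-on-init-timeout` on your pod. The value should be a duration string (e.g. `30m`), which we'll use to set the timeout for the init container. The default timeout is 10m. If the requested timeout exceeds the controller's configured maximum (`handler.maxTimeout`), the pod is rejected; if the value cannot be parsed as a duration, a warning is logged and the default timeout is used.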
- the `chaos-handler` init container will automatically exit and fail if no signal is received
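- If you want the container to exit 0 on timeout, set an annotation with the key `chaos.datadoghq.com/disrupt-on-init-succeed-on-timeout` on your pod. Only the presence of the annotation is checked, so any value (including `false`) enables this behavior.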
- apply your disruption [with the init mode on](../examples/on_init.yaml)
- the chaos pod will inject the disruption and release your pod from the pending state

Note that in this mode, only pending pods with a running `chaos-handler` init container and matching your labels + the special label specified above will be targeted. The `chaos-handler` init container will automatically exit and fail if no signal is received within the specified timeout (default is 1 minute).
Note that in this mode, only pending pods with a running `chaos-handler` init container and matching your labels + the special label specified above will be targeted.

## Notifier

Expand Down
1 change: 1 addition & 0 deletions examples/on_init.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ apiVersion: chaos.datadoghq.com/v1beta1
kind: Disruption
metadata:
name: on-init
namespace: chaos-demo
spec:
onInit: true # apply the disruption on pod initialization (it requires the pod to target to be redeployed with the chaos.datadoghq.com/disrupt-on-init label to be held in the pending state)
level: pod
Expand Down
9 changes: 5 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,10 +353,11 @@ func main() {
// register chaos handler init container mutating webhook
mgr.GetWebhookServer().Register("/mutate-v1-pod-chaos-handler-init-container", &webhook.Admission{
Handler: &chaoswebhook.ChaosHandlerMutator{
Client: mgr.GetClient(),
Log: logger,
Image: cfg.Handler.Image,
Timeout: cfg.Handler.Timeout,
Client: mgr.GetClient(),
Log: logger,
Image: cfg.Handler.Image,
Timeout: cfg.Handler.Timeout,
MaxTimeout: cfg.Handler.MaxTimeout,
},
})
}
Expand Down
39 changes: 33 additions & 6 deletions webhook/chaos_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package webhook
import (
"context"
"encoding/json"
"fmt"
"net/http"
"time"

Expand All @@ -18,11 +19,12 @@ import (
)

// ChaosHandlerMutator is the admission handler registered for the chaos handler
// init container mutating webhook. Timeout is the default value passed to the
// init container's --timeout argument; MaxTimeout is the upper bound accepted for
// the per-pod override read from the chaos.datadoghq.com/disrupt-on-init-timeout
// annotation.
type ChaosHandlerMutator struct {
Client client.Client
Log *zap.SugaredLogger
Image string
Timeout time.Duration
decoder *admission.Decoder
Client client.Client
Log *zap.SugaredLogger
Image string
Timeout time.Duration
MaxTimeout time.Duration
decoder *admission.Decoder
}

func (m *ChaosHandlerMutator) InjectDecoder(d *admission.Decoder) error {
Expand Down Expand Up @@ -58,6 +60,30 @@ func (m *ChaosHandlerMutator) Handle(ctx context.Context, req admission.Request)
podName = pod.ObjectMeta.GenerateName
}

handlerTimeout := m.Timeout.String()
succeedOnTimeout := ""

timeoutLabel, ok := pod.Annotations["chaos.datadoghq.com/disrupt-on-init-timeout"]
if ok {
if timeoutOverride, err := time.ParseDuration(timeoutLabel); err == nil {
if timeoutOverride > m.MaxTimeout {
m.Log.Warnw("pod was rejected due to handler timeout set too high", "timeout", timeoutOverride.String(), "maxTimeout", m.MaxTimeout.String())
err = fmt.Errorf("you have requested a handler timeout of %s but the maximum allowed timeout is %s", timeoutOverride.String(), m.MaxTimeout.String())

return admission.Errored(http.StatusBadRequest, err)
}

handlerTimeout = timeoutOverride.String()
} else if err != nil {
m.Log.Warnw("could not parse user's disrupt-on-init-timeout annotation", "err", err, "pod", podName, "namespace", req.Namespace)
}
}

_, ok = pod.Annotations["chaos.datadoghq.com/disrupt-on-init-succeed-on-timeout"]
if ok {
succeedOnTimeout = "--succeed-on-timeout"
}

m.Log.Infow("injecting chaos handler init container into targeted pod", "pod", podName, "namespace", req.Namespace)

// build chaos handler init container
Expand All @@ -67,7 +93,8 @@ func (m *ChaosHandlerMutator) Handle(ctx context.Context, req admission.Request)
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{
"--timeout",
m.Timeout.String(),
handlerTimeout,
succeedOnTimeout,
},
}

Expand Down

0 comments on commit e7f2698

Please sign in to comment.