From 1ad8b12e04db0e634fafc6356262eb22f54adc05 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:01:25 +0000 Subject: [PATCH 01/17] install Go otel packages --- go.mod | 10 ++++++++++ go.sum | 28 +++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 761456f33..319dbbd6d 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,10 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.2 + go.opentelemetry.io/otel v1.15.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.15.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.15.0 + go.opentelemetry.io/otel/sdk v1.15.0 golang.org/x/crypto v0.8.0 golang.org/x/mod v0.10.0 golang.org/x/net v0.9.0 @@ -46,6 +50,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver v3.5.1+incompatible // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/coredns/caddy v1.1.0 // indirect github.com/coredns/corefile-migration v1.0.20 // indirect @@ -59,6 +64,7 @@ require ( github.com/evanphx/json-patch v5.6.0+incompatible // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.2.3 // indirect github.com/go-openapi/jsonpointer v0.19.5 // indirect github.com/go-openapi/jsonreference v0.20.0 // indirect @@ -78,6 +84,7 @@ require ( github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 // indirect github.com/google/uuid v1.3.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.13 // indirect @@ -115,6 +122,9 @@ require ( github.com/subosito/gotenv v1.4.1 // indirect github.com/valyala/fastjson v1.6.3 // indirect go.opencensus.io v0.24.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.15.0 // indirect + go.opentelemetry.io/otel/trace v1.15.0 // indirect + go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect go.uber.org/zap v1.21.0 // indirect diff --git a/go.sum b/go.sum index 32e039566..d92efa5d6 100644 --- a/go.sum +++ b/go.sum @@ -93,6 +93,8 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= +github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -179,6 +181,8 @@ github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr 
v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A= github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa2oG4= github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= @@ -200,6 +204,8 @@ github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zV github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ= +github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -248,6 +254,7 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v45 v45.2.0 h1:5oRLszbrkvxDDqBCNj2hjDZMKmvexaZ1xw/FCD+K3FI= @@ -294,6 +301,8 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmg github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 h1:BZHcxBETFHIdVyhyEfOvn/RdU/QGdLI4y34qQGjGWO0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -549,13 +558,27 @@ go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/otel v1.15.0 h1:NIl24d4eiLJPM0vKn4HjLYM+UZf6gSfi9Z+NmCxkWbk= +go.opentelemetry.io/otel v1.15.0/go.mod 
h1:qfwLEbWhLPk5gyWrne4XnF0lC8wtywbuJbgfAE3zbek= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.15.0 h1:ZSdnH1x5Gm/eUFNQquwSt4/LMCOqS6KPlI9qaTKx5Ho= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.15.0/go.mod h1:uOTV75+LOzV+ODmL8ahRLWkFA3eQcSC2aAsbxIu4duk= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.15.0 h1:rk5I7PaOk5NGQHfHR2Rz6MgdA8AYQSHwsigFsOxEC1c= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.15.0/go.mod h1:pvkFJxNUXyJ5i8u6m8NIcqkoOf/65VM2mSyBbBJfeVQ= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.15.0 h1:rHD0vfQbtki6/FnsMzTpAOgdv+Ku+T6R47MZXmgelf8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.15.0/go.mod h1:RPagkaZrpwD+rSwQjzos6rBLsHOvenOqufCj4/7I46E= +go.opentelemetry.io/otel/sdk v1.15.0 h1:jZTCkRRd08nxD6w7rIaZeDNGZGGQstH3SfLQ3ZsKICk= +go.opentelemetry.io/otel/sdk v1.15.0/go.mod h1:XDEMrYWzJ4YlC17i6Luih2lwDw2j6G0PkUfr1ZqE+rQ= +go.opentelemetry.io/otel/trace v1.15.0 h1:5Fwje4O2ooOxkfyqI/kJwxWotggDLix4BSAvpE1wlpo= +go.opentelemetry.io/otel/trace v1.15.0/go.mod h1:CUsmE2Ht1CRkvE8OsMESvraoZrrcgD1J2W8GV1ev0Y4= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= +go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJPI1Nnw= +go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= -go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= @@ -672,6 +695,7 @@ golang.org/x/oauth2 v0.0.0-20201109201403-9fd604954f58/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20201208152858-08078c50e5b5/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= @@ -905,6 +929,7 @@ google.golang.org/genproto v0.0.0-20201210142538-e3217bee35cc/go.mod h1:FWY/as6D google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210108203827-ffc7fda8c3d7/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210226172003-ab064af71705/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20211118181313-81c1377c94b1/go.mod 
h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= @@ -927,6 +952,7 @@ google.golang.org/grpc v1.34.0/go.mod h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA5 google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= +google.golang.org/grpc v1.42.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= google.golang.org/grpc v1.54.0 h1:EhTqbhiYeixwWQtAEZAxmV9MGqcjEU2mFx52xCzNyag= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= From f955a09d8ccd15e42cc3e616c51c13fc7ff85c09 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:07:16 +0000 Subject: [PATCH 02/17] set up TracerProvider + tracer object --- pkg/otel/tracing.go | 106 +++++++++++++++++++++++++++++++++++ util/telemetry/oteltracer.go | 11 ++++ 2 files changed, 117 insertions(+) create mode 100644 pkg/otel/tracing.go create mode 100644 util/telemetry/oteltracer.go diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go new file mode 100644 index 000000000..e2b32fafb --- /dev/null +++ b/pkg/otel/tracing.go @@ -0,0 +1,106 @@ +package otel + +import ( + "context" + "time" + + "github.com/go-logr/logr" + "github.com/pkg/errors" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "k8s.io/client-go/pkg/version" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.4.0" +) + +func RegisterTracing(ctx context.Context, log logr.Logger) error { + + tracerProvider, err := SetUpTracing(ctx) + if err != nil { + return err + } + + // Safely shut down the tracer provider when context terminates + go func() { + <-ctx.Done() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := tracerProvider.Shutdown(ctx); err != nil { + log.Error(err, "failed to shut down tracer provider") + } + }() + + return nil +} + +func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { + + conn, err := grpc.DialContext(ctx, "opentelemetry-collector:4317", + // Using non-TLS connection for dev environment + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + + if err != nil { + return nil, errors.Wrap(err, "failed to create gRPC connection to collector for opentelemetry") + } + + // Set up a trace exporter + traceExporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithGRPCConn(conn)) + + if err != nil { + return nil, errors.Wrap(err, "failed to create trace exporter for opentelemetry") + } + + return traceExporter, nil +} + +func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { + + traceExporter, err := newExporter(ctx) + + if err != nil { + return nil, err + } + + // 
labels/tags/res common to all traces + // TODO: consider to add more fields + resource, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String("capg"), + attribute.String("exporter", "otlpgrpc"), + attribute.String("version", version.Get().String()), + ), + ) + + if err != nil { + return nil, errors.Wrap(err, "failed to create opentelemetry resource") + } + + traceProvider := trace.NewTracerProvider( + trace.WithBatcher(traceExporter), + trace.WithResource(resource), + // TODO: dynamic sampling rate? + // sampling rate based on parent span = 60% + trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(0.6))), + ) + + otel.SetTracerProvider(traceProvider) + + otel.SetTextMapPropagator( + propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + ), + ) + + return traceProvider, nil +} diff --git a/util/telemetry/oteltracer.go b/util/telemetry/oteltracer.go new file mode 100644 index 000000000..69574a3e9 --- /dev/null +++ b/util/telemetry/oteltracer.go @@ -0,0 +1,11 @@ +package telemetry + +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" +) + +// default Tracer +func Tracer() trace.Tracer { + return otel.Tracer("capg") +} From dcfb15b42fa8e17ecaa722e102c3ab3499b69719 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:07:44 +0000 Subject: [PATCH 03/17] add enableTracing flag --- main.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/main.go b/main.go index caa92ea00..b1bbd031f 100644 --- a/main.go +++ b/main.go @@ -40,6 +40,7 @@ import ( infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" expcontrollers "sigs.k8s.io/cluster-api-provider-gcp/exp/controllers" "sigs.k8s.io/cluster-api-provider-gcp/feature" + ot "sigs.k8s.io/cluster-api-provider-gcp/pkg/otel" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" "sigs.k8s.io/cluster-api-provider-gcp/version" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -84,6 +85,7 @@ var ( leaderElectionLeaseDuration time.Duration leaderElectionRenewDeadline time.Duration leaderElectionRetryPeriod time.Duration + enableTracing bool ) func main() { @@ -147,6 +149,13 @@ func main() { // Setup the context that's going to be used in controllers and for the manager. ctx := ctrl.SetupSignalHandler() + if enableTracing { + if err := ot.RegisterTracing(ctx, setupLog); err != nil { + setupLog.Error(err, "unable to set up tracing") + os.Exit(1) + } + } + if setupErr := setupReconcilers(ctx, mgr); setupErr != nil { setupLog.Error(err, "unable to setup reconcilers") os.Exit(1) @@ -366,5 +375,11 @@ func initFlags(fs *pflag.FlagSet) { "The maximum duration a reconcile loop can run (e.g. 
90m)", ) + fs.BoolVar(&enableTracing, + "enable-tracing", + false, + "Enable collecting and sending traces to opentelemetry-collector service", + ) + feature.MutableGates.AddFlag(fs) } From 49c1ea2b60050b3522f8ce63b7d82d0809f5a9f3 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:08:13 +0000 Subject: [PATCH 04/17] instrument GCP machine + GCPCluster controllers --- controllers/gcpcluster_controller.go | 34 +++++++++++++++++++++++++++ controllers/gcpmachine_controller.go | 35 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/controllers/gcpcluster_controller.go b/controllers/gcpcluster_controller.go index 0e53c08e2..cadce0e74 100644 --- a/controllers/gcpcluster_controller.go +++ b/controllers/gcpcluster_controller.go @@ -23,6 +23,8 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" @@ -32,6 +34,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/networks" "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/subnets" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -62,6 +65,14 @@ type GCPClusterReconciler struct { func (r *GCPClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPCluster") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPCluster"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1.GCPCluster{}). 
@@ -106,6 +117,17 @@ func (r *GCPClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) defer cancel() log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPCluster"), + ), + ) + defer span.End() + gcpCluster := &infrav1.GCPCluster{} err := r.Get(ctx, req.NamespacedName, gcpCluster) if err != nil { @@ -161,6 +183,12 @@ func (r *GCPClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *GCPClusterReconciler) reconcile(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPCluster") controllerutil.AddFinalizer(clusterScope.GCPCluster, infrav1.ClusterFinalizer) @@ -227,6 +255,12 @@ func (r *GCPClusterReconciler) reconcile(ctx context.Context, clusterScope *scop func (r *GCPClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete GCPCluster") reconcilers := []cloud.Reconciler{ diff --git a/controllers/gcpmachine_controller.go b/controllers/gcpmachine_controller.go index 924faec8f..49fd9bf3b 100644 --- a/controllers/gcpmachine_controller.go +++ b/controllers/gcpmachine_controller.go @@ -22,11 +22,14 @@ import ( "time" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/scope" "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/instances" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" capierrors "sigs.k8s.io/cluster-api/errors" "sigs.k8s.io/cluster-api/util" @@ -57,6 +60,15 @@ type GCPMachineReconciler struct { func (r *GCPMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := ctrl.LoggerFrom(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPMachine"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1.GCPMachine{}). 
@@ -136,6 +148,17 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) defer cancel() log := ctrl.LoggerFrom(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPMachine"), + ), + ) + defer span.End() + gcpMachine := &infrav1.GCPMachine{} err := r.Get(ctx, req.NamespacedName, gcpMachine) if err != nil { @@ -218,6 +241,12 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *GCPMachineReconciler) reconcile(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPMachine") controllerutil.AddFinalizer(machineScope.GCPMachine, infrav1.MachineFinalizer) @@ -252,6 +281,12 @@ func (r *GCPMachineReconciler) reconcile(ctx context.Context, machineScope *scop func (r *GCPMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete GCPMachine") if err := instances.New(machineScope).Delete(ctx); err != nil { From deb44dc863d5e68a3afe7239436e7b3a19e73474 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:10:35 +0000 Subject: [PATCH 05/17] helm charts for jaeger all-in-one + otel collector --- .../jaeger/fetch-jaeger-resources.sh | 31 ++++++++ .../opentelemetry/fetch-otel-resources.sh | 34 +++++++++ hack/observability/opentelemetry/values.yaml | 70 +++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100755 hack/observability/jaeger/fetch-jaeger-resources.sh create mode 100644 hack/observability/opentelemetry/fetch-otel-resources.sh create mode 100644 hack/observability/opentelemetry/values.yaml diff --git a/hack/observability/jaeger/fetch-jaeger-resources.sh b/hack/observability/jaeger/fetch-jaeger-resources.sh new file mode 100755 index 000000000..6d053a55b --- /dev/null +++ b/hack/observability/jaeger/fetch-jaeger-resources.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# "Borrow" with permission from CAPZ team: https://github.com/kubernetes-sigs/cluster-api-provider-azure/blob/main/hack/observability/jaeger/fetch-jaeger-resources.sh + +set -o errexit +set -o nounset +set -o pipefail + +CHART_RELEASE=${CHART_RELEASE:-0.1.8} +JAEGER_ROOT=$(dirname "${BASH_SOURCE[0]}") +CHART_ROOT=$JAEGER_ROOT/chart + +rm -rf "$CHART_ROOT" +# "tar" has no POSIX standard, so use only basic options and test with both BSD and GNU. 
+wget -qO- https://github.com/hansehe/jaeger-all-in-one/raw/master/helm/charts/jaeger-all-in-one-"$CHART_RELEASE".tgz \ + | tar xvz -C "$JAEGER_ROOT" +mv "$JAEGER_ROOT"/jaeger-all-in-one "$CHART_ROOT" diff --git a/hack/observability/opentelemetry/fetch-otel-resources.sh b/hack/observability/opentelemetry/fetch-otel-resources.sh new file mode 100644 index 000000000..bd346cdea --- /dev/null +++ b/hack/observability/opentelemetry/fetch-otel-resources.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# "Borrow" with permission from CAPZ team: https://github.com/kubernetes-sigs/cluster-api-provider-azure/blob/main/hack/observability/opentelemetry/fetch-otel-resources.sh + + +set -o errexit +set -o nounset +set -o pipefail + +CHART_RELEASE=${CHART_RELEASE:-0.53.0} +OTEL_ROOT=$(dirname "${BASH_SOURCE[0]}") +CHART_ROOT=$OTEL_ROOT/chart + + +rm -rf "$CHART_ROOT" +# # "tar" has no POSIX standard, so use only basic options and test with both BSD and GNU. +wget -qO- https://github.com/open-telemetry/opentelemetry-helm-charts/releases/download/opentelemetry-collector-"$CHART_RELEASE"/opentelemetry-collector-"$CHART_RELEASE".tgz \ + | tar xvz -C "$OTEL_ROOT" --exclude "ci" --exclude "examples" +mv "$OTEL_ROOT"/opentelemetry-collector "$CHART_ROOT" +wget -q https://raw.githubusercontent.com/open-telemetry/opentelemetry-helm-charts/main/LICENSE -P "$CHART_ROOT" diff --git a/hack/observability/opentelemetry/values.yaml b/hack/observability/opentelemetry/values.yaml new file mode 100644 index 000000000..67850766f --- /dev/null +++ b/hack/observability/opentelemetry/values.yaml @@ -0,0 +1,70 @@ +mode: "deployment" + +config: + receivers: + jaeger: null # disable Jaeger receiver + #otlp: # using default configs + prometheus: null # disable Prometheus receiver + zipkin: null # disable Zipkin receiver + + # (2) Processors + processors: + memory_limiter: + limit_percentage: 50 + check_interval: 1s + spike_limit_percentage: 30 + batch: + send_batch_size: 8192 + + # (3) exporter + exporters: + jaeger: + endpoint: jaeger-all-in-one:14250 + tls: + insecure: true + #export to Cloud Trace on GCP + googlecloud: + # disable metric + logging collections + metric: null + log: null + # use default configs for trace + #trace: + #use_insecure: true + + # (4) service + service: + # A pipeline = a set of receivers, processors and exporters. 
+ pipelines: + metrics: null + logs: null + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [jaeger, googlecloud] + +image: + repository: otel/opentelemetry-collector-contrib + pullPolicy: IfNotPresent + tag: 0.75.0 +command: + name: otelcol-contrib + +# Configuration for connecting to GCP's Cloud Trace +extraEnvs: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /home/.gcp/credentials.json + +# retrieve GOOGLE_APPLICATION_CREDENTIALS env var from Secret +extraVolumes: + - name: credentials + secret: + secretName: manager-bootstrap-credentials + +extraVolumeMounts: + - name: credentials + mountPath: /home/.gcp + +ports: + jaeger-thrift: null + jaeger-grpc: null + zipkin: null From bb98f0edb2f9d05265e769ad1b80ed957184aee9 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:10:49 +0000 Subject: [PATCH 06/17] templates to set up dev env --- hack/observability/kustomization.yaml | 6 ++++++ .../controller-manager-dev-env-otel-patch.yml | 17 +++++++++++++++++ .../secrets-dev-env-otel-patch.yaml | 5 +++++ 3 files changed, 28 insertions(+) create mode 100644 hack/observability/kustomization.yaml create mode 100644 hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml create mode 100644 hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml diff --git a/hack/observability/kustomization.yaml b/hack/observability/kustomization.yaml new file mode 100644 index 000000000..87bd08476 --- /dev/null +++ b/hack/observability/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - ../../config/default + +patchesStrategicMerge: + - opentelemetry/secrets-dev-env-otel-patch.yaml + - opentelemetry/controller-manager-dev-env-otel-patch.yml diff --git a/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml b/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml new file mode 100644 index 000000000..e6d92601c --- /dev/null +++ b/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: capg-system + name: capg-controller-manager +spec: + template: + spec: + # patch: ../../config/manager/manager.yaml + containers: + - name: manager + args: + - "--leader-elect" + - "--feature-gates=GKE=${EXP_CAPG_GKE:=false}" + - "--metrics-bind-addr=localhost:8080" + - "--v=${CAPG_LOGLEVEL:=0}" + - "--enable-tracing" diff --git a/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml b/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml new file mode 100644 index 000000000..9cdb836dc --- /dev/null +++ b/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: Secret +metadata: + name: manager-bootstrap-credentials + namespace: capg-system From bc4ea9fbee91a3c5188726ca6b5fa41fed1b774b Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:13:20 +0000 Subject: [PATCH 07/17] instrument cloud/scope --- cloud/scope/clients.go | 25 +++++++++++++++++++++++++ cloud/scope/cluster.go | 7 +++++++ cloud/scope/managedcluster.go | 7 +++++++ cloud/scope/managedcontrolplane.go | 7 +++++++ cloud/scope/managedmachinepool.go | 7 +++++++ 5 files changed, 53 insertions(+) diff --git a/cloud/scope/clients.go b/cloud/scope/clients.go index e70db2a74..bc5353889 100644 --- a/cloud/scope/clients.go +++ b/cloud/scope/clients.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/pkg/version" "k8s.io/client-go/util/flowcontrol" infrav1 
"sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -89,6 +90,12 @@ func defaultClientOptions(ctx context.Context, credentialsRef *infrav1.ObjectRef } func newComputeService(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client) (*compute.Service, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newComputeService", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -103,6 +110,12 @@ func newComputeService(ctx context.Context, credentialsRef *infrav1.ObjectRefere } func newClusterManagerClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client) (*container.ClusterManagerClient, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newClusterManagerClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -117,6 +130,12 @@ func newClusterManagerClient(ctx context.Context, credentialsRef *infrav1.Object } func newIamCredentialsClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client) (*credentials.IamCredentialsClient, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newIamCredentialsClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -131,6 +150,12 @@ func newIamCredentialsClient(ctx context.Context, credentialsRef *infrav1.Object } func newInstanceGroupManagerClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client) (*computerest.InstanceGroupManagersClient, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newInstanceGroupManagerClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) diff --git a/cloud/scope/cluster.go b/cloud/scope/cluster.go index 831eb728c..7f170f9ee 100644 --- a/cloud/scope/cluster.go +++ b/cloud/scope/cluster.go @@ -27,6 +27,7 @@ import ( "k8s.io/utils/pointer" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" @@ -43,6 +44,12 @@ type ClusterScopeParams struct { // NewClusterScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. 
func NewClusterScope(ctx context.Context, params ClusterScopeParams) (*ClusterScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clusterScope.NewClusterScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedcluster.go b/cloud/scope/managedcluster.go index 1ace53d87..38cb5b191 100644 --- a/cloud/scope/managedcluster.go +++ b/cloud/scope/managedcluster.go @@ -27,6 +27,7 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" @@ -44,6 +45,12 @@ type ManagedClusterScopeParams struct { // NewManagedClusterScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. func NewManagedClusterScope(ctx context.Context, params ManagedClusterScopeParams) (*ManagedClusterScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedClusterScope.NewManagedClusterScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedcontrolplane.go b/cloud/scope/managedcontrolplane.go index cfaec26b1..a59f1645c 100644 --- a/cloud/scope/managedcontrolplane.go +++ b/cloud/scope/managedcontrolplane.go @@ -21,6 +21,7 @@ import ( "fmt" "sigs.k8s.io/cluster-api-provider-gcp/util/location" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/conditions" @@ -52,6 +53,12 @@ type ManagedControlPlaneScopeParams struct { // NewManagedControlPlaneScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. func NewManagedControlPlaneScope(ctx context.Context, params ManagedControlPlaneScopeParams) (*ManagedControlPlaneScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedControlPlaneScope.NewManagedControlPlaneScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedmachinepool.go b/cloud/scope/managedmachinepool.go index 7f0dea927..a7b49d8ac 100644 --- a/cloud/scope/managedmachinepool.go +++ b/cloud/scope/managedmachinepool.go @@ -22,6 +22,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud" "sigs.k8s.io/cluster-api-provider-gcp/util/location" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/conditions" @@ -51,6 +52,12 @@ type ManagedMachinePoolScopeParams struct { // NewManagedMachinePoolScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. 
func NewManagedMachinePoolScope(ctx context.Context, params ManagedMachinePoolScopeParams) (*ManagedMachinePoolScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedMachinePoolScope.NewManagedMachinePoolScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } From eb25369063a3c6e78a1fa8bf9902c130e3450481 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:04 +0000 Subject: [PATCH 08/17] instrument exp/controllers --- .../gcpmanagedcluster_controller.go | 33 +++++++++++++++++++ .../gcpmanagedcontrolplane_controller.go | 33 +++++++++++++++++++ .../gcpmanagedmachinepool_controller.go | 33 +++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/exp/controllers/gcpmanagedcluster_controller.go b/exp/controllers/gcpmanagedcluster_controller.go index 80d4d5fe3..431444e63 100644 --- a/exp/controllers/gcpmanagedcluster_controller.go +++ b/exp/controllers/gcpmanagedcluster_controller.go @@ -24,6 +24,8 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" @@ -33,6 +35,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/subnets" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -68,6 +71,16 @@ func (r *GCPManagedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re log := log.FromContext(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPManagedCluster"), + ), + ) + defer span.End() + gcpCluster := &infrav1exp.GCPManagedCluster{} err := r.Get(ctx, req.NamespacedName, gcpCluster) if err != nil { @@ -142,6 +155,14 @@ func (r *GCPManagedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re func (r *GCPManagedClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := ctrl.LoggerFrom(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedClusterReconciler"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1exp.GCPManagedCluster{}). 
@@ -169,6 +190,12 @@ func (r *GCPManagedClusterReconciler) SetupWithManager(ctx context.Context, mgr func (r *GCPManagedClusterReconciler) reconcile(ctx context.Context, clusterScope *scope.ManagedClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcluster") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedCluster") controllerutil.AddFinalizer(clusterScope.GCPManagedCluster, infrav1exp.ClusterFinalizer) @@ -226,6 +253,12 @@ func (r *GCPManagedClusterReconciler) reconcile(ctx context.Context, clusterScop func (r *GCPManagedClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ManagedClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcluster", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete GCPManagedCluster") if clusterScope.GCPManagedControlPlane != nil { diff --git a/exp/controllers/gcpmanagedcontrolplane_controller.go b/exp/controllers/gcpmanagedcontrolplane_controller.go index 0e88d7e78..9d25abbac 100644 --- a/exp/controllers/gcpmanagedcontrolplane_controller.go +++ b/exp/controllers/gcpmanagedcontrolplane_controller.go @@ -21,6 +21,8 @@ import ( "fmt" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "sigs.k8s.io/cluster-api/util/annotations" "github.com/pkg/errors" @@ -30,6 +32,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/container/clusters" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -63,6 +66,14 @@ type GCPManagedControlPlaneReconciler struct { func (r *GCPManagedControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPManagedControlPlane") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedControlPlane"), + ), + ) + defer span.End() + gcpManagedControlPlane := &infrav1exp.GCPManagedControlPlane{} c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). 
@@ -90,6 +101,16 @@ func (r *GCPManagedControlPlaneReconciler) Reconcile(ctx context.Context, req ct log := ctrl.LoggerFrom(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPManagedControlPlane"), + ), + ) + defer span.End() + // Get the control plane instance gcpManagedControlPlane := &infrav1exp.GCPManagedControlPlane{} if err := r.Client.Get(ctx, req.NamespacedName, gcpManagedControlPlane); err != nil { @@ -154,6 +175,12 @@ func (r *GCPManagedControlPlaneReconciler) Reconcile(ctx context.Context, req ct func (r *GCPManagedControlPlaneReconciler) reconcile(ctx context.Context, managedControlPlaneScope *scope.ManagedControlPlaneScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcontrolplane") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedControlPlane") controllerutil.AddFinalizer(managedControlPlaneScope.GCPManagedControlPlane, infrav1exp.ManagedControlPlaneFinalizer) @@ -192,6 +219,12 @@ func (r *GCPManagedControlPlaneReconciler) reconcile(ctx context.Context, manage func (r *GCPManagedControlPlaneReconciler) reconcileDelete(ctx context.Context, managedControlPlaneScope *scope.ManagedControlPlaneScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcontrolplane", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Deleting GCPManagedControlPlane") reconcilers := map[string]cloud.ReconcilerWithResult{ diff --git a/exp/controllers/gcpmanagedmachinepool_controller.go b/exp/controllers/gcpmanagedmachinepool_controller.go index 537a79658..9f079875c 100644 --- a/exp/controllers/gcpmanagedmachinepool_controller.go +++ b/exp/controllers/gcpmanagedmachinepool_controller.go @@ -24,6 +24,8 @@ import ( "github.com/go-logr/logr" "github.com/googleapis/gax-go/v2/apierror" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc/codes" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -41,6 +43,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/scope" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/predicates" @@ -150,6 +153,14 @@ func managedControlPlaneToManagedMachinePoolMapFunc(c client.Client, gvk schema. 
func (r *GCPManagedMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPManagedMachinePool") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedMachinePool"), + ), + ) + defer span.End() + gvk, err := apiutil.GVKForObject(new(infrav1exp.GCPManagedMachinePool), mgr.GetScheme()) if err != nil { return errors.Wrapf(err, "failed to find GVK for GCPManagedMachinePool") @@ -228,6 +239,16 @@ func (r *GCPManagedMachinePoolReconciler) Reconcile(ctx context.Context, req ctr ctx, cancel := context.WithTimeout(ctx, reconciler.DefaultedLoopTimeout(r.ReconcileTimeout)) defer cancel() + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPManagedMachinePool"), + ), + ) + defer span.End() + log := ctrl.LoggerFrom(ctx) // Get the managed machine pool @@ -318,6 +339,12 @@ func (r *GCPManagedMachinePoolReconciler) Reconcile(ctx context.Context, req ctr func (r *GCPManagedMachinePoolReconciler) reconcile(ctx context.Context, managedMachinePoolScope *scope.ManagedMachinePoolScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedmachinepool") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedMachinePool") controllerutil.AddFinalizer(managedMachinePoolScope.GCPManagedMachinePool, infrav1exp.ManagedMachinePoolFinalizer) @@ -359,6 +386,12 @@ func (r *GCPManagedMachinePoolReconciler) reconcile(ctx context.Context, managed func (r *GCPManagedMachinePoolReconciler) reconcileDelete(ctx context.Context, managedMachinePoolScope *scope.ManagedMachinePoolScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedmachinepool", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Deleting GCPManagedMachinePool") reconcilers := map[string]cloud.ReconcilerWithResult{ From 1b98f8eef2393c2c8e155bbdb69810e61bd8d059 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:39 +0000 Subject: [PATCH 09/17] instrument cloud/services/compute --- cloud/services/compute/firewalls/reconcile.go | 11 +++ cloud/services/compute/instances/reconcile.go | 11 +++ .../compute/loadbalancers/reconcile.go | 71 +++++++++++++++++++ cloud/services/compute/networks/reconcile.go | 21 ++++++ cloud/services/compute/subnets/reconcile.go | 16 +++++ 5 files changed, 130 insertions(+) diff --git a/cloud/services/compute/firewalls/reconcile.go b/cloud/services/compute/firewalls/reconcile.go index 9c1daa07f..08df8b17f 100644 --- a/cloud/services/compute/firewalls/reconcile.go +++ b/cloud/services/compute/firewalls/reconcile.go @@ -21,11 +21,17 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster firewall compoenents. 
func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "firewalls.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling firewall resources") for _, spec := range s.scope.FirewallRulesSpec() { @@ -48,6 +54,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete cluster firewall compoenents. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "firewalls.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting firewall resources") for _, spec := range s.scope.FirewallRulesSpec() { diff --git a/cloud/services/compute/instances/reconcile.go b/cloud/services/compute/instances/reconcile.go index 55abcd460..e67d32448 100644 --- a/cloud/services/compute/instances/reconcile.go +++ b/cloud/services/compute/instances/reconcile.go @@ -29,11 +29,17 @@ import ( "k8s.io/utils/pointer" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile machine instance. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "instances.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling instance resources") instance, err := s.createOrGetInstance(ctx) @@ -96,6 +102,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete machine instance. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "instances.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting instance resources") instanceSpec := s.scope.InstanceSpec(log) diff --git a/cloud/services/compute/loadbalancers/reconcile.go b/cloud/services/compute/loadbalancers/reconcile.go index a16820fa6..d622b6a6e 100644 --- a/cloud/services/compute/loadbalancers/reconcile.go +++ b/cloud/services/compute/loadbalancers/reconcile.go @@ -23,11 +23,17 @@ import ( "google.golang.org/api/compute/v1" "k8s.io/utils/pointer" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster control-plane loadbalancer compoenents. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling loadbalancer resources") instancegroups, err := s.createOrGetInstanceGroups(ctx) @@ -60,6 +66,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete cluster control-plane loadbalancer compoenents. 
func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting loadbalancer resources") if err := s.deleteForwardingRule(ctx); err != nil { @@ -86,6 +97,11 @@ func (s *Service) Delete(ctx context.Context) error { } func (s *Service) createOrGetInstanceGroups(ctx context.Context) ([]*compute.InstanceGroup, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetInstanceGroups", + ) + defer span.End() + log := log.FromContext(ctx) fd := s.scope.FailureDomains() zones := make([]string, 0, len(fd)) @@ -130,6 +146,11 @@ func (s *Service) createOrGetInstanceGroups(ctx context.Context) ([]*compute.Ins } func (s *Service) createOrGetHealthCheck(ctx context.Context) (*compute.HealthCheck, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetHealthCheck", + ) + defer span.End() + log := log.FromContext(ctx) healthcheckSpec := s.scope.HealthCheckSpec() log.V(2).Info("Looking for healthcheck", "name", healthcheckSpec.Name) @@ -157,6 +178,11 @@ func (s *Service) createOrGetHealthCheck(ctx context.Context) (*compute.HealthCh } func (s *Service) createOrGetBackendService(ctx context.Context, instancegroups []*compute.InstanceGroup, healthcheck *compute.HealthCheck) (*compute.BackendService, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetBackendService", + ) + defer span.End() + log := log.FromContext(ctx) backends := make([]*compute.Backend, 0, len(instancegroups)) for _, group := range instancegroups { @@ -202,6 +228,11 @@ func (s *Service) createOrGetBackendService(ctx context.Context, instancegroups } func (s *Service) createOrGetTargetTCPProxy(ctx context.Context, service *compute.BackendService) (*compute.TargetTcpProxy, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetTargetTCPProxy", + ) + defer span.End() + log := log.FromContext(ctx) targetSpec := s.scope.TargetTCPProxySpec() targetSpec.Service = service.SelfLink @@ -229,6 +260,11 @@ func (s *Service) createOrGetTargetTCPProxy(ctx context.Context, service *comput } func (s *Service) createOrGetAddress(ctx context.Context) (*compute.Address, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetAddress", + ) + defer span.End() + log := log.FromContext(ctx) addrSpec := s.scope.AddressSpec() log.V(2).Info("Looking for address", "name", addrSpec.Name) @@ -259,6 +295,11 @@ func (s *Service) createOrGetAddress(ctx context.Context) (*compute.Address, err } func (s *Service) createForwardingRule(ctx context.Context, target *compute.TargetTcpProxy, addr *compute.Address) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createForwardingRule", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.ForwardingRuleSpec() key := meta.GlobalKey(spec.Name) @@ -289,6 +330,11 @@ func (s *Service) createForwardingRule(ctx context.Context, target *compute.Targ } func (s *Service) deleteForwardingRule(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteForwardingRule", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.ForwardingRuleSpec() key := meta.GlobalKey(spec.Name) @@ -303,6 +349,11 @@ func (s *Service) deleteForwardingRule(ctx context.Context) error { } func (s *Service) deleteAddress(ctx context.Context) 
error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteAddress", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.AddressSpec() key := meta.GlobalKey(spec.Name) @@ -316,6 +367,11 @@ func (s *Service) deleteAddress(ctx context.Context) error { } func (s *Service) deleteTargetTCPProxy(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteTargetTCPProxy", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.TargetTCPProxySpec() key := meta.GlobalKey(spec.Name) @@ -330,6 +386,11 @@ func (s *Service) deleteTargetTCPProxy(ctx context.Context) error { } func (s *Service) deleteBackendService(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteBackendService", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.BackendServiceSpec() key := meta.GlobalKey(spec.Name) @@ -344,6 +405,11 @@ func (s *Service) deleteBackendService(ctx context.Context) error { } func (s *Service) deleteHealthCheck(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteHealthCheck", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.HealthCheckSpec() key := meta.GlobalKey(spec.Name) @@ -358,6 +424,11 @@ func (s *Service) deleteHealthCheck(ctx context.Context) error { } func (s *Service) deleteInstanceGroups(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteInstanceGroups", + ) + defer span.End() + log := log.FromContext(ctx) for zone := range s.scope.Network().APIServerInstanceGroups { spec := s.scope.InstanceGroupSpec(zone) diff --git a/cloud/services/compute/networks/reconcile.go b/cloud/services/compute/networks/reconcile.go index 63c2dafce..46997e1f2 100644 --- a/cloud/services/compute/networks/reconcile.go +++ b/cloud/services/compute/networks/reconcile.go @@ -24,11 +24,17 @@ import ( "k8s.io/utils/pointer" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster network components. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling network resources") network, err := s.createOrGetNetwork(ctx) @@ -51,6 +57,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete cluster network components. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting network resources") networkKey := meta.GlobalKey(s.scope.NetworkName()) @@ -92,6 +103,11 @@ func (s *Service) Delete(ctx context.Context) error { // createOrGetNetwork creates a network if not exist otherwise return existing network. 
func (s *Service) createOrGetNetwork(ctx context.Context) (*compute.Network, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.createOrGetNetwork", + ) + defer span.End() + log := log.FromContext(ctx) log.V(2).Info("Looking for network", "name", s.scope.NetworkName()) networkKey := meta.GlobalKey(s.scope.NetworkName()) @@ -119,6 +135,11 @@ func (s *Service) createOrGetNetwork(ctx context.Context) (*compute.Network, err // createOrGetRouter creates a cloudnat router if not exist otherwise return the existing. func (s *Service) createOrGetRouter(ctx context.Context, network *compute.Network) (*compute.Router, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.createOrGetRouter", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.NatRouterSpec() log.V(2).Info("Looking for cloudnat router", "name", spec.Name) diff --git a/cloud/services/compute/subnets/reconcile.go b/cloud/services/compute/subnets/reconcile.go index 403c49fb7..54232cdd1 100644 --- a/cloud/services/compute/subnets/reconcile.go +++ b/cloud/services/compute/subnets/reconcile.go @@ -22,11 +22,17 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "google.golang.org/api/compute/v1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster network components. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.Reconcile", + ) + defer span.End() + logger := log.FromContext(ctx) logger.Info("Reconciling subnetwork resources") @@ -40,6 +46,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete deletes cluster subnetwork components. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.Delete", + ) + defer span.End() + logger := log.FromContext(ctx) for _, subnetSpec := range s.scope.SubnetSpecs() { logger.V(2).Info("Deleting a subnet", "name", subnetSpec.Name) @@ -56,6 +67,11 @@ func (s *Service) Delete(ctx context.Context) error { // createOrGetSubnets creates the subnetworks if they don't exist otherwise return the existing ones. 
func (s *Service) createOrGetSubnets(ctx context.Context) ([]*compute.Subnetwork, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.createOrGetSubnets", + ) + defer span.End() + logger := log.FromContext(ctx) subnets := []*compute.Subnetwork{} for _, subnetSpec := range s.scope.SubnetSpecs() { From ea54fe71dd25fe38abb94e451e6baf61a42b84a5 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:54 +0000 Subject: [PATCH 10/17] instrument cloud/services/container --- .../services/container/clusters/kubeconfig.go | 31 ++++++++++++++ .../services/container/clusters/reconcile.go | 31 ++++++++++++++ .../services/container/nodepools/reconcile.go | 41 +++++++++++++++++++ 3 files changed, 103 insertions(+) diff --git a/cloud/services/container/clusters/kubeconfig.go b/cloud/services/container/clusters/kubeconfig.go index 7c3554c32..51bb2544e 100644 --- a/cloud/services/container/clusters/kubeconfig.go +++ b/cloud/services/container/clusters/kubeconfig.go @@ -32,6 +32,7 @@ import ( "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/clientcmd/api" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/kubeconfig" "sigs.k8s.io/cluster-api/util/secret" ) @@ -42,6 +43,11 @@ const ( ) func (s *Service) reconcileKubeconfig(ctx context.Context, cluster *containerpb.Cluster, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.reconcileKubeconfig", + ) + defer span.End() + log.Info("Reconciling kubeconfig") clusterRef := types.NamespacedName{ Name: s.scope.Cluster.Name, @@ -72,6 +78,11 @@ func (s *Service) reconcileKubeconfig(ctx context.Context, cluster *containerpb. } func (s *Service) reconcileAdditionalKubeconfigs(ctx context.Context, cluster *containerpb.Cluster, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.reconcileAdditionalKubeconfigs", + ) + defer span.End() + log.Info("Reconciling additional kubeconfig") clusterRef := types.NamespacedName{ Name: s.scope.Cluster.Name + "-user", @@ -99,6 +110,11 @@ func (s *Service) reconcileAdditionalKubeconfigs(ctx context.Context, cluster *c } func (s *Service) createUserKubeconfigSecret(ctx context.Context, cluster *containerpb.Cluster, clusterRef *types.NamespacedName) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createUserKubeconfigSecret", + ) + defer span.End() + controllerOwnerRef := *metav1.NewControllerRef(s.scope.GCPManagedControlPlane, infrav1exp.GroupVersion.WithKind("GCPManagedControlPlane")) contextName := s.getKubeConfigContextName(false) @@ -137,6 +153,11 @@ func (s *Service) createUserKubeconfigSecret(ctx context.Context, cluster *conta } func (s *Service) createCAPIKubeconfigSecret(ctx context.Context, cluster *containerpb.Cluster, clusterRef *types.NamespacedName, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createCAPIKubeconfigSecret", + ) + defer span.End() + controllerOwnerRef := *metav1.NewControllerRef(s.scope.GCPManagedControlPlane, infrav1exp.GroupVersion.WithKind("GCPManagedControlPlane")) contextName := s.getKubeConfigContextName(false) @@ -174,6 +195,11 @@ func (s *Service) createCAPIKubeconfigSecret(ctx context.Context, cluster *conta } func (s *Service) updateCAPIKubeconfigSecret(ctx context.Context, configSecret *corev1.Secret) error { + ctx, span := telemetry.Tracer().Start( + ctx, 
"clusters.Services.updateCAPIKubeconfigSecret", + ) + defer span.End() + data, ok := configSecret.Data[secret.KubeconfigDataName] if !ok { return errors.Errorf("missing key %q in secret data", secret.KubeconfigDataName) @@ -241,6 +267,11 @@ func (s *Service) createBaseKubeConfig(contextName string, cluster *containerpb. } func (s *Service) generateToken(ctx context.Context) (string, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.generateToken", + ) + defer span.End() + req := &credentialspb.GenerateAccessTokenRequest{ Name: fmt.Sprintf("projects/-/serviceAccounts/%s", s.scope.GetCredential().ClientEmail), Scope: []string{ diff --git a/cloud/services/container/clusters/reconcile.go b/cloud/services/container/clusters/reconcile.go index 4317e058b..27539bcc8 100644 --- a/cloud/services/container/clusters/reconcile.go +++ b/cloud/services/container/clusters/reconcile.go @@ -30,6 +30,7 @@ import ( "google.golang.org/grpc/codes" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" ctrl "sigs.k8s.io/controller-runtime" @@ -38,6 +39,11 @@ import ( // Reconcile reconcile GKE cluster. func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx).WithValues("service", "container.clusters") log.Info("Reconciling cluster resources") @@ -177,6 +183,11 @@ func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { // Delete delete GKE cluster. func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx).WithValues("service", "container.clusters") log.Info("Deleting cluster resources") @@ -221,6 +232,11 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { } func (s *Service) describeCluster(ctx context.Context, log *logr.Logger) (*containerpb.Cluster, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.describeCluster", + ) + defer span.End() + getClusterRequest := &containerpb.GetClusterRequest{ Name: s.scope.ClusterFullName(), } @@ -240,6 +256,11 @@ func (s *Service) describeCluster(ctx context.Context, log *logr.Logger) (*conta } func (s *Service) createCluster(ctx context.Context, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createCluster", + ) + defer span.End() + nodePools, machinePools, _ := s.scope.GetAllNodePools(ctx) log.V(2).Info("Running pre-flight checks on machine pools before cluster creation") @@ -282,6 +303,11 @@ func (s *Service) createCluster(ctx context.Context, log *logr.Logger) error { } func (s *Service) updateCluster(ctx context.Context, updateClusterRequest *containerpb.UpdateClusterRequest, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.updateCluster", + ) + defer span.End() + _, err := s.scope.ManagedControlPlaneClient().UpdateCluster(ctx, updateClusterRequest) if err != nil { log.Error(err, "Error updating GKE cluster", "name", s.scope.ClusterName()) @@ -292,6 +318,11 @@ func (s *Service) updateCluster(ctx context.Context, updateClusterRequest *conta } func (s *Service) deleteCluster(ctx context.Context, log 
*logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.deleteCluster", + ) + defer span.End() + deleteClusterRequest := &containerpb.DeleteClusterRequest{ Name: s.scope.ClusterFullName(), } diff --git a/cloud/services/container/nodepools/reconcile.go b/cloud/services/container/nodepools/reconcile.go index f315f339c..3fa73b261 100644 --- a/cloud/services/container/nodepools/reconcile.go +++ b/cloud/services/container/nodepools/reconcile.go @@ -23,6 +23,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud" "sigs.k8s.io/cluster-api-provider-gcp/util/resourceurl" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "google.golang.org/api/iterator" "google.golang.org/grpc/codes" @@ -45,6 +46,11 @@ import ( // Reconcile reconcile GKE node pool. func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling node pool resources") @@ -180,6 +186,11 @@ func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { // Delete delete GKE node pool. func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting node pool resources") @@ -223,6 +234,11 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { } func (s *Service) describeNodePool(ctx context.Context, log *logr.Logger) (*containerpb.NodePool, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.describeNodePool", + ) + defer span.End() + getNodePoolRequest := &containerpb.GetNodePoolRequest{ Name: s.scope.NodePoolFullName(), } @@ -242,6 +258,11 @@ func (s *Service) describeNodePool(ctx context.Context, log *logr.Logger) (*cont } func (s *Service) getInstances(ctx context.Context, nodePool *containerpb.NodePool) ([]*computepb.ManagedInstance, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.getInstances", + ) + defer span.End() + instances := []*computepb.ManagedInstance{} for _, url := range nodePool.InstanceGroupUrls { @@ -271,6 +292,11 @@ func (s *Service) getInstances(ctx context.Context, nodePool *containerpb.NodePo } func (s *Service) createNodePool(ctx context.Context, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.createNodePool", + ) + defer span.End() + log.V(2).Info("Running pre-flight checks on machine pool before creation") if err := shared.ManagedMachinePoolPreflightCheck(s.scope.GCPManagedMachinePool, s.scope.MachinePool, s.scope.Region()); err != nil { return fmt.Errorf("preflight checks on machine pool before creating: %w", err) @@ -291,6 +317,11 @@ func (s *Service) createNodePool(ctx context.Context, log *logr.Logger) error { } func (s *Service) updateNodePoolVersionOrImage(ctx context.Context, updateNodePoolRequest *containerpb.UpdateNodePoolRequest) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolVersionOrImage", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().UpdateNodePool(ctx, updateNodePoolRequest) if err != nil { return err @@ -300,6 +331,11 @@ func (s *Service) updateNodePoolVersionOrImage(ctx context.Context, updateNodePo } func (s *Service) updateNodePoolAutoscaling(ctx context.Context, setNodePoolAutoscalingRequest *containerpb.SetNodePoolAutoscalingRequest) error { + ctx, 
span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolAutoscaling", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().SetNodePoolAutoscaling(ctx, setNodePoolAutoscalingRequest) if err != nil { return err @@ -309,6 +345,11 @@ func (s *Service) updateNodePoolAutoscaling(ctx context.Context, setNodePoolAuto } func (s *Service) updateNodePoolSize(ctx context.Context, setNodePoolSizeRequest *containerpb.SetNodePoolSizeRequest) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolSize", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().SetNodePoolSize(ctx, setNodePoolSizeRequest) if err != nil { return err From 560fd0a711c7f53b78d78fddf2d7a372fdb33a4c Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 12 May 2023 15:02:33 +0000 Subject: [PATCH 11/17] add sampling rate --- main.go | 9 ++++++++- pkg/otel/tracing.go | 11 +++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/main.go b/main.go index b1bbd031f..d1feb41e1 100644 --- a/main.go +++ b/main.go @@ -86,6 +86,7 @@ var ( leaderElectionRenewDeadline time.Duration leaderElectionRetryPeriod time.Duration enableTracing bool + samplingRate float64 ) func main() { @@ -150,7 +151,7 @@ func main() { ctx := ctrl.SetupSignalHandler() if enableTracing { - if err := ot.RegisterTracing(ctx, setupLog); err != nil { + if err := ot.RegisterTracing(ctx, samplingRate, setupLog); err != nil { setupLog.Error(err, "unable to set up tracing") os.Exit(1) } @@ -381,5 +382,11 @@ func initFlags(fs *pflag.FlagSet) { "Enable collecting and sending traces to opentelemetry-collector service", ) + fs.Float64Var(&samplingRate, + "trace-sampling-rate", + 0.6, + "The fraction of traces that will be sampled", + ) + feature.MutableGates.AddFlag(fs) } diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go index e2b32fafb..367f16603 100644 --- a/pkg/otel/tracing.go +++ b/pkg/otel/tracing.go @@ -20,9 +20,9 @@ import ( semconv "go.opentelemetry.io/otel/semconv/v1.4.0" ) -func RegisterTracing(ctx context.Context, log logr.Logger) error { +func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) error { - tracerProvider, err := SetUpTracing(ctx) + tracerProvider, err := SetUpTracing(ctx, samplingRate) if err != nil { return err } @@ -64,7 +64,7 @@ func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { return traceExporter, nil } -func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { +func SetUpTracing(ctx context.Context, samplingRate float64) (*trace.TracerProvider, error) { traceExporter, err := newExporter(ctx) @@ -89,9 +89,8 @@ func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { traceProvider := trace.NewTracerProvider( trace.WithBatcher(traceExporter), trace.WithResource(resource), - // TODO: dynamic sampling rate? 
- // sampling rate based on parent span = 60% - trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(0.6))), + // 0 < samplingRate <= 1 (< 0 -> be treated as 0; >= 1 -> always sample) + trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(samplingRate))), ) otel.SetTracerProvider(traceProvider) From ba6261ec2a629bec6012a24bc06299a6078f1721 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 12 May 2023 15:02:50 +0000 Subject: [PATCH 12/17] Tilt file --- Tiltfile | 225 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 179 insertions(+), 46 deletions(-) diff --git a/Tiltfile b/Tiltfile index f72ac6de6..29bc569d5 100644 --- a/Tiltfile +++ b/Tiltfile @@ -5,10 +5,12 @@ tools_bin = "./hack/tools/bin" kubectl_cmd = "./hack/tools/bin/kubectl" kind_cmd = "./hack/tools/bin/kind" -#Add tools to path +# Add tools to path os.putenv("PATH", os.getenv("PATH") + ":" + tools_bin) -update_settings(k8s_upsert_timeout_secs = 60) # on first tilt up, often can take longer than 30 seconds +update_settings( + k8s_upsert_timeout_secs=60 +) # on first tilt up, often can take longer than 30 seconds # set defaults settings = { @@ -26,10 +28,12 @@ settings = { keys = ["GCP_B64ENCODED_CREDENTIALS"] # global settings -settings.update(read_json( - "tilt-settings.json", - default = {}, -)) +settings.update( + read_json( + "tilt-settings.json", + default={}, + ) +) if settings.get("trigger_mode") == "manual": trigger_mode(TRIGGER_MODE_MANUAL) @@ -40,36 +44,61 @@ if "allowed_contexts" in settings: if "default_registry" in settings: default_registry(settings.get("default_registry")) + # deploy CAPI def deploy_capi(): version = settings.get("capi_version") - capi_uri = "https://github.com/kubernetes-sigs/cluster-api/releases/download/{}/cluster-api-components.yaml".format(version) - cmd = "curl -sSL {} | {} | {} apply -f -".format(capi_uri, envsubst_cmd, kubectl_cmd) - local(cmd, quiet = True) + capi_uri = "https://github.com/kubernetes-sigs/cluster-api/releases/download/{}/cluster-api-components.yaml".format( + version + ) + cmd = "curl -sSL {} | {} | {} apply -f -".format( + capi_uri, envsubst_cmd, kubectl_cmd + ) + local(cmd, quiet=True) if settings.get("extra_args"): extra_args = settings.get("extra_args") if extra_args.get("core"): core_extra_args = extra_args.get("core") if core_extra_args: for namespace in ["capi-system"]: - patch_args_with_extra_args(namespace, "capi-controller-manager", core_extra_args) + patch_args_with_extra_args( + namespace, "capi-controller-manager", core_extra_args + ) if extra_args.get("kubeadm-bootstrap"): kb_extra_args = extra_args.get("kubeadm-bootstrap") if kb_extra_args: - patch_args_with_extra_args("capi-kubeadm-bootstrap-system", "capi-kubeadm-bootstrap-controller-manager", kb_extra_args) + patch_args_with_extra_args( + "capi-kubeadm-bootstrap-system", + "capi-kubeadm-bootstrap-controller-manager", + kb_extra_args, + ) + def patch_args_with_extra_args(namespace, name, extra_args): - args_str = str(local("{} get deployments {} -n {} -o jsonpath={{.spec.template.spec.containers[0].args}}".format(kubectl_cmd, name, namespace))) + args_str = str( + local( + "{} get deployments {} -n {} -o jsonpath={{.spec.template.spec.containers[0].args}}".format( + kubectl_cmd, name, namespace + ) + ) + ) args_to_add = [arg for arg in extra_args if arg not in args_str] if args_to_add: args = args_str[1:-1].split() args.extend(args_to_add) - patch = [{ - "op": "replace", - "path": "/spec/template/spec/containers/0/args", - "value": args, - }] - local("{} patch 
deployment {} -n {} --type json -p='{}'".format(kubectl_cmd, name, namespace, str(encode_json(patch)).replace("\n", ""))) + patch = [ + { + "op": "replace", + "path": "/spec/template/spec/containers/0/args", + "value": args, + } + ] + local( + "{} patch deployment {} -n {} --type json -p='{}'".format( + kubectl_cmd, name, namespace, str(encode_json(patch)).replace("\n", "") + ) + ) + # Users may define their own Tilt customizations in tilt.d. This directory is excluded from git and these files will # not be checked in to version control. @@ -78,23 +107,37 @@ def include_user_tilt_files(): for f in user_tiltfiles: include(f) -def append_arg_for_container_in_deployment(yaml_stream, name, namespace, contains_image_name, args): + +def append_arg_for_container_in_deployment( + yaml_stream, name, namespace, contains_image_name, args +): for item in yaml_stream: - if item["kind"] == "Deployment" and item.get("metadata").get("name") == name and item.get("metadata").get("namespace") == namespace: + if ( + item["kind"] == "Deployment" + and item.get("metadata").get("name") == name + and item.get("metadata").get("namespace") == namespace + ): containers = item.get("spec").get("template").get("spec").get("containers") for container in containers: if contains_image_name in container.get("image"): container.get("args").extend(args) + def fixup_yaml_empty_arrays(yaml_str): yaml_str = yaml_str.replace("conditions: null", "conditions: []") return yaml_str.replace("storedVersions: null", "storedVersions: []") + def validate_auth(): substitutions = settings.get("kustomize_substitutions", {}) missing = [k for k in keys if k not in substitutions] if missing: - fail("missing kustomize_substitutions keys {} in tilt-settings.json".format(missing)) + fail( + "missing kustomize_substitutions keys {} in tilt-settings.json".format( + missing + ) + ) + tilt_helper_dockerfile_header = """ # Tilt image @@ -113,35 +156,58 @@ COPY --from=tilt-helper /restart.sh . COPY manager . """ + # Build CAPG and add feature gates def capg(): # Apply the kustomized yaml for this provider substitutions = settings.get("kustomize_substitutions", {}) os.environ.update(substitutions) - # yaml = str(kustomizesub("./hack/observability")) # build an observable kind deployment by default - yaml = str(kustomizesub("./config/default")) + yaml = str( + kustomizesub("./hack/observability") + ) # build an observable kind deployment by default + # TODO: consider to remove + # yaml = str(kustomizesub("./config/default")) # add extra_args if they are defined if settings.get("extra_args"): gcp_extra_args = settings.get("extra_args").get("gcp") if gcp_extra_args: yaml_dict = decode_yaml_stream(yaml) - append_arg_for_container_in_deployment(yaml_dict, "capg-controller-manager", "capg-system", "cluster-api-gcp-controller", gcp_extra_args) + append_arg_for_container_in_deployment( + yaml_dict, + "capg-controller-manager", + "capg-system", + "cluster-api-gcp-controller", + gcp_extra_args, + ) yaml = str(encode_yaml_stream(yaml_dict)) yaml = fixup_yaml_empty_arrays(yaml) # Set up a local_resource build of the provider's manager binary. 
local_resource( "manager", - cmd = 'mkdir -p .tiltbuild;CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags \'-extldflags "-static"\' -o .tiltbuild/manager', - deps = ["api", "cloud", "config", "controllers", "exp", "feature", "pkg", "go.mod", "go.sum", "main.go"], + cmd="mkdir -p .tiltbuild;CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags '-extldflags \"-static\"' -o .tiltbuild/manager", + deps=[ + "api", + "cloud", + "config", + "controllers", + "exp", + "feature", + "pkg", + "go.mod", + "go.sum", + "main.go", + ], ) - dockerfile_contents = "\n".join([ - tilt_helper_dockerfile_header, - tilt_dockerfile_header, - ]) + dockerfile_contents = "\n".join( + [ + tilt_helper_dockerfile_header, + tilt_dockerfile_header, + ] + ) entrypoint = ["sh", "/start.sh", "/manager"] extra_args = settings.get("extra_args") @@ -151,45 +217,110 @@ def capg(): # Set up an image build for the provider. The live update configuration syncs the output from the local_resource # build into the container. docker_build( - ref = "gcr.io/k8s-staging-cluster-api-gcp/cluster-api-gcp-controller", - context = "./.tiltbuild/", - dockerfile_contents = dockerfile_contents, - target = "tilt", - entrypoint = entrypoint, - only = "manager", - live_update = [ + ref="gcr.io/k8s-staging-cluster-api-gcp/cluster-api-gcp-controller", + context="./.tiltbuild/", + dockerfile_contents=dockerfile_contents, + target="tilt", + entrypoint=entrypoint, + only="manager", + live_update=[ sync(".tiltbuild/manager", "/manager"), run("sh /restart.sh"), ], - ignore = ["templates"], + ignore=["templates"], ) k8s_yaml(blob(yaml)) + +def observability(): + # Install the OpenTelemetry helm chart + gcp_project_id = os.getenv("GCP_PROJECT_ID", "") + + k8s_yaml( + helm( + "./hack/observability/opentelemetry/chart", + name="opentelemetry-collector", + namespace="capg-system", + values=["./hack/observability/opentelemetry/values.yaml"], + # refer https://github.com/helm/helm/issues/1987 + set=[ + "extraEnvs[0].name=GCP_PROJECT_ID", + "extraEnvs[0].value=" + gcp_project_id, + ], + ) + ) + + k8s_yaml( + helm( + "./hack/observability/jaeger/chart", + name="jaeger-all-in-one", + namespace="capg-system", + set=[ + # TODO: consider to remove + # "crd.install=false", + # "rbac.create=false", + "resources.limits.cpu=200m", + "resources.limits.memory=256Mi", + ], + ) + ) + + k8s_resource( + workload="jaeger-all-in-one", + new_name="traces: jaeger-all-in-one", + port_forwards=[ + port_forward(16686, name="View traces", link_path="/search?service=capg") + ], + labels=["observability"], + ) + + k8s_resource(workload="opentelemetry-collector", labels=["observability"]) + + def base64_encode(to_encode): - encode_blob = local("echo '{}' | tr -d '\n' | base64 - | tr -d '\n'".format(to_encode), quiet = True) + encode_blob = local( + "echo '{}' | tr -d '\n' | base64 - | tr -d '\n'".format(to_encode), quiet=True + ) return str(encode_blob) + def base64_encode_file(path_to_encode): - encode_blob = local("cat {} | tr -d '\n' | base64 - | tr -d '\n'".format(path_to_encode), quiet = True) + encode_blob = local( + "cat {} | tr -d '\n' | base64 - | tr -d '\n'".format(path_to_encode), quiet=True + ) return str(encode_blob) + def read_file_from_path(path_to_read): - str_blob = local("cat {} | tr -d '\n'".format(path_to_read), quiet = True) + str_blob = local("cat {} | tr -d '\n'".format(path_to_read), quiet=True) return str(str_blob) + def base64_decode(to_decode): - decode_blob = local("echo '{}' | base64 --decode -".format(to_decode), quiet = True) + decode_blob = 
local("echo '{}' | base64 --decode -".format(to_decode), quiet=True) return str(decode_blob) + def kustomizesub(folder): - yaml = local("hack/kustomize-sub.sh {}".format(folder), quiet = True) + yaml = local("hack/kustomize-sub.sh {}".format(folder), quiet=True) return yaml + def waitforsystem(): - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-bootstrap-system") - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-control-plane-system") - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-system") + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-bootstrap-system" + ) + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-control-plane-system" + ) + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-system" + ) + ############################## # Actual work happens here @@ -208,4 +339,6 @@ deploy_capi() capg() +observability() + waitforsystem() From 173a97f9a0c85bbe6e9417e5d5c5dfeae7f0945d Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Mon, 22 May 2023 06:16:37 +0000 Subject: [PATCH 13/17] debug blocking tracing connection --- pkg/otel/tracing.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go index 367f16603..85336dab4 100644 --- a/pkg/otel/tracing.go +++ b/pkg/otel/tracing.go @@ -8,7 +8,6 @@ import ( "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" - "k8s.io/client-go/pkg/version" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -18,6 +17,7 @@ import ( "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" + ver "sigs.k8s.io/cluster-api-provider-gcp/version" ) func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) error { @@ -44,10 +44,13 @@ func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { + ctx, cancel := context.WithTimeout(ctx, time.Second) + defer cancel() + conn, err := grpc.DialContext(ctx, "opentelemetry-collector:4317", // Using non-TLS connection for dev environment grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithBlock(), + grpc.WithBlock(), // blocking code ) if err != nil { @@ -73,12 +76,11 @@ func SetUpTracing(ctx context.Context, samplingRate float64) (*trace.TracerProvi } // labels/tags/res common to all traces - // TODO: consider to add more fields resource, err := resource.New(ctx, resource.WithAttributes( semconv.ServiceNameKey.String("capg"), attribute.String("exporter", "otlpgrpc"), - attribute.String("version", version.Get().String()), + attribute.String("version", ver.Get().String()), ), ) From f4a05541de4810c7f94558f2f56bfce02a364f05 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 26 May 2023 18:14:12 +0000 Subject: [PATCH 14/17] helm charts for jaeger + otel collector --- hack/observability/jaeger/chart/.helmignore | 22 + hack/observability/jaeger/chart/Chart.yaml | 16 + hack/observability/jaeger/chart/LICENSE | 21 + hack/observability/jaeger/chart/README.md | 26 + .../jaeger/chart/templates/NOTES.txt | 23 + .../jaeger/chart/templates/_helpers.tpl | 63 ++ .../jaeger/chart/templates/ingress.yaml | 64 ++ .../jaeger/chart/templates/jaeger-volume.yaml | 18 + 
.../chart/templates/service-headless.yaml | 56 ++ .../jaeger/chart/templates/service.yaml | 56 ++ .../chart/templates/serviceaccount.yaml | 11 + .../jaeger/chart/templates/statefulset.yaml | 110 +++ .../templates/tests/test-connection.yaml | 17 + hack/observability/jaeger/chart/values.yaml | 100 +++ .../opentelemetry/chart/.helmignore | 23 + .../opentelemetry/chart/CONTRIBUTING.md | 8 + .../opentelemetry/chart/Chart.yaml | 14 + .../observability/opentelemetry/chart/LICENSE | 201 +++++ .../opentelemetry/chart/README.md | 217 +++++ .../opentelemetry/chart/UPGRADING.md | 289 ++++++ .../opentelemetry/chart/templates/NOTES.txt | 38 + .../opentelemetry/chart/templates/_config.tpl | 329 +++++++ .../chart/templates/_helpers.tpl | 131 +++ .../opentelemetry/chart/templates/_pod.tpl | 194 +++++ .../chart/templates/clusterrole.yaml | 50 ++ .../chart/templates/clusterrolebinding.yaml | 22 + .../chart/templates/configmap-agent.yaml | 11 + .../templates/configmap-statefulset.yaml | 11 + .../chart/templates/configmap.yaml | 11 + .../chart/templates/daemonset.yaml | 44 + .../chart/templates/deployment.yaml | 45 + .../opentelemetry/chart/templates/hpa.yaml | 32 + .../chart/templates/ingress.yaml | 54 ++ .../chart/templates/networkpolicy.yaml | 38 + .../opentelemetry/chart/templates/pdb.yaml | 18 + .../chart/templates/podmonitor.yaml | 18 + .../chart/templates/prometheusrule.yaml | 87 ++ .../chart/templates/service.yaml | 33 + .../chart/templates/serviceaccount.yaml | 14 + .../chart/templates/servicemonitor.yaml | 18 + .../chart/templates/statefulset.yaml | 44 + .../opentelemetry/chart/values.schema.json | 824 ++++++++++++++++++ .../opentelemetry/chart/values.yaml | 486 +++++++++++ .../controller-manager-dev-env-otel-patch.yml | 1 + .../secrets-dev-env-otel-patch.yaml | 5 +- hack/observability/opentelemetry/values.yaml | 9 +- 46 files changed, 3916 insertions(+), 6 deletions(-) create mode 100644 hack/observability/jaeger/chart/.helmignore create mode 100644 hack/observability/jaeger/chart/Chart.yaml create mode 100644 hack/observability/jaeger/chart/LICENSE create mode 100644 hack/observability/jaeger/chart/README.md create mode 100644 hack/observability/jaeger/chart/templates/NOTES.txt create mode 100644 hack/observability/jaeger/chart/templates/_helpers.tpl create mode 100644 hack/observability/jaeger/chart/templates/ingress.yaml create mode 100644 hack/observability/jaeger/chart/templates/jaeger-volume.yaml create mode 100644 hack/observability/jaeger/chart/templates/service-headless.yaml create mode 100644 hack/observability/jaeger/chart/templates/service.yaml create mode 100644 hack/observability/jaeger/chart/templates/serviceaccount.yaml create mode 100644 hack/observability/jaeger/chart/templates/statefulset.yaml create mode 100644 hack/observability/jaeger/chart/templates/tests/test-connection.yaml create mode 100644 hack/observability/jaeger/chart/values.yaml create mode 100644 hack/observability/opentelemetry/chart/.helmignore create mode 100644 hack/observability/opentelemetry/chart/CONTRIBUTING.md create mode 100644 hack/observability/opentelemetry/chart/Chart.yaml create mode 100644 hack/observability/opentelemetry/chart/LICENSE create mode 100644 hack/observability/opentelemetry/chart/README.md create mode 100644 hack/observability/opentelemetry/chart/UPGRADING.md create mode 100644 hack/observability/opentelemetry/chart/templates/NOTES.txt create mode 100644 hack/observability/opentelemetry/chart/templates/_config.tpl create mode 100644 
hack/observability/opentelemetry/chart/templates/_helpers.tpl create mode 100644 hack/observability/opentelemetry/chart/templates/_pod.tpl create mode 100644 hack/observability/opentelemetry/chart/templates/clusterrole.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/clusterrolebinding.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/configmap-agent.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/configmap-statefulset.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/configmap.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/daemonset.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/deployment.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/hpa.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/ingress.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/networkpolicy.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/pdb.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/podmonitor.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/prometheusrule.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/service.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/serviceaccount.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/servicemonitor.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/statefulset.yaml create mode 100644 hack/observability/opentelemetry/chart/values.schema.json create mode 100644 hack/observability/opentelemetry/chart/values.yaml diff --git a/hack/observability/jaeger/chart/.helmignore b/hack/observability/jaeger/chart/.helmignore new file mode 100644 index 000000000..50af03172 --- /dev/null +++ b/hack/observability/jaeger/chart/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/hack/observability/jaeger/chart/Chart.yaml b/hack/observability/jaeger/chart/Chart.yaml new file mode 100644 index 000000000..a72810422 --- /dev/null +++ b/hack/observability/jaeger/chart/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +appVersion: 1.41.0 +description: Jaeger all-in-one helm chart for Kubernetes +home: https://github.com/hansehe/jaeger-all-in-one +icon: https://raw.githubusercontent.com/hansehe/jaeger-all-in-one/master/helm/jaeger.png +keywords: +- jaeger +- jaeger-all-in-one +maintainers: +- email: hans.erik.heggem@gmail.com + name: hansehe +name: jaeger-all-in-one +sources: +- https://github.com/hansehe/jaeger-all-in-one +type: application +version: 0.1.8 diff --git a/hack/observability/jaeger/chart/LICENSE b/hack/observability/jaeger/chart/LICENSE new file mode 100644 index 000000000..2a7cec499 --- /dev/null +++ b/hack/observability/jaeger/chart/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Hans Erik Heggem + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/hack/observability/jaeger/chart/README.md b/hack/observability/jaeger/chart/README.md new file mode 100644 index 000000000..878731bb3 --- /dev/null +++ b/hack/observability/jaeger/chart/README.md @@ -0,0 +1,26 @@ +# Jaeger Tracing - All In One + +## Introduction + +The Jaeger tracing all-in-one service enables jaeger for development purposes, check out: +- https://www.jaegertracing.io/docs/1.18/getting-started/ + +## Installing the Chart + +To install the chart with the release name `jaeger-all-in-one` run: + +```bash +$ helm repo add jaeger-all-in-one https://raw.githubusercontent.com/hansehe/jaeger-all-in-one/master/helm/charts +$ helm install jaeger-all-in-one jaeger-all-in-one/jaeger-all-in-one +``` + +Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. 
For example, + +```bash +$ helm install jaeger-all-in-one -f values.yaml jaeger-all-in-one/jaeger-all-in-one +``` + +## Configuration + +Find all possible configuration values here: +- https://github.com/hansehe/jaeger-all-in-one/blob/master/helm/jaeger-all-in-one/values.yaml diff --git a/hack/observability/jaeger/chart/templates/NOTES.txt b/hack/observability/jaeger/chart/templates/NOTES.txt new file mode 100644 index 000000000..f05fa1802 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/NOTES.txt @@ -0,0 +1,23 @@ +{{- if .Values.enabled }} +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ . }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "jaeger-all-in-one.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "jaeger-all-in-one.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "jaeger-all-in-one.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "jaeger-all-in-one.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + echo "Visit http://127.0.0.1:{{ .Values.service.port }} to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME {{ .Values.service.port }}:{{ .Values.service.port }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/_helpers.tpl b/hack/observability/jaeger/chart/templates/_helpers.tpl new file mode 100644 index 000000000..c8f340b9b --- /dev/null +++ b/hack/observability/jaeger/chart/templates/_helpers.tpl @@ -0,0 +1,63 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "jaeger-all-in-one.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "jaeger-all-in-one.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "jaeger-all-in-one.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "jaeger-all-in-one.labels" -}} +helm.sh/chart: {{ include "jaeger-all-in-one.chart" . }} +{{ include "jaeger-all-in-one.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "jaeger-all-in-one.selectorLabels" -}} +app.kubernetes.io/name: {{ include "jaeger-all-in-one.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account to use +*/}} +{{- define "jaeger-all-in-one.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} + {{ default (include "jaeger-all-in-one.fullname" .) .Values.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.serviceAccount.name }} +{{- end -}} +{{- end -}} diff --git a/hack/observability/jaeger/chart/templates/ingress.yaml b/hack/observability/jaeger/chart/templates/ingress.yaml new file mode 100644 index 000000000..8998588f5 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/ingress.yaml @@ -0,0 +1,64 @@ +{{- if .Values.enabled }} +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "jaeger-all-in-one.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/jaeger-volume.yaml b/hack/observability/jaeger/chart/templates/jaeger-volume.yaml new file mode 100644 index 000000000..20eedea60 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/jaeger-volume.yaml @@ -0,0 +1,18 @@ +{{- if .Values.enabled }} +{{- if .Values.volume.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }} + namespace: {{ .Release.Namespace }} +spec: + {{- if .Values.volume.className }} + storageClassName: {{ .Values.volume.className }} + {{- end }} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.volume.size }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/service-headless.yaml b/hack/observability/jaeger/chart/templates/service-headless.yaml new file mode 100644 index 000000000..5c10d6fbf --- /dev/null +++ b/hack/observability/jaeger/chart/templates/service-headless.yaml @@ -0,0 +1,56 @@ +{{- if .Values.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }}-headless + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} + {{- with .Values.service.headless.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + clusterIP: None + ports: + - port: 6831 + targetPort: udp-com-thr + protocol: UDP + name: udp-com-thr + - port: 6832 + targetPort: udp-bin-thr + protocol: UDP + name: udp-bin-thr + - port: 5775 + targetPort: udp-bin-thr-o + protocol: UDP + name: udp-bin-thr-o + - port: 5778 + targetPort: http-configs + protocol: TCP + name: http-configs + - port: {{ .Values.service.port }} + targetPort: http-ui + protocol: TCP + name: http-ui + - port: 14250 + targetPort: grpc-proto + protocol: TCP + name: grpc-proto + - port: 14268 + targetPort: http-bin-thr + protocol: TCP + name: http-bin-thr + - port: 14269 + targetPort: http-admin + protocol: TCP + name: http-admin + {{- if .Values.enableHttpZipkinCollector }} + - port: 9411 + targetPort: http-zipkin + protocol: TCP + name: http-zipkin + {{- end }} + selector: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/service.yaml b/hack/observability/jaeger/chart/templates/service.yaml new file mode 100644 index 000000000..687a20283 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/service.yaml @@ -0,0 +1,56 @@ +{{- if .Values.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 6831 + targetPort: udp-com-thr + protocol: UDP + name: udp-com-thr + - port: 6832 + targetPort: udp-bin-thr + protocol: UDP + name: udp-bin-thr + - port: 5775 + targetPort: udp-bin-thr-o + protocol: UDP + name: udp-bin-thr-o + - port: 5778 + targetPort: http-configs + protocol: TCP + name: http-configs + - port: {{ .Values.service.port }} + targetPort: http-ui + protocol: TCP + name: http-ui + - port: 14250 + targetPort: grpc-proto + protocol: TCP + name: grpc-proto + - port: 14268 + targetPort: http-bin-thr + protocol: TCP + name: http-bin-thr + - port: 14269 + targetPort: http-admin + protocol: TCP + name: http-admin + {{- if .Values.enableHttpZipkinCollector }} + - port: 9411 + targetPort: http-zipkin + protocol: TCP + name: http-zipkin + {{- end }} + selector: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/serviceaccount.yaml b/hack/observability/jaeger/chart/templates/serviceaccount.yaml new file mode 100644 index 000000000..85ca08c18 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/serviceaccount.yaml @@ -0,0 +1,11 @@ +{{- if .Values.enabled }} +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "jaeger-all-in-one.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: +{{ include "jaeger-all-in-one.labels" . | nindent 4 }} +{{- end -}} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/statefulset.yaml b/hack/observability/jaeger/chart/templates/statefulset.yaml new file mode 100644 index 000000000..2458eaad7 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/statefulset.yaml @@ -0,0 +1,110 @@ +{{- if .Values.enabled }} +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} +spec: + serviceName: {{ include "jaeger-all-in-one.fullname" . }}-headless + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 8 }} + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "jaeger-all-in-one.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.volume.enabled }} + volumes: + - name: jaeger-volume + persistentVolumeClaim: + claimName: {{ include "jaeger-all-in-one.fullname" . 
}} + {{- end }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: udp-com-thr + containerPort: 6831 + protocol: UDP + - name: udp-bin-thr + containerPort: 6832 + protocol: UDP + - name: udp-bin-thr-o + containerPort: 5775 + protocol: UDP + - name: http-configs + containerPort: 5778 + protocol: TCP + - name: http-ui + containerPort: 16686 + protocol: TCP + - name: grpc-proto + containerPort: 14250 + protocol: TCP + - name: http-bin-thr + containerPort: 14268 + protocol: TCP + - name: http-admin + containerPort: 14269 + protocol: TCP + {{- if .Values.enableHttpZipkinCollector }} + - name: http-zipkin + containerPort: 9411 + protocol: TCP + {{- end }} + {{- if .Values.volume.enabled }} + volumeMounts: + - mountPath: "/badger" + name: jaeger-volume + {{- end }} + livenessProbe: + httpGet: + path: {{ .Values.healthCheckUrl | quote }} + port: http-admin + readinessProbe: + httpGet: + path: {{ .Values.healthCheckUrl | quote }} + port: http-admin + resources: + {{- toYaml .Values.resources | nindent 12 }} + env: + {{- range $key, $value := .Values.environmentVariables }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.enableHttpZipkinCollector }} + - name: COLLECTOR_ZIPKIN_HOST_PORT + value: "9411" + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/hack/observability/jaeger/chart/templates/tests/test-connection.yaml b/hack/observability/jaeger/chart/templates/tests/test-connection.yaml new file mode 100644 index 000000000..c009bdb60 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/tests/test-connection.yaml @@ -0,0 +1,17 @@ +{{- if .Values.tests.enabled }} +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "jaeger-all-in-one.fullname" . }}-test-connection" + labels: +{{ include "jaeger-all-in-one.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test-success +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "jaeger-all-in-one.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/values.yaml b/hack/observability/jaeger/chart/values.yaml new file mode 100644 index 000000000..5026287d3 --- /dev/null +++ b/hack/observability/jaeger/chart/values.yaml @@ -0,0 +1,100 @@ +# Default values for jaeger-all-in-one. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +enabled: true +replicaCount: 1 + +image: + repository: jaegertracing/all-in-one + pullPolicy: IfNotPresent + +healthCheckUrl: / +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" +terminationGracePeriodSeconds: 10 +environmentVariables: + MEMORY_MAX_TRACES: 100000 + SPAN_STORAGE_TYPE: badger + BADGER_EPHEMERAL: false + BADGER_DIRECTORY_VALUE: /badger/data + BADGER_DIRECTORY_KEY: /badger/key + +enableHttpZipkinCollector: false + +serviceAccount: + # Specifies whether a service account should be created + create: true + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "14269" + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 16686 + annotations: + prometheus.io/probe: "true" + prometheus.io/probe-path: "/" + headless: + annotations: {} + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # cert-manager.io/cluster-issuer: letsencrypt + # nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # nginx.ingress.kubernetes.io/from-to-www-redirect: "true" + hosts: + - host: jaeger.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: jaeger-tls + # hosts: + # - jaeger.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +volume: + enabled: true + className: "" + size: 3Gi + +tests: + enabled: true \ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/.helmignore b/hack/observability/opentelemetry/chart/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/hack/observability/opentelemetry/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/hack/observability/opentelemetry/chart/CONTRIBUTING.md b/hack/observability/opentelemetry/chart/CONTRIBUTING.md new file mode 100644 index 000000000..e2dd7d2b0 --- /dev/null +++ b/hack/observability/opentelemetry/chart/CONTRIBUTING.md @@ -0,0 +1,8 @@ +# Collector Chart Contributing Guide + +## Bumping Default Collector Version + +1. Increase the minor version of the chart by one and set the patch version to zero. +2. Update the chart's `appVersion` to match the new collector version. This version will be used as the image tag by default. +3. Review the corresponding release notes in [Collector Core](https://github.com/open-telemetry/opentelemetry-collector/releases), [Collector Contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib/releases), and [Collector Releases](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). If any changes affect the helm charts, adjust the helm chart accordingly. +4. Run `make generate-examples`. 
\ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/Chart.yaml b/hack/observability/opentelemetry/chart/Chart.yaml new file mode 100644 index 000000000..c25eb655f --- /dev/null +++ b/hack/observability/opentelemetry/chart/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +appVersion: 0.75.0 +description: OpenTelemetry Collector Helm chart for Kubernetes +home: https://opentelemetry.io/ +icon: https://opentelemetry.io/img/logos/opentelemetry-logo-nav.png +maintainers: +- name: dmitryax +- name: TylerHelmuth +name: opentelemetry-collector +sources: +- https://github.com/open-telemetry/opentelemetry-collector +- https://github.com/open-telemetry/opentelemetry-collector-contrib +type: application +version: 0.53.0 diff --git a/hack/observability/opentelemetry/chart/LICENSE b/hack/observability/opentelemetry/chart/LICENSE new file mode 100644 index 000000000..f49a4e16e --- /dev/null +++ b/hack/observability/opentelemetry/chart/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/README.md b/hack/observability/opentelemetry/chart/README.md new file mode 100644 index 000000000..a15197fde --- /dev/null +++ b/hack/observability/opentelemetry/chart/README.md @@ -0,0 +1,217 @@ +# OpenTelemetry Collector Helm Chart + +The helm chart installs [OpenTelemetry Collector](https://github.com/open-telemetry/opentelemetry-collector) +in kubernetes cluster. + +## Prerequisites + +- Kubernetes 1.23+ +- Helm 3.9+ + +## Installing the Chart + +Add OpenTelemetry Helm repository: + +```console +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +``` + +To install the chart with the release name my-opentelemetry-collector, run the following command: + +```console +helm install my-opentelemetry-collector open-telemetry/opentelemetry-collector +``` + +## Upgrading + +See [UPGRADING.md](UPGRADING.md). + +## Security Considerations + +OpenTelemetry Collector recommends to bind receivers' servers to addresses that limit connections to authorized users. +For this reason, by default the chart binds all the Collector's endpoints to the pod's IP. + +More info is available in the [Security Best Practices docummentation](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/security-best-practices.md#safeguards-against-denial-of-service-attacks) + +Some care must be taken when using `hostNetwork: true`, as then OpenTelemetry Collector will listen on all the addresses in the host network namespace. + +## Configuration + +### Default configuration + +By default this chart will deploy an OpenTelemetry Collector as daemonset with three pipelines (logs, metrics and traces) +and logging exporter enabled by default. Besides daemonset (agent), it can be also installed as deployment. + +*Example*: Install collector as a deployment, and do not run it as an agent. + +```yaml +mode: deployment +``` + +By default collector has the following receivers enabled: + +- **metrics**: OTLP and prometheus. Prometheus is configured only for scraping collector's own metrics. +- **traces**: OTLP, zipkin and jaeger (thrift and grpc). +- **logs**: OTLP (to enable container logs, see [Configuration for Kubernetes container logs](#configuration-for-kubernetes-container-logs)). 
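+
+A trimmed sketch of how those defaults are wired together (illustrative only; the chart's actual rendered configuration also binds the endpoints to the pod IP and runs every pipeline through processors such as `memory_limiter` and `batch`):
+
+```yaml
+# Simplified view of the default receivers and pipelines, not the full rendered config
+receivers:
+  otlp:
+    protocols:
+      grpc:
+      http:
+  jaeger:
+    protocols:
+      grpc:
+      thrift_http:
+      thrift_compact:
+  zipkin:
+  prometheus:   # scrape config (collector's own metrics) omitted in this sketch
+exporters:
+  logging: {}
+service:
+  pipelines:
+    traces:
+      receivers: [otlp, jaeger, zipkin]
+      exporters: [logging]
+    metrics:
+      receivers: [otlp, prometheus]
+      exporters: [logging]
+    logs:
+      receivers: [otlp]
+      exporters: [logging]
+```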
+ +There are two ways to configure collector pipelines, which can be used together as well. + +### Basic top level configuration + +Default components can be removed with `null`. When changing a pipeline, you must explicitly list all the components that are in the pipeline, including any default components. + +*Example*: Disable metrics and logging pipelines and non-otlp receivers: + +```yaml +config: + receivers: + jaeger: null + prometheus: null + zipkin: null + service: + pipelines: + traces: + receivers: + - otlp + metrics: null + logs: null +``` + +*Example*: Add host metrics receiver: + +```yaml +mode: daemonset + +presets: + hostMetrics: + enabled: true +``` + +### Configuration for Kubernetes container logs + +The collector can be used to collect logs sent to standard output by Kubernetes containers. +This feature is disabled by default. It has the following requirements: + +- It needs agent collector to be deployed. +- It requires the [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version +of the collector image. + +To enable this feature, set the `presets.logsCollection.enabled` property to `true`. +Here is an example `values.yaml`: + +```yaml +mode: daemonset + +presets: + logsCollection: + enabled: true + includeCollectorLogs: true +``` + +The way this feature works is it adds a `filelog` receiver on the `logs` pipeline. This receiver is preconfigured +to read the files where Kubernetes container runtime writes all containers' console output to. + +#### :warning: Warning: Risk of looping the exported logs back into the receiver, causing "log explosion" + +The container logs pipeline uses the `logging` console exporter by default. +Paired with the default `filelog` receiver that receives all containers' console output, +it is easy to accidentally feed the exported logs back into the receiver. + +Also note that using the `--log-level=debug` option for the `logging` exporter causes it to output +multiple lines per single received log, which when looped, would amplify the logs exponentially. + +To prevent the looping, the default configuration of the receiver excludes logs from the collector's containers. + +If you want to include the collector's logs, make sure to replace the `logging` exporter +with an exporter that does not send logs to collector's standard output. + +Here's an example `values.yaml` file that replaces the default `logging` exporter on the `logs` pipeline +with an `otlphttp` exporter that sends the container logs to `https://example.com:55681` endpoint. +It also clears the `filelog` receiver's `exclude` property, for collector logs to be included in the pipeline. + +```yaml +mode: daemonset + +presets: + logsCollection: + enabled: true + includeCollectorLogs: true + +config: + exporters: + otlphttp: + endpoint: https://example.com:55681 + service: + pipelines: + logs: + exporters: + - otlphttp +``` + +### Configuration for Kubernetes attributes processor + +The collector can be configured to add Kubernetes metadata to logs, metrics and traces. + +This feature is disabled by default. It has the following requirements: + +- It requires [k8sattributesprocessor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor) processor to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. + +To enable this feature, set the `presets.kubernetesAttributes.enabled` property to `true`. 
+Here is an example `values.yaml`: + +```yaml +mode: daemonset +presets: + kubernetesAttributes: + enabled: true +``` + +### Configuration for Kubernetes Cluster Metrics + +The collector can be configured to collects cluster-level metrics from the Kubernetes API server. A single instance of this receiver can be used to monitor a cluster. + +This feature is disabled by default. It has the following requirements: + +- It requires [k8sclusterreceiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver) to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. +- It requires statefulset or deployment mode with a signle replica. + +To enable this feature, set the `presets.clusterMetrics.enabled` property to `true`. + +Here is an example `values.yaml`: + +```yaml +mode: deployment +replicaCount: 1 +presets: + clusterMetrics: + enabled: true +``` + +### Configuration for retrieving Kubelet metrics + +The collector can be configured to collect Kubelet metrics. + +This feature is disabled by default. It has the following requirements: + +- It requires [kubeletstats](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kubeletstatsreceiver) receiver to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. + +To enable this feature, set the `presets.kubeletMetrics.enabled` property to `true`. +Here is an example `values.yaml`: + +```yaml +mode: daemonset +presets: + kubeletMetrics: + enabled: true +``` + +### CRDs + +At this time, Prometheus CRDs are supported but other CRDs are not. + +### Other configuration options + +The [values.yaml](./values.yaml) file contains information about all other configuration +options for this chart. + +For more examples see [Examples](examples). diff --git a/hack/observability/opentelemetry/chart/UPGRADING.md b/hack/observability/opentelemetry/chart/UPGRADING.md new file mode 100644 index 000000000..f182e811f --- /dev/null +++ b/hack/observability/opentelemetry/chart/UPGRADING.md @@ -0,0 +1,289 @@ +# Upgrade guidelines + +## 0.46.0 to 0.47.0 + +[Update Collector Endpoints to use Pod IP Instead of 0.0.0.0](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/603) + +The [Collector's security guidelines were updated](https://github.com/open-telemetry/opentelemetry-collector/pull/6959) to include containerized environments when discussing safeguards against denial of service attacks. +To be in compliance with the Collector's security best practices the chart has been updated to use the Collector's pod IP in place of `0.0.0.0`. + +The chart will continue to allow complete configuration of the Collector via the `config` field in the values.yaml. If pod IP does not suite your needs you can use `config` to set something different. + +See [Security Best Practices docummentation](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/security-best-practices.md#safeguards-against-denial-of-service-attacks) for more details. + +The new default of binding to the pod IP, rather than `0.0.0.0`, will cause `kubectl port-forward` to fail. 
If port-forwarding is desired, the following `value.yaml` snippet will allow the Collector bind to `127.0.0.1` inside the pod, in addition to the pod's IP: + +```yaml +config: + receivers: + jaeger/local: + protocols: + grpc: + endpoint: 127.0.0.1:14250 + thrift_compact: + endpoint: 127.0.0.1:6831 + thrift_http: + endpoint: 127.0.0.1:14268 + otlp/local: + protocols: + grpc: + endpoint: 127.0.0.1:4317 + http: + endpoint: 127.0.0.1:4318 + zipkin/local: + endpoint: 127.0.0.1:9411 + service: + pipelines: + traces: + receivers: + - otlp + - otlp/local + - jaeger + - jaeger/local + - zipkin + - zipkin/local +``` + +## 0.40.7 to 0.41.0 + +[Require Kubernetes version 1.23 or later](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/541) + +If you enable use of a _HorizontalPodAutoscaler_ for the collector when running in the "deployment" mode by way of `.Values.autoscaling.enabled`, the manifest now uses the "autoscaling/v2" API group version, which [is available only as recently as Kubernetes version 1.23](https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/#horizontalpodautoscaler-v2-graduates-to-ga). As [all previous versions of this API group are deprecated and removed as of Kubernetes version 1.26](https://kubernetes.io/docs/reference/using-api/deprecation-guide/#horizontalpodautoscaler-v126), we don't offer support for Kubernetes versions older than 1.23. + +## 0.34.0 to 0.34.0 + +[config supports templating](TBD) + +The chart now supports templating in `.Values.config`. If you are currently using any `{{ }}` syntax in `.Values.yaml` it will now be rendered. To escape existing instances of `{{ }}`, use ``` {{` `}} ```. For example, `{{ REDACTED_EMAIL }}` becomes ``` {{` {{ REDACTED_EMAIL }} `}} ```. + +## 0.28.0 to 0.29.0 + +[Reduce requested resources](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/273) + +Resource `limits` have been reduced. Upgrades/installs of chart 0.29.0 will now use fewer resources. In order to set the resources back to what they were, you will need to override the `resources` section in the `values.yaml`. + +*Example*: + +```yaml +resources: + limits: + cpu: 1 + memory: 2Gi +``` + +## 0.23.1 to 0.24.0 + +[Remove containerLogs in favor of presets.logsCollection]() + +The ability to enable logs collection from the collector has been moved from `containerLogs.enabled` to `presets.logsCollection.enabled`. If you are currently using `containerLogs.enabled`, you should instead use the preset: + +```yaml +presets: + logsCollection: + enabled: true +``` + +If you are using `containerLogs.enabled` and also enabling collection of the collector logs you can use `includeCollectorLogs` + +```yaml +presets: + logsCollection: + enabled: true + includeCollectorLogs: true +``` + +You no longer need to update `config.service.pipelines.logs` to include the filelog receiver yourself as the preset will automatically update the logs pipeline to include the filelog receiver. + +The filelog's preset configuration can modified by `config.receivers`, but preset configuration cannot be removed. If you need to remove any filelog receiver configuration generated by the preset you should not use the preset. Instead, configure the filelog receiver manually in `config.receivers` and set any other necessary fields in the values.yaml to modify k8s as needed. 
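+
+As a rough illustration of that manual route, a minimal sketch is shown below (the contrib image is still required for the `filelog` receiver, and you would also have to mount the host's `/var/log/pods` directory yourself, for example via `extraVolumes` and `extraVolumeMounts`, which the preset otherwise takes care of):
+
+```yaml
+mode: daemonset
+
+config:
+  receivers:
+    filelog:
+      include: [ /var/log/pods/*/*/*.log ]
+      start_at: beginning
+      include_file_path: true
+  service:
+    pipelines:
+      logs:
+        receivers:
+          - otlp
+          - filelog
+```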
+ +See the [daemonset-collector-logs example](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector/examples/daemonset-collector-logs) to see an example of the preset in action. + +## 0.18.0 to 0.19.0 + +[Remove agentCollector and standaloneCollector settings](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/216) + +The `agentCollector` and `standaloneCollector` config sections have been removed. Upgrades/installs of chart 0.19.0 will fail if `agentCollector` or `standaloneCollector` are in the values.yaml. See the [Migrate to mode](#migrate-to-mode) steps for instructions on how to replace `agentCollector` and `standaloneCollector` with `mode`. + +## 0.13.0 to 0.14.0 + +[Remove two-deployment mode](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/159) + +The ability to install both the agent and standalone collectors simultaneous with the chart has been removed. Installs/upgrades where both `.Values.agentCollector.enabled` and `.Values.standloneCollector.enables` are true will fail. `agentCollector` and `standloneCollector` have also be deprecated, but backward compatibility has been maintained. + +### To run both a deployment and daemonset + +Install a deployment version of the collector. This is done by setting `.Values.mode` to `deployment` + +```yaml +mode: deployment +``` + +Next, install an daemonset version of the collector that is configured to send traffic to the previously installed deployment. This is done by setting `.Values.mode` to `daemonset` and updating `.Values.config` so that data is exported to the deployment. + +```yaml +mode: daemonset + +config: + exporters: + otlp: + endpoint: example-opentelemetry-collector:4317 + tls: + insecure: true + service: + pipelines: + logs: + exporters: + - otlp + - logging + metrics: + exporters: + - otlp + - logging + traces: + exporters: + - otlp + - logging +``` + +See the [daemonset-and-deployment](examples/daemonset-and-deployment) example to see the rendered config. + +### Migrate to `mode`: + +The `agentCollector` and `standaloneCollector` sections in values.yaml have been deprecated. Instead there is a new field, `mode`, that determines if the collector is being installed as a daemonset or deployment. + +```yaml +# Valid values are "daemonset" and "deployment". +# If set, agentCollector and standaloneCollector are ignored. +mode: +``` + +The following fields have also been added to the root-level to replace the depracated `agentCollector` and `standaloneCollector` settings. + +```yaml +containerLogs: + enabled: false + +resources: + limits: + cpu: 1 + memory: 2Gi + +podAnnotations: {} + +podLabels: {} + +# Host networking requested for this pod. Use the host's network namespace. +hostNetwork: false + +# only used with deployment mode +replicaCount: 1 + +annotations: {} +``` + +When using `mode`, these settings should be used instead of their counterparts in `agentCollector` and `standaloneCollector`. + +Set `mode` to `daemonset` if `agentCollector` was being used. Move all `agentCollector` settings to the corresponding root-level setting. If `agentCollector.configOverride` was being used, merge the settings with `.Values.config`. 
+ +Example agentCollector values.yaml: + +```yaml +agentCollector: + resources: + limits: + cpu: 3 + memory: 6Gi + configOverride: + receivers: + hostmetrics: + scrapers: + cpu: + disk: + filesystem: + service: + pipelines: + metrics: + receivers: [otlp, prometheus, hostmetrics] +``` + +Example mode values.yaml: + +```yaml +mode: daemonset + +resources: + limits: + cpu: 3 + memory: 6Gi + +config: + receivers: + hostmetrics: + scrapers: + cpu: + disk: + filesystem: + service: + pipelines: + metrics: + receivers: [otlp, prometheus, hostmetrics] +``` + +Set `mode` to `deployment` if `standaloneCollector` was being used. Move all `standaloneCollector` settings to the corresponding root-level setting. If `standaloneCollector.configOverride` was being used, merge the settings with `.Values.config`. + +Example standaloneCollector values.yaml: + +```yaml +standaloneCollector: + enabled: true + replicaCount: 2 + configOverride: + receivers: + podman_stats: + endpoint: unix://run/podman/podman.sock + timeout: 10s + collection_interval: 10s + service: + pipelines: + metrics: + receivers: [otlp, prometheus, podman_stats] +``` + +Example mode values.yaml: + +```yaml +mode: deployment + +replicaCount: 2 + +config: + receivers: + receivers: + podman_stats: + endpoint: unix://run/podman/podman.sock + timeout: 10s + collection_interval: 10s + service: + pipelines: + metrics: + receivers: [otlp, prometheus, podman_stats] +``` + +Default configuration in `.Values.config` can now be removed with `null`. When changing a pipeline, you must explicitly list all the components that are in the pipeline, including any default components. + +*Example*: Disable metrics and logging pipelines and non-otlp receivers: + +```yaml +config: + receivers: + jaeger: null + prometheus: null + zipkin: null + service: + pipelines: + traces: + receivers: + - otlp + metrics: null + logs: null +``` diff --git a/hack/observability/opentelemetry/chart/templates/NOTES.txt b/hack/observability/opentelemetry/chart/templates/NOTES.txt new file mode 100644 index 000000000..1bebd0e97 --- /dev/null +++ b/hack/observability/opentelemetry/chart/templates/NOTES.txt @@ -0,0 +1,38 @@ +{{- if not (eq (toString .Values.extraConfigMapMounts) "") }} +[WARNING] "extraConfigMapMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumesMounts" instead. +{{ end }} + +{{- if not (eq (toString .Values.extraHostPathMounts) "") }} +[WARNING] "extraHostPathMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumesMounts" instead. +{{ end }} + +{{- if not (eq (toString .Values.secretMounts) "") }} +[WARNING] "secretMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumeMounts" instead. +{{ end }} + +{{- if and (not (eq .Values.mode "daemonset")) (not (eq .Values.mode "deployment")) (not (eq .Values.mode "statefulset")) }} +{{ fail "[ERROR] 'mode' must be set. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md for instructions." }} +{{ end }} + +{{- if not .Values.configMap.create }} +[WARNING] "configMap" wil not be created and "config" will not take effect. +{{ end }} + +{{- if not (eq (toString .Values.containerLogs) "") }} +[WARNING] 'containerLogs' is deprecated. Use 'presets.logsCollection' instead. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md#0231-to-0240 for instructions on how to migrate. 
+{{ end }}
+
+[INFO] As of chart version 0.47.0, the default collector configuration has been updated to use the pod IP instead of 0.0.0.0 for its endpoints. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md#0460-to-0470 for details.
+
+{{- if .Values.presets.clusterMetrics.enabled }}
+{{- if eq .Values.mode "daemonset"}}
+{{- fail "Cluster Metrics preset is not suitable for daemonset mode. Please use statefulset or deployment mode with replicaCount: 1"}}
+{{ end }}
+{{- if gt (int .Values.replicaCount) 1 }}
+{{- fail "Cluster Metrics preset is not suitable for replicaCount greater than one. Please change replica count to one." }}
+{{ end }}
+{{ end }}
+
+{{- if .Values.presets.kubernetesEvents.enabled }}
+[WARNING] The 'k8sobjects' receiver is an ALPHA receiver and may change at any time.
+{{ end }}
\ No newline at end of file
diff --git a/hack/observability/opentelemetry/chart/templates/_config.tpl b/hack/observability/opentelemetry/chart/templates/_config.tpl
new file mode 100644
index 000000000..ac805a84f
--- /dev/null
+++ b/hack/observability/opentelemetry/chart/templates/_config.tpl
@@ -0,0 +1,329 @@
+{{/*
+Default memory limiter configuration for OpenTelemetry Collector based on k8s resource limits.
+*/}}
+{{- define "opentelemetry-collector.memoryLimiter" -}}
+# check_interval is the time between measurements of memory usage.
+check_interval: 5s
+
+# By default limit_mib is set to 80% of ".Values.resources.limits.memory"
+limit_percentage: 80
+
+# By default spike_limit_mib is set to 25% of ".Values.resources.limits.memory"
+spike_limit_percentage: 25
+{{- end }}
+
+{{/*
+Merge user supplied config into memory limiter config.
+*/}}
+{{- define "opentelemetry-collector.baseConfig" -}}
+{{- $processorsConfig := get .Values.config "processors" }}
+{{- if not $processorsConfig.memory_limiter }}
+{{- $_ := set $processorsConfig "memory_limiter" (include "opentelemetry-collector.memoryLimiter" . | fromYaml) }}
+{{- end }}
+{{- $memoryBallastConfig := get .Values.config.extensions "memory_ballast" }}
+{{- if or (not $memoryBallastConfig) (not $memoryBallastConfig.size_in_percentage) }}
+{{- $_ := set $memoryBallastConfig "size_in_percentage" 40 }}
+{{- end }}
+{{- .Values.config | toYaml }}
+{{- end }}
+
+{{/*
+Build config file for daemonset OpenTelemetry Collector
+*/}}
+{{- define "opentelemetry-collector.daemonsetConfig" -}}
+{{- $values := deepCopy .Values }}
+{{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }}
+{{- $config := include "opentelemetry-collector.baseConfig" $data | fromYaml }}
+{{- if eq (include "opentelemetry-collector.logsCollectionEnabled" .)
"true" }} +{{- $config = (include "opentelemetry-collector.applyLogsCollectionConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.hostMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyHostMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubeletMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubeletMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesAttributes.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesAttributesConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.clusterMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyClusterMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- tpl (toYaml $config) . }} +{{- end }} + +{{/* +Build config file for deployment OpenTelemetry Collector +*/}} +{{- define "opentelemetry-collector.deploymentConfig" -}} +{{- $values := deepCopy .Values }} +{{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }} +{{- $config := include "opentelemetry-collector.baseConfig" $data | fromYaml }} +{{- if eq (include "opentelemetry-collector.logsCollectionEnabled" .) "true" }} +{{- $config = (include "opentelemetry-collector.applyLogsCollectionConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.hostMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyHostMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubeletMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubeletMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesAttributes.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesAttributesConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesEvents.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesEventsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.clusterMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyClusterMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- tpl (toYaml $config) . 
}} +{{- end }} + +{{- define "opentelemetry-collector.applyHostMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.hostMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "hostmetrics" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.hostMetricsConfig" -}} +receivers: + hostmetrics: + root_path: /hostfs + collection_interval: 10s + scrapers: + cpu: + load: + memory: + disk: + filesystem: + exclude_mount_points: + mount_points: + - /dev/* + - /proc/* + - /sys/* + - /run/k3s/containerd/* + - /var/lib/docker/* + - /var/lib/kubelet/* + - /snap/* + match_type: regexp + exclude_fs_types: + fs_types: + - autofs + - binfmt_misc + - bpf + - cgroup2 + - configfs + - debugfs + - devpts + - devtmpfs + - fusectl + - hugetlbfs + - iso9660 + - mqueue + - nsfs + - overlay + - proc + - procfs + - pstore + - rpc_pipefs + - securityfs + - selinuxfs + - squashfs + - sysfs + - tracefs + match_type: strict + network: +{{- end }} + +{{- define "opentelemetry-collector.applyClusterMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.clusterMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "k8s_cluster" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.clusterMetricsConfig" -}} +receivers: + k8s_cluster: + collection_interval: 10s +{{- end }} + +{{- define "opentelemetry-collector.applyKubeletMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.kubeletMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "kubeletstats" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.kubeletMetricsConfig" -}} +receivers: + kubeletstats: + collection_interval: 20s + auth_type: "serviceAccount" + endpoint: "${K8S_NODE_NAME}:10250" +{{- end }} + +{{- define "opentelemetry-collector.applyLogsCollectionConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.logsCollectionConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.logs "receivers" (append $config.service.pipelines.logs.receivers "filelog" | uniq) }} +{{- if .Values.Values.presets.logsCollection.storeCheckpoints}} +{{- $_ := set $config.service "extensions" (append $config.service.extensions "file_storage" | uniq) }} +{{- end }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.logsCollectionConfig" -}} +{{- if .Values.presets.logsCollection.storeCheckpoints }} +extensions: + file_storage: + directory: /var/lib/otelcol +{{- end }} +receivers: + filelog: + include: [ /var/log/pods/*/*/*.log ] + {{- if .Values.presets.logsCollection.includeCollectorLogs }} + exclude: [] + {{- else }} + # Exclude collector container's logs. The file format is /var/log/pods/__//.log + exclude: [ /var/log/pods/{{ .Release.Namespace }}_{{ include "opentelemetry-collector.fullname" . }}*_*/{{ include "opentelemetry-collector.lowercase_chartname" . 
}}/*.log ] + {{- end }} + start_at: beginning + {{- if .Values.presets.logsCollection.storeCheckpoints}} + storage: file_storage + {{- end }} + include_file_path: true + include_file_name: false + operators: + # Find out which format is used by kubernetes + - type: router + id: get-format + routes: + - output: parser-docker + expr: 'body matches "^\\{"' + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-containerd + expr: 'body matches "^[^ Z]+Z"' + # Parse CRI-O format + - type: regex_parser + id: parser-crio + regex: '^(?P