From 56fc243ff5883b998b010e09a1ac83bcae60a712 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 17 Sep 2024 13:20:07 -0400 Subject: [PATCH] ccl/sqlproxyccl: add proxy.sql.routing_method_count metric Previously, it was not possible to determine the distribution of users connecting to the proxy via SNI, database, or cluster option (via the options parameter). This commit introduces a `proxy.sql.routing_method_count` metric to record the number of occurrences of each proxy routing method. Only successful parsing will be recorded; if a user provides invalid input, it will not be tracked. This enhancement allows us to monitor and understand which method was used by the proxy to retrieve the cluster identifier. Epic: CC-6926 Release note: None --- pkg/ccl/sqlproxyccl/BUILD.bazel | 1 + pkg/ccl/sqlproxyccl/metrics.go | 25 ++++++-- pkg/ccl/sqlproxyccl/proxy_handler.go | 16 ++++- pkg/ccl/sqlproxyccl/proxy_handler_test.go | 77 ++++++++++++++++++++++- 4 files changed, 109 insertions(+), 10 deletions(-) diff --git a/pkg/ccl/sqlproxyccl/BUILD.bazel b/pkg/ccl/sqlproxyccl/BUILD.bazel index 36c3198f1e40..884ab5a6a99f 100644 --- a/pkg/ccl/sqlproxyccl/BUILD.bazel +++ b/pkg/ccl/sqlproxyccl/BUILD.bazel @@ -38,6 +38,7 @@ go_library( "//pkg/util/httputil", "//pkg/util/log", "//pkg/util/metric", + "//pkg/util/metric/aggmetric", "//pkg/util/netutil/addr", "//pkg/util/randutil", "//pkg/util/retry", diff --git a/pkg/ccl/sqlproxyccl/metrics.go b/pkg/ccl/sqlproxyccl/metrics.go index 253f740bee51..1c1cda51a887 100644 --- a/pkg/ccl/sqlproxyccl/metrics.go +++ b/pkg/ccl/sqlproxyccl/metrics.go @@ -11,6 +11,7 @@ package sqlproxyccl import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/util/metric" + "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" ) // metrics contains pointers to the metrics for monitoring proxy operations. @@ -45,6 +46,11 @@ type metrics struct { QueryCancelSuccessful *metric.Counter AccessControlFileErrorCount *metric.Gauge + + RoutingMethodCount *aggmetric.AggCounter + SNIRoutingMethodCount *aggmetric.Counter + DatabaseRoutingMethodCount *aggmetric.Counter + ClusterOptionRoutingMethodCount *aggmetric.Counter } // MetricStruct implements the metrics.Struct interface. @@ -212,18 +218,23 @@ var ( Measurement: "Query Cancel Requests", Unit: metric.Unit_COUNT, } - accessControlFileErrorCount = metric.Metadata{ + metaAccessControlFileErrorCount = metric.Metadata{ Name: "proxy.access_control.errors", Help: "Numbers of access control list files that are currently having errors", Measurement: "Access Control File Errors", Unit: metric.Unit_COUNT, } + metaRoutingMethodCount = metric.Metadata{ + Name: "proxy.sql.routing_method_count", + Help: "Number of occurrences of each proxy routing method", + Measurement: "Number of occurrences", + Unit: metric.Unit_COUNT, + } ) // makeProxyMetrics instantiates the metrics holder for proxy monitoring. func makeProxyMetrics() metrics { - - return metrics{ + m := &metrics{ BackendDisconnectCount: metric.NewCounter(metaBackendDisconnectCount), IdleDisconnectCount: metric.NewCounter(metaIdleDisconnectCount), BackendDownCount: metric.NewCounter(metaBackendDownCount), @@ -273,8 +284,14 @@ func makeProxyMetrics() metrics { QueryCancelForwarded: metric.NewCounter(metaQueryCancelForwarded), QueryCancelSuccessful: metric.NewCounter(metaQueryCancelSuccessful), - AccessControlFileErrorCount: metric.NewGauge(accessControlFileErrorCount), + AccessControlFileErrorCount: metric.NewGauge(metaAccessControlFileErrorCount), + + RoutingMethodCount: aggmetric.NewCounter(metaRoutingMethodCount, "method"), } + m.SNIRoutingMethodCount = m.RoutingMethodCount.AddChild("sni") + m.DatabaseRoutingMethodCount = m.RoutingMethodCount.AddChild("database") + m.ClusterOptionRoutingMethodCount = m.RoutingMethodCount.AddChild("cluster_option") + return *m } // updateForError updates the metrics relevant for the type of the error diff --git a/pkg/ccl/sqlproxyccl/proxy_handler.go b/pkg/ccl/sqlproxyccl/proxy_handler.go index ad32cc4ed2b1..39d49726b5c3 100644 --- a/pkg/ccl/sqlproxyccl/proxy_handler.go +++ b/pkg/ccl/sqlproxyccl/proxy_handler.go @@ -375,7 +375,7 @@ func (handler *proxyHandler) handle( // NOTE: Errors returned from this function are user-facing errors so we // should be careful with the details that we want to expose. - backendStartupMsg, clusterName, tenID, err := clusterNameAndTenantFromParams(ctx, fe) + backendStartupMsg, clusterName, tenID, err := clusterNameAndTenantFromParams(ctx, fe, handler.metrics) if err != nil { clientErr := withCode(err, codeParamsRoutingFailed) log.Errorf(ctx, "unable to extract cluster name and tenant id: %s", err.Error()) @@ -715,7 +715,7 @@ func (handler *proxyHandler) setupIncomingCert(ctx context.Context) error { // through its command-line options, i.e. "-c NAME=VALUE", "-cNAME=VALUE", and // "--NAME=VALUE". func clusterNameAndTenantFromParams( - ctx context.Context, fe *FrontendAdmitInfo, + ctx context.Context, fe *FrontendAdmitInfo, metrics *metrics, ) (*pgproto3.StartupMessage, string, roachpb.TenantID, error) { clusterIdentifierDB, databaseName, err := parseDatabaseParam(fe.Msg.Parameters["database"]) if err != nil { @@ -748,6 +748,7 @@ func clusterNameAndTenantFromParams( // are blank. If SNI doesn't parse as a valid cluster id, then we assume // that the end user didn't intend to pass cluster info through SNI and // show missing cluster identifier instead of invalid cluster identifier. + metrics.SNIRoutingMethodCount.Inc(1) return fe.Msg, clusterName, tenID, nil } } @@ -790,7 +791,16 @@ func clusterNameAndTenantFromParams( paramsOut[key] = value } } - + // If both are provided, they must be the same, and we will track both. + if clusterIdentifierDB != "" { + metrics.DatabaseRoutingMethodCount.Inc(1) + } + // This metric is specific to the --cluster option. If we introduce a new + // --routing-id in the future, the existing metrics will still work, and + // we should introduce a new RoutingIDOptionReads metric. + if clusterIdentifierOpt != "" { + metrics.ClusterOptionRoutingMethodCount.Inc(1) + } outMsg := &pgproto3.StartupMessage{ ProtocolVersion: fe.Msg.ProtocolVersion, Parameters: paramsOut, diff --git a/pkg/ccl/sqlproxyccl/proxy_handler_test.go b/pkg/ccl/sqlproxyccl/proxy_handler_test.go index 83e0fb9853f4..0cb6129e2d95 100644 --- a/pkg/ccl/sqlproxyccl/proxy_handler_test.go +++ b/pkg/ccl/sqlproxyccl/proxy_handler_test.go @@ -2517,15 +2517,17 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { testCases := []struct { name string + sniServerName string params map[string]string expectedClusterName string expectedTenantID uint64 expectedParams map[string]string expectedError string expectedHint string + expectedMetrics func(t *testing.T, m *metrics) }{ { - name: "empty params", + name: "empty params and server name", params: map[string]string{}, expectedError: "missing cluster identifier", expectedHint: clusterIdentifierHint, @@ -2640,6 +2642,32 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedError: "invalid cluster identifier 'happy-koala-0'", expectedHint: "Tenant ID 0 is invalid.", }, + { + name: "invalid sni format", + sniServerName: "foo-bar-baz", + params: map[string]string{}, + expectedError: "missing cluster identifier", + expectedHint: clusterIdentifierHint, + }, + { + name: "invalid sni value", + sniServerName: "happy2koala-.abc.aws-ap-south-1.cockroachlabs.cloud", + params: map[string]string{}, + expectedError: "missing cluster identifier", + expectedHint: clusterIdentifierHint, + }, + { + name: "valid sni value", + sniServerName: "happy-seal-10.abc.gcp-us-central1.cockroachlabs.cloud", + params: map[string]string{}, + expectedClusterName: "happy-seal", + expectedTenantID: 10, + expectedParams: map[string]string{}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.SNIRoutingMethodCount.Value()) + }, + }, { name: "multiple similar cluster identifiers", params: map[string]string{ @@ -2649,6 +2677,11 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedClusterName: "happy-koala", expectedTenantID: 7, expectedParams: map[string]string{"database": "defaultdb"}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(2), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.DatabaseRoutingMethodCount.Value()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "cluster identifier in database param", @@ -2659,6 +2692,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedClusterName: strings.Repeat("a", 100), expectedTenantID: 7, expectedParams: map[string]string{"database": "defaultdb", "foo": "bar"}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.DatabaseRoutingMethodCount.Value()) + }, }, { name: "valid cluster identifier with invalid arrangements", @@ -2672,6 +2709,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { "database": "defaultdb", "options": "-c -c -c -c", }, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "short option: cluster identifier in options param", @@ -2682,6 +2723,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedClusterName: "happy-koala", expectedTenantID: 7, expectedParams: map[string]string{"database": "defaultdb"}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "short option with spaces: cluster identifier in options param", @@ -2692,6 +2737,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedClusterName: "happy-koala", expectedTenantID: 7, expectedParams: map[string]string{"database": "defaultdb"}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "long option: cluster identifier in options param", @@ -2705,6 +2754,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { "database": "defaultdb", "options": "--foo=test", }, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "long option: cluster identifier in options param with other options", @@ -2718,6 +2771,10 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { "database": "defaultdb", "options": "-csearch_path=public \t--foo=test", }, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.ClusterOptionRoutingMethodCount.Value()) + }, }, { name: "leading 0s are ok", @@ -2725,19 +2782,24 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { expectedClusterName: "happy-koala-0", expectedTenantID: 7, expectedParams: map[string]string{"database": "defaultdb"}, + expectedMetrics: func(t *testing.T, m *metrics) { + require.Equal(t, int64(1), m.RoutingMethodCount.Count()) + require.Equal(t, int64(1), m.DatabaseRoutingMethodCount.Value()) + }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { msg := &pgproto3.StartupMessage{Parameters: tc.params} + m := makeProxyMetrics() originalParams := make(map[string]string) for k, v := range msg.Parameters { originalParams[k] = v } - fe := &FrontendAdmitInfo{Msg: msg} - outMsg, clusterName, tenantID, err := clusterNameAndTenantFromParams(ctx, fe) + fe := &FrontendAdmitInfo{Msg: msg, SniServerName: tc.sniServerName} + outMsg, clusterName, tenantID, err := clusterNameAndTenantFromParams(ctx, fe, &m) if tc.expectedError == "" { require.NoErrorf(t, err, "failed test case\n%+v", tc) @@ -2755,6 +2817,15 @@ func TestClusterNameAndTenantFromParams(t *testing.T) { // Check that the original parameters were not modified. require.Equal(t, originalParams, msg.Parameters) + + if tc.expectedMetrics != nil { + tc.expectedMetrics(t, &m) + } else { + require.Zero(t, m.RoutingMethodCount.Count()) + require.Zero(t, m.SNIRoutingMethodCount.Value()) + require.Zero(t, m.DatabaseRoutingMethodCount.Value()) + require.Zero(t, m.ClusterOptionRoutingMethodCount.Value()) + } }) } }