cloud,backupccl: add metric for time of last backup failure due to KMS
Add a metric `backup.last-failed-time.kms-inaccessible` that tracks the
timestamp of the last backup failure due to a KMS error. This metric is only
tracked for BACKUP statements that are executed with the
`updates_cluster_monitoring_metrics` option set.

Release note (sql change): adds the `updates_cluster_monitoring_metrics` backup
option that allows an operator to opt in to tracking the timestamp of the last
backup failure due to a KMS error.
Rui Hu committed Sep 26, 2023
1 parent 5d61611 commit 2ad9e79
Showing 31 changed files with 551 additions and 92 deletions.
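
Before the file-by-file diff, a minimal sketch of how the new option might be exercised from a Go client. The connection string, backup destination, and KMS URI below are placeholders for illustration, not values from this commit; per the grammar changes further down, the option can also be written as `updates_cluster_monitoring_metrics = <bool expr>`.

```go
package main

import (
	"database/sql"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the Postgres wire protocol.
)

func main() {
	// Placeholder connection string; adjust for a real cluster.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Opt this backup in to updating the cluster-wide
	// backup.last-failed-time.kms-inaccessible gauge on KMS failures.
	_, err = db.Exec(
		`BACKUP INTO 'userfile:///foo'
		 WITH OPTIONS (kms = $1, updates_cluster_monitoring_metrics)`,
		"aws-kms:///some-key?AUTH=implicit&REGION=us-east-1",
	)
	if err != nil {
		// If the KMS was unreachable, the gauge now records this failure time.
		log.Printf("backup failed: %v", err)
	}
}
```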
2 changes: 1 addition & 1 deletion build/teamcity/cockroach/nightlies/cloud_unit_tests.sh
@@ -8,6 +8,6 @@ source "$dir/teamcity-support.sh" # For $root
source "$dir/teamcity-bazel-support.sh" # For run_bazel

tc_start_block "Run cloud unit tests"
BAZEL_SUPPORT_EXTRA_DOCKER_ARGS="-e GITHUB_API_TOKEN -e GITHUB_REPO -e TC_BUILDTYPE_ID -e TC_BUILD_BRANCH -e TC_BUILD_ID -e TC_SERVER_URL -e GOOGLE_EPHEMERAL_CREDENTIALS -e GOOGLE_KMS_KEY_NAME -e GOOGLE_LIMITED_KEY_ID -e ASSUME_SERVICE_ACCOUNT -e GOOGLE_LIMITED_BUCKET -e ASSUME_SERVICE_ACCOUNT_CHAIN -e AWS_DEFAULT_REGION -e AWS_SHARED_CREDENTIALS_FILE -e AWS_CONFIG_FILE -e AWS_S3_BUCKET -e AWS_ASSUME_ROLE -e AWS_ROLE_ARN_CHAIN -e AWS_KMS_KEY_ARN -e AWS_S3_ENDPOINT -e AWS_KMS_ENDPOINT -e AWS_KMS_REGION -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AZURE_ACCOUNT_KEY -e AZURE_ACCOUNT_NAME -e AZURE_CONTAINER -e AZURE_CLIENT_ID -e AZURE_CLIENT_SECRET -e AZURE_TENANT_ID -e AZURE_VAULT_NAME -e AZURE_KMS_KEY_NAME -e AZURE_KMS_KEY_VERSION -e BUILD_VCS_NUMBER" \
BAZEL_SUPPORT_EXTRA_DOCKER_ARGS="-e GITHUB_API_TOKEN -e GITHUB_REPO -e TC_BUILDTYPE_ID -e TC_BUILD_BRANCH -e TC_BUILD_ID -e TC_SERVER_URL -e GOOGLE_EPHEMERAL_CREDENTIALS -e GOOGLE_KMS_KEY_NAME -e GOOGLE_LIMITED_KEY_ID -e ASSUME_SERVICE_ACCOUNT -e GOOGLE_LIMITED_BUCKET -e ASSUME_SERVICE_ACCOUNT_CHAIN -e AWS_DEFAULT_REGION -e AWS_SHARED_CREDENTIALS_FILE -e AWS_CONFIG_FILE -e AWS_S3_BUCKET -e AWS_ASSUME_ROLE -e AWS_ROLE_ARN_CHAIN -e AWS_KMS_KEY_ARN -e AWS_S3_ENDPOINT -e AWS_KMS_ENDPOINT -e AWS_KMS_REGION -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AZURE_ACCOUNT_KEY -e AZURE_ACCOUNT_NAME -e AZURE_CONTAINER -e AZURE_CLIENT_ID -e AZURE_CLIENT_SECRET -e AZURE_TENANT_ID -e AZURE_VAULT_NAME -e AZURE_LIMITED_VAULT_NAME -e AZURE_KMS_KEY_NAME -e AZURE_KMS_KEY_VERSION -e BUILD_VCS_NUMBER" \
run_bazel build/teamcity/cockroach/nightlies/cloud_unit_tests_impl.sh "$@"
tc_end_block "Run cloud unit tests"
@@ -51,6 +51,7 @@ bazel_test_env=(--test_env=GO_TEST_WRAP_TESTV=1 \
--test_env=AZURE_CLIENT_SECRET="$AZURE_CLIENT_SECRET" \
--test_env=AZURE_TENANT_ID="$AZURE_TENANT_ID" \
--test_env=AZURE_VAULT_NAME="$AZURE_VAULT_NAME" \
--test_env=AZURE_LIMITED_VAULT_NAME="$AZURE_LIMITED_VAULT_NAME" \
--test_env=AZURE_KMS_KEY_NAME="$AZURE_KMS_KEY_NAME" \
--test_env=AZURE_KMS_KEY_VERSION="$AZURE_KMS_KEY_VERSION")
exit_status=0
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
@@ -718,6 +718,7 @@
<tr><td>STORAGE</td><td>txnwaitqueue.query.waiting</td><td>Number of transaction status queries waiting for an updated transaction record</td><td>Waiting Queries</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>valbytes</td><td>Number of bytes taken up by values</td><td>Storage</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>STORAGE</td><td>valcount</td><td>Count of all values</td><td>MVCC Values</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>backup.last-failed-time.kms-inaccessible</td><td>The unix timestamp of the most recent failure of backup due to errKMSInaccessible by a backup specified as maintaining this metric</td><td>Jobs</td><td>GAUGE</td><td>TIMESTAMP_SEC</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>changefeed.admit_latency</td><td>Event admission latency: a difference between event MVCC timestamp and the time it was admitted into changefeed pipeline; Note: this metric includes the time spent waiting until event can be processed due to backpressure or time spent resolving schema descriptors. Also note, this metric excludes latency during backfill</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>changefeed.aggregator_progress</td><td>The earliest timestamp up to which any aggregator is guaranteed to have emitted all values for</td><td>Unix Timestamp Nanoseconds</td><td>GAUGE</td><td>TIMESTAMP_NS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>changefeed.backfill_count</td><td>Number of changefeeds currently executing backfill</td><td>Count</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
2 changes: 2 additions & 0 deletions docs/generated/sql/bnf/backup_options.bnf
@@ -9,3 +9,5 @@ backup_options ::=
| 'INCREMENTAL_LOCATION' '=' string_or_placeholder_opt_list
| 'EXECUTION' 'LOCALITY' '=' string_or_placeholder
| 'INCLUDE_ALL_VIRTUAL_CLUSTERS' '=' a_expr
| 'UPDATES_CLUSTER_MONITORING_METRICS'
| 'UPDATES_CLUSTER_MONITORING_METRICS' '=' a_expr
4 changes: 4 additions & 0 deletions docs/generated/sql/bnf/stmt_block.bnf
@@ -1452,6 +1452,7 @@ unreserved_keyword ::=
| 'UNSPLIT'
| 'UNTIL'
| 'UPDATE'
| 'UPDATES_CLUSTER_MONITORING_METRICS'
| 'UPSERT'
| 'USE'
| 'USERS'
@@ -2324,6 +2325,8 @@ backup_options ::=
| 'INCREMENTAL_LOCATION' '=' string_or_placeholder_opt_list
| 'EXECUTION' 'LOCALITY' '=' string_or_placeholder
| include_all_clusters '=' a_expr
| 'UPDATES_CLUSTER_MONITORING_METRICS'
| 'UPDATES_CLUSTER_MONITORING_METRICS' '=' a_expr

c_expr ::=
d_expr
@@ -4054,6 +4057,7 @@ bare_label_keywords ::=
| 'UNSPLIT'
| 'UNTIL'
| 'UPDATE'
| 'UPDATES_CLUSTER_MONITORING_METRICS'
| 'UPSERT'
| 'USE'
| 'USER'
1 change: 1 addition & 0 deletions pkg/ccl/backupccl/BUILD.bazel
@@ -7,6 +7,7 @@ go_library(
"alter_backup_planning.go",
"alter_backup_schedule.go",
"backup_job.go",
"backup_metrics.go",
"backup_planning.go",
"backup_planning_tenant.go",
"backup_processor.go",
3 changes: 3 additions & 0 deletions pkg/ccl/backupccl/alter_backup_schedule.go
@@ -382,6 +382,9 @@ func processOptionsForArgs(inOpts tree.BackupOptions, outOpts *tree.BackupOption
outOpts.IncrementalStorage = inOpts.IncrementalStorage
}
}
if inOpts.UpdatesClusterMonitoringMetrics != nil {
outOpts.UpdatesClusterMonitoringMetrics = inOpts.UpdatesClusterMonitoringMetrics
}
return nil
}

50 changes: 50 additions & 0 deletions pkg/ccl/backupccl/backup_cloud_test.go
@@ -19,11 +19,13 @@ import (
"github.com/cockroachdb/cockroach/pkg/cloud"
"github.com/cockroachdb/cockroach/pkg/cloud/amazon"
"github.com/cockroachdb/cockroach/pkg/cloud/azure"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/stretchr/testify/require"
)

// The tests in this file talk to remote APIs which require credentials.
@@ -230,3 +232,51 @@ func TestCloudBackupRestoreAzure(t *testing.T) {
}
}
}

// TestCloudBackupRestoreKMSInaccessibleMetric tests that backup statements
// update the LastKMSInaccessibleErrorTime metric.
func TestCloudBackupRestoreKMSInaccessibleMetric(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

azureVaultName := os.Getenv("AZURE_VAULT_NAME")
if azureVaultName == "" {
skip.IgnoreLintf(t, "AZURE_VAULT_NAME env var must be set")
}

for _, tt := range []struct {
name string
uri string
}{
{
name: "s3",
uri: "aws-kms:///non-existent-key?AUTH=implicit&REGION=us-east-1",
},
{
name: "gcs",
uri: "gcp-kms:///non-existent-key?AUTH=implicit",
},
{
name: "azure",
uri: fmt.Sprintf("azure-kms:///non-existent-key/000?AUTH=implicit&AZURE_VAULT_NAME=%s", azureVaultName),
},
} {
tc, sqlDB, _, cleanupFn := backupRestoreTestSetup(t, 1, 1, InitManualReplication)
defer cleanupFn()

t.Run(tt.name, func(t *testing.T) {
testStart := timeutil.Now().Unix()
bm := tc.Server(0).JobRegistry().(*jobs.Registry).MetricsStruct().Backup.(*BackupMetrics)

// LastKMSInaccessibleErrorTime should not be set without
// updates_cluster_monitoring_metrics despite a KMS error.
sqlDB.ExpectErr(t, "failed to encrypt data key", "BACKUP INTO 'userfile:///foo' WITH OPTIONS (kms = $1)", tt.uri)
require.Equal(t, bm.LastKMSInaccessibleErrorTime.Value(), int64(0))

// LastKMSInaccessibleErrorTime should be set with
// updates_cluster_monitoring_metrics.
sqlDB.ExpectErr(t, "failed to encrypt data key", "BACKUP INTO 'userfile:///foo' WITH OPTIONS (kms = $1, updates_cluster_monitoring_metrics)", tt.uri)
require.GreaterOrEqual(t, bm.LastKMSInaccessibleErrorTime.Value(), testStart)
})
}
}
17 changes: 13 additions & 4 deletions pkg/ccl/backupccl/backup_job.go
@@ -1204,9 +1204,7 @@ func (b *backupResumer) maybeNotifyScheduledJobCompletion(
}

scheduleID := int64(tree.MustBeDInt(datums[0]))
if err := jobs.NotifyJobTermination(
ctx, txn, env, b.job.ID(), jobStatus, b.job.Details(), scheduleID,
); err != nil {
if err := jobs.NotifyJobTermination(ctx, txn, env, b.job.ID(), jobStatus, b.job.Details(), scheduleID); err != nil {
return errors.Wrapf(err,
"failed to notify schedule %d of completion of job %d", scheduleID, b.job.ID())
}
@@ -1226,15 +1224,26 @@ func (b *backupResumer) OnFailOrCancel(

p := execCtx.(sql.JobExecContext)
cfg := p.ExecCfg()
details := b.job.Details().(jobspb.BackupDetails)

b.deleteCheckpoint(ctx, cfg, p.User())
if err := cfg.InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
details := b.job.Details().(jobspb.BackupDetails)
pts := cfg.ProtectedTimestampProvider.WithTxn(txn)
return releaseProtectedTimestamp(ctx, pts, details.ProtectedTimestampRecord)
}); err != nil {
return err
}

// If this backup should update cluster monitoring metrics, update the
// metrics.
if details.UpdatesClusterMonitoringMetrics {
metrics := p.ExecCfg().JobRegistry.MetricsStruct().Backup.(*BackupMetrics)
if cloud.IsKMSInaccessible(jobErr) {
now := timeutil.Now()
metrics.LastKMSInaccessibleErrorTime.Update(now.Unix())
}
}

// This should never return an error unless resolving the schedule that the
// job is being run under fails. This could happen if the schedule is dropped
// while the job is executing.
40 changes: 40 additions & 0 deletions pkg/ccl/backupccl/backup_metrics.go
@@ -0,0 +1,40 @@
// Copyright 2023 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package backupccl

import (
"time"

"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/util/metric"
)

type BackupMetrics struct {
LastKMSInaccessibleErrorTime *metric.Gauge
}

// MetricStruct implements the metric.Struct interface.
func (b BackupMetrics) MetricStruct() {}

// MakeBackupMetrics instantiates the metrics for backup.
func MakeBackupMetrics(time.Duration) metric.Struct {
m := &BackupMetrics{
LastKMSInaccessibleErrorTime: metric.NewGauge(metric.Metadata{
Name: "backup.last-failed-time.kms-inaccessible",
Help: "The unix timestamp of the most recent failure of backup due to errKMSInaccessible by a backup specified as maintaining this metric",
Measurement: "Jobs",
Unit: metric.Unit_TIMESTAMP_SEC,
}),
}
return m
}

func init() {
jobs.MakeBackupMetricsHook = MakeBackupMetrics
}
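
The `init` hook at the end of this file is a registration pattern: `pkg/jobs` declares a hook variable it cannot populate itself (importing `backupccl` would create an import cycle), and `backupccl` assigns it at init time. A self-contained sketch of the pattern, with illustrative names rather than the real ones:

```go
package main

import "fmt"

// In the lower-level package (cf. pkg/jobs): a hook that stays nil unless a
// higher-level package registers an implementation.
var makeBackupMetricsHook func(histogramWindow int) string

// In the higher-level package (cf. backupccl), an init() assigns the hook,
// avoiding an import cycle between the two packages.
func init() {
	makeBackupMetricsHook = func(histogramWindow int) string {
		return fmt.Sprintf("backup metrics created (window=%d)", histogramWindow)
	}
}

func main() {
	// Callers in the lower-level package must tolerate a nil hook, since
	// registration only happens if the higher-level package is linked in.
	if makeBackupMetricsHook != nil {
		fmt.Println(makeBackupMetricsHook(60))
	}
}
```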
43 changes: 28 additions & 15 deletions pkg/ccl/backupccl/backup_planning.go
@@ -196,9 +196,10 @@ func resolveOptionsForBackupJobDescription(
}

newOpts := tree.BackupOptions{
CaptureRevisionHistory: opts.CaptureRevisionHistory,
Detached: opts.Detached,
ExecutionLocality: opts.ExecutionLocality,
CaptureRevisionHistory: opts.CaptureRevisionHistory,
Detached: opts.Detached,
ExecutionLocality: opts.ExecutionLocality,
UpdatesClusterMonitoringMetrics: opts.UpdatesClusterMonitoringMetrics,
}

if opts.EncryptionPassphrase != nil {
@@ -532,6 +533,7 @@ func backupTypeCheck(
exprutil.Bools{
backupStmt.Options.CaptureRevisionHistory,
backupStmt.Options.IncludeAllSecondaryTenants,
backupStmt.Options.UpdatesClusterMonitoringMetrics,
}); err != nil {
return false, nil, err
}
@@ -658,6 +660,16 @@ func backupPlanHook(
}
}

var updatesClusterMonitoringMetrics bool
if backupStmt.Options.UpdatesClusterMonitoringMetrics != nil {
updatesClusterMonitoringMetrics, err = exprEval.Bool(
ctx, backupStmt.Options.UpdatesClusterMonitoringMetrics,
)
if err != nil {
return nil, nil, nil, false, err
}
}

fn := func(ctx context.Context, _ []sql.PlanNode, resultsCh chan<- tree.Datums) error {
// TODO(dan): Move this span into sql.
ctx, span := tracing.ChildSpan(ctx, stmt.StatementTag())
@@ -751,18 +763,19 @@
}

initialDetails := jobspb.BackupDetails{
Destination: jobspb.BackupDetails_Destination{To: to, IncrementalStorage: incrementalStorage},
EndTime: endTime,
RevisionHistory: revisionHistory,
IncludeAllSecondaryTenants: includeAllSecondaryTenants,
IncrementalFrom: incrementalFrom,
FullCluster: backupStmt.Coverage() == tree.AllDescriptors,
ResolvedCompleteDbs: completeDBs,
EncryptionOptions: &encryptionParams,
AsOfInterval: asOfInterval,
Detached: detached,
ApplicationName: p.SessionData().ApplicationName,
ExecutionLocality: executionLocality,
Destination: jobspb.BackupDetails_Destination{To: to, IncrementalStorage: incrementalStorage},
EndTime: endTime,
RevisionHistory: revisionHistory,
IncludeAllSecondaryTenants: includeAllSecondaryTenants,
IncrementalFrom: incrementalFrom,
FullCluster: backupStmt.Coverage() == tree.AllDescriptors,
ResolvedCompleteDbs: completeDBs,
EncryptionOptions: &encryptionParams,
AsOfInterval: asOfInterval,
Detached: detached,
ApplicationName: p.SessionData().ApplicationName,
ExecutionLocality: executionLocality,
UpdatesClusterMonitoringMetrics: updatesClusterMonitoringMetrics,
}
if backupStmt.CreatedByInfo != nil && backupStmt.CreatedByInfo.Name == jobs.CreatedByScheduledJobs {
initialDetails.ScheduleID = backupStmt.CreatedByInfo.ID
8 changes: 4 additions & 4 deletions pkg/ccl/backupccl/backup_test.go
@@ -4284,13 +4284,13 @@ type testKMS struct {

var _ cloud.KMS = &testKMS{}

func (k *testKMS) MasterKeyID() (string, error) {
func (k *testKMS) MasterKeyID() string {
kmsURL, err := url.ParseRequestURI(k.uri)
if err != nil {
return "", err
return ""
}

return strings.TrimPrefix(kmsURL.Path, "/"), nil
return strings.TrimPrefix(kmsURL.Path, "/")
}

// Encrypt appends the KMS URI master key ID to data.
@@ -4435,7 +4435,7 @@ func TestGetEncryptedDataKeyByKMSMasterKeyID(t *testing.T) {
testKMS, err := MakeTestKMS(ctx, uri, nil)
require.NoError(t, err)

masterKeyID, err := testKMS.MasterKeyID()
masterKeyID := testKMS.MasterKeyID()
require.NoError(t, err)

encryptedDataKey, err := testKMS.Encrypt(ctx, plaintextDataKey)
17 changes: 4 additions & 13 deletions pkg/ccl/backupccl/backupencryption/encryption.go
@@ -185,10 +185,7 @@ func ValidateKMSURIsAgainstFullBackup(

// Depending on the KMS specific implementation, this may or may not contact
// the remote KMS.
id, err := kms.MasterKeyID()
if err != nil {
return nil, err
}
id := kms.MasterKeyID()

encryptedDataKey, err := kmsMasterKeyIDToDataKey.getEncryptedDataKey(PlaintextMasterKeyID(id))
if err != nil {
@@ -277,17 +274,11 @@ func GetEncryptedDataKeyFromURI(
}
encryptedDataKey, err := kms.Encrypt(ctx, plaintextDataKey)
if err != nil {
return "", nil, errors.Wrapf(err, "failed to encrypt data key for KMS scheme %s",
kmsURL.Scheme)
}

masterKeyID, err := kms.MasterKeyID()
if err != nil {
return "", nil, errors.Wrapf(err, "failed to get master key ID for KMS scheme %s",
kmsURL.Scheme)
return "", nil, cloud.KMSInaccessible(errors.Wrapf(err, "failed to encrypt data key for KMS scheme %s",
kmsURL.Scheme))
}

return masterKeyID, encryptedDataKey, nil
return kms.MasterKeyID(), encryptedDataKey, nil
}

// GetEncryptedDataKeyByKMSMasterKeyID constructs a mapping {MasterKeyID :
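
The `cloud.KMSInaccessible` wrapper introduced above is the counterpart of the `cloud.IsKMSInaccessible` check in `backup_job.go`'s `OnFailOrCancel`. A minimal sketch of this mark-and-test error pattern using `cockroachdb/errors` — the actual helpers in `pkg/cloud` may be implemented differently:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

// Sentinel used only as a marker; it is never returned directly.
var errKMSInaccessible = errors.New("KMS inaccessible")

// KMSInaccessible marks err so that callers can classify it later without
// string matching.
func KMSInaccessible(err error) error {
	return errors.Mark(err, errKMSInaccessible)
}

// IsKMSInaccessible reports whether err was marked by KMSInaccessible.
func IsKMSInaccessible(err error) bool {
	return errors.Is(err, errKMSInaccessible)
}

func main() {
	err := KMSInaccessible(errors.New("failed to encrypt data key"))
	fmt.Println(IsKMSInaccessible(err)) // true
}
```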
12 changes: 12 additions & 0 deletions pkg/ccl/backupccl/create_scheduled_backup.go
@@ -88,6 +88,7 @@ type scheduledBackupSpec struct {
incrementalStorage []string
includeAllSecondaryTenants *bool
execLoc *string
updatesMetrics *bool
}

func makeScheduleDetails(opts map[string]string) (jobspb.ScheduleDetails, error) {
@@ -718,6 +719,16 @@ func makeScheduledBackupSpec(
spec.includeAllSecondaryTenants = &includeSecondary
}

if schedule.BackupOptions.UpdatesClusterMonitoringMetrics != nil {
updatesMetrics, err := exprEval.Bool(
ctx, schedule.BackupOptions.UpdatesClusterMonitoringMetrics,
)
if err != nil {
return nil, err
}
spec.updatesMetrics = &updatesMetrics
}

return spec, nil
}

@@ -799,6 +810,7 @@ func createBackupScheduleTypeCheck(
bools := exprutil.Bools{
schedule.BackupOptions.CaptureRevisionHistory,
schedule.BackupOptions.IncludeAllSecondaryTenants,
schedule.BackupOptions.UpdatesClusterMonitoringMetrics,
}
if err := exprutil.TypeCheck(
ctx, scheduleBackupOp, p.SemaCtx(), stringExprs, bools, stringArrays, opts,
2 changes: 2 additions & 0 deletions pkg/ccl/backupccl/schedule_exec.go
@@ -37,6 +37,8 @@ type scheduledBackupExecutor struct {
type backupMetrics struct {
*jobs.ExecutorMetrics
*jobs.ExecutorPTSMetrics
// TODO(rui): move this to the backup job so it can be controlled by the
// updates_cluster_monitoring_metrics option.
RpoMetric *metric.Gauge
}
