Skip to content

Commit

Permalink
roachprod: implement --dry-run and --verbose
Browse files Browse the repository at this point in the history
Debugging roachprod in a remote (user) environment
can be challenging. On several occassions, simply
printing all executed (cloud) commands was sufficient
to reproduce and root cause an issue.

This change implements two mutually exclusive
modes, namely `dry-run` and `verbose`. The latter
merely logs every command before it's executed.
The former suppresses execution of any command
which can modify the (cloud) infrastructure; i.e.,
`dry-run` executes read-only commands; it logs
every command, executed or not. Also note that
`dry-run` leaves temporary files like VM startup
scripts on disk, so that they can be readily examined.

Thus, `dry-run` is primarily for dumping _all_
commands without the risk of modifying anything;
`verbose` is primarily for live debugging.

Release note: None
Epic: none
Fixes: cockroachdb#99091
  • Loading branch information
srosenberg committed Dec 21, 2024
1 parent 12812af commit e77e603
Show file tree
Hide file tree
Showing 38 changed files with 756 additions and 440 deletions.
2 changes: 2 additions & 0 deletions build/teamcity/cockroach/nightlies/roachtest_nightly_impl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ source $root/build/teamcity/util/roachtest_util.sh
# Standard release branches are in the format `release-24.1` for the
# 24.1 release, for example.
release_branch_regex="^release-[0-9][0-9]\.[0-9]"
# Test selection is enabled only on release branches.
selective_tests="false"

if [[ "${TC_BUILD_BRANCH}" == "master" ]]; then
# We default to using test selection on master, unless explicitly
Expand Down
3 changes: 3 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ ALL_TESTS = [
"//pkg/roachprod/prometheus:prometheus_test",
"//pkg/roachprod/promhelperclient:promhelperclient_test",
"//pkg/roachprod/ssh:ssh_test",
"//pkg/roachprod/vm/cli:cli_test",
"//pkg/roachprod/vm/gce:gce_test",
"//pkg/roachprod/vm/local:local_test",
"//pkg/roachprod/vm:vm_test",
Expand Down Expand Up @@ -1628,6 +1629,8 @@ GO_TARGETS = [
"//pkg/roachprod/vm/aws/terraformgen:terraformgen_lib",
"//pkg/roachprod/vm/aws:aws",
"//pkg/roachprod/vm/azure:azure",
"//pkg/roachprod/vm/cli:cli",
"//pkg/roachprod/vm/cli:cli_test",
"//pkg/roachprod/vm/flagstub:flagstub",
"//pkg/roachprod/vm/gce/testutils:testutils",
"//pkg/roachprod/vm/gce:gce",
Expand Down
7 changes: 6 additions & 1 deletion pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ func initFlags() {
fmt.Sprintf("use the shared user %q for ssh rather than your user %q",
config.SharedUser, config.OSUser.Username))

rootCmd.PersistentFlags().BoolVarP(&config.DryRun, "dry-run", "d",
false, "dry-run mode (log all commands & execute read-only ones; i.e., no infra changes)")
rootCmd.PersistentFlags().BoolVarP(&config.Verbose, "verbose", "v",
false, "verbose mode (log all executed commands)")

createCmd.Flags().DurationVarP(&createVMOpts.Lifetime,
"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.UseLocalSSD,
Expand Down Expand Up @@ -188,7 +193,7 @@ func initFlags() {
"Show cost estimates",
)
listCmd.Flags().BoolVarP(&listDetails,
"details", "d", false, "Show cluster details")
"details", "", false, "Show cluster details")
listCmd.Flags().BoolVar(&listJSON,
"json", false, "Show cluster specs in a json format")
listCmd.Flags().BoolVarP(&listMine,
Expand Down
14 changes: 10 additions & 4 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ The command will print an app.side-eye.io URL where the snapshot can be viewed.
func validateAndConfigure(cmd *cobra.Command, args []string) {
// Skip validation for commands that are self-sufficient.
switch cmd.Name() {
case "help", "version", "list":
case "help", "version":
return
}

Expand Down Expand Up @@ -1724,6 +1724,12 @@ func validateAndConfigure(cmd *cobra.Command, args []string) {
}
providersSet[p] = struct{}{}
}
if config.DryRun {
if config.Verbose {
printErrAndExit(fmt.Errorf("--verbose and --dry-run are mutually exclusive"))
}
config.Logger.Printf("Enabling [***Experimental***] --dry-run mode. No infra changes will be made!")
}
}

var updateCmd = &cobra.Command{
Expand Down Expand Up @@ -1801,7 +1807,7 @@ var sshKeysListCmd = &cobra.Command{
Use: "list",
Short: "list every SSH public key installed on clusters managed by roachprod",
Run: wrap(func(cmd *cobra.Command, args []string) error {
authorizedKeys, err := gce.GetUserAuthorizedKeys()
authorizedKeys, err := gce.GetUserAuthorizedKeys(config.Logger)
if err != nil {
return err
}
Expand Down Expand Up @@ -1833,7 +1839,7 @@ var sshKeysAddCmd = &cobra.Command{
}

fmt.Printf("Adding new public key for user %s...\n", ak.User)
return gce.AddUserAuthorizedKey(ak)
return gce.AddUserAuthorizedKey(config.Logger, ak)
}),
}

Expand All @@ -1844,7 +1850,7 @@ var sshKeysRemoveCmd = &cobra.Command{
Run: wrap(func(cmd *cobra.Command, args []string) error {
user := args[0]

existingKeys, err := gce.GetUserAuthorizedKeys()
existingKeys, err := gce.GetUserAuthorizedKeys(config.Logger)
if err != nil {
return fmt.Errorf("failed to fetch existing keys: %w", err)
}
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ go_library(
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/aws",
"//pkg/roachprod/vm/azure",
"//pkg/roachprod/vm/cli",
"//pkg/roachprod/vm/gce",
"//pkg/roachprod/vm/local",
"//pkg/server/debug/replay",
Expand Down
2 changes: 1 addition & 1 deletion pkg/roachprod/cloud/cluster_cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ func DestroyCluster(l *logger.Logger, c *Cluster) error {
// and clean-up entries prematurely.
stopSpinner = ui.NewDefaultSpinner(l, "Destroying DNS entries").Start()
dnsErr := vm.FanOutDNS(c.VMs, func(p vm.DNSProvider, vms vm.List) error {
return p.DeleteRecordsBySubdomain(context.Background(), c.Name)
return p.DeleteRecordsBySubdomain(context.Background(), l, c.Name)
})
stopSpinner()

Expand Down
4 changes: 2 additions & 2 deletions pkg/roachprod/cloud/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ func GCDNS(l *logger.Logger, cloud *Cloud, dryrun bool) error {
if !ok {
continue
}
records, err := p.ListRecords(ctx)
records, err := p.ListRecords(ctx, l)
if err != nil {
return err
}
Expand All @@ -541,7 +541,7 @@ func GCDNS(l *logger.Logger, cloud *Cloud, dryrun bool) error {
sort.Strings(recordNames)

if err := destroyResource(dryrun, func() error {
return p.DeleteRecordsByName(ctx, recordNames...)
return p.DeleteRecordsByName(ctx, l, recordNames...)
}); err != nil {
return err
}
Expand Down
21 changes: 19 additions & 2 deletions pkg/roachprod/clusters_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,20 @@ func saveCluster(l *logger.Logger, c *cloud.Cluster) error {
err = errors.CombineErrors(err, tmpFile.Sync())
err = errors.CombineErrors(err, tmpFile.Close())
if err == nil {
err = os.Rename(tmpFile.Name(), filename)
if config.DryRun || config.Verbose {
l.Printf("exec: mv %s %s", tmpFile.Name(), filename)
}
if !config.DryRun {
err = os.Rename(tmpFile.Name(), filename)
}
}
if err != nil {
_ = os.Remove(tmpFile.Name())
if config.DryRun || config.Verbose {
l.Printf("exec: rm %s", tmpFile.Name())
}
if !config.DryRun {
_ = os.Remove(tmpFile.Name())
}
return err
}
return nil
Expand Down Expand Up @@ -258,5 +268,12 @@ func (localVMStorage) SaveCluster(l *logger.Logger, cluster *cloud.Cluster) erro
// DeleteCluster is part of the local.VMStorage interface.
func (localVMStorage) DeleteCluster(l *logger.Logger, name string) error {
path := clusterFilename(name)
if config.DryRun || config.Verbose {
l.Printf("exec: rm %s", path)
}
if config.DryRun {
return nil
}

return os.Remove(path)
}
4 changes: 4 additions & 0 deletions pkg/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ var (
OSUser *user.User
// Quiet is used to disable fancy progress output.
Quiet = false
// DryRun disables executing commands which would otherwise cause infra. changes.
DryRun = false
// Verbose enables logging all executed commands. What gets logged is a superset of DryRun.
Verbose = false
// The default roachprod logger.
// N.B. When roachprod is used via CLI, this logger is used for all output.
// When roachprod is used via API (e.g. from roachtest), this logger is used only in the few cases,
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ go_library(
"//pkg/roachprod/ui",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/aws",
"//pkg/roachprod/vm/cli",
"//pkg/roachprod/vm/gce",
"//pkg/roachprod/vm/local",
"//pkg/testutils",
Expand Down
18 changes: 13 additions & 5 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/cli"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
Expand Down Expand Up @@ -438,7 +439,7 @@ func (c *SyncedCluster) Stop(
return err
}

services, err := c.DiscoverServices(ctx, name, ServiceTypeSQL)
services, err := c.DiscoverServices(ctx, l, name, ServiceTypeSQL)
if err != nil {
return err
}
Expand Down Expand Up @@ -2339,6 +2340,8 @@ func (c *SyncedCluster) Logs(
cmd.Stdout = os.Stdout
cmd.Stderr = &stderrBuf

cli.MaybeLogCmd(context.Background(), l, cmd)

if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
return nil
Expand Down Expand Up @@ -2379,6 +2382,9 @@ func (c *SyncedCluster) Logs(
cmd.Stdout = out
var errBuf bytes.Buffer
cmd.Stderr = &errBuf

cli.MaybeLogCmd(ctx, l, cmd)

if err := cmd.Run(); err != nil && ctx.Err() == nil {
return errors.Wrapf(err, "failed to run cockroach debug merge-logs:\n%v", errBuf.String())
}
Expand Down Expand Up @@ -2632,7 +2638,7 @@ func (c *SyncedCluster) pgurls(
}
m := make(map[Node]string, len(hosts))
for node, host := range hosts {
desc, err := c.DiscoverService(ctx, node, virtualClusterName, ServiceTypeSQL, sqlInstance)
desc, err := c.DiscoverService(ctx, l, node, virtualClusterName, ServiceTypeSQL, sqlInstance)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -2667,7 +2673,7 @@ func (c *SyncedCluster) loadBalancerURL(
sqlInstance int,
auth PGAuthMode,
) (string, error) {
services, err := c.DiscoverServices(ctx, virtualClusterName, ServiceTypeSQL)
services, err := c.DiscoverServices(ctx, l, virtualClusterName, ServiceTypeSQL)
if err != nil {
return "", err
}
Expand Down Expand Up @@ -2802,6 +2808,8 @@ func scp(ctx context.Context, l *logger.Logger, src, dest string) (*RunResultDet
cmd := exec.CommandContext(ctx, args[0], args[1:]...)
cmd.WaitDelay = time.Second // make sure the call below returns when the context is canceled

cli.MaybeLogCmd(context.Background(), l, cmd)

out, err := cmd.CombinedOutput()
if err != nil {
err = rperrors.NewSSHError(errors.Wrapf(err, "~ %s\n%s", strings.Join(args, " "), out))
Expand Down Expand Up @@ -2993,10 +3001,10 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e

// allPublicAddrs returns a string that can be used when starting cockroach to
// indicate the location of all nodes in the cluster.
func (c *SyncedCluster) allPublicAddrs(ctx context.Context) (string, error) {
func (c *SyncedCluster) allPublicAddrs(ctx context.Context, l *logger.Logger) (string, error) {
var addrs []string
for _, node := range c.Nodes {
port, err := c.NodePort(ctx, node, "" /* virtualClusterName */, 0 /* sqlInstance */)
port, err := c.NodePort(ctx, l, node, "" /* virtualClusterName */, 0 /* sqlInstance */)
if err != nil {
return "", err
}
Expand Down
Loading

0 comments on commit e77e603

Please sign in to comment.