diff --git a/.github/workflows/ci-build-crates.yml b/.github/workflows/ci-build-crates.yml index e12f6031cb8..8adf9e08047 100644 --- a/.github/workflows/ci-build-crates.yml +++ b/.github/workflows/ci-build-crates.yml @@ -99,6 +99,7 @@ jobs: build: name: Build ${{ matrix.crate }} crate + timeout-minutes: 90 needs: [ matrix, check-matrix ] runs-on: ubuntu-latest strategy: diff --git a/.github/workflows/ci-unit-tests-docker.yml b/.github/workflows/ci-unit-tests-docker.yml index fcd8e74d41f..bf15ce55930 100644 --- a/.github/workflows/ci-unit-tests-docker.yml +++ b/.github/workflows/ci-unit-tests-docker.yml @@ -91,6 +91,7 @@ jobs: # TODO: turn this test and the getblocktemplate test into a matrix, so the jobs use exactly the same diagnostics settings test-all: name: Test all + timeout-minutes: 180 runs-on: ubuntu-latest-xl needs: build steps: @@ -144,6 +145,7 @@ jobs: # (The gRPC feature is a zebrad feature, so it isn't needed here.) test-fake-activation-heights: name: Test with fake activation heights + timeout-minutes: 60 runs-on: ubuntu-latest needs: build steps: @@ -167,6 +169,7 @@ jobs: # (We activate the gRPC feature to avoid recompiling `zebrad`, but we don't actually run any gRPC tests.) test-empty-sync: name: Test checkpoint sync from empty state + timeout-minutes: 60 runs-on: ubuntu-latest needs: build steps: @@ -189,6 +192,7 @@ jobs: # (We activate the gRPC feature to avoid recompiling `zebrad`, but we don't actually run any gRPC tests.) 
test-lightwalletd-integration: name: Test integration with lightwalletd + timeout-minutes: 60 runs-on: ubuntu-latest needs: build steps: diff --git a/.github/workflows/sub-deploy-integration-tests-gcp.yml b/.github/workflows/sub-deploy-integration-tests-gcp.yml index e718eaa43af..336bbe6b360 100644 --- a/.github/workflows/sub-deploy-integration-tests-gcp.yml +++ b/.github/workflows/sub-deploy-integration-tests-gcp.yml @@ -104,146 +104,14 @@ env: CACHED_STATE_UPDATE_LIMIT: 576 jobs: - # set up and launch the test, if it doesn't use any cached state - # each test runs one of the *-with/without-cached-state job series, and skips the other - launch-without-cached-state: - name: Launch ${{ inputs.test_id }} test - if: ${{ !inputs.needs_zebra_state }} - runs-on: zfnd-runners - permissions: - contents: 'read' - id-token: 'write' - steps: - - uses: actions/checkout@v4.1.0 - with: - persist-credentials: false - fetch-depth: '2' - - uses: r7kamura/rust-problem-matchers@v1.4.0 - - - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v4 - with: - short-length: 7 - - # Makes the Zcash network name lowercase. - # - # Labels in GCP are required to be in lowercase, but the blockchain network - # uses sentence case, so we need to downcase ${{ inputs.network }}. - # - # Passes ${{ inputs.network }} to subsequent steps using $NETWORK env variable. 
- - name: Downcase network name for labels - run: | - NETWORK_CAPS="${{ inputs.network }}" - echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV" - - # Install our SSH secret - - name: Install private SSH key - uses: shimataro/ssh-key-action@v2.5.1 - with: - key: ${{ secrets.GCP_SSH_PRIVATE_KEY }} - name: google_compute_engine - known_hosts: unnecessary - - - name: Generate public SSH key - run: | - sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client - ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub - - # Setup gcloud CLI - - name: Authenticate to Google Cloud - id: auth - uses: google-github-actions/auth@v1.1.1 - with: - retries: '3' - workload_identity_provider: '${{ vars.GCP_WIF }}' - service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}' - - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v1.1.1 - - # Create a Compute Engine virtual machine - - name: Create ${{ inputs.test_id }} GCP compute instance - id: create-instance - run: | - gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ - --boot-disk-size 50GB \ - --boot-disk-type pd-ssd \ - --image-project=cos-cloud \ - --image-family=cos-stable \ - --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ - --container-image=gcr.io/google-containers/busybox \ - --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ - --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ - --scopes cloud-platform \ - --metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE \ - --metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \ - --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ - --tags ${{ inputs.app_name }} \ - --zone ${{ 
vars.GCP_ZONE }} - - # Format the mounted disk if the test doesn't use a cached state. - - name: Format ${{ inputs.test_id }} volume - shell: /usr/bin/bash -exo pipefail {0} - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command=' \ - set -ex; - # Extract the correct disk name based on the device-name - DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); - sudo mkfs.ext4 -v /dev/$DISK_NAME \ - ' - - # Launch the test without any cached state - - name: Launch ${{ inputs.test_id }} test - id: launch-test - shell: /usr/bin/bash -exo pipefail {0} - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command=' \ - set -ex; - # Extract the correct disk name based on the device-name - export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ - - sudo docker run \ - --name ${{ inputs.test_id }} \ - --tty \ - --detach \ - ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - ' - - # Show debug logs if previous job failed - - name: Show debug logs if previous job failed - if: ${{ failure() }} - shell: /usr/bin/bash -exo pipefail {0} - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ 
env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command=' \ - lsblk; - sudo lsof /dev/$DISK_NAME; - sudo dmesg; - sudo journalctl -b \ - ' - - # set up and launch the test, if it uses cached state - # each test runs one of the *-with/without-cached-state job series, and skips the other - launch-with-cached-state: - name: Launch ${{ inputs.test_id }} test - if: ${{ inputs.needs_zebra_state }} + # Show all the test logs, then follow the logs of the test we just launched, until it finishes. + # Then check the result of the test. + # + # If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours. + test-result: + name: Run ${{ inputs.test_id }} test runs-on: zfnd-runners + timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }} outputs: cached_disk_name: ${{ steps.get-disk-name.outputs.cached_disk_name }} permissions: @@ -314,6 +182,7 @@ jobs: # TODO: move this script into a file, and call it from sub-find-cached-disks.yml as well. - name: Find ${{ inputs.test_id }} cached state disk id: get-disk-name + if: ${{ inputs.needs_zebra_state || inputs.needs_lwd_state }} run: | set -x LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "$GITHUB_WORKSPACE/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1) @@ -381,18 +250,21 @@ jobs: echo "STATE_VERSION=$LOCAL_STATE_VERSION" >> "$GITHUB_ENV" echo "CACHED_DISK_NAME=$CACHED_DISK_NAME" >> "$GITHUB_ENV" + echo "DISK_OPTION=image=$CACHED_DISK_NAME," >> "$GITHUB_ENV" # Create a Compute Engine virtual machine and attach a cached state disk using the # $CACHED_DISK_NAME variable as the source image to populate the disk cached state + # if the test needs it. 
- name: Create ${{ inputs.test_id }} GCP compute instance id: create-instance + shell: /usr/bin/bash -x {0} run: | gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ --boot-disk-size 50GB \ --boot-disk-type pd-ssd \ --image-project=cos-cloud \ --image-family=cos-stable \ - --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ + --create-disk=${DISK_OPTION}name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ @@ -403,29 +275,10 @@ jobs: --tags ${{ inputs.app_name }} \ --zone ${{ vars.GCP_ZONE }} - # Launch the test with the previously created Zebra-only cached state. - # Each test runs one of the "Launch test" steps, and skips the other. - # - # SSH into the just created VM, and create a Docker container to run the incoming test - # from ${{ inputs.test_id }}, then mount the sudo docker volume created in the previous job. - # - # The disk mounted in the VM is located at /dev/$DISK_NAME, we mount the root `/` of this disk to the docker - # container in one path: - # - /var/cache/zebrad-cache -> ${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} -> $ZEBRA_CACHED_STATE_DIR - # - # This path must match the variable used by the tests in Rust, which are also set in - # `ci-unit-tests-docker.yml` to be able to run this tests. - # - # Although we're mounting the disk root, Zebra will only respect the values from - # $ZEBRA_CACHED_STATE_DIR. The inputs like ${{ inputs.zebra_state_dir }} are only used - # to match that variable paths. 
- - name: Launch ${{ inputs.test_id }} test - # This step only runs for tests that just read or write a Zebra state. - # - # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. - # TODO: we should find a better logic for this use cases - if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} - shell: /usr/bin/bash -exo pipefail {0} + # Format the mounted disk if the test doesn't use a cached state. + - name: Format ${{ inputs.test_id }} volume + if: ${{ !inputs.needs_zebra_state && !inputs.needs_lwd_state }} + shell: /usr/bin/bash -ex {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -435,36 +288,14 @@ jobs: --command=' \ set -ex; # Extract the correct disk name based on the device-name - export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ - - sudo docker run \ - --name ${{ inputs.test_id }} \ - --tty \ - --detach \ - ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - ' - - # Show debug logs if previous job failed - - name: Show debug logs if previous job failed - if: ${{ failure() && (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} - shell: /usr/bin/bash -exo pipefail {0} - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command=' \ - lsblk; - sudo lsof /dev/$DISK_NAME; - sudo dmesg; - sudo journalctl -b 
\ + DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); + sudo mkfs.ext4 -v /dev/$DISK_NAME \ ' - # Launch the test with the previously created Lightwalletd and Zebra cached state. - # Each test runs one of the "Launch test" steps, and skips the other. + # Launch the test with the previously created disk or cached state. + # + # This step uses a $MOUNT_FLAGS variable to mount the disk to the docker container. + # If the test needs Lightwalletd state, we add the Lightwalletd state mount to the $MOUNT_FLAGS variable. # # SSH into the just created VM, and create a Docker container to run the incoming test # from ${{ inputs.test_id }}, then mount the sudo docker volume created in the previous job. @@ -473,8 +304,8 @@ jobs: # VM and to the container might require more steps in this workflow, and additional # considerations. # - # The disk mounted in the VM is located at /dev/$DISK_NAME, we want the root `/` of this disk to be - # available in the docker container at two different paths: + # The disk mounted in the VM is located at /dev/$DISK_NAME, we mount the root `/` of this disk to the docker + # container, and might have two different paths (if lightwalletd state is needed): # - /var/cache/zebrad-cache -> ${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} -> $ZEBRA_CACHED_STATE_DIR # - /var/cache/lwd-cache -> ${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} -> $LIGHTWALLETD_DATA_DIR # @@ -484,19 +315,16 @@ jobs: # subdirectories for their data. (But Zebra, lightwalletd, and the test harness must not # delete the whole cache directory.) # - # This paths must match the variables used by the tests in Rust, which are also set in + # These paths must match the variables used by the tests in Rust, which are also set in # `ci-unit-tests-docker.yml` to be able to run this tests. 
# # Although we're mounting the disk root to both directories, Zebra and Lightwalletd # will only respect the values from $ZEBRA_CACHED_STATE_DIR and $LIGHTWALLETD_DATA_DIR, - # the inputs like ${{ inputs.lwd_state_dir }} are only used to match those variables paths. + # the inputs like ${{ inputs.zebra_state_dir }} and ${{ inputs.lwd_state_dir }} + # are only used to match those variables paths. - name: Launch ${{ inputs.test_id }} test - # This step only runs for tests that read or write Lightwalletd and Zebra states. - # - # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. - # TODO: we should find a better logic for this use cases - if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} - shell: /usr/bin/bash -exo pipefail {0} + id: launch-test + shell: /usr/bin/bash -x {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -504,24 +332,31 @@ jobs: --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=' \ - set -ex; + # Extract the correct disk name based on the device-name - export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-) + + MOUNT_FLAGS="--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }}" + + # Check if we need to mount for Lightwalletd state + # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. 
+ if [[ "${{ inputs.needs_lwd_state }}" == "true" || "${{ inputs.test_id }}" == "lwd-full-sync" ]]; then + MOUNT_FLAGS="$MOUNT_FLAGS --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }}" + fi sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ + ${MOUNT_FLAGS} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ ' # Show debug logs if previous job failed - name: Show debug logs if previous job failed - if: ${{ failure() && (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} - shell: /usr/bin/bash -exo pipefail {0} + if: ${{ failure() }} + shell: /usr/bin/bash -x {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -535,58 +370,6 @@ jobs: sudo journalctl -b \ ' - # Show all the test logs, then follow the logs of the test we just launched, until it finishes. - # Then check the result of the test. - # - # If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours. - test-result: - name: Run ${{ inputs.test_id }} test - # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. - needs: [ launch-with-cached-state, launch-without-cached-state ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. 
- if: ${{ !cancelled() }} - timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }} - runs-on: zfnd-runners - permissions: - contents: 'read' - id-token: 'write' - steps: - - uses: actions/checkout@v4.1.0 - with: - persist-credentials: false - fetch-depth: '2' - - - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v4 - with: - short-length: 7 - - # Install our SSH secret - - name: Install private SSH key - uses: shimataro/ssh-key-action@v2.5.1 - with: - key: ${{ secrets.GCP_SSH_PRIVATE_KEY }} - name: google_compute_engine - known_hosts: unnecessary - - - name: Generate public SSH key - run: | - sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client - ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub - - # Setup gcloud CLI - - name: Authenticate to Google Cloud - id: auth - uses: google-github-actions/auth@v1.1.1 - with: - retries: '3' - workload_identity_provider: '${{ vars.GCP_WIF }}' - service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}' - - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v1.1.1 - # Show all the logs since the container launched, # following until we see zebrad startup messages. # @@ -600,7 +383,7 @@ jobs: # # Errors in the tests are caught by the final test status job. - name: Check startup logs for ${{ inputs.test_id }} - shell: /usr/bin/bash -exo pipefail {0} + shell: /usr/bin/bash -x {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -608,10 +391,6 @@ jobs: --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=' \ - trap "" PIPE; - - # Temporarily disable "set -e" to handle the broken pipe error gracefully - set +e; sudo docker logs \ --tail all \ --follow \ @@ -633,7 +412,7 @@ jobs: # with that status. # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) 
- name: Result of ${{ inputs.test_id }} test - shell: /usr/bin/bash -exo pipefail {0} + shell: /usr/bin/bash -x {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -641,10 +420,6 @@ jobs: --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=' \ - trap "" PIPE; - - # Temporarily disable "set -e" to handle the broken pipe error gracefully - set +e; sudo docker logs \ --tail all \ --follow \ @@ -653,7 +428,6 @@ jobs: grep --max-count=1 --extended-regexp --color=always \ "test result: .*ok.* [1-9][0-9]* passed.*finished in"; LOGS_EXIT_STATUS=$?; - set -e; EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status"); echo "sudo docker exit status: $EXIT_STATUS"; @@ -672,7 +446,7 @@ jobs: create-state-image: name: Create ${{ inputs.test_id }} cached state image runs-on: ubuntu-latest - needs: [ test-result, launch-with-cached-state ] + needs: [ test-result ] # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. # Normally, if a job is skipped, all the jobs that depend on it are also skipped. # So we need to override the default success() check to make this job run. @@ -779,7 +553,7 @@ jobs: # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION, # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables. - name: Get database versions from logs - shell: /usr/bin/bash -exo pipefail {0} + shell: /usr/bin/bash -x {0} run: | INITIAL_DISK_DB_VERSION="" RUNNING_DB_VERSION="" @@ -869,7 +643,7 @@ jobs: # # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable. 
- name: Get sync height from logs - shell: /usr/bin/bash -exo pipefail {0} + shell: /usr/bin/bash -x {0} run: | SYNC_HEIGHT="" @@ -917,7 +691,7 @@ jobs: - name: Get original cached state height from google cloud run: | ORIGINAL_HEIGHT="0" - ORIGINAL_DISK_NAME="${{ format('{0}', needs.launch-with-cached-state.outputs.cached_disk_name) }}" + ORIGINAL_DISK_NAME="${{ format('{0}', needs.test-result.outputs.cached_disk_name) }}" if [[ -n "$ORIGINAL_DISK_NAME" ]]; then ORIGINAL_HEIGHT=$(gcloud compute images list --filter="status=READY AND name=$ORIGINAL_DISK_NAME" --format="value(labels.height)") diff --git a/zebra-state/src/service/finalized_state/disk_db.rs b/zebra-state/src/service/finalized_state/disk_db.rs index 3772fb7a789..c042ea6db0d 100644 --- a/zebra-state/src/service/finalized_state/disk_db.rs +++ b/zebra-state/src/service/finalized_state/disk_db.rs @@ -21,6 +21,7 @@ use std::{ use itertools::Itertools; use rlimit::increase_nofile_limit; +use rocksdb::ReadOptions; use zebra_chain::parameters::Network; use crate::{ @@ -194,7 +195,7 @@ pub trait ReadDisk { fn zs_first_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk; /// Returns the highest key in `cf`, and the corresponding value. @@ -203,7 +204,7 @@ pub trait ReadDisk { fn zs_last_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk; /// Returns the first key greater than or equal to `lower_bound` in `cf`, @@ -321,34 +322,22 @@ impl ReadDisk for DiskDb { fn zs_first_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk, { // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, rocksdb::IteratorMode::Start) - .next()? 
- .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, .., false).next() } #[allow(clippy::unwrap_in_result)] fn zs_last_key_value(&self, cf: &C) -> Option<(K, V)> where C: rocksdb::AsColumnFamilyRef, - K: FromDisk, + K: IntoDisk + FromDisk, V: FromDisk, { // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, rocksdb::IteratorMode::End) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, .., true).next() } #[allow(clippy::unwrap_in_result)] @@ -358,17 +347,8 @@ impl ReadDisk for DiskDb { K: IntoDisk + FromDisk, V: FromDisk, { - let lower_bound = lower_bound.as_bytes(); - let from = rocksdb::IteratorMode::From(lower_bound.as_ref(), rocksdb::Direction::Forward); - // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, from) - .next()? - .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, lower_bound.., false).next() } #[allow(clippy::unwrap_in_result)] @@ -378,17 +358,8 @@ impl ReadDisk for DiskDb { K: IntoDisk + FromDisk, V: FromDisk, { - let upper_bound = upper_bound.as_bytes(); - let from = rocksdb::IteratorMode::From(upper_bound.as_ref(), rocksdb::Direction::Reverse); - // Reading individual values from iterators does not seem to cause database hangs. - self.db - .iterator_cf(cf, from) - .next()? 
- .map(|(key_bytes, value_bytes)| { - Some((K::from_bytes(key_bytes), V::from_bytes(value_bytes))) - }) - .expect("unexpected database failure") + self.zs_range_iter(cf, ..=upper_bound, true).next() } fn zs_items_in_range_ordered(&self, cf: &C, range: R) -> BTreeMap @@ -398,7 +369,7 @@ impl ReadDisk for DiskDb { V: FromDisk, R: RangeBounds, { - self.zs_range_iter(cf, range).collect() + self.zs_range_iter(cf, range, false).collect() } fn zs_items_in_range_unordered(&self, cf: &C, range: R) -> HashMap @@ -408,7 +379,7 @@ impl ReadDisk for DiskDb { V: FromDisk, R: RangeBounds, { - self.zs_range_iter(cf, range).collect() + self.zs_range_iter(cf, range, false).collect() } } @@ -430,15 +401,24 @@ impl DiskWriteBatch { impl DiskDb { /// Returns an iterator over the items in `cf` in `range`. /// + /// Accepts a `reverse` argument. If it is `true`, creates the iterator with an + /// [`IteratorMode`](rocksdb::IteratorMode) of [`End`](rocksdb::IteratorMode::End), or + /// [`From`](rocksdb::IteratorMode::From) with [`Direction::Reverse`](rocksdb::Direction::Reverse). + /// /// Holding this iterator open might delay block commit transactions. - pub fn zs_range_iter(&self, cf: &C, range: R) -> impl Iterator + '_ + pub fn zs_range_iter( + &self, + cf: &C, + range: R, + reverse: bool, + ) -> impl Iterator + '_ where C: rocksdb::AsColumnFamilyRef, K: IntoDisk + FromDisk, V: FromDisk, R: RangeBounds, { - self.zs_range_iter_with_direction(cf, range, false) + self.zs_range_iter_with_direction(cf, range, reverse) } /// Returns a reverse iterator over the items in `cf` in `range`. 
@@ -495,11 +475,12 @@ impl DiskDb { let range = (start_bound, end_bound); let mode = Self::zs_iter_mode(&range, reverse); + let opts = Self::zs_iter_opts(&range); // Reading multiple items from iterators has caused database hangs, // in previous RocksDB versions self.db - .iterator_cf(cf, mode) + .iterator_cf_opt(cf, opts, mode) .map(|result| result.expect("unexpected database failure")) .map(|(key, value)| (key.to_vec(), value)) // Skip excluded "from" bound and empty ranges. The `mode` already skips keys @@ -514,6 +495,64 @@ impl DiskDb { .map(|(key, value)| (K::from_bytes(key), V::from_bytes(value))) } + /// Returns the RocksDB ReadOptions with a lower and upper bound for a range. + fn zs_iter_opts<R>(range: &R) -> ReadOptions + where + R: RangeBounds<Vec<u8>>, + { + let mut opts = ReadOptions::default(); + let (lower_bound, upper_bound) = Self::zs_iter_bounds(range); + + if let Some(bound) = lower_bound { + opts.set_iterate_lower_bound(bound); + }; + + if let Some(bound) = upper_bound { + opts.set_iterate_upper_bound(bound); + }; + + opts + } + + /// Returns the lower and upper iterate bounds for a range. + /// + /// Note: Since upper iterate bounds are always exclusive in RocksDB, this method + /// will increment the upper bound by 1 if the end bound of the provided range is + /// inclusive. If that bound can't be incremented, no upper bound is returned instead. + fn zs_iter_bounds<R>(range: &R) -> (Option<Vec<u8>>, Option<Vec<u8>>) + where + R: RangeBounds<Vec<u8>>, + { + use std::ops::Bound::*; + + let lower_bound = match range.start_bound() { + Included(bound) | Excluded(bound) => Some(bound.clone()), + Unbounded => None, + }; + + let upper_bound = match range.end_bound().cloned() { + Included(mut bound) => { + // Increment the last byte in the upper bound that is less than u8::MAX, and + // clear any bytes after it, to get the next big-endian number of the same length. + // If every byte wraps, a longer bound would sort before the key, so use no bound.
+ let is_wrapped_overflow = bound.iter_mut().rev().all(|v| { + *v = v.wrapping_add(1); + v == &0 + }); + + if is_wrapped_overflow { + None + } else { + Some(bound) + } + } + Excluded(bound) => Some(bound), + Unbounded => None, + }; + + (lower_bound, upper_bound) + } + /// Returns the RocksDB iterator "from" mode for `range`. /// /// RocksDB iterators are ordered by increasing key bytes by default. diff --git a/zebra-state/src/service/finalized_state/disk_db/tests.rs b/zebra-state/src/service/finalized_state/disk_db/tests.rs index 17613e8b3b5..20fecbbf127 100644 --- a/zebra-state/src/service/finalized_state/disk_db/tests.rs +++ b/zebra-state/src/service/finalized_state/disk_db/tests.rs @@ -24,3 +24,50 @@ impl DiskDb { rocksdb::DB::list_cf(&opts, path) } } + +/// Check that zs_iter_bounds returns an upper bound one greater than provided inclusive end bounds. +#[test] +fn zs_iter_opts_increments_key_by_one() { + let _init_guard = zebra_test::init(); + + // TODO: add an empty key (`()` type or `[]` when serialized) test case + let keys: [u32; 14] = [ + 0, + 1, + 200, + 255, + 256, + 257, + 65535, + 65536, + 65537, + 16777215, + 16777216, + 16777217, + 16777218, + u32::MAX, + ]; + + for key in keys { + let (_, bytes) = DiskDb::zs_iter_bounds(&..=key.to_be_bytes().to_vec()); + + // A key made only of u8::MAX bytes has no successor of the same length, + // so its inclusive end bound must leave the iterator unbounded above. + let expected_upper_bound = match key.checked_add(1) { + Some(next_key) => next_key, + None => { + assert_eq!(bytes, None, "an all-u8::MAX key should have no upper bound"); + continue; + } + }; + + let bytes = bytes.expect("there should be an upper bound"); + assert_eq!(bytes.len(), 4, "there should be no extra bytes"); + + let upper_bound = u32::from_be_bytes(bytes.try_into().expect("should be 4 bytes")); + assert_eq!( + expected_upper_bound, upper_bound, + "the upper bound should be 1 greater than the original key" + ); + } +} diff --git a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs
b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs index 58d9e43a6c1..77992a1128f 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs @@ -209,7 +209,7 @@ impl ZebraDb { R: std::ops::RangeBounds, { let sapling_trees = self.db.cf_handle("sapling_note_commitment_tree").unwrap(); - self.db.zs_range_iter(&sapling_trees, range) + self.db.zs_range_iter(&sapling_trees, range, false) } /// Returns the Sapling note commitment trees in the reversed range, in decreasing height order. @@ -282,7 +282,7 @@ impl ZebraDb { if let Some(exclusive_end_bound) = exclusive_end_bound { list = self .db - .zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound) + .zs_range_iter(&sapling_subtrees, start_index..exclusive_end_bound, false) .collect(); } else { // If there is no end bound, just return all the trees. @@ -291,7 +291,7 @@ impl ZebraDb { // the trees run out.) list = self .db - .zs_range_iter(&sapling_subtrees, start_index..) + .zs_range_iter(&sapling_subtrees, start_index.., false) .collect(); } @@ -382,7 +382,7 @@ impl ZebraDb { R: std::ops::RangeBounds, { let orchard_trees = self.db.cf_handle("orchard_note_commitment_tree").unwrap(); - self.db.zs_range_iter(&orchard_trees, range) + self.db.zs_range_iter(&orchard_trees, range, false) } /// Returns the Orchard note commitment trees in the reversed range, in decreasing height order. @@ -455,7 +455,7 @@ impl ZebraDb { if let Some(exclusive_end_bound) = exclusive_end_bound { list = self .db - .zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound) + .zs_range_iter(&orchard_subtrees, start_index..exclusive_end_bound, false) .collect(); } else { // If there is no end bound, just return all the trees. @@ -464,7 +464,7 @@ impl ZebraDb { // the trees run out.) list = self .db - .zs_range_iter(&orchard_subtrees, start_index..) + .zs_range_iter(&orchard_subtrees, start_index.., false) .collect(); }