From a2641265d8758b939a84217b74370929bc7f891e Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 21 Dec 2021 13:25:50 +0100
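Subject: [PATCH] Debug CI: run only the sorting tests in the debug job

Narrow CI while debugging:

* .buildkite/pipeline.yml: remove every step except "Julia 1.6 (debug)"
  and append `sorting` to that step's test_args.
* test/sorting.jl: comment out the "integer functions", "batch partition",
  "consolidate partition", "reduced block sizes" and "quicksort" testsets.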

---
 .buildkite/pipeline.yml | 393 +---------------------------------------
 test/sorting.jl         | 272 +++++++++++++--------------
 2 files changed, 137 insertions(+), 528 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 9a322fd4f6..0ac73793c3 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -2,31 +2,13 @@ steps:
 
   # Julia versions
 
-  - label: "Julia 1.6"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1:
-          test_args: "--quickfail"
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 120
-
   - label: "Julia 1.6 (debug)"
     plugins:
       - JuliaCI/julia#v1:
           version: 1.6
       - JuliaCI/julia-test#v1:
           julia_args: "-g2"
-          test_args: "--sanitize --quickfail --jobs=1"
+          test_args: "--sanitize --quickfail --jobs=1 sorting"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
@@ -46,379 +28,6 @@ steps:
         !build.pull_request.draft
     timeout_in_minutes: 180
 
-  - label: "Julia 1.7"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.7
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "Julia 1.8"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: nightly
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-    soft_fail:
-      - exit_status: 1
-
-
-    # CUDA versions (from artifacts)
-
-    # NOTE: we support those CUDA versions for which the latest cuDNN is available
-    #       https://developer.nvidia.com/rdp/cudnn-archive
-
-    # NOTE: only the latest version of CUDA that has CUDNN & CUTENSOR is tested thoroughly.
-    #       once we have sufficient CI resources, expand this to all CUDA versions again.
-
-  - label: "CUDA 11.5"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-    env:
-      JULIA_CUDA_VERSION: '11.5'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 11.4"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1:
-          test_args: "--thorough"
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-      cap: "sm_80"
-    env:
-      JULIA_CUDA_VERSION: '11.4'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 11.3"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-      cap: "sm_80"
-    env:
-      JULIA_CUDA_VERSION: '11.3'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 11.2"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-      cap: "sm_80"
-    env:
-      JULIA_CUDA_VERSION: '11.2'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 11.1"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-      cap: "sm_80"
-    env:
-      JULIA_CUDA_VERSION: '11.1'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 11.0"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "11.0"
-      cap: "sm_80"
-    env:
-      JULIA_CUDA_VERSION: '11.0'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 10.2"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "10.2"
-      cap: "sm_75"
-    env:
-      JULIA_CUDA_VERSION: '10.2'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-  - label: "CUDA 10.1"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    agents:
-      queue: "juliagpu"
-      cuda: "10.1"
-      cap: "sm_75"
-    env:
-      JULIA_CUDA_VERSION: '10.1'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 120
-
-
-   # special tests
-
-  # - label: "Windows"
-  #   plugins:
-  #     - JuliaCI/julia#v1:
-  #         version: 1.6
-  #     - JuliaCI/julia-test#v1: ~
-  #     # XXX: no coverage, as no secrets on Windows
-  #   agents:
-  #     queue: "juliagpu-windows"
-  #     cuda: "*"
-  #   if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-  #   timeout_in_minutes: 120
-
-  - label: "NNlibCUDA.jl"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-            - examples
-    command: |
-      julia -e 'using Pkg;
-
-                println("--- :julia: Instantiating project");
-                Pkg.develop(PackageSpec(path=pwd()));
-                Pkg.add(PackageSpec(name="NNlibCUDA", rev="master"));
-                Pkg.instantiate();
-
-                println("+++ :julia: Running tests");
-                Pkg.test("NNlibCUDA"; coverage=true);'
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
-    timeout_in_minutes: 60
-
-
-  # other tasks
-
-  # we want to benchmark every commit on the master branch, even if it failed CI
-  - wait: ~
-    continue_on_failure: true
-
-  # if we will submit results, use the benchmark queue so that we will
-  # be running on the same system each time
-  - label: "Benchmarks on 1.6"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-    env:
-      BENCHMARKS: "true"
-      CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
-      CODESPEED_BRANCH: "$BUILDKITE_BRANCH"
-      CODESPEED_COMMIT: "$BUILDKITE_COMMIT"
-      CODESPEED_EXECUTABLE: "Julia 1.6"
-    command: |
-      julia --project -e '
-        ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
-
-        println("--- :julia: Instantiating project")
-        using Pkg
-        Pkg.instantiate()
-        Pkg.activate("perf")
-        Pkg.instantiate()
-        push!(LOAD_PATH, @__DIR__)
-
-        println("+++ :julia: Benchmarking")
-        include("perf/runbenchmarks.jl")'
-    agents:
-      queue: "benchmark"
-      cuda: "*"
-    if: build.message !~ /\[skip benchmarks\]/ &&
-        build.branch =~ /^master$$/ &&
-        !build.pull_request.draft
-    timeout_in_minutes: 30
-
-  - label: "Benchmarks on 1.7"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.7
-    env:
-      BENCHMARKS: "true"
-      CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
-      CODESPEED_BRANCH: "$BUILDKITE_BRANCH"
-      CODESPEED_COMMIT: "$BUILDKITE_COMMIT"
-      CODESPEED_EXECUTABLE: "Julia 1.7"
-    command: |
-      julia --project -e '
-        ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
-
-        println("--- :julia: Instantiating project")
-        using Pkg
-        Pkg.instantiate()
-        Pkg.activate("perf")
-        Pkg.instantiate()
-        push!(LOAD_PATH, @__DIR__)
-
-        println("+++ :julia: Benchmarking")
-        include("perf/runbenchmarks.jl")'
-    agents:
-      queue: "benchmark"
-      cuda: "*"
-    if: build.message !~ /\[skip benchmarks\]/ &&
-        build.branch =~ /^master$$/ &&
-        !build.pull_request.draft
-    timeout_in_minutes: 30
-
-  - wait
-
-  # benchmarks outside of the master branch don't submit their results,
-  # so they can run on any system in the juliagpu queue.
-  - label: "Benchmarks (dry run)"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-    command: |
-      julia --project -e '
-        println("--- :julia: Instantiating project")
-        using Pkg
-        Pkg.instantiate()
-        Pkg.activate("perf")
-        Pkg.instantiate()
-        push!(LOAD_PATH, @__DIR__)
-
-        println("+++ :julia: Benchmarking")
-        include("perf/runbenchmarks.jl")'
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip benchmarks\]/ &&
-        build.branch !~ /^master$$/ &&
-        !build.pull_request.draft
-    timeout_in_minutes: 30
-
-  - label: "Documentation"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.6
-    command: |
-      julia --project -e '
-        println("--- :julia: Instantiating project")
-        using Pkg
-        Pkg.instantiate()
-        Pkg.activate("docs")
-        Pkg.instantiate()
-        push!(LOAD_PATH, @__DIR__)
-
-        println("+++ :julia: Building documentation")
-        include("docs/make.jl")'
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft
-    timeout_in_minutes: 30
-
 
 
 env:
diff --git a/test/sorting.jl b/test/sorting.jl
index d27cf27789..c3cd4294af 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -6,19 +6,19 @@ using DataStructures
 import CUDA.QuickSortImpl: flex_lt, find_partition, quicksort!,
         partition_batches_kernel, consolidate_batch_partition, bubble_sort
 
-@testset "integer functions" begin
-    @test flex_lt(1, 2, false, isless, identity) == true
-    @test flex_lt(1, 2, true, isless, identity) == true
-    @test flex_lt(2, 2, false, isless, identity) == false
-    @test flex_lt(2, 2, true, isless, identity) == true
-    @test flex_lt(3, 2, false, isless, identity) == false
-    @test flex_lt(3, 2, true, isless, identity) == false
-
-    @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, false, isless, identity) == 4
-    @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, false, isless, identity) == 9
-    @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, true, isless, identity) == 3
-    @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, true, isless, identity) == 8
-end
+# @testset "integer functions" begin
+#     @test flex_lt(1, 2, false, isless, identity) == true
+#     @test flex_lt(1, 2, true, isless, identity) == true
+#     @test flex_lt(2, 2, false, isless, identity) == false
+#     @test flex_lt(2, 2, true, isless, identity) == true
+#     @test flex_lt(3, 2, false, isless, identity) == false
+#     @test flex_lt(3, 2, true, isless, identity) == false
+
+#     @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, false, isless, identity) == 4
+#     @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, false, isless, identity) == 9
+#     @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, true, isless, identity) == 3
+#     @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, true, isless, identity) == 8
+# end
 
 function test_batch_partition(T, N, lo, hi, seed, lt=isless, by=identity)
     my_range = lo + 1 : hi
@@ -60,28 +60,28 @@ function test_batch_partition(T, N, lo, hi, seed, lt=isless, by=identity)
     @test sort_match
 end
 
-@testset "batch partition" begin
-    test_batch_partition(Int8, 10000, 2000, 6000, 0)
-    test_batch_partition(Int8, 10000, 2000, 6000, 1)
-    test_batch_partition(Int8, 10000000, 0, 10000000, 0)
-    test_batch_partition(Int8, 10000000, 5000, 500000, 0)
-    test_batch_partition(Int8, 10000, 0, 10000, 0)
-    test_batch_partition(Int8, 10000, 2000, 6000, 0)
-    test_batch_partition(Int8, 10000, 2000, 6000, 1)
-    test_batch_partition(Int8, 10000000, 0, 10000000, 0)
-    test_batch_partition(Int8, 10000000, 5000, 500000, 0)
-
-    test_batch_partition(Float32, 10000, 0, 10000, 0)
-    test_batch_partition(Float32, 10000, 2000, 6000, 0)
-    test_batch_partition(Float32, 10000, 2000, 6000, 1)
-    test_batch_partition(Float32, 10000000, 0, 10000000, 0)
-    test_batch_partition(Float32, 10000000, 5000, 500000, 0)
-    test_batch_partition(Float32, 10000, 0, 10000, 0)
-    test_batch_partition(Float32, 10000, 2000, 6000, 0)
-    test_batch_partition(Float32, 10000, 2000, 6000, 1)
-    test_batch_partition(Float32, 10000000, 0, 10000000, 0)
-    test_batch_partition(Float32, 10000000, 5000, 500000, 0)
-end
+# @testset "batch partition" begin
+#     test_batch_partition(Int8, 10000, 2000, 6000, 0)
+#     test_batch_partition(Int8, 10000, 2000, 6000, 1)
+#     test_batch_partition(Int8, 10000000, 0, 10000000, 0)
+#     test_batch_partition(Int8, 10000000, 5000, 500000, 0)
+#     test_batch_partition(Int8, 10000, 0, 10000, 0)
+#     test_batch_partition(Int8, 10000, 2000, 6000, 0)
+#     test_batch_partition(Int8, 10000, 2000, 6000, 1)
+#     test_batch_partition(Int8, 10000000, 0, 10000000, 0)
+#     test_batch_partition(Int8, 10000000, 5000, 500000, 0)
+
+#     test_batch_partition(Float32, 10000, 0, 10000, 0)
+#     test_batch_partition(Float32, 10000, 2000, 6000, 0)
+#     test_batch_partition(Float32, 10000, 2000, 6000, 1)
+#     test_batch_partition(Float32, 10000000, 0, 10000000, 0)
+#     test_batch_partition(Float32, 10000000, 5000, 500000, 0)
+#     test_batch_partition(Float32, 10000, 0, 10000, 0)
+#     test_batch_partition(Float32, 10000, 2000, 6000, 0)
+#     test_batch_partition(Float32, 10000, 2000, 6000, 1)
+#     test_batch_partition(Float32, 10000000, 0, 10000000, 0)
+#     test_batch_partition(Float32, 10000000, 5000, 500000, 0)
+# end
 
 function test_consolidate_kernel(vals, pivot, my_floor, L, b_sums, dest, parity, lt, by)
     i = threadIdx().x
@@ -130,29 +130,29 @@ function test_consolidate_partition(T, N, lo, hi, seed, block_dim, lt=isless, by
     @test all(post_sort[partition + 1 : hi] |> cc .== filter(x -> x >= pivot, temp) |> cc)
 end
 
-@testset "consolidate partition" begin
-    test_consolidate_partition(Int8, 10000, 0, 10000, 0, 16)
-    test_consolidate_partition(Int8, 10000, 0, 10000, 0, 32)
-    test_consolidate_partition(Int8, 10000, 0, 10000, 0, 64)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 0, 16)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 0, 32)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 0, 64)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 0, 16)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 0, 32)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 0, 64)
-    test_consolidate_partition(Int8, 10000, 0, 10000, 1, 16)
-    test_consolidate_partition(Int8, 10000, 0, 10000, 2, 32)
-    test_consolidate_partition(Int8, 10000, 0, 10000, 3, 64)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 4, 16)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 5, 32)
-    test_consolidate_partition(Int8, 10000, 9, 6333, 6, 64)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 7, 16)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 8, 32)
-    test_consolidate_partition(Int8, 10000, 129, 9999, 9, 64)
-    test_consolidate_partition(Int8, 10000, 3329, 9999, 10, 16)
-    test_consolidate_partition(Int8, 10000, 3329, 9999, 11, 32)
-    test_consolidate_partition(Int8, 10000, 3329, 9999, 12, 64)
-end
+# @testset "consolidate partition" begin
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 0, 16)
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 0, 32)
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 0, 64)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 0, 16)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 0, 32)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 0, 64)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 0, 16)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 0, 32)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 0, 64)
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 1, 16)
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 2, 32)
+#     test_consolidate_partition(Int8, 10000, 0, 10000, 3, 64)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 4, 16)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 5, 32)
+#     test_consolidate_partition(Int8, 10000, 9, 6333, 6, 64)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 7, 16)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 8, 32)
+#     test_consolidate_partition(Int8, 10000, 129, 9999, 9, 64)
+#     test_consolidate_partition(Int8, 10000, 3329, 9999, 10, 16)
+#     test_consolidate_partition(Int8, 10000, 3329, 9999, 11, 32)
+#     test_consolidate_partition(Int8, 10000, 3329, 9999, 12, 64)
+# end
 
 function init_case(T, f, N::Integer)
     a = map(x -> T(f(x)), 1:N)
@@ -256,86 +256,86 @@ function check_sortperm(T, N; kwargs...)
 end
 
 # Makes sure that non-maximally-large block sizes don't result in race conds
-@testset "reduced block sizes" begin
-    function init()
-        a = map(x -> x%UInt8, reverse(1:100000))
-        c = CuArray(a)
-        a, c
-    end
-
-    function check(block_size_shift)
-        original_arr, device_arr = init()
-        sort!(device_arr)
-        quicksort!(device_arr; lt=isless, by=identity, dims=1, block_size_shift=block_size_shift)
-        host_result = Array(device_arr)
-        @test check_equivalence(original_arr, host_result)
-    end
-
-    # repeat since race-conditions non-deterministic
-    for x in 1:25
-        check(1)
-        check(2)
-        check(3)
-        check(4)
-    end
-end
+# @testset "reduced block sizes" begin
+#     function init()
+#         a = map(x -> x%UInt8, reverse(1:100000))
+#         c = CuArray(a)
+#         a, c
+#     end
+
+#     function check(block_size_shift)
+#         original_arr, device_arr = init()
+#         sort!(device_arr)
+#         quicksort!(device_arr; lt=isless, by=identity, dims=1, block_size_shift=block_size_shift)
+#         host_result = Array(device_arr)
+#         @test check_equivalence(original_arr, host_result)
+#     end
+
+#     # repeat since race-conditions non-deterministic
+#     for x in 1:25
+#         check(1)
+#         check(2)
+#         check(3)
+#         check(4)
+#     end
+# end
 
 @testset "interface" begin
-    @testset "quicksort" begin
-        # pre-sorted
-        @test check_sort!(Int, 1000000; alg=CUDA.QuickSort)
-        @test check_sort!(Int32, 1000000; alg=CUDA.QuickSort)
-        @test check_sort!(Float64, 1000000; alg=CUDA.QuickSort)
-        @test check_sort!(Float32, 1000000; alg=CUDA.QuickSort)
-        @test check_sort!(Int32, 1000000; rev=true)
-        @test check_sort!(Float32, 1000000; rev=true)
-
-        # reverse sorted
-        @test check_sort!(Int32, 1000000, x -> -x; alg=CUDA.QuickSort)
-        @test check_sort!(Float32, 1000000, x -> -x; alg=CUDA.QuickSort)
-        @test check_sort!(Int32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
-        @test check_sort!(Float32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
-
-        @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.QuickSort)
-        @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.QuickSort)
-        @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.QuickSort)
-        @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort)
-        @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort)
-        @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort)
-        @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.QuickSort)
-
-        # non-uniform distributions
-        @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort)
-        @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3); alg=CUDA.QuickSort)
-
-        # more copies of each value than can fit in one block
-        @test check_sort!(Int8, 4000000, x -> rand(Int8); alg=CUDA.QuickSort)
-
-        # multiple dimensions
-        @test check_sort!(Int32, (4, 50000, 4); dims=2)
-        @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true)
-
-        # large sizes
-        @test check_sort!(Float32, 2^25; alg=CUDA.QuickSort)
-
-        # various sync depths
-        for depth in 0:4
-            CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth)
-            @test check_sort!(Int, 100000, x -> rand(Int); alg=CUDA.QuickSort)
-        end
-
-        # using a `by` argument
-        @test check_sort(Float32, 100000; by=x->abs(x - 0.5), alg=CUDA.QuickSort)
-        @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1)
-        @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2)
-        @test check_sort!(Float64, 400000; by=x->8*x-round(8*x), alg=CUDA.QuickSort)
-        @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1)
-        @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2)
-        # target bubble sort by using sub-blocksize input:
-        @test check_sort!(Int, 200; by=x->x % 2, alg=CUDA.QuickSort)
-        @test check_sort!(Int, 200; by=x->x % 3, alg=CUDA.QuickSort)
-        @test check_sort!(Int, 200; by=x->x % 4, alg=CUDA.QuickSort)
-    end # end quicksort tests
+    # @testset "quicksort" begin
+    #     # pre-sorted
+    #     @test check_sort!(Int, 1000000; alg=CUDA.QuickSort)
+    #     @test check_sort!(Int32, 1000000; alg=CUDA.QuickSort)
+    #     @test check_sort!(Float64, 1000000; alg=CUDA.QuickSort)
+    #     @test check_sort!(Float32, 1000000; alg=CUDA.QuickSort)
+    #     @test check_sort!(Int32, 1000000; rev=true)
+    #     @test check_sort!(Float32, 1000000; rev=true)
+
+    #     # reverse sorted
+    #     @test check_sort!(Int32, 1000000, x -> -x; alg=CUDA.QuickSort)
+    #     @test check_sort!(Float32, 1000000, x -> -x; alg=CUDA.QuickSort)
+    #     @test check_sort!(Int32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
+    #     @test check_sort!(Float32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort)
+
+    #     @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.QuickSort)
+    #     @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.QuickSort)
+    #     @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.QuickSort)
+    #     @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort)
+    #     @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort)
+    #     @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort)
+    #     @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.QuickSort)
+
+    #     # non-uniform distributions
+    #     @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort)
+    #     @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3); alg=CUDA.QuickSort)
+
+    #     # more copies of each value than can fit in one block
+    #     @test check_sort!(Int8, 4000000, x -> rand(Int8); alg=CUDA.QuickSort)
+
+    #     # multiple dimensions
+    #     @test check_sort!(Int32, (4, 50000, 4); dims=2)
+    #     @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true)
+
+    #     # large sizes
+    #     @test check_sort!(Float32, 2^25; alg=CUDA.QuickSort)
+
+    #     # various sync depths
+    #     for depth in 0:4
+    #         CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth)
+    #         @test check_sort!(Int, 100000, x -> rand(Int); alg=CUDA.QuickSort)
+    #     end
+
+    #     # using a `by` argument
+    #     @test check_sort(Float32, 100000; by=x->abs(x - 0.5), alg=CUDA.QuickSort)
+    #     @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1)
+    #     @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2)
+    #     @test check_sort!(Float64, 400000; by=x->8*x-round(8*x), alg=CUDA.QuickSort)
+    #     @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1)
+    #     @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2)
+    #     # target bubble sort by using sub-blocksize input:
+    #     @test check_sort!(Int, 200; by=x->x % 2, alg=CUDA.QuickSort)
+    #     @test check_sort!(Int, 200; by=x->x % 3, alg=CUDA.QuickSort)
+    #     @test check_sort!(Int, 200; by=x->x % 4, alg=CUDA.QuickSort)
+    # end # end quicksort tests
 
     @testset "bitonic sort" begin
         # test various types