From a2641265d8758b939a84217b74370929bc7f891e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 21 Dec 2021 13:25:50 +0100 Subject: [PATCH] Debug CI. --- .buildkite/pipeline.yml | 393 +--------------------------------------- test/sorting.jl | 272 +++++++++++++-------------- 2 files changed, 137 insertions(+), 528 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 9a322fd4f6..0ac73793c3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -2,31 +2,13 @@ steps: # Julia versions - - label: "Julia 1.6" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 120 - - label: "Julia 1.6 (debug)" plugins: - JuliaCI/julia#v1: version: 1.6 - JuliaCI/julia-test#v1: julia_args: "-g2" - test_args: "--sanitize --quickfail --jobs=1" + test_args: "--sanitize --quickfail --jobs=1 sorting" - JuliaCI/julia-coverage#v1: codecov: true dirs: @@ -46,379 +28,6 @@ steps: !build.pull_request.draft timeout_in_minutes: 180 - - label: "Julia 1.7" - plugins: - - JuliaCI/julia#v1: - version: 1.7 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "Julia 1.8" - plugins: - - JuliaCI/julia#v1: - version: nightly - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - soft_fail: - - exit_status: 1 - - - # CUDA versions (from artifacts) - - # NOTE: we support those CUDA versions for which the latest cuDNN is available - # https://developer.nvidia.com/rdp/cudnn-archive - - # NOTE: only the latest version of CUDA that has CUDNN & CUTENSOR is tested thoroughly. - # once we have sufficient CI resources, expand this to all CUDA versions again. - - - label: "CUDA 11.5" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - env: - JULIA_CUDA_VERSION: '11.5' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 11.4" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - test_args: "--thorough" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.4' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 11.3" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.3' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 11.2" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.2' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 11.1" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.1' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 11.0" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.0' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 10.2" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "10.2" - cap: "sm_75" - env: - JULIA_CUDA_VERSION: '10.2' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - label: "CUDA 10.1" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "10.1" - cap: "sm_75" - env: - JULIA_CUDA_VERSION: '10.1' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - - - # special tests - - # - label: "Windows" - # plugins: - # - JuliaCI/julia#v1: - # version: 1.6 - # - JuliaCI/julia-test#v1: ~ - # # XXX: no coverage, as no secrets on Windows - # agents: - # queue: "juliagpu-windows" - # cuda: "*" - # if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - # timeout_in_minutes: 120 - - - label: "NNlibCUDA.jl" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - command: | - julia -e 'using Pkg; - - println("--- :julia: Instantiating project"); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add(PackageSpec(name="NNlibCUDA", rev="master")); - Pkg.instantiate(); - - println("+++ :julia: Running tests"); - Pkg.test("NNlibCUDA"; coverage=true);' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 60 - - - # other tasks - - # we want to benchmark every commit on the master branch, even if it failed CI - - wait: ~ - continue_on_failure: true - - # if we will submit results, use the benchmark queue so that we will - # be running on the same system each time - - label: "Benchmarks on 1.6" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - env: - BENCHMARKS: "true" - CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - CODESPEED_EXECUTABLE: "Julia 1.6" - command: | - julia --project -e ' - ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "benchmark" - cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch =~ /^master$$/ && - !build.pull_request.draft - timeout_in_minutes: 30 - - - label: "Benchmarks on 1.7" - plugins: - - JuliaCI/julia#v1: - version: 1.7 - env: - BENCHMARKS: "true" - CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - CODESPEED_EXECUTABLE: "Julia 1.7" - command: | - julia --project -e ' - ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "benchmark" - cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch =~ /^master$$/ && - !build.pull_request.draft - timeout_in_minutes: 30 - - - wait - - # benchmarks outside of the master branch don't submit their results, - # so they can run on any system in the juliagpu queue. - - label: "Benchmarks (dry run)" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - command: | - julia --project -e ' - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch !~ /^master$$/ && - !build.pull_request.draft - timeout_in_minutes: 30 - - - label: "Documentation" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - command: | - julia --project -e ' - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("docs") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - - println("+++ :julia: Building documentation") - include("docs/make.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft - timeout_in_minutes: 30 - env: diff --git a/test/sorting.jl b/test/sorting.jl index d27cf27789..c3cd4294af 100644 --- a/test/sorting.jl +++ b/test/sorting.jl @@ -6,19 +6,19 @@ using DataStructures import CUDA.QuickSortImpl: flex_lt, find_partition, quicksort!, partition_batches_kernel, consolidate_batch_partition, bubble_sort -@testset "integer functions" begin - @test flex_lt(1, 2, false, isless, identity) == true - @test flex_lt(1, 2, true, isless, identity) == true - @test flex_lt(2, 2, false, isless, identity) == false - @test flex_lt(2, 2, true, isless, identity) == true - @test flex_lt(3, 2, false, isless, identity) == false - @test flex_lt(3, 2, true, isless, identity) == false - - @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, false, isless, identity) == 4 - @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, false, isless, identity) == 9 - @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, true, isless, identity) == 3 - @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, true, isless, identity) == 8 -end +# @testset "integer functions" begin +# @test flex_lt(1, 2, false, isless, identity) == true +# @test flex_lt(1, 2, true, isless, identity) == true +# @test flex_lt(2, 2, false, isless, identity) == false +# @test flex_lt(2, 2, true, isless, identity) == true +# @test flex_lt(3, 2, false, isless, identity) == false +# @test flex_lt(3, 2, true, isless, identity) == false + +# @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, false, isless, identity) == 4 +# @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, false, isless, identity) == 9 +# @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 0, 5, true, isless, identity) == 3 +# @test find_partition([1, 2, 2, 3, 4, 1, 2, 2, 3, 4], 3, 5, 10, true, isless, identity) == 8 +# end function test_batch_partition(T, N, lo, hi, seed, lt=isless, by=identity) my_range = lo + 1 : hi @@ -60,28 +60,28 @@ function test_batch_partition(T, N, lo, hi, seed, lt=isless, by=identity) @test sort_match end -@testset "batch partition" begin - test_batch_partition(Int8, 10000, 2000, 6000, 0) - test_batch_partition(Int8, 10000, 2000, 6000, 1) - test_batch_partition(Int8, 10000000, 0, 10000000, 0) - test_batch_partition(Int8, 10000000, 5000, 500000, 0) - test_batch_partition(Int8, 10000, 0, 10000, 0) - test_batch_partition(Int8, 10000, 2000, 6000, 0) - test_batch_partition(Int8, 10000, 2000, 6000, 1) - test_batch_partition(Int8, 10000000, 0, 10000000, 0) - test_batch_partition(Int8, 10000000, 5000, 500000, 0) - - test_batch_partition(Float32, 10000, 0, 10000, 0) - test_batch_partition(Float32, 10000, 2000, 6000, 0) - test_batch_partition(Float32, 10000, 2000, 6000, 1) - test_batch_partition(Float32, 10000000, 0, 10000000, 0) - test_batch_partition(Float32, 10000000, 5000, 500000, 0) - test_batch_partition(Float32, 10000, 0, 10000, 0) - test_batch_partition(Float32, 10000, 2000, 6000, 0) - test_batch_partition(Float32, 10000, 2000, 6000, 1) - test_batch_partition(Float32, 10000000, 0, 10000000, 0) - test_batch_partition(Float32, 10000000, 5000, 500000, 0) -end +# @testset "batch partition" begin +# test_batch_partition(Int8, 10000, 2000, 6000, 0) +# test_batch_partition(Int8, 10000, 2000, 6000, 1) +# test_batch_partition(Int8, 10000000, 0, 10000000, 0) +# test_batch_partition(Int8, 10000000, 5000, 500000, 0) +# test_batch_partition(Int8, 10000, 0, 10000, 0) +# test_batch_partition(Int8, 10000, 2000, 6000, 0) +# test_batch_partition(Int8, 10000, 2000, 6000, 1) +# test_batch_partition(Int8, 10000000, 0, 10000000, 0) +# test_batch_partition(Int8, 10000000, 5000, 500000, 0) + +# test_batch_partition(Float32, 10000, 0, 10000, 0) +# test_batch_partition(Float32, 10000, 2000, 6000, 0) +# test_batch_partition(Float32, 10000, 2000, 6000, 1) +# test_batch_partition(Float32, 10000000, 0, 10000000, 0) +# test_batch_partition(Float32, 10000000, 5000, 500000, 0) +# test_batch_partition(Float32, 10000, 0, 10000, 0) +# test_batch_partition(Float32, 10000, 2000, 6000, 0) +# test_batch_partition(Float32, 10000, 2000, 6000, 1) +# test_batch_partition(Float32, 10000000, 0, 10000000, 0) +# test_batch_partition(Float32, 10000000, 5000, 500000, 0) +# end function test_consolidate_kernel(vals, pivot, my_floor, L, b_sums, dest, parity, lt, by) i = threadIdx().x @@ -130,29 +130,29 @@ function test_consolidate_partition(T, N, lo, hi, seed, block_dim, lt=isless, by @test all(post_sort[partition + 1 : hi] |> cc .== filter(x -> x >= pivot, temp) |> cc) end -@testset "consolidate partition" begin - test_consolidate_partition(Int8, 10000, 0, 10000, 0, 16) - test_consolidate_partition(Int8, 10000, 0, 10000, 0, 32) - test_consolidate_partition(Int8, 10000, 0, 10000, 0, 64) - test_consolidate_partition(Int8, 10000, 9, 6333, 0, 16) - test_consolidate_partition(Int8, 10000, 9, 6333, 0, 32) - test_consolidate_partition(Int8, 10000, 9, 6333, 0, 64) - test_consolidate_partition(Int8, 10000, 129, 9999, 0, 16) - test_consolidate_partition(Int8, 10000, 129, 9999, 0, 32) - test_consolidate_partition(Int8, 10000, 129, 9999, 0, 64) - test_consolidate_partition(Int8, 10000, 0, 10000, 1, 16) - test_consolidate_partition(Int8, 10000, 0, 10000, 2, 32) - test_consolidate_partition(Int8, 10000, 0, 10000, 3, 64) - test_consolidate_partition(Int8, 10000, 9, 6333, 4, 16) - test_consolidate_partition(Int8, 10000, 9, 6333, 5, 32) - test_consolidate_partition(Int8, 10000, 9, 6333, 6, 64) - test_consolidate_partition(Int8, 10000, 129, 9999, 7, 16) - test_consolidate_partition(Int8, 10000, 129, 9999, 8, 32) - test_consolidate_partition(Int8, 10000, 129, 9999, 9, 64) - test_consolidate_partition(Int8, 10000, 3329, 9999, 10, 16) - test_consolidate_partition(Int8, 10000, 3329, 9999, 11, 32) - test_consolidate_partition(Int8, 10000, 3329, 9999, 12, 64) -end +# @testset "consolidate partition" begin +# test_consolidate_partition(Int8, 10000, 0, 10000, 0, 16) +# test_consolidate_partition(Int8, 10000, 0, 10000, 0, 32) +# test_consolidate_partition(Int8, 10000, 0, 10000, 0, 64) +# test_consolidate_partition(Int8, 10000, 9, 6333, 0, 16) +# test_consolidate_partition(Int8, 10000, 9, 6333, 0, 32) +# test_consolidate_partition(Int8, 10000, 9, 6333, 0, 64) +# test_consolidate_partition(Int8, 10000, 129, 9999, 0, 16) +# test_consolidate_partition(Int8, 10000, 129, 9999, 0, 32) +# test_consolidate_partition(Int8, 10000, 129, 9999, 0, 64) +# test_consolidate_partition(Int8, 10000, 0, 10000, 1, 16) +# test_consolidate_partition(Int8, 10000, 0, 10000, 2, 32) +# test_consolidate_partition(Int8, 10000, 0, 10000, 3, 64) +# test_consolidate_partition(Int8, 10000, 9, 6333, 4, 16) +# test_consolidate_partition(Int8, 10000, 9, 6333, 5, 32) +# test_consolidate_partition(Int8, 10000, 9, 6333, 6, 64) +# test_consolidate_partition(Int8, 10000, 129, 9999, 7, 16) +# test_consolidate_partition(Int8, 10000, 129, 9999, 8, 32) +# test_consolidate_partition(Int8, 10000, 129, 9999, 9, 64) +# test_consolidate_partition(Int8, 10000, 3329, 9999, 10, 16) +# test_consolidate_partition(Int8, 10000, 3329, 9999, 11, 32) +# test_consolidate_partition(Int8, 10000, 3329, 9999, 12, 64) +# end function init_case(T, f, N::Integer) a = map(x -> T(f(x)), 1:N) @@ -256,86 +256,86 @@ function check_sortperm(T, N; kwargs...) end # Makes sure that non-maximally-large block sizes don't result in race conds -@testset "reduced block sizes" begin - function init() - a = map(x -> x%UInt8, reverse(1:100000)) - c = CuArray(a) - a, c - end - - function check(block_size_shift) - original_arr, device_arr = init() - sort!(device_arr) - quicksort!(device_arr; lt=isless, by=identity, dims=1, block_size_shift=block_size_shift) - host_result = Array(device_arr) - @test check_equivalence(original_arr, host_result) - end - - # repeat since race-conditions non-deterministic - for x in 1:25 - check(1) - check(2) - check(3) - check(4) - end -end +# @testset "reduced block sizes" begin +# function init() +# a = map(x -> x%UInt8, reverse(1:100000)) +# c = CuArray(a) +# a, c +# end + +# function check(block_size_shift) +# original_arr, device_arr = init() +# sort!(device_arr) +# quicksort!(device_arr; lt=isless, by=identity, dims=1, block_size_shift=block_size_shift) +# host_result = Array(device_arr) +# @test check_equivalence(original_arr, host_result) +# end + +# # repeat since race-conditions non-deterministic +# for x in 1:25 +# check(1) +# check(2) +# check(3) +# check(4) +# end +# end @testset "interface" begin - @testset "quicksort" begin - # pre-sorted - @test check_sort!(Int, 1000000; alg=CUDA.QuickSort) - @test check_sort!(Int32, 1000000; alg=CUDA.QuickSort) - @test check_sort!(Float64, 1000000; alg=CUDA.QuickSort) - @test check_sort!(Float32, 1000000; alg=CUDA.QuickSort) - @test check_sort!(Int32, 1000000; rev=true) - @test check_sort!(Float32, 1000000; rev=true) - - # reverse sorted - @test check_sort!(Int32, 1000000, x -> -x; alg=CUDA.QuickSort) - @test check_sort!(Float32, 1000000, x -> -x; alg=CUDA.QuickSort) - @test check_sort!(Int32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort) - @test check_sort!(Float32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort) - - @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.QuickSort) - @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.QuickSort) - @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.QuickSort) - @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort) - @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort) - @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort) - @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.QuickSort) - - # non-uniform distributions - @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort) - @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3); alg=CUDA.QuickSort) - - # more copies of each value than can fit in one block - @test check_sort!(Int8, 4000000, x -> rand(Int8); alg=CUDA.QuickSort) - - # multiple dimensions - @test check_sort!(Int32, (4, 50000, 4); dims=2) - @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true) - - # large sizes - @test check_sort!(Float32, 2^25; alg=CUDA.QuickSort) - - # various sync depths - for depth in 0:4 - CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth) - @test check_sort!(Int, 100000, x -> rand(Int); alg=CUDA.QuickSort) - end - - # using a `by` argument - @test check_sort(Float32, 100000; by=x->abs(x - 0.5), alg=CUDA.QuickSort) - @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1) - @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2) - @test check_sort!(Float64, 400000; by=x->8*x-round(8*x), alg=CUDA.QuickSort) - @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1) - @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2) - # target bubble sort by using sub-blocksize input: - @test check_sort!(Int, 200; by=x->x % 2, alg=CUDA.QuickSort) - @test check_sort!(Int, 200; by=x->x % 3, alg=CUDA.QuickSort) - @test check_sort!(Int, 200; by=x->x % 4, alg=CUDA.QuickSort) - end # end quicksort tests + # @testset "quicksort" begin + # # pre-sorted + # @test check_sort!(Int, 1000000; alg=CUDA.QuickSort) + # @test check_sort!(Int32, 1000000; alg=CUDA.QuickSort) + # @test check_sort!(Float64, 1000000; alg=CUDA.QuickSort) + # @test check_sort!(Float32, 1000000; alg=CUDA.QuickSort) + # @test check_sort!(Int32, 1000000; rev=true) + # @test check_sort!(Float32, 1000000; rev=true) + + # # reverse sorted + # @test check_sort!(Int32, 1000000, x -> -x; alg=CUDA.QuickSort) + # @test check_sort!(Float32, 1000000, x -> -x; alg=CUDA.QuickSort) + # @test check_sort!(Int32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort) + # @test check_sort!(Float32, 1000000, x -> -x; rev=true, alg=CUDA.QuickSort) + + # @test check_sort!(Int, 10000, x -> rand(Int); alg=CUDA.QuickSort) + # @test check_sort!(Int32, 10000, x -> rand(Int32); alg=CUDA.QuickSort) + # @test check_sort!(Int8, 10000, x -> rand(Int8); alg=CUDA.QuickSort) + # @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort) + # @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort) + # @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort) + # @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.QuickSort) + + # # non-uniform distributions + # @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort) + # @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 3); alg=CUDA.QuickSort) + + # # more copies of each value than can fit in one block + # @test check_sort!(Int8, 4000000, x -> rand(Int8); alg=CUDA.QuickSort) + + # # multiple dimensions + # @test check_sort!(Int32, (4, 50000, 4); dims=2) + # @test check_sort!(Int32, (4, 4, 50000); dims=3, rev=true) + + # # large sizes + # @test check_sort!(Float32, 2^25; alg=CUDA.QuickSort) + + # # various sync depths + # for depth in 0:4 + # CUDA.limit!(CUDA.LIMIT_DEV_RUNTIME_SYNC_DEPTH, depth) + # @test check_sort!(Int, 100000, x -> rand(Int); alg=CUDA.QuickSort) + # end + + # # using a `by` argument + # @test check_sort(Float32, 100000; by=x->abs(x - 0.5), alg=CUDA.QuickSort) + # @test check_sort!(Float32, (100000, 4); by=x->abs(x - 0.5), dims=1) + # @test check_sort!(Float32, (4, 100000); by=x->abs(x - 0.5), dims=2) + # @test check_sort!(Float64, 400000; by=x->8*x-round(8*x), alg=CUDA.QuickSort) + # @test check_sort!(Float64, (100000, 4); by=x->8*x-round(8*x), dims=1) + # @test check_sort!(Float64, (4, 100000); by=x->8*x-round(8*x), dims=2) + # # target bubble sort by using sub-blocksize input: + # @test check_sort!(Int, 200; by=x->x % 2, alg=CUDA.QuickSort) + # @test check_sort!(Int, 200; by=x->x % 3, alg=CUDA.QuickSort) + # @test check_sort!(Int, 200; by=x->x % 4, alg=CUDA.QuickSort) + # end # end quicksort tests @testset "bitonic sort" begin # test various types