diff --git a/Project.toml b/Project.toml index 4e3f3d9..fe2cdbb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BinomialGPU" uuid = "c5bbfde1-2136-42cd-9b65-d5719df69ebf" authors = ["Simone Carlo Surace"] -version = "0.4.2" +version = "0.4.3" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" diff --git a/src/kernels.jl b/src/kernels.jl index cb770a9..61b8533 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -33,7 +33,66 @@ end # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -## Kernel for scalar parameters +## Kernels for scalar parameters +function kernel_naive_scalar!(A, n, p, seed::UInt32, counter::UInt32) + device_rng = Random.default_rng() + + # initialize the state + @inbounds Random.seed!(device_rng, seed, counter) + + # grid-stride loop + tid = threadIdx().x + window = (blockDim().x - 1i32) * gridDim().x + offset = (blockIdx().x - 1i32) * blockDim().x + + while offset < length(A) + i = tid + offset + + k = 0 + ctr = 1 + while ctr <= n + rand(Float32) < p && (k += 1) + ctr += 1 + end + + if i <= length(A) + @inbounds A[i] = k + end + offset += window + end + return nothing +end +function kernel_inversion_scalar!(A, n, p, seed::UInt32, counter::UInt32) + device_rng = Random.default_rng() + + # initialize the state + @inbounds Random.seed!(device_rng, seed, counter) + + # grid-stride loop + tid = threadIdx().x + window = (blockDim().x - 1i32) * gridDim().x + offset = (blockIdx().x - 1i32) * blockDim().x + + while offset < length(A) + i = tid + offset + + logp = CUDA.log(1f0-p) + geom_sum = 0f0 + k = 0 + while true + geom = ceil(CUDA.log(rand(Float32)) / logp) + geom_sum += geom + geom_sum > n && break + k += 1 + end + + if i <= length(A) + @inbounds A[i] = k + end + offset += window + end + return nothing +end function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32) device_rng = Random.default_rng() @@ -49,80 +108,47 @@ function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32) while offset < length(A) i = tid + offset - # edge cases - if p <= 0 || n <= 0 - k = 0 - elseif p >= 1 - k = n - # Use naive algorithm for n <= 17 - elseif n <= 17 - k = 0 - ctr = 1 - while ctr <= n - rand(Float32) < p && (k += 1) - ctr += 1 - end - # Use inversion algorithm for n*p < 10 - elseif n * p < 10f0 - logp = CUDA.log(1f0-p) - geom_sum = 0f0 - k = 0 - while true - geom = ceil(CUDA.log(rand(Float32)) / logp) - geom_sum += geom - geom_sum > n && break - k += 1 + r = p/(1f0-p) + s = p*(1f0-p) + + stddev = sqrt(n * s) + b = 1.15f0 + 2.53f0 * stddev + a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p + c = n * p + 0.5f0 + v_r = 0.92f0 - 4.2f0 / b + + alpha = (2.83f0 + 5.1f0 / b) * stddev; + m = floor((n + 1) * p) + + ks = 0f0 + + while true + usample = rand(Float32) - 0.5f0 + vsample = rand(Float32) + + us = 0.5f0 - abs(usample) + ks = floor((2 * a / us + b) * usample + c) + + if us >= 0.07f0 && vsample <= v_r + break end - # BTRS algorithm - else - # BTRS approximations work well for p <= 0.5 - # invert p and set `invert` flag - (invert = p > 0.5f0) && (p = 1f0 - p) - r = p/(1f0-p) - s = p*(1f0-p) - - stddev = sqrt(n * s) - b = 1.15f0 + 2.53f0 * stddev - a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p - c = n * p + 0.5f0 - v_r = 0.92f0 - 4.2f0 / b - - alpha = (2.83f0 + 5.1f0 / b) * stddev; - m = floor((n + 1) * p) + if ks < 0 || ks > n + continue + end - ks = 0f0 - - while true - usample = rand(Float32) - 0.5f0 - vsample = rand(Float32) - - us = 0.5f0 - abs(usample) - ks = floor((2 * a / us + b) * usample + c) - - if us >= 0.07f0 && vsample <= v_r - break - end - - if ks < 0 || ks > n - continue - end - - v2 = CUDA.log(vsample * alpha / (a / (us * us) + b)) - ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) + - (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) + - (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) + - stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks) - if v2 <= ub - break - end + v2 = CUDA.log(vsample * alpha / (a / (us * us) + b)) + ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) + + (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) + + (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) + + stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks) + if v2 <= ub + break end - invert && (ks = n - ks) - k = Int(ks) end if i <= length(A) - @inbounds A[i] = k + @inbounds A[i] = ks end offset += window end @@ -165,6 +191,9 @@ function kernel_BTRS!( @inbounds n = count[I1] @inbounds p = prob[CartesianIndex(I1, I2)] end + # BTRS approximations work well for p <= 0.5 + # invert p and set `invert` flag + (invert = p > 0.5f0) && (p = 1-p) else n = 0 p = 0f0 @@ -197,10 +226,6 @@ function kernel_BTRS!( end # BTRS algorithm else - # BTRS approximations work well for p <= 0.5 - # invert p and set `invert` flag - (invert = p > 0.5f0) && (p = 1f0 - p) - r = p/(1f0-p) s = p*(1f0-p) @@ -239,12 +264,15 @@ function kernel_BTRS!( break end end - invert && (ks = n - ks) k = Int(ks) end if i <= length(A) - @inbounds A[i] = k + if invert + @inbounds A[i] = n - k + else + @inbounds A[i] = k + end end offset += window end diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index 509129c..09e1641 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -65,20 +65,56 @@ end ## constant (scalar) parameters function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractFloat) - kernel = @cuda launch=false kernel_BTRS_scalar!( - A, count, Float32(prob), rng.seed, rng.counter - ) + n = count + + # edge cases + if prob <= 0 || n <= 0 + A .= 0 + return A + elseif prob >= 1 + A .= n + return A + end + + invert = prob > 0.5f0 + if invert + p = 1 - prob + else + p = prob + end + + # Use naive algorithm for n <= 17 + if n <= 17 + kernel = @cuda launch=false kernel_naive_scalar!( + A, n, Float32(p), rng.seed, rng.counter + ) + # Use inversion algorithm for n*p < 10 + elseif n * p < 10f0 + kernel = @cuda launch=false kernel_inversion_scalar!( + A, n, Float32(p), rng.seed, rng.counter + ) + # BTRS algorithm + else + kernel = @cuda launch=false kernel_BTRS_scalar!( + A, n, Float32(p), rng.seed, rng.counter + ) + end + config = launch_configuration(kernel.fun) threads = max(32, min(config.threads, length(A))) blocks = min(config.blocks, cld(length(A), threads)) - kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks) + kernel(A, n, Float32(p), rng.seed, rng.counter; threads=threads, blocks=blocks) new_counter = Int64(rng.counter) + length(A) overflow, remainder = fldmod(new_counter, typemax(UInt32)) rng.seed += overflow # XXX: is this OK? rng.counter = remainder - return A + if invert + return n .- A + else + return A + end end ## arrays of parameters diff --git a/test/Project.toml b/test/Project.toml index faeac78..ae5622a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 1af5c53..cf9ec21 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,7 @@ using BinomialGPU using CUDA +using Distributions +using Statistics using BenchmarkTools using Test @@ -92,22 +94,6 @@ using Test @test rand_binomial!(A, count = 2, prob = 1.5) == CUDA.fill(2, 256) # probabilities greater than 1 are equivalent to 1 @test_throws MethodError rand_binomial!(A, count = 5., prob = 0.5) # non-integer counts throw an error end - - @testset "benchmarks" begin - # benchmarks - A = CUDA.zeros(Int, 1024, 1024) - n = 128 - p = 0.5 - ns = CUDA.fill(128, (1024, 1024)) - ps = CUDA.rand(1024, 1024) - println("") - println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card") - display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p)) - println("") - println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card") - display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps)) - println("") - end end # in-place @testset "out-of-place" begin @@ -179,4 +165,55 @@ using Test end end end # out-of-place + + @testset "Distributional tests" begin + function mean_var_CI(m, S2, n, p, N, α) + truemean = n*p + truevar = n*p*(1-p) + a = quantile(Normal(), α/2) + b = quantile(Normal(), 1-α/2) + c = quantile(Chisq(N-1), α/2) + d = quantile(Chisq(N-1), 1-α/2) + @test a <= sqrt(N/truevar)*(m - truemean) <= b + @test c <= (N-1)*S2/truevar <= d + end + @testset "Scalar parameters" begin + function test_mean_variance(N, n, p) + CUDA.@sync A = rand_binomial(N, count = n, prob = p) + mean_var_CI(mean(A), var(A), n, p, N, 1e-5) + end + N = 2^20 + @testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000], + p in 0.1:0.1:0.9 + test_mean_variance(N, n, p) + end + end + @testset "Arrays of parameters" begin + function test_mean_variance(N, n, p) + CUDA.@sync A = rand_binomial(N, count = fill(n, N), prob = fill(p, N)) + mean_var_CI(mean(A), var(A), n, p, N, 1e-5) + end + N = 2^20 + @testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000], + p in 0.1:0.1:0.9 + test_mean_variance(N, n, p) + end + end + end # Distributional tests + + @testset "benchmarks" begin + # benchmarks + A = CUDA.zeros(Int, 1024, 1024) + n = 128 + p = 0.5 + ns = CUDA.fill(128, (1024, 1024)) + ps = CUDA.rand(1024, 1024) + println("") + println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card") + display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p)) + println("") + println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card") + display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps)) + println("") + end end