diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 05bc4bf..1875e00 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,25 +1,36 @@ steps: # Julia versions - - label: "Julia 1.6, CUDA 11.2" + - label: "Julia 1.6" plugins: - JuliaCI/julia#v1: - version: 1.6-nightly + version: 1.6 - JuliaCI/julia-test#v1: - test_args: "--thorough" + test_args: "--quickfail" - JuliaCI/julia-coverage#v1: codecov: true dirs: - src agents: queue: "juliagpu" - cuda: "11.2" - cap: "recent" - env: - JULIA_CUDA_VERSION: '11.2' - JULIA_CUDA_USE_BINARYBUILDER: 'true' + cuda: "*" if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 120 + + - label: "Julia 1.7" + plugins: + - JuliaCI/julia#v1: + version: 1.7 + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 env: JULIA_PKG_SERVER: "" # it often struggles with our large artifacts diff --git a/Project.toml b/Project.toml index 2f1e82f..ff4661c 100644 --- a/Project.toml +++ b/Project.toml @@ -9,8 +9,8 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] -BenchmarkTools = "0.6, 0.7, 1.0" -CUDA = "2, 3.0" +BenchmarkTools = "1" +CUDA = "3" julia = "1.6" [extras] diff --git a/src/BinomialGPU.jl b/src/BinomialGPU.jl index f2b0910..6aa5237 100644 --- a/src/BinomialGPU.jl +++ b/src/BinomialGPU.jl @@ -3,6 +3,9 @@ module BinomialGPU using CUDA using Random +using CUDA: cuda_rng, i32 + + # user-level API include("rand_binomial.jl") export rand_binomial! diff --git a/src/kernels.jl b/src/kernels.jl index af7147d..1e33683 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -31,116 +31,243 @@ function stirling_approx_tail(k)::Float32 end -# BTRS algorithm, adapted from the tensorflow library -# (github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -function kernel_BTRS!( - A, count, prob, - R1, R2, Rp, Ra, - count_dim_larger_than_prob_dim, - seed::UInt32 -) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) - @inbounds Random.seed!(seed) +## Kernel for scalar parameters +function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32) + device_rng = Random.default_rng() - @inbounds if i <= length(A) - I = Ra[i] - Ip = Rp[I[1]] - I1 = R1[Ip[1]] - I2 = R2[Ip[2]] + # initialize the state + @inbounds Random.seed!(device_rng, seed, counter) - if count_dim_larger_than_prob_dim - n = count[CartesianIndex(I1, I2)] - p = prob[I1] - else - n = count[I1] - p = prob[CartesianIndex(I1, I2)] - end + # grid-stride loop + tid = threadIdx().x + window = (blockDim().x - 1i32) * gridDim().x + offset = (blockIdx().x - 1i32) * blockDim().x + + k = 0 + while offset < length(A) + i = tid + offset # edge cases if p <= 0 || n <= 0 - A[i] = 0 - return + k = 0 elseif p >= 1 - A[i] = n - return - end - + k = n # Use naive algorithm for n <= 17 - if n <= 17 + elseif n <= 17 k = 0 ctr = 1 while ctr <= n rand(Float32) < p && (k += 1) ctr += 1 end - A[i] = k - return - end - # Use inversion algorithm for n*p < 10 - if n * p < 10f0 - logp = log(1f0-p) + elseif n * p < 10f0 + logp = CUDA.log(1f0-p) geom_sum = 0f0 - num_geom = 0 + k = 0 while true - geom = ceil(log(rand(Float32)) / logp) + geom = ceil(CUDA.log(rand(Float32)) / logp) geom_sum += geom geom_sum > n && break - num_geom += 1 + k += 1 + end + # BTRS algorithm + else + # BTRS approximations work well for p <= 0.5 + # invert p and set `invert` flag + (invert = p > 0.5f0) && (p = 1f0 - p) + + r = p/(1f0-p) + s = p*(1f0-p) + + stddev = sqrt(n * s) + b = 1.15f0 + 2.53f0 * stddev + a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p + c = n * p + 0.5f0 + v_r = 0.92f0 - 4.2f0 / b + + alpha = (2.83f0 + 5.1f0 / b) * stddev; + m = floor((n + 1) * p) + + ks = 0f0 + + while true + usample = rand(Float32) - 0.5f0 + vsample = rand(Float32) + + us = 0.5f0 - abs(usample) + ks = floor((2 * a / us + b) * usample + c) + + if us >= 0.07f0 && vsample <= v_r + break + end + + if ks < 0 || ks > n + continue + end + + v2 = CUDA.log(vsample * alpha / (a / (us * us) + b)) + ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) + + (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) + + (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) + + stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks) + if v2 <= ub + break + end end - A[i] = num_geom - return + invert && (ks = n - ks) + k = Int(ks) end - # BTRS algorithm - # BTRS approximations work well for p <= 0.5 - (invert = p > 0.5f0) && (p = 1f0 - p) + if i <= length(A) + @inbounds A[i] = k + end + offset += window + end + return nothing +end - r = p/(1f0-p) - s = p*(1f0-p) - stddev = sqrt(n * s) - b = 1.15f0 + 2.53f0 * stddev - a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p - c = n * p + 0.5f0 - v_r = 0.92f0 - 4.2f0 / b +## Kernel for arrays of parameters +function kernel_BTRS!( + A, + count, prob, + seed::UInt32, counter::UInt32, + R1, R2, Rp, Ra, + count_dim_larger_than_prob_dim +) + device_rng = Random.default_rng() + + # initialize the state + @inbounds Random.seed!(device_rng, seed, counter) - alpha = (2.83f0 + 5.1f0 / b) * stddev; - m = floor((n + 1) * p) + # grid-stride loop + tid = threadIdx().x + window = (blockDim().x - 1i32) * gridDim().x + offset = (blockIdx().x - 1i32) * blockDim().x - while true - usample = rand(Float32) - 0.5f0 - vsample = rand(Float32) + while offset < length(A) + i = tid + offset - us = 0.5f0 - abs(usample) - ks = floor((2 * a / us + b) * usample + c) + # PARAMETER LOOKUP + if i <= length(A) + @inbounds I = Ra[i] + @inbounds Ip = Rp[I[1]] + @inbounds I1 = R1[Ip[1]] + @inbounds I2 = R2[Ip[2]] - if us >= 0.07f0 && vsample <= v_r - break + if count_dim_larger_than_prob_dim + @inbounds n = count[CartesianIndex(I1, I2)] + @inbounds p = prob[I1] + else + @inbounds n = count[I1] + @inbounds p = prob[CartesianIndex(I1, I2)] end + else + n = 0 + p = 0 + end - if ks < 0 || ks > n - continue + # SAMPLER + # edge cases + if p <= 0 || n <= 0 + k = 0 + elseif p >= 1 + k = n + # Use naive algorithm for n <= 17 + elseif n <= 17 + k = 0 + ctr = 1 + while ctr <= n + rand(Float32) < p && (k += 1) + ctr += 1 end + # Use inversion algorithm for n*p < 10 + elseif n * p < 10f0 + logp = CUDA.log(1f0-p) + geom_sum = 0f0 + k = 0 + while true + geom = ceil(CUDA.log(rand(Float32)) / logp) + geom_sum += geom + geom_sum > n && break + k += 1 + end + # BTRS algorithm + else + # BTRS approximations work well for p <= 0.5 + # invert p and set `invert` flag + (invert = p > 0.5f0) && (p = 1f0 - p) + + r = p/(1f0-p) + s = p*(1f0-p) + + stddev = sqrt(n * s) + b = 1.15f0 + 2.53f0 * stddev + a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p + c = n * p + 0.5f0 + v_r = 0.92f0 - 4.2f0 / b + + alpha = (2.83f0 + 5.1f0 / b) * stddev; + m = floor((n + 1) * p) - v2 = log(vsample * alpha / (a / (us * us) + b)) - ub = (m + 0.5f0) * log((m + 1) / (r * (n - m + 1))) + - (n + 1) * log((n - m + 1) / (n - ks + 1)) + - (ks + 0.5f0) * log(r * (n - ks + 1) / (ks + 1)) + - stirling_approx_tail(m) + stirling_approx_tail(n - m) - - stirling_approx_tail(ks) - stirling_approx_tail(n - ks) - if v2 <= ub - break + ks = 0f0 + + while true + usample = rand(Float32) - 0.5f0 + vsample = rand(Float32) + + us = 0.5f0 - abs(usample) + ks = floor((2 * a / us + b) * usample + c) + + if us >= 0.07f0 && vsample <= v_r + break + end + + if ks < 0 || ks > n + continue + end + + v2 = CUDA.log(vsample * alpha / (a / (us * us) + b)) + ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) + + (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) + + (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) + + stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks) + if v2 <= ub + break + end end + invert && (ks = n - ks) + k = Int(ks) end - # if p = 1 - p[i] was used, undo inversion - invert && (ks = n - ks) - A[i] = Int(ks); - nothing - end#if + if i <= length(A) + @inbounds A[i] = k + end + offset += window + end + return nothing +end + + +## old, unused kernels (for reference) + +#naive algorithm, full +function kernel_naive_full!(A, count, prob, randstates) + index1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride1 = blockDim().x * gridDim().x + + @inbounds for i in index1:stride1:length(A) + A[i] = 0 + for m in 1:count[i] + @inbounds A[i] += GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < prob[i] + end + end return end + + ## COV_EXCL_STOP diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index 0b4f5bc..005269b 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -1,113 +1,127 @@ -# extend the CUDA.jl functionality (rand, randn, rand_poisson, etc.) -# to include binomial distributions +## extend the CUDA.jl functionality (rand, randn, rand_poisson, etc.) to include binomial distributions const BinomialType = Union{Type{<:Integer}} -const BinomialArray = DenseCuArray{<:Integer} +const BinomialArray = AnyCuArray{<:Integer} ## exported functions: in-place +rand_binomial!(A::BinomialArray; kwargs...) = rand_binomial!(cuda_rng(), A; kwargs...) + rand_binomial!(A::AnyCuArray; kwargs...) = - error("BinomialGPU.jl does not support generating - binomially-distributed random numbers of type $(eltype(A))") + error("BinomialGPU.jl does not support generating binomially-distributed random numbers of type $(eltype(A))") ## unexported functions: out of place -rand_binomial(T::BinomialType, dims::Dims; kwargs...) = rand_binomial(T, dims; kwargs...) +rand_binomial(T::BinomialType, dims::Dims; kwargs...) = rand_binomial(cuda_rng(), T, dims; kwargs...) rand_binomial(T::BinomialType, dim1::Integer, dims::Integer...; kwargs...) = - rand_binomial(T, Dims((dim1, dims...)); kwargs...) + rand_binomial(cuda_rng(), T, Dims((dim1, dims...)); kwargs...) -rand_binomial(T::Type, dims::Dims; kwargs...) = - rand_binomial!(CuArray{T}(undef, dims...); kwargs...) +rand_binomial(T::Type, dims::Dims; kwargs...) = rand_binomial!(CuArray{T}(undef, dims...); kwargs...) rand_binomial(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_binomial!(CuArray{T}(undef, dim1, dims...); kwargs...) rand_binomial(dim1::Integer, dims::Integer...; kwargs...) = - rand_binomial(Int, Dims((dim1, dims...)); kwargs...) + rand_binomial(cuda_rng(), Dims((dim1, dims...)); kwargs...) ## main internal function -function rand_binomial!(A::BinomialArray; count, prob) - return rand_binom!(A, count, prob) +function rand_binomial!(rng, A::BinomialArray; count, prob) + return rand_binom!(rng, A, count, prob) end ## dispatching on parameter types # constant parameters -function rand_binom!(A::BinomialArray, count::Integer, prob::Number) - # revert to full parameter case (this could be suboptimal, - # as a table-based method should in principle be faster) - ns = CUDA.fill(Int(count), size(A)) - ps = CUDA.fill(Float32(prob), size(A)) - return rand_binom!(A, ns, ps) +function rand_binom!(rng, A::BinomialArray, count::Integer, prob::Number) + kernel = @cuda launch=false kernel_BTRS_scalar!( + A, count, Float32(prob), rng.seed, rng.counter + ) + config = launch_configuration(kernel.fun) + threads = max(32, min(config.threads, length(A))) + blocks = min(config.blocks, cld(length(A), threads)) + kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks) + + new_counter = Int64(rng.counter) + length(A) + overflow, remainder = fldmod(new_counter, typemax(UInt32)) + rng.seed += overflow # XXX: is this OK? + rng.counter = remainder + + return A end # arrays of parameters -function rand_binom!(A::BinomialArray, count::BinomialArray, prob::Number) - # revert to full parameter case (this could be suboptimal, - # as a table-based method should in principle be faster) +function rand_binom!(rng, A::BinomialArray, count::AbstractArray{<:Integer}, prob::Number) + # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster) cucount = cu(count) ps = CUDA.fill(Float32(prob), size(A)) - return rand_binom!(A, cucount, ps) + return rand_binom!(rng, A, cucount, ps) end -function rand_binom!(A::BinomialArray, count::Integer, prob::AbstractArray{<:Number}) - # revert to full parameter case (this could be suboptimal, - # as a table-based method should in principle be faster) +function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractArray{<:Number}) + # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster) ns = CUDA.fill(Int(count), size(A)) cuprob = cu(prob) - return rand_binom!(A, ns, cuprob) + return rand_binom!(rng, A, ns, cuprob) end -function rand_binom!(A::BinomialArray, count::BinomialArray, prob::AbstractArray{<:Number}) +function rand_binom!( + rng, + A::BinomialArray, + count::AbstractArray{<:Integer}, + prob::AbstractArray{<:Number} +) cucount = cu(count) cuprob = cu(prob) - return rand_binom!(A, cucount, cuprob) + return rand_binom!(rng, A, cucount, cuprob) end -function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32}) +function rand_binom!(rng, A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32}) if ndims(count) > ndims(A) || ndims(prob) > ndims(A) - throw(DimensionMismatch("`count` and `prob` need to be scalar - or have less or equal dimensions than A")) + throw(DimensionMismatch("`count` and `prob` need to be scalar or have less or equal dimensions than A")) return A end if size(A)[1:ndims(count)] == size(count) && size(A)[1:ndims(prob)] == size(prob) count_dim_larger_than_prob_dim = ndims(count) > ndims(prob) if count_dim_larger_than_prob_dim - # indices for count - R1 = CartesianIndices(prob) - # indices for prob that are not included in R1 - R2 = CartesianIndices(size(count)[ndims(prob)+1:end]) + # indices for prob + R1 = CartesianIndices(prob) + # indices for count that are not included in R1 + R2 = CartesianIndices(size(count)[ndims(prob)+1:end]) # remaining indices in A - Rr = CartesianIndices(size(A)[ndims(count)+1:end]) + Rr = CartesianIndices(size(A)[ndims(count)+1:end]) else # indices for count R1 = CartesianIndices(count) - # indices for prob that are not included in R1 + # indices for count that are not included in R1 R2 = CartesianIndices(size(prob)[ndims(count)+1:end]) # remaining indices in A - Rr = CartesianIndices(size(A)[ndims(prob)+1:end])# + Rr = CartesianIndices(size(A)[ndims(prob)+1:end]) end Rp = CartesianIndices((length(R1), length(R2))) # indices for parameters Ra = CartesianIndices((length(Rp), length(Rr))) # indices for parameters and A - seed = rand(UInt32) - - kernel = @cuda launch=false kernel_BTRS!( - A, count, prob, - R1, R2, Rp, Ra, - count_dim_larger_than_prob_dim, - seed - ) + kernel = @cuda launch=false kernel_BTRS!( + A, + count, prob, + rng.seed, rng.counter, + R1, R2, Rp, Ra, + count_dim_larger_than_prob_dim + ) config = launch_configuration(kernel.fun) - threads = Base.min(length(A), config.threads, 256) - blocks = cld(length(A), threads) - + threads = max(32, min(config.threads, length(A))) + blocks = min(config.blocks, cld(length(A), threads)) kernel( - A, count, prob, + A, + count, prob, + rng.seed, rng.counter, R1, R2, Rp, Ra, - count_dim_larger_than_prob_dim, - seed; - threads = threads, blocks=blocks - ) + count_dim_larger_than_prob_dim; + threads=threads, blocks=blocks + ) + + new_counter = Int64(rng.counter) + length(A) + overflow, remainder = fldmod(new_counter, typemax(UInt32)) + rng.seed += overflow # XXX: is this OK? + rng.counter = remainder else throw(DimensionMismatch("`count` and `prob` need have size compatible with A")) end