
Fix inversion bug
The inversion p -> 1-p was sometimes not triggered, leading to incorrect statistics in part of the sampled array.

Added distributional tests for mean and variance.
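
For intuition, here is a minimal CPU sketch (illustration only, not the package's GPU kernels; plain Julia with a hypothetical helper) of the symmetry that the invert flag relies on: if X ~ Binomial(n, p), then n - X ~ Binomial(n, 1 - p). If the final k = n - k step is skipped for some elements, those elements are effectively drawn from Binomial(n, 1 - p), which shifts their mean from n*p to n*(1-p).

using Statistics

function sample_with_inversion(n, p; apply_inversion = true)
    p2 = p > 0.5 ? 1 - p : p            # sample with the smaller probability
    k = count(_ -> rand() < p2, 1:n)    # naive Bernoulli-sum draw
    return (apply_inversion && p > 0.5) ? n - k : k
end

n, p, N = 100, 0.9, 100_000
mean(sample_with_inversion(n, p) for _ in 1:N)                            # ≈ n*p = 90
mean(sample_with_inversion(n, p; apply_inversion = false) for _ in 1:N)   # ≈ n*(1-p) = 10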
simsurace authored Feb 14, 2022
2 parents 613da4b + 9aebcae commit 558dfc6
Showing 5 changed files with 199 additions and 96 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "BinomialGPU"
uuid = "c5bbfde1-2136-42cd-9b65-d5719df69ebf"
authors = ["Simone Carlo Surace"]
version = "0.4.2"
version = "0.4.3"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
176 changes: 102 additions & 74 deletions src/kernels.jl
@@ -33,7 +66,66 @@ end

# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)

## Kernel for scalar parameters
## Kernels for scalar parameters
function kernel_naive_scalar!(A, n, p, seed::UInt32, counter::UInt32)
device_rng = Random.default_rng()

# initialize the state
@inbounds Random.seed!(device_rng, seed, counter)

# grid-stride loop
tid = threadIdx().x
window = (blockDim().x - 1i32) * gridDim().x
offset = (blockIdx().x - 1i32) * blockDim().x

while offset < length(A)
i = tid + offset

k = 0
ctr = 1
while ctr <= n
rand(Float32) < p && (k += 1)
ctr += 1
end

if i <= length(A)
@inbounds A[i] = k
end
offset += window
end
return nothing
end
function kernel_inversion_scalar!(A, n, p, seed::UInt32, counter::UInt32)
device_rng = Random.default_rng()

# initialize the state
@inbounds Random.seed!(device_rng, seed, counter)

# grid-stride loop
tid = threadIdx().x
window = (blockDim().x - 1i32) * gridDim().x
offset = (blockIdx().x - 1i32) * blockDim().x

while offset < length(A)
i = tid + offset

logp = CUDA.log(1f0-p)
geom_sum = 0f0
k = 0
while true
geom = ceil(CUDA.log(rand(Float32)) / logp)
geom_sum += geom
geom_sum > n && break
k += 1
end

if i <= length(A)
@inbounds A[i] = k
end
offset += window
end
return nothing
end
function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
device_rng = Random.default_rng()

@@ -49,80 +108,47 @@ function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
while offset < length(A)
i = tid + offset

# edge cases
if p <= 0 || n <= 0
k = 0
elseif p >= 1
k = n
# Use naive algorithm for n <= 17
elseif n <= 17
k = 0
ctr = 1
while ctr <= n
rand(Float32) < p && (k += 1)
ctr += 1
end
# Use inversion algorithm for n*p < 10
elseif n * p < 10f0
logp = CUDA.log(1f0-p)
geom_sum = 0f0
k = 0
while true
geom = ceil(CUDA.log(rand(Float32)) / logp)
geom_sum += geom
geom_sum > n && break
k += 1
r = p/(1f0-p)
s = p*(1f0-p)

stddev = sqrt(n * s)
b = 1.15f0 + 2.53f0 * stddev
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
c = n * p + 0.5f0
v_r = 0.92f0 - 4.2f0 / b

alpha = (2.83f0 + 5.1f0 / b) * stddev;
m = floor((n + 1) * p)

ks = 0f0

while true
usample = rand(Float32) - 0.5f0
vsample = rand(Float32)

us = 0.5f0 - abs(usample)
ks = floor((2 * a / us + b) * usample + c)

if us >= 0.07f0 && vsample <= v_r
break
end
# BTRS algorithm
else
# BTRS approximations work well for p <= 0.5
# invert p and set `invert` flag
(invert = p > 0.5f0) && (p = 1f0 - p)

r = p/(1f0-p)
s = p*(1f0-p)

stddev = sqrt(n * s)
b = 1.15f0 + 2.53f0 * stddev
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
c = n * p + 0.5f0
v_r = 0.92f0 - 4.2f0 / b

alpha = (2.83f0 + 5.1f0 / b) * stddev;
m = floor((n + 1) * p)
if ks < 0 || ks > n
continue
end

ks = 0f0

while true
usample = rand(Float32) - 0.5f0
vsample = rand(Float32)

us = 0.5f0 - abs(usample)
ks = floor((2 * a / us + b) * usample + c)

if us >= 0.07f0 && vsample <= v_r
break
end

if ks < 0 || ks > n
continue
end

v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
if v2 <= ub
break
end
v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
if v2 <= ub
break
end
invert && (ks = n - ks)
k = Int(ks)
end

if i <= length(A)
@inbounds A[i] = k
@inbounds A[i] = ks
end
offset += window
end
@@ -165,6 +191,9 @@ function kernel_BTRS!(
@inbounds n = count[I1]
@inbounds p = prob[CartesianIndex(I1, I2)]
end
# BTRS approximations work well for p <= 0.5
# invert p and set `invert` flag
(invert = p > 0.5f0) && (p = 1-p)
else
n = 0
p = 0f0
@@ -197,10 +226,6 @@ function kernel_BTRS!(
end
# BTRS algorithm
else
# BTRS approximations work well for p <= 0.5
# invert p and set `invert` flag
(invert = p > 0.5f0) && (p = 1f0 - p)

r = p/(1f0-p)
s = p*(1f0-p)

@@ -239,12 +264,15 @@ function kernel_BTRS!(
break
end
end
invert && (ks = n - ks)
k = Int(ks)
end

if i <= length(A)
@inbounds A[i] = k
if invert
@inbounds A[i] = n - k
else
@inbounds A[i] = k
end
end
offset += window
end
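For reference, an illustrative CPU analogue of kernel_inversion_scalar! above (assumptions: plain Julia, Float64, 0 < p < 1; edge cases are handled elsewhere, as in the kernels): a Binomial(n, p) draw is the number of Geometric(p) inter-success gaps that fit into n trials, so the loop runs about n*p times and is only efficient when n*p is small.

function inversion_binomial(n::Integer, p::Real)
    logq = log1p(-p)                      # log(1 - p)
    geom_sum = 0.0
    k = 0
    while true
        geom = ceil(log(rand()) / logq)   # Geometric(p) gap, always >= 1
        geom_sum += geom
        geom_sum > n && break
        k += 1
    end
    return k
end
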
46 changes: 41 additions & 5 deletions src/rand_binomial.jl
@@ -65,20 +65,56 @@ end

## constant (scalar) parameters
function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractFloat)
kernel = @cuda launch=false kernel_BTRS_scalar!(
A, count, Float32(prob), rng.seed, rng.counter
)
n = count

# edge cases
if prob <= 0 || n <= 0
A .= 0
return A
elseif prob >= 1
A .= n
return A
end

invert = prob > 0.5f0
if invert
p = 1 - prob
else
p = prob
end

# Use naive algorithm for n <= 17
if n <= 17
kernel = @cuda launch=false kernel_naive_scalar!(
A, n, Float32(p), rng.seed, rng.counter
)
# Use inversion algorithm for n*p < 10
elseif n * p < 10f0
kernel = @cuda launch=false kernel_inversion_scalar!(
A, n, Float32(p), rng.seed, rng.counter
)
# BTRS algorithm
else
kernel = @cuda launch=false kernel_BTRS_scalar!(
A, n, Float32(p), rng.seed, rng.counter
)
end

config = launch_configuration(kernel.fun)
threads = max(32, min(config.threads, length(A)))
blocks = min(config.blocks, cld(length(A), threads))
kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks)
kernel(A, n, Float32(p), rng.seed, rng.counter; threads=threads, blocks=blocks)

new_counter = Int64(rng.counter) + length(A)
overflow, remainder = fldmod(new_counter, typemax(UInt32))
rng.seed += overflow # XXX: is this OK?
rng.counter = remainder

return A
if invert
return n .- A
else
return A
end
end

## arrays of parameters
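A hedged usage sketch of the scalar-parameter path after this change (assumes a CUDA-capable GPU; rand_binomial and its count/prob keywords are as used in the tests below): the host now handles edge cases, inverts p > 0.5, dispatches to the appropriate kernel, and returns n .- A, so the statistics should match Binomial(n, p) for all p.

using BinomialGPU, CUDA, Statistics

N = 2^20
A = rand_binomial(N, count = 100, prob = 0.9)   # internally sampled with p = 0.1, then inverted
mean(A)   # expected ≈ 100 * 0.9 = 90
var(A)    # expected ≈ 100 * 0.9 * 0.1 = 9
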
2 changes: 2 additions & 0 deletions test/Project.toml
@@ -1,4 +1,6 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
69 changes: 53 additions & 16 deletions test/runtests.jl
@@ -1,5 +1,7 @@
using BinomialGPU
using CUDA
using Distributions
using Statistics

using BenchmarkTools
using Test
@@ -92,22 +94,6 @@ using Test
@test rand_binomial!(A, count = 2, prob = 1.5) == CUDA.fill(2, 256) # probabilities greater than 1 are equivalent to 1
@test_throws MethodError rand_binomial!(A, count = 5., prob = 0.5) # non-integer counts throw an error
end

@testset "benchmarks" begin
# benchmarks
A = CUDA.zeros(Int, 1024, 1024)
n = 128
p = 0.5
ns = CUDA.fill(128, (1024, 1024))
ps = CUDA.rand(1024, 1024)
println("")
println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
println("")
println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
println("")
end
end # in-place

@testset "out-of-place" begin
@@ -179,4 +165,55 @@ using Test
end
end
end # out-of-place

@testset "Distributional tests" begin
function mean_var_CI(m, S2, n, p, N, α)
truemean = n*p
truevar = n*p*(1-p)
a = quantile(Normal(), α/2)
b = quantile(Normal(), 1-α/2)
c = quantile(Chisq(N-1), α/2)
d = quantile(Chisq(N-1), 1-α/2)
@test a <= sqrt(N/truevar)*(m - truemean) <= b
@test c <= (N-1)*S2/truevar <= d
end
@testset "Scalar parameters" begin
function test_mean_variance(N, n, p)
CUDA.@sync A = rand_binomial(N, count = n, prob = p)
mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
end
N = 2^20
@testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
p in 0.1:0.1:0.9
test_mean_variance(N, n, p)
end
end
@testset "Arrays of parameters" begin
function test_mean_variance(N, n, p)
CUDA.@sync A = rand_binomial(N, count = fill(n, N), prob = fill(p, N))
mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
end
N = 2^20
@testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
p in 0.1:0.1:0.9
test_mean_variance(N, n, p)
end
end
end # Distributional tests

@testset "benchmarks" begin
# benchmarks
A = CUDA.zeros(Int, 1024, 1024)
n = 128
p = 0.5
ns = CUDA.fill(128, (1024, 1024))
ps = CUDA.rand(1024, 1024)
println("")
println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
println("")
println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
println("")
end
end
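
For reference, the pivots checked by mean_var_CI follow from the central limit theorem and the normal-theory chi-square approximation for the sample variance; a sketch in LaTeX (N i.i.d. Binomial(n, p) samples with sample mean m and sample variance S^2, significance level \alpha = 10^{-5}):

\[ \mu = np, \qquad \sigma^2 = np(1-p) \]
\[ \sqrt{N/\sigma^2}\,(m - \mu) \;\approx\; \mathcal{N}(0, 1), \qquad (N-1)\,S^2/\sigma^2 \;\approx\; \chi^2_{N-1} \]

The tests assert that both pivots fall between the \alpha/2 and 1-\alpha/2 quantiles of their reference distributions, which is what the Normal and Chisq quantile calls implement.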

2 comments on commit 558dfc6

@simsurace
Collaborator Author


@JuliaRegistrator register()

@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/54635

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.4.3 -m "<description of version>" 558dfc6395070f69482953537f0aeb5bf5e66bf0
git push origin v0.4.3
