Skip to content

Commit

Permalink
Merge pull request #18 from JuliaGPU/newrand
Browse files Browse the repository at this point in the history
Change to native `rand`
  • Loading branch information
simsurace authored Jan 25, 2022
2 parents 7244289 + 601589d commit bdfebce
Show file tree
Hide file tree
Showing 5 changed files with 293 additions and 138 deletions.
27 changes: 19 additions & 8 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,25 +1,36 @@
steps:
# Julia versions

- label: "Julia 1.6, CUDA 11.2"
- label: "Julia 1.6"
plugins:
- JuliaCI/julia#v1:
version: 1.6-nightly
version: 1.6
- JuliaCI/julia-test#v1:
test_args: "--thorough"
test_args: "--quickfail"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
agents:
queue: "juliagpu"
cuda: "11.2"
cap: "recent"
env:
JULIA_CUDA_VERSION: '11.2'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
cuda: "*"
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 120

- label: "Julia 1.7"
plugins:
- JuliaCI/julia#v1:
version: 1.7
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

env:
JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[compat]
BenchmarkTools = "0.6, 0.7, 1.0"
CUDA = "2, 3.0"
BenchmarkTools = "1"
CUDA = "3"
julia = "1.6"

[extras]
Expand Down
3 changes: 3 additions & 0 deletions src/BinomialGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ module BinomialGPU
using CUDA
using Random

using CUDA: cuda_rng, i32


# user-level API
include("rand_binomial.jl")
export rand_binomial!
Expand Down
275 changes: 201 additions & 74 deletions src/kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,116 +31,243 @@ function stirling_approx_tail(k)::Float32
end


# BTRS algorithm, adapted from the tensorflow library
# (github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
function kernel_BTRS!(
A, count, prob,
R1, R2, Rp, Ra,
count_dim_larger_than_prob_dim,
seed::UInt32
)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)

@inbounds Random.seed!(seed)
## Kernel for scalar parameters
function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
device_rng = Random.default_rng()

@inbounds if i <= length(A)
I = Ra[i]
Ip = Rp[I[1]]
I1 = R1[Ip[1]]
I2 = R2[Ip[2]]
# initialize the state
@inbounds Random.seed!(device_rng, seed, counter)

if count_dim_larger_than_prob_dim
n = count[CartesianIndex(I1, I2)]
p = prob[I1]
else
n = count[I1]
p = prob[CartesianIndex(I1, I2)]
end
# grid-stride loop
tid = threadIdx().x
window = (blockDim().x - 1i32) * gridDim().x
offset = (blockIdx().x - 1i32) * blockDim().x

k = 0
while offset < length(A)
i = tid + offset

# edge cases
if p <= 0 || n <= 0
A[i] = 0
return
k = 0
elseif p >= 1
A[i] = n
return
end

k = n
# Use naive algorithm for n <= 17
if n <= 17
elseif n <= 17
k = 0
ctr = 1
while ctr <= n
rand(Float32) < p && (k += 1)
ctr += 1
end
A[i] = k
return
end

# Use inversion algorithm for n*p < 10
if n * p < 10f0
logp = log(1f0-p)
elseif n * p < 10f0
logp = CUDA.log(1f0-p)
geom_sum = 0f0
num_geom = 0
k = 0
while true
geom = ceil(log(rand(Float32)) / logp)
geom = ceil(CUDA.log(rand(Float32)) / logp)
geom_sum += geom
geom_sum > n && break
num_geom += 1
k += 1
end
# BTRS algorithm
else
# BTRS approximations work well for p <= 0.5
# invert p and set `invert` flag
(invert = p > 0.5f0) && (p = 1f0 - p)

r = p/(1f0-p)
s = p*(1f0-p)

stddev = sqrt(n * s)
b = 1.15f0 + 2.53f0 * stddev
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
c = n * p + 0.5f0
v_r = 0.92f0 - 4.2f0 / b

alpha = (2.83f0 + 5.1f0 / b) * stddev;
m = floor((n + 1) * p)

ks = 0f0

while true
usample = rand(Float32) - 0.5f0
vsample = rand(Float32)

us = 0.5f0 - abs(usample)
ks = floor((2 * a / us + b) * usample + c)

if us >= 0.07f0 && vsample <= v_r
break
end

if ks < 0 || ks > n
continue
end

v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
if v2 <= ub
break
end
end
A[i] = num_geom
return
invert && (ks = n - ks)
k = Int(ks)
end

# BTRS algorithm
# BTRS approximations work well for p <= 0.5
(invert = p > 0.5f0) && (p = 1f0 - p)
if i <= length(A)
@inbounds A[i] = k
end
offset += window
end
return nothing
end

r = p/(1f0-p)
s = p*(1f0-p)

stddev = sqrt(n * s)
b = 1.15f0 + 2.53f0 * stddev
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
c = n * p + 0.5f0
v_r = 0.92f0 - 4.2f0 / b
## Kernel for arrays of parameters
function kernel_BTRS!(
A,
count, prob,
seed::UInt32, counter::UInt32,
R1, R2, Rp, Ra,
count_dim_larger_than_prob_dim
)
device_rng = Random.default_rng()

# initialize the state
@inbounds Random.seed!(device_rng, seed, counter)

alpha = (2.83f0 + 5.1f0 / b) * stddev;
m = floor((n + 1) * p)
# grid-stride loop
tid = threadIdx().x
window = (blockDim().x - 1i32) * gridDim().x
offset = (blockIdx().x - 1i32) * blockDim().x

while true
usample = rand(Float32) - 0.5f0
vsample = rand(Float32)
while offset < length(A)
i = tid + offset

us = 0.5f0 - abs(usample)
ks = floor((2 * a / us + b) * usample + c)
# PARAMETER LOOKUP
if i <= length(A)
@inbounds I = Ra[i]
@inbounds Ip = Rp[I[1]]
@inbounds I1 = R1[Ip[1]]
@inbounds I2 = R2[Ip[2]]

if us >= 0.07f0 && vsample <= v_r
break
if count_dim_larger_than_prob_dim
@inbounds n = count[CartesianIndex(I1, I2)]
@inbounds p = prob[I1]
else
@inbounds n = count[I1]
@inbounds p = prob[CartesianIndex(I1, I2)]
end
else
n = 0
p = 0
end

if ks < 0 || ks > n
continue
# SAMPLER
# edge cases
if p <= 0 || n <= 0
k = 0
elseif p >= 1
k = n
# Use naive algorithm for n <= 17
elseif n <= 17
k = 0
ctr = 1
while ctr <= n
rand(Float32) < p && (k += 1)
ctr += 1
end
# Use inversion algorithm for n*p < 10
elseif n * p < 10f0
logp = CUDA.log(1f0-p)
geom_sum = 0f0
k = 0
while true
geom = ceil(CUDA.log(rand(Float32)) / logp)
geom_sum += geom
geom_sum > n && break
k += 1
end
# BTRS algorithm
else
# BTRS approximations work well for p <= 0.5
# invert p and set `invert` flag
(invert = p > 0.5f0) && (p = 1f0 - p)

r = p/(1f0-p)
s = p*(1f0-p)

stddev = sqrt(n * s)
b = 1.15f0 + 2.53f0 * stddev
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
c = n * p + 0.5f0
v_r = 0.92f0 - 4.2f0 / b

alpha = (2.83f0 + 5.1f0 / b) * stddev;
m = floor((n + 1) * p)

v2 = log(vsample * alpha / (a / (us * us) + b))
ub = (m + 0.5f0) * log((m + 1) / (r * (n - m + 1))) +
(n + 1) * log((n - m + 1) / (n - ks + 1)) +
(ks + 0.5f0) * log(r * (n - ks + 1) / (ks + 1)) +
stirling_approx_tail(m) + stirling_approx_tail(n - m) -
stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
if v2 <= ub
break
ks = 0f0

while true
usample = rand(Float32) - 0.5f0
vsample = rand(Float32)

us = 0.5f0 - abs(usample)
ks = floor((2 * a / us + b) * usample + c)

if us >= 0.07f0 && vsample <= v_r
break
end

if ks < 0 || ks > n
continue
end

v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
if v2 <= ub
break
end
end
invert && (ks = n - ks)
k = Int(ks)
end

# if p = 1 - p[i] was used, undo inversion
invert && (ks = n - ks)
A[i] = Int(ks);
nothing
end#if
if i <= length(A)
@inbounds A[i] = k
end
offset += window
end
return nothing
end


## old, unused kernels (for reference)

#naive algorithm, full
function kernel_naive_full!(A, count, prob, randstates)
index1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride1 = blockDim().x * gridDim().x

@inbounds for i in index1:stride1:length(A)
A[i] = 0
for m in 1:count[i]
@inbounds A[i] += GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < prob[i]
end
end
return
end



## COV_EXCL_STOP
Loading

2 comments on commit bdfebce

@simsurace
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/53142

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.0 -m "<description of version>" bdfebce263c9d8052f57c9b89ef7b29517222a5b
git push origin v0.3.0

Please sign in to comment.