diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 05bc4bf..1875e00 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,25 +1,36 @@
 steps:
   # Julia versions
 
-  - label: "Julia 1.6, CUDA 11.2"
+  - label: "Julia 1.6"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6-nightly
+          version: 1.6
       - JuliaCI/julia-test#v1:
-          test_args: "--thorough"
+          test_args: "--quickfail"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
             - src
     agents:
       queue: "juliagpu"
-      cuda: "11.2"
-      cap: "recent"
-    env:
-      JULIA_CUDA_VERSION: '11.2'
-      JULIA_CUDA_USE_BINARYBUILDER: 'true'
+      cuda: "*"
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120
+  
+  - label: "Julia 1.7"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: 1.7
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+          dirs:
+            - src
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
+    timeout_in_minutes: 120
 
 env:
   JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
diff --git a/Project.toml b/Project.toml
index 2f1e82f..ff4661c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,8 +9,8 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [compat]
-BenchmarkTools = "0.6, 0.7, 1.0"
-CUDA = "2, 3.0"
+BenchmarkTools = "1"
+CUDA = "3"
 julia = "1.6"
 
 [extras]
diff --git a/src/BinomialGPU.jl b/src/BinomialGPU.jl
index f2b0910..6aa5237 100644
--- a/src/BinomialGPU.jl
+++ b/src/BinomialGPU.jl
@@ -3,6 +3,9 @@ module BinomialGPU
 using CUDA
 using Random
 
+using CUDA: cuda_rng, i32
+
+
 # user-level API
 include("rand_binomial.jl")
 export rand_binomial!
diff --git a/src/kernels.jl b/src/kernels.jl
index af7147d..1e33683 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -31,116 +31,243 @@ function stirling_approx_tail(k)::Float32
 end
 
 
-# BTRS algorithm, adapted from the tensorflow library 
-# (github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
-function kernel_BTRS!(
-    A, count, prob, 
-    R1, R2, Rp, Ra, 
-    count_dim_larger_than_prob_dim, 
-    seed::UInt32
-)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
 
-    @inbounds Random.seed!(seed)
+## Kernel for scalar parameters
+function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
+    device_rng = Random.default_rng()
 
-    @inbounds if i <= length(A)
-        I  = Ra[i]
-        Ip = Rp[I[1]]
-        I1 = R1[Ip[1]]
-        I2 = R2[Ip[2]]
+    # initialize the state
+    @inbounds Random.seed!(device_rng, seed, counter)
 
-        if count_dim_larger_than_prob_dim
-            n = count[CartesianIndex(I1, I2)]
-            p = prob[I1]
-        else
-            n = count[I1]
-            p = prob[CartesianIndex(I1, I2)]
-        end
+    # grid-stride loop
+    tid    = threadIdx().x
+    window = (blockDim().x - 1i32) * gridDim().x
+    offset = (blockIdx().x - 1i32) * blockDim().x
+
+    k = 0
+    while offset < length(A)
+        i = tid + offset
 
         # edge cases
         if p <= 0 || n <= 0
-            A[i] = 0
-            return
+            k = 0
         elseif p >= 1
-            A[i] = n
-            return
-        end
-        
+            k = n
         # Use naive algorithm for n <= 17
-        if n <= 17
+        elseif n <= 17
             k = 0
             ctr = 1
             while ctr <= n
                 rand(Float32) < p && (k += 1)
                 ctr += 1
             end
-            A[i] = k
-            return
-        end
-
         # Use inversion algorithm for n*p < 10
-        if n * p < 10f0
-            logp = log(1f0-p)
+        elseif n * p < 10f0
+            logp = CUDA.log(1f0-p)
             geom_sum = 0f0
-            num_geom = 0
+            k = 0
             while true
-                geom      = ceil(log(rand(Float32)) / logp)
+                geom = ceil(CUDA.log(rand(Float32)) / logp)
                 geom_sum += geom
                 geom_sum > n && break
-                num_geom += 1
+                k += 1
+            end
+        # BTRS algorithm
+        else
+            # BTRS approximations work well for p <= 0.5
+            # invert p and set `invert` flag
+            (invert = p > 0.5f0) && (p = 1f0 - p)
+
+            r       = p/(1f0-p)
+            s       = p*(1f0-p)
+    
+            stddev  = sqrt(n * s)
+            b       = 1.15f0 + 2.53f0 * stddev
+            a       = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
+            c       = n * p + 0.5f0
+            v_r     = 0.92f0 - 4.2f0 / b
+    
+            alpha   = (2.83f0 + 5.1f0 / b) * stddev;
+            m       = floor((n + 1) * p)
+
+            ks = 0f0
+    
+            while true
+                usample = rand(Float32) - 0.5f0
+                vsample = rand(Float32)
+    
+                us = 0.5f0 - abs(usample)
+                ks = floor((2 * a / us + b) * usample + c)
+    
+                if us >= 0.07f0 && vsample <= v_r
+                    break
+                end
+    
+                if ks < 0 || ks > n
+                    continue
+                end
+    
+                v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
+                ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
+                     (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
+                     (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
+                     stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
+                if v2 <= ub
+                    break
+                end
             end
-            A[i] = num_geom
-            return
+            invert && (ks = n - ks)
+            k = Int(ks)
         end
 
-        # BTRS algorithm
-        # BTRS approximations work well for p <= 0.5
-        (invert = p > 0.5f0) && (p = 1f0 - p)
+        if i <= length(A)
+            @inbounds A[i] = k
+        end
+        offset += window
+    end
+    return nothing
+end
 
-        r       = p/(1f0-p)
-        s       = p*(1f0-p)
 
-        stddev  = sqrt(n * s)
-        b       = 1.15f0 + 2.53f0 * stddev
-        a       = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
-        c       = n * p + 0.5f0
-        v_r     = 0.92f0 - 4.2f0 / b
+## Kernel for arrays of parameters
+function kernel_BTRS!(
+    A, 
+    count, prob, 
+    seed::UInt32, counter::UInt32, 
+    R1, R2, Rp, Ra, 
+    count_dim_larger_than_prob_dim
+)
+    device_rng = Random.default_rng()
+
+    # initialize the state
+    @inbounds Random.seed!(device_rng, seed, counter)
 
-        alpha   = (2.83f0 + 5.1f0 / b) * stddev;
-        m       = floor((n + 1) * p)
+    # grid-stride loop
+    tid    = threadIdx().x
+    window = (blockDim().x - 1i32) * gridDim().x
+    offset = (blockIdx().x - 1i32) * blockDim().x
 
-        while true
-            usample = rand(Float32) - 0.5f0
-            vsample = rand(Float32)
+    while offset < length(A)
+        i = tid + offset
 
-            us = 0.5f0 - abs(usample)
-            ks = floor((2 * a / us + b) * usample + c)
+        # PARAMETER LOOKUP
+        if i <= length(A)
+            @inbounds I  = Ra[i]
+            @inbounds Ip = Rp[I[1]]
+            @inbounds I1 = R1[Ip[1]]
+            @inbounds I2 = R2[Ip[2]]
 
-            if us >= 0.07f0 && vsample <= v_r
-                break
+            if count_dim_larger_than_prob_dim
+                @inbounds n = count[CartesianIndex(I1, I2)]
+                @inbounds p = prob[I1]
+            else
+                @inbounds n = count[I1]
+                @inbounds p = prob[CartesianIndex(I1, I2)]
             end
+        else
+            n = 0
+            p = 0
+        end
 
-            if ks < 0 || ks > n
-                continue
+        # SAMPLER
+        # edge cases
+        if p <= 0 || n <= 0
+            k = 0
+        elseif p >= 1
+            k = n
+        # Use naive algorithm for n <= 17
+        elseif n <= 17
+            k = 0
+            ctr = 1
+            while ctr <= n
+                rand(Float32) < p && (k += 1)
+                ctr += 1
             end
+        # Use inversion algorithm for n*p < 10
+        elseif n * p < 10f0
+            logp = CUDA.log(1f0-p)
+            geom_sum = 0f0
+            k = 0
+            while true
+                geom = ceil(CUDA.log(rand(Float32)) / logp)
+                geom_sum += geom
+                geom_sum > n && break
+                k += 1
+            end
+        # BTRS algorithm
+        else
+            # BTRS approximations work well for p <= 0.5
+            # invert p and set `invert` flag
+            (invert = p > 0.5f0) && (p = 1f0 - p)
+
+            r       = p/(1f0-p)
+            s       = p*(1f0-p)
+    
+            stddev  = sqrt(n * s)
+            b       = 1.15f0 + 2.53f0 * stddev
+            a       = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
+            c       = n * p + 0.5f0
+            v_r     = 0.92f0 - 4.2f0 / b
+    
+            alpha   = (2.83f0 + 5.1f0 / b) * stddev;
+            m       = floor((n + 1) * p)
 
-            v2 = log(vsample * alpha / (a / (us * us) + b))
-            ub = (m + 0.5f0) * log((m + 1) / (r * (n - m + 1))) +
-                 (n + 1) * log((n - m + 1) / (n - ks + 1)) +
-                 (ks + 0.5f0) * log(r * (n - ks + 1) / (ks + 1)) +
-                 stirling_approx_tail(m) + stirling_approx_tail(n - m) - 
-                 stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
-            if v2 <= ub
-                break
+            ks = 0f0
+    
+            while true
+                usample = rand(Float32) - 0.5f0
+                vsample = rand(Float32)
+    
+                us = 0.5f0 - abs(usample)
+                ks = floor((2 * a / us + b) * usample + c)
+    
+                if us >= 0.07f0 && vsample <= v_r
+                    break
+                end
+    
+                if ks < 0 || ks > n
+                    continue
+                end
+    
+                v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
+                ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
+                     (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
+                     (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
+                     stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
+                if v2 <= ub
+                    break
+                end
             end
+            invert && (ks = n - ks)
+            k = Int(ks)
         end
 
-        # if p = 1 - p[i] was used, undo inversion
-        invert && (ks = n - ks)
-        A[i] = Int(ks);
-        nothing
-    end#if
+        if i <= length(A)
+            @inbounds A[i] = k
+        end
+        offset += window
+    end
+    return nothing
+end
+
+
+## old, unused kernels (for reference)
+
+#naive algorithm, full
+function kernel_naive_full!(A, count, prob, randstates)
+    index1  = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    stride1 = blockDim().x * gridDim().x
+
+    @inbounds for i in index1:stride1:length(A)
+        A[i] = 0
+        for m in 1:count[i]
+            @inbounds A[i] += GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < prob[i]
+        end
+    end
     return
 end
 
+
+
 ## COV_EXCL_STOP
diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl
index 0b4f5bc..005269b 100644
--- a/src/rand_binomial.jl
+++ b/src/rand_binomial.jl
@@ -1,113 +1,127 @@
-# extend the CUDA.jl functionality (rand, randn, rand_poisson, etc.) 
-# to include binomial distributions
+## extend the CUDA.jl functionality (rand, randn, rand_poisson, etc.) to include binomial distributions
 
 const BinomialType = Union{Type{<:Integer}}
-const BinomialArray = DenseCuArray{<:Integer}
+const BinomialArray = AnyCuArray{<:Integer}
 
 ## exported functions: in-place
+rand_binomial!(A::BinomialArray; kwargs...) = rand_binomial!(cuda_rng(), A; kwargs...)
+
 rand_binomial!(A::AnyCuArray; kwargs...) =
-    error("BinomialGPU.jl does not support generating 
-    binomially-distributed random numbers of type $(eltype(A))")
+    error("BinomialGPU.jl does not support generating binomially-distributed random numbers of type $(eltype(A))")
 
 ## unexported functions: out of place
-rand_binomial(T::BinomialType, dims::Dims; kwargs...) = rand_binomial(T, dims; kwargs...)
+rand_binomial(T::BinomialType, dims::Dims; kwargs...) = rand_binomial(cuda_rng(), T, dims; kwargs...)
 
 rand_binomial(T::BinomialType, dim1::Integer, dims::Integer...; kwargs...) =
-    rand_binomial(T, Dims((dim1, dims...)); kwargs...)
+    rand_binomial(cuda_rng(), T, Dims((dim1, dims...)); kwargs...)
 
-rand_binomial(T::Type, dims::Dims; kwargs...) = 
-    rand_binomial!(CuArray{T}(undef, dims...); kwargs...)
+rand_binomial(T::Type, dims::Dims; kwargs...) = rand_binomial!(CuArray{T}(undef, dims...); kwargs...)
 
 rand_binomial(T::Type, dim1::Integer, dims::Integer...; kwargs...) =
     rand_binomial!(CuArray{T}(undef, dim1, dims...); kwargs...)
 
 rand_binomial(dim1::Integer, dims::Integer...; kwargs...) =
-    rand_binomial(Int, Dims((dim1, dims...)); kwargs...)
+    rand_binomial(cuda_rng(), Dims((dim1, dims...)); kwargs...)
 
 ## main internal function
-function rand_binomial!(A::BinomialArray; count, prob)
-    return rand_binom!(A, count, prob)
+function rand_binomial!(rng, A::BinomialArray; count, prob)
+    return rand_binom!(rng, A, count, prob)
 end
 
 ## dispatching on parameter types
 
 # constant parameters
-function rand_binom!(A::BinomialArray, count::Integer, prob::Number)
-    # revert to full parameter case (this could be suboptimal, 
-    # as a table-based method should in principle be faster)
-    ns = CUDA.fill(Int(count), size(A))
-    ps = CUDA.fill(Float32(prob), size(A))
-    return rand_binom!(A, ns, ps)
+function rand_binom!(rng, A::BinomialArray, count::Integer, prob::Number)
+    kernel  = @cuda launch=false kernel_BTRS_scalar!(
+        A, count, Float32(prob), rng.seed, rng.counter
+    )
+    config  = launch_configuration(kernel.fun)
+    threads = max(32, min(config.threads, length(A)))
+    blocks  = min(config.blocks, cld(length(A), threads))
+    kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks)
+
+    new_counter = Int64(rng.counter) + length(A)
+    overflow, remainder = fldmod(new_counter, typemax(UInt32))
+    rng.seed += overflow     # XXX: is this OK?
+    rng.counter = remainder
+
+    return A
 end
 
 # arrays of parameters
-function rand_binom!(A::BinomialArray, count::BinomialArray, prob::Number)
-    # revert to full parameter case (this could be suboptimal, 
-    # as a table-based method should in principle be faster)
+function rand_binom!(rng, A::BinomialArray, count::AbstractArray{<:Integer}, prob::Number)
+    # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster)
     cucount = cu(count)
     ps = CUDA.fill(Float32(prob), size(A))
-    return rand_binom!(A, cucount, ps)
+    return rand_binom!(rng, A, cucount, ps)
 end
 
-function rand_binom!(A::BinomialArray, count::Integer, prob::AbstractArray{<:Number})
-    # revert to full parameter case (this could be suboptimal, 
-    # as a table-based method should in principle be faster)
+function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractArray{<:Number})
+    # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster)
     ns = CUDA.fill(Int(count), size(A))
     cuprob  = cu(prob)
-    return rand_binom!(A, ns, cuprob)
+    return rand_binom!(rng, A, ns, cuprob)
 end
 
-function rand_binom!(A::BinomialArray, count::BinomialArray, prob::AbstractArray{<:Number})
+function rand_binom!(
+    rng, 
+    A::BinomialArray, 
+    count::AbstractArray{<:Integer}, 
+    prob::AbstractArray{<:Number}
+)
     cucount = cu(count)
     cuprob  = cu(prob)
-    return rand_binom!(A, cucount, cuprob)
+    return rand_binom!(rng, A, cucount, cuprob)
 end
 
-function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32})
+function rand_binom!(rng, A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32})
     if ndims(count) > ndims(A) || ndims(prob) > ndims(A)
-        throw(DimensionMismatch("`count` and `prob` need to be scalar 
-                or have less or equal dimensions than A"))
+        throw(DimensionMismatch("`count` and `prob` need to be scalar or have less or equal dimensions than A"))
         return A
     end
     if size(A)[1:ndims(count)] == size(count) && size(A)[1:ndims(prob)] == size(prob)
         count_dim_larger_than_prob_dim = ndims(count) > ndims(prob)
         if count_dim_larger_than_prob_dim
-            # indices for count
-            R1 = CartesianIndices(prob) 
-            # indices for prob that are not included in R1
-            R2 = CartesianIndices(size(count)[ndims(prob)+1:end]) 
+            # indices for prob
+            R1 = CartesianIndices(prob)
+            # indices for count that are not included in R1
+            R2 = CartesianIndices(size(count)[ndims(prob)+1:end])
             # remaining indices in A
-            Rr = CartesianIndices(size(A)[ndims(count)+1:end]) 
+            Rr = CartesianIndices(size(A)[ndims(count)+1:end])
         else
             # indices for count
             R1 = CartesianIndices(count)
-            # indices for prob that are not included in R1
+            # indices for count that are not included in R1
             R2 = CartesianIndices(size(prob)[ndims(count)+1:end])
             # remaining indices in A
-            Rr = CartesianIndices(size(A)[ndims(prob)+1:end])#
+            Rr = CartesianIndices(size(A)[ndims(prob)+1:end])
         end
         Rp = CartesianIndices((length(R1), length(R2))) # indices for parameters
         Ra = CartesianIndices((length(Rp), length(Rr))) # indices for parameters and A
 
-        seed = rand(UInt32)
-
-        kernel  = @cuda launch=false kernel_BTRS!(
-                                        A, count, prob, 
-                                        R1, R2, Rp, Ra, 
-                                        count_dim_larger_than_prob_dim,
-                                        seed
-                                     )
+        kernel = @cuda launch=false kernel_BTRS!(
+            A, 
+            count, prob, 
+            rng.seed, rng.counter, 
+            R1, R2, Rp, Ra, 
+            count_dim_larger_than_prob_dim
+        )
         config  = launch_configuration(kernel.fun)
-        threads = Base.min(length(A), config.threads, 256)
-        blocks  = cld(length(A), threads)
-
+        threads = max(32, min(config.threads, length(A)))
+        blocks  = min(config.blocks, cld(length(A), threads))
         kernel(
-            A, count, prob, 
+            A, 
+            count, prob, 
+            rng.seed, rng.counter,
             R1, R2, Rp, Ra, 
-            count_dim_larger_than_prob_dim,
-            seed;
-            threads = threads, blocks=blocks
-         )
+            count_dim_larger_than_prob_dim; 
+            threads=threads, blocks=blocks
+        )
+
+        new_counter = Int64(rng.counter) + length(A)
+        overflow, remainder = fldmod(new_counter, typemax(UInt32))
+        rng.seed += overflow     # XXX: is this OK?
+        rng.counter = remainder
     else
         throw(DimensionMismatch("`count` and `prob` need have size compatible with A"))
     end