diff --git a/Project.toml b/Project.toml
index 4e3f3d9..fe2cdbb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "BinomialGPU"
 uuid = "c5bbfde1-2136-42cd-9b65-d5719df69ebf"
 authors = ["Simone Carlo Surace"]
-version = "0.4.2"
+version = "0.4.3"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
diff --git a/src/kernels.jl b/src/kernels.jl
index cb770a9..61b8533 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -33,7 +33,66 @@ end
 
 # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
 
-## Kernel for scalar parameters
+## Kernels for scalar parameters
+function kernel_naive_scalar!(A, n, p, seed::UInt32, counter::UInt32)
+    device_rng = Random.default_rng()
+
+    # initialize the state
+    @inbounds Random.seed!(device_rng, seed, counter)
+
+    # grid-stride loop
+    tid    = threadIdx().x
+    window = (blockDim().x - 1i32) * gridDim().x
+    offset = (blockIdx().x - 1i32) * blockDim().x
+
+    while offset < length(A)
+        i = tid + offset
+        
+        k = 0
+        ctr = 1
+        while ctr <= n
+            rand(Float32) < p && (k += 1)
+            ctr += 1
+        end
+
+        if i <= length(A)
+            @inbounds A[i] = k
+        end
+        offset += window
+    end
+    return nothing
+end
+function kernel_inversion_scalar!(A, n, p, seed::UInt32, counter::UInt32)
+    device_rng = Random.default_rng()
+
+    # initialize the state
+    @inbounds Random.seed!(device_rng, seed, counter)
+
+    # grid-stride loop
+    tid    = threadIdx().x
+    window = (blockDim().x - 1i32) * gridDim().x
+    offset = (blockIdx().x - 1i32) * blockDim().x
+
+    while offset < length(A)
+        i = tid + offset
+
+        logp = CUDA.log(1f0-p)
+        geom_sum = 0f0
+        k = 0
+        while true
+            geom = ceil(CUDA.log(rand(Float32)) / logp)
+            geom_sum += geom
+            geom_sum > n && break
+            k += 1
+        end
+
+        if i <= length(A)
+            @inbounds A[i] = k
+        end
+        offset += window
+    end
+    return nothing
+end
 function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
     device_rng = Random.default_rng()
 
@@ -49,80 +108,47 @@ function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
     while offset < length(A)
         i = tid + offset
 
-        # edge cases
-        if p <= 0 || n <= 0
-            k = 0
-        elseif p >= 1
-            k = n
-        # Use naive algorithm for n <= 17
-        elseif n <= 17
-            k = 0
-            ctr = 1
-            while ctr <= n
-                rand(Float32) < p && (k += 1)
-                ctr += 1
-            end
-        # Use inversion algorithm for n*p < 10
-        elseif n * p < 10f0
-            logp = CUDA.log(1f0-p)
-            geom_sum = 0f0
-            k = 0
-            while true
-                geom = ceil(CUDA.log(rand(Float32)) / logp)
-                geom_sum += geom
-                geom_sum > n && break
-                k += 1
+        r       = p/(1f0-p)
+        s       = p*(1f0-p)
+
+        stddev  = sqrt(n * s)
+        b       = 1.15f0 + 2.53f0 * stddev
+        a       = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
+        c       = n * p + 0.5f0
+        v_r     = 0.92f0 - 4.2f0 / b
+
+        alpha   = (2.83f0 + 5.1f0 / b) * stddev;
+        m       = floor((n + 1) * p)
+
+        ks = 0f0
+
+        while true
+            usample = rand(Float32) - 0.5f0
+            vsample = rand(Float32)
+
+            us = 0.5f0 - abs(usample)
+            ks = floor((2 * a / us + b) * usample + c)
+
+            if us >= 0.07f0 && vsample <= v_r
+                break
             end
-        # BTRS algorithm
-        else
-            # BTRS approximations work well for p <= 0.5
-            # invert p and set `invert` flag
-            (invert = p > 0.5f0) && (p = 1f0 - p)
 
-            r       = p/(1f0-p)
-            s       = p*(1f0-p)
-    
-            stddev  = sqrt(n * s)
-            b       = 1.15f0 + 2.53f0 * stddev
-            a       = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
-            c       = n * p + 0.5f0
-            v_r     = 0.92f0 - 4.2f0 / b
-    
-            alpha   = (2.83f0 + 5.1f0 / b) * stddev;
-            m       = floor((n + 1) * p)
+            if ks < 0 || ks > n
+                continue
+            end
 
-            ks = 0f0
-    
-            while true
-                usample = rand(Float32) - 0.5f0
-                vsample = rand(Float32)
-    
-                us = 0.5f0 - abs(usample)
-                ks = floor((2 * a / us + b) * usample + c)
-    
-                if us >= 0.07f0 && vsample <= v_r
-                    break
-                end
-    
-                if ks < 0 || ks > n
-                    continue
-                end
-    
-                v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
-                ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
-                     (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
-                     (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
-                     stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
-                if v2 <= ub
-                    break
-                end
+            v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
+            ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
+                    (n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
+                    (ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
+                    stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
+            if v2 <= ub
+                break
             end
-            invert && (ks = n - ks)
-            k = Int(ks)
         end
 
         if i <= length(A)
-            @inbounds A[i] = k
+            @inbounds A[i] = ks
         end
         offset += window
     end
@@ -165,6 +191,9 @@ function kernel_BTRS!(
                 @inbounds n = count[I1]
                 @inbounds p = prob[CartesianIndex(I1, I2)]
             end
+            # BTRS approximations work well for p <= 0.5
+            # invert p and set `invert` flag
+            (invert = p > 0.5f0) && (p = 1-p)
         else
             n = 0
             p = 0f0
@@ -197,10 +226,6 @@ function kernel_BTRS!(
             end
         # BTRS algorithm
         else
-            # BTRS approximations work well for p <= 0.5
-            # invert p and set `invert` flag
-            (invert = p > 0.5f0) && (p = 1f0 - p)
-
             r       = p/(1f0-p)
             s       = p*(1f0-p)
     
@@ -239,12 +264,15 @@ function kernel_BTRS!(
                     break
                 end
             end
-            invert && (ks = n - ks)
             k = Int(ks)
         end
 
         if i <= length(A)
-            @inbounds A[i] = k
+            if invert
+                @inbounds A[i] = n - k
+            else
+                @inbounds A[i] = k
+            end
         end
         offset += window
     end
diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl
index 509129c..09e1641 100644
--- a/src/rand_binomial.jl
+++ b/src/rand_binomial.jl
@@ -65,20 +65,56 @@ end
 
 ## constant (scalar) parameters
 function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractFloat)
-    kernel  = @cuda launch=false kernel_BTRS_scalar!(
-        A, count, Float32(prob), rng.seed, rng.counter
-    )
+    n = count
+
+    # edge cases
+    if prob <= 0 || n <= 0
+        A .= 0
+        return A
+    elseif prob >= 1
+        A .= n
+        return A
+    end
+
+    invert = prob > 0.5f0
+    if invert
+        p = 1 - prob
+    else
+        p = prob
+    end
+
+    # Use naive algorithm for n <= 17
+    if n <= 17
+        kernel = @cuda launch=false kernel_naive_scalar!(
+            A, n, Float32(p), rng.seed, rng.counter
+        )
+    # Use inversion algorithm for n*p < 10
+    elseif n * p < 10f0
+        kernel = @cuda launch=false kernel_inversion_scalar!(
+            A, n, Float32(p), rng.seed, rng.counter
+        )
+    # BTRS algorithm
+    else
+        kernel = @cuda launch=false kernel_BTRS_scalar!(
+            A, n, Float32(p), rng.seed, rng.counter
+        )
+    end
+
     config  = launch_configuration(kernel.fun)
     threads = max(32, min(config.threads, length(A)))
     blocks  = min(config.blocks, cld(length(A), threads))
-    kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks)
+    kernel(A, n, Float32(p), rng.seed, rng.counter; threads=threads, blocks=blocks)
 
     new_counter = Int64(rng.counter) + length(A)
     overflow, remainder = fldmod(new_counter, typemax(UInt32))
     rng.seed += overflow     # XXX: is this OK?
     rng.counter = remainder
 
-    return A
+    if invert
+        return n .- A
+    else
+        return A
+    end
 end
 
 ## arrays of parameters
diff --git a/test/Project.toml b/test/Project.toml
index faeac78..ae5622a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,6 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index 1af5c53..cf9ec21 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,7 @@
 using BinomialGPU
 using CUDA
+using Distributions
+using Statistics
 
 using BenchmarkTools
 using Test
@@ -92,22 +94,6 @@ using Test
             @test rand_binomial!(A, count = 2, prob = 1.5) == CUDA.fill(2, 256) # probabilities greater than 1 are equivalent to 1
             @test_throws MethodError rand_binomial!(A, count = 5., prob = 0.5) # non-integer counts throw an error
         end
-
-        @testset "benchmarks" begin
-            # benchmarks
-            A = CUDA.zeros(Int, 1024, 1024)
-            n = 128
-            p = 0.5
-            ns = CUDA.fill(128, (1024, 1024))
-            ps = CUDA.rand(1024, 1024)
-            println("")
-            println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
-            display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
-            println("")
-            println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
-            display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
-            println("")
-        end
     end # in-place
 
     @testset "out-of-place" begin
@@ -179,4 +165,55 @@ using Test
             end
         end
     end # out-of-place
+
+    @testset "Distributional tests" begin
+        function mean_var_CI(m, S2, n, p, N, α)
+            truemean = n*p
+            truevar = n*p*(1-p)
+            a = quantile(Normal(), α/2)
+            b = quantile(Normal(), 1-α/2)
+            c = quantile(Chisq(N-1), α/2)
+            d = quantile(Chisq(N-1), 1-α/2)
+            @test a <= sqrt(N/truevar)*(m - truemean) <= b
+            @test c <= (N-1)*S2/truevar <= d
+        end
+        @testset "Scalar parameters" begin
+            function test_mean_variance(N, n, p)
+                CUDA.@sync A = rand_binomial(N, count = n, prob = p)
+                mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
+            end
+            N = 2^20
+            @testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
+                p in 0.1:0.1:0.9
+                test_mean_variance(N, n, p)
+            end
+        end
+        @testset "Arrays of parameters" begin
+            function test_mean_variance(N, n, p)
+                CUDA.@sync A = rand_binomial(N, count = fill(n, N), prob = fill(p, N))
+                mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
+            end
+            N = 2^20
+            @testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
+                p in 0.1:0.1:0.9
+                test_mean_variance(N, n, p)
+            end
+        end
+    end # Distributional tests
+
+    @testset "benchmarks" begin
+        # benchmarks
+        A = CUDA.zeros(Int, 1024, 1024)
+        n = 128
+        p = 0.5
+        ns = CUDA.fill(128, (1024, 1024))
+        ps = CUDA.rand(1024, 1024)
+        println("")
+        println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
+        display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
+        println("")
+        println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
+        display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
+        println("")
+    end
 end