diff --git a/Project.toml b/Project.toml
index d98ad9f1..a1272138 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod "]
-version = "0.12.151"
+version = "0.12.152"
 
 [weakdeps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -57,5 +57,5 @@ Static = "0.8.4"
 StaticArrayInterface = "1"
 ThreadingUtilities = "0.5"
 UnPack = "1"
-VectorizationBase = "0.21.53"
+VectorizationBase = "0.21.60"
 julia = "1.6"
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 54001f87..744e4b29 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -196,7 +196,8 @@ export LowDimArray,
   vfilter,
   vfilter!,
   vmapreduce,
-  vreduce
+  vreduce,
+  vcount
 
 const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL =
   Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##")
@@ -234,6 +235,7 @@ include("reconstruct_loopset.jl")
 include("constructors.jl")
 include("user_api_conveniences.jl")
 include("simdfunctionals/mapreduce.jl")
+include("simdfunctionals/count.jl")
 include("broadcast.jl")
 
 """
diff --git a/src/simdfunctionals/count.jl b/src/simdfunctionals/count.jl
new file mode 100644
index 00000000..5d4c64e1
--- /dev/null
+++ b/src/simdfunctionals/count.jl
@@ -0,0 +1,57 @@
+_vcount(f) = 0
+function _vcount(f::F, args::Vararg{DenseArray,M}) where {F,M}
+  x = first(args)
+  y = Base.tail(args)
+  # The loads below are unchecked and bounded only by `length(x)`, so a size
+  # mismatch must always be rejected; `@assert` is documented as removable.
+  foreach(y) do a
+    size(a) == size(x) || throw(
+      DimensionMismatch("`vcount` requires all arrays to have the same size")
+    )
+  end
+  N = length(x)
+  ptrargs = map(VectorizationBase.zstridedpointer, args)
+  i = 0
+  V = VectorizationBase.pick_vector_width(
+    reduce(promote_type, map(eltype, ptrargs))
+  )
+  W = unwrap(V)
+  UNROLL = 4
+  LOG2UNROLL = 2
+  counts = if VERSION >= v"1.7"
+    VecUnroll(ntuple(Returns(0), Val(UNROLL)))
+  else
+    VecUnroll(ntuple(_ -> (0), Val(UNROLL)))
+  end
+  while i < vsub_nsw(N, ((W << LOG2UNROLL) - 1))
+    index = VectorizationBase.Unroll{1,W,UNROLL,1,W,zero(UInt)}((i,))
+    counts += count_ones(f(VectorizationBase.fmap(vload, ptrargs, index)...))
+    i = vadd_nw(i, StaticInt{UNROLL}() * W)
+  end
+  count = reduce_tup(+, data(counts))
+  # Whole vectors left over by the unrolled loop (fewer than UNROLL of them).
+  while i < vsub_nsw(N, (W - 1))
+    count += count_ones(f(map1(vload, ptrargs, (MM{W}(i),))...))
+    i = vadd_nw(i, W)
+  end
+  if i < N
+    m = mask(StaticInt(W), N & (W - 1))
+    vfinal = f(map1(vload, ptrargs, (MM{W}(i),), m)...)
+    count += count_ones(vfinal & m)
+  end
+  count
+end
+
+"""
+    vcount(f, args::DenseArray...)
+
+SIMD counterpart of `count`: return how many positions satisfy `f`, where `f` is
+applied to vectors loaded from `args` rather than to individual scalars.
+
+All arrays must have the same size; otherwise a `DimensionMismatch` is thrown.
+Calling it with no arrays returns `0`.
+"""
+@generated function vcount(f::F, args::Vararg{DenseArray,M}) where {F,M}
+  call = Expr(:call, :_vcount, :f)
+  gc_preserve_call_quote(call, M::Int)
+end
diff --git a/src/simdfunctionals/map.jl b/src/simdfunctionals/map.jl
index 3f22f8e2..f6669945 100644
--- a/src/simdfunctionals/map.jl
+++ b/src/simdfunctionals/map.jl
@@ -74,8 +74,11 @@ function vmap_singlethread!(
   ::Val{NonTemporal},
   args::Vararg{AbstractArray,A}
 ) where {F,T<:NativeTypes,A,NonTemporal}
-  ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...)
-  _vmap_singlethread!(f, ptry, Zero(), N, Val{NonTemporal}(), ptrargs)
+  presy = preserve_buffer(y)
+  GC.@preserve presy begin
+    ptry, ptrargs, N = setup_vmap!(f, y, Val{NonTemporal}(), args...)
+    _vmap_singlethread!(f, ptry, Zero(), N, Val{NonTemporal}(), ptrargs)
+  end
   nothing
 end
 function _vmap_singlethread!(
@@ -263,20 +266,25 @@ function vmap_multithread!(
   end
   nothing
 end
-function gc_preserve_vmap_quote(NonTemporal::Bool, Threaded::Bool, A::Int)
-  m = Threaded ? :vmap_multithread! : :vmap_singlethread!
-  call = Expr(:call, m, :f, :y, Expr(:call, Expr(:curly, :Val, NonTemporal)))
+function gc_preserve_call_quote(call, A::Int)
   q = Expr(:block, Expr(:meta, :inline))
   gcpres = Expr(:gc_preserve, call)
-  for a ∈ 1:Int(A)::Int
+  for a ∈ 1:A
     arg = Symbol(:arg_, a)
     parg = Symbol(:parg_, a)
-    push!(q.args, Expr(:(=), arg, :(@inbounds args[$a])))#Expr(:ref, :args, a)))
+    push!(q.args, Expr(:(=), arg, :($getfield(args, $a))))
     push!(q.args, Expr(:(=), parg, Expr(:call, :preserve_buffer, arg)))
     push!(call.args, arg)
     push!(gcpres.args, parg)
   end
-  push!(q.args, gcpres, :y)
+  push!(q.args, gcpres)
+  q
+end
+function gc_preserve_vmap_quote(NonTemporal::Bool, Threaded::Bool, A::Int)
+  m = Threaded ? :vmap_multithread! : :vmap_singlethread!
+  call = Expr(:call, m, :f, :y, Expr(:call, Expr(:curly, :Val, NonTemporal)))
+  q = gc_preserve_call_quote(call, A)
+  push!(q.args, :y)
   q
 end
 @generated function gc_preserve_vmap!(
diff --git a/test/grouptests.jl b/test/grouptests.jl
index f3e38273..e97a629b 100644
--- a/test/grouptests.jl
+++ b/test/grouptests.jl
@@ -15,7 +15,7 @@ const START_TIME = time()
   @time if LOOPVECTORIZATION_TEST == "all" || LOOPVECTORIZATION_TEST == "part2"
     if VERSION <= v"1.8" || isempty(VERSION.prerelease)
       using Aqua
-      @time Aqua.test_all(LoopVectorization, ambiguities = false)
+      @time Aqua.test_all(LoopVectorization, ambiguities = false, piracy = false)
     end
     @test isempty(detect_unbound_args(LoopVectorization))
 
diff --git a/test/map.jl b/test/map.jl
index 30cb969f..ed0481bf 100644
--- a/test/map.jl
+++ b/test/map.jl
@@ -37,4 +37,11 @@
   @test vmap(abs2, 1:100) == map(abs2, 1:100)
   @test vmapt(abs2, 1:3:10000) == map(abs2, 1:3:10000)
   @test vmapt(abs2, 1.0:3.0:10000.0) ≈ map(abs2, 1.0:3.0:10000.0)
+
+  for n = -64:64
+    let x = rand(UInt8, (1 << 14) + n)
+      @test count(==(UInt8('\n')), x) == vcount(==(UInt8('\n')), x)
+    end
+  end
+  @test_throws DimensionMismatch vcount(==, zeros(UInt8, 4), zeros(UInt8, 5))
 end