diff --git a/src/compiler/emit.jl b/src/compiler/emit.jl index 6d03c82e2..94c7a7fa3 100644 --- a/src/compiler/emit.jl +++ b/src/compiler/emit.jl @@ -64,7 +64,7 @@ function forward_stacks!(adj, F) isconcretetype(T) || (T = Any) rec = insert_node!(adj.forw, length(adj.forw.stmts), T, xtuple(recs...)) - if usetyped + if usetyped && length(adj.perm) > 1 rec = insert_node!(adj.forw, length(adj.forw.stmts), Pullback{F,T}, Expr(:call, Pullback{F,T}, rec)) else @@ -119,13 +119,18 @@ end varargs(m::Method, n) = m.isva ? n - m.nargs + 1 : nothing +function getmeta(T) + m = meta(T) + (usetyped && m != nothing) || return m + any(x -> isexpr(x, :goto, :gotoifnot), m.code.code) || return IRTools.meta(T) + return m +end + function _lookup_grad(T) - (m = meta(T)) == nothing && return - usetyped && m.ret == Union{} && return + (m = getmeta(T)) == nothing && return + m isa IRTools.TypedMeta && m.ret == Union{} && return va = varargs(m.method, length(T.parameters)) forw, back = stacks!(Adjoint(IRCode(m), varargs = va), T) - # verify_ir(forw) - # verify_ir(back) m, forw, back end diff --git a/src/lib/broadcast.jl b/src/lib/broadcast.jl index ad1f876c9..ff94a65de 100644 --- a/src/lib/broadcast.jl +++ b/src/lib/broadcast.jl @@ -13,8 +13,7 @@ # `--' `" `--' `" `'-' using Base.Broadcast -using Base.Broadcast: Broadcasted, AbstractArrayStyle, broadcasted, materialize, - instantiate, flatten, combine_eltypes +using Base.Broadcast: AbstractArrayStyle, broadcasted # There's a saying that debugging code is about twice as hard as writing it in # the first place. So if you're as clever as you can be when writing code, how @@ -28,218 +27,72 @@ using Base.Broadcast: Broadcasted, AbstractArrayStyle, broadcasted, materialize, # Base's broadcasting is very cleverly written, and this makes differentiating # it... somewhat tricky. -# Structural utilities -# ==================== - -using Base: tail - -tcat(x) = x -tcat(x, y, z...) = tcat((x..., y...), z...) - -broadcast_args(x) = (x,) -broadcast_args(bc::Broadcasted) = tcat(map(broadcast_args, bc.args)...) - -_unflatten(x, xs) = first(xs), tail(xs) - -_unflatten(x::Tuple{}, xs) = (), xs - -function _unflatten(x::Tuple, xs) - t1, xs1 = _unflatten(first(x), xs) - t2, xs2 = _unflatten(tail(x), xs1) - (t1, t2...), xs2 -end - -function _unflatten(bc::Broadcasted, xs) - t, xs′ = _unflatten(bc.args, xs) - (args=t,f=nothing,axes=nothing), xs′ -end - -unflatten(x, xs) = _unflatten(x, xs)[1] - -unflatten(x, xs::Nothing) = nothing +# Utilities +# ========= accum_sum(xs; dims = :) = reduce(accum, xs, dims = dims) # Work around reducedim_init issue accum_sum(xs::AbstractArray{Nothing}; dims = :) = nothing accum_sum(xs::AbstractArray{<:Real}; dims = :) = sum(xs, dims = dims) +accum_sum(xs::Real; dims = :) = xs trim(x, Δ) = reshape(Δ, ntuple(i -> size(Δ, i), Val(ndims(x)))) -unbroadcast(x::AbstractArray, Δ) = - size(x) == size(Δ) ? Δ : - length(x) == length(Δ) ? trim(x, Δ) : - trim(x, accum_sum(Δ, dims = ntuple(i -> size(x, i) == 1 ? i : ndims(Δ)+1, Val(ndims(Δ))))) - -unbroadcast(x::Union{Number,Ref}, Δ) = accum_sum(Δ) - -# Trivial Mode -# ============ - -# In some cases, such as `exp.(a .+ b)`, we can see the the gradient only depends -# on the output. Handling these specially is great for performance and memory -# usage, though of course relatively limited. It happens that the set of cases -# lines up nicely with activation functions commonly used in neural nets, though. - -# TODO fix this up and use it - -Jtrivial(f, a...) = nothing -Jtrivial(::typeof(+), a...) = a -Jtrivial(::typeof(-), a, b) = (a..., .-b...) - -trivia(_) = (1,) -function trivia(bc::Broadcasted) - t = map(trivia, bc.args) - any(t -> t === nothing, t) && return - Jtrivial(bc.f, t...) -end - -Joutput(f, a...) = nothing -Joutput(::typeof(exp), x) = map(t -> y -> y*t, x) - -function Jbroadcast(bc::Broadcasted) - t = map(trivia, bc.args) - any(t -> t === nothing, t) && return - Joutput(bc.f, t...) -end - -@inline function unbroadcast_t(x, y, ȳ, j::J) where J - trim(x, j.(y).*ȳ) -end - -@inline function unbroadcast_t(x::Number, y, ȳ, j::J) where J - x̄ = zero(float(x)) - @simd for I in eachindex(y) - @inbounds x̄ += j(y[I])*ȳ[I] - end - return x̄ -end - -function ∇broadcast_t(bc::Broadcasted, J) - y = copy(instantiate(bc)) - back(ȳ) = map(unbroadcast_t, broadcast_args(bc), map(_ -> y, J), map(_ -> ȳ, J), J) - return y, back -end - -# Forward Mode -# ============ - -# Forward mode has many limitations – mainly in that it only supports reals / -# arrays of reals and small numbers of inputs – but in those cases it works very -# generally across broadcasted functions, and handles loops particularly well. -# Most importantly it's easy on the compiler, so until we figure out reverse -# mode we're maintaining this implementation for common cases. - -import ForwardDiff -using ForwardDiff: Dual - -dualtype(::Type{Dual{G,T,P}}) where {G,T,P} = T -dualtype(T) = T - -function dual_function(f::F) where F - function (args::Vararg{Any,N}) where N - ds = map(args, ntuple(identity,Val(N))) do x, i - Dual(x, ntuple(j -> i==j, Val(N))) - end - return f(ds...) - end -end - -dualify(bc::Broadcasted{S}) where S = Broadcasted{S}(dual_function(bc.f), bc.args, bc.axes) +unbroadcast(x::AbstractArray, x̄) = + size(x) == size(x̄) ? x̄ : + length(x) == length(x̄) ? trim(x, x̄) : + trim(x, accum_sum(x̄, dims = ntuple(i -> size(x, i) == 1 ? i : ndims(x̄)+1, Val(ndims(x̄))))) -@inline function broadcast_gradient!(bc::Broadcasted, dest::AbstractArray, grads...) - @simd for I in eachindex(bc) - @inbounds begin - out = bc[I] - dest[I] = ForwardDiff.value(out) - Δs = out isa Dual ? out.partials.values : map(_ -> false, grads) - map((g, p) -> g[I] = p, grads, Δs) - end - end -end - -function broadcast_gradient(bc::Broadcasted, ::Type{T}) where T - dest = similar(bc, T) - grads = map(_ -> similar(bc, promote_type(T,Bool)), bc.args) - broadcast_gradient!(bc, dest, grads...) - return dest, grads -end +unbroadcast(x::Union{Number,Ref}, x̄) = accum_sum(x̄) -@inline function ∇broadcast_f(bc′::Broadcasted) - bc = dualify(instantiate(flatten(bc′))) - T = combine_eltypes(bc.f, bc.args) - T <: Bool && return copy(bc′), _ -> nothing - y, gs = broadcast_gradient(bc, dualtype(T)) - back(Δ) = (unflatten(bc′, map((x, d) -> unbroadcast(x, Δ.*d), bc.args, gs)),) - return y, back -end +# Split Reverse Mode +# ================== -function ∇broadcast_f(bc::Broadcasted{<:AbstractArrayStyle{0}}) - out = dualify(instantiate(flatten(bc)))[] - return out.value, Δ -> (unflatten(bc, map(x -> x*Δ, out.partials.values)),) -end +# TODO: use DiffRules here. It's complicated a little by the fact that we need +# to do CSE, then broadcast-ify the expression so that the closure captures the +# right arrays. -# Compatibility test +@adjoint broadcasted(::typeof(+), xs...) = + broadcast(+, xs...), ȳ -> (nothing, map(x -> unbroadcast(x, ȳ), xs)...) -isrealinput(x) = x isa Union{Real,AbstractArray{<:Real}} -isrealinput(bc::Broadcasted) = all(isrealinput, bc.args) +@adjoint broadcasted(::typeof(*), x, y) = x.*y, + z̄ -> (nothing, unbroadcast(x, z̄ .* y), unbroadcast(y, z̄ .* x)) -# Reverse Mode -# ============ +# General Fallback +# ================ # The fused reverse mode implementation is the most general but currently has # poor performance. It works by flattening the broadcast and mapping the call to # `_forward` over the input. # However, the core call -# broadcast(_forward, (cx,), bc′.f, bc′.args...) +# broadcast(_forward, (cx,), f, args...) # is already 10x slower than a simple broadcast (presumably due to inlining # issues, or something similar) and the other operations needed take it to about # 100x overhead. -# One thing to experiment with would be a non-fused reverse mode, which is the -# more typical option for this kind of AD. While less efficient than fusing -# in principle, it's likely that this can be much easier on the compiler. - @generated inclen(::NTuple{N,Any}) where N = Val(N+1) -function ∇broadcast_r(cx, bc::Broadcasted) - bc′, unflatten = _forward(cx, Broadcast.flatten, bc) - len = inclen(bc′.args) - y∂b = broadcast(_forward, (cx,), bc′.f, bc′.args...) +@adjoint function broadcasted(::AbstractArrayStyle, f, args...) + len = inclen(args) + y∂b = broadcast((x...) -> _forward(__context__, f, x...), args...) y = map(x -> x[1], y∂b) ∂b = map(x -> x[2], y∂b) y, function (ȳ) dxs_zip = map((∂b, ȳ) -> ∂b(ȳ), ∂b, ȳ) dxs = ntuple(i -> map(x -> x[i], dxs_zip), len) - (f = accum_sum(dxs[1]), - args = map(unbroadcast, bc′.args, Base.tail(dxs)), - axes = nothing) |> unflatten |> Base.tail + (nothing, accum_sum(dxs[1]), map(unbroadcast, args, Base.tail(dxs))...) end end -function ∇broadcast_r(bc::Broadcasted{<:AbstractArrayStyle{0}}) - bc′, unflatten = _forward(Broadcast.flatten, bc) - len = Val(length(bc′.args)+1) - y, ∂b = broadcast(_forward, bc′.f, bc′.args...) +@adjoint function broadcasted(::AbstractArrayStyle{0}, f, args...) + len = inclen(args) + y, ∂b = broadcast((x...) -> _forward(__context__, f, x...), args...) y, function (ȳ) dxs = ∂b(ȳ) - (f = dxs[1], - args = Base.tail(dxs), - axes = nothing) |> unflatten |> Base.tail + (nothing, dxs...) end end -∇broadcast(cx, bc::Broadcasted, J) = ∇broadcast_t(bc, J) - -∇broadcast(cx, bc::Broadcasted, ::Nothing) = - isrealinput(bc) ? ∇broadcast_f(bc) : ∇broadcast_r(cx, bc) - -∇broadcast(cx, bc::Broadcasted) = ∇broadcast(cx, bc, Jbroadcast(bc)) - -@adjoint function broadcasted(f, args...) - broadcasted(f, args...), Δ -> (nothing, Δ.args...) -end - -@adjoint materialize(bc::Broadcasted{<:AbstractArrayStyle}) = - ∇broadcast(__context__, bc, nothing) +@adjoint! (b::typeof(broadcast))(f, args...) = _forward(__context__, broadcasted, f, args...) diff --git a/src/lib/grad.jl b/src/lib/grad.jl index 6cadd4b34..5dbda6265 100644 --- a/src/lib/grad.jl +++ b/src/lib/grad.jl @@ -4,6 +4,7 @@ using MacroTools: combinedef named(arg) = isexpr(arg, :(::)) && length(arg.args) == 1 ? :($(gensym())::$(arg.args[1])) : arg typeless(x) = MacroTools.prewalk(x -> isexpr(x, :(::)) ? x.args[1] : x, x) +isvararg(x) = isexpr(x, :(::)) && namify(x.args[2]) == :Vararg for n = 0:3 gradtuple = Symbol(:gradtuple, n) @@ -26,8 +27,11 @@ function gradm(ex, mut = false) (esc(gensym()), :(Core.Typeof($(esc(name))))) kT = :(Core.kwftype($T)) Ts == nothing && (Ts = []) - args = esc.(named.(args)) - argnames = typeless.(args) + args = named.(args) + argnames = Any[typeless(arg) for arg in args] + isvararg(args[end]) && (argnames[end] = :($(argnames[end])...)) + args = esc.(args) + argnames = esc.(argnames) Ts = esc.(Ts) cx = :($(esc(:__context__))::Context) fargs = kw == nothing ? [cx, :($f::$T), args...] : [kw, cx, :($f::$T), args...] diff --git a/src/lib/lib.jl b/src/lib/lib.jl index 674376bf6..6b335bb68 100644 --- a/src/lib/lib.jl +++ b/src/lib/lib.jl @@ -60,6 +60,8 @@ end # Tuples +using Base: tail + @adjoint tuple(xs...) = xs, identity @adjoint getindex(xs::NTuple{N,Any}, i::Integer) where N = diff --git a/test/gradcheck.jl b/test/gradcheck.jl index 5230d4ff5..a0ee9907f 100644 --- a/test/gradcheck.jl +++ b/test/gradcheck.jl @@ -1,8 +1,6 @@ -using Zygote, NNlib, Test, Random, LinearAlgebra +using Zygote, NNlib, Test, Random, LinearAlgebra, Statistics using Zygote: gradient using NNlib: conv -using Statistics -import Random function ngradient(f, xs::AbstractArray...) grads = zero.(xs) @@ -49,7 +47,7 @@ Random.seed!(0) @test gradtest(x -> logsoftmax(x).*(1:3), 3) @test gradtest(x -> logsoftmax(x).*(1:3), (3,5)) -@test gradtest(x -> x', rand(5)) +@test_broken gradtest(x -> x', rand(5)) @test gradtest(det, (4, 4)) @test gradtest(logdet, map(x -> x*x', (rand(4, 4),))[1]) @@ -355,6 +353,6 @@ end @testset "broadcast" begin if !Zygote.usetyped - @test_broken gradient(x -> sum(sin.(x)), Diagonal(randn(3)))[1][2] == 1 + @test gradient(x -> sum(sin.(x)), Diagonal(randn(3)))[1][2] == 1 end end