refactor: general code cleanups #84

Merged 15 commits on Jul 14, 2024
12 changes: 6 additions & 6 deletions Project.toml
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "0.3.30"
version = "0.3.31-DEV"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -10,6 +10,7 @@ DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
@@ -18,19 +19,18 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
UnrolledUtilities = "0fe1646c-419e-43be-ac14-22321958931b"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[extensions]
LuxLibAMDGPUExt = "AMDGPU"
LuxLibCUDAExt = "CUDA"
LuxLibForwardDiffExt = "ForwardDiff"
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
LuxLibTrackerExt = "Tracker"
@@ -52,7 +52,7 @@ FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
LinearAlgebra = "1.10"
LuxCore = "0.1.13"
LuxDeviceUtils = "0.1.23"
LuxDeviceUtils = "0.1.26"
LuxTestUtils = "0.1.18"
Markdown = "1.10"
NNlib = "0.9.13"
@@ -66,6 +66,7 @@ StableRNGs = "1"
Statistics = "1.10"
Test = "1.10"
Tracker = "0.2.34"
UnrolledUtilities = "0.1.2"
Zygote = "0.6.69"
cuDNN = "1.3"
julia = "1.10"
@@ -74,7 +75,6 @@ julia = "1.10"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -87,4 +87,4 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["Aqua", "ComponentArrays", "ExplicitImports", "ForwardDiff", "LuxTestUtils", "Pkg", "Preferences", "ReTestItems", "ReverseDiff", "StableRNGs", "Test", "Tracker", "Zygote"]
test = ["Aqua", "ComponentArrays", "ExplicitImports", "LuxTestUtils", "Pkg", "Preferences", "ReTestItems", "ReverseDiff", "StableRNGs", "Test", "Tracker", "Zygote"]
26 changes: 13 additions & 13 deletions ext/LuxLibAMDGPUExt.jl
@@ -23,26 +23,26 @@ for (wT, xT) in [(Float64, Float64), (Float64, Float32), (Float32, Float64)],

for bT in (Float32, Float64)
@eval begin
function LuxLib.$fname(σ::F, weigjt::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
function LuxLib.$fname(σ::F, weight::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
b::ROCArray{$(bT), N}, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to \
Float32 to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.$fname(σ, LuxLib._oftype_array(Float32, weigjt),
LuxLib._oftype_array(Float32, x),
LuxLib._oftype_array(Float32, b), cdims))
@warn "MIOpen doesn't support Float64 convolutions, type-casting \
everything to Float32 to avoid runtime errors" maxlog=1
return LuxLib._ofeltype_array(Float64,
LuxLib.$fname(σ, LuxLib._ofeltype_array(Float32, weight),
LuxLib._ofeltype_array(Float32, x),
LuxLib._ofeltype_array(Float32, b), cdims))
end
end
end

@eval begin
function LuxLib.$fname(σ::F, weigjt::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
function LuxLib.$fname(σ::F, weight::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
b::Nothing, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to \
Float32 to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.$fname(σ, LuxLib._oftype_array(Float32, weigjt),
LuxLib._oftype_array(Float32, x), b, cdims))
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything \
to Float32 to avoid runtime errors" maxlog=1
return LuxLib._ofeltype_array(Float64,
LuxLib.$fname(σ, LuxLib._ofeltype_array(Float32, weight),
LuxLib._ofeltype_array(Float32, x), b, cdims))
end
end
end
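Editor's note on the hunks above: besides fixing the `weigjt` typo, the calls are switched from `_oftype_array` to `_ofeltype_array`. The sketch below is a hypothetical illustration of what such an element-type conversion helper is assumed to do, inferred from its use in this diff; it is not LuxLib's actual definition.

# Hypothetical sketch (assumption, not from this PR): convert an array's element
# type while preserving the array type, and pass `nothing` (the "no bias" case)
# through unchanged.
_ofeltype_array(::Type{T}, x::AbstractArray) where {T} = T.(x)
_ofeltype_array(::Type, ::Nothing) = nothing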
20 changes: 0 additions & 20 deletions ext/LuxLibCUDAExt/LuxLibCUDAExt.jl
@@ -11,26 +11,6 @@ using NNlib: NNlib

const CRC = ChainRulesCore

const cuBLASLt_functional = Ref(true)

function __init__()
try
# Test if cuBLASLt is functional
y = CUDA.zeros(Float32, 2, 2)
w = CUDA.rand(Float32, 2, 2)
x = CUDA.rand(Float32, 2, 2)
b = CUDA.rand(Float32, 2)
LuxLib._cublaslt_matmul_fused!(y, identity, w, x, b)
catch
cuBLASLt_functional[] = false
end

if CUDA.functional() && !cuBLASLt_functional[]
@warn "cuBLASLt is not functional on this system. We won't be able to use \
optimized implementations of certain matmul operations."
end
end

# Low level functions
include("cublaslt.jl")

59 changes: 19 additions & 40 deletions ext/LuxLibCUDAExt/cublaslt.jl
@@ -1,7 +1,7 @@
const TransOrAdjOrRegStridedCuMatrix{T} = Union{Transpose{T, <:StridedCuMatrix{T}},
Adjoint{T, <:StridedCuMatrix{T}}, StridedCuMatrix{T}}

function LuxLib._cublaslt_matmul_fused!(
function _cublaslt_matmul_fused!(
@nospecialize(y::TransOrAdjOrRegStridedCuMatrix{<:Real}), σ::F,
@nospecialize(w::TransOrAdjOrRegStridedCuMatrix{<:Real}),
@nospecialize(x::TransOrAdjOrRegStridedCuMatrix{<:Real}),
@@ -10,12 +10,11 @@ function LuxLib._cublaslt_matmul_fused!(
transy = y isa Transpose || y isa Adjoint
transx = x isa Transpose || x isa Adjoint
transw = w isa Transpose || x isa Adjoint
return LuxLib._cublaslt_matmul_fused!(
return _cublaslt_matmul_fused!(
transy, parent(y), σ, transw, parent(w), transx, parent(x), b, aux)
end

function LuxLib._cublaslt_matmul_fused!(
transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
function _cublaslt_matmul_fused!(transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
transw::Bool, @nospecialize(w::StridedCuMatrix{wT}), transx::Bool,
@nospecialize(x::StridedCuMatrix{xT}), b::Optional{<:StridedCuVector},
aux::Optional{<:StridedCuMatrix}) where {F, yT, wT, xT}
@@ -26,19 +25,17 @@ function LuxLib._cublaslt_matmul_fused!(
wxT = promote_type(wT, xT, bT, auxT)
@warn "Mixed Precision Inputs received for `weight`: $(typeof(w)) and `x`: \
$(typeof(x)). Promoting to $(wxT)." maxlog=1
return LuxLib._cublaslt_matmul_fused!(
transy, y, σ, transw, LuxLib._oftype_array(wxT, w),
transx, LuxLib._oftype_array(wxT, x),
LuxLib._oftype_array(wxT, b), LuxLib._oftype_array(wxT, aux))
return _cublaslt_matmul_fused!(transy, y, σ, transw, LuxLib._ofeltype_array(wxT, w),
transx, LuxLib._ofeltype_array(wxT, x),
LuxLib._ofeltype_array(wxT, b), LuxLib._ofeltype_array(wxT, aux))
end

# TODO: use https://docs.nvidia.com/cuda/cublas/#cublasltmatmul for a more robust
# computeType mapping. Currently no one uses Lux with weird type combinations so we
# don't need to worry about it too much and just fall back to the generic
# implementation
# Returns: 0 if successful, -1 if unsuccessful
function LuxLib._cublaslt_matmul_fused!(
transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
function _cublaslt_matmul_fused!(transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
transw::Bool, @nospecialize(w::StridedCuMatrix{wxT}), transx::Bool,
@nospecialize(x::StridedCuMatrix{wxT}), b::Optional{<:StridedCuVector},
aux::Optional{<:StridedCuMatrix}) where {F, yT, wxT}
@@ -146,45 +143,27 @@ end
function __epilogue_act(f::F, b, aux) where {F}
if f === identity
@assert aux===nothing "`aux` must be `nothing` for `identity` activation."
if b === nothing
return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, true
else
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, true
end
b === nothing && return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, true
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, true
elseif f === NNlib.relu
if b === nothing
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_RELU, true
else
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_RELU, true
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX, true
else
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS, true
else
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX_BIAS, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS, true
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX_BIAS, true
end
elseif f === NNlib.gelu
if b === nothing
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_GELU, true
else
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_GELU, true
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX, true
else
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_GELU_BIAS, true
else
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX_BIAS, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_GELU_BIAS, true
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX_BIAS, true
end
else
@assert aux===nothing "`aux` must be `nothing` for `$(f)` activation."
if b === nothing
return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, false
else
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, false
end
b === nothing && return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, false
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, false
end
end
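Editor's note: the flattened `__epilogue_act` above keeps the same return convention as before the cleanup — a cuBLASLt epilogue constant plus a flag saying whether the activation (identity, relu, or gelu) was fused into the epilogue or must still be applied separately. An illustrative usage sketch, assuming the internal names shown in this diff and a CUDA bias vector `b`:

# Illustrative only: choose an epilogue for a relu + bias fused matmul.
epilogue, act_fused = __epilogue_act(NNlib.relu, b, nothing)
# With a bias and no aux buffer this yields CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS
# and `act_fused == true`, so the caller can skip a separate activation pass.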
44 changes: 17 additions & 27 deletions ext/LuxLibCUDAExt/fused_dense.jl
@@ -1,25 +1,30 @@
__length(x) = length(x)
__length(::Nothing) = nothing

function __might_use_cuBLASLt(::Z, ::A, ::W, ::X, ::B) where {Z, A, W, X, B}
cuBLASLt_functional[] || return false
return hasmethod(LuxLib._cublaslt_matmul_fused!, (Z, A, W, X, B))
end

@stable default_mode="warn" function LuxLib.__fused_dense_bias_activation_impl(
act::F, weight::AnyCuMatrix, x::AnyCuMatrix, b::Optional{<:AnyCuVector}) where {F}
y = similar(x, LuxLib.__get_concrete_fba_output_eltype(act, weight, x, b),
function __try_cublasLt_fused_matmul(act::F, weight::AnyCuMatrix, x::AnyCuMatrix,
b::Optional{<:AnyCuVector}, ::Val{cache}) where {F, cache}
z = similar(x, LuxLib.__get_concrete_fba_output_eltype(act, weight, x, b),
size(weight, 1), size(x, 2))
if __might_use_cuBLASLt(y, act, weight, x, b)
retcode = LuxLib._cublaslt_matmul_fused!(y, act, weight, x, b)
retcode == 0 && return y
y = z # aliased for now for type stability
if hasmethod(_cublaslt_matmul_fused!,
(typeof(z), typeof(act), typeof(weight), typeof(x), typeof(b)))
cache && (y = similar(z)) # break aliasing
retcode = _cublaslt_matmul_fused!(z, act, weight, x, b, ifelse(cache, y, nothing))
retcode == 0 && return (z, y, retcode)
# cuBLASLt failed for the given inputs use the generic fallback
@warn "cuBLASLt failed for the given inputs $(act), $(typeof(weight)) \
[$(size(weight))], $(typeof(x)) [$(size(x))], $(typeof(b)) \
[$(__length(b))]. Falling back to generic implementation." maxlog=1
else
@warn "cuBLASLt not available. Falling back to generic implementation." maxlog=1
end
return (z, y, -1)
end

@stable default_mode="warn" function LuxLib.__fused_dense_bias_activation_impl(
act::F, weight::AnyCuMatrix, x::AnyCuMatrix, b::Optional{<:AnyCuVector}) where {F}
(y, _, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(false))
retcode == 0 && return y
LuxLib.__matmul!(y, weight, x)
return LuxLib.__apply_bias_activation!!(act, y, b, Val(false))
end
@@ -28,22 +33,7 @@ end
function CRC.rrule(::CRC.RuleConfig{>:CRC.HasReverseMode},
::typeof(LuxLib.__fused_dense_bias_activation_impl), act::typeof(NNlib.gelu),
weight::AnyCuMatrix, x::AnyCuMatrix, b::Union{AnyCuVector, Nothing})
z = similar(x, LuxLib.__get_concrete_fba_output_eltype(NNlib.gelu, weight, x, b),
size(weight, 1), size(x, 2))
y = z # aliased for now for type stability
retcode = -1
if __might_use_cuBLASLt(z, act, weight, x, b)
y = similar(z) # break aliasing
retcode = LuxLib._cublaslt_matmul_fused!(z, act, weight, x, b, y)
if retcode == -1
@warn "cuBLASLt failed for the given inputs $(act), $(typeof(weight)) \
[$(size(weight))], $(typeof(x)) [$(size(x))], $(typeof(b)) \
[$(__length(b))]. Falling back to generic implementation." maxlog=1
end
else
@warn "cuBLASLt not available. Falling back to generic implementation." maxlog=1
end

(z, y, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(true))
if retcode == -1
# Generic Fallback: break aliasing in _apply_bias_activation!!
LuxLib.__matmul!(z, weight, x)
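Editor's note: the rest of the `rrule` is collapsed in this view, but the shape of the refactor is visible above — both the forward implementation and the `gelu` `rrule` now go through the shared `__try_cublasLt_fused_matmul` helper instead of duplicating the cuBLASLt probing and warning logic. A rough sketch of the two call sites as they appear in this diff (illustrative excerpt, not drop-in code):

# Forward pass: no auxiliary buffer is requested.
(y, _, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(false))
# rrule for gelu: `Val(true)` requests the auxiliary buffer, which the gelu
# epilogue fills for use in the backward pass; a retcode of -1 triggers the
# generic fallback in both cases.
(z, y, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(true))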