refactor: general code cleanups #84

Merged 15 commits on Jul 14, 2024
12 changes: 6 additions & 6 deletions Project.toml
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "0.3.30"
version = "0.3.31-DEV"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -10,6 +10,7 @@ DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
@@ -18,19 +19,18 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
UnrolledUtilities = "0fe1646c-419e-43be-ac14-22321958931b"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[extensions]
LuxLibAMDGPUExt = "AMDGPU"
LuxLibCUDAExt = "CUDA"
LuxLibForwardDiffExt = "ForwardDiff"
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
LuxLibTrackerExt = "Tracker"
@@ -52,7 +52,7 @@ FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
LinearAlgebra = "1.10"
LuxCore = "0.1.13"
LuxDeviceUtils = "0.1.23"
LuxDeviceUtils = "0.1.26"
LuxTestUtils = "0.1.18"
Markdown = "1.10"
NNlib = "0.9.13"
@@ -66,6 +66,7 @@ StableRNGs = "1"
Statistics = "1.10"
Test = "1.10"
Tracker = "0.2.34"
UnrolledUtilities = "0.1.2"
Zygote = "0.6.69"
cuDNN = "1.3"
julia = "1.10"
@@ -74,7 +75,6 @@ julia = "1.10"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -87,4 +87,4 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["Aqua", "ComponentArrays", "ExplicitImports", "ForwardDiff", "LuxTestUtils", "Pkg", "Preferences", "ReTestItems", "ReverseDiff", "StableRNGs", "Test", "Tracker", "Zygote"]
test = ["Aqua", "ComponentArrays", "ExplicitImports", "LuxTestUtils", "Pkg", "Preferences", "ReTestItems", "ReverseDiff", "StableRNGs", "Test", "Tracker", "Zygote"]
26 changes: 13 additions & 13 deletions ext/LuxLibAMDGPUExt.jl
@@ -23,26 +23,26 @@ for (wT, xT) in [(Float64, Float64), (Float64, Float32), (Float32, Float64)],

for bT in (Float32, Float64)
@eval begin
function LuxLib.$fname(σ::F, weigjt::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
function LuxLib.$fname(σ::F, weight::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
b::ROCArray{$(bT), N}, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to \
Float32 to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.$fname(σ, LuxLib._oftype_array(Float32, weigjt),
LuxLib._oftype_array(Float32, x),
LuxLib._oftype_array(Float32, b), cdims))
@warn "MIOpen doesn't support Float64 convolutions, type-casting \
everything to Float32 to avoid runtime errors" maxlog=1
return LuxLib._ofeltype_array(Float64,
LuxLib.$fname(σ, LuxLib._ofeltype_array(Float32, weight),
LuxLib._ofeltype_array(Float32, x),
LuxLib._ofeltype_array(Float32, b), cdims))
end
end
end

@eval begin
function LuxLib.$fname(σ::F, weigjt::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
function LuxLib.$fname(σ::F, weight::ROCArray{$(wT), N}, x::ROCArray{$(xT), N},
b::Nothing, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to \
Float32 to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.$fname(σ, LuxLib._oftype_array(Float32, weigjt),
LuxLib._oftype_array(Float32, x), b, cdims))
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything \
to Float32 to avoid runtime errors" maxlog=1
return LuxLib._ofeltype_array(Float64,
LuxLib.$fname(σ, LuxLib._ofeltype_array(Float32, weight),
LuxLib._ofeltype_array(Float32, x), b, cdims))
end
end
end
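Editor's note on the hunks above: besides fixing the `weigjt` typo, the calls are switched from `_oftype_array` to `_ofeltype_array`. The sketch below is a hypothetical illustration of what such an element-type conversion helper is assumed to do, inferred from its use in this diff; it is not LuxLib's actual definition.

# Hypothetical sketch (assumption, not from this PR): convert an array's element
# type while preserving the array type, and pass `nothing` (the "no bias" case)
# through unchanged.
_ofeltype_array(::Type{T}, x::AbstractArray) where {T} = T.(x)
_ofeltype_array(::Type, ::Nothing) = nothing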
20 changes: 0 additions & 20 deletions ext/LuxLibCUDAExt/LuxLibCUDAExt.jl
@@ -11,26 +11,6 @@ using NNlib: NNlib

const CRC = ChainRulesCore

const cuBLASLt_functional = Ref(true)

function __init__()
try
# Test if cuBLASLt is functional
y = CUDA.zeros(Float32, 2, 2)
w = CUDA.rand(Float32, 2, 2)
x = CUDA.rand(Float32, 2, 2)
b = CUDA.rand(Float32, 2)
LuxLib._cublaslt_matmul_fused!(y, identity, w, x, b)
catch
cuBLASLt_functional[] = false
end

if CUDA.functional() && !cuBLASLt_functional[]
@warn "cuBLASLt is not functional on this system. We won't be able to use \
optimized implementations of certain matmul operations."
end
end

# Low level functions
include("cublaslt.jl")

59 changes: 19 additions & 40 deletions ext/LuxLibCUDAExt/cublaslt.jl
@@ -1,7 +1,7 @@
const TransOrAdjOrRegStridedCuMatrix{T} = Union{Transpose{T, <:StridedCuMatrix{T}},
Adjoint{T, <:StridedCuMatrix{T}}, StridedCuMatrix{T}}

function LuxLib._cublaslt_matmul_fused!(
function _cublaslt_matmul_fused!(
@nospecialize(y::TransOrAdjOrRegStridedCuMatrix{<:Real}), σ::F,
@nospecialize(w::TransOrAdjOrRegStridedCuMatrix{<:Real}),
@nospecialize(x::TransOrAdjOrRegStridedCuMatrix{<:Real}),
@@ -10,12 +10,11 @@ function LuxLib._cublaslt_matmul_fused!(
transy = y isa Transpose || y isa Adjoint
transx = x isa Transpose || x isa Adjoint
transw = w isa Transpose || x isa Adjoint
return LuxLib._cublaslt_matmul_fused!(
return _cublaslt_matmul_fused!(
transy, parent(y), σ, transw, parent(w), transx, parent(x), b, aux)
end

function LuxLib._cublaslt_matmul_fused!(
transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
function _cublaslt_matmul_fused!(transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
transw::Bool, @nospecialize(w::StridedCuMatrix{wT}), transx::Bool,
@nospecialize(x::StridedCuMatrix{xT}), b::Optional{<:StridedCuVector},
aux::Optional{<:StridedCuMatrix}) where {F, yT, wT, xT}
@@ -26,19 +25,17 @@ function LuxLib._cublaslt_matmul_fused!(
wxT = promote_type(wT, xT, bT, auxT)
@warn "Mixed Precision Inputs received for `weight`: $(typeof(w)) and `x`: \
$(typeof(x)). Promoting to $(wxT)." maxlog=1
return LuxLib._cublaslt_matmul_fused!(
transy, y, σ, transw, LuxLib._oftype_array(wxT, w),
transx, LuxLib._oftype_array(wxT, x),
LuxLib._oftype_array(wxT, b), LuxLib._oftype_array(wxT, aux))
return _cublaslt_matmul_fused!(transy, y, σ, transw, LuxLib._ofeltype_array(wxT, w),
transx, LuxLib._ofeltype_array(wxT, x),
LuxLib._ofeltype_array(wxT, b), LuxLib._ofeltype_array(wxT, aux))
end

# TODO: use https://docs.nvidia.com/cuda/cublas/#cublasltmatmul for a more robust
# computeType mapping. Currently no one uses Lux with weird type combinations so we
# don't need to worry about it too much and just fall back to the generic
# implementation
# Returns: 0 if successful, -1 if unsuccessful
function LuxLib._cublaslt_matmul_fused!(
transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
function _cublaslt_matmul_fused!(transy::Bool, @nospecialize(y::StridedCuMatrix{yT}), σ::F,
transw::Bool, @nospecialize(w::StridedCuMatrix{wxT}), transx::Bool,
@nospecialize(x::StridedCuMatrix{wxT}), b::Optional{<:StridedCuVector},
aux::Optional{<:StridedCuMatrix}) where {F, yT, wxT}
@@ -146,45 +143,27 @@ end
function __epilogue_act(f::F, b, aux) where {F}
if f === identity
@assert aux===nothing "`aux` must be `nothing` for `identity` activation."
if b === nothing
return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, true
else
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, true
end
b === nothing && return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, true
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, true
elseif f === NNlib.relu
if b === nothing
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_RELU, true
else
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_RELU, true
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX, true
else
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS, true
else
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX_BIAS, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS, true
return CUBLAS.CUBLASLT_EPILOGUE_RELU_AUX_BIAS, true
end
elseif f === NNlib.gelu
if b === nothing
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_GELU, true
else
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_GELU, true
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX, true
else
if aux === nothing
return CUBLAS.CUBLASLT_EPILOGUE_GELU_BIAS, true
else
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX_BIAS, true
end
aux === nothing && return CUBLAS.CUBLASLT_EPILOGUE_GELU_BIAS, true
return CUBLAS.CUBLASLT_EPILOGUE_GELU_AUX_BIAS, true
end
else
@assert aux===nothing "`aux` must be `nothing` for `$(f)` activation."
if b === nothing
return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, false
else
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, false
end
b === nothing && return CUBLAS.CUBLASLT_EPILOGUE_DEFAULT, false
return CUBLAS.CUBLASLT_EPILOGUE_BIAS, false
end
end
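Editor's note: the flattened `__epilogue_act` above keeps the same return convention as before the cleanup — a cuBLASLt epilogue constant plus a flag saying whether the activation (identity, relu, or gelu) was fused into the epilogue or must still be applied separately. An illustrative usage sketch, assuming the internal names shown in this diff and a CUDA bias vector `b`:

# Illustrative only: choose an epilogue for a relu + bias fused matmul.
epilogue, act_fused = __epilogue_act(NNlib.relu, b, nothing)
# With a bias and no aux buffer this yields CUBLAS.CUBLASLT_EPILOGUE_RELU_BIAS
# and `act_fused == true`, so the caller can skip a separate activation pass.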
44 changes: 17 additions & 27 deletions ext/LuxLibCUDAExt/fused_dense.jl
@@ -1,25 +1,30 @@
__length(x) = length(x)
__length(::Nothing) = nothing

function __might_use_cuBLASLt(::Z, ::A, ::W, ::X, ::B) where {Z, A, W, X, B}
cuBLASLt_functional[] || return false
return hasmethod(LuxLib._cublaslt_matmul_fused!, (Z, A, W, X, B))
end

@stable default_mode="warn" function LuxLib.__fused_dense_bias_activation_impl(
act::F, weight::AnyCuMatrix, x::AnyCuMatrix, b::Optional{<:AnyCuVector}) where {F}
y = similar(x, LuxLib.__get_concrete_fba_output_eltype(act, weight, x, b),
function __try_cublasLt_fused_matmul(act::F, weight::AnyCuMatrix, x::AnyCuMatrix,
b::Optional{<:AnyCuVector}, ::Val{cache}) where {F, cache}
z = similar(x, LuxLib.__get_concrete_fba_output_eltype(act, weight, x, b),
size(weight, 1), size(x, 2))
if __might_use_cuBLASLt(y, act, weight, x, b)
retcode = LuxLib._cublaslt_matmul_fused!(y, act, weight, x, b)
retcode == 0 && return y
y = z # aliased for now for type stability
if hasmethod(_cublaslt_matmul_fused!,
(typeof(z), typeof(act), typeof(weight), typeof(x), typeof(b)))
cache && (y = similar(z)) # break aliasing
retcode = _cublaslt_matmul_fused!(z, act, weight, x, b, ifelse(cache, y, nothing))
retcode == 0 && return (z, y, retcode)
# cuBLASLt failed for the given inputs use the generic fallback
@warn "cuBLASLt failed for the given inputs $(act), $(typeof(weight)) \
[$(size(weight))], $(typeof(x)) [$(size(x))], $(typeof(b)) \
[$(__length(b))]. Falling back to generic implementation." maxlog=1
else
@warn "cuBLASLt not available. Falling back to generic implementation." maxlog=1
end
return (z, y, -1)
end

@stable default_mode="warn" function LuxLib.__fused_dense_bias_activation_impl(
act::F, weight::AnyCuMatrix, x::AnyCuMatrix, b::Optional{<:AnyCuVector}) where {F}
(y, _, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(false))
retcode == 0 && return y
LuxLib.__matmul!(y, weight, x)
return LuxLib.__apply_bias_activation!!(act, y, b, Val(false))
end
@@ -28,22 +33,7 @@ end
function CRC.rrule(::CRC.RuleConfig{>:CRC.HasReverseMode},
::typeof(LuxLib.__fused_dense_bias_activation_impl), act::typeof(NNlib.gelu),
weight::AnyCuMatrix, x::AnyCuMatrix, b::Union{AnyCuVector, Nothing})
z = similar(x, LuxLib.__get_concrete_fba_output_eltype(NNlib.gelu, weight, x, b),
size(weight, 1), size(x, 2))
y = z # aliased for now for type stability
retcode = -1
if __might_use_cuBLASLt(z, act, weight, x, b)
y = similar(z) # break aliasing
retcode = LuxLib._cublaslt_matmul_fused!(z, act, weight, x, b, y)
if retcode == -1
@warn "cuBLASLt failed for the given inputs $(act), $(typeof(weight)) \
[$(size(weight))], $(typeof(x)) [$(size(x))], $(typeof(b)) \
[$(__length(b))]. Falling back to generic implementation." maxlog=1
end
else
@warn "cuBLASLt not available. Falling back to generic implementation." maxlog=1
end

(z, y, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(true))
if retcode == -1
# Generic Fallback: break aliasing in _apply_bias_activation!!
LuxLib.__matmul!(z, weight, x)
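Editor's note: the rest of the `rrule` is collapsed in this view, but the shape of the refactor is visible above — both the forward implementation and the `gelu` `rrule` now go through the shared `__try_cublasLt_fused_matmul` helper instead of duplicating the cuBLASLt probing and warning logic. A rough sketch of the two call sites as they appear in this diff (illustrative excerpt, not drop-in code):

# Forward pass: no auxiliary buffer is requested.
(y, _, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(false))
# rrule for gelu: `Val(true)` requests the auxiliary buffer, which the gelu
# epilogue fills for use in the backward pass; a retcode of -1 triggers the
# generic fallback in both cases.
(z, y, retcode) = __try_cublasLt_fused_matmul(act, weight, x, b, Val(true))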