Add to docs and rename fields
Vaibhavdixit02 committed Jul 11, 2023
1 parent ea45267 commit cd410a1
Showing 3 changed files with 86 additions and 62 deletions.
110 changes: 67 additions & 43 deletions docs/src/optimization_packages/optimisers.md
@@ -1,6 +1,6 @@
# [Optimisers.jl](@id optimisers)

-## Installation: OptimizationFlux.jl
+## Installation: OptimizationOptimisers.jl

To use this package, install the OptimizationOptimisers package:

@@ -9,142 +9,166 @@ import Pkg;
Pkg.add("OptimizationOptimisers");
```

In addition to the optimisation algorithms provided by the Optimisers.jl package, this subpackage also provides the Sophia optimisation algorithm.


## Local Unconstrained Optimizers

- Sophia: Based on the recent paper https://arxiv.org/abs/2305.14342. It incorporates second-order information in the form of the diagonal of the Hessian matrix, thus avoiding the need to compute the full Hessian. It has been shown to converge faster than first-order methods such as Adam and SGD. A usage sketch follows the list of defaults below.

+ `solve(problem, Sophia(; η, βs, ϵ, λ, k, ρ))`

+ `η` is the learning rate
+ `βs` are the decay of momentums
+ `ϵ` is the epsilon value
+ `λ` is the weight decay parameter
+ `k` is the interval, in iterations, at which the diagonal of the Hessian is re-estimated
+ `ρ` is the clipping threshold for the update
+ Defaults:

* `η = 0.001`
* `βs = (0.9, 0.999)`
* `ϵ = 1e-8`
* `λ = 0.1`
* `k = 10`
* `ρ = 0.04`
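
A minimal usage sketch through the common `solve` interface (the Rosenbrock objective, step size, and iteration count here are illustrative choices rather than documented defaults; Sophia uses Hessian-vector products, so an AD backend such as `AutoZygote` must be supplied):

```julia
using Optimization, OptimizationOptimisers, Zygote

# Illustrative objective: the Rosenbrock function
rosenbrock(u, p) = (p[1] - u[1])^2 + p[2] * (u[2] - u[1]^2)^2
u0 = zeros(2)
p = [1.0, 100.0]

# Sophia uses gradients and Hessian-vector products, supplied here by AutoZygote
optf = OptimizationFunction(rosenbrock, Optimization.AutoZygote())
prob = OptimizationProblem(optf, u0, p)

sol = Optimization.solve(prob, OptimizationOptimisers.Sophia(; η = 0.01),
    maxiters = 1000)
```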

- [`Optimisers.Descent`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Descent): **Classic gradient descent optimizer with learning rate**

+ `solve(problem, Descent(η))`

+ `η` is the learning rate
+ Defaults:

* `η = 0.1`

- [`Optimisers.Momentum`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Momentum): **Classic gradient descent optimizer with learning rate and momentum**

+ `solve(problem, Momentum(η, ρ))`

+ `η` is the learning rate
+ `ρ` is the momentum
+ Defaults:

* `η = 0.01`
* `ρ = 0.9`
- [`Optimisers.Nesterov`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Nesterov): **Gradient descent optimizer with learning rate and Nesterov momentum**

+ `solve(problem, Nesterov(η, ρ))`

+ `η` is the learning rate
+ `ρ` is the Nesterov momentum
+ Defaults:

* `η = 0.01`
* `ρ = 0.9`
- [`Optimisers.RMSProp`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.RMSProp): **RMSProp optimizer**

+ `solve(problem, RMSProp(η, ρ))`

+ `η` is the learning rate
+ `ρ` is the momentum
+ Defaults:

* `η = 0.001`
* `ρ = 0.9`
- [`Optimisers.Adam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Adam): **Adam optimizer**

+ `solve(problem, Adam(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
- [`Optimisers.RAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.RAdam): **Rectified Adam optimizer**

+ `solve(problem, RAdam(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
- [`Optimisers.OAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.OAdam): **Optimistic Adam optimizer**

+ `solve(problem, OAdam(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.5, 0.999)`
- [`Optimisers.AdaMax`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AdaMax): **AdaMax optimizer**

+ `solve(problem, AdaMax(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
- [`Optimisers.ADAGrad`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADAGrad): **ADAGrad optimizer**

+ `solve(problem, ADAGrad(η))`

+ `η` is the learning rate
+ Defaults:

* `η = 0.1`
- [`Optimisers.ADADelta`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADADelta): **ADADelta optimizer**

+ `solve(problem, ADADelta(ρ))`

+ `ρ` is the gradient decay factor
+ Defaults:

* `ρ = 0.9`
- [`Optimisers.AMSGrad`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AMSGrad): **AMSGrad optimizer**

+ `solve(problem, AMSGrad(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
- [`Optimisers.NAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.NAdam): **Nesterov variant of the Adam optimizer**

+ `solve(problem, NAdam(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
- [`Optimisers.AdamW`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AdamW): **AdamW optimizer**

+ `solve(problem, AdamW(η, β::Tuple, decay))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ `decay` is the decay to weights
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
* `decay = 0`
- [`Optimisers.ADABelief`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADABelief): **ADABelief variant of Adam**

+ `solve(problem, ADABelief(η, β::Tuple))`

+ `η` is the learning rate
+ `β::Tuple` is the decay of momentums
+ Defaults:

* `η = 0.001`
* `β::Tuple = (0.9, 0.999)`
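
All of the rules above follow the same calling pattern. A minimal sketch with `Adam` (the quadratic objective, step size, and iteration count are illustrative; `Adam` is assumed here to be re-exported by OptimizationOptimisers, otherwise qualify it as `Optimisers.Adam`):

```julia
using Optimization, OptimizationOptimisers, Zygote

# Illustrative objective: squared distance from u to the parameter vector p
loss(u, p) = sum(abs2, u .- p)

optf = OptimizationFunction(loss, Optimization.AutoZygote())
prob = OptimizationProblem(optf, zeros(3), [1.0, 2.0, 3.0])

# These rules are iterative/stochastic, so bound the run with `maxiters`
sol = Optimization.solve(prob, Adam(0.05), maxiters = 500)
```
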
34 changes: 17 additions & 17 deletions lib/OptimizationOptimisers/src/sophia.jl
@@ -1,19 +1,19 @@
using Optimization.LinearAlgebra

struct Sophia
-lr::Float64
-betas::Tuple{Float64, Float64}
-eps::Float64
-weight_decay::Float64
+η::Float64
+βs::Tuple{Float64, Float64}
+ϵ::Float64
+λ::Float64
k::Integer
-rho::Float64
+ρ::Float64
end

SciMLBase.supports_opt_cache_interface(opt::Sophia) = true

-function Sophia(; lr = 1e-3, betas = (0.9, 0.999), eps = 1e-8, weight_decay = 1e-1, k = 10,
-rho = 0.04)
-Sophia(lr, betas, eps, weight_decay, k, rho)
+function Sophia(; η = 1e-3, βs = (0.9, 0.999), ϵ = 1e-8, λ = 1e-1, k = 10,
+ρ = 0.04)
+Sophia(η, βs, ϵ, λ, k, ρ)
end

clip(z, ρ) = max(min(z, ρ), -ρ)

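The hunk above only renames the `Sophia` struct fields and constructor keywords; a construction using the new names (with the default values shown in the diff) looks like:

```julia
using OptimizationOptimisers

# Keyword names after this commit: η, βs, ϵ, λ, k, ρ
# (previously lr, betas, eps, weight_decay, k, rho); values below are the defaults.
opt = OptimizationOptimisers.Sophia(; η = 1e-3, βs = (0.9, 0.999), ϵ = 1e-8, λ = 1e-1,
    k = 10, ρ = 0.04)
```
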
@@ -54,11 +54,11 @@ function SciMLBase.__solve(cache::OptimizationCache{
}
local x, cur, state
uType = eltype(cache.u0)
-lr = uType(cache.opt.lr)
-betas = uType.(cache.opt.betas)
-eps = uType(cache.opt.eps)
-weight_decay = uType(cache.opt.weight_decay)
-rho = uType(cache.opt.rho)
+η = uType(cache.opt.η)
+βs = uType.(cache.opt.βs)
+ϵ = uType(cache.opt.ϵ)
+λ = uType(cache.opt.λ)
+ρ = uType(cache.opt.ρ)

if cache.data != Optimization.DEFAULT_DATA
maxiters = length(cache.data)
@@ -97,17 +97,17 @@ function SciMLBase.__solve(cache::OptimizationCache{
elseif cb_call
break
end
-mₜ = betas[1] .* mₜ + (1 - betas[1]) .* gₜ
+mₜ = βs[1] .* mₜ + (1 - βs[1]) .* gₜ

if i % cache.opt.k == 1
hₜ₋₁ = copy(hₜ)
u = randn(uType, length(θ))
f.hv(hₜ, θ, u, d...)
-hₜ = betas[2] .* hₜ₋₁ + (1 - betas[2]) .* (u .* hₜ)
+hₜ = βs[2] .* hₜ₋₁ + (1 - βs[2]) .* (u .* hₜ)
end
-θ = θ .- lr * weight_decay .* θ
+θ = θ .- η * λ .* θ
θ = θ .-
-lr .* clip.(mₜ ./ max.(hₜ, Ref(eps)), Ref(rho))
+η .* clip.(mₜ ./ max.(hₜ, Ref(ϵ)), Ref(ρ))
end

return SciMLBase.build_solution(cache, cache.opt,

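Read as a whole, the update implemented in the hunk above corresponds to the following (a reading of the code, with `g_t` the gradient at step `t`, `u` a random probe vector for the Hutchinson-style diagonal estimate, `β₁, β₂` the components of `βs`, and `⊙` elementwise multiplication; `h_t` is only refreshed every `k` iterations):

```math
\begin{aligned}
m_t &= \beta_1\, m_{t-1} + (1-\beta_1)\, g_t \\
h_t &= \beta_2\, h_{t-1} + (1-\beta_2)\,\bigl(u \odot (\nabla^2 f(\theta)\, u)\bigr) \\
\theta &\leftarrow \theta - \eta\,\lambda\,\theta \\
\theta &\leftarrow \theta - \eta\,\operatorname{clip}\!\left(\frac{m_t}{\max(h_t,\ \epsilon)},\ \rho\right)
\end{aligned}
```

where `clip(z, ρ) = max(min(z, ρ), -ρ)` is applied elementwise, matching the helper defined earlier in the file.
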
4 changes: 2 additions & 2 deletions lib/OptimizationOptimisers/test/runtests.jl
@@ -13,8 +13,8 @@ using Zygote
prob = OptimizationProblem(optprob, x0, _p)

sol = Optimization.solve(prob,
-OptimizationOptimisers.Sophia(; lr = 0.5,
-weight_decay = 0.0),
+OptimizationOptimisers.Sophia(; η = 0.5,
+λ = 0.0),
maxiters = 1000)
@test 10 * sol.objective < l1
