From cd410a19a9b19b8e9238a38f857710db41931db2 Mon Sep 17 00:00:00 2001 From: Vaibhav Dixit Date: Wed, 12 Jul 2023 01:20:16 +0530 Subject: [PATCH] Add to docs and rename fields --- docs/src/optimization_packages/optimisers.md | 110 +++++++++++-------- lib/OptimizationOptimisers/src/sophia.jl | 34 +++--- lib/OptimizationOptimisers/test/runtests.jl | 4 +- 3 files changed, 86 insertions(+), 62 deletions(-) diff --git a/docs/src/optimization_packages/optimisers.md b/docs/src/optimization_packages/optimisers.md index 5c528290e..ecb88d2b3 100644 --- a/docs/src/optimization_packages/optimisers.md +++ b/docs/src/optimization_packages/optimisers.md @@ -1,6 +1,6 @@ # [Optimisers.jl](@id optimisers) -## Installation: OptimizationFlux.jl +## Installation: OptimizationOptimisers.jl To use this package, install the OptimizationOptimisers package: @@ -9,142 +9,166 @@ import Pkg; Pkg.add("OptimizationOptimisers"); ``` +In addition to the optimisation algorithms provided by the Optimisers.jl package, this subpackage +also provides the Sophia optimisation algorithm. + + ## Local Unconstrained Optimizers + - Sophia: Based on the recent paper https://arxiv.org/abs/2305.14342. It incorporates second-order information + in the form of the diagonal of the Hessian matrix, avoiding the need to compute the complete Hessian. It has been shown to converge faster than other first-order methods such as Adam and SGD. + + + `solve(problem, Sophia(; η, βs, ϵ, λ, k, ρ))` + + + `η` is the learning rate + + `βs` are the decay of momentums + + `ϵ` is a small constant that floors the Hessian diagonal estimate, avoiding division by zero + + `λ` is the weight decay parameter + + `k` is the number of iterations between recomputations of the diagonal of the Hessian matrix + + `ρ` is the clipping threshold for the update + + Defaults: + + * `η = 0.001` + * `βs = (0.9, 0.999)` + * `ϵ = 1e-8` + * `λ = 0.1` + * `k = 10` + * `ρ = 0.04` + - [`Optimisers.Descent`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Descent): **Classic gradient descent optimizer with learning rate** - + + `solve(problem, Descent(η))` - + + `η` is the learning rate + Defaults: - + * `η = 0.1` - [`Optimisers.Momentum`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Momentum): **Classic gradient descent optimizer with learning rate and momentum** - + + `solve(problem, Momentum(η, ρ))` - + + `η` is the learning rate + `ρ` is the momentum + Defaults: - + * `η = 0.01` * `ρ = 0.9` - [`Optimisers.Nesterov`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Nesterov): **Gradient descent optimizer with learning rate and Nesterov momentum** - + + `solve(problem, Nesterov(η, ρ))` - + + `η` is the learning rate + `ρ` is the Nesterov momentum + Defaults: - + * `η = 0.01` * `ρ = 0.9` - [`Optimisers.RMSProp`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.RMSProp): **RMSProp optimizer** - + + `solve(problem, RMSProp(η, ρ))` - + + `η` is the learning rate + `ρ` is the momentum + Defaults: - + * `η = 0.001` * `ρ = 0.9` - [`Optimisers.Adam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.Adam): **Adam optimizer** - + + `solve(problem, Adam(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` - [`Optimisers.RAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.RAdam): **Rectified Adam optimizer** - + + `solve(problem, RAdam(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` - [`Optimisers.OAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.OAdam): **Optimistic Adam
optimizer** - + + `solve(problem, OAdam(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.5, 0.999)` - [`Optimisers.AdaMax`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AdaMax): **AdaMax optimizer** - + + `solve(problem, AdaMax(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` - [`Optimisers.ADAGrad`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADAGrad): **ADAGrad optimizer** - + + `solve(problem, ADAGrad(η))` - + + `η` is the learning rate + Defaults: - + * `η = 0.1` - [`Optimisers.ADADelta`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADADelta): **ADADelta optimizer** - + + `solve(problem, ADADelta(ρ))` - + + `ρ` is the gradient decay factor + Defaults: - + * `ρ = 0.9` - [`Optimisers.AMSGrad`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AMSGrad): **AMSGrad optimizer** - + + `solve(problem, AMSGrad(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` - [`Optimisers.NAdam`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.NAdam): **Nesterov variant of the Adam optimizer** - + + `solve(problem, NAdam(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` - [`Optimisers.AdamW`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.AdamW): **AdamW optimizer** - + + `solve(problem, AdamW(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + `decay` is the weight decay + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` * `decay = 0` - [`Optimisers.ADABelief`](https://fluxml.ai/Optimisers.jl/dev/api/#Optimisers.ADABelief): **ADABelief variant of Adam** - + + `solve(problem, ADABelief(η, β::Tuple))` - + + `η` is the learning rate + `β::Tuple` is the decay of momentums + Defaults: - + * `η = 0.001` * `β::Tuple = (0.9, 0.999)` diff --git a/lib/OptimizationOptimisers/src/sophia.jl b/lib/OptimizationOptimisers/src/sophia.jl index 47adbf351..ff396ab89 100644 --- a/lib/OptimizationOptimisers/src/sophia.jl +++ b/lib/OptimizationOptimisers/src/sophia.jl @@ -1,19 +1,19 @@ using Optimization.LinearAlgebra struct Sophia - lr::Float64 - betas::Tuple{Float64, Float64} - eps::Float64 - weight_decay::Float64 + η::Float64 + βs::Tuple{Float64, Float64} + ϵ::Float64 + λ::Float64 k::Integer - rho::Float64 + ρ::Float64 end SciMLBase.supports_opt_cache_interface(opt::Sophia) = true -function Sophia(; lr = 1e-3, betas = (0.9, 0.999), eps = 1e-8, weight_decay = 1e-1, k = 10, - rho = 0.04) - Sophia(lr, betas, eps, weight_decay, k, rho) +function Sophia(; η = 1e-3, βs = (0.9, 0.999), ϵ = 1e-8, λ = 1e-1, k = 10, + ρ = 0.04) + Sophia(η, βs, ϵ, λ, k, ρ) end clip(z, ρ) = max(min(z, ρ), -ρ) @@ -54,11 +54,11 @@ function SciMLBase.__solve(cache::OptimizationCache{ } local x, cur, state uType = eltype(cache.u0) - lr = uType(cache.opt.lr) - betas = uType.(cache.opt.betas) - eps = uType(cache.opt.eps) - weight_decay = uType(cache.opt.weight_decay) - rho = uType(cache.opt.rho) + η = uType(cache.opt.η) + βs = uType.(cache.opt.βs) + ϵ = uType(cache.opt.ϵ) + λ = uType(cache.opt.λ) + ρ = uType(cache.opt.ρ) if cache.data != Optimization.DEFAULT_DATA maxiters = length(cache.data) @@ -97,17 +97,17 @@ function SciMLBase.__solve(cache::OptimizationCache{ elseif cb_call break end - mₜ = betas[1] .* mₜ + (1 - betas[1])
.* gₜ + mₜ = βs[1] .* mₜ + (1 - βs[1]) .* gₜ if i % cache.opt.k == 1 hₜ₋₁ = copy(hₜ) u = randn(uType, length(θ)) f.hv(hₜ, θ, u, d...) - hₜ = betas[2] .* hₜ₋₁ + (1 - betas[2]) .* (u .* hₜ) + hₜ = βs[2] .* hₜ₋₁ + (1 - βs[2]) .* (u .* hₜ) end - θ = θ .- lr * weight_decay .* θ + θ = θ .- η * λ .* θ θ = θ .- - lr .* clip.(mₜ ./ max.(hₜ, Ref(eps)), Ref(rho)) + η .* clip.(mₜ ./ max.(hₜ, Ref(ϵ)), Ref(ρ)) end return SciMLBase.build_solution(cache, cache.opt, diff --git a/lib/OptimizationOptimisers/test/runtests.jl b/lib/OptimizationOptimisers/test/runtests.jl index 4869c9abe..523309d60 100644 --- a/lib/OptimizationOptimisers/test/runtests.jl +++ b/lib/OptimizationOptimisers/test/runtests.jl @@ -13,8 +13,8 @@ using Zygote prob = OptimizationProblem(optprob, x0, _p) sol = Optimization.solve(prob, - OptimizationOptimisers.Sophia(; lr = 0.5, - weight_decay = 0.0), + OptimizationOptimisers.Sophia(; η = 0.5, + λ = 0.0), maxiters = 1000) @test 10 * sol.objective < l1
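For reference, a minimal usage sketch of the renamed Sophia keyword arguments through the Optimization.jl `solve` interface, mirroring the updated test above. The Rosenbrock objective and the hyperparameter values are illustrative only, and `Optimization.AutoZygote()` is assumed as the AD backend because Sophia's Hessian-diagonal estimate relies on Hessian-vector products (`f.hv`):

```julia
using Optimization, OptimizationOptimisers, Zygote

# Illustrative two-parameter Rosenbrock objective: x are the optimization
# variables, p the fixed problem parameters.
rosenbrock(x, p) = (p[1] - x[1])^2 + p[2] * (x[2] - x[1]^2)^2
x0 = zeros(2)
p = [1.0, 100.0]

# AutoZygote supplies the gradient and the Hessian-vector product that Sophia
# uses to estimate the diagonal of the Hessian.
optf = OptimizationFunction(rosenbrock, Optimization.AutoZygote())
prob = OptimizationProblem(optf, x0, p)

# Keyword names follow the renamed fields in this patch (η, βs, ϵ, λ, k, ρ);
# η = 0.5 and λ = 0.0 match the values used in the updated test.
sol = solve(prob, OptimizationOptimisers.Sophia(; η = 0.5, λ = 0.0), maxiters = 1000)

# Any of the Optimisers.jl rules documented above can be passed the same way,
# e.g. solve(prob, Adam(0.001), maxiters = 1000).
```

With `λ = 0.0` the weight-decay step is a no-op, so the update reduces to the clipped, Hessian-preconditioned momentum step implemented in `__solve`.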