@@ -499,7 +499,7 @@ function apply!(o::NAdam, state, x::AbstractArray{T}, dx) where T
 end

 """
-    AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8; couple = true)
+    AdamW(η = 0.001, β = (0.9, 0.999), λ = 0.01, ϵ = 1e-8; couple = true)
     AdamW(; [eta, beta, lambda, epsilon, couple])

 [AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
@@ -534,12 +534,12 @@ struct AdamW{Teta,Tbeta<:Tuple,Tlambda,Teps} <: AbstractRule
   couple::Bool
 end

-function AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8; couple::Bool = true)
+function AdamW(η, β = (0.9, 0.999), λ = 0.01, ϵ = 1e-8; couple::Bool = true)
   η < 0 && throw(DomainError(η, "the learning rate cannot be negative"))
   return AdamW(float(η), β, float(λ), float(ϵ), couple)
 end

-AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0.0, epsilon = 1e-8, kw...) =
+AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0.01, epsilon = 1e-8, kw...) =
   AdamW(eta, beta, lambda, epsilon; kw...)

 init(o::AdamW, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta))
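For context, a minimal sketch of what this default change means for users, assuming the usual Optimisers.jl `setup`/`update` API; the `lambda = 0.0` call is only needed by code that wants to keep the previous no-decay default:

```julia
using Optimisers

# With this change, a bare AdamW() applies weight decay of 0.01 by default:
rule = AdamW()

# Code relying on the old default (no weight decay) must now say so explicitly:
no_decay = AdamW(lambda = 0.0)

# Ordinary usage is unchanged:
x = randn(Float32, 4)                # a parameter array
st = Optimisers.setup(rule, x)       # per-parameter optimiser state
g = ones(Float32, 4)                 # a gradient of the same shape
st, x = Optimisers.update(st, x, g)  # one optimisation step
```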