From a8f13a26db8b940583f00e6942824d519ad32406 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Sat, 7 Oct 2023 08:09:44 +0200 Subject: [PATCH 1/5] wip --- experiments/lasso/runme.jl | 10 ++++++ experiments/sparse_logreg/runme.jl | 10 ++++++ src/AdaProx.jl | 57 +++++++++++++++++++++++++----- 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/experiments/lasso/runme.jl b/experiments/lasso/runme.jl index f09f138..c6e3947 100644 --- a/experiments/lasso/runme.jl +++ b/experiments/lasso/runme.jl @@ -117,6 +117,16 @@ function run_random_lasso(; name = "Nesterov (backtracking)" ) + sol, numit = AdaProx.backtracking_nesterov_2012( + zeros(n), + f = AdaProx.Counting(f), + g = g, + gamma0 = gam_init, + tol = tol, + maxit = maxit, + name = "Nesterov (2012)" + ) + sol, numit = AdaProx.adaptive_proxgrad( zeros(n), f = AdaProx.Counting(f), diff --git a/experiments/sparse_logreg/runme.jl b/experiments/sparse_logreg/runme.jl index 1923e2d..0cbe03e 100644 --- a/experiments/sparse_logreg/runme.jl +++ b/experiments/sparse_logreg/runme.jl @@ -101,6 +101,16 @@ function run_logreg_l1_data( name = "Nesterov (backtracking)" ) + sol, numit = AdaProx.backtracking_nesterov_2012( + zeros(n), + f = AdaProx.Counting(f), + g = g, + gamma0 = 5.0, + tol = tol, + maxit = maxit/2, + name = "Nesterov (2012)" + ) + sol, numit = AdaProx.adaptive_proxgrad( x0, f = AdaProx.Counting(f), diff --git a/src/AdaProx.jl b/src/AdaProx.jl index b6b8ff2..f1778a7 100644 --- a/src/AdaProx.jl +++ b/src/AdaProx.jl @@ -38,10 +38,10 @@ function backtrack_stepsize(gamma, f, g, x, f_x, grad_x) return gamma, z, f_z, g_z end -function backtracking_proxgrad(x0; f, g, gamma0, xi = 1.0 ,tol = 1e-5, maxit = 100_000, name = "Backtracking PG") +function backtracking_proxgrad(x0; f, g, gamma0, xi = 1.0, tol = 1e-5, maxit = 100_000, name = "Backtracking PG") x, z, gamma = x0, x0, gamma0 grad_x, f_x = gradient(f, x) - for it = 1:maxit + for it in 1:maxit gamma, z, f_z, g_z = backtrack_stepsize(xi * gamma, f, g, x, f_x, grad_x) norm_res = norm(z - x) / gamma @logmsg Record "" method=name it gamma norm_res objective=(f_z + g_z) grad_f_evals=grad_count(f) prox_g_evals=prox_count(g) f_evals=eval_count(f) @@ -58,7 +58,7 @@ function backtracking_nesterov(x0; f, g, gamma0, tol = 1e-5, maxit = 100_000, na x, z, gamma = x0, x0, gamma0 theta = one(gamma) grad_x, f_x = gradient(f, x) - for it = 1:maxit + for it in 1:maxit z_prev = z gamma, z, f_z, g_z = backtrack_stepsize(gamma, f, g, x, f_x, grad_x) norm_res = norm(z - x) / gamma @@ -74,6 +74,47 @@ function backtracking_nesterov(x0; f, g, gamma0, tol = 1e-5, maxit = 100_000, na return z, maxit end +function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) + while true + a = (2 * gamma + sqrt((2 * gamma)^2 + 8 * gamma * A)) / 2 + y = (A * x + a * v) / (A + a) + grad_y, f_y = gradient(f, y) + w = y - gamma * grad_y + z, g_z = prox(g, w, gamma) + ub_z = upper_bound(y, f_y, grad_y, z, gamma) + f_z = f(z) + if f_z <= ub_z + A += a + grad_z, _ = gradient(f, z) + subgrad_z = (w - z) / gamma + v = v - a * (grad_z + subgrad_z) + return gamma, z, f_z, g_z, A, v, y + end + gamma /= 2 + if gamma < 1e-12 + @error "step size became too small ($gamma)" + end + end +end + +function backtracking_nesterov_2012(x0; f, g, gamma0, xi = 2.0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") + x, z, gamma = x0, x0, gamma0 + v = x0 + A = 0 + for it in 1:maxit + gamma, z, f_z, g_z, A, v, y = backtrack_stepsize_with_extrapolation( + xi * gamma, f, g, x, A, v + ) + norm_res = norm(z - y) / gamma + 
@logmsg Record "" method=name it gamma norm_res objective=(f_z + g_z) grad_f_evals=grad_count(f) prox_g_evals=prox_count(g) f_evals=eval_count(f) + if norm_res <= tol + return z, it + end + x = z + end + return z, maxit +end + # Fixed stepsize fast proximal gradient # # See Chambolle, Pock, "An introduction to continuous optimization for imaging," @@ -108,7 +149,7 @@ function fixed_nesterov( end @assert 0 <= theta <= 1 / sqrt(q) x, x_prev = x0, x0 - for it = 1:maxit + for it in 1:maxit theta_prev = theta if mu == 0 theta = (1 + sqrt(1 + 4 * theta_prev^2)) / 2 @@ -160,7 +201,7 @@ function agraal( gamma = gamma0 rho = 1 / phi + 1 / phi^2 theta = one(gamma) - for it = 1:maxit + for it in 1:maxit C = norm(x - x_prev)^2 / norm(grad_x - grad_x_prev)^2 gamma_prev = gamma gamma = min(rho * gamma_prev, phi * theta * C / (4 * gamma_prev), gamma_max) @@ -281,7 +322,7 @@ function adaptive_primal_dual( x_prev, A_x_prev, grad_x_prev = x, A_x, grad_x x, _ = prox(g, v, gamma) - for it = 1:maxit + for it in 1:maxit A_x = A * x grad_x, _ = gradient(f, x) @@ -445,7 +486,7 @@ function adaptive_linesearch_primal_dual( x_prev, A_x_prev, grad_x_prev = x, A_x, grad_x x, _ = prox(g, v, gamma) - for it = 1:maxit + for it in 1:maxit A_x = A * x grad_x, _ = gradient(f, x) @@ -540,7 +581,7 @@ function malitsky_pock( y_prev = y A_x = A * x At_y = A' * y - for it = 1:maxit + for it in 1:maxit At_y_prev = At_y w = y + sigma * A_x y, _ = prox(h_conj, w, sigma) From 6c4f57e99bd77a51b1dc4bf54cfbad2b250d3ae8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Sat, 7 Oct 2023 08:32:06 +0200 Subject: [PATCH 2/5] add reference --- src/AdaProx.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/AdaProx.jl b/src/AdaProx.jl index f1778a7..300f2bd 100644 --- a/src/AdaProx.jl +++ b/src/AdaProx.jl @@ -54,6 +54,12 @@ function backtracking_proxgrad(x0; f, g, gamma0, xi = 1.0, tol = 1e-5, maxit = 1 return z, maxit end +# Accelerated, backtracking proximal-gradient method, with possibly increasing stepsizes +# +# See Yurii Nesterov, "Gradient methods for minimizing composite functions," +# Mathematical Programming, volume 140, 2013. 
+# https://link.springer.com/article/10.1007/s10107-012-0629-5 + function backtracking_nesterov(x0; f, g, gamma0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov") x, z, gamma = x0, x0, gamma0 theta = one(gamma) From ea4cea96e0c3a5db5d9d31f21c8545dff8604096 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Sat, 7 Oct 2023 09:06:10 +0200 Subject: [PATCH 3/5] fix comment and date --- experiments/lasso/runme.jl | 4 ++-- experiments/sparse_logreg/runme.jl | 4 ++-- src/AdaProx.jl | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/experiments/lasso/runme.jl b/experiments/lasso/runme.jl index c6e3947..c0bb9a8 100644 --- a/experiments/lasso/runme.jl +++ b/experiments/lasso/runme.jl @@ -117,14 +117,14 @@ function run_random_lasso(; name = "Nesterov (backtracking)" ) - sol, numit = AdaProx.backtracking_nesterov_2012( + sol, numit = AdaProx.backtracking_nesterov_2013( zeros(n), f = AdaProx.Counting(f), g = g, gamma0 = gam_init, tol = tol, maxit = maxit, - name = "Nesterov (2012)" + name = "Nesterov (2013)" ) sol, numit = AdaProx.adaptive_proxgrad( diff --git a/experiments/sparse_logreg/runme.jl b/experiments/sparse_logreg/runme.jl index 0cbe03e..034a3df 100644 --- a/experiments/sparse_logreg/runme.jl +++ b/experiments/sparse_logreg/runme.jl @@ -101,14 +101,14 @@ function run_logreg_l1_data( name = "Nesterov (backtracking)" ) - sol, numit = AdaProx.backtracking_nesterov_2012( + sol, numit = AdaProx.backtracking_nesterov_2013( zeros(n), f = AdaProx.Counting(f), g = g, gamma0 = 5.0, tol = tol, maxit = maxit/2, - name = "Nesterov (2012)" + name = "Nesterov (2013)" ) sol, numit = AdaProx.adaptive_proxgrad( diff --git a/src/AdaProx.jl b/src/AdaProx.jl index 300f2bd..1917371 100644 --- a/src/AdaProx.jl +++ b/src/AdaProx.jl @@ -54,12 +54,6 @@ function backtracking_proxgrad(x0; f, g, gamma0, xi = 1.0, tol = 1e-5, maxit = 1 return z, maxit end -# Accelerated, backtracking proximal-gradient method, with possibly increasing stepsizes -# -# See Yurii Nesterov, "Gradient methods for minimizing composite functions," -# Mathematical Programming, volume 140, 2013. -# https://link.springer.com/article/10.1007/s10107-012-0629-5 - function backtracking_nesterov(x0; f, g, gamma0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov") x, z, gamma = x0, x0, gamma0 theta = one(gamma) @@ -80,6 +74,12 @@ function backtracking_nesterov(x0; f, g, gamma0, tol = 1e-5, maxit = 100_000, na return z, maxit end +# Accelerated, backtracking proximal-gradient method, with possibly increasing stepsizes +# +# See Yurii Nesterov, "Gradient methods for minimizing composite functions," +# Mathematical Programming, volume 140, 2013. 
+# https://link.springer.com/article/10.1007/s10107-012-0629-5 + function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) while true a = (2 * gamma + sqrt((2 * gamma)^2 + 8 * gamma * A)) / 2 @@ -103,7 +103,7 @@ function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) end end -function backtracking_nesterov_2012(x0; f, g, gamma0, xi = 2.0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") +function backtracking_nesterov_2013(x0; f, g, gamma0, xi = 2.0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") x, z, gamma = x0, x0, gamma0 v = x0 A = 0 From 452e0206b49e19864ab2cbbd5970de133a1944b9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Sat, 7 Oct 2023 09:37:07 +0200 Subject: [PATCH 4/5] rename and clean up, add support for strong convexity modulus --- src/AdaProx.jl | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/AdaProx.jl b/src/AdaProx.jl index 1917371..84d8717 100644 --- a/src/AdaProx.jl +++ b/src/AdaProx.jl @@ -76,13 +76,16 @@ end # Accelerated, backtracking proximal-gradient method, with possibly increasing stepsizes # -# See Yurii Nesterov, "Gradient methods for minimizing composite functions," +# See "Accelerated method" from: +# Yurii Nesterov, "Gradient methods for minimizing composite functions," # Mathematical Programming, volume 140, 2013. # https://link.springer.com/article/10.1007/s10107-012-0629-5 -function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) +function _step_backtracking_nesterov_2013(gamma, mu, f, g, x, A, v) + muA1 = mu * A + 1 while true - a = (2 * gamma + sqrt((2 * gamma)^2 + 8 * gamma * A)) / 2 + Delta = 4 * muA1^2 * gamma^2 + 8 * muA1 * gamma * A + a = (2 * muA1 * gamma + sqrt(Delta)) / 2 y = (A * x + a * v) / (A + a) grad_y, f_y = gradient(f, y) w = y - gamma * grad_y @@ -90,11 +93,11 @@ function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) ub_z = upper_bound(y, f_y, grad_y, z, gamma) f_z = f(z) if f_z <= ub_z - A += a grad_z, _ = gradient(f, z) subgrad_z = (w - z) / gamma - v = v - a * (grad_z + subgrad_z) - return gamma, z, f_z, g_z, A, v, y + v = v - a / muA1 * (grad_z + subgrad_z) + A += a + return gamma, z, f_z, g_z, y, A, v end gamma /= 2 if gamma < 1e-12 @@ -103,22 +106,21 @@ function backtrack_stepsize_with_extrapolation(gamma, f, g, x, A, v) end end -function backtracking_nesterov_2013(x0; f, g, gamma0, xi = 2.0, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") - x, z, gamma = x0, x0, gamma0 +function backtracking_nesterov_2013(x0; f, g, gamma0, mu = 0, xi = 2, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") + x, gamma = x0, gamma0 v = x0 A = 0 for it in 1:maxit - gamma, z, f_z, g_z, A, v, y = backtrack_stepsize_with_extrapolation( - xi * gamma, f, g, x, A, v + gamma, x, f_x, g_x, y, A, v = _step_backtracking_nesterov_2013( + xi * gamma, mu, f, g, x, A, v ) - norm_res = norm(z - y) / gamma - @logmsg Record "" method=name it gamma norm_res objective=(f_z + g_z) grad_f_evals=grad_count(f) prox_g_evals=prox_count(g) f_evals=eval_count(f) + norm_res = norm(x - y) / gamma + @logmsg Record "" method=name it gamma norm_res objective=(f_x + g_x) grad_f_evals=grad_count(f) prox_g_evals=prox_count(g) f_evals=eval_count(f) if norm_res <= tol - return z, it + return x, it end - x = z end - return z, maxit + return x, maxit end # Fixed stepsize fast proximal gradient From 84af1a3400738e0e620a4139277222744700e763 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Sat, 7 Oct 2023 
10:00:38 +0200 Subject: [PATCH 5/5] more cleanup --- experiments/lasso/runme.jl | 2 +- experiments/sparse_logreg/runme.jl | 2 +- src/AdaProx.jl | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/experiments/lasso/runme.jl b/experiments/lasso/runme.jl index c0bb9a8..ab7d99c 100644 --- a/experiments/lasso/runme.jl +++ b/experiments/lasso/runme.jl @@ -121,7 +121,7 @@ function run_random_lasso(; zeros(n), f = AdaProx.Counting(f), g = g, - gamma0 = gam_init, + gamma = gam_init, tol = tol, maxit = maxit, name = "Nesterov (2013)" diff --git a/experiments/sparse_logreg/runme.jl b/experiments/sparse_logreg/runme.jl index 034a3df..a4deb45 100644 --- a/experiments/sparse_logreg/runme.jl +++ b/experiments/sparse_logreg/runme.jl @@ -105,7 +105,7 @@ function run_logreg_l1_data( zeros(n), f = AdaProx.Counting(f), g = g, - gamma0 = 5.0, + gamma = 5.0, tol = tol, maxit = maxit/2, name = "Nesterov (2013)" diff --git a/src/AdaProx.jl b/src/AdaProx.jl index 84d8717..e2c98f2 100644 --- a/src/AdaProx.jl +++ b/src/AdaProx.jl @@ -81,7 +81,8 @@ end # Mathematical Programming, volume 140, 2013. # https://link.springer.com/article/10.1007/s10107-012-0629-5 -function _step_backtracking_nesterov_2013(gamma, mu, f, g, x, A, v) +function _step_backtracking_nesterov_2013(x, f, g, gamma, mu, xi, A, v) + gamma = gamma * xi muA1 = mu * A + 1 while true Delta = 4 * muA1^2 * gamma^2 + 8 * muA1 * gamma * A @@ -96,23 +97,22 @@ function _step_backtracking_nesterov_2013(gamma, mu, f, g, x, A, v) grad_z, _ = gradient(f, z) subgrad_z = (w - z) / gamma v = v - a / muA1 * (grad_z + subgrad_z) - A += a - return gamma, z, f_z, g_z, y, A, v + A = A + a + return y, z, f_z, g_z, gamma, A, v end - gamma /= 2 + gamma = gamma / 2 if gamma < 1e-12 @error "step size became too small ($gamma)" end end end -function backtracking_nesterov_2013(x0; f, g, gamma0, mu = 0, xi = 2, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") - x, gamma = x0, gamma0 - v = x0 - A = 0 +function backtracking_nesterov_2013(x; f, g, gamma, mu = 0, xi = 2, tol = 1e-5, maxit = 100_000, name = "Backtracking Nesterov (2012)") + A = zero(gamma) + v = x for it in 1:maxit - gamma, x, f_x, g_x, y, A, v = _step_backtracking_nesterov_2013( - xi * gamma, mu, f, g, x, A, v + y, x, f_x, g_x, gamma, A, v = _step_backtracking_nesterov_2013( + x, f, g, gamma, mu, xi, A, v ) norm_res = norm(x - y) / gamma @logmsg Record "" method=name it gamma norm_res objective=(f_x + g_x) grad_f_evals=grad_count(f) prox_g_evals=prox_count(g) f_evals=eval_count(f)
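
A note on the coefficient update in _step_backtracking_nesterov_2013: the
extrapolation weight `a` computed from `Delta` is the positive root of the
quadratic

    a^2 = 2 * gamma * (mu * A + 1) * (A + a)

(expanding with the quadratic formula recovers `Delta = 4 * muA1^2 * gamma^2 +
8 * muA1 * gamma * A` and `a = (2 * muA1 * gamma + sqrt(Delta)) / 2`). This
appears to be the coefficient equation of the accelerated method in Nesterov
(2013), with the current trial stepsize gamma playing the role of 1/L; for
mu = 0 it reduces to a = gamma + sqrt(gamma^2 + 2 * gamma * A), matching the
rule used before strong convexity support was added in PATCH 4/5. The
extrapolated point y = (A * x + a * v) / (A + a) interpolates between the last
iterate x and the auxiliary point v, and once the backtracking test
f_z <= ub_z passes, v is moved along -(a / muA1) * (grad_z + (w - z) / gamma),
i.e. along a composite (sub)gradient at z.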
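
A minimal call sketch for the final keyword interface of
backtracking_nesterov_2013, assuming `f` and `g` are built as in
experiments/lasso/runme.jl (a smooth term wrapped in AdaProx.Counting and a
proximable regularizer) and that `n`, `gam_init`, `tol` and `maxit` are
defined as in that script:

    # hypothetical usage; f, g, n, gam_init, tol, maxit come from the experiment setup
    sol, numit = AdaProx.backtracking_nesterov_2013(
        zeros(n),                   # initial point (positional argument)
        f = AdaProx.Counting(f),    # smooth term, wrapped to count gradient/function evaluations
        g = g,                      # nonsmooth term, accessed through its prox
        gamma = gam_init,           # initial stepsize; backtracking halves it, xi = 2 lets it grow again
        mu = 0,                     # strong convexity modulus of f (default, plain convex case)
        tol = tol,                  # stop once norm(x - y) / gamma <= tol
        maxit = maxit,
        name = "Nesterov (2013)"    # label attached to the Record log entries
    )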