From 01db379f8f18fb3d0c148e01a9c70c6d39267171 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 23 Apr 2021 18:03:16 +0200 Subject: [PATCH 1/7] Tolerate linesearch failures sometimes --- cpp/src/glm/qn/qn_solvers.cuh | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index f98873bf5a..ab428d1818 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -137,10 +137,16 @@ inline OPT_RETCODE min_lbfgs(const LBFGSParam ¶m, LINE_SEARCH_RETCODE lsret = ls_backtrack(param, f, fx, x, grad, step, drt, xp, dev_scalar, stream); - bool isLsSuccess = lsret == LS_SUCCESS; - CUML_LOG_TRACE("Iteration %d, fx=%f", *k, fx); - - if (!isLsSuccess || isnan(fx) || isinf(fx)) { + bool isLsValid = !isnan(fx) && !isinf(fx); + // Linesearch may fail to converge, but still come closer to the solution; + // if that is not the case, let `check_convergence` ("insufficient change") + // below terminate the loop. + bool isLsInDoubt = + lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; + bool isLsSuccess = + lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsSuccess; + + if (!isLsSuccess || !isLsValid) { fx = fxp; x.copy_async(xp, stream); grad.copy_async(gradp, stream); @@ -283,8 +289,16 @@ inline OPT_RETCODE min_owlqn(const LBFGSParam ¶m, Function &f, ls_backtrack_projected(param, f_wrap, fx, x, grad, pseudo, step, drt, xp, l1_penalty, dev_scalar, stream); - bool isLsSuccess = lsret == LS_SUCCESS; - if (!isLsSuccess || isnan(fx) || isinf(fx)) { + bool isLsValid = !isnan(fx) && !isinf(fx); + // Linesearch may fail to converge, but still come closer to the solution; + // if that is not the case, let `check_convergence` ("insufficient change") + // below terminate the loop. + bool isLsInDoubt = + lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; + bool isLsSuccess = + lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsSuccess; + + if (!isLsSuccess || !isLsValid) { fx = fxp; x.copy_async(xp, stream); grad.copy_async(gradp, stream); From 687eb35e864f7366bcd38ef355e4935803e9480b Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 27 Apr 2021 12:06:30 +0200 Subject: [PATCH 2/7] Fix a typo and print the LS return code --- cpp/src/glm/qn/qn_solvers.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index ab428d1818..c596fa1577 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -144,14 +144,14 @@ inline OPT_RETCODE min_lbfgs(const LBFGSParam ¶m, bool isLsInDoubt = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; bool isLsSuccess = - lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsSuccess; + lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsInDoubt; if (!isLsSuccess || !isLsValid) { fx = fxp; x.copy_async(xp, stream); grad.copy_async(gradp, stream); if (!isLsSuccess) { - CUML_LOG_ERROR("L-BFGS line search failed"); + CUML_LOG_ERROR("L-BFGS line search failed (code %d)", lsret); return OPT_LS_FAILED; } CUML_LOG_ERROR("L-BFGS error fx=%f at iteration %d", fx, *k); @@ -296,14 +296,14 @@ inline OPT_RETCODE min_owlqn(const LBFGSParam ¶m, Function &f, bool isLsInDoubt = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; bool isLsSuccess = - lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsSuccess; + lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsInDoubt; if (!isLsSuccess || !isLsValid) { fx = fxp; x.copy_async(xp, stream); grad.copy_async(gradp, stream); if (!isLsSuccess) { - CUML_LOG_ERROR("QWL-QN line search failed"); + CUML_LOG_ERROR("QWL-QN line search failed (code %d)", lsret); return OPT_LS_FAILED; } CUML_LOG_ERROR("OWL-QN error fx=%f at iteration %d", fx, *k); From d2055c7ef8fa14d59958a8b3a8119b2f7092c5ab Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 21 Jul 2021 14:00:19 +0200 Subject: [PATCH 3/7] Make linesearch complain only if not converged. --- cpp/src/glm/qn/qn_solvers.cuh | 145 +++++++++++++++++++++------------- 1 file changed, 91 insertions(+), 54 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index e9c26088a3..36dbb91b59 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -68,6 +68,59 @@ inline size_t owlqn_workspace_size(const LBFGSParam& param, const int n) return lbfgs_workspace_size(param, n) + vec_size; } +template +inline bool update_and_check(const std::string solver, + const LBFGSParam& param, + int iter, + LINE_SEARCH_RETCODE lsret, + T& fx, + T& fxp, + ML::SimpleVec& x, + ML::SimpleVec& xp, + ML::SimpleVec& grad, + ML::SimpleVec& gradp, + std::vector& fx_hist, + T* dev_scalar, + OPT_RETCODE& outcode, + cudaStream_t stream) +{ + bool stop = false; + bool converged = false; + bool isLsValid = !isnan(fx) && !isinf(fx); + // Linesearch may fail to converge, but still come closer to the solution; + // if that is not the case, let `check_convergence` ("insufficient change") + // below terminate the loop. + bool isLsInDoubt = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; + bool isLsSuccess = lsret == LS_SUCCESS || isLsValid && fx <= fxp + param.ftol && isLsInDoubt; + + // if the target is at least finite, we can check the convergence + if (isLsValid) + converged = check_convergence(param, iter, fx, x, grad, fx_hist, dev_scalar, stream); + + if (!isLsSuccess && !converged) { + CUML_LOG_WARN("%s line search failed (code %d)", solver, lsret); + outcode = OPT_LS_FAILED; + stop = true; + } else if (!isLsValid) { + CUML_LOG_ERROR("%s error fx=%f at iteration %d", solver, fx, iter); + outcode = OPT_NUMERIC_ERROR; + stop = true; + } else if (converged) { + CUML_LOG_DEBUG("%s converged", solver); + outcode = OPT_SUCCESS; + stop = true; + } + + // if lineseach wasn't successful, undo the update. + if (!isLsSuccess || !isLsValid) { + fx = fxp; + x.copy_async(xp, stream); + grad.copy_async(gradp, stream); + } + + return stop; +} + template inline OPT_RETCODE min_lbfgs(const LBFGSParam& param, Function& f, // function to minimize @@ -131,6 +184,8 @@ inline OPT_RETCODE min_lbfgs(const LBFGSParam& param, *k = 1; int end = 0; int n_vec = 0; // number of vector updates made in lbfgs_search_dir + OPT_RETCODE retcode; + LINE_SEARCH_RETCODE lsret; for (; *k <= param.max_iterations; (*k)++) { // Save the curent x and gradient xp.copy_async(x, stream); @@ -138,34 +193,23 @@ inline OPT_RETCODE min_lbfgs(const LBFGSParam& param, fxp = fx; // Line search to update x, fx and gradient - LINE_SEARCH_RETCODE lsret = - ls_backtrack(param, f, fx, x, grad, step, drt, xp, dev_scalar, stream); - - bool isLsValid = !isnan(fx) && !isinf(fx); - // Linesearch may fail to converge, but still come closer to the solution; - // if that is not the case, let `check_convergence` ("insufficient change") - // below terminate the loop. - bool isLsInDoubt = - lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; - bool isLsSuccess = - lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsInDoubt; - - if (!isLsSuccess || !isLsValid) { - fx = fxp; - x.copy_async(xp, stream); - grad.copy_async(gradp, stream); - if (!isLsSuccess) { - CUML_LOG_ERROR("L-BFGS line search failed (code %d)", lsret); - return OPT_LS_FAILED; - } - CUML_LOG_ERROR("L-BFGS error fx=%f at iteration %d", fx, *k); - return OPT_NUMERIC_ERROR; - } - - if (check_convergence(param, *k, fx, x, grad, fx_hist, dev_scalar, stream)) { - CUML_LOG_DEBUG("L-BFGS converged"); - return OPT_SUCCESS; - } + lsret = ls_backtrack(param, f, fx, x, grad, step, drt, xp, dev_scalar, stream); + + if (update_and_check("L-BFGS", + param, + *k, + lsret, + fx, + fxp, + x, + xp, + grad, + gradp, + fx_hist, + dev_scalar, + retcode, + stream)) + return retcode; // Update s and y // s_{k+1} = x_{k+1} - x_k @@ -288,6 +332,8 @@ inline OPT_RETCODE min_owlqn(const LBFGSParam& param, int end = 0; int n_vec = 0; // number of vector updates made in lbfgs_search_dir + OPT_RETCODE retcode; + LINE_SEARCH_RETCODE lsret; for ((*k) = 1; (*k) <= param.max_iterations; (*k)++) { // Save the curent x and gradient xp.copy_async(x, stream); @@ -295,38 +341,29 @@ inline OPT_RETCODE min_owlqn(const LBFGSParam& param, fxp = fx; // Projected line search to update x, fx and gradient - LINE_SEARCH_RETCODE lsret = ls_backtrack_projected( + lsret = ls_backtrack_projected( param, f_wrap, fx, x, grad, pseudo, step, drt, xp, l1_penalty, dev_scalar, stream); - bool isLsValid = !isnan(fx) && !isinf(fx); - // Linesearch may fail to converge, but still come closer to the solution; - // if that is not the case, let `check_convergence` ("insufficient change") - // below terminate the loop. - bool isLsInDoubt = - lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; - bool isLsSuccess = - lsret == LS_SUCCESS || isLsValid && fx <= fxp && isLsInDoubt; - - if (!isLsSuccess || !isLsValid) { - fx = fxp; - x.copy_async(xp, stream); - grad.copy_async(gradp, stream); - if (!isLsSuccess) { - CUML_LOG_ERROR("QWL-QN line search failed (code %d)", lsret); - return OPT_LS_FAILED; - } - CUML_LOG_ERROR("OWL-QN error fx=%f at iteration %d", fx, *k); - return OPT_NUMERIC_ERROR; - } + if (update_and_check("QWL-QN", + param, + *k, + lsret, + fx, + fxp, + x, + xp, + grad, + gradp, + fx_hist, + dev_scalar, + retcode, + stream)) + return retcode; + // recompute pseudo // pseudo.assign_binary(x, grad, pseudo_grad); update_pseudo(x, grad, pseudo_grad, pg_limit, pseudo, stream); - if (check_convergence(param, *k, fx, x, pseudo, fx_hist, dev_scalar, stream)) { - CUML_LOG_DEBUG("OWL-QN converged"); - return OPT_SUCCESS; - } - // Update s and y - We should only do this if there is no skipping condition col_ref(S, svec, end); From 63cca817506684f07d7610749e7b82bb52eb1416 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 21 Jul 2021 14:38:14 +0200 Subject: [PATCH 4/7] Stop due to linesearch doing nothing. --- cpp/src/glm/qn/qn_solvers.cuh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index 36dbb91b59..fa3e35e77b 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -69,7 +69,7 @@ inline size_t owlqn_workspace_size(const LBFGSParam& param, const int n) } template -inline bool update_and_check(const std::string solver, +inline bool update_and_check(const char* solver, const LBFGSParam& param, int iter, LINE_SEARCH_RETCODE lsret, @@ -90,8 +90,9 @@ inline bool update_and_check(const std::string solver, // Linesearch may fail to converge, but still come closer to the solution; // if that is not the case, let `check_convergence` ("insufficient change") // below terminate the loop. - bool isLsInDoubt = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; - bool isLsSuccess = lsret == LS_SUCCESS || isLsValid && fx <= fxp + param.ftol && isLsInDoubt; + bool isLsNonCritical = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; + bool isLsInDoubt = isLsValid && fx <= fxp + param.ftol && isLsNonCritical; + bool isLsSuccess = lsret == LS_SUCCESS || isLsInDoubt; // if the target is at least finite, we can check the convergence if (isLsValid) @@ -109,6 +110,14 @@ inline bool update_and_check(const std::string solver, CUML_LOG_DEBUG("%s converged", solver); outcode = OPT_SUCCESS; stop = true; + } else if (isLsInDoubt && fx + param.ftol >= fxp) { + CUML_LOG_WARN( + "%s stopped, because the line search failed to advance (step delta = %f); " + "perhaps, the convergence criteria are too strict?..", + solver, + fx - fxp); + outcode = OPT_LS_FAILED; + stop = true; } // if lineseach wasn't successful, undo the update. From 5ce7796555d820b18c286757a7d402338689b101 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 22 Jul 2021 11:15:02 +0200 Subject: [PATCH 5/7] Add more comments and bring back some trace messages --- cpp/src/glm/qn/qn_solvers.cuh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index fa3e35e77b..5e721a642d 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -91,8 +91,13 @@ inline bool update_and_check(const char* solver, // if that is not the case, let `check_convergence` ("insufficient change") // below terminate the loop. bool isLsNonCritical = lsret == LS_INVALID_STEP_MIN || lsret == LS_MAX_ITERS_REACHED; - bool isLsInDoubt = isLsValid && fx <= fxp + param.ftol && isLsNonCritical; - bool isLsSuccess = lsret == LS_SUCCESS || isLsInDoubt; + // If the error is not critical, check that the target function does not grow. + // This shouldn't really happen, but weird things can happen if the convergence + // thresholds are too small. + bool isLsInDoubt = isLsValid && fx <= fxp + param.ftol && isLsNonCritical; + bool isLsSuccess = lsret == LS_SUCCESS || isLsInDoubt; + + CUML_LOG_TRACE("%s iteration %d, fx=%f", solver, iter, fx); // if the target is at least finite, we can check the convergence if (isLsValid) @@ -111,6 +116,8 @@ inline bool update_and_check(const char* solver, outcode = OPT_SUCCESS; stop = true; } else if (isLsInDoubt && fx + param.ftol >= fxp) { + // If a non-critical error has happened during the line search, check if the target + // is improved at least a bit. Otherwise, stop to avoid spinning till the iteration limit. CUML_LOG_WARN( "%s stopped, because the line search failed to advance (step delta = %f); " "perhaps, the convergence criteria are too strict?..", From 18ced0100dac4b384e8beb0fb42156c0c0f910cb Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 22 Jul 2021 12:30:51 +0300 Subject: [PATCH 6/7] Update cpp/src/glm/qn/qn_solvers.cuh Co-authored-by: Tamas Bela Feher --- cpp/src/glm/qn/qn_solvers.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index 5e721a642d..7599ec339d 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -119,8 +119,7 @@ inline bool update_and_check(const char* solver, // If a non-critical error has happened during the line search, check if the target // is improved at least a bit. Otherwise, stop to avoid spinning till the iteration limit. CUML_LOG_WARN( - "%s stopped, because the line search failed to advance (step delta = %f); " - "perhaps, the convergence criteria are too strict?..", + "%s stopped, because the line search failed to advance (step delta = %f)", solver, fx - fxp); outcode = OPT_LS_FAILED; From c9aaab2068a93660f941cdd57affd992bd715cdb Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 22 Jul 2021 12:46:08 +0200 Subject: [PATCH 7/7] Make the debug messages less worrying for the users --- cpp/src/glm/qn/qn_solvers.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/glm/qn/qn_solvers.cuh b/cpp/src/glm/qn/qn_solvers.cuh index 7599ec339d..052d593b13 100644 --- a/cpp/src/glm/qn/qn_solvers.cuh +++ b/cpp/src/glm/qn/qn_solvers.cuh @@ -104,11 +104,13 @@ inline bool update_and_check(const char* solver, converged = check_convergence(param, iter, fx, x, grad, fx_hist, dev_scalar, stream); if (!isLsSuccess && !converged) { - CUML_LOG_WARN("%s line search failed (code %d)", solver, lsret); + CUML_LOG_WARN( + "%s line search failed (code %d); stopping at the last valid step", solver, lsret); outcode = OPT_LS_FAILED; stop = true; } else if (!isLsValid) { - CUML_LOG_ERROR("%s error fx=%f at iteration %d", solver, fx, iter); + CUML_LOG_ERROR( + "%s error fx=%f at iteration %d; stopping at the last valid step", solver, fx, iter); outcode = OPT_NUMERIC_ERROR; stop = true; } else if (converged) { @@ -119,9 +121,7 @@ inline bool update_and_check(const char* solver, // If a non-critical error has happened during the line search, check if the target // is improved at least a bit. Otherwise, stop to avoid spinning till the iteration limit. CUML_LOG_WARN( - "%s stopped, because the line search failed to advance (step delta = %f)", - solver, - fx - fxp); + "%s stopped, because the line search failed to advance (step delta = %f)", solver, fx - fxp); outcode = OPT_LS_FAILED; stop = true; }