From 1d0a198748c0e4aa1ce22ab2989a2b734f7d8948 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 27 Nov 2024 11:50:58 +0200 Subject: Ensure all exceptions are handled in build_*() notifications --- mod/mod-ci-github.cxx | 597 +++++++++++++++++++------------------------------- 1 file changed, 230 insertions(+), 367 deletions(-) (limited to 'mod/mod-ci-github.cxx') diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx index c703f32..56f703e 100644 --- a/mod/mod-ci-github.cxx +++ b/mod/mod-ci-github.cxx @@ -693,6 +693,8 @@ namespace brep // Create a gq_built_result. // + // Throw invalid_argument in case of invalid result_status. + // static gq_built_result make_built_result (result_status rs, bool warning_success, string message) { @@ -1166,312 +1168,6 @@ namespace brep return true; } - // @@ TMP - // -#if 0 - bool ci_github:: - handle_check_run_rerequest (const gh_check_run_event& cr, - bool warning_success) - { - HANDLER_DIAG; - - l3 ([&]{trace << "check_run event { " << cr << " }";}); - - // Fail if this is the conclusion check run. - // - if (cr.check_run.name == conclusion_check_run_name) - { - // @@ Fail conclusion check run with appropriate message and reurn - // true. - - l3 ([&]{trace << "ignoring conclusion check_run";}); - - // 422 Unprocessable Content: The request was well-formed (i.e., - // syntactically correct) but could not be processed. - // - throw invalid_request (422, "Conclusion check run cannot be rebuilt"); - } - - // Get a new installation access token. - // - auto get_iat = [this, &trace, &error, &cr] () - -> optional - { - optional jwt (generate_jwt (trace, error)); - if (!jwt) - return nullopt; - - optional iat ( - obtain_installation_access_token (cr.installation.id, - move (*jwt), - error)); - - if (iat) - l3 ([&]{trace << "installation_access_token { " << *iat << " }";}); - - return iat; - }; - - // Create a new conclusion check run, replacing the existing one. - // - // Return the check run on success or nullopt on failure. - // - auto create_conclusion_cr = - [&cr, &error, warning_success] (const gh_installation_access_token& iat, - build_state bs, - optional rs = nullopt, - optional msg = nullopt) - -> optional - { - optional br; - if (rs) - { - assert (msg); - - br = make_built_result (*rs, warning_success, move (*msg)); - } - - check_run r; - r.name = conclusion_check_run_name; - - if (gq_create_check_run (error, r, iat.token, - rni, hs, - nullopt /* details_url */, - bs, move (br))) - { - return r; - } - else - return nullopt; - }; - - // The overall plan is as follows: - // - // 1. Call the rebuild() function to attempt to schedule a rebuild. Pass - // the update function that does the following (if called): - // - // a. Update the check run being rebuilt (may also not exist). - // - // b. Clear the completed flag if true. - // - // c. "Return" the service data to be used after the call. - // - // 2. If the result of rebuild() indicates the tenant is archived, fail - // the conclusion check run with appropriate diagnostics. - // - // 3. If original state is queued, then no rebuild was scheduled and we do - // nothing. - // - // 4. Otherwise (the original state is building or built): - // - // a. Change the check run state to queued. - // - // b. Change the conclusion check run to building (do unconditionally - // to mitigate races). - // - // Note that while conceptually we are updating existing check runs, in - // practice we have to create new check runs to replace the existing ones - // because GitHub does not allow transitioning out of the built state. - // - // This results in a new node id for each check run but we can't save them - // to the service data after the rebuild() call. As a workaround, when - // updating the service data we 1) clear the re-requested check run's node - // id and set the state_synced flag to true to signal to build_building() - // and build_built() that it needs to create a new check run; and 2) clear - // the conclusion check run's node id to cause build_built() to create a - // new conclusion check run. And these two check runs' node ids will be - // saved to the service data. - - // Parse the check_run's details_url to extract build id. - // - // While this is a bit hackish, there doesn't seem to be a better way - // (like associating custom data with a check run). Note that the GitHub - // UI only allows rebuilding completed check runs, so the details URL - // should be there. - // - optional bid (parse_details_url (cr.check_run.details_url)); - if (!bid) - { - fail << "check run " << cr.check_run.node_id - << ": failed to extract build id from details_url"; - } - - // The IAT retrieved from the service data. - // - optional iat; - - // True if the check run exists in the service data. - // - bool cr_found (false); - - // Update the state of the check run in the service data. Return (via - // captured references) the IAT and whether the check run was found. - // - // Called by rebuild(), but only if the build is actually restarted. - // - auto update_sd = [&iat, - &cr_found, - &error, - &cr] (const tenant_service& ts, build_state) - -> optional - { - // NOTE: this lambda may be called repeatedly (e.g., due to transaction - // being aborted) and so should not move out of its captures. - - service_data sd; - try - { - sd = service_data (*ts.data); - } - catch (const invalid_argument& e) - { - error << "failed to parse service data: " << e; - return nullptr; - } - - if (!iat) - iat = sd.installation_access; - - // If the re-requested check run is found, update it in the service - // data. - // - const string& nid (cr.check_run.node_id); - - for (check_run& cr: sd.check_runs) - { - if (cr.node_id && *cr.node_id == nid) - { - cr_found = true; - cr.state = build_state::queued; - sd.completed = false; - - // Clear the check run node ids and set state_synced to true to - // cause build_building() and/or build_built() to create new check - // runs (see the plan above for details). - // - cr.node_id = nullopt; - cr.state_synced = true; - sd.conclusion_node_id = nullopt; - - return sd.json (); - } - } - - return nullopt; - }; - - optional bs (rebuild (*build_db_, retry_, *bid, update_sd)); - - if (!bs) - { - // Build has expired (most probably the tenant has been archived). - // - // Update the conclusion check run to notify the user (but have to - // replace it with a new one because we don't know the existing one's - // node id). - // - optional iat (get_iat ()); - if (!iat) - throw server_error (); - - if (optional ccr = create_conclusion_cr ( - *iat, - build_state::built, - result_status::error, - "Unable to rebuild: tenant has been archived or no such build")) - { - l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";}); - } - else - { - // Log the error and return failure to GitHub which will presumably - // indicate this in its GUI. - // - fail << "check run " << cr.check_run.node_id - << ": unable to create conclusion check run"; - } - } - else if (*bs == build_state::queued) - { - // The build was already queued so nothing to be done. This might happen - // if the user clicked "re-run" multiple times before we managed to - // update the check run. - } - else - { - // The build has been requeued. - // - assert (*bs == build_state::building || *bs == build_state::built); - - if (!cr_found) - { - // Respond with an error otherwise GitHub will post a message in its - // GUI saying "you have successfully requested a rebuild of ..." - // - fail << "check_run " << cr.check_run.node_id - << ": build restarted but check run does not exist " - << "in service data"; - } - - // Get a new IAT if the one from the service data has expired. - // - assert (iat.has_value ()); - - if (system_clock::now () > iat->expires_at) - { - iat = get_iat (); - if (!iat) - throw server_error (); - } - - // Update (by replacing) the re-requested and conclusion check runs to - // queued and building, respectively. - // - // If either fails we can only log the error but build_building() and/or - // build_built() should correct the situation (see above for details). - // - - // Update re-requested check run. - // - { - check_run ncr; // New check run. - ncr.name = cr.check_run.name; - - if (gq_create_check_run (error, - ncr, - iat->token, - cr.repository.node_id, - cr.check_run.check_suite.head_sha, - cr.check_run.details_url, - build_state::queued)) - { - l3 ([&]{trace << "created check_run { " << ncr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to create (to update) check run in queued state"; - } - } - - // Update conclusion check run. - // - if (optional ccr = - create_conclusion_cr (*iat, build_state::building)) - { - l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";}); - } - else - { - error << "check_run " << cr.check_run.node_id - << ": unable to create (to update) conclusion check run"; - } - } - - return true; - } -#endif - // Miscellaneous pull request facts // // - Although some of the GitHub documentation makes it sound like they @@ -1629,6 +1325,8 @@ namespace brep build_unloaded (tenant_service&& ts, const diag_epilogue& log_writer) const noexcept { + // NOTE: this function is noexcept and should not throw. + NOTIFICATION_DIAG (log_writer); service_data sd; @@ -1651,7 +1349,16 @@ namespace brep build_unloaded_pre_check (tenant_service&& ts, service_data&& sd, const diag_epilogue& log_writer) const noexcept + try { + // NOTE: this function is noexcept and should not throw. + // + // In a few places where invalid_argument is unlikely to be thrown and/or + // would indicate that things are seriously broken we let it propagate to + // the function catch block where the pre-check tenant will be canceled + // (otherwise we could end up in an infinite loop, e.g., because the + // problematic arguments won't change). + NOTIFICATION_DIAG (log_writer); // We get here for PRs only (but both local and remote). The overall @@ -1677,6 +1384,8 @@ namespace brep // Request PR pre-check info (triggering the generation of the test merge // commit on the GitHub's side). // + // Let unlikely invalid_argument propagate (see above). + // optional pc ( gq_fetch_pull_request_pre_check_info (error, sd.installation_access.token, @@ -1736,38 +1445,50 @@ namespace brep // notifications until (1) we load the tenant, (2) we cancel it, or (3) // it gets archived after some timeout. // - if (auto pr = create (error, warn, verb_ ? &trace : nullptr, - *build_db_, retry_, - tenant_service (sid, "ci-github", sd.json ()), - chrono::seconds (30) /* interval */, - chrono::seconds (0) /* delay */, - duplicate_tenant_mode::ignore)) - { - if (pr->second == duplicate_tenant_result::ignored) + try + { + if (auto pr = create (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + tenant_service (sid, "ci-github", sd.json ()), + chrono::seconds (30) /* interval */, + chrono::seconds (0) /* delay */, + duplicate_tenant_mode::ignore)) { - // This PR is sharing a head commit with something else. - // - // If this is a local PR then it's probably the branch push, which - // is expected, so do nothing. - // - // If this is a remote PR then it could be anything (branch push, - // local PR, or another remote PR) which in turn means the CI result - // may end up being for head, not merge commit. There is nothing we - // can do about it on our side (the user can enable the head-behind- - // base protection on their side). - // - if (sd.kind == service_data::remote) + if (pr->second == duplicate_tenant_result::ignored) { - l3 ([&]{trace << "remote pull request " << *sd.pr_node_id - << ": CI tenant already exists for " << sid;}); + // This PR is sharing a head commit with something else. + // + // If this is a local PR then it's probably the branch push, which + // is expected, so do nothing. + // + // If this is a remote PR then it could be anything (branch push, + // local PR, or another remote PR) which in turn means the CI + // result may end up being for head, not merge commit. There is + // nothing we can do about it on our side (the user can enable the + // head-behind- base protection on their side). + // + if (sd.kind == service_data::remote) + { + l3 ([&]{trace << "remote pull request " << *sd.pr_node_id + << ": CI tenant already exists for " << sid;}); + } } } + else + { + error << "pull request " << *sd.pr_node_id + << ": failed to create unloaded CI tenant " + << "with tenant_service id " << sid; + + // Fall through to cancel. + } } - else + catch (const runtime_error& e) // Database retries exhausted. { error << "pull request " << *sd.pr_node_id - << ": unable to create unloaded CI tenant " - << "with tenant_service id " << sid; + << ": failed to create unloaded CI tenant " + << "with tenant_service id " << sid + << ": " << e.what (); // Fall through to cancel. } @@ -1775,16 +1496,50 @@ namespace brep // Cancel the pre-check tenant. // - if (!cancel (error, warn, verb_ ? &trace : nullptr, - *build_db_, retry_, - ts.type, - ts.id)) + try + { + if (!cancel (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + ts.type, + ts.id)) + { + // Should never happen (no such tenant). + // + error << "pull request " << *sd.pr_node_id + << ": failed to cancel pre-check tenant with tenant_service id " + << ts.id; + } + } + catch (const runtime_error& e) // Database retries exhausted. { - // Should never happen (no such tenant). - // error << "pull request " << *sd.pr_node_id << ": failed to cancel pre-check tenant with tenant_service id " - << ts.id; + << ts.id << ": " << e.what (); + } + + return nullptr; + } + catch (const std::exception& e) + { + NOTIFICATION_DIAG (log_writer); + error << "pull request " << *sd.pr_node_id + << ": unhandled exception: " << e.what (); + + // Cancel the pre-check tenant otherwise we could end up in an infinite + // loop (see top of function). + // + try + { + if (cancel (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + ts.type, + ts.id)) + l3 ([&]{trace << "canceled pre-check tenant " << ts.id;}); + } + catch (const runtime_error& e) // Database retries exhausted. + { + l3 ([&]{trace << "failed to cancel pre-check tenant " << ts.id << ": " + << e.what ();}); } return nullptr; @@ -1794,7 +1549,16 @@ namespace brep build_unloaded_load (tenant_service&& ts, service_data&& sd, const diag_epilogue& log_writer) const noexcept + try { + // NOTE: this function is noexcept and should not throw. + // + // In a few places where invalid_argument is unlikely to be thrown and/or + // would indicate that things are seriously broken we let it propagate to + // the function catch block where the tenant will be canceled (otherwise + // we could end up in an infinite loop, e.g., because the problematic + // arguments won't change). + NOTIFICATION_DIAG (log_writer); // Load the tenant, which is essentially the same for both branch push and @@ -1840,6 +1604,8 @@ namespace brep check_run cr; cr.name = move (name); + // Let unlikely invalid_argument propagate (see above). + // if (gq_create_check_run (error, cr, iat->token, @@ -1866,12 +1632,16 @@ namespace brep { assert (!node_id.empty ()); + // Let unlikely invalid_argument propagate (see above). + // gq_built_result br ( make_built_result (rs, sd.warning_success, move (summary))); check_run cr; cr.name = name; // For display purposes only. + // Let unlikely invalid_argument propagate (see above). + // if (gq_update_check_run (error, cr, iat->token, @@ -1898,16 +1668,24 @@ namespace brep // string conclusion_node_id; // Conclusion check run node ID. - if (auto cr = create_synthetic_cr (conclusion_check_run_name)) + if (!sd.conclusion_node_id) { - l3 ([&]{trace << "created check_run { " << *cr << " }";}); + if (auto cr = create_synthetic_cr (conclusion_check_run_name)) + { + l3 ([&]{trace << "created check_run { " << *cr << " }";}); - conclusion_node_id = move (*cr->node_id); + conclusion_node_id = move (*cr->node_id); + } } + const string& effective_conclusion_node_id ( + sd.conclusion_node_id + ? *sd.conclusion_node_id + : conclusion_node_id); + // Load the CI tenant if the conclusion check run was created. // - if (!conclusion_node_id.empty ()) + if (!effective_conclusion_node_id.empty ()) { string ru; // Repository URL. @@ -1924,36 +1702,50 @@ namespace brep else ru = sd.repository_clone_url + '#' + sd.check_sha; + // Let unlikely invalid_argument propagate (see above). + // repository_location rl (move (ru), repository_type::git); - optional r (load (error, warn, verb_ ? &trace : nullptr, - *build_db_, retry_, - move (ts), - move (rl))); - - if (!r || r->status != 200) + try { - if (auto cr = update_synthetic_cr (conclusion_node_id, - conclusion_check_run_name, - result_status::error, - to_check_run_summary (r))) - { - l3 ([&]{trace << "updated check_run { " << *cr << " }";}); - } - else + optional r (load (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, + move (ts), + move (rl))); + + if (!r || r->status != 200) { - // Nothing really we can do in this case since we will not receive - // any further notifications. Log the error as a last resort. + // Let unlikely invalid_argument propagate (see above). + // + if (auto cr = update_synthetic_cr (effective_conclusion_node_id, + conclusion_check_run_name, + result_status::error, + to_check_run_summary (r))) + { + l3 ([&]{trace << "updated check_run { " << *cr << " }";}); + } + else + { + // Nothing really we can do in this case since we will not receive + // any further notifications. Log the error as a last resort. + + error << "failed to load CI tenant " << ts.id + << " and unable to update conclusion"; + } - error << "failed to load CI tenant " << ts.id - << " and unable to update conclusion"; + return nullptr; // No need to update service data in this case. } + } + catch (const runtime_error& e) // Database retries exhausted. + { + error << "failed to load CI tenant " << ts.id << ": " << e.what (); - return nullptr; // No need to update service data in this case. + // Fall through to retry on next call. } } - else if (!new_iat) - return nullptr; // Nothing to save (but retry on next call). + + if (!new_iat && conclusion_node_id.empty ()) + return nullptr; // Nothing to save (but potentially retry on next call). return [&error, iat = move (new_iat), @@ -1984,6 +1776,28 @@ namespace brep return sd.json (); }; } + catch (const std::exception& e) + { + NOTIFICATION_DIAG (log_writer); + error << "CI tenant " << ts.id << ": unhandled exception: " << e.what (); + + // Cancel the tenant otherwise we could end up in an infinite loop (see + // top of function). + // + try + { + if (cancel (error, warn, verb_ ? &trace : nullptr, + *build_db_, retry_, ts.type, ts.id)) + l3 ([&]{trace << "canceled CI tenant " << ts.id;}); + } + catch (const runtime_error& e) // Database retries exhausted. + { + l3 ([&]{trace << "failed to cancel CI tenant " << ts.id + << ": " << e.what ();}); + } + + return nullptr; + } // Build state change notifications (see tenant-services.hxx for // background). Mapping our state transitions to GitHub pose multiple @@ -2079,7 +1893,10 @@ namespace brep optional istate, const build_queued_hints& hs, const diag_epilogue& log_writer) const noexcept + try { + // NOTE: this function is noexcept and should not throw. + NOTIFICATION_DIAG (log_writer); service_data sd; @@ -2179,6 +1996,8 @@ namespace brep { // Create a check_run for each build as a single request. // + // Let unlikely invalid_argument propagate. + // if (gq_create_check_runs (error, crs, iat->token, @@ -2242,12 +2061,23 @@ namespace brep return sd.json (); }; } + catch (const std::exception& e) + { + NOTIFICATION_DIAG (log_writer); + + error << "CI tenant " << ts.id << ": unhandled exception: " << e.what (); + + return nullptr; + } function (const tenant_service&)> ci_github:: build_building (const tenant_service& ts, const build& b, const diag_epilogue& log_writer) const noexcept + try { + // NOTE: this function is noexcept and should not throw. + NOTIFICATION_DIAG (log_writer); service_data sd; @@ -2323,6 +2153,8 @@ namespace brep // if (iat != nullptr) { + // Let unlikely invalid_argument propagate. + // if (gq_update_check_run (error, *cr, iat->token, @@ -2388,17 +2220,30 @@ namespace brep return sd.json (); }; } + catch (const std::exception& e) + { + NOTIFICATION_DIAG (log_writer); + + string bid (gh_check_run_name (b)); // Full build id. + + error << "check run " << bid << ": unhandled exception: " << e.what(); + + return nullptr; + } function (const tenant_service&)> ci_github:: build_built (const tenant_service& ts, const build& b, const diag_epilogue& log_writer) const noexcept + try { + // NOTE: this function is noexcept and should not throw. + + NOTIFICATION_DIAG (log_writer); + // @@ TODO Include service_data::event_node_id and perhaps ts.id in // diagnostics? E.g. when failing to update check runs we print the // build ID only. - // - NOTIFICATION_DIAG (log_writer); service_data sd; try @@ -2518,6 +2363,11 @@ namespace brep { using namespace web::xhtml; + // Note: let all serialization exceptions propagate. The XML + // serialization code can throw bad_alloc or xml::serialization in + // case of I/O failures, but we're serializing to a string stream so + // both exceptions are unlikely. + // ostringstream os; xml::serializer s (os, "check_run_summary"); @@ -2613,7 +2463,8 @@ namespace brep if (cr.node_id) { - // Update existing check run to built. + // Update existing check run to built. Let unlikely invalid_argument + // propagate. // if (gq_update_check_run (error, cr, @@ -2630,7 +2481,7 @@ namespace brep } else { - // Create new check run. + // Create new check run. Let unlikely invalid_argument propagate. // // Note that we don't have build hints so will be creating this check // run with the full build id as name. In the unlikely event that an @@ -2678,6 +2529,8 @@ namespace brep cr.node_id = *sd.conclusion_node_id; cr.name = conclusion_check_run_name; + // Let unlikely invalid_argument propagate. + // if (gq_update_check_run (error, cr, iat->token, @@ -2782,6 +2635,16 @@ namespace brep return sd.json (); }; } + catch (const std::exception& e) + { + NOTIFICATION_DIAG (log_writer); + + string bid (gh_check_run_name (b)); // Full build id. + + error << "check run " << bid << ": unhandled exception: " << e.what(); + + return nullptr; + } string ci_github:: details_url (const build& b) const -- cgit v1.1