author     Francois Kritzinger <francois@codesynthesis.com>  2024-11-19 14:18:04 +0200
committer  Francois Kritzinger <francois@codesynthesis.com>  2024-11-25 10:30:23 +0200
commit     4108e8c40e0d055de03ff839eaffb4346a921472 (patch)
tree       effb44eada67088559705a7de9ad7dff2cf063ad
parent     3c89598f062dfffb615ffdb05b9c7c73f3900f1a (diff)
Re-implement handle_check_run_rerequest()
-rw-r--r--  mod/mod-ci-github.cxx  417
1 file changed, 409 insertions, 8 deletions
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index 57a3b08..2094762 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -633,6 +633,18 @@ namespace brep
     return true;
   }
 
+  // Create a gq_built_result.
+  //
+  // @@ TODO Use everywhere.
+  //
+  static gq_built_result
+  make_built_result (result_status rs, bool warning_success, string message)
+  {
+    return {gh_to_conclusion (rs, warning_success),
+            circle (rs) + ' ' + ucase (to_string (rs)),
+            move (message)};
+  }
+
   // Parse a check run details URL into a build_id.
   //
   // Return nullopt if the URL is invalid.
@@ -640,11 +652,39 @@ namespace brep
   static optional<build_id>
   parse_details_url (const string& details_url);
 
-
+  // Note that GitHub always posts a message to their GUI saying "You have
+  // successfully requested <check_run_name> be rerun", regardless of what
+  // HTTP status code we respond with. However we do return error status codes
+  // when appropriate in case they start handling them someday.
+  //
   bool ci_github::
   handle_check_run_rerequest (const gh_check_run_event& cr,
                               bool warning_success)
   {
+    HANDLER_DIAG;
+
+    l3 ([&]{trace << "check_run event { " << cr << " }";});
+
+    // Get a new installation access token.
+    //
+    auto get_iat = [this, &trace, &error, &cr] ()
+      -> optional<gh_installation_access_token>
+    {
+      optional<string> jwt (generate_jwt (trace, error));
+      if (!jwt)
+        return nullopt;
+
+      optional<gh_installation_access_token> iat (
+        obtain_installation_access_token (cr.installation.id,
+                                          move (*jwt),
+                                          error));
+
+      if (iat)
+        l3 ([&]{trace << "installation_access_token { " << *iat << " }";});
+
+      return iat;
+    };
+
     // The overall plan is as follows:
     //
     // 1. Load service data.
@@ -652,16 +692,19 @@ namespace brep
     // 2. If the tenant is archived, then fail (re-create) conclusion with
     //    appropriate diagnostics.
     //
-    // 3. If the check run is in the queued state, then do nothing.
+    //    @@ TMP A PR might get blocked indefinitely by this. If the user
+    //       re-runs a single CR the "re-run all check runs" option disappears
+    //       from the UI. So the single re-run will keep failing but there
+    //       will be no way to start a new CI from scratch.
     //
-    // 4. Re-create the check run in the queued state.
+    // 3. If the check run is in the queued state, then do nothing.
     //
-    // 5. Re-create the check run in the queued state and the conclusion in
+    // 4. Re-create the check run in the queued state and the conclusion in
     //    the building state. Note: do in a single request to make sure we
     //    either "win" or "loose" the potential race for both (important
-    //    for #8).
+    //    for #7).
     //
-    // 6. Call the rebuild() function to attempt to schedule a rebuild. Pass
+    // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass
     //    the update function that does the following (if called):
     //
     //    a. Save new node ids.
@@ -672,10 +715,12 @@ namespace brep
     //
     //    d. "Return" the service data to be used after the call.
     //
-    // 7. If the result of rebuild() indicates the tenant is archived, then
+    //       @@ TMP Not currently returning anything.
+    //
+    // 6. If the result of rebuild() indicates the tenant is archived, then
     //    fail (update) the conclusion check run with appropriate diagnostics.
     //
-    // 8. If original state is queued (no rebuild was scheduled), then fail
+    // 7. If original state is queued (no rebuild was scheduled), then fail
     //    (update) both the check run and the conclusion.
     //
     // Note that while conceptually we are updating existing check runs, in
@@ -683,6 +728,362 @@ namespace brep
     // existing ones because GitHub does not allow transitioning out of the
     // built state.
     //
+
+    const string& repo_node_id (cr.repository.node_id);
+    const string& head_sha (cr.check_run.check_suite.head_sha);
+
+    // The build and conclusion check runs. They are sent to GitHub in a
+    // single request (unless something goes wrong) so store them together
+    // from the outset.
+    //
+    vector<check_run> check_runs (2);
+    check_run& bcr (check_runs[0]); // Build check run
+    check_run& ccr (check_runs[1]); // Conclusion check run
+
+    ccr.name = conclusion_check_run_name;
+
+    // Load the service data, failing the conclusion check run if the tenant
+    // has been archived.
+    //
+    service_data sd;
+    {
+      // Service id that uniquely identifies the CI tenant.
+      //
+      string sid (repo_node_id + ':' + head_sha);
+
+      if (optional<pair<tenant_service, bool>> p =
+            find (*build_db_, "ci-github", sid))
+      {
+        if (p->second) // Tenant is archived
+        {
+          // Fail (re-create) the conclusion check run.
+          //
+          optional<gh_installation_access_token> iat (get_iat ());
+          if (!iat)
+            throw server_error ();
+
+          gq_built_result br (
+            make_built_result (result_status::error, warning_success,
+                               "Unable to rebuild: tenant has been archived"));
+
+          if (gq_create_check_run (error, ccr, iat->token,
+                                   repo_node_id, head_sha,
+                                   nullopt /* details_url */,
+                                   build_state::built, move (br)))
+          {
+            l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+          }
+          else
+          {
+            fail << "check_run " << cr.check_run.node_id
+                 << ": unable to re-create conclusion check run";
+          }
+
+          return true;
+        }
+
+        tenant_service& ts (p->first);
+
+        try
+        {
+          sd = service_data (*ts.data);
+        }
+        catch (const invalid_argument& e)
+        {
+          fail << "failed to parse service data: " << e;
+        }
+      }
+      else
+      {
+        // No such tenant.
+        //
+        error << "check run " << cr.check_run.node_id
+              << " re-requested but tenant_service with id " << sid
+              << " does not exist";
+        return true;
+      }
+    }
+
+    // Get a new IAT if the one from the service data has expired.
+    //
+    const gh_installation_access_token* iat (nullptr);
+    optional<gh_installation_access_token> new_iat;
+
+    if (system_clock::now () > sd.installation_access.expires_at)
+    {
+      if (new_iat = get_iat ())
+        iat = &*new_iat;
+      else
+        throw server_error ();
+    }
+    else
+      iat = &sd.installation_access;
+
+    // Fail if it's the conclusion check run that is being re-requested.
+    //
+    if (cr.check_run.name == conclusion_check_run_name)
+    {
+      // Must exist if it can be re-requested.
+      //
+      assert (sd.conclusion_node_id);
+
+      l3 ([&]{trace << "ignoring re-requested conclusion check_run";});
+
+      gq_built_result br (
+        make_built_result (result_status::error, warning_success,
+                           "Conclusion check run cannot be rebuilt"));
+
+      // Fail (update) the conclusion check run.
+      //
+      if (gq_update_check_run (error, ccr, iat->token,
+                               repo_node_id, *sd.conclusion_node_id,
+                               nullopt /* details_url */,
+                               build_state::built, move (br)))
+      {
+        l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
+      }
+      else
+      {
+        fail << "check run " << cr.check_run.node_id
+             << ": unable to update conclusion check run "
+             << *sd.conclusion_node_id;
+      }
+
+      return true;
+    }
+
+    // Parse the check_run's details_url to extract build id.
+    //
+    // While this is a bit hackish, there doesn't seem to be a better way
+    // (like associating custom data with a check run). Note that the GitHub
+    // UI only allows rebuilding completed check runs, so the details URL
+    // should be there.
+    //
+    optional<build_id> bid (parse_details_url (cr.check_run.details_url));
+    if (!bid)
+    {
+      fail << "check run " << cr.check_run.node_id
+           << ": failed to extract build id from details_url";
+    }
+
+    // Initialize `bcr` with the check run from the service data.
+    //
+    {
+      // Search for the check run in the service data.
+      //
+      bool found (false);
+
+      for (const check_run& scr: sd.check_runs)
+      {
+        if (scr.node_id && *scr.node_id == cr.check_run.node_id)
+        {
+          found = true;
+
+          // Do nothing if the build is already queued.
+          //
+          if (scr.state == build_state::queued)
+          {
+            l3 ([&]{trace << "ignoring already-queued check run";});
+            return true;
+          }
+
+          bcr.name = scr.name;
+          bcr.build_id = scr.build_id;
+          bcr.state = scr.state;
+
+          break;
+        }
+      }
+
+      if (!found)
+      {
+        fail << "check_run " << cr.check_run.node_id
+             << ": re-requested but does not exist in service data";
+      }
+    }
+
+    // Transition the build and conclusion check runs out of the built state
+    // by re-creating them.
+    //
+    bcr.state = build_state::queued;
+    bcr.state_synced = false;
+    bcr.details_url = cr.check_run.details_url;
+
+    ccr.state = build_state::building;
+    ccr.state_synced = false;
+
+    if (gq_create_check_runs (error, check_runs, iat->token,
+                              repo_node_id, head_sha))
+    {
+      assert (bcr.state == build_state::queued);
+      assert (ccr.state == build_state::building);
+
+      l3 ([&]{trace << "created check_run { " << bcr << " }";});
+      l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+    }
+    else
+    {
+      fail << "check run " << cr.check_run.node_id
+           << ": unable to re-create build and conclusion check runs";
+    }
+
+    // Request the rebuild.
+
+    // Function called by rebuild() to update the service data (but only if
+    // the build is actually restarted).
+    //
+    auto update_sd = [&error, &new_iat,
+                      &cr, &bcr, &ccr] (const tenant_service& ts,
+                                        build_state) -> optional<string>
+    {
+      // NOTE: this lambda may be called repeatedly (e.g., due to transaction
+      // being aborted) and so should not move out of its captures.
+
+      service_data sd;
+      try
+      {
+        sd = service_data (*ts.data);
+      }
+      catch (const invalid_argument& e)
+      {
+        error << "failed to parse service data: " << e;
+        return nullopt;
+      }
+
+      for (check_run& scr: sd.check_runs)
+      {
+        if (scr.node_id && *scr.node_id == cr.check_run.node_id)
+        {
+          scr = bcr; // Update with new node_id, state, state_synced.
+          sd.conclusion_node_id = ccr.node_id;
+
+          sd.completed = false;
+
+          // Save the IAT if we created a new one.
+          //
+          if (new_iat)
+            sd.installation_access = *new_iat;
+
+          return sd.json ();
+        }
+      }
+
+      assert (false); // Check run must exist because we found it earlier.
+
+      return nullopt;
+    };
+
+    optional<build_state> bs (rebuild (*build_db_, retry_, *bid, update_sd));
+
+    // If the build has been archived or re-enqueued since we loaded the
+    // service data, fail (by updating) both the build check run and the
+    // conclusion check run. Otherwise the build has been successfully
+    // re-enqueued so do nothing further.
+    //
+    if (!bs || *bs == build_state::queued)
+    {
+      // @@ TMP Diverging from the plan here to fail both CRs in the
+      //    already-archived case as well. Seems better to not leave the
+      //    build CR in the queued state.
+      //
+      gq_built_result bbr; // Built result for the build check run.
+      gq_built_result cbr; // Built result for the conclusion check run.
+
+      if (!bs) // Archived
+      {
+        // The build has expired since we loaded the service data. Most likely
+        // the tenant has been archived.
+        //
+        bbr = make_built_result (
+          result_status::error, warning_success,
+          "Unable to rebuild: tenant has been archived or no such build");
+
+        cbr = bbr;
+      }
+      else // *bs == build_state::queued
+      {
+        // This build has been re-enqueued since we first loaded the service
+        // data. This could happen if the user clicked "re-run" multiple times
+        // and another handler won the rebuild() race.
+        //
+        // However the winner of the check runs race cannot be determined.
+        //
+        // Best case the other handler won the check runs race as well and
+        // thus everything will proceed normally. Our check runs will be
+        // invisible and disregarded.
+        //
+        // Worst case we won the check runs race and the other handler's check
+        // runs -- the ones that will be updated by the build_*()
+        // notifications -- are no longer visible, leaving things quite
+        // broken.
+        //
+        // Either way, we fail our check runs. In the best case scenario it
+        // will have no effect; in the worst case scenario it lets the user
+        // know something has gone wrong.
+        //
+        // @@ TMP In an attempt to recover, the user might end up clicking
+        //    re-run a few times but nothing will change in the UI because the
+        //    new check runs will only be created once the re-enqueued build
+        //    has entered the building state.
+        //
+        bbr = make_built_result (
+          result_status::error, warning_success,
+          "Unable to rebuild at this time, please try again in a few minutes");
+
+        cbr = make_built_result (result_status::error, warning_success,
+                                 "Failed to rebuild check run(s)");
+      }
+
+      // True if we successfully updated both of the check runs.
+      //
+      bool bu (true); // Both updated
+
+      // Fail the build check run.
+      //
+      if (gq_update_check_run (error, bcr, iat->token,
+                               repo_node_id, *bcr.node_id,
+                               nullopt /* details_url */,
+                               build_state::built, move (bbr)))
+      {
+        l3 ([&]{trace << "updated check_run { " << bcr << " }";});
+      }
+      else
+      {
+        bu = false;
+
+        error << "check run " << cr.check_run.node_id
+              << ": unable to update (replacement) check run "
+              << *bcr.node_id;
+      }
+
+      // Fail the conclusion check run.
+      //
+      if (gq_update_check_run (error, ccr, iat->token,
+                               repo_node_id, *ccr.node_id,
+                               nullopt /* details_url */,
+                               build_state::built, move (cbr)))
+      {
+        l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
+      }
+      else
+      {
+        bu = false;
+
+        error << "check run " << cr.check_run.node_id
+              << ": unable to update conclusion check run " << *ccr.node_id;
+      }
+
+      // Fail the handler if either of the check runs could not be updated.
+      //
+      if (!bu)
+        throw server_error ();
+
+      return true;
+    }
+
+    // The build has successfully been re-enqueued.
+
+    return true;
   }
 
   // @@ TMP
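
A note on the "@@ TODO Use everywhere" marker introduced above: the other places in mod/mod-ci-github.cxx that still assemble a gq_built_result by hand are not touched by this commit. A follow-up cleanup would presumably reduce such call sites to a single make_built_result() call, roughly along these lines (rs and summary are hypothetical placeholders, not names from this diff):

    // Before: conclusion, title, and summary are assembled inline at the
    // call site (hypothetical example of the pattern the helper factors out).
    //
    gq_built_result br {gh_to_conclusion (rs, warning_success),
                        circle (rs) + ' ' + ucase (to_string (rs)),
                        summary};

    // After: the formatting lives in one place. The message is taken by
    // value and moved from, matching the helper's signature above.
    //
    gq_built_result br (
      make_built_result (rs, warning_success, move (summary)));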