diff options
-rw-r--r-- | mod/mod-ci-github.cxx | 65 |
1 files changed, 46 insertions, 19 deletions
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx index c59b1af..7ee1c9f 100644 --- a/mod/mod-ci-github.cxx +++ b/mod/mod-ci-github.cxx @@ -1036,13 +1036,16 @@ namespace brep // Note that GitHub UI does not allow re-running the entire check suite // until all the check runs are completed. However if we got here as a // result of re-requesting the check suite from build_canceled() the check - // runs could be in any state. + // runs could be in any state (and, yes, re-requesting a not completed + // check suite via the API works). // Create an unloaded CI tenant. // // Note: use no delay since we need to (re)create the synthetic conclusion // check run as soon as possible. // + // @@ Maybe impose a delay to match rebuild delay? Also see build_cancel(). + // // Note that we use the create() API instead of start() since duplicate // management is not available in start(). // @@ -3224,10 +3227,31 @@ namespace brep NOTIFICATION_DIAG (log_writer); - // We end up here when the service data could not be saved so re-request - // the check suite in order to restart all of the builds. - - // Load the unsaved service data. + // We end up here when the service data could not be saved (for example, + // due to persistent transaction conflicts, which does happen if the user + // requests a rebuild of a large number of failed check runs). + // + // Note that we cannot recover from this situation since now our state (in + // service data) does not match the state on GitHub. Ideally in this case + // we would like to fail the conclusion check run and ask the user to + // re-request the entire check suite. However, failing the conclusion is + // not enough -- we also need to either remove all other check runs or to + // at least change them to the completed state (failed that, GitHub UI + // won't allow the user to re-request the check suite). Unfortunately, + // there is no way to remove check runs on GitHub nor to change the state + // of all the check runs that match a certain criteria. The only way is + // to specify each check run mutation with its node id (which we may not + // have). So the only way to implement this would be to query all the + // existing check runs (with pagination and all), and then change them to + // the completed state (again, probably in batches). + // + // So instead of going through all this trouble, we are going to just + // re-request the check suite ourselves. Luckily the GitHub API allows + // this even if the check suite is not completed. This is not ideal since + // we may cause an infinite failure cycle, but seem to be the best we can + // do without heroic measures. + + // Parse the unsaved service data. // service_data sd; try @@ -3262,24 +3286,27 @@ namespace brep if (iat != nullptr) { // Re-request the check suite. - - // Note that the conclusion check run is created before the tenant is - // loaded so the unsaved service data should always contain the check - // suite node id. // - assert (sd.check_suite_node_id.has_value ()); - - // Let unlikely invalid_argument propagate. + // Note that the conclusion check run is created before the tenant is + // loaded so the unsaved service data should normally contain the check + // suite node id, but let's not assume, just in case. // - if (gq_rerequest_check_suite (error, - iat->token, - sd.repository_node_id, - *sd.check_suite_node_id)) + if (sd.check_suite_node_id) { - l3 ([&]{trace << "re-requested check suite " << *sd.check_suite_node_id;}); + const string& nid (*sd.check_suite_node_id); + + // Let unlikely invalid_argument propagate. + // + if (gq_rerequest_check_suite (error, + iat->token, + sd.repository_node_id, + nid)) + { + l3 ([&]{trace << "re-requested check suite " << nid;}); + } + else + error << "failed to re-request check suite " << nid; } - else - error << "failed to re-request check suite " << *sd.check_suite_node_id; } } catch (const std::exception& e) |