1 files changed, 46 insertions, 19 deletions
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index c59b1af..7ee1c9f 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -1036,13 +1036,16 @@ namespace brep
     // Note that GitHub UI does not allow re-running the entire check suite
     // until all the check runs are completed. However if we got here as a
     // result of re-requesting the check suite from build_canceled() the check
-    // runs could be in any state.
+    // runs could be in any state (and, yes, re-requesting a not completed
+    // check suite via the API works).
 
     // Create an unloaded CI tenant.
     //
     // Note: use no delay since we need to (re)create the synthetic conclusion
     // check run as soon as possible.
     //
+    // @@ Maybe impose a delay to match rebuild delay? Also see build_cancel().
+    //
     // Note that we use the create() API instead of start() since duplicate
     // management is not available in start().
     //
@@ -3224,10 +3227,31 @@ namespace brep
 
     NOTIFICATION_DIAG (log_writer);
 
-    // We end up here when the service data could not be saved so re-request
-    // the check suite in order to restart all of the builds.
-
-    // Load the unsaved service data.
+    // We end up here when the service data could not be saved (for example,
+    // due to persistent transaction conflicts, which does happen if the user
+    // requests a rebuild of a large number of failed check runs).
+    //
+    // Note that we cannot recover from this situation since now our state (in
+    // service data) does not match the state on GitHub. Ideally in this case
+    // we would like to fail the conclusion check run and ask the user to
+    // re-request the entire check suite. However, failing the conclusion is
+    // not enough -- we also need to either remove all other check runs or to
+    // at least change them to the completed state (failed that, GitHub UI
+    // won't allow the user to re-request the check suite). Unfortunately,
+    // there is no way to remove check runs on GitHub nor to change the state
+    // of all the check runs that match a certain criteria. The only way is
+    // to specify each check run mutation with its node id (which we may not
+    // have). So the only way to implement this would be to query all the
+    // existing check runs (with pagination and all), and then change them to
+    // the completed state (again, probably in batches).
+    //
+    // So instead of going through all this trouble, we are going to just
+    // re-request the check suite ourselves. Luckily the GitHub API allows
+    // this even if the check suite is not completed. This is not ideal since
+    // we may cause an infinite failure cycle, but seem to be the best we can
+    // do without heroic measures.
+
+    // Parse the unsaved service data.
     //
     service_data sd;
     try
@@ -3262,24 +3286,27 @@ namespace brep
     if (iat != nullptr)
     {
       // Re-request the check suite.
-
-      // Note that the conclusion check run is created before the tenant is
-      // loaded so the unsaved service data should always contain the check
-      // suite node id.
       //
-      assert (sd.check_suite_node_id.has_value ());
-
-      // Let unlikely invalid_argument propagate.
+      // Note that the conclusion check run is created before the tenant is
+      // loaded so the unsaved service data should normally contain the check
+      // suite node id, but let's not assume, just in case.
       //
-      if (gq_rerequest_check_suite (error,
-                                    iat->token,
-                                    sd.repository_node_id,
-                                    *sd.check_suite_node_id))
+      if (sd.check_suite_node_id)
       {
-        l3 ([&]{trace << "re-requested check suite " << *sd.check_suite_node_id;});
+        const string& nid (*sd.check_suite_node_id);
+
+        // Let unlikely invalid_argument propagate.
+        //
+        if (gq_rerequest_check_suite (error,
+                                      iat->token,
+                                      sd.repository_node_id,
+                                      nid))
+        {
+          l3 ([&]{trace << "re-requested check suite " << nid;});
+        }
+        else
+          error << "failed to re-request check suite " << nid;
       }
-      else
-        error << "failed to re-request check suite " << *sd.check_suite_node_id;
     }
   }
   catch (const std::exception& e)