author     Francois Kritzinger <francois@codesynthesis.com>  2024-11-19 14:18:04 +0200
committer  Francois Kritzinger <francois@codesynthesis.com>  2024-11-25 10:30:23 +0200
commit     4108e8c40e0d055de03ff839eaffb4346a921472 (patch)
tree       effb44eada67088559705a7de9ad7dff2cf063ad /mod
parent     3c89598f062dfffb615ffdb05b9c7c73f3900f1a (diff)
Re-implement handle_check_run_rerequest()
Diffstat (limited to 'mod')
-rw-r--r--  mod/mod-ci-github.cxx | 417
1 file changed, 409 insertions, 8 deletions
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index 57a3b08..2094762 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -633,6 +633,18 @@ namespace brep
return true;
}
+ // Create a gq_built_result.
+ //
+ // @@ TODO Use everywhere.
+ //
+ static gq_built_result
+ make_built_result (result_status rs, bool warning_success, string message)
+ {
+ return {gh_to_conclusion (rs, warning_success),
+ circle (rs) + ' ' + ucase (to_string (rs)),
+ move (message)};
+ }
+
// Parse a check run details URL into a build_id.
//
// Return nullopt if the URL is invalid.
@@ -640,11 +652,39 @@ namespace brep
static optional<build_id>
parse_details_url (const string& details_url);
-
+ // Note that GitHub always posts a message to their GUI saying "You have
+ // successfully requested <check_run_name> be rerun", regardless of what
+ // HTTP status code we respond with. However, we do return error status
+ // codes when appropriate in case they start handling them someday.
+ //
bool ci_github::
handle_check_run_rerequest (const gh_check_run_event& cr,
bool warning_success)
{
+ HANDLER_DIAG;
+
+ l3 ([&]{trace << "check_run event { " << cr << " }";});
+
+ // Get a new installation access token.
+ //
+ auto get_iat = [this, &trace, &error, &cr] ()
+ -> optional<gh_installation_access_token>
+ {
+ optional<string> jwt (generate_jwt (trace, error));
+ if (!jwt)
+ return nullopt;
+
+ optional<gh_installation_access_token> iat (
+ obtain_installation_access_token (cr.installation.id,
+ move (*jwt),
+ error));
+
+ if (iat)
+ l3 ([&]{trace << "installation_access_token { " << *iat << " }";});
+
+ return iat;
+ };
+
// The overall plan is as follows:
//
// 1. Load service data.
@@ -652,16 +692,19 @@ namespace brep
// 2. If the tenant is archived, then fail (re-create) conclusion with
// appropriate diagnostics.
//
- // 3. If the check run is in the queued state, then do nothing.
+ // @@ TMP A PR might get blocked indefinitely by this. If the user
+ // re-runs a single CR, the "re-run all check runs" option disappears
+ // from the UI. So the single re-run will keep failing but there
+ // will be no way to start a new CI from scratch.
//
- // 4. Re-create the check run in the queued state.
+ // 3. If the check run is in the queued state, then do nothing.
//
- // 5. Re-create the check run in the queued state and the conclusion in
+ // 4. Re-create the check run in the queued state and the conclusion in
// the building state. Note: do in a single request to make sure we
// either "win" or "loose" the potential race for both (important
- // for #8).
+ // for #7).
//
- // 6. Call the rebuild() function to attempt to schedule a rebuild. Pass
+ // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass
// the update function that does the following (if called):
//
// a. Save new node ids.
@@ -672,10 +715,12 @@ namespace brep
//
// d. "Return" the service data to be used after the call.
//
- // 7. If the result of rebuild() indicates the tenant is archived, then
+ // @@ TMP Not currently returning anything.
+ //
+ // 6. If the result of rebuild() indicates the tenant is archived, then
// fail (update) the conclusion check run with appropriate diagnostics.
//
- // 8. If original state is queued (no rebuild was scheduled), then fail
+ // 7. If original state is queued (no rebuild was scheduled), then fail
// (update) both the check run and the conclusion.
//
// Note that while conceptually we are updating existing check runs, in
@@ -683,6 +728,362 @@ namespace brep
// existing ones because GitHub does not allow transitioning out of the
// built state.
//
+
+ const string& repo_node_id (cr.repository.node_id);
+ const string& head_sha (cr.check_run.check_suite.head_sha);
+
+ // The build and conclusion check runs. They are sent to GitHub in a
+ // single request (unless something goes wrong) so store them together
+ // from the outset.
+ //
+ vector<check_run> check_runs (2);
+ check_run& bcr (check_runs[0]); // Build check run
+ check_run& ccr (check_runs[1]); // Conclusion check run
+
+ ccr.name = conclusion_check_run_name;
+
+ // Load the service data, failing the conclusion check run if the tenant
+ // has been archived.
+ //
+ service_data sd;
+ {
+ // Service id that uniquely identifies the CI tenant.
+ //
+ string sid (repo_node_id + ':' + head_sha);
+
+ if (optional<pair<tenant_service, bool>> p =
+ find (*build_db_, "ci-github", sid))
+ {
+ if (p->second) // Tenant is archived
+ {
+ // Fail (re-create) the conclusion check run.
+ //
+ optional<gh_installation_access_token> iat (get_iat ());
+ if (!iat)
+ throw server_error ();
+
+ gq_built_result br (
+ make_built_result (result_status::error, warning_success,
+ "Unable to rebuild: tenant has been archived"));
+
+ if (gq_create_check_run (error, ccr, iat->token,
+ repo_node_id, head_sha,
+ nullopt /* details_url */,
+ build_state::built, move (br)))
+ {
+ l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ fail << "check_run " << cr.check_run.node_id
+ << ": unable to re-create conclusion check run";
+ }
+
+ return true;
+ }
+
+ tenant_service& ts (p->first);
+
+ try
+ {
+ sd = service_data (*ts.data);
+ }
+ catch (const invalid_argument& e)
+ {
+ fail << "failed to parse service data: " << e;
+ }
+ }
+ else
+ {
+ // No such tenant.
+ //
+ error << "check run " << cr.check_run.node_id
+ << " re-requested but tenant_service with id " << sid
+ << " does not exist";
+ return true;
+ }
+ }
+
+ // Get a new IAT if the one from the service data has expired.
+ //
+ const gh_installation_access_token* iat (nullptr);
+ optional<gh_installation_access_token> new_iat;
+
+ if (system_clock::now () > sd.installation_access.expires_at)
+ {
+ if (new_iat = get_iat ())
+ iat = &*new_iat;
+ else
+ throw server_error ();
+ }
+ else
+ iat = &sd.installation_access;
+
+ // Fail if it's the conclusion check run that is being re-requested.
+ //
+ if (cr.check_run.name == conclusion_check_run_name)
+ {
+ // Must exist if it can be re-requested.
+ //
+ assert (sd.conclusion_node_id);
+
+ l3 ([&]{trace << "ignoring re-requested conclusion check_run";});
+
+ gq_built_result br (
+ make_built_result (result_status::error, warning_success,
+ "Conclusion check run cannot be rebuilt"));
+
+ // Fail (update) the conclusion check run.
+ //
+ if (gq_update_check_run (error, ccr, iat->token,
+ repo_node_id, *sd.conclusion_node_id,
+ nullopt /* details_url */,
+ build_state::built, move (br)))
+ {
+ l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ fail << "check run " << cr.check_run.node_id
+ << ": unable to update conclusion check run "
+ << *sd.conclusion_node_id;
+ }
+
+ return true;
+ }
+
+ // Parse the check_run's details_url to extract build id.
+ //
+ // While this is a bit hackish, there doesn't seem to be a better way
+ // (like associating custom data with a check run). Note that the GitHub
+ // UI only allows rebuilding completed check runs, so the details URL
+ // should be there.
+ //
+ optional<build_id> bid (parse_details_url (cr.check_run.details_url));
+ if (!bid)
+ {
+ fail << "check run " << cr.check_run.node_id
+ << ": failed to extract build id from details_url";
+ }
+
+ // Initialize `bcr` with the check run from the service data.
+ //
+ {
+ // Search for the check run in the service data.
+ //
+ bool found (false);
+
+ for (const check_run& scr: sd.check_runs)
+ {
+ if (scr.node_id && *scr.node_id == cr.check_run.node_id)
+ {
+ found = true;
+
+ // Do nothing if the build is already queued.
+ //
+ if (scr.state == build_state::queued)
+ {
+ l3 ([&]{trace << "ignoring already-queued check run";});
+ return true;
+ }
+
+ bcr.name = scr.name;
+ bcr.build_id = scr.build_id;
+ bcr.state = scr.state;
+
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ fail << "check_run " << cr.check_run.node_id
+ << ": re-requested but does not exist in service data";
+ }
+ }
+
+ // Transition the build and conclusion check runs out of the built state
+ // by re-creating them.
+ //
+ bcr.state = build_state::queued;
+ bcr.state_synced = false;
+ bcr.details_url = cr.check_run.details_url;
+
+ ccr.state = build_state::building;
+ ccr.state_synced = false;
+
+ if (gq_create_check_runs (error, check_runs, iat->token,
+ repo_node_id, head_sha))
+ {
+ assert (bcr.state == build_state::queued);
+ assert (ccr.state == build_state::building);
+
+ l3 ([&]{trace << "created check_run { " << bcr << " }";});
+ l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ fail << "check run " << cr.check_run.node_id
+ << ": unable to re-create build and conclusion check runs";
+ }
+
+ // Request the rebuild.
+
+ // Function called by rebuild() to update the service data (but only if
+ // the build is actually restarted).
+ //
+ auto update_sd = [&error, &new_iat,
+ &cr, &bcr, &ccr] (const tenant_service& ts,
+ build_state) -> optional<string>
+ {
+ // NOTE: this lambda may be called repeatedly (e.g., due to transaction
+ // being aborted) and so should not move out of its captures.
+
+ service_data sd;
+ try
+ {
+ sd = service_data (*ts.data);
+ }
+ catch (const invalid_argument& e)
+ {
+ error << "failed to parse service data: " << e;
+ return nullopt;
+ }
+
+ for (check_run& scr: sd.check_runs)
+ {
+ if (scr.node_id && *scr.node_id == cr.check_run.node_id)
+ {
+ scr = bcr; // Update with new node_id, state, state_synced.
+ sd.conclusion_node_id = ccr.node_id;
+
+ sd.completed = false;
+
+ // Save the IAT if we created a new one.
+ //
+ if (new_iat)
+ sd.installation_access = *new_iat;
+
+ return sd.json ();
+ }
+ }
+
+ assert (false); // Check run must exist because we found it earlier.
+
+ return nullopt;
+ };
+
+ optional<build_state> bs (rebuild (*build_db_, retry_, *bid, update_sd));
+
+ // If the build has been archived or re-enqueued since we loaded the
+ // service data, fail (by updating) both the build check run and the
+ // conclusion check run. Otherwise the build has been successfully
+ // re-enqueued so do nothing further.
+ //
+ if (!bs || *bs == build_state::queued)
+ {
+ // @@ TMP Diverging from the plan here to fail both CRs in the
+ // already-archived case as well. Seems better not to leave the
+ // build CR in the queued state.
+ //
+ gq_built_result bbr; // Built result for the build check run.
+ gq_built_result cbr; // Built result for the conclusion check run.
+
+ if (!bs) // Archived
+ {
+ // The build has expired since we loaded the service data. Most likely
+ // the tenant has been archived.
+ //
+ bbr = make_built_result (
+ result_status::error, warning_success,
+ "Unable to rebuild: tenant has been archived or no such build");
+
+ cbr = bbr;
+ }
+ else // *bs == build_state::queued
+ {
+ // This build has been re-enqueued since we first loaded the service
+ // data. This could happen if the user clicked "re-run" multiple times
+ // and another handler won the rebuild() race.
+ //
+ // However, the winner of the check runs race cannot be determined.
+ //
+ // Best case the other handler won the check runs race as well and
+ // thus everything will proceed normally. Our check runs will be
+ // invisible and disregarded.
+ //
+ // Worst case we won the check runs race and the other handler's check
+ // runs -- the ones that will be updated by the build_*()
+ // notifications -- are no longer visible, leaving things quite
+ // broken.
+ //
+ // Either way, we fail our check runs. In the best case scenario it
+ // will have no effect; in the worst case scenario it lets the user
+ // know something has gone wrong.
+ //
+ // @@ TMP In an attempt to recover, the user might end up clicking
+ // re-run a few times but nothing will change in the UI because the
+ // new check runs will only be created once the re-enqueued build
+ // has entered the building state.
+ //
+ bbr = make_built_result (
+ result_status::error, warning_success,
+ "Unable to rebuild at this time, please try again in a few minutes");
+
+ cbr = make_built_result (result_status::error, warning_success,
+ "Failed to rebuild check run(s)");
+ }
+
+ // True if we successfully updated both of the check runs.
+ //
+ bool bu (true); // Both updated
+
+ // Fail the build check run.
+ //
+ if (gq_update_check_run (error, bcr, iat->token,
+ repo_node_id, *bcr.node_id,
+ nullopt /* details_url */,
+ build_state::built, move (bbr)))
+ {
+ l3 ([&]{trace << "updated check_run { " << bcr << " }";});
+ }
+ else
+ {
+ bu = false;
+
+ error << "check run " << cr.check_run.node_id
+ << ": unable to update (replacement) check run "
+ << *bcr.node_id;
+ }
+
+ // Fail the conclusion check run.
+ //
+ if (gq_update_check_run (error, ccr, iat->token,
+ repo_node_id, *ccr.node_id,
+ nullopt /* details_url */,
+ build_state::built, move (cbr)))
+ {
+ l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ bu = false;
+
+ error << "check run " << cr.check_run.node_id
+ << ": unable to update conclusion check run " << *ccr.node_id;
+ }
+
+ // Fail the handler if either of the check runs could not be updated.
+ //
+ if (!bu)
+ throw server_error ();
+
+ return true;
+ }
+
+ // The build has successfully been re-enqueued.
+
+ return true;
}
// @@ TMP