Diffstat (limited to 'mod/mod-ci-github.cxx')
-rw-r--r--  mod/mod-ci-github.cxx  1319
1 file changed, 923 insertions(+), 396 deletions(-)
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index 6dfaa5f..ded15b6 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -19,26 +19,6 @@
#include <stdexcept>
-// @@ Remaining TODOs
-//
-// - Rerequested checks
-//
-// - check_suite (action: rerequested): received when user re-runs all
-// checks.
-//
-// - check_run (action: rerequested): received when user re-runs a
-// specific check or all failed checks.
-//
-// @@ TMP I have confirmed that the above is accurate.
-//
-// Will need to extract a few more fields from check_runs, but the layout
-// is very similar to that of check_suite.
-//
-// - Choose strong webhook secret (when deploying).
-//
-// - Check that delivery UUID has not been received before (replay attack).
-//
-
// Resources:
//
// Creating an App:
@@ -280,9 +260,6 @@ namespace brep
// is that we want to be "notified" of new actions at which point we can
// decide whether to ignore them or to handle them.
//
- // @@ There is also check_run even (re-requested by user, either
- // individual check run or all the failed check runs).
- //
if (event == "check_suite")
{
gh_check_suite_event cs;
@@ -316,13 +293,10 @@ namespace brep
else if (cs.action == "completed")
{
// GitHub thinks that "all the check runs in this check suite have
- // completed and a conclusion is available". Looks like this one we
- // ignore?
+ // completed and a conclusion is available". Check with our own
+ // bookkeeping and log an error if there is a mismatch.
//
- // What if our bookkeeping says otherwise? But then we can't even
- // access the service data easily here. @@ TODO: maybe/later.
- //
- return true;
+ return handle_check_suite_completed (move (cs), warning_success);
}
else
{
@@ -398,7 +372,8 @@ namespace brep
throw invalid_request (400, move (m));
}
- if (pr.action == "opened" || pr.action == "synchronize")
+ if (pr.action == "opened" ||
+ pr.action == "synchronize")
{
// opened
// A pull request was opened.
@@ -406,23 +381,78 @@ namespace brep
// synchronize
// A pull request's head branch was updated from the base branch or
// new commits were pushed to the head branch. (Note that there is
- // no equivalent event for the base branch. That case gets handled
- // in handle_check_suite_request() instead. @@ Not anymore.)
+ // no equivalent event for the base branch.)
//
- // Note that both cases are handled the same: we start a new CI
+ // Note that both cases are handled similarly: we start a new CI
// request which will be reported on the new commit id.
//
return handle_pull_request (move (pr), warning_success);
}
- else
+ else if (pr.action == "edited")
+ {
+ // PR base branch changed (to a different branch), among other
+ // irrelevant changes (title, body, etc.).
+ //
+ // This is in a sense a special case of the base branch moving. In
+ // that case we don't do anything (due to the head sharing problem)
+ // relying instead on the branch protection rule. So it makes sense
+ // to do the same here.
+ //
+ return true;
+ }
+ else if (pr.action == "closed")
+ {
+ // PR has been closed (as merged or not; see merged member). Also
+ // apparently received if base branch is deleted (and the same
+ // for head branch). See also the reopened event below.
+ //
+ // While it may seem natural to cancel the CI for the closed PR, it
+ // might actually be useful to have a completed CI record. GitHub
+ // doesn't prevent us from publishing CI results for the closed PR
+ // (even if both base and head branches were deleted). And if such a
+ // PR is reopened, the CI results remain.
+ //
+ return true;
+ }
+ else if (pr.action == "reopened")
{
- // Ignore the remaining actions by sending a 200 response with empty
- // body.
+ // Previously closed PR has been reopened.
+ //
+ // Since we don't cancel the CI for a closed PR, there is nothing
+ // to do if it is reopened.
//
- // @@ Ignore known but log unknown, as in check_suite above?
+ return true;
+ }
+ else if (pr.action == "assigned" ||
+ pr.action == "auto_merge_disabled" ||
+ pr.action == "auto_merge_enabled" ||
+ pr.action == "converted_to_draft" ||
+ pr.action == "demilestoned" ||
+ pr.action == "dequeued" ||
+ pr.action == "enqueued" ||
+ pr.action == "labeled" ||
+ pr.action == "locked" ||
+ pr.action == "milestoned" ||
+ pr.action == "ready_for_review" ||
+ pr.action == "review_request_removed" ||
+ pr.action == "review_requested" ||
+ pr.action == "unassigned" ||
+ pr.action == "unlabeled" ||
+ pr.action == "unlocked")
+ {
+ // These have no relation to CI.
//
return true;
}
+ else
+ {
+ // Ignore unknown actions by sending a 200 response with empty body
+ // but also log as an error since we want to notice new actions.
+ //
+ error << "unknown action '" << pr.action << "' in pull_request event";
+
+ return true;
+ }
}
else
{
@@ -502,20 +532,18 @@ namespace brep
l3 ([&]{trace << "installation_access_token { " << *iat << " }";});
- // @@ What happens if we call this functions with an already existing
- // node_id (e.g., replay attack). See the UUID header above.
- //
-
// While it would have been nice to cancel CIs of PRs with this branch as
- // base not to waste resources, there are complications: Firsty, we can
- // only do this for remote PRs (since local PRs may share the result with
- // branch push). Secondly, we try to do our best even if the branch
- // protection rule for head behind is not enabled. In this case, it would
- // be good to complete the CI. So maybe/later.
+ // base not to waste resources, there are complications: Firstly, we can
+ // only do this for remote PRs (since local PRs will most likely share the
+ // result with branch push). Secondly, we try to do our best even if the
+ // branch protection rule for head behind is not enabled. In this case, it
+ // would be good to complete the CI. So maybe/later. See also the head
+ // case in handle_pull_request(), where we do cancel remote PRs that are
+ // not shared.
// Service id that uniquely identifies the CI tenant.
//
- string sid (cs.repository.node_id + ":" + cs.check_suite.head_sha);
+ string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha);
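+ //
+ // (The repository node id is GitHub's opaque global id, so the resulting
+ // sid looks something like "R_kgDOABCDEF:4a0fabf1..."; both values here
+ // are made up for illustration.)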
// If the user requests a rebuild of the (entire) PR, then this manifests
// as the check_suite rather than pull_request event. Specifically:
@@ -541,11 +569,13 @@ namespace brep
{
kind = service_data::remote;
- if (optional<tenant_service> ts = find (*build_db_, "ci-github", sid))
+ if (optional<tenant_data> d = find (*build_db_, "ci-github", sid))
{
+ tenant_service& ts (d->service);
+
try
{
- service_data sd (*ts->data);
+ service_data sd (*ts.data);
check_sha = move (sd.check_sha); // Test merge commit.
}
catch (const invalid_argument& e)
@@ -630,6 +660,132 @@ namespace brep
return true;
}
+ bool ci_github::
+ handle_check_suite_completed (gh_check_suite_event cs, bool warning_success)
+ {
+ // The plan is as follows:
+ //
+ // 1. Load the service data.
+ //
+ // 2. Verify it is completed.
+ //
+ // 3. Verify the check run counts match.
+ //
+ // 4. Verify (like in build_built()) that all the check runs are
+ // completed.
+ //
+ // 5. Verify the result matches what GitHub thinks it is.
+
+ HANDLER_DIAG;
+
+ l3 ([&]{trace << "check_suite event { " << cs << " }";});
+
+ // Service id that uniquely identifies the CI tenant.
+ //
+ string sid (cs.repository.node_id + ':' + cs.check_suite.head_sha);
+
+ // The common log entry subject.
+ //
+ string sub ("check suite " + cs.check_suite.node_id + '/' + sid);
+
+ // Load the service data.
+ //
+ service_data sd;
+
+ if (optional<tenant_data> d = find (*build_db_, "ci-github", sid))
+ {
+ try
+ {
+ sd = service_data (*d->service.data);
+ }
+ catch (const invalid_argument& e)
+ {
+ fail << "failed to parse service data: " << e;
+ }
+ }
+ else
+ {
+ error << sub << ": tenant_service does not exist";
+ return true;
+ }
+
+ // Verify the completed flag and the number of check runs.
+ //
+ if (!sd.completed)
+ {
+ error << sub << " service data complete flag is false";
+ return true;
+ }
+
+ // Received count will be one higher because we don't store the conclusion
+ // check run.
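+ // (E.g., three stored build check runs mean GitHub should report four.)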
+ //
+ size_t check_runs_count (sd.check_runs.size () + 1);
+
+ if (check_runs_count == 1)
+ {
+ error << sub << ": no check runs in service data";
+ return true;
+ }
+
+ if (cs.check_suite.check_runs_count != check_runs_count)
+ {
+ error << sub << ": check runs count " << cs.check_suite.check_runs_count
+ << " does not match service data count " << check_runs_count;
+ return true;
+ }
+
+ // Verify that all the check runs are built and compute the summary
+ // conclusion.
+ //
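+ // Note: this assumes result_status's operator|= aggregates toward the
+ // worse of the two statuses (e.g., success | warning == warning and
+ // warning | error == error).
+ //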
+ result_status conclusion (result_status::success);
+
+ for (const check_run& cr: sd.check_runs)
+ {
+ if (cr.state == build_state::built)
+ {
+ assert (cr.status.has_value ());
+ conclusion |= *cr.status;
+ }
+ else
+ {
+ error << sub << ": unbuilt check run in service data";
+ return true;
+ }
+ }
+
+ // Verify the conclusion.
+ //
+ if (!cs.check_suite.conclusion)
+ {
+ error << sub << ": absent conclusion in completed check suite";
+ return true;
+ }
+
+ string gh_conclusion (gh_to_conclusion (conclusion, warning_success));
+
+ if (*cs.check_suite.conclusion != gh_conclusion)
+ {
+ error << sub << ": conclusion " << *cs.check_suite.conclusion
+ << " does not match service data conclusion " << gh_conclusion;
+ return true;
+ }
+
+ return true;
+ }
+
+ // Create a gq_built_result.
+ //
+ // Throw invalid_argument in case of invalid result_status.
+ //
+ static gq_built_result
+ make_built_result (result_status rs, bool warning_success, string message)
+ {
+ return {gh_to_conclusion (rs, warning_success),
+ circle (rs) + ' ' + ucase (to_string (rs)),
+ move (message)};
+ }
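+
+ // For example (the exact circle character and conclusion string are
+ // determined by circle() and gh_to_conclusion(), respectively):
+ //
+ //   make_built_result (result_status::error, warning_success,
+ //                      "Some configurations failed to build");
+ //
+ //   => {"FAILURE", "<circle> ERROR", "Some configurations failed to build"}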
+
// Parse a check run details URL into a build_id.
//
// Return nullopt if the URL is invalid.
@@ -637,6 +793,12 @@ namespace brep
static optional<build_id>
parse_details_url (const string& details_url);
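
// For example (the host and parameter values are made up for illustration):
//
// https://brep.example.org/@d2586f57-21dc-40b7-beb2-6517ad7917dd?builds=libhello&pv=1.2.3&tg=x86_64-linux-gnu&tc=linux-gcc&pc=default&th=public-0.17.0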
+ // Note that GitHub always posts a message to their GUI saying "You have
+ // successfully requested <check_run_name> be rerun", regardless of what
+ // HTTP status code we respond with. However we do return error status codes
+ // when there is no better option (like failing the conclusion) in case they
+ // start handling them someday.
+ //
bool ci_github::
handle_check_run_rerequest (const gh_check_run_event& cr,
bool warning_success)
@@ -645,20 +807,40 @@ namespace brep
l3 ([&]{trace << "check_run event { " << cr << " }";});
- // Fail if this is the conclusion check run.
+ // The overall plan is as follows:
//
- if (cr.check_run.name == conclusion_check_run_name)
- {
- // @@ Fail conclusion check run with appropriate message and reurn
- // true.
-
- l3 ([&]{trace << "ignoring conclusion check_run";});
-
- // 422 Unprocessable Content: The request was well-formed (i.e.,
- // syntactically correct) but could not be processed.
- //
- throw invalid_request (422, "Conclusion check run cannot be rebuilt");
- }
+ // 1. Load service data.
+ //
+ // 2. If the tenant is archived, then fail (re-create) both the check run
+ // and the conclusion with appropriate diagnostics.
+ //
+ // 3. If the check run is in the queued state, then do nothing.
+ //
+ // 4. Re-create the check run in the queued state and the conclusion in
+ // the building state. Note: do in a single request to make sure we
+ // either "win" or "loose" the potential race for both (important
+ // for #7).
+ //
+ // 5. Call the rebuild() function to attempt to schedule a rebuild. Pass
+ // the update function that does the following (if called):
+ //
+ // a. Save new node ids.
+ //
+ // b. Update the check run state (may also not exist).
+ //
+ // c. Clear the completed flag if true.
+ //
+ // 6. If the result of rebuild() indicates the tenant is archived, then
+ // fail (update) both the check run and conclusion with appropriate
+ // diagnostics.
+ //
+ // 7. If the original state is queued (no rebuild was scheduled), then fail
+ // (update) both the check run and the conclusion.
+ //
+ // Note that while conceptually we are updating existing check runs, in
+ // practice we have to re-create them as new check runs in order to replace
+ // the existing ones because GitHub does not allow transitioning out of the
+ // built state.
// Get a new installation access token.
//
@@ -680,77 +862,156 @@ namespace brep
return iat;
};
- // Create a new conclusion check run, replacing the existing one.
+ const string& repo_node_id (cr.repository.node_id);
+ const string& head_sha (cr.check_run.check_suite.head_sha);
+
+ // Prepare the build and conclusion check runs. They are sent to GitHub in
+ // a single request (unless something goes wrong) so store them together
+ // from the outset.
//
- // Return the check run on success or nullopt on failure.
+ vector<check_run> check_runs (2);
+ check_run& bcr (check_runs[0]); // Build check run
+ check_run& ccr (check_runs[1]); // Conclusion check run
+
+ ccr.name = conclusion_check_run_name;
+
+ // Load the service data, failing the check runs if the tenant has been
+ // archived.
//
- auto create_conclusion_cr =
- [&cr, &error, warning_success] (const gh_installation_access_token& iat,
- build_state bs,
- optional<result_status> rs = nullopt,
- optional<string> msg = nullopt)
- -> optional<check_run>
+ service_data sd;
+ string tenant_id;
{
- optional<gq_built_result> br;
- if (rs)
+ // Service id that uniquely identifies the CI tenant.
+ //
+ string sid (repo_node_id + ':' + head_sha);
+
+ if (optional<tenant_data> d = find (*build_db_, "ci-github", sid))
{
- assert (msg);
+ if (d->archived) // Tenant is archived
+ {
+ // Fail (re-create) the check runs.
+ //
+ optional<gh_installation_access_token> iat (get_iat ());
+ if (!iat)
+ throw server_error ();
+
+ gq_built_result br (
+ make_built_result (
+ result_status::error, warning_success,
+ "Unable to rebuild individual configuration: build has "
+ "been archived"));
+
+ // Try to update the conclusion check run even if the first update
+ // fails.
+ //
+ bool f (false); // Failed.
+
+ if (gq_create_check_run (error, bcr, iat->token,
+ repo_node_id, head_sha,
+ cr.check_run.details_url,
+ build_state::built, br))
+ {
+ l3 ([&]{trace << "created check_run { " << bcr << " }";});
+ }
+ else
+ {
+ error << "check_run " << cr.check_run.node_id
+ << ": unable to re-create check run";
+ f = true;
+ }
- br = gq_built_result (gh_to_conclusion (*rs, warning_success),
- circle (*rs) + ' ' + ucase (to_string (*rs)),
- move (*msg));
+ if (gq_create_check_run (error, ccr, iat->token,
+ repo_node_id, head_sha,
+ nullopt /* details_url */,
+ build_state::built, move (br)))
+ {
+ l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ error << "check_run " << cr.check_run.node_id
+ << ": unable to re-create conclusion check run";
+ f = true;
+ }
+
+ // Fail the handler if either of the check runs could not be
+ // updated.
+ //
+ if (f)
+ throw server_error ();
+
+ return true;
+ }
+
+ tenant_service& ts (d->service);
+
+ try
+ {
+ sd = service_data (*ts.data);
+ }
+ catch (const invalid_argument& e)
+ {
+ fail << "failed to parse service data: " << e;
+ }
+
+ tenant_id = d->tenant_id;
}
+ else
+ {
+ // No such tenant.
+ //
+ fail << "check run " << cr.check_run.node_id
+ << " re-requested but tenant_service with id " << sid
+ << " does not exist";
+ }
+ }
- check_run r;
- r.name = conclusion_check_run_name;
+ // Get a new IAT if the one from the service data has expired.
+ //
+ const gh_installation_access_token* iat (nullptr);
+ optional<gh_installation_access_token> new_iat;
- if (gq_create_check_run (error, r, iat.token,
- rni, hs,
+ if (system_clock::now () > sd.installation_access.expires_at)
+ {
+ if ((new_iat = get_iat ()))
+ iat = &*new_iat;
+ else
+ throw server_error ();
+ }
+ else
+ iat = &sd.installation_access;
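+
+ // Either way iat now points at a valid token: the one from the service
+ // data or the freshly obtained new_iat.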
+
+ // Fail if it's the conclusion check run that is being re-requested.
+ //
+ if (cr.check_run.name == conclusion_check_run_name)
+ {
+ l3 ([&]{trace << "re-requested conclusion check_run";});
+
+ if (!sd.conclusion_node_id)
+ fail << "no conclusion node id for check run " << cr.check_run.node_id;
+
+ gq_built_result br (
+ make_built_result (result_status::error, warning_success,
+ "Conclusion check run cannot be rebuilt"));
+
+ // Fail (update) the conclusion check run.
+ //
+ if (gq_update_check_run (error, ccr, iat->token,
+ repo_node_id, *sd.conclusion_node_id,
nullopt /* details_url */,
- bs, move (br)))
+ build_state::built, move (br)))
{
- return r;
+ l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
}
else
- return nullopt;
- };
+ {
+ fail << "check run " << cr.check_run.node_id
+ << ": unable to update conclusion check run "
+ << *sd.conclusion_node_id;
+ }
- // The overall plan is as follows:
- //
- // 1. Call the rebuild() function to attempt to schedule a rebuild. Pass
- // the update function that does the following (if called):
- //
- // a. Update the check run being rebuilt (may also not exist).
- //
- // b. Clear the completed flag if true.
- //
- // c. "Return" the service data to be used after the call.
- //
- // 2. If the result of rebuild() indicates the tenant is archived, fail
- // the conclusion check run with appropriate diagnostics.
- //
- // 3. If original state is queued, then no rebuild was scheduled and we do
- // nothing.
- //
- // 4. Otherwise (the original state is building or built):
- //
- // a. Change the check run state to queued.
- //
- // b. Change the conclusion check run to building (do unconditionally
- // to mitigate races).
- //
- // Note that while conceptually we are updating existing check runs, in
- // practice we have to create new check runs to replace the existing ones
- // because GitHub does not allow transitioning out of the built state.
- //
- // This results in a new node id for each check run but we can't save them
- // to the service data after the rebuild() call. As a workaround, when
- // updating the service data we 1) clear the re-requested check run's node
- // id and set the state_synced flag to true to signal to build_building()
- // and build_built() that it needs to create a new check run; and 2) clear
- // the conclusion check run's node id to cause build_built() to create a
- // new conclusion check run. And these two check runs' node ids will be
- // saved to the service data.
+ return true;
+ }
// Parse the check_run's details_url to extract build id.
//
@@ -766,28 +1027,101 @@ namespace brep
<< ": failed to extract build id from details_url";
}
- // The IAT retrieved from the service data.
+ // Initialize the check run (`bcr`) with state from the service data.
//
- optional<gh_installation_access_token> iat;
+ {
+ // Search for the check run in the service data.
+ //
+ // Note that we look by name in case node id got replaced by a racing
+ // re-request (in which case we ignore this request).
+ //
+ auto i (find_if (sd.check_runs.begin (), sd.check_runs.end (),
+ [&cr] (const check_run& scr)
+ {
+ return scr.name == cr.check_run.name;
+ }));
+
+ if (i == sd.check_runs.end ())
+ fail << "check_run " << cr.check_run.node_id
+ << " (" << cr.check_run.name << "): "
+ << "re-requested but does not exist in service data";
- // True if the check run exists in the service data.
+ // Do nothing if node ids don't match.
+ //
+ if (i->node_id && *i->node_id != cr.check_run.node_id)
+ {
+ l3 ([&]{trace << "check_run " << cr.check_run.node_id
+ << " (" << cr.check_run.name << "): "
+ << "node id has changed in service data";});
+ return true;
+ }
+
+ // Do nothing if the build is already queued.
+ //
+ if (i->state == build_state::queued)
+ {
+ l3 ([&]{trace << "ignoring already-queued check run";});
+ return true;
+ }
+
+ bcr.name = i->name;
+ bcr.build_id = i->build_id;
+ bcr.state = i->state;
+ }
+
+ // Transition the build and conclusion check runs out of the built state
+ // (or any other state) by re-creating them.
//
- bool cr_found (false);
+ bcr.state = build_state::queued;
+ bcr.state_synced = false;
+ bcr.details_url = cr.check_run.details_url;
- // Update the state of the check run in the service data. Return (via
- // captured references) the IAT and whether the check run was found.
+ ccr.state = build_state::building;
+ ccr.state_synced = false;
+
+ if (gq_create_check_runs (error, check_runs, iat->token,
+ repo_node_id, head_sha))
+ {
+ assert (bcr.state == build_state::queued);
+ assert (ccr.state == build_state::building);
+
+ l3 ([&]{trace << "created check_run { " << bcr << " }";});
+ l3 ([&]{trace << "created conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ fail << "check run " << cr.check_run.node_id
+ << ": unable to re-create build and conclusion check runs";
+ }
+
+ // Request the rebuild and update service data.
//
- // Called by rebuild(), but only if the build is actually restarted.
+ bool race (false);
+
+ // Callback function called by rebuild() to update the service data (but
+ // only if the build is actually restarted).
//
- auto update_sd = [&iat,
- &cr_found,
- &error,
- &cr] (const tenant_service& ts, build_state)
- -> optional<string>
+ auto update_sd = [&error, &new_iat, &race,
+ tenant_id = move (tenant_id),
+ &cr, &bcr, &ccr] (const string& ti,
+ const tenant_service& ts,
+ build_state) -> optional<string>
{
// NOTE: this lambda may be called repeatedly (e.g., due to transaction
// being aborted) and so should not move out of its captures.
+ race = false; // Reset.
+
+ if (tenant_id != ti)
+ {
+ // The tenant got replaced since we loaded it but we managed to
+ // trigger a rebuild in the new tenant. Who knows whose check runs are
+ // visible, so let's fail ours similar to the cases below.
+ //
+ race = true;
+ return nullopt;
+ }
+
service_data sd;
try
{
@@ -796,148 +1130,138 @@ namespace brep
catch (const invalid_argument& e)
{
error << "failed to parse service data: " << e;
- return nullptr;
+ return nullopt;
}
- if (!iat)
- iat = sd.installation_access;
-
- // If the re-requested check run is found, update it in the service
- // data.
+ // Note that we again look by name in case node id got replaced by a
+ // racing re-request. In this case, however, it's impossible to decide
+ // who won that race, so let's fail the check suite to be on the safe
+ // side (in a sense, similar to the rebuild() returning queued below).
//
- const string& nid (cr.check_run.node_id);
+ auto i (find_if (
+ sd.check_runs.begin (), sd.check_runs.end (),
+ [&cr] (const check_run& scr)
+ {
+ return scr.name == cr.check_run.name;
+ }));
- for (check_run& cr: sd.check_runs)
+ if (i == sd.check_runs.end ())
{
- if (cr.node_id && *cr.node_id == nid)
- {
- cr_found = true;
- cr.state = build_state::queued;
- sd.completed = false;
-
- // Clear the check run node ids and set state_synced to true to
- // cause build_building() and/or build_built() to create new check
- // runs (see the plan above for details).
- //
- cr.node_id = nullopt;
- cr.state_synced = true;
- sd.conclusion_node_id = nullopt;
+ error << "check_run " << cr.check_run.node_id
+ << " (" << cr.check_run.name << "): "
+ << "re-requested but does not exist in service data";
+ return nullopt;
+ }
- return sd.json ();
- }
+ if (i->node_id && *i->node_id != cr.check_run.node_id)
+ {
+ // Keep the old conclusion node id to make sure any further state
+ // transitions are ignored. A bit of a hack.
+ //
+ race = true;
+ return nullopt;
}
- return nullopt;
+ *i = bcr; // Update with new node_id, state, state_synced.
+
+ sd.conclusion_node_id = ccr.node_id;
+ sd.completed = false;
+
+ // Save the IAT if we created a new one.
+ //
+ if (new_iat)
+ sd.installation_access = *new_iat;
+
+ return sd.json ();
};
optional<build_state> bs (rebuild (*build_db_, retry_, *bid, update_sd));
- if (!bs)
+ // If the build has been archived or re-enqueued since we loaded the
+ // service data, fail (by updating) both the build check run and the
+ // conclusion check run. Otherwise the build has been successfully
+ // re-enqueued so do nothing further.
+ //
+ if (!race && bs && *bs != build_state::queued)
+ return true;
+
+ gq_built_result br; // Built result for both check runs.
+
+ if (race || bs) // Race or re-enqueued.
{
- // Build has expired (most probably the tenant has been archived).
+ // The re-enqueued case: this build has been re-enqueued since we first
+ // loaded the service data. This could happen if the user clicked
+ // "re-run" multiple times and another handler won the rebuild() race.
//
- // Update the conclusion check run to notify the user (but have to
- // replace it with a new one because we don't know the existing one's
- // node id).
+ // However the winner of the check runs race cannot be determined.
//
- optional<gh_installation_access_token> iat (get_iat ());
- if (!iat)
- throw server_error ();
-
- if (optional<check_run> ccr = create_conclusion_cr (
- *iat,
- build_state::built,
- result_status::error,
- "Unable to rebuild: tenant has been archived or no such build"))
- {
- l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";});
- }
- else
- {
- // Log the error and return failure to GitHub which will presumably
- // indicate this in its GUI.
- //
- fail << "check run " << cr.check_run.node_id
- << ": unable to create conclusion check run";
- }
- }
- else if (*bs == build_state::queued)
- {
- // The build was already queued so nothing to be done. This might happen
- // if the user clicked "re-run" multiple times before we managed to
- // update the check run.
- }
- else
- {
- // The build has been requeued.
+ // Best case the other handler won the check runs race as well and
+ // thus everything will proceed normally. Our check runs will be
+ // invisible and disregarded.
//
- assert (*bs == build_state::building || *bs == build_state::built);
-
- if (!cr_found)
- {
- // Respond with an error otherwise GitHub will post a message in its
- // GUI saying "you have successfully requested a rebuild of ..."
- //
- fail << "check_run " << cr.check_run.node_id
- << ": build restarted but check run does not exist "
- << "in service data";
- }
-
- // Get a new IAT if the one from the service data has expired.
+ // Worst case we won the check runs race and the other handler's check
+ // runs -- the ones that will be updated by the build_*() notifications
+ // -- are no longer visible, leaving things quite broken.
//
- assert (iat.has_value ());
-
- if (system_clock::now () > iat->expires_at)
- {
- iat = get_iat ();
- if (!iat)
- throw server_error ();
- }
-
- // Update (by replacing) the re-requested and conclusion check runs to
- // queued and building, respectively.
+ // Either way, we fail our check runs. In the best case scenario it
+ // will have no effect; in the worst case scenario it lets the user
+ // know something has gone wrong.
//
- // If either fails we can only log the error but build_building() and/or
- // build_built() should correct the situation (see above for details).
+ br = make_built_result (result_status::error, warning_success,
+ "Unable to rebuild, try again");
+ }
+ else // Archived.
+ {
+ // The build has expired since we loaded the service data. Most likely
+ // the tenant has been archived.
//
+ br = make_built_result (
+ result_status::error, warning_success,
+ "Unable to rebuild individual configuration: build has been archived");
+ }
- // Update re-requested check run.
- //
- {
- check_run ncr; // New check run.
- ncr.name = cr.check_run.name;
+ // Try to update the conclusion check run even if the first update fails.
+ //
+ bool f (false); // Failed.
- if (gq_create_check_run (error,
- ncr,
- iat->token,
- cr.repository.node_id,
- cr.check_run.check_suite.head_sha,
- cr.check_run.details_url,
- build_state::queued))
- {
- l3 ([&]{trace << "created check_run { " << ncr << " }";});
- }
- else
- {
- error << "check_run " << cr.check_run.node_id
- << ": unable to create (to update) check run in queued state";
- }
- }
+ // Fail the build check run.
+ //
+ if (gq_update_check_run (error, bcr, iat->token,
+ repo_node_id, *bcr.node_id,
+ nullopt /* details_url */,
+ build_state::built, br))
+ {
+ l3 ([&]{trace << "updated check_run { " << bcr << " }";});
+ }
+ else
+ {
+ error << "check run " << cr.check_run.node_id
+ << ": unable to update (replacement) check run "
+ << *bcr.node_id;
+ f = true;
+ }
- // Update conclusion check run.
- //
- if (optional<check_run> ccr =
- create_conclusion_cr (*iat, build_state::building))
- {
- l3 ([&]{trace << "created conclusion check_run { " << *ccr << " }";});
- }
- else
- {
- error << "check_run " << cr.check_run.node_id
- << ": unable to create (to update) conclusion check run";
- }
+ // Fail the conclusion check run.
+ //
+ if (gq_update_check_run (error, ccr, iat->token,
+ repo_node_id, *ccr.node_id,
+ nullopt /* details_url */,
+ build_state::built, move (br)))
+ {
+ l3 ([&]{trace << "updated conclusion check_run { " << ccr << " }";});
+ }
+ else
+ {
+ error << "check run " << cr.check_run.node_id
+ << ": unable to update conclusion check run " << *ccr.node_id;
+ f = true;
}
+ // Fail the handler if either of the check runs could not be updated.
+ //
+ if (f)
+ throw server_error ();
+
return true;
}
@@ -959,30 +1283,6 @@ namespace brep
// gets updated with the head commit's SHA and check_suite.pull_requests[]
// will contain all PRs with this branch as head.
//
- // Remaining TODOs
- //
- // - @@ TODO? PR base branch changed (to a different branch)
- //
- // => pull_request(edited)
- //
- // - PR closed @@ TODO
- //
- // Also received if base branch is deleted. (And presumably same for head
- // branch.)
- //
- // => pull_request(closed)
- //
- // Cancel CI?
- //
- // - PR merged @@ TODO
- //
- // => pull_request(merged)
- //
- // => check_suite(PR_base)
- //
- // Probably wouldn't want to CI the base again because the PR CI would've
- // done the equivalent already.
- //
bool ci_github::
handle_pull_request (gh_pull_request_event pr, bool warning_success)
{
@@ -1007,12 +1307,6 @@ namespace brep
l3 ([&]{trace << "installation_access_token { " << *iat << " }";});
- // Note that similar to the branch push case above, while it would have
- // been nice to cancel the previous CI job once the PR head moves (the
- // "synchronize" event), due to the head sharing problem the previous CI
- // job might actually still be relevant (in both local and remote PR
- // cases).
-
// Distinguish between local and remote PRs by comparing the head and base
// repositories' paths.
//
@@ -1021,6 +1315,48 @@ namespace brep
? service_data::local
: service_data::remote);
+ // Note that similar to the branch push case above, while it would have
+ // been nice to cancel the previous CI job once the PR head moves (the
+ // "synchronize" event), due to the head sharing problem the previous CI
+ // job might actually still be relevant (in both local and remote PR
+ // cases). So we only do it for the remote PRs and only if the head is not
+ // shared (via tenant reference counting).
+ //
+ if (kind == service_data::remote && pr.action == "synchronize")
+ {
+ if (pr.before)
+ {
+ // Service id that will uniquely identify the CI tenant.
+ //
+ string sid (pr.repository.node_id + ':' + *pr.before);
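+ //
+ // (Note that this sid identifies the CI tenant of the previous head
+ // commit, which is the one we are attempting to cancel.)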
+
+ if (optional<tenant_service> ts = cancel (error, warn,
+ verb_ ? &trace : nullptr,
+ *build_db_, retry_,
+ "ci-github", sid,
+ true /* ref_count */))
+ {
+ l3 ([&]{trace << "pull request " << pr.pull_request.node_id
+ << ": attempted to cancel CI of previous head commit"
+ << " (ref_count: " << ts->ref_count << ')';});
+ }
+ else
+ {
+ // It's possible that there was no CI for the previous commit for
+ // various reasons (e.g., CI was not enabled).
+ //
+ l3 ([&]{trace << "pull request " << pr.pull_request.node_id
+ << ": failed to cancel CI of previous head commit "
+ << "with tenant_service id " << sid;});
+ }
+ }
+ else
+ {
+ error << "pull request " << pr.pull_request.node_id
+ << ": before commit is missing in synchronize event";
+ }
+ }
+
// Note: for remote PRs the check_sha will be set later, in
// build_unloaded_pre_check(), to test merge commit id.
//
@@ -1082,10 +1418,13 @@ namespace brep
return true;
}
- function<optional<string> (const tenant_service&)> ci_github::
- build_unloaded (tenant_service&& ts,
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
+ build_unloaded (const string& ti,
+ tenant_service&& ts,
const diag_epilogue& log_writer) const noexcept
{
+ // NOTE: this function is noexcept and should not throw.
+
NOTIFICATION_DIAG (log_writer);
service_data sd;
@@ -1100,15 +1439,24 @@ namespace brep
}
return sd.pre_check
- ? build_unloaded_pre_check (move (ts), move (sd), log_writer)
- : build_unloaded_load (move (ts), move (sd), log_writer);
+ ? build_unloaded_pre_check (move (ts), move (sd), log_writer)
+ : build_unloaded_load (ti, move (ts), move (sd), log_writer);
}
- function<optional<string> (const tenant_service&)> ci_github::
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
build_unloaded_pre_check (tenant_service&& ts,
service_data&& sd,
const diag_epilogue& log_writer) const noexcept
+ try
{
+ // NOTE: this function is noexcept and should not throw.
+ //
+ // In a few places where invalid_argument is unlikely to be thrown and/or
+ // would indicate that things are seriously broken we let it propagate to
+ // the function catch block where the pre-check tenant will be canceled
+ // (otherwise we could end up in an infinite loop, e.g., because the
+ // problematic arguments won't change).
+
NOTIFICATION_DIAG (log_writer);
// We get here for PRs only (but both local and remote). The overall
@@ -1134,6 +1482,8 @@ namespace brep
// Request PR pre-check info (triggering the generation of the test merge
// commit on the GitHub's side).
//
+ // Let unlikely invalid_argument propagate (see above).
+ //
optional<gq_pr_pre_check_info> pc (
gq_fetch_pull_request_pre_check_info (error,
sd.installation_access.token,
@@ -1177,7 +1527,7 @@ namespace brep
// Service id that will uniquely identify the CI tenant.
//
- string sid (sd.repository_node_id + ":" + sd.report_sha);
+ string sid (sd.repository_node_id + ':' + sd.report_sha);
// Create an unloaded CI tenant, doing nothing if one already exists
// (which could've been created by a head branch push or another PR
@@ -1193,38 +1543,50 @@ namespace brep
// notifications until (1) we load the tenant, (2) we cancel it, or (3)
// it gets archived after some timeout.
//
- if (auto pr = create (error, warn, verb_ ? &trace : nullptr,
- *build_db_, retry_,
- tenant_service (sid, "ci-github", sd.json ()),
- chrono::seconds (30) /* interval */,
- chrono::seconds (0) /* delay */,
- duplicate_tenant_mode::ignore))
- {
- if (pr->second == duplicate_tenant_result::ignored)
+ try
+ {
+ if (auto pr = create (error, warn, verb_ ? &trace : nullptr,
+ *build_db_, retry_,
+ tenant_service (sid, "ci-github", sd.json ()),
+ chrono::seconds (30) /* interval */,
+ chrono::seconds (0) /* delay */,
+ duplicate_tenant_mode::ignore))
{
- // This PR is sharing a head commit with something else.
- //
- // If this is a local PR then it's probably the branch push, which
- // is expected, so do nothing.
- //
- // If this is a remote PR then it could be anything (branch push,
- // local PR, or another remote PR) which in turn means the CI result
- // may end up being for head, not merge commit. There is nothing we
- // can do about it on our side (the user can enable the head-behind-
- // base protection on their side).
- //
- if (sd.kind == service_data::remote)
+ if (pr->second == duplicate_tenant_result::ignored)
{
- l3 ([&]{trace << "remote pull request " << *sd.pr_node_id
- << ": CI tenant already exists for " << sid;});
+ // This PR is sharing a head commit with something else.
+ //
+ // If this is a local PR then it's probably the branch push, which
+ // is expected, so do nothing.
+ //
+ // If this is a remote PR then it could be anything (branch push,
+ // local PR, or another remote PR) which in turn means the CI
+ // result may end up being for head, not merge commit. There is
+ // nothing we can do about it on our side (the user can enable the
+ // head-behind-base protection on their side).
+ //
+ if (sd.kind == service_data::remote)
+ {
+ l3 ([&]{trace << "remote pull request " << *sd.pr_node_id
+ << ": CI tenant already exists for " << sid;});
+ }
}
}
+ else
+ {
+ error << "pull request " << *sd.pr_node_id
+ << ": failed to create unloaded CI tenant "
+ << "with tenant_service id " << sid;
+
+ // Fall through to cancel.
+ }
}
- else
+ catch (const runtime_error& e) // Database retries exhausted.
{
error << "pull request " << *sd.pr_node_id
- << ": unable to create unloaded CI tenant "
- << "with tenant_service id " << sid;
+ << ": failed to create unloaded CI tenant "
+ << "with tenant_service id " << sid
+ << ": " << e.what ();
// Fall through to cancel.
}
@@ -1232,26 +1594,70 @@ namespace brep
// Cancel the pre-check tenant.
//
- if (!cancel (error, warn, verb_ ? &trace : nullptr,
- *build_db_, retry_,
- ts.type,
- ts.id))
+ try
+ {
+ if (!cancel (error, warn, verb_ ? &trace : nullptr,
+ *build_db_, retry_,
+ ts.type,
+ ts.id))
+ {
+ // Should never happen (no such tenant).
+ //
+ error << "pull request " << *sd.pr_node_id
+ << ": failed to cancel pre-check tenant with tenant_service id "
+ << ts.id;
+ }
+ }
+ catch (const runtime_error& e) // Database retries exhausted.
{
- // Should never happen (no such tenant).
- //
error << "pull request " << *sd.pr_node_id
<< ": failed to cancel pre-check tenant with tenant_service id "
- << ts.id;
+ << ts.id << ": " << e.what ();
+ }
+
+ return nullptr;
+ }
+ catch (const std::exception& e)
+ {
+ NOTIFICATION_DIAG (log_writer);
+ error << "pull request " << *sd.pr_node_id
+ << ": unhandled exception: " << e.what ();
+
+ // Cancel the pre-check tenant otherwise we could end up in an infinite
+ // loop (see top of function).
+ //
+ try
+ {
+ if (cancel (error, warn, verb_ ? &trace : nullptr,
+ *build_db_, retry_,
+ ts.type,
+ ts.id))
+ l3 ([&]{trace << "canceled pre-check tenant " << ts.id;});
+ }
+ catch (const runtime_error& e) // Database retries exhausted.
+ {
+ l3 ([&]{trace << "failed to cancel pre-check tenant " << ts.id << ": "
+ << e.what ();});
}
return nullptr;
}
- function<optional<string> (const tenant_service&)> ci_github::
- build_unloaded_load (tenant_service&& ts,
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
+ build_unloaded_load (const string& tenant_id,
+ tenant_service&& ts,
service_data&& sd,
const diag_epilogue& log_writer) const noexcept
+ try
{
+ // NOTE: this function is noexcept and should not throw.
+ //
+ // In a few places where invalid_argument is unlikely to be thrown and/or
+ // would indicate that things are seriously broken we let it propagate to
+ // the function catch block where the tenant will be canceled (otherwise
+ // we could end up in an infinite loop, e.g., because the problematic
+ // arguments won't change).
+
NOTIFICATION_DIAG (log_writer);
// Load the tenant, which is essentially the same for both branch push and
@@ -1297,6 +1703,8 @@ namespace brep
check_run cr;
cr.name = move (name);
+ // Let unlikely invalid_argument propagate (see above).
+ //
if (gq_create_check_run (error,
cr,
iat->token,
@@ -1323,14 +1731,16 @@ namespace brep
{
assert (!node_id.empty ());
- optional<gq_built_result> br (
- gq_built_result (gh_to_conclusion (rs, sd.warning_success),
- circle (rs) + ' ' + ucase (to_string (rs)),
- move (summary)));
+ // Let unlikely invalid_argument propagate (see above).
+ //
+ gq_built_result br (
+ make_built_result (rs, sd.warning_success, move (summary)));
check_run cr;
cr.name = name; // For display purposes only.
+ // Let unlikely invalid_argument propagate (see above).
+ //
if (gq_update_check_run (error,
cr,
iat->token,
@@ -1357,16 +1767,24 @@ namespace brep
//
string conclusion_node_id; // Conclusion check run node ID.
- if (auto cr = create_synthetic_cr (conclusion_check_run_name))
+ if (!sd.conclusion_node_id)
{
- l3 ([&]{trace << "created check_run { " << *cr << " }";});
+ if (auto cr = create_synthetic_cr (conclusion_check_run_name))
+ {
+ l3 ([&]{trace << "created check_run { " << *cr << " }";});
- conclusion_node_id = move (*cr->node_id);
+ conclusion_node_id = move (*cr->node_id);
+ }
}
+ const string& effective_conclusion_node_id (
+ sd.conclusion_node_id
+ ? *sd.conclusion_node_id
+ : conclusion_node_id);
+
// Load the CI tenant if the conclusion check run was created.
//
- if (!conclusion_node_id.empty ())
+ if (!effective_conclusion_node_id.empty ())
{
string ru; // Repository URL.
@@ -1383,46 +1801,65 @@ namespace brep
else
ru = sd.repository_clone_url + '#' + sd.check_sha;
+ // Let unlikely invalid_argument propagate (see above).
+ //
repository_location rl (move (ru), repository_type::git);
- optional<start_result> r (load (error, warn, verb_ ? &trace : nullptr,
- *build_db_, retry_,
- move (ts),
- move (rl)));
-
- if (!r || r->status != 200)
+ try
{
- if (auto cr = update_synthetic_cr (conclusion_node_id,
- conclusion_check_run_name,
- result_status::error,
- to_check_run_summary (r)))
- {
- l3 ([&]{trace << "updated check_run { " << *cr << " }";});
- }
- else
+ optional<start_result> r (load (error, warn, verb_ ? &trace : nullptr,
+ *build_db_, retry_,
+ move (ts),
+ move (rl)));
+
+ if (!r || r->status != 200)
{
- // Nothing really we can do in this case since we will not receive
- // any further notifications. Log the error as a last resort.
+ // Let unlikely invalid_argument propagate (see above).
+ //
+ if (auto cr = update_synthetic_cr (effective_conclusion_node_id,
+ conclusion_check_run_name,
+ result_status::error,
+ to_check_run_summary (r)))
+ {
+ l3 ([&]{trace << "updated check_run { " << *cr << " }";});
+ }
+ else
+ {
+ // Nothing really we can do in this case since we will not receive
+ // any further notifications. Log the error as a last resort.
- error << "failed to load CI tenant " << ts.id
- << " and unable to update conclusion";
+ error << "failed to load CI tenant " << ts.id
+ << " and unable to update conclusion";
+ }
+
+ return nullptr; // No need to update service data in this case.
}
+ }
+ catch (const runtime_error& e) // Database retries exhausted.
+ {
+ error << "failed to load CI tenant " << ts.id << ": " << e.what ();
- return nullptr; // No need to update service data in this case.
+ // Fall through to retry on next call.
}
}
- else if (!new_iat)
- return nullptr; // Nothing to save (but retry on next call).
+
+ if (!new_iat && conclusion_node_id.empty ())
+ return nullptr; // Nothing to save (but potentially retry on next call).
return [&error,
+ tenant_id,
iat = move (new_iat),
cni = move (conclusion_node_id)]
- (const tenant_service& ts) -> optional<string>
+ (const string& ti,
+ const tenant_service& ts) -> optional<string>
{
// NOTE: this lambda may be called repeatedly (e.g., due to
// transaction being aborted) and so should not move out of its
// captures.
+ if (tenant_id != ti)
+ return nullopt; // Do nothing if the tenant has been replaced.
+
service_data sd;
try
{
@@ -1443,6 +1880,28 @@ namespace brep
return sd.json ();
};
}
+ catch (const std::exception& e)
+ {
+ NOTIFICATION_DIAG (log_writer);
+ error << "CI tenant " << ts.id << ": unhandled exception: " << e.what ();
+
+ // Cancel the tenant otherwise we could end up in an infinite loop (see
+ // top of function).
+ //
+ try
+ {
+ if (cancel (error, warn, verb_ ? &trace : nullptr,
+ *build_db_, retry_, ts.type, ts.id))
+ l3 ([&]{trace << "canceled CI tenant " << ts.id;});
+ }
+ catch (const runtime_error& e) // Database retries exhausted.
+ {
+ l3 ([&]{trace << "failed to cancel CI tenant " << ts.id
+ << ": " << e.what ();});
+ }
+
+ return nullptr;
+ }
// Build state change notifications (see tenant-services.hxx for
// background). Mapping our state transitions to GitHub poses multiple
@@ -1453,9 +1912,9 @@ namespace brep
// them when notifying GitHub. The first is not important (we expect the
// state to go back to building shortly). The second should normally not
// happen and would mean that a completed check suite may go back on its
- // conclusion (which would be pretty confusing for the user). @@@ This
- // can/will happen on check run rebuild. Distinguish between internal
- // and external rebuilds?
+ // conclusion (which would be pretty confusing for the user). Note that
+ // the ->queued state transition of a check run rebuild triggered by
+ // us is handled directly in handle_check_run_rerequest().
//
// So, for GitHub notifications, we only have the following linear
// transition sequence:
@@ -1532,13 +1991,17 @@ namespace brep
// if we have node_id, then we update, otherwise, we create (potentially
// overriding the check run created previously).
//
- function<optional<string> (const tenant_service&)> ci_github::
- build_queued (const tenant_service& ts,
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
+ build_queued (const string& tenant_id,
+ const tenant_service& ts,
const vector<build>& builds,
optional<build_state> istate,
const build_queued_hints& hs,
const diag_epilogue& log_writer) const noexcept
+ try
{
+ // NOTE: this function is noexcept and should not throw.
+
NOTIFICATION_DIAG (log_writer);
service_data sd;
@@ -1638,11 +2101,12 @@ namespace brep
{
// Create a check_run for each build as a single request.
//
+ // Let unlikely invalid_argument propagate.
+ //
if (gq_create_check_runs (error,
crs,
iat->token,
- sd.repository_node_id, sd.report_sha,
- build_state::queued))
+ sd.repository_node_id, sd.report_sha))
{
for (const check_run& cr: crs)
{
@@ -1654,15 +2118,20 @@ namespace brep
}
}
- return [bs = move (bs),
+ return [tenant_id,
+ bs = move (bs),
iat = move (new_iat),
crs = move (crs),
error = move (error),
- warn = move (warn)] (const tenant_service& ts) -> optional<string>
+ warn = move (warn)] (const string& ti,
+ const tenant_service& ts) -> optional<string>
{
// NOTE: this lambda may be called repeatedly (e.g., due to transaction
// being aborted) and so should not move out of its captures.
+ if (tenant_id != ti)
+ return nullopt; // Do nothing if the tenant has been replaced.
+
service_data sd;
try
{
@@ -1702,12 +2171,24 @@ namespace brep
return sd.json ();
};
}
+ catch (const std::exception& e)
+ {
+ NOTIFICATION_DIAG (log_writer);
- function<optional<string> (const tenant_service&)> ci_github::
- build_building (const tenant_service& ts,
+ error << "CI tenant " << ts.id << ": unhandled exception: " << e.what ();
+
+ return nullptr;
+ }
+
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
+ build_building (const string& tenant_id,
+ const tenant_service& ts,
const build& b,
const diag_epilogue& log_writer) const noexcept
+ try
{
+ // NOTE: this function is noexcept and should not throw.
+
NOTIFICATION_DIAG (log_writer);
service_data sd;
@@ -1783,6 +2264,8 @@ namespace brep
//
if (iat != nullptr)
{
+ // Let unlikely invalid_argument propagate.
+ //
if (gq_update_check_run (error,
*cr,
iat->token,
@@ -1806,14 +2289,19 @@ namespace brep
}
}
- return [iat = move (new_iat),
+ return [tenant_id,
+ iat = move (new_iat),
cr = move (*cr),
error = move (error),
- warn = move (warn)] (const tenant_service& ts) -> optional<string>
+ warn = move (warn)] (const string& ti,
+ const tenant_service& ts) -> optional<string>
{
// NOTE: this lambda may be called repeatedly (e.g., due to transaction
// being aborted) and so should not move out of its captures.
+ if (tenant_id != ti)
+ return nullopt; // Do nothing if the tenant has been replaced.
+
service_data sd;
try
{
@@ -1848,18 +2336,31 @@ namespace brep
return sd.json ();
};
}
+ catch (const std::exception& e)
+ {
+ NOTIFICATION_DIAG (log_writer);
+
+ string bid (gh_check_run_name (b)); // Full build id.
+
+ error << "check run " << bid << ": unhandled exception: " << e.what();
+
+ return nullptr;
+ }
- function<optional<string> (const tenant_service&)> ci_github::
- build_built (const tenant_service& ts,
+ function<optional<string> (const string&, const tenant_service&)> ci_github::
+ build_built (const string& tenant_id,
+ const tenant_service& ts,
const build& b,
const diag_epilogue& log_writer) const noexcept
+ try
{
- // @@ TODO Include service_data::event_node_id and perhaps ts.id in
- // diagnostics? E.g. when failing to update check runs we print the
- // build ID only.
- //
+ // NOTE: this function is noexcept and should not throw.
+
NOTIFICATION_DIAG (log_writer);
+ // @@ TODO Include ts.id in diagnostics? Check run build ids alone seem
+ // kind of meaningless. Log lines get pretty long this way however.
+
service_data sd;
try
{
@@ -1978,6 +2479,11 @@ namespace brep
{
using namespace web::xhtml;
+ // Note: let all serialization exceptions propagate. The XML
+ // serialization code can throw bad_alloc or xml::serialization in
+ // case of I/O failures, but we're serializing to a string stream so
+ // both exceptions are unlikely.
+ //
ostringstream os;
xml::serializer s (os, "check_run_summary");
@@ -2069,13 +2575,12 @@ namespace brep
}
gq_built_result br (
- gh_to_conclusion (*b.status, sd.warning_success),
- circle (*b.status) + ' ' + ucase (to_string (*b.status)),
- move (sm));
+ make_built_result (*b.status, sd.warning_success, move (sm)));
if (cr.node_id)
{
- // Update existing check run to built.
+ // Update existing check run to built. Let unlikely invalid_argument
+ // propagate.
//
if (gq_update_check_run (error,
cr,
@@ -2092,7 +2597,7 @@ namespace brep
}
else
{
- // Create new check run.
+ // Create new check run. Let unlikely invalid_argument propagate.
//
// Note that we don't have build hints so will be creating this check
// run with the full build id as name. In the unlikely event that an
@@ -2129,10 +2634,9 @@ namespace brep
result_status rs (*conclusion);
- optional<gq_built_result> br (
- gq_built_result (gh_to_conclusion (rs, sd.warning_success),
- circle (rs) + ' ' + ucase (to_string (rs)),
- "All configurations are built"));
+ gq_built_result br (
+ make_built_result (rs, sd.warning_success,
+ "All configurations are built"));
check_run cr;
@@ -2141,6 +2645,8 @@ namespace brep
cr.node_id = *sd.conclusion_node_id;
cr.name = conclusion_check_run_name;
+ // Let unlikely invalid_argument propagate.
+ //
if (gq_update_check_run (error,
cr,
iat->token,
@@ -2167,15 +2673,20 @@ namespace brep
}
}
- return [iat = move (new_iat),
+ return [tenant_id,
+ iat = move (new_iat),
cr = move (cr),
completed = completed,
error = move (error),
- warn = move (warn)] (const tenant_service& ts) -> optional<string>
+ warn = move (warn)] (const string& ti,
+ const tenant_service& ts) -> optional<string>
{
// NOTE: this lambda may be called repeatedly (e.g., due to transaction
// being aborted) and so should not move out of its captures.
+ if (tenant_id != ti)
+ return nullopt; // Do nothing if the tenant has been replaced.
+
service_data sd;
try
{
@@ -2245,6 +2756,16 @@ namespace brep
return sd.json ();
};
}
+ catch (const std::exception& e)
+ {
+ NOTIFICATION_DIAG (log_writer);
+
+ string bid (gh_check_run_name (b)); // Full build id.
+
+ error << "check run " << bid << ": unhandled exception: " << e.what();
+
+ return nullptr;
+ }
string ci_github::
details_url (const build& b) const
@@ -2270,22 +2791,22 @@ namespace brep
url u (details_url);
- if (!u.query || !u.path || u.path->size () <= 1)
- return nullopt;
-
build_id r;
// Extract the tenant from the URL path.
//
// Example path: @d2586f57-21dc-40b7-beb2-6517ad7917dd
//
- r.package.tenant = u.path->substr (1);
-
- if (r.package.tenant.empty ())
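+ // The path must consist of '@' followed by a 36-character tenant UUID
+ // (37 characters in total).
+ //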
+ if (!u.path || u.path->size () != 37 || (*u.path)[0] != '@')
return nullopt;
+ r.package.tenant = u.path->substr (1);
+
// Extract the rest of the build_id members from the URL query.
//
+ if (!u.query)
+ return nullopt;
+
bool pn (false), pv (false), tg (false), tc (false), pc (false),
th (false);
@@ -2304,21 +2825,25 @@ namespace brep
++vp; // Skip '='
- const char* ve (ep ? ep : vp + strlen (vp)); // Value end pointer.
+ const char* ve (ep != nullptr ? ep : vp + strlen (vp)); // Value end.
// Get the value as-is or URL-decode it.
//
- auto getval = [vp, ve] () { return string (vp, ve); };
+ auto rawval = [vp, ve] () { return string (vp, ve); };
auto decval = [vp, ve] () { return mime_url_decode (vp, ve); };
auto make_version = [] (string&& v)
- { return canonical_version (brep::version (move (v))); };
+ {
+ return canonical_version (brep::version (move (v)));
+ };
auto c = [&n] (bool& b, const char* s)
- { return n == s ? (b = true) : false; };
+ {
+ return n == s ? (b = true) : false;
+ };
if (c (pn, "builds")) r.package.name = package_name (decval ());
- else if (c (pv, "pv")) r.package.version = make_version (getval ());
+ else if (c (pv, "pv")) r.package.version = make_version (rawval ());
else if (c (tg, "tg")) r.target = target_triplet (decval ());
else if (c (tc, "tc")) r.target_config_name = decval ();
else if (c (pc, "pc")) r.package_config_name = decval ();
@@ -2326,7 +2851,7 @@ namespace brep
{
// Toolchain name and version. E.g. "public-0.17.0"
- string v (getval ());
+ string v (rawval ());
// Note: parsing code based on mod/mod-builds.cxx.
//
@@ -2338,7 +2863,7 @@ namespace brep
r.toolchain_version = make_version (v.substr (p + 1));
}
- qp = ep ? ep + 1 : nullptr;
+ qp = ep != nullptr ? ep + 1 : nullptr;
}
if (!pn || !pv || !tg || !tc || !pc || !th)
@@ -2346,7 +2871,7 @@ namespace brep
return r;
}
- catch (const invalid_argument&)
+ catch (const invalid_argument&) // Invalid url, brep::version, etc.
{
return nullopt;
}
@@ -2455,6 +2980,8 @@ namespace brep
//
iat.expires_at -= chrono::minutes (5);
}
+ // gh_installation_access_token (via github_post())
+ //
catch (const json::invalid_json_input& e)
{
// Note: e.name is the GitHub API endpoint.
@@ -2464,12 +2991,12 @@ namespace brep
<< e.position << ", error: " << e;
return nullopt;
}
- catch (const invalid_argument& e)
+ catch (const invalid_argument& e) // github_post()
{
error << "malformed header(s) in response: " << e;
return nullopt;
}
- catch (const system_error& e)
+ catch (const system_error& e) // github_post()
{
error << "unable to get installation access token (errno=" << e.code ()
<< "): " << e.what ();