2 files changed, 95 insertions, 85 deletions
diff --git a/mod/mod-ci-github.cxx b/mod/mod-ci-github.cxx
index edef67a..2e2d160 100644
--- a/mod/mod-ci-github.cxx
+++ b/mod/mod-ci-github.cxx
@@ -938,15 +938,16 @@ namespace brep
 
       const service_data::check_run* cr (sd.find_check_run (bid));
 
-      // @@ TMP This doesn't catch the case of an out-of-order initial queued
-      //        following a building or built when the network was down (and
-      //        thus cr->state is absent). We could end up setting the state
-      //        to queued on GH when it should be building or built.
+      // To keep things simple we are going to queue/create a new check run
+      // only if we have no corresponding state (which means we haven't yet
+      // done anything about this check run).
       //
-      if (cr == nullptr ||
-          !cr->state    ||
-          (cr->state != build_state::built &&
-           (istate && *istate == build_state::building))) // Interrupted.
+      // In particular, this will ignore the building->queued (interrupted)
+      // transition so on GitHub the check run will continue showing as
+      // building, which is probably not a big deal. Also, this sidesteps
+      // various "absent state" corner.
+      //
+      if (cr == nullptr)
       {
         if (cr != nullptr)
           crs.emplace_back (move (*cr)).state = nullopt;
@@ -955,6 +956,10 @@ namespace brep
 
         bs.push_back (b);
       }
+      else if (!cr->state)
+        ; // Ignore network issue.
+      else if (istate && *istate == build_state::building)
+        ; // Ignore interrupted.
       else
       {
         // @@ TODO warn with some information
@@ -966,8 +971,6 @@ namespace brep
 
     // Queue a check_run for each build.
     //
-    // @@ TODO updateCheckRun() if this is building->queued.
-    //
     string rq (
       graphql_request (
         queue_check_runs (sd.repository_id, sd.head_sha, bs, &hs)));
@@ -979,7 +982,7 @@ namespace brep
 
     // Get a new installation access token if the current one has expired.
     //
-    const installation_access_token* iat;
+    const installation_access_token* iat (nullptr);
     optional<installation_access_token> new_iat;
 
     if (system_clock::now () > sd.installation_access.expires_at)
@@ -998,87 +1001,85 @@ namespace brep
       iat = &sd.installation_access;
 
     if (iat != nullptr)
+    try
     {
-      try
+      // Response type which parses a GraphQL response containing multiple
+      // check_run objects.
+      //
+      struct resp
       {
-        // Response type which parses a GraphQL response containing multiple
-        // check_run objects.
-        //
-        struct resp
-        {
-          vector<check_run> check_runs; // Received check runs.
+        vector<check_run> check_runs; // Received check runs.
 
-          resp (json::parser& p) : check_runs (parse_check_runs_response (p)) {}
+        resp (json::parser& p) : check_runs (parse_check_runs_response (p)) {}
 
-          resp () = default;
-        } rs;
+        resp () = default;
+      } rs;
 
-        uint16_t sc (
-          github_post (
-            rs,
-            "graphql", // API Endpoint.
-            strings {"Authorization: Bearer " + iat->token},
-            move (rq)));
+      uint16_t sc (
+        github_post (
+          rs,
+          "graphql", // API Endpoint.
+          strings {"Authorization: Bearer " + iat->token},
+          move (rq)));
 
-        if (sc == 200)
+      if (sc == 200)
+      {
+        if (rs.check_runs.size () == bs.size ())
         {
-          if (rs.check_runs.size () == bs.size ())
+          // Validate the check runs in the response against the builds.
+          //
+          for (size_t i (0); i != rs.check_runs.size (); ++i)
           {
-            // Validate the check runs in the response against the builds.
-            //
-            for (size_t i (0); i != rs.check_runs.size (); ++i)
+            const build& b (bs[i]);
+            const check_run& rcr (rs.check_runs[i]); // Received check run.
+
+            if (rcr.name != check_run_name (b, &hs))
+              error << "unexpected check_run name: '" + rcr.name + '\'';
+            else if (rcr.status != "QUEUED")
+              error << "unexpected check_run status: '" + rcr.status + '\'';
+            else
             {
-              const build& b (bs[i]);
-              const check_run& rcr (rs.check_runs[i]); // Received check run.
+              l3 ([&]{trace << "check_run { " << rcr << " }";});
 
-              if (rcr.name != check_run_name (b, &hs))
-                error << "unexpected check_run name: '" + rcr.name + '\'';
-              else if (rcr.status != "QUEUED")
-                error << "unexpected check_run status: '" + rcr.status + '\'';
-              else
-              {
-                l3 ([&]{trace << "check_run { " << rcr << " }";});
+              service_data::check_run& cr (crs[i]);
 
-                service_data::check_run& cr (crs[i]);
+              if (!cr.node_id)
+                cr.node_id = move (rcr.node_id);
 
-                if (!cr.node_id)
-                  cr.node_id = move (rcr.node_id);
-
-                cr.state = build_state::queued;
-              }
+              cr.state = build_state::queued;
             }
           }
-          else
-            error << "unexpected number of check_run objects in response";
         }
         else
-          error << "failed to queue check runs: error HTTP response status "
-                << sc;
-      }
-      catch (const json::invalid_json_input& e)
-      {
-        // Note: e.name is the GitHub API endpoint.
-        //
-        error << "malformed JSON in response from " << e.name << ", line: "
-              << e.line << ", column: " << e.column << ", byte offset: "
-              << e.position << ", error: " << e;
-      }
-      catch (const invalid_argument& e)
-      {
-        error << "malformed header(s) in response: " << e;
-      }
-      catch (const system_error& e)
-      {
-        error << "unable to queue check runs (errno=" << e.code () << "): "
-              << e.what ();
-      }
-      catch (const runtime_error& e) // From parse_check_runs_response().
-      {
-        // GitHub response contained error(s) (could be ours or theirs at this
-        // point).
-        //
-        error << "unable to queue check runs: " << e;
+          error << "unexpected number of check_run objects in response";
       }
+      else
+        error << "failed to queue check runs: error HTTP response status "
+              << sc;
+    }
+    catch (const json::invalid_json_input& e)
+    {
+      // Note: e.name is the GitHub API endpoint.
+      //
+      error << "malformed JSON in response from " << e.name << ", line: "
+            << e.line << ", column: " << e.column << ", byte offset: "
+            << e.position << ", error: " << e;
+    }
+    catch (const invalid_argument& e)
+    {
+      error << "malformed header(s) in response: " << e;
+    }
+    catch (const system_error& e)
+    {
+      error << "unable to queue check runs (errno=" << e.code () << "): "
+            << e.what ();
+    }
+    catch (const runtime_error& e) // From parse_check_runs_response().
+    {
+      // GitHub response contained error(s) (could be ours or theirs at this
+      // point).
+      //
+      error << "unable to queue check runs: " << e;
     }
 
     return [bs = move (bs),
@@ -1095,7 +1096,7 @@ namespace brep
       }
       catch (const invalid_argument& e)
       {
-        // @@ TODO
+        // @@ TODO: try to move error into this lambda?
         // error << "failed to parse service data: " << e;
         return nullopt;
       }
@@ -1103,22 +1104,22 @@ namespace brep
       if (iat)
         sd.installation_access = *iat;
 
-      // Note that we've already removed all builds for which this
-      // notification is out of order.
+      // Note that we've already ignored all the builds for which this
+      // notification was out of order.
       //
       for (size_t i (0); i != bs.size (); ++i)
       {
         const service_data::check_run& cr (crs[i]);
 
-        // Update stored check run if it exists, otherwise store check run for
-        // the first time.
+        // Note that this service data may not be the same as what we observed
+        // in the build_queued() function above. For example, some check runs
+        // that we have queued may have already transitioned to building. So
+        // we skip any check runs that are already present.
         //
         if (service_data::check_run* scr = sd.find_check_run (cr.build_id))
         {
-          if (!scr->node_id)
-            scr->node_id = cr.node_id;
-
-          scr->state = cr.state;
+          // @@ TODO: let's warn about this out of order case to see how
+          //          often we get it.
         }
         else
           sd.check_runs.push_back (cr);
@@ -1132,6 +1133,14 @@ namespace brep
   build_building (const tenant_service&, const build&,
                   const diag_epilogue& /* log_writer */) const noexcept
   {
+    // Note that we may receive this notification before the corresponding
+    // check run object has been persistent in the service data (see the
+    // returned by build_queued() lambda for details). So we may try to create
+    // a new check run but find out that it's actually already exist on the
+    // GitHub side. In this case we will somehow need to get the node_id for
+    // the existing check run and re-try but this time updating instead of
+    // creating.
+
     return nullptr;
   }
 
diff --git a/mod/tenant-service.hxx b/mod/tenant-service.hxx
index b7f5c02..d0cd6ea 100644
--- a/mod/tenant-service.hxx
+++ b/mod/tenant-service.hxx
@@ -71,7 +71,8 @@ namespace brep
     // service data. It should return the new data or nullopt if no update is
     // necessary. Note: tenant_service::data passed to the callback and to the
     // returned function may not be the same. Also, the returned function may
-    // be called multiple times (on transaction retries).
+    // be called multiple times (on transaction retries). Note that the passed
+    // log_writer is valid during the calls to the returned function.
     //
     // The passed initial_state indicates the logical initial state and is
     // either absent, `building` (interrupted), or `built` (rebuild). Note