1 files changed, 80 insertions, 35 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 9aba216..ad8cd74 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -2133,29 +2133,40 @@ try
 
       // Determine which machines we need to offer for this priority.
       //
-      for (const bootstrapped_machine& m: ms)
       {
-        if (!m.lock.locked ())
+        bool interruptable (false);
+        for (const bootstrapped_machine& m: ms)
         {
-          if (!m.lock.prio) // Skip bootstrapping/suspended.
-            continue;
+          if (!m.lock.locked ())
+          {
+            if (!m.lock.prio) // Skip bootstrapping/suspended.
+              continue;
 
-          uint64_t eprio (*m.lock.prio);
+            uint64_t eprio (*m.lock.prio);
 
-          // Determine if our priority can interrupt the existing task.
-          //
-          // Based on the above discussion of the priority lower bound
-          // determination (and some menditation) it's clear that we can only
-          // interrupt the existing task if our priority is (at least) on a
-          // higher 3rd level.
-          //
-          if ((prio / 100) <= (eprio / 100))
-            continue;
+            // Determine if our priority can interrupt the existing task.
+            //
+            // Based on the above discussion of the priority lower bound
+            // determination (and some menditation) it's clear that we can
+            // only interrupt the existing task if our priority is (at least)
+            // on a higher 3rd level.
+            //
+            if ((prio / 100) <= (eprio / 100))
+              continue;
+
+            interruptable = true;
+          }
+
+          tq.machines.emplace_back (m.manifest.machine.id,
+                                    m.manifest.machine.name,
+                                    m.manifest.machine.summary);
         }
 
-        tq.machines.emplace_back (m.manifest.machine.id,
-                                  m.manifest.machine.name,
-                                  m.manifest.machine.summary);
+        // Sanity check: in the priority monitor mode we should only ask for a
+        // task if we can interrupt one (this should be taken care of by the
+        // priority lower bound calculation above).
+        //
+        assert (!prio_mon || interruptable);
       }
 
       if (ops.dump_machines ())
@@ -2370,10 +2381,12 @@ try
       // Next find the corresponding bootstrapped_machine instance in ms. Also
       // unlock all the other machines.
       //
-      // While at it also find the lowest priority candidate to interrupt if
-      // necessary.
+      // While at it also see if we need to interrupt the selected machine (if
+      // busy), one of the existing (if we are at the max allowed instances,
+      // that is in the priority monitor mode), or all existing (if this is a
+      // priority level 4 task).
       //
-      bootstrapped_machine* im (nullptr);
+      vector<bootstrapped_machine*> ims;
       for (bootstrapped_machine& m: ms)
       {
         if (m.manifest.machine.name == t.machine)
@@ -2385,12 +2398,37 @@ try
           m.lock.unlock ();
         else if (m.lock.prio) // Not bootstrapping/suspended.
         {
-          if (im == nullptr || *m.lock.prio < *im->lock.prio)
-            im = &m;
+          // Only consider machines that we can interrupt (see above).
+          //
+          if ((prio / 100) > (*m.lock.prio / 100))
+          {
+            if (prio >= 1000) // Priority level 4 (interrupt all).
+              ims.push_back (&m);
+            else if (prio_mon)
+            {
+              // Find the lowest priority task to interrupt.
+              //
+              if (ims.empty ())
+                ims.push_back (&m);
+              else if (*m.lock.prio < *ims.back ()->lock.prio)
+                ims.back () = &m;
+            }
+          }
         }
       }
+
       assert (pm != nullptr);
 
+      if (!pm->lock.locked ())
+      {
+        if (prio >= 1000)
+          ims.insert (ims.begin (), pm); // Interrupt first (see below).
+        else
+          ims = {pm};
+      }
+
+      assert (!prio_mon || !ims.empty ()); // We should have at least one.
+
       // Move the toolchain lock into this scope so that it's automatically
       // released on any failure (on the happy path it is released by
       // perform_task()).
@@ -2398,25 +2436,25 @@ try
       toolchain_lock& rtl (tl);
       toolchain_lock tl (move (rtl));
 
-      // See if we need to interrupt the selected machine (if busy) or one of
-      // the existing (if we are at the max allowed instances, that is in the
-      // priority monitor mode).
+      // Interrupt the machines, if necessary.
       //
-      if (!pm->lock.locked ())
-        im = pm;
-      else if (prio_mon)
-        assert (im != nullptr); // We should have at least one.
-      else
-        im = nullptr; // No interrupt necessary.
-
-      if (im != nullptr)
+      // Note that if we are interrupting multiple machines, then the target
+      // machine, if needs to be interrupted, must be first. This way if we
+      // are unable to successfully interrupt it, we don't interrupt the rest.
+      //
+      for (bootstrapped_machine* im: ims)
       {
-        assert (!im->lock.locked () && im->lock.prio); // Sanity checks.
+        bool first (im == ims.front ());
+
+        // Sanity checks.
+        //
+        assert (!im->lock.locked () && im->lock.prio);
+        assert (im != pm || first);
 
         const dir_path& tp (im->path); // -<toolchain> path.
 
         l1 ([&]{trace << "interrupting "
-                      << (im == pm ? "target" : "lowest priority")
+                      << (im == pm ? "target" : "lower priority")
                       << " machine " << tp << ", pid " << im->lock.pid;});
 
         // The plan is to send the interrupt and then wait for the lock.
@@ -2439,6 +2477,13 @@ try
             throw_generic_error (errno);
         }
 
+        // If we are interrupting multiple machine, there is no use acquiring
+        // the lock (or failing if unable to) for subsequent machines since
+        // this is merely a performance optimization.
+        //
+        if (!first)
+          continue;
+
         // Try to lock the machine.
         //
         // While this normally shouldn't take long, there could be parts of