aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- bbot/agent/agent.cxx 115
1 files changed, 80 insertions, 35 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 9aba216..ad8cd74 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -2133,29 +2133,40 @@ try
// Determine which machines we need to offer for this priority.
//
- for (const bootstrapped_machine& m: ms)
{
- if (!m.lock.locked ())
+ bool interruptable (false);
+ for (const bootstrapped_machine& m: ms)
{
- if (!m.lock.prio) // Skip bootstrapping/suspended.
- continue;
+ if (!m.lock.locked ())
+ {
+ if (!m.lock.prio) // Skip bootstrapping/suspended.
+ continue;
- uint64_t eprio (*m.lock.prio);
+ uint64_t eprio (*m.lock.prio);
- // Determine if our priority can interrupt the existing task.
- //
- // Based on the above discussion of the priority lower bound
- // determination (and some menditation) it's clear that we can only
- // interrupt the existing task if our priority is (at least) on a
- // higher 3rd level.
- //
- if ((prio / 100) <= (eprio / 100))
- continue;
+ // Determine if our priority can interrupt the existing task.
+ //
+ // Based on the above discussion of the priority lower bound
+ // determination (and some meditation) it's clear that we can
+ // only interrupt the existing task if our priority is (at least)
+ // on a higher 3rd level.
+ //
+ if ((prio / 100) <= (eprio / 100))
+ continue;
+
+ interruptable = true;
+ }
+
+ tq.machines.emplace_back (m.manifest.machine.id,
+ m.manifest.machine.name,
+ m.manifest.machine.summary);
}
- tq.machines.emplace_back (m.manifest.machine.id,
- m.manifest.machine.name,
- m.manifest.machine.summary);
+ // Sanity check: in the priority monitor mode we should only ask for a
+ // task if we can interrupt one (this should be taken care of by the
+ // priority lower bound calculation above).
+ //
+ assert (!prio_mon || interruptable);
}
if (ops.dump_machines ())
@@ -2370,10 +2381,12 @@ try
// Next find the corresponding bootstrapped_machine instance in ms. Also
// unlock all the other machines.
//
- // While at it also find the lowest priority candidate to interrupt if
- // necessary.
+ // While at it also see if we need to interrupt the selected machine (if
+ // busy), one of the existing (if we are at the max allowed instances,
+ // that is in the priority monitor mode), or all existing (if this is a
+ // priority level 4 task).
//
- bootstrapped_machine* im (nullptr);
+ vector<bootstrapped_machine*> ims;
for (bootstrapped_machine& m: ms)
{
if (m.manifest.machine.name == t.machine)
@@ -2385,12 +2398,37 @@ try
m.lock.unlock ();
else if (m.lock.prio) // Not bootstrapping/suspended.
{
- if (im == nullptr || *m.lock.prio < *im->lock.prio)
- im = &m;
+ // Only consider machines that we can interrupt (see above).
+ //
+ if ((prio / 100) > (*m.lock.prio / 100))
+ {
+ if (prio >= 1000) // Priority level 4 (interrupt all).
+ ims.push_back (&m);
+ else if (prio_mon)
+ {
+ // Find the lowest priority task to interrupt.
+ //
+ if (ims.empty ())
+ ims.push_back (&m);
+ else if (*m.lock.prio < *ims.back ()->lock.prio)
+ ims.back () = &m;
+ }
+ }
}
}
+
assert (pm != nullptr);
+ if (!pm->lock.locked ())
+ {
+ if (prio >= 1000)
+ ims.insert (ims.begin (), pm); // Interrupt first (see below).
+ else
+ ims = {pm};
+ }
+
+ assert (!prio_mon || !ims.empty ()); // We should have at least one.
+
// Move the toolchain lock into this scope so that it's automatically
// released on any failure (on the happy path it is released by
// perform_task()).
@@ -2398,25 +2436,25 @@ try
toolchain_lock& rtl (tl);
toolchain_lock tl (move (rtl));
- // See if we need to interrupt the selected machine (if busy) or one of
- // the existing (if we are at the max allowed instances, that is in the
- // priority monitor mode).
+ // Interrupt the machines, if necessary.
//
- if (!pm->lock.locked ())
- im = pm;
- else if (prio_mon)
- assert (im != nullptr); // We should have at least one.
- else
- im = nullptr; // No interrupt necessary.
-
- if (im != nullptr)
+ // Note that if we are interrupting multiple machines, then the target
+ // machine, if it needs to be interrupted, must be first. This way if we
+ // are unable to successfully interrupt it, we don't interrupt the rest.
+ //
+ for (bootstrapped_machine* im: ims)
{
- assert (!im->lock.locked () && im->lock.prio); // Sanity checks.
+ bool first (im == ims.front ());
+
+ // Sanity checks.
+ //
+ assert (!im->lock.locked () && im->lock.prio);
+ assert (im != pm || first);
const dir_path& tp (im->path); // -<toolchain> path.
l1 ([&]{trace << "interrupting "
- << (im == pm ? "target" : "lowest priority")
+ << (im == pm ? "target" : "lower priority")
<< " machine " << tp << ", pid " << im->lock.pid;});
// The plan is to send the interrupt and then wait for the lock.
@@ -2439,6 +2477,13 @@ try
throw_generic_error (errno);
}
+ // If we are interrupting multiple machines, there is no use acquiring
+ // the lock (or failing if unable to) for subsequent machines since
+ // this is merely a performance optimization.
+ //
+ if (!first)
+ continue;
+
// Try to lock the machine.
//
// While this normally shouldn't take long, there could be parts of