aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bbot/agent/agent.cxx228
1 files changed, 143 insertions, 85 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 850f2b1..1b35ee0 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -749,14 +749,58 @@ try
return pr;
}
+ // Compare bbot and library versions returning -1 if older, 0 if the same,
+ // and +1 if newer.
+ //
+ auto compare_bbot = [] (const bootstrap_manifest& m) -> int
+ {
+ auto cmp = [&m] (const string& n, const char* v) -> int
+ {
+ standard_version sv (v);
+ auto i = m.versions.find (n);
+
+ return (i == m.versions.end () || i->second < sv
+ ? -1
+ : i->second > sv ? 1 : 0);
+ };
+
+ // Start from the top assuming a new dependency cannot be added without
+ // changing the dependent's version.
+ //
+ int r;
+ return (
+ (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r :
+ (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r :
+ (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r :
+ (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0);
+ };
+
// Notice and warn if there are no machines (as opposed to all of them
// being locked).
//
bool none (true);
+ // We used to (re)-bootstrap machines as we are iterating. But with the
+ // introduction of the priority monitoring functionality we need to
+ // respect the --instance-max value. Which means we first need to try to
+ // lock all the machines in order to determine how many of them are busy
+ // then check this count against --instance-max, and only bootstrap if we
+ // are not over the limit. Which means we have to store all the
+ // information about a (first) machine that needs bootstrapping until
+ // after we have enumerated all of them.
+ //
+ struct pending_bootstrap
+ {
+ machine_lock ml;
+ dir_path tp; // -<toolchain>
+ dir_path xp; // -<toolchain>-<xxx>
+ machine_manifest mm;
+ optional<bootstrapped_machine_manifest> bmm;
+ };
+ optional<pending_bootstrap> pboot;
+
// The first level are machine volumes.
//
- bool scratch (false);
for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow))
{
const string vn (ve.path ().string ());
@@ -911,7 +955,7 @@ try
// <name>-<toolchain>-<xxx>
//
- const dir_path xp (snapshot_path (tp));
+ dir_path xp (snapshot_path (tp));
if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
{
@@ -938,32 +982,7 @@ try
// are about to be stopped and upgraded (and the upgraded version
// will probably be able to use the result). So we simply ignore
// this machine for this run.
-
- // Return -1 if older, 0 if the same, and +1 if newer.
//
- auto compare_bbot = [] (const bootstrap_manifest& m) -> int
- {
- auto cmp = [&m] (const string& n, const char* v) -> int
- {
- standard_version sv (v);
- auto i = m.versions.find (n);
-
- return (i == m.versions.end () || i->second < sv
- ? -1
- : i->second > sv ? 1 : 0);
- };
-
- // Start from the top assuming a new dependency cannot be added
- // without changing the dependent's version.
- //
- int r;
- return (
- (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0);
- };
-
optional<bootstrapped_machine_manifest> bmm;
if (te)
{
@@ -972,13 +991,13 @@ try
if (bmm->machine.id != mm.id)
{
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";});
te = false;
}
if (!tc_id.empty () && bmm->toolchain.id != tc_id)
{
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";});
te = false;
}
@@ -986,7 +1005,7 @@ try
{
if (i < 0)
{
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";});
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";});
te = false;
}
else
@@ -1001,58 +1020,21 @@ try
delete_bootstrapped ();
}
else
- l3 ([&]{trace << "bootstrapping " << tp;});
+ l3 ([&]{trace << "bootstrap " << tp;});
if (!te)
{
- // Use the <name>-<toolchain>-<xxx> snapshot that we have made
- // to bootstrap the new machine. Then atomically rename it to
- // <name>-<toolchain>.
- //
- // Also release all the machine locks that we have acquired so
- // far as well as the global toolchain lock, since the bootstrap
- // will take a while and other instances might be able to use
- // them. Because we are releasing the global lock, we have to
- // restart the enumeration process from scratch.
- //
- r.clear ();
- ml.write (tl, nullopt /* prio */); // Being bootstrapped.
- tl.unlock ();
- scratch = true;
-
- bmm = bootstrap_machine (xp, mm, move (bmm));
-
- if (!bmm)
- {
- l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
- }
-
- try
- {
- mvdir (xp, tp);
- }
- catch (const system_error& e)
- {
- fail << "unable to rename " << xp << " to " << tp;
- }
-
- l2 ([&]{trace << "bootstrapped " << bmm->machine.name;});
-
- // Check the bootstrapped bbot version as above and ignore this
- // machine if it's newer than us.
+ // Ignore any other machines that need bootstrapping.
//
- if (int i = compare_bbot (bmm->bootstrap))
+ if (!pboot)
{
- if (i > 0)
- l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
- else
- warn << "bootstrapped " << tp << " bbot worker is older "
- << "than agent; assuming test setup";
+ pboot = pending_bootstrap {
+ move (ml), move (tp), move (xp), move (mm), move (bmm)};
}
+ else
+ run_btrfs (trace, "subvolume", "delete", xp);
- break; // Restart from scratch.
+ break;
}
else
run_btrfs (trace, "subvolume", "delete", xp);
@@ -1064,14 +1046,7 @@ try
break;
} // Retry loop.
-
- if (scratch)
- break;
-
} // Inner dir_iterator loop.
-
- if (scratch)
- break;
}
catch (const system_error& e)
{
@@ -1079,8 +1054,91 @@ try
}
} // Outer dir_iterator loop.
- if (scratch)
- continue;
+ // See if there is a pending bootstrap and whether we can perform it.
+ //
+ // What should we do if we can't (i.e., we are in the priority minitor
+ // mode)? Well, we could have found some machines that are already
+ // bootstrapped (busy or not) and there may be a higher-priority task for
+ // one of them, so it feels natural to return whatever we've got.
+ //
+ if (pboot)
+ {
+ dir_path& tp (pboot->tp);
+ dir_path& xp (pboot->xp);
+
+ // Determine how many machines are busy (locked by other processes) and
+ // make sure it's below the --instance-max limit, if specified.
+ //
+ if (inst_max != 0)
+ {
+ size_t busy (0);
+ for (const bootstrapped_machine& m: r)
+ if (!m.lock.locked ())
+ ++busy;
+
+ assert (busy <= inst_max);
+
+ if (busy == inst_max)
+ {
+ l1 ([&]{trace << "instance max reached attempting to bootstrap "
+ << tp;});
+ run_btrfs (trace, "subvolume", "delete", xp);
+ return pr;
+ }
+ }
+
+ machine_lock& ml (pboot->ml);
+
+ l3 ([&]{trace << "bootstrapping " << tp;});
+
+ // Use the -<toolchain>-<xxx> snapshot that we have made to bootstrap
+ // the new machine. Then atomically rename it to -<toolchain>.
+ //
+ // Also release all the machine locks that we have acquired so far as
+ // well as the global toolchain lock, since the bootstrap will take a
+ // while and other instances might be able to use them. Because we are
+ // releasing the global lock, we have to restart the enumeration process
+ // from scratch.
+ //
+ r.clear ();
+ ml.write (tl, nullopt /* prio */); // Being bootstrapped.
+ tl.unlock ();
+
+ optional<bootstrapped_machine_manifest> bmm (
+ bootstrap_machine (xp, pboot->mm, move (pboot->bmm)));
+
+ if (!bmm)
+ {
+ l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
+ run_btrfs (trace, "subvolume", "delete", xp);
+ continue;
+ }
+
+ try
+ {
+ mvdir (xp, tp);
+ }
+ catch (const system_error& e)
+ {
+ fail << "unable to rename " << xp << " to " << tp;
+ }
+
+ l2 ([&]{trace << "bootstrapped " << bmm->machine.name;});
+
+ // Check the bootstrapped bbot version as above and ignore this machine
+ // if it's newer than us.
+ //
+ if (int i = compare_bbot (bmm->bootstrap))
+ {
+ if (i > 0)
+ l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
+ else
+ warn << "bootstrapped " << tp << " bbot worker is older "
+ << "than agent; assuming test setup";
+ }
+
+ continue; // Re-enumerate from scratch.
+ }
if (none)
warn << "no build machines for toolchain " << tc_name;
@@ -1825,7 +1883,7 @@ try
optional<uint64_t> prio_mon;
if (inst_max != 0)
{
- uint16_t busy (0);
+ uint16_t busy (0); // Machines locked by other processes.
optional<uint64_t> prio;
for (const bootstrapped_machine& m: ms)