From 9cc59e9d9d37b0c9bc8f100e277c765089c6574b Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 12 May 2023 15:59:07 +0200 Subject: Respect --instance-max limit when bootstrapping machines --- bbot/agent/agent.cxx | 228 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 143 insertions(+), 85 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 850f2b1..1b35ee0 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -749,14 +749,58 @@ try return pr; } + // Compare bbot and library versions returning -1 if older, 0 if the same, + // and +1 if newer. + // + auto compare_bbot = [] (const bootstrap_manifest& m) -> int + { + auto cmp = [&m] (const string& n, const char* v) -> int + { + standard_version sv (v); + auto i = m.versions.find (n); + + return (i == m.versions.end () || i->second < sv + ? -1 + : i->second > sv ? 1 : 0); + }; + + // Start from the top assuming a new dependency cannot be added without + // changing the dependent's version. + // + int r; + return ( + (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r : + (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r : + (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r : + (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0); + }; + // Notice and warn if there are no machines (as opposed to all of them // being locked). // bool none (true); + // We used to (re)-bootstrap machines as we are iterating. But with the + // introduction of the priority monitoring functionality we need to + // respect the --instance-max value. Which means we first need to try to + // lock all the machines in order to determine how many of them are busy + // then check this count against --instance-max, and only bootstrap if we + // are not over the limit. Which means we have to store all the + // information about a (first) machine that needs bootstrapping until + // after we have enumerated all of them. + // + struct pending_bootstrap + { + machine_lock ml; + dir_path tp; // - + dir_path xp; // -- + machine_manifest mm; + optional bmm; + }; + optional pboot; + // The first level are machine volumes. // - bool scratch (false); for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow)) { const string vn (ve.path ().string ()); @@ -911,7 +955,7 @@ try // -- // - const dir_path xp (snapshot_path (tp)); + dir_path xp (snapshot_path (tp)); if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0) { @@ -938,32 +982,7 @@ try // are about to be stopped and upgraded (and the upgraded version // will probably be able to use the result). So we simply ignore // this machine for this run. - - // Return -1 if older, 0 if the same, and +1 if newer. // - auto compare_bbot = [] (const bootstrap_manifest& m) -> int - { - auto cmp = [&m] (const string& n, const char* v) -> int - { - standard_version sv (v); - auto i = m.versions.find (n); - - return (i == m.versions.end () || i->second < sv - ? -1 - : i->second > sv ? 1 : 0); - }; - - // Start from the top assuming a new dependency cannot be added - // without changing the dependent's version. - // - int r; - return ( - (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r : - (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r : - (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r : - (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0); - }; - optional bmm; if (te) { @@ -972,13 +991,13 @@ try if (bmm->machine.id != mm.id) { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";}); + l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";}); te = false; } if (!tc_id.empty () && bmm->toolchain.id != tc_id) { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";}); + l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";}); te = false; } @@ -986,7 +1005,7 @@ try { if (i < 0) { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";}); + l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";}); te = false; } else @@ -1001,58 +1020,21 @@ try delete_bootstrapped (); } else - l3 ([&]{trace << "bootstrapping " << tp;}); + l3 ([&]{trace << "bootstrap " << tp;}); if (!te) { - // Use the -- snapshot that we have made - // to bootstrap the new machine. Then atomically rename it to - // -. - // - // Also release all the machine locks that we have acquired so - // far as well as the global toolchain lock, since the bootstrap - // will take a while and other instances might be able to use - // them. Because we are releasing the global lock, we have to - // restart the enumeration process from scratch. - // - r.clear (); - ml.write (tl, nullopt /* prio */); // Being bootstrapped. - tl.unlock (); - scratch = true; - - bmm = bootstrap_machine (xp, mm, move (bmm)); - - if (!bmm) - { - l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";}); - run_btrfs (trace, "subvolume", "delete", xp); - break; - } - - try - { - mvdir (xp, tp); - } - catch (const system_error& e) - { - fail << "unable to rename " << xp << " to " << tp; - } - - l2 ([&]{trace << "bootstrapped " << bmm->machine.name;}); - - // Check the bootstrapped bbot version as above and ignore this - // machine if it's newer than us. + // Ignore any other machines that need bootstrapping. // - if (int i = compare_bbot (bmm->bootstrap)) + if (!pboot) { - if (i > 0) - l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); - else - warn << "bootstrapped " << tp << " bbot worker is older " - << "than agent; assuming test setup"; + pboot = pending_bootstrap { + move (ml), move (tp), move (xp), move (mm), move (bmm)}; } + else + run_btrfs (trace, "subvolume", "delete", xp); - break; // Restart from scratch. + break; } else run_btrfs (trace, "subvolume", "delete", xp); @@ -1064,14 +1046,7 @@ try break; } // Retry loop. - - if (scratch) - break; - } // Inner dir_iterator loop. - - if (scratch) - break; } catch (const system_error& e) { @@ -1079,8 +1054,91 @@ try } } // Outer dir_iterator loop. - if (scratch) - continue; + // See if there is a pending bootstrap and whether we can perform it. + // + // What should we do if we can't (i.e., we are in the priority minitor + // mode)? Well, we could have found some machines that are already + // bootstrapped (busy or not) and there may be a higher-priority task for + // one of them, so it feels natural to return whatever we've got. + // + if (pboot) + { + dir_path& tp (pboot->tp); + dir_path& xp (pboot->xp); + + // Determine how many machines are busy (locked by other processes) and + // make sure it's below the --instance-max limit, if specified. + // + if (inst_max != 0) + { + size_t busy (0); + for (const bootstrapped_machine& m: r) + if (!m.lock.locked ()) + ++busy; + + assert (busy <= inst_max); + + if (busy == inst_max) + { + l1 ([&]{trace << "instance max reached attempting to bootstrap " + << tp;}); + run_btrfs (trace, "subvolume", "delete", xp); + return pr; + } + } + + machine_lock& ml (pboot->ml); + + l3 ([&]{trace << "bootstrapping " << tp;}); + + // Use the -- snapshot that we have made to bootstrap + // the new machine. Then atomically rename it to -. + // + // Also release all the machine locks that we have acquired so far as + // well as the global toolchain lock, since the bootstrap will take a + // while and other instances might be able to use them. Because we are + // releasing the global lock, we have to restart the enumeration process + // from scratch. + // + r.clear (); + ml.write (tl, nullopt /* prio */); // Being bootstrapped. + tl.unlock (); + + optional bmm ( + bootstrap_machine (xp, pboot->mm, move (pboot->bmm))); + + if (!bmm) + { + l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";}); + run_btrfs (trace, "subvolume", "delete", xp); + continue; + } + + try + { + mvdir (xp, tp); + } + catch (const system_error& e) + { + fail << "unable to rename " << xp << " to " << tp; + } + + l2 ([&]{trace << "bootstrapped " << bmm->machine.name;}); + + // Check the bootstrapped bbot version as above and ignore this machine + // if it's newer than us. + // + if (int i = compare_bbot (bmm->bootstrap)) + { + if (i > 0) + l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); + else + warn << "bootstrapped " << tp << " bbot worker is older " + << "than agent; assuming test setup"; + } + + continue; // Re-enumerate from scratch. + } if (none) warn << "no build machines for toolchain " << tc_name; @@ -1825,7 +1883,7 @@ try optional prio_mon; if (inst_max != 0) { - uint16_t busy (0); + uint16_t busy (0); // Machines locked by other processes. optional prio; for (const bootstrapped_machine& m: ms) -- cgit v1.1