aboutsummaryrefslogtreecommitdiff
path: root/bbot
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2023-05-11 11:13:56 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2023-05-11 11:16:04 +0200
commit a2ef74aaab96f63f3126d277217b071ad11bf0c6 (patch)
tree af5dea187cd0c91f73c04f68ff447b9bf0841726 /bbot
parent a3463829567defda8aefa8425e79ff4d270617ba (diff)
Add --instance-max agent option
This is the groundwork for the task priority/interrupt support.
Diffstat (limited to 'bbot')
-rw-r--r-- bbot/agent/agent.cli 14
-rw-r--r-- bbot/agent/agent.cxx 120
-rw-r--r-- bbot/bbot-agent@.service 3
3 files changed, 105 insertions, 32 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli
index aa7eb59..3d028fd 100644
--- a/bbot/agent/agent.cli
+++ b/bbot/agent/agent.cli
@@ -111,6 +111,20 @@ namespace bbot
network ports, interfaces, etc."
}
+ uint16_t --instance-max = 0
+ {
+ "<num>",
+ "Maximum number of instances that can perform tasks concurrently. If the
+ number of instances that have been started is greater than this number
+ (normally by just one), then when the maximum number of tasks is
+ already being performed, the extra instances operate in the \i{priority
+ monitor} mode: they only query controller URLs with priorities higher
+ than of the existing tasks and can only perform a task by interrupting
+ one of them. If the maximum number of instances is \cb{0} (default),
+ then it is assumed the number of instances started is the maximum
+ number, essentially disabling the priority monitor functionality."
+ }
+
size_t --cpu = 1
{
"<num>",
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 8f54346..a8b7c77 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -61,7 +61,8 @@ namespace bbot
standard_version tc_ver;
string tc_id;
- uint16_t inst;
+ uint16_t inst; // 1-based.
+ uint16_t inst_max; // 0 if priority monitoring is disabled.
uint16_t offset;
@@ -682,10 +683,13 @@ snapshot_path (const dir_path& tp)
// (re-)bootstrapping them if necessary.
//
// Note that this function returns both machines that this process managed to
-// lock as well as the machines locked by other processes (except those that
-// are being bootstrapped), in case the caller needs to interrupt one of them
-// for a higher-priority task. In the latter case, the manifest only has the
-// machine_manifest information.
+// lock as well as the machines locked by other processes (including those
+// that are being bootstrapped), in case the caller needs to interrupt one of
+// them for a higher-priority task. In the latter case, the manifest is empty
+// if the machine is being bootstrapped and only has the machine_manifest
+// information otherwise. (The machines being bootstrapped have to be
+// returned to get the correct count of currently active instances for the
+// inst_max comparison.)
//
struct bootstrapped_machine
{
@@ -836,7 +840,7 @@ try
none = none && sp.empty ();
- // Try to lock the machine, skipping it if being bootstrapped.
+ // Try to lock the machine.
//
machine_lock ml (lock_machine (tl, tp));
@@ -844,37 +848,39 @@ try
{
// @@ TMP: restore l4 tracing.
- if (!ml.prio) // Being bootstrapped.
+ machine_manifest mm;
+ if (ml.prio)
{
- l1 ([&]{trace << "skipping " << md << ": being bootstrapped "
- << "by " << ml.pid;});
- break;
- }
+ // Get the machine manifest (subset of the steps performed for
+ // the locked case below).
+ //
+ // Note that it's possible the machine we get is not what was
+ // originally locked by the other process (e.g., it has been
+ // upgraded since). It's also possible that if and when we
+ // interrupt and lock this machine, it will be a different
+ // machine (e.g., it has been upgraded since we read this
+ // machine manifest). To deal with all of that we will be
+ // reloading this information if/when we acquire the lock to
+ // this machine.
+ //
+ if (sp.empty ())
+ {
+ l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+ break;
+ }
- // Get the machine manifest (subset of the steps performed for
- // the locked case below).
- //
- // Note that it's possible the machine we get is not what was
- // originally locked by the other process (e.g., it has been
- // upgraded since). It's also possible that if and when we
- // interrupt and lock this machine, it will be a different
- // machine (e.g., it has been upgraded since we read this
- // machine manifest). To deal with all of that we will be
- // reloading this information if/when we acquire the lock to
- // this machine.
- //
- if (sp.empty ())
+ l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
+ << " with priority " << *ml.prio;});
+
+ mm = parse_manifest<machine_manifest> (
+ sp / "manifest", "machine");
+ }
+ else // Being bootstrapped.
{
- l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
- break;
+ l1 ([&]{trace << "keeping " << md << ": being bootstrapped "
+ << "by " << ml.pid;});
}
- l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
- << " with priority " << *ml.prio;});
-
- auto mm (
- parse_manifest<machine_manifest> (sp / "manifest", "machine"));
-
// Add the machine to the lists and bail out.
//
r.push_back (bootstrapped_machine {
@@ -1625,6 +1631,8 @@ try
if (inst == 0 || inst > 99)
fail << "invalid --instance value " << inst;
+ inst_max = ops.instance_max ();
+
offset = (tc_num - 1) * 100 + inst;
// Controller URLs.
@@ -1699,6 +1707,9 @@ try
info << "toolchain id " << tc_id <<
info << "instance num " << inst;
+ if (inst_max != 0)
+ dr << info << "instance max " << inst_max;
+
for (const string& u: controllers)
dr << info << "controller url " << u;
}
@@ -1761,6 +1772,49 @@ try
toolchain_lock& tl (er.first);
bootstrapped_machines& ms (er.second);
+ // Determine if we should operate in the priority monitor mode and, if so,
+ // the lower bound on the priorities that we should consider.
+ //
+ optional<uint64_t> prio_mon;
+ if (inst_max != 0)
+ {
+ uint16_t busy (0);
+ optional<uint64_t> prio;
+
+ for (const bootstrapped_machine& m: ms)
+ {
+ if (!m.lock.locked ())
+ {
+ ++busy;
+ if (m.lock.prio && (!prio || *m.lock.prio < *prio))
+ prio = *m.lock.prio;
+ }
+ }
+
+ assert (busy <= inst_max);
+
+ if (busy == inst_max)
+ {
+ if (!prio) // All being bootstrapped.
+ {
+ sleep = rand_sleep ();
+ continue;
+ }
+
+ prio_mon = *prio;
+ }
+ }
+
+ // @@ For now bail out if in the priority monitor mode.
+ //
+ if (prio_mon)
+ {
+ l1 ([&]{trace << "priority monitor, lower bound " << *prio_mon;});
+
+ sleep = rand_sleep () / 2;
+ continue;
+ }
+
// Prepare task request.
//
task_request_manifest tq {
@@ -1779,6 +1833,8 @@ try
{
// @@ For now skip machines locked by other processes.
//
+ // @@ Note: skip machines being bootstrapped.
+ //
if (m.lock.locked ())
tq.machines.emplace_back (m.manifest.machine.id,
m.manifest.machine.name,
diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service
index e938126..18b7c9e 100644
--- a/bbot/bbot-agent@.service
+++ b/bbot/bbot-agent@.service
@@ -38,6 +38,8 @@ Environment=TOOLCHAIN_NUM=1
Environment=TOOLCHAIN_VER=
Environment=TOOLCHAIN_ID=
+Environment=INSTANCE_MAX=0
+
Environment="CONTROLLER_URL="
Environment="CONTROLLER_TRUST="
@@ -63,6 +65,7 @@ ExecStart=/build/bots/default/bin/bbot-agent \
--toolchain-num ${TOOLCHAIN_NUM} \
--toolchain-ver ${TOOLCHAIN_VER} \
--toolchain-id ${TOOLCHAIN_ID} \
+ --instance-max ${INSTANCE_MAX} \
--instance %i \
$CONTROLLER_TRUST \
$CONTROLLER_URL