aboutsummaryrefslogtreecommitdiff
path: root/bbot
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2023-05-12 07:33:40 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2023-05-12 10:20:41 +0200
commit73961c9624531b6f6326dac4ab0d5c462a2cff94 (patch)
tree0badeccc761bae912ba1fe90701960a48873e365 /bbot
parenta2ef74aaab96f63f3126d277217b071ad11bf0c6 (diff)
Add infrastructure for controller URL priorities
This is the ground work for the task priority/interrupt support.
Diffstat (limited to 'bbot')
-rw-r--r--bbot/agent/agent.cli58
-rw-r--r--bbot/agent/agent.cxx235
2 files changed, 212 insertions, 81 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli
index 3d028fd..1046685 100644
--- a/bbot/agent/agent.cli
+++ b/bbot/agent/agent.cli
@@ -12,19 +12,73 @@ include <bbot/common.cli>;
namespace bbot
{
{
- "<options> <url>",
+ "<options> <priority> <url>",
"
\h|SYNOPSIS|
\c{\b{bbot-agent --help}\n
\b{bbot-agent --version}\n
- \b{bbot-agent} [<options>] <url>...}
+ \b{bbot-agent} [<options>] [<priority>=]<url>...}
\h|DESCRIPTION|
\cb{bbot-agent} @@ TODO.
+ The controller URL <priority> is a four or five-digit decimal value. If it
+ is absent, then \cb{0} (lowest priority) is assumed. URLs with equal
+ priority are queried at random.
+
+ The <priority> value has the \c{[\i{F}]\i{DCBA}} form which encodes four
+ priority levels (\ci{DCBA}) each occupying one decimal digit (so there are
+ 10 distinct priorities in each level) plus the optional boost flag
+ (\ci{F}). These levels offer different trade-offs between the speed of
+ completing a higher priority task and potentially discarding work that has
+ already been done.
+
+ The first priority level (\ci{A}) is a simple preference: among the URLs
+ with equal values for other levels (\ci{DCB}), those with higher first
+ level priorities are queried first.
+
+ The second priority level (\ci{B}) has the semantics of the first level
+ plus it prevents URLs with lower second priority level from being
+ queried until the task with a higher second priority level has completed,
+ effectively conserving the resources for the higher priority task.
+
+ The third priority level (\ci{C}) has the semantics of the second level
+ plus it may interrupt one lower third priority level task in order to
+ perform the higher third priority task (the interrupt is necessary if the
+ desired machine is used by the lower priority task or the number of tasks
+ already being performed is the maximum allowed to be performed
+ concurrently; see \cb{--instance-max}).
+
+ Finally, the fourth priority level (\ci{D}) has the semantics of the third
+ level except that not one but all the lower fourth priority level tasks
+ are interrupting, effectively dedicating all the available resources to
+ the higher priority task. This level can also be combined with the boost
+ flag \ci{F}. If this flag is \cb{1} then the higher priority task's CPU
+ number (\cb{--cpu}) is boosted to the full number of available hardware
+ threads (or, to view it another way, the fourth priority level has 20
+ possible values, not 10, with the first 0-9 being without the boost while
+ the last 10-19 being with the boost).
+
+ Note that the priority levels are hierarchical in a sense that within a
+ given higher level URLs can be further prioritized using the lower
+ levels. As an example, consider a deployment with three controller URLs:
+ background package rebuilds (\cb{pkg.example.org}), user-initiated CI
+ (\cb{ci.example.org}), and user-initiated interactive CI
+ (\cb{ici.example.org}). Given the following priorities:
+
+ \
+ 0000=https://pkg.example.org
+ 0100=https://ci.example.org
+ 0101=https://ici.example.org
+ \
+
+ Both types of CI tasks will interrupt one background rebuild task if
+ necessary while the interactive CI tasks will be merely preferred over
+ non-interactive.
+
Note that on termination \cb{bbot-agent} may leave behind a machine lock
and working machine snapshot. It is expected that the caller (normally
Build OS monitor) cleans them up before restarting the agent.
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index a8b7c77..850f2b1 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -20,8 +20,10 @@
#include <sys/ioctl.h>
#include <sys/socket.h>
+#include <map>
#include <chrono>
#include <random>
+#include <iomanip> // setw()
#include <iostream>
#include <system_error> // generic_category()
@@ -1635,20 +1637,57 @@ try
offset = (tc_num - 1) * 100 + inst;
- // Controller URLs.
+ // Controller priority to URLs map.
//
- if (argc < 2 &&
- !ops.dump_machines () &&
- !ops.fake_request_specified ())
+ std::map<uint64_t, strings> controllers;
+
+ for (int i (1); i != argc; ++i)
{
- fail << "controller url expected" <<
- info << "run " << argv[0] << " --help for details";
- }
+ // [<prio>=]<url>
+ //
+ string a (argv[i]);
+
+ // See if we have priority, falling back to priority 0 if absent.
+ //
+ uint64_t prio (0);
- strings controllers;
+ // Note that we can also have `=` in <url> (e.g., parameters) so we will
+ // only consider `=` as ours if prior to it we only have digits.
+ //
+ size_t p (a.find ('='));
+ if (p != string::npos && a.find_first_not_of ("0123456789") == p)
+ {
+ // Require exactly four or five digits in case we later need to extend
+ // the priority levels beyond the 10 possible values (e.g., DDCCBBAA).
+ //
+ if (p != 4 && p != 5)
+ fail << "four or five-digit controller url priority expected in '"
+ << a << "'";
- for (int i (1); i != argc; ++i)
- controllers.push_back (argv[i]);
+ char* e;
+ errno = 0;
+ prio = strtoull (a.c_str (), &e, 10);
+ assert (errno != ERANGE && e == a.c_str () + p);
+
+ if (prio > 19999)
+ fail << "out of bounds controller url priority in '" << a << "'";
+
+ a.erase (0, p + 1);
+ }
+
+ controllers[prio].push_back (move (a));
+ }
+
+ if (controllers.empty ())
+ {
+ if (ops.dump_machines () || ops.fake_request_specified ())
+ {
+ controllers[0].push_back ("https://example.org");
+ }
+ else
+ fail << "controller url expected" <<
+ info << "run " << argv[0] << " --help for details";
+ }
// Handle SIGHUP and SIGTERM.
//
@@ -1710,8 +1749,16 @@ try
if (inst_max != 0)
dr << info << "instance max " << inst_max;
- for (const string& u: controllers)
- dr << info << "controller url " << u;
+ // Note: keep last since don't restore fill/setw.
+ //
+ for (const pair<const uint64_t, strings>& p: controllers)
+ {
+ for (const string& u: p.second)
+ {
+ dr.os.fill ('0');
+ dr << info << "controller url " << std::setw (4) << p.first << '=' << u;
+ }
+ }
}
// The work loop. The steps we go through are:
@@ -1815,79 +1862,94 @@ try
continue;
}
- // Prepare task request.
+ // If we get a task, these contain all the corresponding information.
//
- task_request_manifest tq {
- hname,
- tc_name,
- tc_ver,
- imode,
- ilogin,
- fingerprint,
- machine_header_manifests {}
- };
-
- // Note: do not assume tq.machines.size () == ms.size ().
+ task_request_manifest tq;
+ task_response_manifest tr;
+ uint64_t prio;
+ string url;
+
+ // Iterate over controller priorities in reverse, that is, from highest to
+ // lowest.
//
- for (const bootstrapped_machine& m: ms)
- {
- // @@ For now skip machines locked by other processes.
- //
- // @@ Note: skip machines being bootstrapped.
- //
- if (m.lock.locked ())
- tq.machines.emplace_back (m.manifest.machine.id,
- m.manifest.machine.name,
- m.manifest.machine.summary);
- }
+ // @@ Note: doing it in terms of direct iterators in anticipation for
+ // lower_bound().
+ //
+ auto cb (controllers.begin ());
+ auto ce (controllers.end ());
- if (ops.dump_machines ())
+ for (; cb != ce; )
{
- for (const machine_header_manifest& m: tq.machines)
- serialize_manifest (m, cout, "stdout", "machine");
+ const pair<const uint64_t, strings>& pu (*--ce);
- return 0;
- }
+ prio = pu.first;
+ const strings& urls (pu.second);
- if (tq.machines.empty ())
- {
- // Normally this means all the machines are locked so sleep a bit less.
+ // Prepare task request (it will be the same within a given priority).
//
- sleep = rand_sleep () / 2;
- continue;
- }
+ tq = task_request_manifest {
+ hname,
+ tc_name,
+ tc_ver,
+ imode,
+ ilogin,
+ fingerprint,
+ machine_header_manifests {}};
+
+ // Note: do not assume tq.machines.size () == ms.size ().
+ //
+ for (const bootstrapped_machine& m: ms)
+ {
+ // @@ For now skip machines locked by other processes.
+ //
+ // @@ Note: skip machines being bootstrapped.
+ //
+ if (m.lock.locked ())
+ tq.machines.emplace_back (m.manifest.machine.id,
+ m.manifest.machine.name,
+ m.manifest.machine.summary);
+ }
- // Send task requests.
- //
- // Note that we have to do it while holding the lock on all the machines
- // since we don't know which machine we will need.
- //
- string url;
- task_response_manifest tr;
+ if (ops.dump_machines ())
+ {
+ for (const machine_header_manifest& m: tq.machines)
+ serialize_manifest (m, cout, "stdout", "machine");
- if (ops.fake_request_specified ())
- {
- auto t (parse_manifest<task_manifest> (ops.fake_request (), "task"));
+ return 0;
+ }
- tr = task_response_manifest {
- "fake-session", // Dummy session.
- nullopt, // No challenge.
- url, // Empty result URL.
- agent_checksum,
- move (t)};
+ if (tq.machines.empty ())
+ {
+ // If we have no machines for this priority then we won't have any
+ // for any lower priority so bail out.
+ //
+ break;
+ }
- url = "http://example.org";
- }
- else
- {
- // Note that after completing each task we always start from the
- // beginning of the list. This fact can be used to implement a poor
- // man's priority system where we will continue serving the first listed
- // controller for as long as it has tasks (and maybe in the future we
- // will implement a proper priority system).
+ // Send task requests.
+ //
+ // Note that we have to do it while holding the lock on all the machines
+ // since we don't know which machine we will need.
+ //
+ // @@ TODO: need to iterate in random order somehow.
//
- for (const string& u: controllers)
+ for (const string& u: urls)
{
+ if (ops.fake_request_specified ())
+ {
+ auto t (parse_manifest<task_manifest> (ops.fake_request (), "task"));
+
+ tr = task_response_manifest {
+ "fake-session", // Dummy session.
+ nullopt, // No challenge.
+ string (), // Empty result URL.
+ agent_checksum,
+ move (t)};
+
+ url = u;
+ break;
+ }
+
task_response_manifest r;
try
@@ -1903,8 +1965,9 @@ try
"--max-time", ops.request_timeout (),
"--connect-timeout", ops.connect_timeout ());
- // This is tricky/hairy: we may fail hard parsing the output before
- // seeing that curl exited with an error and failing softly.
+ // This is tricky/hairy: we may fail hard parsing the output
+ // before seeing that curl exited with an error and failing
+ // softly.
//
bool f (false);
@@ -1949,8 +2012,8 @@ try
{
const task_manifest& t (*r.task);
- // For security reasons let's require the repository location to be
- // remote.
+ // For security reasons let's require the repository location to
+ // be remote.
//
if (t.repository.local ())
{
@@ -1973,13 +2036,27 @@ try
l2 ([&]{trace << "task for " << t.name << '/' << t.version << " "
<< "on " << t.machine << " "
- << "from " << u;});
+ << "from " << u << " "
+ << "priority " << prio;});
tr = move (r);
url = u;
break;
}
- }
+ } // url loop.
+
+ if (!tr.session.empty ()) // Got a task.
+ break;
+
+ } // prio loop.
+
+ if (tq.machines.empty ()) // No machines.
+ {
+ // Normally this means all the machines are busy so sleep a bit less.
+ //
+ l2 ([&]{trace << "all machines are busy, sleeping";});
+ sleep = rand_sleep () / 2;
+ continue;
}
if (tr.session.empty ()) // No task from any of the controllers.
@@ -2006,7 +2083,7 @@ try
{
if (mh.name == m.manifest.machine.name)
{
- m.lock.write (tl, 1234 /* prio */);
+ m.lock.write (tl, prio);
pm = &m;
}
else