Add infrastructure for controller URL priorities

This is the ground work for the task priority/interrupt support.
author: Boris Kolpackov <boris@codesynthesis.com> 2023-05-12 07:33:40 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2023-05-12 10:20:41 +0200
commit: 73961c9624531b6f6326dac4ab0d5c462a2cff94 (patch)
tree: 0badeccc761bae912ba1fe90701960a48873e365 /bbot
parent: a2ef74aaab96f63f3126d277217b071ad11bf0c6 (diff)
2 files changed, 212 insertions, 81 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli
index 3d028fd..1046685 100644
--- a/bbot/agent/agent.cli
+++ b/bbot/agent/agent.cli
@@ -12,19 +12,73 @@ include <bbot/common.cli>;
 namespace bbot
 {
   {
-    "<options> <url>",
+    "<options> <priority> <url>",
 
     "
     \h|SYNOPSIS|
 
     \c{\b{bbot-agent --help}\n
        \b{bbot-agent --version}\n
-       \b{bbot-agent} [<options>] <url>...}
+       \b{bbot-agent} [<options>] [<priority>=]<url>...}
 
     \h|DESCRIPTION|
 
     \cb{bbot-agent} @@ TODO.
 
+    The controller URL <priority> is a four or five-digit decimal value. If it
+    is absent, then \cb{0} (lowest priority) is assumed. URLs with equal
+    priority are queried at random.
+
+    The <priority> value has the \c{[\i{F}]\i{DCBA}} form which encodes four
+    priority levels (\ci{DCBA}) each occupying one decimal digit (so there are
+    10 distinct priorities in each level) plus the optional boost flag
+    (\ci{F}). These levels offer different trade-offs between the speed of
+    completing a higher priority task and potentially discarding work that has
+    already been done.
+
+    The first priority level (\ci{A}) is a simple preference: among the URLs
+    with equal values for other levels (\ci{DCB}), those with higher first
+    level priorities are queried first.
+
+    The second priority level (\ci{B}) has the semantics of the first level
+    plus it prevents URLs with lower second priority level from being
+    queried until the task with a higher second priority level has completed,
+    effectively conserving the resources for the higher priority task.
+
+    The third priority level (\ci{C}) has the semantics of the second level
+    plus it may interrupt one lower third priority level task in order to
+    perform the higher third priority task (the interrupt is necessary if the
+    desired machine is used by the lower priority task or the number of tasks
+    already being performed is the maximum allowed to be performed
+    concurrently; see \cb{--instance-max}).
+
+    Finally, the fourth priority level (\ci{D}) has the semantics of the third
+    level except that not one but all the lower fourth priority level tasks
+    are interrupting, effectively dedicating all the available resources to
+    the higher priority task. This level can also be combined with the boost
+    flag \ci{F}. If this flag is \cb{1} then the higher priority task's CPU
+    number (\cb{--cpu}) is boosted to the full number of available hardware
+    threads (or, to view it another way, the fourth priority level has 20
+    possible values, not 10, with the first 0-9 being without the boost while
+    the last 10-19 being with the boost).
+
+    Note that the priority levels are hierarchical in a sense that within a
+    given higher level URLs can be further prioritized using the lower
+    levels. As an example, consider a deployment with three controller URLs:
+    background package rebuilds (\cb{pkg.example.org}), user-initiated CI
+    (\cb{ci.example.org}), and user-initiated interactive CI
+    (\cb{ici.example.org}). Given the following priorities:
+
+    \
+    0000=https://pkg.example.org
+    0100=https://ci.example.org
+    0101=https://ici.example.org
+    \
+
+    Both types of CI tasks will interrupt one background rebuild task if
+    necessary while the interactive CI tasks will be merely preferred over
+    non-interactive.
+
     Note that on termination \cb{bbot-agent} may leave behind a machine lock
     and working machine snapshot. It is expected that the caller (normally
     Build OS monitor) cleans them up before restarting the agent.
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index a8b7c77..850f2b1 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -20,8 +20,10 @@
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 
+#include <map>
 #include <chrono>
 #include <random>
+#include <iomanip>      // setw()
 #include <iostream>
 #include <system_error> // generic_category()
 
@@ -1635,20 +1637,57 @@ try
 
   offset = (tc_num - 1) * 100 + inst;
 
-  // Controller URLs.
+  // Controller priority to URLs map.
   //
-  if (argc < 2 &&
-      !ops.dump_machines () &&
-      !ops.fake_request_specified ())
+  std::map<uint64_t, strings> controllers;
+
+  for (int i (1); i != argc; ++i)
   {
-    fail << "controller url expected" <<
-      info << "run " << argv[0] << " --help for details";
-  }
+    // [<prio>=]<url>
+    //
+    string a (argv[i]);
+
+    // See if we have priority, falling back to priority 0 if absent.
+    //
+    uint64_t prio (0);
 
-  strings controllers;
+    // Note that we can also have `=` in <url> (e.g., parameters) so we will
+    // only consider `=` as ours if prior to it we only have digits.
+    //
+    size_t p (a.find ('='));
+    if (p != string::npos && a.find_first_not_of ("0123456789") == p)
+    {
+      // Require exactly four or five digits in case we later need to extend
+      // the priority levels beyond the 10 possible values (e.g., DDCCBBAA).
+      //
+      if (p != 4 && p != 5)
+        fail << "four or five-digit controller url priority expected in '"
+             << a << "'";
 
-  for (int i (1); i != argc; ++i)
-    controllers.push_back (argv[i]);
+      char* e;
+      errno = 0;
+      prio = strtoull (a.c_str (), &e, 10);
+      assert (errno != ERANGE && e == a.c_str () + p);
+
+      if (prio > 19999)
+        fail << "out of bounds controller url priority in '" << a << "'";
+
+      a.erase (0, p + 1);
+    }
+
+    controllers[prio].push_back (move (a));
+  }
+
+  if (controllers.empty ())
+  {
+    if (ops.dump_machines () || ops.fake_request_specified ())
+    {
+      controllers[0].push_back ("https://example.org");
+    }
+    else
+      fail << "controller url expected" <<
+        info << "run " << argv[0] << " --help for details";
+  }
 
   // Handle SIGHUP and SIGTERM.
   //
@@ -1710,8 +1749,16 @@ try
     if (inst_max != 0)
       dr << info << "instance  max  " << inst_max;
 
-    for (const string& u: controllers)
-      dr << info << "controller url " << u;
+    // Note: keep last since don't restore fill/setw.
+    //
+    for (const pair<const uint64_t, strings>& p: controllers)
+    {
+      for (const string& u: p.second)
+      {
+        dr.os.fill ('0');
+        dr << info << "controller url " << std::setw (4) << p.first << '=' << u;
+      }
+    }
   }
 
   // The work loop. The steps we go through are:
@@ -1815,79 +1862,94 @@ try
       continue;
     }
 
-    // Prepare task request.
+    // If we get a task, these contain all the corresponding information.
     //
-    task_request_manifest tq {
-      hname,
-      tc_name,
-      tc_ver,
-      imode,
-      ilogin,
-      fingerprint,
-      machine_header_manifests {}
-    };
-
-    // Note: do not assume tq.machines.size () == ms.size ().
+    task_request_manifest tq;
+    task_response_manifest tr;
+    uint64_t prio;
+    string url;
+
+    // Iterate over controller priorities in reverse, that is, from highest to
+    // lowest.
     //
-    for (const bootstrapped_machine& m: ms)
-    {
-      // @@ For now skip machines locked by other processes.
-      //
-      // @@ Note: skip machines being bootstrapped.
-      //
-      if (m.lock.locked ())
-        tq.machines.emplace_back (m.manifest.machine.id,
-                                  m.manifest.machine.name,
-                                  m.manifest.machine.summary);
-    }
+    // @@ Note: doing it in terms of direct iterators in anticipation for
+    //    lower_bound().
+    //
+    auto cb (controllers.begin ());
+    auto ce (controllers.end ());
 
-    if (ops.dump_machines ())
+    for (; cb != ce; )
     {
-      for (const machine_header_manifest& m: tq.machines)
-        serialize_manifest (m, cout, "stdout", "machine");
+      const pair<const uint64_t, strings>& pu (*--ce);
 
-      return 0;
-    }
+      prio = pu.first;
+      const strings& urls (pu.second);
 
-    if (tq.machines.empty ())
-    {
-      // Normally this means all the machines are locked so sleep a bit less.
+      // Prepare task request (it will be the same within a given priority).
       //
-      sleep = rand_sleep () / 2;
-      continue;
-    }
+      tq = task_request_manifest {
+        hname,
+        tc_name,
+        tc_ver,
+        imode,
+        ilogin,
+        fingerprint,
+        machine_header_manifests {}};
+
+      // Note: do not assume tq.machines.size () == ms.size ().
+      //
+      for (const bootstrapped_machine& m: ms)
+      {
+        // @@ For now skip machines locked by other processes.
+        //
+        // @@ Note: skip machines being bootstrapped.
+        //
+        if (m.lock.locked ())
+          tq.machines.emplace_back (m.manifest.machine.id,
+                                    m.manifest.machine.name,
+                                    m.manifest.machine.summary);
+      }
 
-    // Send task requests.
-    //
-    // Note that we have to do it while holding the lock on all the machines
-    // since we don't know which machine we will need.
-    //
-    string url;
-    task_response_manifest tr;
+      if (ops.dump_machines ())
+      {
+        for (const machine_header_manifest& m: tq.machines)
+          serialize_manifest (m, cout, "stdout", "machine");
 
-    if (ops.fake_request_specified ())
-    {
-      auto t (parse_manifest<task_manifest> (ops.fake_request (), "task"));
+        return 0;
+      }
 
-      tr = task_response_manifest {
-        "fake-session", // Dummy session.
-        nullopt,        // No challenge.
-        url,            // Empty result URL.
-        agent_checksum,
-        move (t)};
+      if (tq.machines.empty ())
+      {
+        // If we have no machines for this priority then we won't have any
+        // for any lower priority so bail out.
+        //
+        break;
+      }
 
-      url = "http://example.org";
-    }
-    else
-    {
-      // Note that after completing each task we always start from the
-      // beginning of the list. This fact can be used to implement a poor
-      // man's priority system where we will continue serving the first listed
-      // controller for as long as it has tasks (and maybe in the future we
-      // will implement a proper priority system).
+      // Send task requests.
+      //
+      // Note that we have to do it while holding the lock on all the machines
+      // since we don't know which machine we will need.
+      //
+      // @@ TODO: need to iterate in random order somehow.
       //
-      for (const string& u: controllers)
+      for (const string& u: urls)
       {
+        if (ops.fake_request_specified ())
+        {
+          auto t (parse_manifest<task_manifest> (ops.fake_request (), "task"));
+
+          tr = task_response_manifest {
+            "fake-session", // Dummy session.
+            nullopt,        // No challenge.
+            string (),      // Empty result URL.
+            agent_checksum,
+            move (t)};
+
+          url = u;
+          break;
+        }
+
         task_response_manifest r;
 
         try
@@ -1903,8 +1965,9 @@ try
                        "--max-time", ops.request_timeout (),
                        "--connect-timeout", ops.connect_timeout ());
 
-          // This is tricky/hairy: we may fail hard parsing the output before
-          // seeing that curl exited with an error and failing softly.
+          // This is tricky/hairy: we may fail hard parsing the output
+          // before seeing that curl exited with an error and failing
+          // softly.
           //
           bool f (false);
 
@@ -1949,8 +2012,8 @@ try
         {
           const task_manifest& t (*r.task);
 
-          // For security reasons let's require the repository location to be
-          // remote.
+          // For security reasons let's require the repository location to
+          // be remote.
           //
           if (t.repository.local ())
           {
@@ -1973,13 +2036,27 @@ try
 
           l2 ([&]{trace << "task for " << t.name << '/' << t.version << " "
                         << "on " << t.machine << " "
-                        << "from " << u;});
+                        << "from " << u << " "
+                        << "priority " << prio;});
 
           tr  = move (r);
           url = u;
           break;
         }
-      }
+      } // url loop.
+
+      if (!tr.session.empty ()) // Got a task.
+        break;
+
+    } // prio loop.
+
+    if (tq.machines.empty ()) // No machines.
+    {
+      // Normally this means all the machines are busy so sleep a bit less.
+      //
+      l2 ([&]{trace << "all machines are busy, sleeping";});
+      sleep = rand_sleep () / 2;
+      continue;
     }
 
     if (tr.session.empty ()) // No task from any of the controllers.
@@ -2006,7 +2083,7 @@ try
         {
           if (mh.name == m.manifest.machine.name)
           {
-            m.lock.write (tl, 1234 /* prio */);
+            m.lock.write (tl, prio);
             pm = &m;
           }
           else
author	Boris Kolpackov <boris@codesynthesis.com>	2023-05-12 07:33:40 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2023-05-12 10:20:41 +0200
commit	73961c9624531b6f6326dac4ab0d5c462a2cff94 (patch)
tree	0badeccc761bae912ba1fe90701960a48873e365 /bbot
parent	a2ef74aaab96f63f3126d277217b071ad11bf0c6 (diff)