Add --instance-max agent option

This is the groundwork for the task priority/interrupt support.
author: Boris Kolpackov <boris@codesynthesis.com> 2023-05-11 11:13:56 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2023-05-11 11:16:04 +0200
commit: a2ef74aaab96f63f3126d277217b071ad11bf0c6 (patch)
tree: af5dea187cd0c91f73c04f68ff447b9bf0841726 /bbot
parent: a3463829567defda8aefa8425e79ff4d270617ba (diff)
3 files changed, 105 insertions, 32 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli
index aa7eb59..3d028fd 100644
--- a/bbot/agent/agent.cli
+++ b/bbot/agent/agent.cli
@@ -111,6 +111,20 @@ namespace bbot
        network ports, interfaces, etc."
     }
 
+    uint16_t --instance-max = 0
+    {
+      "<num>",
+      "Maximum number of instances that can perform tasks concurrently. If the
+       number of instances that have been started is greater than this number
+       (normally by just one), then when the maximum number of tasks is
+       already being performed, the extra instances operate in the \i{priority
+       monitor} mode: they only query controller URLs with priorities higher
+       than of the existing tasks and can only perform a task by interrupting
+       one of them. If the maximum number of instances is \cb{0} (default),
+       then it is assumed the number of instances started is the maximum
+       number, essentially disabling the priority monitor functionality."
+    }
+
     size_t --cpu = 1
     {
       "<num>",
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 8f54346..a8b7c77 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -61,7 +61,8 @@ namespace bbot
   standard_version tc_ver;
   string           tc_id;
 
-  uint16_t inst;
+  uint16_t inst;     // 1-based.
+  uint16_t inst_max; // 0 if priority monitoring is disabled.
 
   uint16_t offset;
 
@@ -682,10 +683,13 @@ snapshot_path (const dir_path& tp)
 // (re-)bootstrapping them if necessary.
 //
 // Note that this function returns both machines that this process managed to
-// lock as well as the machines locked by other processes (except those that
-// are being bootstrapped), in case the caller needs to interrupt one of them
-// for a higher-priority task. In the latter case, the manifest only has the
-// machine_manifest information.
+// lock as well as the machines locked by other processes (including those
+// that are being bootstrapped), in case the caller needs to interrupt one of
+// them for a higher-priority task. In the latter case, the manifest is empty
+// if the machine is being bootstrapped and only has the machine_manifest
+// information otherwise. (The machines being bootstrapped have to be returned
+// to get the correct count of currently active instances for the inst_max
+// comparison.)
 //
 struct bootstrapped_machine
 {
@@ -836,7 +840,7 @@ try
 
             none = none && sp.empty ();
 
-            // Try to lock the machine, skipping it if being bootstrapped.
+            // Try to lock the machine.
             //
             machine_lock ml (lock_machine (tl, tp));
 
@@ -844,37 +848,39 @@ try
             {
               // @@ TMP: restore l4 tracing.
 
-              if (!ml.prio) // Being bootstrapped.
+              machine_manifest mm;
+              if (ml.prio)
               {
-                l1 ([&]{trace << "skipping " << md << ": being bootstrapped "
-                              << "by " << ml.pid;});
-                break;
-              }
+                // Get the machine manifest (subset of the steps performed for
+                // the locked case below).
+                //
+                // Note that it's possible the machine we get is not what was
+                // originally locked by the other process (e.g., it has been
+                // upgraded since). It's also possible that if and when we
+                // interrupt and lock this machine, it will be a different
+                // machine (e.g., it has been upgraded since we read this
+                // machine manifest). To deal with all of that we will be
+                // reloading this information if/when we acquire the lock to
+                // this machine.
+                //
+                if (sp.empty ())
+                {
+                  l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+                  break;
+                }
 
-              // Get the machine manifest (subset of the steps performed for
-              // the locked case below).
-              //
-              // Note that it's possible the machine we get is not what was
-              // originally locked by the other process (e.g., it has been
-              // upgraded since). It's also possible that if and when we
-              // interrupt and lock this machine, it will be a different
-              // machine (e.g., it has been upgraded since we read this
-              // machine manifest). To deal with all of that we will be
-              // reloading this information if/when we acquire the lock to
-              // this machine.
-              //
-              if (sp.empty ())
+                l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
+                              << " with priority " << *ml.prio;});
+
+                mm = parse_manifest<machine_manifest> (
+                  sp / "manifest", "machine");
+              }
+              else // Being bootstrapped.
               {
-                l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
-                break;
+                l1 ([&]{trace << "keeping " << md << ": being bootstrapped "
+                              << "by " << ml.pid;});
               }
 
-              l1 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
-                            << " with priority " << *ml.prio;});
-
-              auto mm (
-                parse_manifest<machine_manifest> (sp / "manifest", "machine"));
-
               // Add the machine to the lists and bail out.
               //
               r.push_back (bootstrapped_machine {
@@ -1625,6 +1631,8 @@ try
   if (inst == 0 || inst > 99)
     fail << "invalid --instance value " << inst;
 
+  inst_max = ops.instance_max ();
+
   offset = (tc_num - 1) * 100 + inst;
 
   // Controller URLs.
@@ -1699,6 +1707,9 @@ try
       info << "toolchain id   " << tc_id <<
       info << "instance  num  " << inst;
 
+    if (inst_max != 0)
+      dr << info << "instance  max  " << inst_max;
+
     for (const string& u: controllers)
       dr << info << "controller url " << u;
   }
@@ -1761,6 +1772,49 @@ try
     toolchain_lock& tl (er.first);
     bootstrapped_machines& ms (er.second);
 
+    // Determine if we should operate in the priority monitor mode and, if so,
+    // the lower bound on the priorities that we should consider.
+    //
+    optional<uint64_t> prio_mon;
+    if (inst_max != 0)
+    {
+      uint16_t           busy (0);
+      optional<uint64_t> prio;
+
+      for (const bootstrapped_machine& m: ms)
+      {
+        if (!m.lock.locked ())
+        {
+          ++busy;
+          if (m.lock.prio && (!prio || *m.lock.prio < *prio))
+            prio = *m.lock.prio;
+        }
+      }
+
+      assert (busy <= inst_max);
+
+      if (busy == inst_max)
+      {
+        if (!prio) // All being bootstrapped.
+        {
+          sleep = rand_sleep ();
+          continue;
+        }
+
+        prio_mon = *prio;
+      }
+    }
+
+    // @@ For now bail out if in the priority monitor mode.
+    //
+    if (prio_mon)
+    {
+      l1 ([&]{trace << "priority monitor, lower bound " << *prio_mon;});
+
+      sleep = rand_sleep () / 2;
+      continue;
+    }
+
     // Prepare task request.
     //
     task_request_manifest tq {
@@ -1779,6 +1833,8 @@ try
     {
       // @@ For now skip machines locked by other processes.
       //
+      // @@ Note: skip machines being bootstrapped.
+      //
       if (m.lock.locked ())
         tq.machines.emplace_back (m.manifest.machine.id,
                                   m.manifest.machine.name,
diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service
index e938126..18b7c9e 100644
--- a/bbot/bbot-agent@.service
+++ b/bbot/bbot-agent@.service
@@ -38,6 +38,8 @@ Environment=TOOLCHAIN_NUM=1
 Environment=TOOLCHAIN_VER=
 Environment=TOOLCHAIN_ID=
 
+Environment=INSTANCE_MAX=0
+
 Environment="CONTROLLER_URL="
 Environment="CONTROLLER_TRUST="
 
@@ -63,6 +65,7 @@ ExecStart=/build/bots/default/bin/bbot-agent \
   --toolchain-num ${TOOLCHAIN_NUM} \
   --toolchain-ver ${TOOLCHAIN_VER} \
   --toolchain-id ${TOOLCHAIN_ID} \
+  --instance-max ${INSTANCE_MAX} \
   --instance %i \
   $CONTROLLER_TRUST \
   $CONTROLLER_URL
author	Boris Kolpackov <boris@codesynthesis.com>	2023-05-11 11:13:56 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2023-05-11 11:16:04 +0200
commit	a2ef74aaab96f63f3126d277217b071ad11bf0c6 (patch)
tree	af5dea187cd0c91f73c04f68ff447b9bf0841726 /bbot
parent	a3463829567defda8aefa8425e79ff4d270617ba (diff)