2 files changed, 416 insertions, 250 deletions
diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli
index 22dc088..aa7eb59 100644
--- a/bbot/agent/agent.cli
+++ b/bbot/agent/agent.cli
@@ -65,6 +65,16 @@ namespace bbot
        interfaces, etc."
     }
 
+    string --toolchain-lock // Note: string to allow empty path.
+    {
+      "<path>",
+      "Absolute path to the global toolchain lock file. If unspecified, then
+       \c{\b{/var/lock/bbot-agent-}\i{toolchain-name}\b{.lock}} is used by
+       default. If empty path is specified then no global locking is
+       performed. If one of the \cb{--fake-*} options is specified, then no
+       locking is performed by default."
+    }
+
     standard_version --toolchain-ver
     {
       "<stdver>",
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index ed90da0..17e3ac5 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -56,6 +56,7 @@ namespace bbot
 
   string           tc_name;
   uint16_t         tc_num;
+  path             tc_lock; // Empty if no locking.
   standard_version tc_ver;
   string           tc_id;
 
@@ -362,7 +363,103 @@ bootstrap_machine (const dir_path& md,
   return r;
 }
 
-// Machine locking.
+// Global toolchain lock.
+//
+// The overall locking protocol is as follows:
+//
+// 1. Before enumerating the machines each agent instance acquires the global
+//    toolchain lock.
+//
+// 2. As the agent enumerates over the machines, it tries to acquire the lock
+//    for each machine.
+//
+// 3. If the agent encounters a machine that it needs to bootstrap, it
+//    releases all the other machine locks followed by the global lock,
+//    proceeds to bootstrap the machine, releases its lock, and restarts the
+//    process from scratch.
+//
+// 4. Otherwise, upon receiving a task response for one of the machines, the
+//    agent releases all the other machine locks followed by the global lock,
+//    proceeds to perform the task on the selected machine, releases its lock,
+//    and restarts the process from scratch.
+//
+// One notable implication of this protocol is that the machine locks are
+// only acquired while holding the global toolchain lock but can be released
+// while not holding this lock.
+//
+// (Note that because of this implication it can theoretically be possible
+// to omit acquiring all the machine locks during the enumeration process,
+// instead only acquiring the lock of the machine we need to bootstrap or
+// build. However, the current approach is simpler since we still need
+// to detect machines that are already locked, which entails acquiring
+// the lock anyway.)
+//
+// Note that unlike the machine lock below, here we don't bother with removing
+// the lock file.
+//
+class toolchain_lock
+{
+public:
+  toolchain_lock () = default; // Empty lock.
+
+  ~toolchain_lock ()
+  {
+    unlock (true /* ignore_errors */);
+  }
+
+  void
+  unlock (bool ignore_errors = false)
+  {
+    if (fl_)
+    {
+      fl_ = false; // We have tried.
+
+      if (flock (fd_.get (), LOCK_UN) != 0 && !ignore_errors)
+        throw_generic_error (errno);
+    }
+  }
+
+  toolchain_lock            (toolchain_lock&&) = default;
+  toolchain_lock& operator= (toolchain_lock&&) = default;
+
+  toolchain_lock            (const toolchain_lock&) = delete;
+  toolchain_lock& operator= (const toolchain_lock&) = delete;
+
+  // Implementation details.
+  //
+public:
+  explicit
+  toolchain_lock (auto_fd&& fd)
+      : fd_ (move (fd)), fl_ (true) {}
+
+private:
+  auto_fd fd_;
+  bool    fl_ = false;
+};
+
+// Note: returns empty lock if toolchain locking is disabled.
+//
+static optional<toolchain_lock>
+lock_toolchain (unsigned int timeout)
+{
+  if (tc_lock.empty ())
+    return toolchain_lock ();
+
+  auto_fd fd (fdopen (tc_lock, fdopen_mode::out | fdopen_mode::create));
+
+  for (; flock (fd.get (), LOCK_EX | LOCK_NB) != 0; sleep (1), --timeout)
+  {
+    if (errno != EWOULDBLOCK)
+      throw_generic_error (errno);
+
+    if (timeout == 0)
+      return nullopt;
+  }
+
+  return toolchain_lock (move (fd));
+}
+
+// Per-toolchain machine lock.
 //
 // We use flock(2) which is straightforward. The tricky part is cleaning the
 // file up. Here we may have a race when two processes are trying to open &
@@ -445,7 +542,7 @@ lock_machine (const dir_path& tp)
     if (st1.st_ino == st2.st_ino)
       return machine_lock (move (fp), move (fd));
 
-    // Note: unlocked by close().
+    // Retry (note: lock is unlocked by auto_fd::close()).
   }
 }
 
@@ -472,319 +569,352 @@ struct bootstrapped_machine
 };
 using bootstrapped_machines = vector<bootstrapped_machine>;
 
-static bootstrapped_machines
+static pair<toolchain_lock, bootstrapped_machines>
 enumerate_machines (const dir_path& machines)
 try
 {
   tracer trace ("enumerate_machines", machines.string ().c_str ());
 
-  bootstrapped_machines r;
-
-  if (ops.fake_machine_specified ())
+  for (;;) // From-scratch retry loop for after bootstrap (see below).
   {
-    auto mh (
-      parse_manifest<machine_header_manifest> (
-        ops.fake_machine (), "machine header"));
-
-    r.push_back (
-      bootstrapped_machine {
-        dir_path (ops.machines ()) /= mh.name, // For diagnostics.
-        bootstrapped_machine_manifest {
-          machine_manifest {
-            move (mh.id),
-            move (mh.name),
-            move (mh.summary),
-            machine_type::kvm,
-            string ("de:ad:be:ef:de:ad"),
-            nullopt,
-            strings ()},
-          toolchain_manifest {tc_id},
-          bootstrap_manifest {}},
-          machine_lock ()});
+    pair<toolchain_lock, bootstrapped_machines> pr;
 
-    return r;
-  }
+    {
+      optional<toolchain_lock> l;
+      while (!(l = lock_toolchain (60 /* seconds */)))
+      {
+        warn << "unable to acquire global toolchain lock " << tc_lock
+             << " for 60s";
+      }
+      pr.first = move (*l);
+    }
 
-  // Notice and warn if there are no machines (as opposed to all of them being
-  // locked).
-  //
-  bool none (true);
+    bootstrapped_machines& r (pr.second);
 
-  // The first level are machine volumes.
-  //
-  for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow))
-  {
-    const string vn (ve.path ().string ());
+    if (ops.fake_machine_specified ())
+    {
+      auto mh (
+        parse_manifest<machine_header_manifest> (
+          ops.fake_machine (), "machine header"));
+
+      r.push_back (
+        bootstrapped_machine {
+          dir_path (ops.machines ()) /= mh.name, // For diagnostics.
+          bootstrapped_machine_manifest {
+            machine_manifest {
+              move (mh.id),
+              move (mh.name),
+              move (mh.summary),
+              machine_type::kvm,
+              string ("de:ad:be:ef:de:ad"),
+              nullopt,
+              strings ()},
+            toolchain_manifest {tc_id},
+            bootstrap_manifest {}},
+            machine_lock ()});
+
+      return pr;
+    }
 
-    // Ignore hidden directories.
+    // Notice and warn if there are no machines (as opposed to all of them
+    // being locked).
     //
-    if (ve.type () != entry_type::directory || vn[0] == '.')
-      continue;
-
-    const dir_path vd (dir_path (machines) /= vn);
+    bool none (true);
 
-    // Inside we have machines.
+    // The first level are machine volumes.
     //
-    try
+    bool scratch (false);
+    for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow))
     {
-      for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
-      {
-        const string mn (me.path ().string ());
+      const string vn (ve.path ().string ());
 
-        if (me.type () != entry_type::directory || mn[0] == '.')
-          continue;
+      // Ignore hidden directories.
+      //
+      if (ve.type () != entry_type::directory || vn[0] == '.')
+        continue;
 
-        const dir_path md (dir_path (vd) /= mn);
+      const dir_path vd (dir_path (machines) /= vn);
 
-        // Our endgoal here is to obtain a bootstrapped snapshot of this
-        // machine while watching out for potential race conditions (other
-        // instances as well as machines being added/upgraded/removed; see the
-        // manual for details).
-        //
-        // So here is our overall plan:
-        //
-        // 1. Resolve current subvolume link for our bootstrap protocol.
-        //
-        // 2. Lock the machine. This excludes any other instance from trying
-        //    to perform the following steps.
-        //
-        // 3. If there is no link, cleanup old bootstrap (if any) and ignore
-        //    this machine.
-        //
-        // 4. Try to create a snapshot of current subvolume (this operation is
-        //    atomic). If failed (e.g., someone changed the link and removed
-        //    the subvolume in the meantime), retry from #1.
-        //
-        // 5. Compare the snapshot to the already bootstrapped version (if
-        //    any) and see if we need to re-bootstrap. If so, use the snapshot
-        //    as a starting point. Rename to bootstrapped at the end (atomic).
-        //
-        dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
-        dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
-
-        auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
+      // Inside we have machines.
+      //
+      try
+      {
+        for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
         {
-          run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
-          run_btrfs (trace, "subvolume", "delete", tp);
-        };
+          const string mn (me.path ().string ());
 
-        for (size_t retry (0);; ++retry)
-        {
-          if (retry != 0)
-            sleep (1);
+          if (me.type () != entry_type::directory || mn[0] == '.')
+            continue;
+
+          const dir_path md (dir_path (vd) /= mn);
 
-          // Resolve the link to subvolume path.
+          // Our endgoal here is to obtain a bootstrapped snapshot of this
+          // machine while watching out for potential race conditions (other
+          // instances as well as machines being added/upgraded/removed; see
+          // the manual for details).
+          //
+          // So here is our overall plan:
+          //
+          // 1. Resolve current subvolume link for our bootstrap protocol.
+          //
+          // 2. Lock the machine. This excludes any other instance from trying
+          //    to perform the following steps.
           //
-          dir_path sp; // <name>-<P>.<R>
+          // 3. If there is no link, cleanup old bootstrap (if any) and ignore
+          //    this machine.
+          //
+          // 4. Try to create a snapshot of current subvolume (this operation
+          //    is atomic). If failed (e.g., someone changed the link and
+          //    removed the subvolume in the meantime), retry from #1.
+          //
+          // 5. Compare the snapshot to the already bootstrapped version (if
+          //    any) and see if we need to re-bootstrap. If so, use the
+          //    snapshot as a starting point. Rename to bootstrapped at the
+          //    end (atomic).
+          //
+          dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
+          dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
 
-          try
+          auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
           {
-            sp = path_cast<dir_path> (readsymlink (lp));
+            run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
+            run_btrfs (trace, "subvolume", "delete", tp);
+          };
 
-            if (sp.relative ())
-              sp = md / sp;
-          }
-          catch (const system_error& e)
+          for (size_t retry (0);; ++retry)
           {
-            // Leave the subvolume path empty if the subvolume link doesn't
-            // exist and fail on any other error.
-            //
-            if (e.code ().category () != std::generic_category () ||
-                e.code ().value () != ENOENT)
-              fail << "unable to read subvolume link " << lp << ": " << e;
-          }
+            if (retry != 0)
+              sleep (1);
 
-          none = none && sp.empty ();
+            // Resolve the link to subvolume path.
+            //
+            dir_path sp; // <name>-<P>.<R>
 
-          // Try to lock the machine, skipping it if already locked.
-          //
-          optional<machine_lock> ml (lock_machine (tp));
+            try
+            {
+              sp = path_cast<dir_path> (readsymlink (lp));
 
-          if (!ml)
-          {
-            l4 ([&]{trace << "skipping " << md << ": locked";});
-            break;
-          }
+              if (sp.relative ())
+                sp = md / sp;
+            }
+            catch (const system_error& e)
+            {
+              // Leave the subvolume path empty if the subvolume link doesn't
+              // exist and fail on any other error.
+              //
+              if (e.code ().category () != std::generic_category () ||
+                  e.code ().value () != ENOENT)
+                fail << "unable to read subvolume link " << lp << ": " << e;
+            }
 
-          bool te (dir_exists (tp));
+            none = none && sp.empty ();
 
-          // If the resolution fails, then this means there is no current
-          // machine subvolume (for this bootstrap protocol). In this case we
-          // clean up our toolchain subvolume (-<toolchain>, if any) and
-          // ignore this machine.
-          //
-          if (sp.empty ())
-          {
-            if (te)
-              delete_bootstrapped ();
-
-            l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
-            break;
-          }
+            // Try to lock the machine, skipping it if already locked.
+            //
+            optional<machine_lock> ml (lock_machine (tp));
 
-          // <name>-<toolchain>-<xxx>
-          //
-          const dir_path xp (snapshot_path (tp));
+            if (!ml)
+            {
+              l4 ([&]{trace << "skipping " << md << ": locked";});
+              break;
+            }
 
-          if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
-          {
-            if (retry >= 10)
-              fail << "unable to snapshot subvolume " << sp;
+            bool te (dir_exists (tp));
 
-            continue;
-          }
+            // If the resolution fails, then this means there is no current
+            // machine subvolume (for this bootstrap protocol). In this case
+            // we clean up our toolchain subvolume (-<toolchain>, if any) and
+            // ignore this machine.
+            //
+            if (sp.empty ())
+            {
+              if (te)
+                delete_bootstrapped ();
 
-          // Load the (original) machine manifest.
-          //
-          auto mm (
-            parse_manifest<machine_manifest> (sp / "manifest", "machine"));
+              l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+              break;
+            }
 
-          // If we already have <name>-<toolchain>, see if it needs to be re-
-          // bootstrapped. Things that render it obsolete:
-          //
-          // 1. New machine revision  (compare machine ids).
-          // 2. New toolchain         (compare toolchain ids).
-          // 3. New bbot/libbbot      (compare versions).
-          //
-          // The last case has a complication: what should we do if we have
-          // bootstrapped a newer version of bbot? This would mean that we are
-          // about to be stopped and upgraded (and the upgraded version will
-          // probably be able to use the result). So we simply ignore this
-          // machine for this run.
+            // <name>-<toolchain>-<xxx>
+            //
+            const dir_path xp (snapshot_path (tp));
 
-          // Return -1 if older, 0 if the same, and +1 if newer.
-          //
-          auto compare_bbot = [] (const bootstrap_manifest& m) -> int
-          {
-            auto cmp = [&m] (const string& n, const char* v) -> int
+            if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
             {
-              standard_version sv (v);
-              auto i = m.versions.find (n);
+              if (retry >= 10)
+                fail << "unable to snapshot subvolume " << sp;
 
-              return (i == m.versions.end () || i->second < sv
-                      ? -1
-                      : i->second > sv ? 1 : 0);
-            };
+              continue;
+            }
 
-            // Start from the top assuming a new dependency cannot be added
-            // without changing the dependent's version.
+            // Load the (original) machine manifest.
             //
-            int r;
-            return
-              (r = cmp ("bbot",       BBOT_VERSION_STR)) != 0 ? r :
-              (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r :
-              (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r :
-              (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0;
-          };
+            auto mm (
+              parse_manifest<machine_manifest> (sp / "manifest", "machine"));
 
-          optional<bootstrapped_machine_manifest> bmm;
-          if (te)
-          {
-            bmm = parse_manifest<bootstrapped_machine_manifest> (
-              tp / "manifest", "bootstrapped machine");
+            // If we already have <name>-<toolchain>, see if it needs to be
+            // re-bootstrapped. Things that render it obsolete:
+            //
+            // 1. New machine revision  (compare machine ids).
+            // 2. New toolchain         (compare toolchain ids).
+            // 3. New bbot/libbbot      (compare versions).
+            //
+            // The last case has a complication: what should we do if we have
+            // bootstrapped a newer version of bbot? This would mean that we
+            // are about to be stopped and upgraded (and the upgraded version
+            // will probably be able to use the result). So we simply ignore
+            // this machine for this run.
 
-            if (bmm->machine.id != mm.id)
+            // Return -1 if older, 0 if the same, and +1 if newer.
+            //
+            auto compare_bbot = [] (const bootstrap_manifest& m) -> int
             {
-              l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
-              te = false;
-            }
+              auto cmp = [&m] (const string& n, const char* v) -> int
+              {
+                standard_version sv (v);
+                auto i = m.versions.find (n);
+
+                return (i == m.versions.end () || i->second < sv
+                        ? -1
+                        : i->second > sv ? 1 : 0);
+              };
+
+              // Start from the top assuming a new dependency cannot be added
+              // without changing the dependent's version.
+              //
+              int r;
+              return (
+                (r = cmp ("bbot",       BBOT_VERSION_STR)) != 0 ? r :
+                (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r :
+                (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r :
+                (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0);
+            };
 
-            if (!tc_id.empty () && bmm->toolchain.id != tc_id)
+            optional<bootstrapped_machine_manifest> bmm;
+            if (te)
             {
-              l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
-              te = false;
-            }
+              bmm = parse_manifest<bootstrapped_machine_manifest> (
+                tp / "manifest", "bootstrapped machine");
 
-            if (int i = compare_bbot (bmm->bootstrap))
-            {
-              if (i < 0)
+              if (bmm->machine.id != mm.id)
               {
-                l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";});
+                l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
                 te = false;
               }
-              else
+
+              if (!tc_id.empty () && bmm->toolchain.id != tc_id)
               {
-                l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
-                run_btrfs (trace, "subvolume", "delete", xp);
-                break;
+                l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
+                te = false;
               }
-            }
 
-            if (!te)
-              delete_bootstrapped ();
-          }
-          else
-            l3 ([&]{trace << "bootstrapping " << tp;});
-
-          if (!te)
-          {
-            // Use the <name>-<toolchain>-<xxx> snapshot that we have made to
-            // bootstrap the new machine. Then atomically rename it to
-            // <name>-<toolchain>.
-            //
-            // Also release all the machine locks that we have acquired so far
-            // since the bootstrap will take a while and other instances might
-            // be able to use them.
-            //
-            r.clear ();
-
-            bmm = bootstrap_machine (xp, mm, move (bmm));
+              if (int i = compare_bbot (bmm->bootstrap))
+              {
+                if (i < 0)
+                {
+                  l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";});
+                  te = false;
+                }
+                else
+                {
+                  l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
+                  run_btrfs (trace, "subvolume", "delete", xp);
+                  break;
+                }
+              }
 
-            if (!bmm)
-            {
-              l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
-              run_btrfs (trace, "subvolume", "delete", xp);
-              break;
+              if (!te)
+                delete_bootstrapped ();
             }
+            else
+              l3 ([&]{trace << "bootstrapping " << tp;});
 
-            try
-            {
-              mvdir (xp, tp);
-            }
-            catch (const system_error& e)
+            if (!te)
             {
-              fail << "unable to rename " << xp << " to " << tp;
-            }
+              // Use the <name>-<toolchain>-<xxx> snapshot that we have made
+              // to bootstrap the new machine. Then atomically rename it to
+              // <name>-<toolchain>.
+              //
+              // Also release all the machine locks that we have acquired so
+              // far as well as the global toolchain lock, since the bootstrap
+              // will take a while and other instances might be able to use
+              // them. Because we are releasing the global lock, we have to
+              // restart the enumeration process from scratch.
+              //
+              r.clear ();
+              pr.first.unlock ();
+              scratch = true;
+
+              bmm = bootstrap_machine (xp, mm, move (bmm));
+
+              if (!bmm)
+              {
+                l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
+                run_btrfs (trace, "subvolume", "delete", xp);
+                break;
+              }
 
-            l2 ([&]{trace << "bootstrapped " << bmm->machine.name;});
+              try
+              {
+                mvdir (xp, tp);
+              }
+              catch (const system_error& e)
+              {
+                fail << "unable to rename " << xp << " to " << tp;
+              }
 
-            // Check the bootstrapped bbot version as above and ignore this
-            // machine if it's newer than us.
-            //
-            if (int i = compare_bbot (bmm->bootstrap))
-            {
-              if (i > 0)
+              l2 ([&]{trace << "bootstrapped " << bmm->machine.name;});
+
+              // Check the bootstrapped bbot version as above and ignore this
+              // machine if it's newer than us.
+              //
+              if (int i = compare_bbot (bmm->bootstrap))
               {
-                l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
-                break;
+                if (i > 0)
+                  l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
+                else
+                  warn << "bootstrapped " << tp << " bbot worker is older "
+                       << "than agent; assuming test setup";
               }
-              else
-                warn << "bootstrapped " << tp << " bbot worker is older "
-                     << "than agent; assuming test setup";
+
+              break; // Restart from scratch.
             }
-          }
-          else
-            run_btrfs (trace, "subvolume", "delete", xp);
+            else
+              run_btrfs (trace, "subvolume", "delete", xp);
 
-          // Add the machine to the lists.
-          //
-          r.push_back (
-            bootstrapped_machine {move (tp), move (*bmm), move (*ml)});
+            // Add the machine to the lists.
+            //
+            r.push_back (
+              bootstrapped_machine {move (tp), move (*bmm), move (*ml)});
 
+            break;
+          } // Retry loop.
+
+          if (scratch)
+            break;
+
+        } // Inner dir_iterator loop.
+
+        if (scratch)
           break;
-        }
       }
-    }
-    catch (const system_error& e)
-    {
-      fail << "unable to iterate over " << vd << ": " << e;
-    }
-  }
+      catch (const system_error& e)
+      {
+        fail << "unable to iterate over " << vd << ": " << e;
+      }
+    } // Outer dir_iterator loop.
 
-  if (none)
-    warn << "no build machines for toolchain " << tc_name;
+    if (scratch)
+      continue;
 
-  return r;
+    if (none)
+      warn << "no build machines for toolchain " << tc_name;
+
+    return pr;
+
+  } // From-scratch retry loop.
+
+  // Unreachable.
 }
 catch (const system_error& e)
 {
@@ -1296,6 +1426,25 @@ try
 
   tc_name = ops.toolchain_name ();
   tc_num  = ops.toolchain_num ();
+
+  if (ops.toolchain_lock_specified ())
+  {
+    const string& l (ops.toolchain_lock ());
+
+    if (!l.empty ())
+    {
+      tc_lock = path (l);
+
+      if (!tc_lock.absolute ())
+        fail << "--toolchain-lock value '" << l << "' should be absolute path";
+    }
+  }
+  else if (!(ops.fake_bootstrap ()         ||
+             ops.fake_build ()             ||
+             ops.fake_machine_specified () ||
+             ops.fake_request_specified ()))
+    tc_lock = path ("/var/lock/bbot-agent-" + tc_name + ".lock");
+
   tc_ver  = (ops.toolchain_ver_specified ()
              ? ops.toolchain_ver ()
              : standard_version (BBOT_VERSION_STR));
@@ -1439,7 +1588,11 @@ try
 
   for (unsigned int sleep (0);; ::sleep (sleep), sleep = 0)
   {
-    bootstrapped_machines ms (enumerate_machines (ops.machines ()));
+    pair<toolchain_lock, bootstrapped_machines> er (
+      enumerate_machines (ops.machines ()));
+
+    toolchain_lock& tl (er.first);
+    bootstrapped_machines& ms (er.second);
 
     // Prepare task request.
     //
@@ -1611,7 +1764,8 @@ try
     // We have a build task.
     //
     // First find the index of the machine we were asked to use (and verify it
-    // is one of those we sent). Also unlock all the other machines.
+    // is one of those we sent). Also unlock all the other machines as well as
+    // the global toolchain lock.
     //
     size_t i (ms.size ());
     for (size_t j (0); j != ms.size (); ++j)
@@ -1622,6 +1776,8 @@ try
         ms[j].lock.unlock ();
     }
 
+    tl.unlock ();
+
     if (i == ms.size ())
     {
       error << "task from " << url << " for unknown machine "