aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bbot/agent/agent.cxx163
1 files changed, 145 insertions, 18 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 17e3ac5..16cec19 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -6,10 +6,11 @@
#include <pwd.h> // getpwuid()
#include <limits.h> // PATH_MAX
#include <signal.h> // signal()
-#include <stdlib.h> // rand_r()
-#include <unistd.h> // sleep(), getuid(), fsync(), [f]stat()
+#include <stdlib.h> // rand_r(), strto[u]ll()
+#include <string.h> // strchr()
+#include <unistd.h> // sleep(), getpid(), getuid(), fsync(), [f]stat()
#include <ifaddrs.h> // getifaddrs(), freeifaddrs()
-#include <sys/types.h> // stat
+#include <sys/types.h> // stat, pid_t
#include <sys/stat.h> // [f]stat()
#include <sys/file.h> // flock()
@@ -402,9 +403,12 @@ class toolchain_lock
public:
toolchain_lock () = default; // Empty lock.
- ~toolchain_lock ()
+ // Note: returns true if locking is disabled.
+ //
+ bool
+ locked () const
{
- unlock (true /* ignore_errors */);
+ return tc_lock.empty () || fl_;
}
void
@@ -419,6 +423,11 @@ public:
}
}
+ ~toolchain_lock ()
+ {
+ unlock (true /* ignore_errors */);
+ }
+
toolchain_lock (toolchain_lock&&) = default;
toolchain_lock& operator= (toolchain_lock&&) = default;
@@ -475,11 +484,18 @@ lock_toolchain (unsigned int timeout)
class machine_lock
{
public:
- machine_lock () = default; // Empty lock.
+ // A lock is either locked by this process or it contains information
+ // about the process holding the lock.
+ //
+ pid_t pid; // Process using the machine.
+ optional<uint64_t> prio; // Task priority (absent means being bootstrapped).
- ~machine_lock ()
+ machine_lock () = default; // Uninitialized lock.
+
+ bool
+ locked () const
{
- unlock (true /* ignore_errors */);
+ return fl_;
}
void
@@ -496,6 +512,40 @@ public:
}
}
+ // Write the holding process information to the lock file.
+ //
+ // Must be called while holding the toolchain lock (see the lock_machine()
+ // implementation for rationale).
+ //
+ void
+ write (const toolchain_lock& tl, optional<uint64_t> prio)
+ {
+ assert (tl.locked () && fl_);
+
+ pid_t pid (getpid ());
+
+ string l (to_string (pid));
+
+ if (prio)
+ {
+ l += ' ';
+ l += to_string (*prio);
+ }
+
+ auto n (fdwrite (fd_.get (), l.c_str (), l.size ()));
+
+ if (n == -1)
+ throw_generic_ios_failure (errno);
+
+ if (static_cast<size_t> (n) != l.size ())
+ throw_generic_ios_failure (EFBIG);
+ }
+
+ ~machine_lock ()
+ {
+ unlock (true /* ignore_errors */);
+ }
+
machine_lock (machine_lock&&) = default;
machine_lock& operator= (machine_lock&&) = default;
@@ -508,27 +558,88 @@ public:
machine_lock (path&& fp, auto_fd&& fd)
: fp_ (move (fp)), fd_ (move (fd)), fl_ (true) {}
+ machine_lock (pid_t pi, optional<uint64_t> pr)
+ : pid (pi), prio (pr), fl_ (false) {}
+
private:
path fp_;
auto_fd fd_;
bool fl_ = false;
};
-// Try to lock the machine given its -<toolchain> directory.
+// Try to lock the machine given its -<toolchain> directory. Return unlocked
+// lock with pid/prio if already in use. Must be called while holding the
+// toolchain lock.
//
-static optional<machine_lock>
-lock_machine (const dir_path& tp)
+static machine_lock
+lock_machine (const toolchain_lock& tl, const dir_path& tp)
{
+ assert (tl.locked ());
+
path fp (tp + ".lock"); // The -<toolchain>.lock file.
for (;;)
{
- auto_fd fd (fdopen (fp, fdopen_mode::out | fdopen_mode::create));
+ auto_fd fd (fdopen (fp, (fdopen_mode::in |
+ fdopen_mode::out |
+ fdopen_mode::create)));
if (flock (fd.get (), LOCK_EX | LOCK_NB) != 0)
{
if (errno == EWOULDBLOCK)
- return nullopt;
+ {
+ // The file should contain a line in the following format:
+ //
+ // <pid>[ <prio>]
+ //
+ char buf[64]; // Sufficient for 2 64-bit numbers (20 decimals max).
+
+ auto sn (fdread (fd.get (), buf, sizeof (buf)));
+
+ if (sn == -1)
+ throw_generic_ios_failure (errno);
+
+ size_t n (static_cast<size_t> (sn));
+
+ // While there would be a race between locking the file then writing
+ // to it in one process and reading from it in another process, we are
+ // protected by the global toolchain lock, which must be held by both
+ // sides during this dance.
+ //
+ assert (n > 0 && n < sizeof (buf));
+ buf[n] = '\0';
+
+ // Note also that it's possible that by the time we read the pid/prio
+ // the lock has already been released. But this case is no different
+ // from the lock being released after we have read pid/prio but before
+ // acting on this information (e.g., trying to interrupt the other
+ // process), which we have to deal with anyway.
+ //
+ pid_t pid;
+ optional<uint64_t> prio;
+ {
+ char* p (strchr (buf, ' '));
+ char* e;
+
+ {
+ errno = 0;
+ pid = strtoll (buf, &e, 10); // Note: pid_t is signed.
+ assert (errno != ERANGE &&
+ e != buf &&
+ (p != nullptr ? e == p : *e == '\0'));
+ }
+
+ if (p != nullptr)
+ {
+ ++p;
+ errno = 0;
+ prio = strtoull (p, &e, 10);
+ assert (errno != ERANGE && e != p && *e == '\0');
+ }
+ }
+
+ return machine_lock (pid, prio);
+ }
throw_generic_error (errno);
}
@@ -589,6 +700,7 @@ try
pr.first = move (*l);
}
+ toolchain_lock& tl (pr.first);
bootstrapped_machines& r (pr.second);
if (ops.fake_machine_specified ())
@@ -711,11 +823,20 @@ try
// Try to lock the machine, skipping it if already locked.
//
- optional<machine_lock> ml (lock_machine (tp));
+ machine_lock ml (lock_machine (tl, tp));
- if (!ml)
+ if (!ml.locked ())
{
- l4 ([&]{trace << "skipping " << md << ": locked";});
+ if (verb >= 1) // @@ TMP: restore l4 tracing.
+ {
+ diag_record dr (trace);
+ dr << "skipping " << md << ": locked by " << ml.pid
+ << " with priority ";
+ if (ml.prio)
+ dr << *ml.prio;
+ else
+ dr << "<bootstrap>";
+ }
break;
}
@@ -842,7 +963,8 @@ try
// restart the enumeration process from scratch.
//
r.clear ();
- pr.first.unlock ();
+ ml.write (tl, nullopt /* prio */); // Being bootstrapped.
+ tl.unlock ();
scratch = true;
bmm = bootstrap_machine (xp, mm, move (bmm));
@@ -885,7 +1007,7 @@ try
// Add the machine to the lists.
//
r.push_back (
- bootstrapped_machine {move (tp), move (*bmm), move (*ml)});
+ bootstrapped_machine {move (tp), move (*bmm), move (ml)});
break;
} // Retry loop.
@@ -1771,7 +1893,12 @@ try
for (size_t j (0); j != ms.size (); ++j)
{
if (tq.machines[j].name == tr.task->machine)
+ {
+ if (!ops.fake_machine_specified ())
+ ms[j].lock.write (tl, 1234 /* prio */);
+
i = j;
+ }
else
ms[j].lock.unlock ();
}