From f45e240682302d4d67c35292ff18408858e3c3e7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 10 May 2023 13:11:13 +0200 Subject: Save holding process information (pid, priority) to machine lock This is the ground work for the task priority/interrupt support. --- bbot/agent/agent.cxx | 163 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 145 insertions(+), 18 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 17e3ac5..16cec19 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -6,10 +6,11 @@ #include // getpwuid() #include // PATH_MAX #include // signal() -#include // rand_r() -#include // sleep(), getuid(), fsync(), [f]stat() +#include // rand_r(), strto[u]ll() +#include // strchr() +#include // sleep(), getpid(), getuid(), fsync(), [f]stat() #include // getifaddrs(), freeifaddrs() -#include // stat +#include // stat, pid_t #include // [f]stat() #include // flock() @@ -402,9 +403,12 @@ class toolchain_lock public: toolchain_lock () = default; // Empty lock. - ~toolchain_lock () + // Note: returns true if locking is disabled. + // + bool + locked () const { - unlock (true /* ignore_errors */); + return tc_lock.empty () || fl_; } void @@ -419,6 +423,11 @@ public: } } + ~toolchain_lock () + { + unlock (true /* ignore_errors */); + } + toolchain_lock (toolchain_lock&&) = default; toolchain_lock& operator= (toolchain_lock&&) = default; @@ -475,11 +484,18 @@ lock_toolchain (unsigned int timeout) class machine_lock { public: - machine_lock () = default; // Empty lock. + // A lock is either locked by this process or it contains information + // about the process holding the lock. + // + pid_t pid; // Process using the machine. + optional prio; // Task priority (absent means being bootstrapped). - ~machine_lock () + machine_lock () = default; // Uninitialized lock. + + bool + locked () const { - unlock (true /* ignore_errors */); + return fl_; } void @@ -496,6 +512,40 @@ public: } } + // Write the holding process information to the lock file. + // + // Must be called while holding the toolchain lock (see the lock_machine() + // implementation for rationale). + // + void + write (const toolchain_lock& tl, optional prio) + { + assert (tl.locked () && fl_); + + pid_t pid (getpid ()); + + string l (to_string (pid)); + + if (prio) + { + l += ' '; + l += to_string (*prio); + } + + auto n (fdwrite (fd_.get (), l.c_str (), l.size ())); + + if (n == -1) + throw_generic_ios_failure (errno); + + if (static_cast (n) != l.size ()) + throw_generic_ios_failure (EFBIG); + } + + ~machine_lock () + { + unlock (true /* ignore_errors */); + } + machine_lock (machine_lock&&) = default; machine_lock& operator= (machine_lock&&) = default; @@ -508,27 +558,88 @@ public: machine_lock (path&& fp, auto_fd&& fd) : fp_ (move (fp)), fd_ (move (fd)), fl_ (true) {} + machine_lock (pid_t pi, optional pr) + : pid (pi), prio (pr), fl_ (false) {} + private: path fp_; auto_fd fd_; bool fl_ = false; }; -// Try to lock the machine given its - directory. +// Try to lock the machine given its - directory. Return unlocked +// lock with pid/prio if already in use. Must be called while holding the +// toolchain lock. // -static optional -lock_machine (const dir_path& tp) +static machine_lock +lock_machine (const toolchain_lock& tl, const dir_path& tp) { + assert (tl.locked ()); + path fp (tp + ".lock"); // The -.lock file. for (;;) { - auto_fd fd (fdopen (fp, fdopen_mode::out | fdopen_mode::create)); + auto_fd fd (fdopen (fp, (fdopen_mode::in | + fdopen_mode::out | + fdopen_mode::create))); if (flock (fd.get (), LOCK_EX | LOCK_NB) != 0) { if (errno == EWOULDBLOCK) - return nullopt; + { + // The file should contain a line in the following format: + // + // [ ] + // + char buf[64]; // Sufficient for 2 64-bit numbers (20 decimals max). + + auto sn (fdread (fd.get (), buf, sizeof (buf))); + + if (sn == -1) + throw_generic_ios_failure (errno); + + size_t n (static_cast (sn)); + + // While there would be a race between locking the file then writing + // to it in one process and reading from it in another process, we are + // protected by the global toolchain lock, which must be held by both + // sides during this dance. + // + assert (n > 0 && n < sizeof (buf)); + buf[n] = '\0'; + + // Note also that it's possible that by the time we read the pid/prio + // the lock has already been released. But this case is no different + // from the lock being released after we have read pid/prio but before + // acting on this information (e.g., trying to interrupt the other + // process), which we have to deal with anyway. + // + pid_t pid; + optional prio; + { + char* p (strchr (buf, ' ')); + char* e; + + { + errno = 0; + pid = strtoll (buf, &e, 10); // Note: pid_t is signed. + assert (errno != ERANGE && + e != buf && + (p != nullptr ? e == p : *e == '\0')); + } + + if (p != nullptr) + { + ++p; + errno = 0; + prio = strtoull (p, &e, 10); + assert (errno != ERANGE && e != p && *e == '\0'); + } + } + + return machine_lock (pid, prio); + } throw_generic_error (errno); } @@ -589,6 +700,7 @@ try pr.first = move (*l); } + toolchain_lock& tl (pr.first); bootstrapped_machines& r (pr.second); if (ops.fake_machine_specified ()) @@ -711,11 +823,20 @@ try // Try to lock the machine, skipping it if already locked. // - optional ml (lock_machine (tp)); + machine_lock ml (lock_machine (tl, tp)); - if (!ml) + if (!ml.locked ()) { - l4 ([&]{trace << "skipping " << md << ": locked";}); + if (verb >= 1) // @@ TMP: restore l4 tracing. + { + diag_record dr (trace); + dr << "skipping " << md << ": locked by " << ml.pid + << " with priority "; + if (ml.prio) + dr << *ml.prio; + else + dr << ""; + } break; } @@ -842,7 +963,8 @@ try // restart the enumeration process from scratch. // r.clear (); - pr.first.unlock (); + ml.write (tl, nullopt /* prio */); // Being bootstrapped. + tl.unlock (); scratch = true; bmm = bootstrap_machine (xp, mm, move (bmm)); @@ -885,7 +1007,7 @@ try // Add the machine to the lists. // r.push_back ( - bootstrapped_machine {move (tp), move (*bmm), move (*ml)}); + bootstrapped_machine {move (tp), move (*bmm), move (ml)}); break; } // Retry loop. @@ -1771,7 +1893,12 @@ try for (size_t j (0); j != ms.size (); ++j) { if (tq.machines[j].name == tr.task->machine) + { + if (!ops.fake_machine_specified ()) + ms[j].lock.write (tl, 1234 /* prio */); + i = j; + } else ms[j].lock.unlock (); } -- cgit v1.1