From 26f669cc6ecf7c3957bcb6caef26d203b5e582a7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sun, 14 May 2023 08:29:50 +0200 Subject: Treat suspended machines same as being bootstrapped for interrupt purposes --- bbot/agent/agent.cxx | 121 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 36 deletions(-) (limited to 'bbot') diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 1b35ee0..8e0caf6 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -487,11 +487,12 @@ lock_toolchain (unsigned int timeout) class machine_lock { public: - // A lock is either locked by this process or it contains information - // about the process holding the lock. + // A lock is either locked by this process or it contains information about + // the process holding the lock. // pid_t pid; // Process using the machine. - optional prio; // Task priority (absent means being bootstrapped). + optional prio; // Task priority (absent means being bootstrapped + // or have been suspended). machine_lock () = default; // Uninitialized lock. @@ -524,29 +525,45 @@ public: // implementation for rationale). // void - write (const toolchain_lock& tl, optional prio) + bootstrap (const toolchain_lock& tl) { assert (tl.locked () && fl_); if (fd_ != nullfd) - { - pid_t pid (getpid ()); + write (nullopt); + } - string l (to_string (pid)); + void + perform_task (const toolchain_lock& tl, uint64_t prio) + { + assert (tl.locked () && fl_); - if (prio) - { - l += ' '; - l += to_string (*prio); - } + if (fd_ != nullfd) + write (prio); + } - auto n (fdwrite (fd_.get (), l.c_str (), l.size ())); + // Truncate the holding process information after the call to perform_task() + // so that it doesn't contain the priority, marking the machine as being + // suspended. + // + // Note that this one can be called without holding the toolchain lock. + // + void + suspend_task () + { + assert (fl_); - if (n == -1) - throw_generic_ios_failure (errno); + if (fd_ != nullfd) + { + assert (tp_ != 0); // Must be called after perform_task(). - if (static_cast (n) != l.size ()) - throw_generic_ios_failure (EFBIG); + // While there is no direct statement to this effect in POSIX, the + // consensus on the internet is that truncation is atomic, in a sense + // that the reader shouldn't see a partially truncated content. Feels + // like should be doubly so when actually truncating as opposed to + // extending the size, which is what we do. + // + fdtruncate (fd_.get (), tp_); } } @@ -573,9 +590,35 @@ public: : pid (pi), prio (pr), fl_ (false) {} private: - path fp_; - auto_fd fd_; - bool fl_ = false; + void + write (optional prio) + { + pid_t pid (getpid ()); + + string l (to_string (pid)); + + if (prio) + { + tp_ = l.size (); // Truncate position. + + l += ' '; + l += to_string (*prio); + } + + auto n (fdwrite (fd_.get (), l.c_str (), l.size ())); + + if (n == -1) + throw_generic_ios_failure (errno); + + if (static_cast (n) != l.size ()) + throw_generic_ios_failure (EFBIG); + } + +private: + path fp_; + auto_fd fd_; + bool fl_ = false; + uint64_t tp_ = 0; // Truncate position. }; // Try to lock the machine given its - directory. Return unlocked @@ -686,12 +729,12 @@ snapshot_path (const dir_path& tp) // // Note that this function returns both machines that this process managed to // lock as well as the machines locked by other processes (including those -// that are being bootstrapped), in case the caller needs to interrupt one of -// them for a higher-priority task. In the latter case, the manifest is empty -// if the machine is being bootstrapped and only has the machine_manifest -// information otherwise. (The bootstrapped machines have to be returned to -// get the correct count of currently active instances for the inst_max -// comparison.) +// that are being bootstrapped or that have been suspended), in case the +// caller needs to interrupt one of them for a higher-priority task. In the +// latter case, the manifest is empty if the machine is bootstrapping or +// suspended and only has the machine_manifest information otherwise. (The +// bootstrapping/suspended machines have to be returned to get the correct +// count of currently active instances for the inst_max comparison.) // struct bootstrapped_machine { @@ -776,7 +819,7 @@ try }; // Notice and warn if there are no machines (as opposed to all of them - // being locked). + // being busy). // bool none (true); @@ -921,10 +964,10 @@ try mm = parse_manifest ( sp / "manifest", "machine"); } - else // Being bootstrapped. + else // Bootstrapping/suspended. { l1 ([&]{trace << "keeping " << md << ": being bootstrapped " - << "by " << ml.pid;}); + << "or suspened by " << ml.pid;}); } // Add the machine to the lists and bail out. @@ -1101,7 +1144,7 @@ try // from scratch. // r.clear (); - ml.write (tl, nullopt /* prio */); // Being bootstrapped. + ml.bootstrap (tl); tl.unlock (); optional bmm ( @@ -1155,7 +1198,8 @@ catch (const system_error& e) } static result_manifest -perform_task (const dir_path& md, +perform_task (machine_lock& ml, + const dir_path& md, const bootstrapped_machine_manifest& mm, const task_manifest& tm) try @@ -1304,7 +1348,7 @@ try try {m->forcedown (false);} catch (const failed&) {} })); - auto soft_fail = [&xp, &m, &r] (const char* msg) + auto soft_fail = [&ml, &xp, &m, &r] (const char* msg) { { diag_record dr (error); @@ -1314,6 +1358,11 @@ try try { + // Update the information in the machine lock to signal that the + // machine is suspended and cannot be interrupted. + // + ml.suspend_task (); + m->suspend (false); m->wait (false); m->cleanup (); @@ -1900,7 +1949,7 @@ try if (busy == inst_max) { - if (!prio) // All being bootstrapped. + if (!prio) // All bootstrapping/suspended. { sleep = rand_sleep (); continue; @@ -1960,7 +2009,7 @@ try { // @@ For now skip machines locked by other processes. // - // @@ Note: skip machines being bootstrapped. + // @@ Note: skip machines bootstrapping/suspended. // if (m.lock.locked ()) tq.machines.emplace_back (m.manifest.machine.id, @@ -2141,7 +2190,7 @@ try { if (mh.name == m.manifest.machine.name) { - m.lock.write (tl, prio); + m.lock.perform_task (tl, prio); pm = &m; } else @@ -2188,7 +2237,7 @@ try if (!tr.agent_checksum || *tr.agent_checksum != agent_checksum) t.worker_checksum = nullopt; - result_manifest r (perform_task (m.path, m.manifest, t)); + result_manifest r (perform_task (m.lock, m.path, m.manifest, t)); m.lock.unlock (); // No need to hold the lock any longer. -- cgit v1.1