From 330db1f15d95537e288b4c371a26e43b5a9b2196 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Wed, 12 May 2021 10:42:42 +0200
Subject: Deal with helper thread starvation during phase switching

The implemented solution entails shadowing old phase queues so that helpers
don't pick up old phase tasks and boosting the max_threads count so that we
can create more helpers if all the existing ones are stuck in the old phase.
---
 libbuild2/scheduler.hxx | 81 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 15 deletions(-)

(limited to 'libbuild2/scheduler.hxx')

diff --git a/libbuild2/scheduler.hxx b/libbuild2/scheduler.hxx
index 9556caa..b85d353 100644
--- a/libbuild2/scheduler.hxx
+++ b/libbuild2/scheduler.hxx
@@ -135,6 +135,35 @@ namespace build2
           L& lock,
           work_queue = work_all);
 
+    // Sub-phases.
+    //
+    // Note that these functions should be called while holding the lock
+    // protecting the phase transition, when there are no longer any threads
+    // in the old phase nor yet any threads in the new phase (or, equivalent,
+    // for example, if the old phase does not utilize the scheduler).
+    //
+    // In particular, for push, while we don't expect any further calls to
+    // async() or wait() in the old phase (until pop), there could still be
+    // active threads that haven't had a chance to deactivated themselves yet.
+    // For pop there should be no remaining tasks queued corresponding to the
+    // phase being popped.
+    //
+    void
+    push_phase ();
+
+    void
+    pop_phase ();
+
+    struct phase_guard
+    {
+      explicit
+      phase_guard (scheduler& s): s_ (s) {s_.push_phase ();}
+      ~phase_guard () {s_.pop_phase ();}
+
+    private:
+      scheduler& s_;
+    };
+
     // Mark the queue so that we don't work any tasks that may already be
     // there. In the normal "bunch of acync() calls followed by wait()"
     // cases this happens automatically but in special cases where async()
@@ -663,29 +692,43 @@ namespace build2
     //
     size_t task_queue_depth_; // Multiple of max_active.
 
-    struct task_queue
+    // Our task queue is circular with head being the index of the first
+    // element and tail -- of the last. Since this makes the empty and one
+    // element cases indistinguishable, we also keep the size.
+    //
+    // The mark is an index somewhere between (figuratively speaking) head and
+    // tail, if enabled. If the mark is hit, then it is disabled until the
+    // queue becomes empty or it is reset by a push.
+    //
+    struct task_queue_data
     {
-      std::mutex mutex;
-      bool shutdown = false;
-
-      size_t stat_full = 0; // Number of times push() returned NULL.
-
-      // Our task queue is circular with head being the index of the first
-      // element and tail -- of the last. Since this makes the empty and one
-      // element cases indistinguishable, we also keep the size.
-      //
-      // The mark is an index somewhere between (figuratively speaking) head
-      // and tail, if enabled. If the mark is hit, then it is disabled until
-      // the queue becomes empty or it is reset by a push.
-      //
       size_t head = 0;
       size_t mark = 0;
       size_t tail = 0;
       size_t size = 0;
 
       unique_ptr<task_data[]> data;
+    };
+
+    struct task_queue: task_queue_data
+    {
+      std::mutex mutex;
+      bool shutdown = false;
 
-      task_queue (size_t depth): data (new task_data[depth]) {}
+      size_t stat_full = 0; // Number of times push() returned NULL.
+
+      task_queue (size_t depth) {data.reset (new task_data[depth]);}
+
+      void
+      swap (task_queue_data& d)
+      {
+        using std::swap;
+        swap (head, d.head);
+        swap (mark, d.mark);
+        swap (tail, d.tail);
+        swap (size, d.size);
+        swap (data, d.data);
+      }
     };
 
     // Task queue API. Expects the queue mutex to be locked.
@@ -857,6 +900,14 @@ namespace build2
     static void
     queue (task_queue*) noexcept;
 
+    // Sub-phases.
+    //
+    small_vector<vector<task_queue_data>, 2> phase_;
+
+    size_t idle_reserve_;
+    size_t old_max_threads_;
+    size_t old_eff_max_threads_;
+
   private:
     optional<size_t>
     wait_impl (size_t, const atomic_count&, work_queue);
-- 
cgit v1.1