From 704e151599a65173533c37723048680d23ad569b Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Wed, 8 May 2024 07:18:02 +0200
Subject: Retry machines directory iteration errors in agent (GH issue #349)

---
 bbot/agent/agent.cxx | 458 +++++++++++++++++++++++++++------------------------
 1 file changed, 241 insertions(+), 217 deletions(-)

(limited to 'bbot/agent/agent.cxx')
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 75f7228..5df470f 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -1073,10 +1073,11 @@ using bootstrapped_machines = vector<bootstrapped_machine>;
 
 static pair<toolchain_lock, bootstrapped_machines>
 enumerate_machines (const dir_path& machines)
-try
 {
   tracer trace ("enumerate_machines", machines.string ().c_str ());
 
+  size_t dir_iter_retry (0); // Directory iteration retry (see below).
+
   for (;;) // From-scratch retry loop for after bootstrap (see below).
   {
     pair<toolchain_lock, bootstrapped_machines> pr;
@@ -1149,280 +1150,307 @@ try
 
     // The first level are machine volumes.
     //
-    for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow))
+    try
     {
-      const string vn (ve.path ().string ());
-
-      // Ignore hidden directories.
-      //
-      if (ve.type () != entry_type::directory || vn[0] == '.')
-        continue;
-
-      const dir_path vd (dir_path (machines) /= vn);
-
-      // Inside we have machines.
-      //
-      try
+      for (const dir_entry& ve:
+             dir_iterator (machines, dir_iterator::no_follow))
       {
-        for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
-        {
-          const string mn (me.path ().string ());
+        const string vn (ve.path ().string ());
 
-          if (me.type () != entry_type::directory || mn[0] == '.')
-            continue;
+        // Ignore hidden directories.
+        //
+        if (ve.type () != entry_type::directory || vn[0] == '.')
+          continue;
 
-          const dir_path md (dir_path (vd) /= mn);
+        const dir_path vd (dir_path (machines) /= vn);
 
-          // Our endgoal here is to obtain a bootstrapped snapshot of this
-          // machine while watching out for potential race conditions (other
-          // instances as well as machines being added/upgraded/removed; see
-          // the manual for details).
-          //
-          // So here is our overall plan:
-          //
-          // 1. Resolve current subvolume link for our bootstrap protocol.
-          //
-          // 2. Lock the machine. This excludes any other instance from trying
-          //    to perform the following steps.
-          //
-          // 3. If there is no link, cleanup old bootstrap (if any) and ignore
-          //    this machine.
-          //
-          // 4. Try to create a snapshot of current subvolume (this operation
-          //    is atomic). If failed (e.g., someone changed the link and
-          //    removed the subvolume in the meantime), retry from #1.
-          //
-          // 5. Compare the snapshot to the already bootstrapped version (if
-          //    any) and see if we need to re-bootstrap. If so, use the
-          //    snapshot as a starting point. Rename to bootstrapped at the
-          //    end (atomic).
-          //
-          dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
-          dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
-
-          auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
+        // Inside we have machines.
+        //
+        try
+        {
+          for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
           {
-            run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
-            run_btrfs (trace, "subvolume", "delete", tp);
-          };
+            const string mn (me.path ().string ());
 
-          for (size_t retry (0);; ++retry)
-          {
-            if (retry != 0)
-              sleep (1);
+            if (me.type () != entry_type::directory || mn[0] == '.')
+              continue;
 
-            // Resolve the link to subvolume path.
+            const dir_path md (dir_path (vd) /= mn);
+
+            // Our endgoal here is to obtain a bootstrapped snapshot of this
+            // machine while watching out for potential race conditions (other
+            // instances as well as machines being added/upgraded/removed; see
+            // the manual for details).
+            //
+            // So here is our overall plan:
+            //
+            // 1. Resolve current subvolume link for our bootstrap protocol.
+            //
+            // 2. Lock the machine. This excludes any other instance from
+            //    trying to perform the following steps.
             //
-            dir_path sp; // <name>-<P>.<R>
+            // 3. If there is no link, cleanup old bootstrap (if any) and
+            //    ignore this machine.
+            //
+            // 4. Try to create a snapshot of current subvolume (this
+            //    operation is atomic). If failed (e.g., someone changed the
+            //    link and removed the subvolume in the meantime), retry from
+            //    #1.
+            //
+            // 5. Compare the snapshot to the already bootstrapped version (if
+            //    any) and see if we need to re-bootstrap. If so, use the
+            //    snapshot as a starting point. Rename to bootstrapped at the
+            //    end (atomic).
+            //
+            dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
+            dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
 
-            try
+            auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
             {
-              sp = path_cast<dir_path> (readsymlink (lp));
+              run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
+              run_btrfs (trace, "subvolume", "delete", tp);
+            };
 
-              if (sp.relative ())
-                sp = md / sp;
-            }
-            catch (const system_error& e)
+            for (size_t retry (0);; ++retry)
             {
-              // Leave the subvolume path empty if the subvolume link doesn't
-              // exist and fail on any other error.
-              //
-              if (e.code ().category () != std::generic_category () ||
-                  e.code ().value () != ENOENT)
-                fail << "unable to read subvolume link " << lp << ": " << e;
-            }
+              if (retry != 0)
+                sleep (1);
 
-            // Try to lock the machine.
-            //
-            machine_lock ml (lock_machine (tl, tp));
+              // Resolve the link to subvolume path.
+              //
+              dir_path sp; // <name>-<P>.<R>
 
-            if (!ml.locked ())
-            {
-              machine_manifest mm;
-              if (ml.prio)
+              try
               {
-                // Get the machine manifest (subset of the steps performed for
-                // the locked case below).
-                //
-                // Note that it's possible the machine we get is not what was
-                // originally locked by the other process (e.g., it has been
-                // upgraded since). It's also possible that if and when we
-                // interrupt and lock this machine, it will be a different
-                // machine (e.g., it has been upgraded since we read this
-                // machine manifest). To deal with all of that we will be
-                // reloading this information if/when we acquire the lock to
-                // this machine.
-                //
-                if (sp.empty ())
-                {
-                  l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
-                  break;
-                }
-
-                l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
-                              << " with priority " << *ml.prio;});
+                sp = path_cast<dir_path> (readsymlink (lp));
 
-                mm = parse_manifest<machine_manifest> (
-                  sp / "manifest", "machine");
-
-                none = none && mm.effective_role () == machine_role::auxiliary;
+                if (sp.relative ())
+                  sp = md / sp;
               }
-              else // Bootstrapping/suspended.
+              catch (const system_error& e)
               {
-                l3 ([&]{trace << "keeping " << md << ": being bootstrapped "
-                              << "or suspened by " << ml.pid;});
-
-                // Assume it is a build machine (we cannot determine whether
-                // it is build or auxiliary without loading its manifest).
+                // Leave the subvolume path empty if the subvolume link
+                // doesn't exist and fail on any other error.
                 //
-                none = false;
+                if (e.code ().category () != std::generic_category () ||
+                    e.code ().value () != ENOENT)
+                  fail << "unable to read subvolume link " << lp << ": " << e;
               }
 
-              // Add the machine to the lists and bail out.
+              // Try to lock the machine.
               //
-              r.push_back (bootstrapped_machine {
-                  move (ml),
-                  move (tp),
-                  bootstrapped_machine_manifest {move (mm), {}, {}}});
+              machine_lock ml (lock_machine (tl, tp));
 
-              break;
-            }
+              if (!ml.locked ())
+              {
+                machine_manifest mm;
+                if (ml.prio)
+                {
+                  // Get the machine manifest (subset of the steps performed
+                  // for the locked case below).
+                  //
+                  // Note that it's possible the machine we get is not what
+                  // was originally locked by the other process (e.g., it has
+                  // been upgraded since). It's also possible that if and when
+                  // we interrupt and lock this machine, it will be a
+                  // different machine (e.g., it has been upgraded since we
+                  // read this machine manifest). To deal with all of that we
+                  // will be reloading this information if/when we acquire the
+                  // lock to this machine.
+                  //
+                  if (sp.empty ())
+                  {
+                    l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+                    break;
+                  }
 
-            bool te (dir_exists (tp));
+                  l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
+                                << " with priority " << *ml.prio;});
 
-            // If the resolution fails, then this means there is no current
-            // machine subvolume (for this bootstrap protocol). In this case
-            // we clean up our toolchain subvolume (-<toolchain>, if any) and
-            // ignore this machine.
-            //
-            if (sp.empty ())
-            {
-              if (te)
-                delete_bootstrapped ();
+                  mm = parse_manifest<machine_manifest> (
+                    sp / "manifest", "machine");
 
-              l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
-              break;
-            }
+                  none =
+                    none && mm.effective_role () == machine_role::auxiliary;
+                }
+                else // Bootstrapping/suspended.
+                {
+                  l3 ([&]{trace << "keeping " << md << ": being bootstrapped "
+                                << "or suspened by " << ml.pid;});
 
-            // <name>-<toolchain>-<xxx>
-            //
-            dir_path xp (snapshot_path (tp));
+                  // Assume it is a build machine (we cannot determine whether
+                  // it is build or auxiliary without loading its manifest).
+                  //
+                  none = false;
+                }
 
-            if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
-            {
-              if (retry >= 10)
-                fail << "unable to snapshot subvolume " << sp;
+                // Add the machine to the lists and bail out.
+                //
+                r.push_back (bootstrapped_machine {
+                    move (ml),
+                    move (tp),
+                    bootstrapped_machine_manifest {move (mm), {}, {}}});
 
-              continue;
-            }
+                break;
+              }
 
-            // Load the (original) machine manifest.
-            //
-            machine_manifest mm (
-              parse_manifest<machine_manifest> (sp / "manifest", "machine"));
+              bool te (dir_exists (tp));
 
-            bool aux (mm.effective_role () == machine_role::auxiliary);
+              // If the resolution fails, then this means there is no current
+              // machine subvolume (for this bootstrap protocol). In this case
+              // we clean up our toolchain subvolume (-<toolchain>, if any)
+              // and ignore this machine.
+              //
+              if (sp.empty ())
+              {
+                if (te)
+                  delete_bootstrapped ();
 
-            // Skip machines for which we don't have sufficient RAM.
-            //
-            if (effective_ram_minimum (mm) >
-                (aux ? ops.auxiliary_ram () : ops.build_ram ()))
-            {
-              l3 ([&]{trace << "skipping " << md << ": insufficient RAM";});
-              run_btrfs (trace, "subvolume", "delete", xp);
-              break;
-            }
+                l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+                break;
+              }
 
-            none = none && aux;
+              // <name>-<toolchain>-<xxx>
+              //
+              dir_path xp (snapshot_path (tp));
 
-            // If we already have <name>-<toolchain>, see if it needs to be
-            // re-bootstrapped. Things that render it obsolete:
-            //
-            // 1. New machine revision  (compare machine ids).
-            // 2. New toolchain         (compare toolchain ids, not auxiliary).
-            // 3. New bbot/libbbot      (compare versions, not auxiliary).
-            //
-            // The last case has a complication: what should we do if we have
-            // bootstrapped a newer version of bbot? This would mean that we
-            // are about to be stopped and upgraded (and the upgraded version
-            // will probably be able to use the result). So we simply ignore
-            // this machine for this run.
-            //
-            // Note: see similar code in the machine interruption logic.
-            //
-            optional<bootstrapped_machine_manifest> bmm;
-            if (te)
-            {
-              bmm = parse_manifest<bootstrapped_machine_manifest> (
-                tp / "manifest", "bootstrapped machine");
+              if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
+              {
+                if (retry >= 10)
+                  fail << "unable to snapshot subvolume " << sp;
 
-              if (bmm->machine.id != mm.id)
+                continue;
+              }
+
+              // Load the (original) machine manifest.
+              //
+              machine_manifest mm (
+                parse_manifest<machine_manifest> (sp / "manifest", "machine"));
+
+              bool aux (mm.effective_role () == machine_role::auxiliary);
+
+              // Skip machines for which we don't have sufficient RAM.
+              //
+              if (effective_ram_minimum (mm) >
+                  (aux ? ops.auxiliary_ram () : ops.build_ram ()))
               {
-                l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";});
-                te = false;
+                l3 ([&]{trace << "skipping " << md << ": insufficient RAM";});
+                run_btrfs (trace, "subvolume", "delete", xp);
+                break;
               }
 
-              if (!aux)
+              none = none && aux;
+
+              // If we already have <name>-<toolchain>, see if it needs to be
+              // re-bootstrapped. Things that render it obsolete:
+              //
+              // 1. New machine revision  (compare machine ids).
+              // 2. New toolchain         (compare toolchain ids, not auxiliary).
+              // 3. New bbot/libbbot      (compare versions, not auxiliary).
+              //
+              // The last case has a complication: what should we do if we
+              // have bootstrapped a newer version of bbot? This would mean
+              // that we are about to be stopped and upgraded (and the
+              // upgraded version will probably be able to use the result). So
+              // we simply ignore this machine for this run.
+              //
+              // Note: see similar code in the machine interruption logic.
+              //
+              optional<bootstrapped_machine_manifest> bmm;
+              if (te)
               {
-                if (!tc_id.empty () && bmm->toolchain.id != tc_id)
+                bmm = parse_manifest<bootstrapped_machine_manifest> (
+                  tp / "manifest", "bootstrapped machine");
+
+                if (bmm->machine.id != mm.id)
                 {
-                  l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";});
+                  l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";});
                   te = false;
                 }
 
-                if (int i = compare_bbot (bmm->bootstrap))
+                if (!aux)
                 {
-                  if (i < 0)
+                  if (!tc_id.empty () && bmm->toolchain.id != tc_id)
                   {
-                    l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";});
+                    l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";});
                     te = false;
                   }
-                  else
+
+                  if (int i = compare_bbot (bmm->bootstrap))
                   {
-                    l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
-                    run_btrfs (trace, "subvolume", "delete", xp);
-                    break;
+                    if (i < 0)
+                    {
+                      l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";});
+                      te = false;
+                    }
+                    else
+                    {
+                      l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
+                      run_btrfs (trace, "subvolume", "delete", xp);
+                      break;
+                    }
                   }
                 }
+
+                if (!te)
+                  delete_bootstrapped ();
               }
+              else
+                l3 ([&]{trace << "bootstrap " << tp;});
 
               if (!te)
-                delete_bootstrapped ();
-            }
-            else
-              l3 ([&]{trace << "bootstrap " << tp;});
-
-            if (!te)
-            {
-              // Ignore any other machines that need bootstrapping.
-              //
-              if (!pboot)
               {
-                pboot = pending_bootstrap {
-                  move (ml), move (tp), move (xp), move (mm), move (bmm)};
+                // Ignore any other machines that need bootstrapping.
+                //
+                if (!pboot)
+                {
+                  pboot = pending_bootstrap {
+                    move (ml), move (tp), move (xp), move (mm), move (bmm)};
+                }
+                else
+                  run_btrfs (trace, "subvolume", "delete", xp);
+
+                break;
               }
               else
                 run_btrfs (trace, "subvolume", "delete", xp);
 
+              // Add the machine to the lists.
+              //
+              r.push_back (
+                bootstrapped_machine {move (ml), move (tp), move (*bmm)});
+
               break;
-            }
-            else
-              run_btrfs (trace, "subvolume", "delete", xp);
+            } // Retry loop.
+          } // Inner dir_iterator loop.
+        }
+        catch (const system_error& e)
+        {
+          fail << "unable to iterate over " << vd << ": " << e;
+        }
+      } // Outer dir_iterator loop.
+    }
+    catch (const system_error& e)
+    {
+      // Once in a while we get ENOENT while iterating over the machines
+      // directory. This directory contains the machine directories (not
+      // subvolumes) and is not being changed when we get this error. Maybe
+      // this is due to directory sizes/timestamps changes, but then we would
+      // expect to get this error a lot more often..? So this feels like a
+      // btrfs bug which we are going to retry a few times. See GH issue #349
+      // for additional information.
+      //
+      bool retry (dir_iter_retry++ != 2);
 
-            // Add the machine to the lists.
-            //
-            r.push_back (
-              bootstrapped_machine {move (ml), move (tp), move (*bmm)});
+      (retry ? warn : error) << "unable to iterate over " << machines
+                             << ": " << e;
+      if (retry)
+        continue; // Re-enumerate from scratch.
+      else
+        throw failed ();
+    }
 
-            break;
-          } // Retry loop.
-        } // Inner dir_iterator loop.
-      }
-      catch (const system_error& e)
-      {
-        fail << "unable to iterate over " << vd << ": " << e;
-      }
-    } // Outer dir_iterator loop.
+    dir_iter_retry = 0; // Reset for re-enumeration due to other reasons.
 
     // See if there is a pending bootstrap and whether we can perform it.
     //
@@ -1533,10 +1561,6 @@ try
 
   // Unreachable.
 }
-catch (const system_error& e)
-{
-  fail << "unable to iterate over " << machines << ": " << e << endf;
-}
 
 // Perform the build task throwing interrupt if it has been interrupted.
 //
-- 
cgit v1.1