aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2024-05-08 07:18:02 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2024-05-08 07:23:25 +0200
commit704e151599a65173533c37723048680d23ad569b (patch)
tree30caf11733d9666482679997728a576340bd374a
parenta0ef4ec1ecd34e277f021dc18bcaad7767c8ab11 (diff)
Retry machines directory iteration errors in agent (GH issue #349)
-rw-r--r--bbot/agent/agent.cxx458
1 files changed, 241 insertions, 217 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 75f7228..5df470f 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -1073,10 +1073,11 @@ using bootstrapped_machines = vector<bootstrapped_machine>;
static pair<toolchain_lock, bootstrapped_machines>
enumerate_machines (const dir_path& machines)
-try
{
tracer trace ("enumerate_machines", machines.string ().c_str ());
+ size_t dir_iter_retry (0); // Directory iteration retry (see below).
+
for (;;) // From-scratch retry loop for after bootstrap (see below).
{
pair<toolchain_lock, bootstrapped_machines> pr;
@@ -1149,280 +1150,307 @@ try
// The first level are machine volumes.
//
- for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow))
+ try
{
- const string vn (ve.path ().string ());
-
- // Ignore hidden directories.
- //
- if (ve.type () != entry_type::directory || vn[0] == '.')
- continue;
-
- const dir_path vd (dir_path (machines) /= vn);
-
- // Inside we have machines.
- //
- try
+ for (const dir_entry& ve:
+ dir_iterator (machines, dir_iterator::no_follow))
{
- for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
- {
- const string mn (me.path ().string ());
+ const string vn (ve.path ().string ());
- if (me.type () != entry_type::directory || mn[0] == '.')
- continue;
+ // Ignore hidden directories.
+ //
+ if (ve.type () != entry_type::directory || vn[0] == '.')
+ continue;
- const dir_path md (dir_path (vd) /= mn);
+ const dir_path vd (dir_path (machines) /= vn);
- // Our endgoal here is to obtain a bootstrapped snapshot of this
- // machine while watching out for potential race conditions (other
- // instances as well as machines being added/upgraded/removed; see
- // the manual for details).
- //
- // So here is our overall plan:
- //
- // 1. Resolve current subvolume link for our bootstrap protocol.
- //
- // 2. Lock the machine. This excludes any other instance from trying
- // to perform the following steps.
- //
- // 3. If there is no link, cleanup old bootstrap (if any) and ignore
- // this machine.
- //
- // 4. Try to create a snapshot of current subvolume (this operation
- // is atomic). If failed (e.g., someone changed the link and
- // removed the subvolume in the meantime), retry from #1.
- //
- // 5. Compare the snapshot to the already bootstrapped version (if
- // any) and see if we need to re-bootstrap. If so, use the
- // snapshot as a starting point. Rename to bootstrapped at the
- // end (atomic).
- //
- dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
- dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
-
- auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
+ // Inside we have machines.
+ //
+ try
+ {
+ for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow))
{
- run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
- run_btrfs (trace, "subvolume", "delete", tp);
- };
+ const string mn (me.path ().string ());
- for (size_t retry (0);; ++retry)
- {
- if (retry != 0)
- sleep (1);
+ if (me.type () != entry_type::directory || mn[0] == '.')
+ continue;
- // Resolve the link to subvolume path.
+ const dir_path md (dir_path (vd) /= mn);
+
+ // Our endgoal here is to obtain a bootstrapped snapshot of this
+ // machine while watching out for potential race conditions (other
+ // instances as well as machines being added/upgraded/removed; see
+ // the manual for details).
+ //
+ // So here is our overall plan:
+ //
+ // 1. Resolve current subvolume link for our bootstrap protocol.
+ //
+ // 2. Lock the machine. This excludes any other instance from
+ // trying to perform the following steps.
//
- dir_path sp; // <name>-<P>.<R>
+ // 3. If there is no link, cleanup old bootstrap (if any) and
+ // ignore this machine.
+ //
+ // 4. Try to create a snapshot of current subvolume (this
+ // operation is atomic). If failed (e.g., someone changed the
+ // link and removed the subvolume in the meantime), retry from
+ // #1.
+ //
+ // 5. Compare the snapshot to the already bootstrapped version (if
+ // any) and see if we need to re-bootstrap. If so, use the
+ // snapshot as a starting point. Rename to bootstrapped at the
+ // end (atomic).
+ //
+ dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
+ dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
- try
+ auto delete_bootstrapped = [&tp, &trace] () // Delete -<toolchain>.
{
- sp = path_cast<dir_path> (readsymlink (lp));
+ run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
+ run_btrfs (trace, "subvolume", "delete", tp);
+ };
- if (sp.relative ())
- sp = md / sp;
- }
- catch (const system_error& e)
+ for (size_t retry (0);; ++retry)
{
- // Leave the subvolume path empty if the subvolume link doesn't
- // exist and fail on any other error.
- //
- if (e.code ().category () != std::generic_category () ||
- e.code ().value () != ENOENT)
- fail << "unable to read subvolume link " << lp << ": " << e;
- }
+ if (retry != 0)
+ sleep (1);
- // Try to lock the machine.
- //
- machine_lock ml (lock_machine (tl, tp));
+ // Resolve the link to subvolume path.
+ //
+ dir_path sp; // <name>-<P>.<R>
- if (!ml.locked ())
- {
- machine_manifest mm;
- if (ml.prio)
+ try
{
- // Get the machine manifest (subset of the steps performed for
- // the locked case below).
- //
- // Note that it's possible the machine we get is not what was
- // originally locked by the other process (e.g., it has been
- // upgraded since). It's also possible that if and when we
- // interrupt and lock this machine, it will be a different
- // machine (e.g., it has been upgraded since we read this
- // machine manifest). To deal with all of that we will be
- // reloading this information if/when we acquire the lock to
- // this machine.
- //
- if (sp.empty ())
- {
- l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
- break;
- }
-
- l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
- << " with priority " << *ml.prio;});
+ sp = path_cast<dir_path> (readsymlink (lp));
- mm = parse_manifest<machine_manifest> (
- sp / "manifest", "machine");
-
- none = none && mm.effective_role () == machine_role::auxiliary;
+ if (sp.relative ())
+ sp = md / sp;
}
- else // Bootstrapping/suspended.
+ catch (const system_error& e)
{
- l3 ([&]{trace << "keeping " << md << ": being bootstrapped "
- << "or suspened by " << ml.pid;});
-
- // Assume it is a build machine (we cannot determine whether
- // it is build or auxiliary without loading its manifest).
+ // Leave the subvolume path empty if the subvolume link
+ // doesn't exist and fail on any other error.
//
- none = false;
+ if (e.code ().category () != std::generic_category () ||
+ e.code ().value () != ENOENT)
+ fail << "unable to read subvolume link " << lp << ": " << e;
}
- // Add the machine to the lists and bail out.
+ // Try to lock the machine.
//
- r.push_back (bootstrapped_machine {
- move (ml),
- move (tp),
- bootstrapped_machine_manifest {move (mm), {}, {}}});
+ machine_lock ml (lock_machine (tl, tp));
- break;
- }
+ if (!ml.locked ())
+ {
+ machine_manifest mm;
+ if (ml.prio)
+ {
+ // Get the machine manifest (subset of the steps performed
+ // for the locked case below).
+ //
+ // Note that it's possible the machine we get is not what
+ // was originally locked by the other process (e.g., it has
+ // been upgraded since). It's also possible that if and when
+ // we interrupt and lock this machine, it will be a
+ // different machine (e.g., it has been upgraded since we
+ // read this machine manifest). To deal with all of that we
+ // will be reloading this information if/when we acquire the
+ // lock to this machine.
+ //
+ if (sp.empty ())
+ {
+ l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+ break;
+ }
- bool te (dir_exists (tp));
+ l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid
+ << " with priority " << *ml.prio;});
- // If the resolution fails, then this means there is no current
- // machine subvolume (for this bootstrap protocol). In this case
- // we clean up our toolchain subvolume (-<toolchain>, if any) and
- // ignore this machine.
- //
- if (sp.empty ())
- {
- if (te)
- delete_bootstrapped ();
+ mm = parse_manifest<machine_manifest> (
+ sp / "manifest", "machine");
- l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
- break;
- }
+ none =
+ none && mm.effective_role () == machine_role::auxiliary;
+ }
+ else // Bootstrapping/suspended.
+ {
+ l3 ([&]{trace << "keeping " << md << ": being bootstrapped "
+ << "or suspened by " << ml.pid;});
- // <name>-<toolchain>-<xxx>
- //
- dir_path xp (snapshot_path (tp));
+ // Assume it is a build machine (we cannot determine whether
+ // it is build or auxiliary without loading its manifest).
+ //
+ none = false;
+ }
- if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
- {
- if (retry >= 10)
- fail << "unable to snapshot subvolume " << sp;
+ // Add the machine to the lists and bail out.
+ //
+ r.push_back (bootstrapped_machine {
+ move (ml),
+ move (tp),
+ bootstrapped_machine_manifest {move (mm), {}, {}}});
- continue;
- }
+ break;
+ }
- // Load the (original) machine manifest.
- //
- machine_manifest mm (
- parse_manifest<machine_manifest> (sp / "manifest", "machine"));
+ bool te (dir_exists (tp));
- bool aux (mm.effective_role () == machine_role::auxiliary);
+ // If the resolution fails, then this means there is no current
+ // machine subvolume (for this bootstrap protocol). In this case
+ // we clean up our toolchain subvolume (-<toolchain>, if any)
+ // and ignore this machine.
+ //
+ if (sp.empty ())
+ {
+ if (te)
+ delete_bootstrapped ();
- // Skip machines for which we don't have sufficient RAM.
- //
- if (effective_ram_minimum (mm) >
- (aux ? ops.auxiliary_ram () : ops.build_ram ()))
- {
- l3 ([&]{trace << "skipping " << md << ": insufficient RAM";});
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
- }
+ l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
+ break;
+ }
- none = none && aux;
+ // <name>-<toolchain>-<xxx>
+ //
+ dir_path xp (snapshot_path (tp));
- // If we already have <name>-<toolchain>, see if it needs to be
- // re-bootstrapped. Things that render it obsolete:
- //
- // 1. New machine revision (compare machine ids).
- // 2. New toolchain (compare toolchain ids, not auxiliary).
- // 3. New bbot/libbbot (compare versions, not auxiliary).
- //
- // The last case has a complication: what should we do if we have
- // bootstrapped a newer version of bbot? This would mean that we
- // are about to be stopped and upgraded (and the upgraded version
- // will probably be able to use the result). So we simply ignore
- // this machine for this run.
- //
- // Note: see similar code in the machine interruption logic.
- //
- optional<bootstrapped_machine_manifest> bmm;
- if (te)
- {
- bmm = parse_manifest<bootstrapped_machine_manifest> (
- tp / "manifest", "bootstrapped machine");
+ if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
+ {
+ if (retry >= 10)
+ fail << "unable to snapshot subvolume " << sp;
- if (bmm->machine.id != mm.id)
+ continue;
+ }
+
+ // Load the (original) machine manifest.
+ //
+ machine_manifest mm (
+ parse_manifest<machine_manifest> (sp / "manifest", "machine"));
+
+ bool aux (mm.effective_role () == machine_role::auxiliary);
+
+ // Skip machines for which we don't have sufficient RAM.
+ //
+ if (effective_ram_minimum (mm) >
+ (aux ? ops.auxiliary_ram () : ops.build_ram ()))
{
- l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";});
- te = false;
+ l3 ([&]{trace << "skipping " << md << ": insufficient RAM";});
+ run_btrfs (trace, "subvolume", "delete", xp);
+ break;
}
- if (!aux)
+ none = none && aux;
+
+ // If we already have <name>-<toolchain>, see if it needs to be
+ // re-bootstrapped. Things that render it obsolete:
+ //
+ // 1. New machine revision (compare machine ids).
+ // 2. New toolchain (compare toolchain ids, not auxiliary).
+ // 3. New bbot/libbbot (compare versions, not auxiliary).
+ //
+ // The last case has a complication: what should we do if we
+ // have bootstrapped a newer version of bbot? This would mean
+ // that we are about to be stopped and upgraded (and the
+ // upgraded version will probably be able to use the result). So
+ // we simply ignore this machine for this run.
+ //
+ // Note: see similar code in the machine interruption logic.
+ //
+ optional<bootstrapped_machine_manifest> bmm;
+ if (te)
{
- if (!tc_id.empty () && bmm->toolchain.id != tc_id)
+ bmm = parse_manifest<bootstrapped_machine_manifest> (
+ tp / "manifest", "bootstrapped machine");
+
+ if (bmm->machine.id != mm.id)
{
- l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";});
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";});
te = false;
}
- if (int i = compare_bbot (bmm->bootstrap))
+ if (!aux)
{
- if (i < 0)
+ if (!tc_id.empty () && bmm->toolchain.id != tc_id)
{
- l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";});
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";});
te = false;
}
- else
+
+ if (int i = compare_bbot (bmm->bootstrap))
{
- l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
+ if (i < 0)
+ {
+ l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";});
+ te = false;
+ }
+ else
+ {
+ l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
+ run_btrfs (trace, "subvolume", "delete", xp);
+ break;
+ }
}
}
+
+ if (!te)
+ delete_bootstrapped ();
}
+ else
+ l3 ([&]{trace << "bootstrap " << tp;});
if (!te)
- delete_bootstrapped ();
- }
- else
- l3 ([&]{trace << "bootstrap " << tp;});
-
- if (!te)
- {
- // Ignore any other machines that need bootstrapping.
- //
- if (!pboot)
{
- pboot = pending_bootstrap {
- move (ml), move (tp), move (xp), move (mm), move (bmm)};
+ // Ignore any other machines that need bootstrapping.
+ //
+ if (!pboot)
+ {
+ pboot = pending_bootstrap {
+ move (ml), move (tp), move (xp), move (mm), move (bmm)};
+ }
+ else
+ run_btrfs (trace, "subvolume", "delete", xp);
+
+ break;
}
else
run_btrfs (trace, "subvolume", "delete", xp);
+ // Add the machine to the lists.
+ //
+ r.push_back (
+ bootstrapped_machine {move (ml), move (tp), move (*bmm)});
+
break;
- }
- else
- run_btrfs (trace, "subvolume", "delete", xp);
+ } // Retry loop.
+ } // Inner dir_iterator loop.
+ }
+ catch (const system_error& e)
+ {
+ fail << "unable to iterate over " << vd << ": " << e;
+ }
+ } // Outer dir_iterator loop.
+ }
+ catch (const system_error& e)
+ {
+ // Once in a while we get ENOENT while iterating over the machines
+ // directory. This directory contains the machine directories (not
+ // subvolumes) and is not being changed when we get this error. Maybe
+ // this is due to directory sizes/timestamps changes, but then we would
+ // expect to get this error a lot more often..? So this feels like a
+ // btrfs bug which we are going to retry a few times. See GH issue #349
+ // for additional information.
+ //
+ bool retry (dir_iter_retry++ != 2);
- // Add the machine to the lists.
- //
- r.push_back (
- bootstrapped_machine {move (ml), move (tp), move (*bmm)});
+ (retry ? warn : error) << "unable to iterate over " << machines
+ << ": " << e;
+ if (retry)
+ continue; // Re-enumerate from scratch.
+ else
+ throw failed ();
+ }
- break;
- } // Retry loop.
- } // Inner dir_iterator loop.
- }
- catch (const system_error& e)
- {
- fail << "unable to iterate over " << vd << ": " << e;
- }
- } // Outer dir_iterator loop.
+ dir_iter_retry = 0; // Reset for re-enumeration due to other reasons.
// See if there is a pending bootstrap and whether we can perform it.
//
@@ -1533,10 +1561,6 @@ try
// Unreachable.
}
-catch (const system_error& e)
-{
- fail << "unable to iterate over " << machines << ": " << e << endf;
-}
// Perform the build task throwing interrupt if it has been interrupted.
//