From 704e151599a65173533c37723048680d23ad569b Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 8 May 2024 07:18:02 +0200 Subject: Retry machines directory iteration errors in agent (GH issue #349) --- bbot/agent/agent.cxx | 458 +++++++++++++++++++++++++++------------------------ 1 file changed, 241 insertions(+), 217 deletions(-) (limited to 'bbot/agent/agent.cxx') diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 75f7228..5df470f 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -1073,10 +1073,11 @@ using bootstrapped_machines = vector; static pair enumerate_machines (const dir_path& machines) -try { tracer trace ("enumerate_machines", machines.string ().c_str ()); + size_t dir_iter_retry (0); // Directory iteration retry (see below). + for (;;) // From-scratch retry loop for after bootstrap (see below). { pair pr; @@ -1149,280 +1150,307 @@ try // The first level are machine volumes. // - for (const dir_entry& ve: dir_iterator (machines, dir_iterator::no_follow)) + try { - const string vn (ve.path ().string ()); - - // Ignore hidden directories. - // - if (ve.type () != entry_type::directory || vn[0] == '.') - continue; - - const dir_path vd (dir_path (machines) /= vn); - - // Inside we have machines. - // - try + for (const dir_entry& ve: + dir_iterator (machines, dir_iterator::no_follow)) { - for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow)) - { - const string mn (me.path ().string ()); + const string vn (ve.path ().string ()); - if (me.type () != entry_type::directory || mn[0] == '.') - continue; + // Ignore hidden directories. + // + if (ve.type () != entry_type::directory || vn[0] == '.') + continue; - const dir_path md (dir_path (vd) /= mn); + const dir_path vd (dir_path (machines) /= vn); - // Our endgoal here is to obtain a bootstrapped snapshot of this - // machine while watching out for potential race conditions (other - // instances as well as machines being added/upgraded/removed; see - // the manual for details). - // - // So here is our overall plan: - // - // 1. Resolve current subvolume link for our bootstrap protocol. - // - // 2. Lock the machine. This excludes any other instance from trying - // to perform the following steps. - // - // 3. If there is no link, cleanup old bootstrap (if any) and ignore - // this machine. - // - // 4. Try to create a snapshot of current subvolume (this operation - // is atomic). If failed (e.g., someone changed the link and - // removed the subvolume in the meantime), retry from #1. - // - // 5. Compare the snapshot to the already bootstrapped version (if - // any) and see if we need to re-bootstrap. If so, use the - // snapshot as a starting point. Rename to bootstrapped at the - // end (atomic). - // - dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -

- dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // - - - auto delete_bootstrapped = [&tp, &trace] () // Delete -. + // Inside we have machines. + // + try + { + for (const dir_entry& me: dir_iterator (vd, dir_iterator::no_follow)) { - run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false"); - run_btrfs (trace, "subvolume", "delete", tp); - }; + const string mn (me.path ().string ()); - for (size_t retry (0);; ++retry) - { - if (retry != 0) - sleep (1); + if (me.type () != entry_type::directory || mn[0] == '.') + continue; - // Resolve the link to subvolume path. + const dir_path md (dir_path (vd) /= mn); + + // Our endgoal here is to obtain a bootstrapped snapshot of this + // machine while watching out for potential race conditions (other + // instances as well as machines being added/upgraded/removed; see + // the manual for details). + // + // So here is our overall plan: + // + // 1. Resolve current subvolume link for our bootstrap protocol. + // + // 2. Lock the machine. This excludes any other instance from + // trying to perform the following steps. // - dir_path sp; // -

. + // 3. If there is no link, cleanup old bootstrap (if any) and + // ignore this machine. + // + // 4. Try to create a snapshot of current subvolume (this + // operation is atomic). If failed (e.g., someone changed the + // link and removed the subvolume in the meantime), retry from + // #1. + // + // 5. Compare the snapshot to the already bootstrapped version (if + // any) and see if we need to re-bootstrap. If so, use the + // snapshot as a starting point. Rename to bootstrapped at the + // end (atomic). + // + dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -

+ dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // - - try + auto delete_bootstrapped = [&tp, &trace] () // Delete -. { - sp = path_cast (readsymlink (lp)); + run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false"); + run_btrfs (trace, "subvolume", "delete", tp); + }; - if (sp.relative ()) - sp = md / sp; - } - catch (const system_error& e) + for (size_t retry (0);; ++retry) { - // Leave the subvolume path empty if the subvolume link doesn't - // exist and fail on any other error. - // - if (e.code ().category () != std::generic_category () || - e.code ().value () != ENOENT) - fail << "unable to read subvolume link " << lp << ": " << e; - } + if (retry != 0) + sleep (1); - // Try to lock the machine. - // - machine_lock ml (lock_machine (tl, tp)); + // Resolve the link to subvolume path. + // + dir_path sp; // -

. - if (!ml.locked ()) - { - machine_manifest mm; - if (ml.prio) + try { - // Get the machine manifest (subset of the steps performed for - // the locked case below). - // - // Note that it's possible the machine we get is not what was - // originally locked by the other process (e.g., it has been - // upgraded since). It's also possible that if and when we - // interrupt and lock this machine, it will be a different - // machine (e.g., it has been upgraded since we read this - // machine manifest). To deal with all of that we will be - // reloading this information if/when we acquire the lock to - // this machine. - // - if (sp.empty ()) - { - l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); - break; - } - - l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid - << " with priority " << *ml.prio;}); + sp = path_cast (readsymlink (lp)); - mm = parse_manifest ( - sp / "manifest", "machine"); - - none = none && mm.effective_role () == machine_role::auxiliary; + if (sp.relative ()) + sp = md / sp; } - else // Bootstrapping/suspended. + catch (const system_error& e) { - l3 ([&]{trace << "keeping " << md << ": being bootstrapped " - << "or suspened by " << ml.pid;}); - - // Assume it is a build machine (we cannot determine whether - // it is build or auxiliary without loading its manifest). + // Leave the subvolume path empty if the subvolume link + // doesn't exist and fail on any other error. // - none = false; + if (e.code ().category () != std::generic_category () || + e.code ().value () != ENOENT) + fail << "unable to read subvolume link " << lp << ": " << e; } - // Add the machine to the lists and bail out. + // Try to lock the machine. // - r.push_back (bootstrapped_machine { - move (ml), - move (tp), - bootstrapped_machine_manifest {move (mm), {}, {}}}); + machine_lock ml (lock_machine (tl, tp)); - break; - } + if (!ml.locked ()) + { + machine_manifest mm; + if (ml.prio) + { + // Get the machine manifest (subset of the steps performed + // for the locked case below). + // + // Note that it's possible the machine we get is not what + // was originally locked by the other process (e.g., it has + // been upgraded since). It's also possible that if and when + // we interrupt and lock this machine, it will be a + // different machine (e.g., it has been upgraded since we + // read this machine manifest). To deal with all of that we + // will be reloading this information if/when we acquire the + // lock to this machine. + // + if (sp.empty ()) + { + l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); + break; + } - bool te (dir_exists (tp)); + l3 ([&]{trace << "keeping " << md << ": locked by " << ml.pid + << " with priority " << *ml.prio;}); - // If the resolution fails, then this means there is no current - // machine subvolume (for this bootstrap protocol). In this case - // we clean up our toolchain subvolume (-, if any) and - // ignore this machine. - // - if (sp.empty ()) - { - if (te) - delete_bootstrapped (); + mm = parse_manifest ( + sp / "manifest", "machine"); - l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); - break; - } + none = + none && mm.effective_role () == machine_role::auxiliary; + } + else // Bootstrapping/suspended. + { + l3 ([&]{trace << "keeping " << md << ": being bootstrapped " + << "or suspened by " << ml.pid;}); - // -- - // - dir_path xp (snapshot_path (tp)); + // Assume it is a build machine (we cannot determine whether + // it is build or auxiliary without loading its manifest). + // + none = false; + } - if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0) - { - if (retry >= 10) - fail << "unable to snapshot subvolume " << sp; + // Add the machine to the lists and bail out. + // + r.push_back (bootstrapped_machine { + move (ml), + move (tp), + bootstrapped_machine_manifest {move (mm), {}, {}}}); - continue; - } + break; + } - // Load the (original) machine manifest. - // - machine_manifest mm ( - parse_manifest (sp / "manifest", "machine")); + bool te (dir_exists (tp)); - bool aux (mm.effective_role () == machine_role::auxiliary); + // If the resolution fails, then this means there is no current + // machine subvolume (for this bootstrap protocol). In this case + // we clean up our toolchain subvolume (-, if any) + // and ignore this machine. + // + if (sp.empty ()) + { + if (te) + delete_bootstrapped (); - // Skip machines for which we don't have sufficient RAM. - // - if (effective_ram_minimum (mm) > - (aux ? ops.auxiliary_ram () : ops.build_ram ())) - { - l3 ([&]{trace << "skipping " << md << ": insufficient RAM";}); - run_btrfs (trace, "subvolume", "delete", xp); - break; - } + l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); + break; + } - none = none && aux; + // -- + // + dir_path xp (snapshot_path (tp)); - // If we already have -, see if it needs to be - // re-bootstrapped. Things that render it obsolete: - // - // 1. New machine revision (compare machine ids). - // 2. New toolchain (compare toolchain ids, not auxiliary). - // 3. New bbot/libbbot (compare versions, not auxiliary). - // - // The last case has a complication: what should we do if we have - // bootstrapped a newer version of bbot? This would mean that we - // are about to be stopped and upgraded (and the upgraded version - // will probably be able to use the result). So we simply ignore - // this machine for this run. - // - // Note: see similar code in the machine interruption logic. - // - optional bmm; - if (te) - { - bmm = parse_manifest ( - tp / "manifest", "bootstrapped machine"); + if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0) + { + if (retry >= 10) + fail << "unable to snapshot subvolume " << sp; - if (bmm->machine.id != mm.id) + continue; + } + + // Load the (original) machine manifest. + // + machine_manifest mm ( + parse_manifest (sp / "manifest", "machine")); + + bool aux (mm.effective_role () == machine_role::auxiliary); + + // Skip machines for which we don't have sufficient RAM. + // + if (effective_ram_minimum (mm) > + (aux ? ops.auxiliary_ram () : ops.build_ram ())) { - l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";}); - te = false; + l3 ([&]{trace << "skipping " << md << ": insufficient RAM";}); + run_btrfs (trace, "subvolume", "delete", xp); + break; } - if (!aux) + none = none && aux; + + // If we already have -, see if it needs to be + // re-bootstrapped. Things that render it obsolete: + // + // 1. New machine revision (compare machine ids). + // 2. New toolchain (compare toolchain ids, not auxiliary). + // 3. New bbot/libbbot (compare versions, not auxiliary). + // + // The last case has a complication: what should we do if we + // have bootstrapped a newer version of bbot? This would mean + // that we are about to be stopped and upgraded (and the + // upgraded version will probably be able to use the result). So + // we simply ignore this machine for this run. + // + // Note: see similar code in the machine interruption logic. + // + optional bmm; + if (te) { - if (!tc_id.empty () && bmm->toolchain.id != tc_id) + bmm = parse_manifest ( + tp / "manifest", "bootstrapped machine"); + + if (bmm->machine.id != mm.id) { - l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";}); + l3 ([&]{trace << "re-bootstrap " << tp << ": new machine";}); te = false; } - if (int i = compare_bbot (bmm->bootstrap)) + if (!aux) { - if (i < 0) + if (!tc_id.empty () && bmm->toolchain.id != tc_id) { - l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";}); + l3 ([&]{trace << "re-bootstrap " << tp << ": new toolchain";}); te = false; } - else + + if (int i = compare_bbot (bmm->bootstrap)) { - l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); - run_btrfs (trace, "subvolume", "delete", xp); - break; + if (i < 0) + { + l3 ([&]{trace << "re-bootstrap " << tp << ": new bbot";}); + te = false; + } + else + { + l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); + run_btrfs (trace, "subvolume", "delete", xp); + break; + } } } + + if (!te) + delete_bootstrapped (); } + else + l3 ([&]{trace << "bootstrap " << tp;}); if (!te) - delete_bootstrapped (); - } - else - l3 ([&]{trace << "bootstrap " << tp;}); - - if (!te) - { - // Ignore any other machines that need bootstrapping. - // - if (!pboot) { - pboot = pending_bootstrap { - move (ml), move (tp), move (xp), move (mm), move (bmm)}; + // Ignore any other machines that need bootstrapping. + // + if (!pboot) + { + pboot = pending_bootstrap { + move (ml), move (tp), move (xp), move (mm), move (bmm)}; + } + else + run_btrfs (trace, "subvolume", "delete", xp); + + break; } else run_btrfs (trace, "subvolume", "delete", xp); + // Add the machine to the lists. + // + r.push_back ( + bootstrapped_machine {move (ml), move (tp), move (*bmm)}); + break; - } - else - run_btrfs (trace, "subvolume", "delete", xp); + } // Retry loop. + } // Inner dir_iterator loop. + } + catch (const system_error& e) + { + fail << "unable to iterate over " << vd << ": " << e; + } + } // Outer dir_iterator loop. + } + catch (const system_error& e) + { + // Once in a while we get ENOENT while iterating over the machines + // directory. This directory contains the machine directories (not + // subvolumes) and is not being changed when we get this error. Maybe + // this is due to directory sizes/timestamps changes, but then we would + // expect to get this error a lot more often..? So this feels like a + // btrfs bug which we are going to retry a few times. See GH issue #349 + // for additional information. + // + bool retry (dir_iter_retry++ != 2); - // Add the machine to the lists. - // - r.push_back ( - bootstrapped_machine {move (ml), move (tp), move (*bmm)}); + (retry ? warn : error) << "unable to iterate over " << machines + << ": " << e; + if (retry) + continue; // Re-enumerate from scratch. + else + throw failed (); + } - break; - } // Retry loop. - } // Inner dir_iterator loop. - } - catch (const system_error& e) - { - fail << "unable to iterate over " << vd << ": " << e; - } - } // Outer dir_iterator loop. + dir_iter_retry = 0; // Reset for re-enumeration due to other reasons. // See if there is a pending bootstrap and whether we can perform it. // @@ -1533,10 +1561,6 @@ try // Unreachable. } -catch (const system_error& e) -{ - fail << "unable to iterate over " << machines << ": " << e << endf; -} // Perform the build task throwing interrupt if it has been interrupted. // -- cgit v1.1