From 4c7bb136ac8c1c1cd47942ad7fe8257b20997871 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 12 Apr 2017 15:27:40 +0200 Subject: Complete agent side of machine bootstrap --- bbot/agent.cxx | 128 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 31 deletions(-) (limited to 'bbot/agent.cxx') diff --git a/bbot/agent.cxx b/bbot/agent.cxx index 4d5cc4a..634a94d 100644 --- a/bbot/agent.cxx +++ b/bbot/agent.cxx @@ -101,7 +101,11 @@ btrfs_exit (tracer& t, A&&... a) : run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward (a)...); } -static bootstrapped_machine_manifest +// Bootstrap the machine. Return the bootstrapped machine manifest if +// successful and nullopt otherwise (in which case the machine directory +// should be cleaned and the machine ignored for now). +// +static optional bootstrap_machine (const dir_path& md, const machine_manifest& mm, optional obmm) @@ -128,7 +132,7 @@ bootstrap_machine (const dir_path& md, else try { - string br ("br1"); // Use private bridge for now. + string br ("br1"); // Using private bridge for now. // Start the TFTP server (server chroot is /build/tftp). Map: // @@ -138,6 +142,11 @@ bootstrap_machine (const dir_path& md, auto_rmdir arm (dir_path ("/build/tftp/bootstrap/" + tc_name)); try_mkdir_p (arm.path ()); + // Bootstrap result manifest. + // + path mf (arm.path () / "manifest"); + try_rmfile (mf); + tftp_server tftpd ("Gr ^/?(.+)$ /toolchain/" + tc_name + "/\\1\n" + "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n"); @@ -152,32 +161,85 @@ bootstrap_machine (const dir_path& md, br, tftpd.port ())); - r.machine.mac = m->mac; + { + // If we are terminating with an exception then force the machine down. + // Failed that, the machine's destructor will block waiting for its + // completion. + // + auto mg ( + make_exception_guard ( + [&m, &md] () + { + info << "trying to force machine " << md << " down"; + try {m->forcedown ();} catch (const failed&) {} + })); + + // What happens if the bootstrap process hangs? The simple thing would + // be to force the machine down after some timeout and then fail. But + // that won't be very helpful for investigating the cause. So instead + // the plan is to suspend it after some timeout, issue diagnostics + // (without failing and which Build OS monitor will relay to the admin), + // and wait for the external intervention. + // + auto soft_fail = [&md, &m] (const char* msg) + { + { + diag_record dr (error); + dr << msg << " for machine " << md << ", suspending"; + m->print_info (dr); + } + m->suspend (); + m->wait (); + return nullopt; + }; + + // The first request should be the toolchain download. Wait for up to 60 + // seconds for that to arrive. In a sense we use it as an indication + // that the machine has booted and the bootstrap process has started. + // + size_t to; + const size_t startup_to (60); + const size_t bootstrap_to (ops.bootstrap_timeout ()); + const size_t shutdown_to (60); + + if (!tftpd.serve ((to = startup_to))) + return soft_fail ("bootstrap startup timeout"); + + l2 ([&]{trace << "completed startup in " << startup_to - to << "s";}); + + // Next the bootstrap process may download additional toolchain + // archives, build things, and then upload the result manifest. So on + // our side we serve TFTP requests while periodically checking for the + // manifest file. + // + for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ; + + if (to == 0) + return soft_fail ("bootstrap timeout"); + + l2 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";}); + + // Shut the machine down cleanly. + // + if (!m->shutdown ((to = shutdown_to))) + return soft_fail ("bootstrap shutdown timeout"); + + l2 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); + } - // The first request should be the toolchain download. Wait for up to 60 - // seconds for that to arrive. In a sense we use it as an indication that - // the machine has booted and the bootstrap process has started. + // Parse the result manifest. // - size_t timeout (60); - if (tftpd.serve (timeout)) + try { - l2 ([&]{trace << "received first request in " << 60 - timeout << "s";}); + r.bootstrap = parse_manifest (mf, "bootstrap"); } - else + catch (const failed&) { - // @@ What should be do here? Non-fatal? Mark the machine as failed? - // - error << "bootstrap timeout during first request for machine " << md; - m->forcedown (); - throw failed (); + error << "invalid bootstrap manifest for machine " << md; + return nullopt; } - if (!m->shutdown ()) - { - error << "forcing machine " << md << " down"; - m->forcedown (); - throw failed (); - } + r.machine.mac = m->mac; // Save the MAC address. } catch (const system_error& e) { @@ -352,25 +414,25 @@ try (r = cmp ("libbutl", LIBBUTL_VERSION)) != 0 ? r : 0; }; - optional obmm; + optional bmm; if (te) { - obmm = parse_manifest ( + bmm = parse_manifest ( tp / "manifest", "bootstrapped machine"); - if (obmm->machine.id != mm.id) + if (bmm->machine.id != mm.id) { l2 ([&]{trace << "re-bootstrapping " << tp << ": new machine";}); te = false; } - if (obmm->toolchain.id != tc_id) + if (bmm->toolchain.id != tc_id) { l2 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";}); te = false; } - if (int i = compare_bbot (obmm->bootstrap)) + if (int i = compare_bbot (bmm->bootstrap)) { if (i < 0) { @@ -397,8 +459,14 @@ try // bootstrap the new machine. Then atomically rename it to // -. // - bootstrapped_machine_manifest bmm ( - bootstrap_machine (xp, mm, move (obmm))); + bmm = bootstrap_machine (xp, mm, move (bmm)); + + if (!bmm) + { + l2 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";}); + btrfs (trace, "subvolume", "delete", xp); + break; + } try { @@ -409,12 +477,10 @@ try fail << "unable to rename " << xp << " to " << tp; } - te = true; - // Check the boostrapped bbot version as above and ignore this // machine if it's newer than us. // - if (int i = compare_bbot (bmm.bootstrap)) + if (int i = compare_bbot (bmm->bootstrap)) { assert (i > 0); l2 ([&]{trace << "ignoring " << tp << ": old bbot";}); -- cgit v1.1