diff options
-rw-r--r-- | bbot/agent.cli | 14 | ||||
-rw-r--r-- | bbot/agent.cxx | 128 | ||||
-rw-r--r-- | bbot/bbot-agent@.service | 11 | ||||
-rw-r--r-- | bbot/buildfile | 2 | ||||
-rw-r--r-- | bbot/machine | 38 | ||||
-rw-r--r-- | bbot/machine.cxx | 45 | ||||
-rw-r--r-- | bbot/tftp | 6 | ||||
-rw-r--r-- | bbot/tftp.cxx | 5 | ||||
-rw-r--r-- | tests/agent/testscript | 3 |
9 files changed, 198 insertions, 54 deletions
diff --git a/bbot/agent.cli b/bbot/agent.cli index a10889e..562860f 100644 --- a/bbot/agent.cli +++ b/bbot/agent.cli @@ -64,6 +64,20 @@ namespace bbot the default." } + size_t --bootstrap-timeout = 600 + { + "<sec>", + "Maximum number of seconds to wait for machine bootstrap completion, + 600 (10 minutes) by default." + } + + size_t --build-timeout = 1800 + { + "<sec>", + "Maximum number of seconds to wait for build completion, 1800 (30 + minutes) by default." + } + uint16_t --verbose = 1 { "<level>", diff --git a/bbot/agent.cxx b/bbot/agent.cxx index 4d5cc4a..634a94d 100644 --- a/bbot/agent.cxx +++ b/bbot/agent.cxx @@ -101,7 +101,11 @@ btrfs_exit (tracer& t, A&&... a) : run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...); } -static bootstrapped_machine_manifest +// Bootstrap the machine. Return the bootstrapped machine manifest if +// successful and nullopt otherwise (in which case the machine directory +// should be cleaned and the machine ignored for now). +// +static optional<bootstrapped_machine_manifest> bootstrap_machine (const dir_path& md, const machine_manifest& mm, optional<bootstrapped_machine_manifest> obmm) @@ -128,7 +132,7 @@ bootstrap_machine (const dir_path& md, else try { - string br ("br1"); // Use private bridge for now. + string br ("br1"); // Using private bridge for now. // Start the TFTP server (server chroot is /build/tftp). Map: // @@ -138,6 +142,11 @@ bootstrap_machine (const dir_path& md, auto_rmdir arm (dir_path ("/build/tftp/bootstrap/" + tc_name)); try_mkdir_p (arm.path ()); + // Bootstrap result manifest. + // + path mf (arm.path () / "manifest"); + try_rmfile (mf); + tftp_server tftpd ("Gr ^/?(.+)$ /toolchain/" + tc_name + "/\\1\n" + "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n"); @@ -152,32 +161,85 @@ bootstrap_machine (const dir_path& md, br, tftpd.port ())); - r.machine.mac = m->mac; + { + // If we are terminating with an exception then force the machine down. 
+ // Failing that, the machine's destructor will block waiting for its + // completion. + // + auto mg ( + make_exception_guard ( + [&m, &md] () + { + info << "trying to force machine " << md << " down"; + try {m->forcedown ();} catch (const failed&) {} + })); + + // What happens if the bootstrap process hangs? The simple thing would + // be to force the machine down after some timeout and then fail. But + // that won't be very helpful for investigating the cause. So instead + // the plan is to suspend it after some timeout, issue diagnostics + // (without failing and which Build OS monitor will relay to the admin), + // and wait for the external intervention. + // + auto soft_fail = [&md, &m] (const char* msg) + { + { + diag_record dr (error); + dr << msg << " for machine " << md << ", suspending"; + m->print_info (dr); + } + m->suspend (); + m->wait (); + return nullopt; + }; + + // The first request should be the toolchain download. Wait for up to 60 + // seconds for that to arrive. In a sense we use it as an indication + // that the machine has booted and the bootstrap process has started. + // + size_t to; + const size_t startup_to (60); + const size_t bootstrap_to (ops.bootstrap_timeout ()); + const size_t shutdown_to (60); + + if (!tftpd.serve ((to = startup_to))) + return soft_fail ("bootstrap startup timeout"); + + l2 ([&]{trace << "completed startup in " << startup_to - to << "s";}); + + // Next the bootstrap process may download additional toolchain + // archives, build things, and then upload the result manifest. So on + // our side we serve TFTP requests while periodically checking for the + // manifest file. + // + for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ; + + if (to == 0) + return soft_fail ("bootstrap timeout"); + + l2 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";}); + + // Shut the machine down cleanly. 
+ // + if (!m->shutdown ((to = shutdown_to))) + return soft_fail ("bootstrap shutdown timeout"); + + l2 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); + } - // The first request should be the toolchain download. Wait for up to 60 - // seconds for that to arrive. In a sense we use it as an indication that - // the machine has booted and the bootstrap process has started. + // Parse the result manifest. // - size_t timeout (60); - if (tftpd.serve (timeout)) + try { - l2 ([&]{trace << "received first request in " << 60 - timeout << "s";}); + r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap"); } - else + catch (const failed&) { - // @@ What should be do here? Non-fatal? Mark the machine as failed? - // - error << "bootstrap timeout during first request for machine " << md; - m->forcedown (); - throw failed (); + error << "invalid bootstrap manifest for machine " << md; + return nullopt; } - if (!m->shutdown ()) - { - error << "forcing machine " << md << " down"; - m->forcedown (); - throw failed (); - } + r.machine.mac = m->mac; // Save the MAC address. } catch (const system_error& e) { @@ -352,25 +414,25 @@ try (r = cmp ("libbutl", LIBBUTL_VERSION)) != 0 ? r : 0; }; - optional<bootstrapped_machine_manifest> obmm; + optional<bootstrapped_machine_manifest> bmm; if (te) { - obmm = parse_manifest<bootstrapped_machine_manifest> ( + bmm = parse_manifest<bootstrapped_machine_manifest> ( tp / "manifest", "bootstrapped machine"); - if (obmm->machine.id != mm.id) + if (bmm->machine.id != mm.id) { l2 ([&]{trace << "re-bootstrapping " << tp << ": new machine";}); te = false; } - if (obmm->toolchain.id != tc_id) + if (bmm->toolchain.id != tc_id) { l2 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";}); te = false; } - if (int i = compare_bbot (obmm->bootstrap)) + if (int i = compare_bbot (bmm->bootstrap)) { if (i < 0) { @@ -397,8 +459,14 @@ try // bootstrap the new machine. Then atomically rename it to // <name>-<toolchain>. 
// - bootstrapped_machine_manifest bmm ( - bootstrap_machine (xp, mm, move (obmm))); + bmm = bootstrap_machine (xp, mm, move (bmm)); + + if (!bmm) + { + l2 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";}); + btrfs (trace, "subvolume", "delete", xp); + break; + } try { @@ -409,12 +477,10 @@ try fail << "unable to rename " << xp << " to " << tp; } - te = true; - // Check the bootstrapped bbot version as above and ignore this // machine if it's newer than us. // - if (int i = compare_bbot (bmm.bootstrap)) + if (int i = compare_bbot (bmm->bootstrap)) { assert (i > 0); l2 ([&]{trace << "ignoring " << tp << ": old bbot";}); diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service index f8349ff..af760b3 100644 --- a/bbot/bbot-agent@.service +++ b/bbot/bbot-agent@.service @@ -5,16 +5,23 @@ After=default.target [Service] Type=simple +Environment=VERBOSE=1 + Environment=CPU=1 Environment=RAM=1048576 -Environment=VERBOSE=1 + +Environment=BOOTSTRAP_TIMEOUT=600 +Environment=BUILD_TIMEOUT=1800 + Environment=TOOLCHAIN_ID=123abc Environment=TOOLCHAIN_NUM=1 ExecStart=/build/bbot/%i/bin/bbot-agent --systemd-daemon \ + --verbose ${VERBOSE} \ --cpu ${CPU} \ --ram ${RAM} \ - --verbose ${VERBOSE} \ + --bootstrap-timeout ${BOOTSTRAP_TIMEOUT} \ + --build-timeout ${BUILD_TIMEOUT} \ %i \ ${TOOLCHAIN_NUM} \ ${TOOLCHAIN_ID} diff --git a/bbot/buildfile b/bbot/buildfile index 5498458..311775d 100644 --- a/bbot/buildfile +++ b/bbot/buildfile @@ -67,7 +67,7 @@ if $cli.configured # Usage options. # cli.options += --suppress-undocumented --long-usage --ansi-color \ ---page-usage 'bbot::print_$name$_' --option-length 20 +--page-usage 'bbot::print_$name$_' --option-length 23 # Include generated cli files into the distribution. # diff --git a/bbot/machine b/bbot/machine index c2942ac..f99d11b 100644 --- a/bbot/machine +++ b/bbot/machine @@ -12,21 +12,51 @@ namespace bbot { // A running build machine (container, vm, etc). 
// + // Note that if the machine is destroyed while it is still running, the + // destructor will block until the machine process terminates. + // class machine { public: - // Shut the machine down cleanly. Return false if machine is still - // running, true if machine exited successfully, and throw failed - // otherwise. + // Shut the machine down cleanly waiting up to the specified number of + // seconds for completion. Update the timeout and return false if the + // machine is still running, true if the machine exited successfully, and + // throw failed otherwise. // virtual bool - shutdown () = 0; + shutdown (size_t& seconds) = 0; // Force the machine down. // virtual void forcedown () = 0; + // Suspend the machine. + // + virtual void + suspend () = 0; + + // Wait for the machine to terminate up to the specified number of + // seconds. Update the timeout and return false if the machine is still + // running, true if the machine exited successfully, and throw failed + // otherwise. + // + virtual bool + wait (size_t& seconds) = 0; + + bool + wait () + { + size_t sec (~0); // Wait indefinitely. + return wait (sec); + } + + // Print information about the machine (as info diagnostics) that can be + // useful for debugging (e.g., how to connect/login, etc). + // + virtual void + print_info (diag_record&) = 0; + public: const string mac; // MAC address (inside the machine). 
diff --git a/bbot/machine.cxx b/bbot/machine.cxx index 8cad3f9..460e802 100644 --- a/bbot/machine.cxx +++ b/bbot/machine.cxx @@ -104,15 +104,23 @@ namespace bbot uint16_t tftp_port); virtual bool - shutdown () override; + shutdown (size_t& seconds) override; virtual void forcedown () override; - private: + virtual void + suspend () override; + bool - wait (size_t seconds); + wait (size_t& seconds) override; + + using machine::wait; + virtual void + print_info (diag_record&) override; + + private: void monitor_command (const string&); @@ -123,6 +131,7 @@ namespace bbot string tap; // Tap network interface. uint16_t port; // TFTP port. + string vnc; // QEMU VNC TCP addr:port. path monitor; // QEMU monitor UNIX socket. process proc; }; @@ -140,6 +149,7 @@ namespace bbot br (br), tap (create_tap (br, port)), port (port), + vnc ("127.0.0.1:" + to_string (5900 + stoul (tc_num))), monitor ("/tmp/" + tc_name + "-monitor") { tracer trace ("kvm_machine"); @@ -185,7 +195,7 @@ namespace bbot // // VNC & monitor. // - "-vnc", "localhost:" + tc_num, // 5900 + tc_num + "-vnc", "127.0.0.1:" + tc_num, // 5900 + tc_num "-monitor", "unix:" + monitor.string () + ",server,nowait"); } @@ -208,30 +218,45 @@ namespace bbot // forcedown(). // bool kvm_machine:: - shutdown () + shutdown (size_t& seconds) { monitor_command ("system_powerdown"); - // Wait for up to 10 seconds for the machine to shutdown. + // Wait for up to the specified number of seconds for the machine to + // shutdown. // - return wait (10); + return wait (seconds); } void kvm_machine:: forcedown () { monitor_command ("system_reset"); - wait (size_t (~0)); // Wait indefinitely. 
+ wait (); + } + + void kvm_machine:: + suspend () + { + monitor_command ("stop"); + } + + void kvm_machine:: + print_info (diag_record& dr) + { + dr << info << "qemu pid: " << proc.id () + << info << "qemu vnc: " << vnc + << info << "qemu monitor: unix:" << monitor; } bool kvm_machine:: - wait (size_t sec) + wait (size_t& sec) try { tracer trace ("kvm_machine::wait"); bool t; - for (size_t i (0); !(t = proc.try_wait ()) && i != sec; ++i) + for (; !(t = proc.try_wait ()) && sec != 0; --sec) sleep (1); if (t) @@ -28,9 +28,9 @@ namespace bbot uint16_t port () const; - // Wait for a TFTP request for up to the specified number of seconds. If - // a request was served, update the timeout value and return true. Retain - // the original timeout value and return false otherwise. + // Wait for a TFTP request for up to the specified number of seconds. + // Update the timeout value as well as return true if a request was + // served and false otherwise. // bool serve (size_t& seconds); diff --git a/bbot/tftp.cxx b/bbot/tftp.cxx index 9c783c5..27d58a4 100644 --- a/bbot/tftp.cxx +++ b/bbot/tftp.cxx @@ -92,12 +92,13 @@ namespace bbot throw_system_error (errno); } else if (r == 0) // Timeout. + { + sec = 0; return false; + } if (FD_ISSET (fd, &rd)) { - text << "connection"; - // The inetd "protocol" is to pass the socket as stdin/stdout file // descriptors. // diff --git a/tests/agent/testscript b/tests/agent/testscript index 9198460..cf92ace 100644 --- a/tests/agent/testscript +++ b/tests/agent/testscript @@ -23,6 +23,7 @@ test.arguments = stage 1 cp = $src_base/btrfs-cpdir -f /build/machines.orig /build/machines rm = $src_base/btrfs-rmdir /build/machines +#\ : dump-machines : { @@ -112,6 +113,7 @@ rm = $src_base/btrfs-rmdir /build/machines } #\ + : bootstrap : { @@ -135,4 +137,3 @@ rm = $src_base/btrfs-rmdir /build/machines #-$rm } -#\ |