From 6cabe4d21b49ea149e6694c7290ff3e1fa9e08d2 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 1 May 2017 15:46:25 +0200 Subject: Retry mis-booted machines --- bbot/agent.cli | 12 ++ bbot/agent.cxx | 329 ++++++++++++++++++++++++++--------------------- bbot/bbot-agent@.service | 4 + 3 files changed, 201 insertions(+), 144 deletions(-) (limited to 'bbot') diff --git a/bbot/agent.cli b/bbot/agent.cli index 497b841..49563f8 100644 --- a/bbot/agent.cli +++ b/bbot/agent.cli @@ -106,6 +106,12 @@ namespace bbot 600 (10 minutes) by default." } + size_t --bootstrap-retries = 2 + { + "", + "Number of time to retry a mis-booted bootstrap, 2 by default." + } + size_t --build-timeout = 1800 { "", @@ -113,6 +119,12 @@ namespace bbot minutes) by default." } + size_t --build-retries = 2 + { + "", + "Number of time to retry a mis-booted build, 2 by default." + } + size_t --request-timeout = 300 { "", diff --git a/bbot/agent.cxx b/bbot/agent.cxx index 096bafe..d30799f 100644 --- a/bbot/agent.cxx +++ b/bbot/agent.cxx @@ -121,94 +121,119 @@ bootstrap_machine (const dir_path& md, path mf (arm.path () / "manifest"); try_rmfile (mf); - tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" + - "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n", - ops.tftp_port () + tc_num); - - l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); - - // Start the machine. + // Note that unlike build, here we use the same VM snapshot for retries, + // which is not ideal. // - unique_ptr m ( - start_machine (md, - mm, - obmm ? obmm->machine.mac : nullopt, - br, - tftpd.port ())); - + for (size_t retry (0);; ++retry) { - // If we are terminating with an exception then force the machine down. - // Failed that, the machine's destructor will block waiting for its - // completion. - // - auto mg ( - make_exception_guard ( - [&m, &md] () - { - info << "trying to force machine " << md << " down"; - try {m->forcedown ();} catch (const failed&) {} - })); - - // What happens if the bootstrap process hangs? The simple thing would - // be to force the machine down after some timeout and then fail. But - // that won't be very helpful for investigating the cause. So instead - // the plan is to suspend it after some timeout, issue diagnostics - // (without failing and which Build OS monitor will relay to the admin), - // and wait for the external intervention. + tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" + + "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n", + ops.tftp_port () + tc_num); + + l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); + + // Start the machine. // - auto soft_fail = [&md, &m] (const char* msg) + unique_ptr m ( + start_machine (md, + mm, + obmm ? obmm->machine.mac : nullopt, + br, + tftpd.port ())); + { + // If we are terminating with an exception then force the machine down. + // Failed that, the machine's destructor will block waiting for its + // completion. + // + auto mg ( + make_exception_guard ( + [&m, &md] () + { + info << "trying to force machine " << md << " down"; + try {m->forcedown ();} catch (const failed&) {} + })); + + // What happens if the bootstrap process hangs? The simple thing would + // be to force the machine down after some timeout and then fail. But + // that won't be very helpful for investigating the cause. So instead + // the plan is to suspend it after some timeout, issue diagnostics + // (without failing and which Build OS monitor will relay to the + // admin), and wait for the external intervention. + // + auto soft_fail = [&md, &m] (const char* msg) { - diag_record dr (error); - dr << msg << " for machine " << md << ", suspending"; - m->print_info (dr); + { + diag_record dr (error); + dr << msg << " for machine " << md << ", suspending"; + m->print_info (dr); + } + m->suspend (); + m->wait (); + return nullopt; + }; + + // The first request should be the toolchain download. Wait for up to + // 5 minutes for that to arrive. In a sense we use it as an indication + // that the machine has booted and the bootstrap process has started. + // Why wait so long you may wonder? Well, we may be using a new MAC + // address and operating systems like Windows may need to digest that. + // + size_t to; + const size_t startup_to (5 * 60); + const size_t bootstrap_to (ops.bootstrap_timeout ()); + const size_t shutdown_to (5 * 60); + + // This can mean two things: machine mis-configuration or what we + // euphemistically call a "mis-boot": the VM failed to boot for some + // unknown/random reason. Mac OS is particularly know for suffering + // from this. So the strategy is to retry it a couple of times and + // then suspend for investigation. + // + if (!tftpd.serve ((to = startup_to))) + { + if (retry > ops.bootstrap_retries ()) + return soft_fail ("bootstrap startup timeout"); + + warn << "machine " << mm.name << " appears to have " + << "mis-booted, retrying"; + + try {m->forcedown (false);} catch (const failed&) {} + continue; } - m->suspend (); - m->wait (); - return nullopt; - }; - - // The first request should be the toolchain download. Wait for up to 5 - // minutes for that to arrive. In a sense we use it as an indication - // that the machine has booted and the bootstrap process has started. - // Why wait so long you may wonder? Well, we may be using a new MAC - // address and operating systems like Windows may need to digest that. - // - size_t to; - const size_t startup_to (5 * 60); - const size_t bootstrap_to (ops.bootstrap_timeout ()); - const size_t shutdown_to (5 * 60); - if (!tftpd.serve ((to = startup_to))) - return soft_fail ("bootstrap startup timeout"); + l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); - l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); + // Next the bootstrap process may download additional toolchain + // archives, build things, and then upload the result manifest. So on + // our side we serve TFTP requests while periodically checking for the + // manifest file. + // + for (to = bootstrap_to; + to != 0 && !file_exists (mf); + tftpd.serve (to)) ; - // Next the bootstrap process may download additional toolchain - // archives, build things, and then upload the result manifest. So on - // our side we serve TFTP requests while periodically checking for the - // manifest file. - // - for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ; + if (to == 0) + return soft_fail ("bootstrap timeout"); - if (to == 0) - return soft_fail ("bootstrap timeout"); + l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";}); - l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";}); + // Shut the machine down cleanly. + // + if (!m->shutdown ((to = shutdown_to))) + return soft_fail ("bootstrap shutdown timeout"); - // Shut the machine down cleanly. - // - if (!m->shutdown ((to = shutdown_to))) - return soft_fail ("bootstrap shutdown timeout"); + l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); + } - l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); - } + // Parse the result manifest. + // + r.bootstrap = parse_manifest (mf, "bootstrap"); - // Parse the result manifest. - // - r.bootstrap = parse_manifest (mf, "bootstrap"); + r.machine.mac = m->mac; // Save the MAC address. - r.machine.mac = m->mac; // Save the MAC address. + break; + } } catch (const system_error& e) { @@ -576,96 +601,112 @@ try const dir_path xp ( md.directory () /= path::traits::temp_name (md.leaf ().string ())); - run_btrfs (trace, "subvolume", "snapshot", md, xp); - string br ("br1"); // Using private bridge for now. - // Start the TFTP server. - // - tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" + - "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n", - ops.tftp_port () + tc_num); + for (size_t retry (0);; ++retry) + { + if (retry != 0) + run_btrfs (trace, "subvolume", "delete", xp); - l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); + run_btrfs (trace, "subvolume", "snapshot", md, xp); - // Start the machine. - // - unique_ptr m ( - start_machine (xp, - mm.machine, - mm.machine.mac, - br, - tftpd.port ())); - - // Note: the machine handling logic is similar to bootstrap. - // - { - auto mg ( - make_exception_guard ( - [&m, &xp] () - { - info << "trying to force machine " << xp << " down"; - try {m->forcedown ();} catch (const failed&) {} - })); + // Start the TFTP server. + // + tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" + + "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n", + ops.tftp_port () + tc_num); + + l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); - auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true) + // Start the machine. + // + unique_ptr m ( + start_machine (xp, + mm.machine, + mm.machine.mac, + br, + tftpd.port ())); + + // Note: the machine handling logic is similar to bootstrap. + // { + auto mg ( + make_exception_guard ( + [&m, &xp] () + { + info << "trying to force machine " << xp << " down"; + try {m->forcedown ();} catch (const failed&) {} + })); + + auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true) { - diag_record dr (error); - dr << msg << " for machine " << xp << ", suspending"; - m->print_info (dr); - } - m->suspend (); + { + diag_record dr (error); + dr << msg << " for machine " << xp << ", suspending"; + m->print_info (dr); + } + m->suspend (); - if (wait) - m->wait (); + if (wait) + m->wait (); - return r; - }; + return r; + }; - // The first request should be the task manifest download. Wait for up - // to 60 seconds for that to arrive. In a sense we use it as an - // indication that the machine has booted and the worker process has - // started. - // - size_t to; - const size_t startup_to (60); - const size_t build_to (ops.build_timeout ()); + // The first request should be the task manifest download. Wait for up + // to 60 seconds for that to arrive. In a sense we use it as an + // indication that the machine has booted and the worker process has + // started. + // + size_t to; + const size_t startup_to (60); + const size_t build_to (ops.build_timeout ()); - if (!tftpd.serve ((to = startup_to))) - return soft_fail ("build startup timeout"); + if (!tftpd.serve ((to = startup_to))) + { + if (retry > ops.build_retries ()) + return soft_fail ("build startup timeout"); - l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); + warn << "machine " << mm.machine.name << " appears to have " + << "mis-booted, retrying"; - // Next the worker builds things and then uploads the result manifest. - // So on our side we serve TFTP requests while checking for the manifest - // file. - // - for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ; + try {m->forcedown (false);} catch (const failed&) {} + continue; + } - if (to == 0) - return soft_fail ("build timeout"); + l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); - l3 ([&]{trace << "completed build in " << build_to - to << "s";}); + // Next the worker builds things and then uploads the result manifest. + // So on our side we serve TFTP requests while checking for the + // manifest file. + // + for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ; - // Parse the result manifest. - // - r = parse_manifest (rf, "result"); + if (to == 0) + return soft_fail ("build timeout"); - // If the build terminated abnormally, suspent the machine for - // investigation (note that here we don't wait or return). - // - if (r.status == result_status::abnormal) - soft_fail ("build terminated abnormally", false); + l3 ([&]{trace << "completed build in " << build_to - to << "s";}); - // Force the machine down (there is no need wasting time on clean - // shutdown since the next step is to drop the snapshot). Also fail - // softly if things go badly. - // - try {m->forcedown (false);} catch (const failed&) {} - } + // Parse the result manifest. + // + r = parse_manifest (rf, "result"); + + // If the build terminated abnormally, suspent the machine for + // investigation (note that here we don't wait or return). + // + if (r.status == result_status::abnormal) + soft_fail ("build terminated abnormally", false); - run_btrfs (trace, "subvolume", "delete", xp); + // Force the machine down (there is no need wasting time on clean + // shutdown since the next step is to drop the snapshot). Also fail + // softly if things go badly. + // + try {m->forcedown (false);} catch (const failed&) {} + } + + run_btrfs (trace, "subvolume", "delete", xp); + break; + } } // Update package name/version if the returned value as "unknown". diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service index b15108f..dd7ca21 100644 --- a/bbot/bbot-agent@.service +++ b/bbot/bbot-agent@.service @@ -11,7 +11,9 @@ Environment=CPU=1 Environment=RAM=1048576 Environment=BOOTSTRAP_TIMEOUT=600 +Environment=BOOTSTRAP_RETRIES=2 Environment=BUILD_TIMEOUT=1800 +Environment=BUILD_RETRIES=2 Environment=REQUEST_TIMEOUT=300 Environment=TOOLCHAIN_NAME=%i @@ -26,7 +28,9 @@ ExecStart=/build/bots/%i/bin/bbot-agent --systemd-daemon \ --cpu ${CPU} \ --ram ${RAM} \ --bootstrap-timeout ${BOOTSTRAP_TIMEOUT} \ + --bootstrap-retries ${BOOTSTRAP_RETRIES} \ --build-timeout ${BUILD_TIMEOUT} \ + --build-retries ${BUILD_RETRIES} \ --request-timeout ${REQUEST_TIMEOUT} \ --toolchain-name ${TOOLCHAIN_NAME} \ --toolchain-num ${TOOLCHAIN_NUM} \ -- cgit v1.1