aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2017-05-01 15:46:25 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2017-05-01 15:46:25 +0200
commit6cabe4d21b49ea149e6694c7290ff3e1fa9e08d2 (patch)
tree79c6009de91c2ff583b986a4301e31435aa3383a
parent8b896055d4d90b538211784cf9ad1bf335b20dc6 (diff)
Retry mis-booted machines
-rw-r--r--bbot/agent.cli12
-rw-r--r--bbot/agent.cxx329
-rw-r--r--bbot/bbot-agent@.service4
3 files changed, 201 insertions, 144 deletions
diff --git a/bbot/agent.cli b/bbot/agent.cli
index 497b841..49563f8 100644
--- a/bbot/agent.cli
+++ b/bbot/agent.cli
@@ -106,6 +106,12 @@ namespace bbot
600 (10 minutes) by default."
}
+ size_t --bootstrap-retries = 2
+ {
+ "<num>",
+ "Number of time to retry a mis-booted bootstrap, 2 by default."
+ }
+
size_t --build-timeout = 1800
{
"<sec>",
@@ -113,6 +119,12 @@ namespace bbot
minutes) by default."
}
+ size_t --build-retries = 2
+ {
+ "<num>",
+ "Number of time to retry a mis-booted build, 2 by default."
+ }
+
size_t --request-timeout = 300
{
"<sec>",
diff --git a/bbot/agent.cxx b/bbot/agent.cxx
index 096bafe..d30799f 100644
--- a/bbot/agent.cxx
+++ b/bbot/agent.cxx
@@ -121,94 +121,119 @@ bootstrap_machine (const dir_path& md,
path mf (arm.path () / "manifest");
try_rmfile (mf);
- tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" +
- "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n",
- ops.tftp_port () + tc_num);
-
- l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
-
- // Start the machine.
+ // Note that unlike build, here we use the same VM snapshot for retries,
+ // which is not ideal.
//
- unique_ptr<machine> m (
- start_machine (md,
- mm,
- obmm ? obmm->machine.mac : nullopt,
- br,
- tftpd.port ()));
-
+ for (size_t retry (0);; ++retry)
{
- // If we are terminating with an exception then force the machine down.
- // Failed that, the machine's destructor will block waiting for its
- // completion.
- //
- auto mg (
- make_exception_guard (
- [&m, &md] ()
- {
- info << "trying to force machine " << md << " down";
- try {m->forcedown ();} catch (const failed&) {}
- }));
-
- // What happens if the bootstrap process hangs? The simple thing would
- // be to force the machine down after some timeout and then fail. But
- // that won't be very helpful for investigating the cause. So instead
- // the plan is to suspend it after some timeout, issue diagnostics
- // (without failing and which Build OS monitor will relay to the admin),
- // and wait for the external intervention.
+ tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" +
+ "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n",
+ ops.tftp_port () + tc_num);
+
+ l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
+
+ // Start the machine.
//
- auto soft_fail = [&md, &m] (const char* msg)
+ unique_ptr<machine> m (
+ start_machine (md,
+ mm,
+ obmm ? obmm->machine.mac : nullopt,
+ br,
+ tftpd.port ()));
+
{
+ // If we are terminating with an exception then force the machine down.
+ // Failed that, the machine's destructor will block waiting for its
+ // completion.
+ //
+ auto mg (
+ make_exception_guard (
+ [&m, &md] ()
+ {
+ info << "trying to force machine " << md << " down";
+ try {m->forcedown ();} catch (const failed&) {}
+ }));
+
+ // What happens if the bootstrap process hangs? The simple thing would
+ // be to force the machine down after some timeout and then fail. But
+ // that won't be very helpful for investigating the cause. So instead
+ // the plan is to suspend it after some timeout, issue diagnostics
+ // (without failing and which Build OS monitor will relay to the
+ // admin), and wait for the external intervention.
+ //
+ auto soft_fail = [&md, &m] (const char* msg)
{
- diag_record dr (error);
- dr << msg << " for machine " << md << ", suspending";
- m->print_info (dr);
+ {
+ diag_record dr (error);
+ dr << msg << " for machine " << md << ", suspending";
+ m->print_info (dr);
+ }
+ m->suspend ();
+ m->wait ();
+ return nullopt;
+ };
+
+ // The first request should be the toolchain download. Wait for up to
+ // 5 minutes for that to arrive. In a sense we use it as an indication
+ // that the machine has booted and the bootstrap process has started.
+ // Why wait so long you may wonder? Well, we may be using a new MAC
+ // address and operating systems like Windows may need to digest that.
+ //
+ size_t to;
+ const size_t startup_to (5 * 60);
+ const size_t bootstrap_to (ops.bootstrap_timeout ());
+ const size_t shutdown_to (5 * 60);
+
+ // This can mean two things: machine mis-configuration or what we
+ // euphemistically call a "mis-boot": the VM failed to boot for some
+ // unknown/random reason. Mac OS is particularly know for suffering
+ // from this. So the strategy is to retry it a couple of times and
+ // then suspend for investigation.
+ //
+ if (!tftpd.serve ((to = startup_to)))
+ {
+ if (retry > ops.bootstrap_retries ())
+ return soft_fail ("bootstrap startup timeout");
+
+ warn << "machine " << mm.name << " appears to have "
+ << "mis-booted, retrying";
+
+ try {m->forcedown (false);} catch (const failed&) {}
+ continue;
}
- m->suspend ();
- m->wait ();
- return nullopt;
- };
-
- // The first request should be the toolchain download. Wait for up to 5
- // minutes for that to arrive. In a sense we use it as an indication
- // that the machine has booted and the bootstrap process has started.
- // Why wait so long you may wonder? Well, we may be using a new MAC
- // address and operating systems like Windows may need to digest that.
- //
- size_t to;
- const size_t startup_to (5 * 60);
- const size_t bootstrap_to (ops.bootstrap_timeout ());
- const size_t shutdown_to (5 * 60);
- if (!tftpd.serve ((to = startup_to)))
- return soft_fail ("bootstrap startup timeout");
+ l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
- l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+ // Next the bootstrap process may download additional toolchain
+ // archives, build things, and then upload the result manifest. So on
+ // our side we serve TFTP requests while periodically checking for the
+ // manifest file.
+ //
+ for (to = bootstrap_to;
+ to != 0 && !file_exists (mf);
+ tftpd.serve (to)) ;
- // Next the bootstrap process may download additional toolchain
- // archives, build things, and then upload the result manifest. So on
- // our side we serve TFTP requests while periodically checking for the
- // manifest file.
- //
- for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ;
+ if (to == 0)
+ return soft_fail ("bootstrap timeout");
- if (to == 0)
- return soft_fail ("bootstrap timeout");
+ l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
- l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
+ // Shut the machine down cleanly.
+ //
+ if (!m->shutdown ((to = shutdown_to)))
+ return soft_fail ("bootstrap shutdown timeout");
- // Shut the machine down cleanly.
- //
- if (!m->shutdown ((to = shutdown_to)))
- return soft_fail ("bootstrap shutdown timeout");
+ l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
+ }
- l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
- }
+ // Parse the result manifest.
+ //
+ r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
- // Parse the result manifest.
- //
- r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
+ r.machine.mac = m->mac; // Save the MAC address.
- r.machine.mac = m->mac; // Save the MAC address.
+ break;
+ }
}
catch (const system_error& e)
{
@@ -576,96 +601,112 @@ try
const dir_path xp (
md.directory () /= path::traits::temp_name (md.leaf ().string ()));
- run_btrfs (trace, "subvolume", "snapshot", md, xp);
-
string br ("br1"); // Using private bridge for now.
- // Start the TFTP server.
- //
- tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" +
- "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n",
- ops.tftp_port () + tc_num);
+ for (size_t retry (0);; ++retry)
+ {
+ if (retry != 0)
+ run_btrfs (trace, "subvolume", "delete", xp);
- l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
+ run_btrfs (trace, "subvolume", "snapshot", md, xp);
- // Start the machine.
- //
- unique_ptr<machine> m (
- start_machine (xp,
- mm.machine,
- mm.machine.mac,
- br,
- tftpd.port ()));
-
- // Note: the machine handling logic is similar to bootstrap.
- //
- {
- auto mg (
- make_exception_guard (
- [&m, &xp] ()
- {
- info << "trying to force machine " << xp << " down";
- try {m->forcedown ();} catch (const failed&) {}
- }));
+ // Start the TFTP server.
+ //
+ tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" +
+ "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n",
+ ops.tftp_port () + tc_num);
+
+ l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
- auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true)
+ // Start the machine.
+ //
+ unique_ptr<machine> m (
+ start_machine (xp,
+ mm.machine,
+ mm.machine.mac,
+ br,
+ tftpd.port ()));
+
+ // Note: the machine handling logic is similar to bootstrap.
+ //
{
+ auto mg (
+ make_exception_guard (
+ [&m, &xp] ()
+ {
+ info << "trying to force machine " << xp << " down";
+ try {m->forcedown ();} catch (const failed&) {}
+ }));
+
+ auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true)
{
- diag_record dr (error);
- dr << msg << " for machine " << xp << ", suspending";
- m->print_info (dr);
- }
- m->suspend ();
+ {
+ diag_record dr (error);
+ dr << msg << " for machine " << xp << ", suspending";
+ m->print_info (dr);
+ }
+ m->suspend ();
- if (wait)
- m->wait ();
+ if (wait)
+ m->wait ();
- return r;
- };
+ return r;
+ };
- // The first request should be the task manifest download. Wait for up
- // to 60 seconds for that to arrive. In a sense we use it as an
- // indication that the machine has booted and the worker process has
- // started.
- //
- size_t to;
- const size_t startup_to (60);
- const size_t build_to (ops.build_timeout ());
+ // The first request should be the task manifest download. Wait for up
+ // to 60 seconds for that to arrive. In a sense we use it as an
+ // indication that the machine has booted and the worker process has
+ // started.
+ //
+ size_t to;
+ const size_t startup_to (60);
+ const size_t build_to (ops.build_timeout ());
- if (!tftpd.serve ((to = startup_to)))
- return soft_fail ("build startup timeout");
+ if (!tftpd.serve ((to = startup_to)))
+ {
+ if (retry > ops.build_retries ())
+ return soft_fail ("build startup timeout");
- l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+ warn << "machine " << mm.machine.name << " appears to have "
+ << "mis-booted, retrying";
- // Next the worker builds things and then uploads the result manifest.
- // So on our side we serve TFTP requests while checking for the manifest
- // file.
- //
- for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ;
+ try {m->forcedown (false);} catch (const failed&) {}
+ continue;
+ }
- if (to == 0)
- return soft_fail ("build timeout");
+ l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
- l3 ([&]{trace << "completed build in " << build_to - to << "s";});
+ // Next the worker builds things and then uploads the result manifest.
+ // So on our side we serve TFTP requests while checking for the
+ // manifest file.
+ //
+ for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ;
- // Parse the result manifest.
- //
- r = parse_manifest<result_manifest> (rf, "result");
+ if (to == 0)
+ return soft_fail ("build timeout");
- // If the build terminated abnormally, suspent the machine for
- // investigation (note that here we don't wait or return).
- //
- if (r.status == result_status::abnormal)
- soft_fail ("build terminated abnormally", false);
+ l3 ([&]{trace << "completed build in " << build_to - to << "s";});
- // Force the machine down (there is no need wasting time on clean
- // shutdown since the next step is to drop the snapshot). Also fail
- // softly if things go badly.
- //
- try {m->forcedown (false);} catch (const failed&) {}
- }
+ // Parse the result manifest.
+ //
+ r = parse_manifest<result_manifest> (rf, "result");
+
+ // If the build terminated abnormally, suspent the machine for
+ // investigation (note that here we don't wait or return).
+ //
+ if (r.status == result_status::abnormal)
+ soft_fail ("build terminated abnormally", false);
- run_btrfs (trace, "subvolume", "delete", xp);
+ // Force the machine down (there is no need wasting time on clean
+ // shutdown since the next step is to drop the snapshot). Also fail
+ // softly if things go badly.
+ //
+ try {m->forcedown (false);} catch (const failed&) {}
+ }
+
+ run_btrfs (trace, "subvolume", "delete", xp);
+ break;
+ }
}
// Update package name/version if the returned value as "unknown".
diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service
index b15108f..dd7ca21 100644
--- a/bbot/bbot-agent@.service
+++ b/bbot/bbot-agent@.service
@@ -11,7 +11,9 @@ Environment=CPU=1
Environment=RAM=1048576
Environment=BOOTSTRAP_TIMEOUT=600
+Environment=BOOTSTRAP_RETRIES=2
Environment=BUILD_TIMEOUT=1800
+Environment=BUILD_RETRIES=2
Environment=REQUEST_TIMEOUT=300
Environment=TOOLCHAIN_NAME=%i
@@ -26,7 +28,9 @@ ExecStart=/build/bots/%i/bin/bbot-agent --systemd-daemon \
--cpu ${CPU} \
--ram ${RAM} \
--bootstrap-timeout ${BOOTSTRAP_TIMEOUT} \
+ --bootstrap-retries ${BOOTSTRAP_RETRIES} \
--build-timeout ${BUILD_TIMEOUT} \
+ --build-retries ${BUILD_RETRIES} \
--request-timeout ${REQUEST_TIMEOUT} \
--toolchain-name ${TOOLCHAIN_NAME} \
--toolchain-num ${TOOLCHAIN_NUM} \