aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2019-01-31 13:20:49 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2019-02-01 15:10:50 +0200
commit54ca39e390e6471c451e72b5796fb91ffdea0d2b (patch)
treeaf2ab954f0d85de8b1f589355d413d457bceb908
parent66d3bf73c4853c081214620eb53a32ea89b473b1 (diff)
Periodically check whether machine is still running
-rw-r--r--bbot/agent/agent.cxx87
-rw-r--r--bbot/agent/tftp.hxx2
2 files changed, 83 insertions, 6 deletions
diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx
index 5fecd0e..2378fc2 100644
--- a/bbot/agent/agent.cxx
+++ b/bbot/agent/agent.cxx
@@ -197,7 +197,7 @@ bootstrap_machine (const dir_path& md,
// that won't be very helpful for investigating the cause. So instead
// the plan is to suspend it after some timeout, issue diagnostics
// (without failing and which Build OS monitor will relay to the
- // admin), and wait for the external intervention.
+ // operator), and wait for the external intervention.
//
auto soft_fail = [&md, &m] (const char* msg)
{
@@ -218,6 +218,28 @@ bootstrap_machine (const dir_path& md,
return nullopt;
};
+ // Check whether the machine is still running, issuing diagnostics and
+ // returning false if it has terminated (with or without an error).
+ //
+ auto check_machine = [&md, &m] ()
+ {
+ try
+ {
+ size_t t (0); // Zero-second timeout.
+ if (!m->wait (t /* seconds */, false /* fail_hard */))
+ return true; // Still running.
+
+ // Exited successfully (still unexpected, so fall through to the error).
+ }
+ catch (const failed&)
+ {
+ // Failed, exit code diagnostics have already been issued.
+ }
+
+ error << "machine " << md << " exited unexpectedly";
+ return false;
+ };
+
// The first request should be the toolchain download. Wait for up to
// 5 minutes for that to arrive. In a sense we use it as an indication
// that the machine has booted and the bootstrap process has started.
@@ -229,13 +251,24 @@ bootstrap_machine (const dir_path& md,
const size_t bootstrap_to (ops.bootstrap_timeout ());
const size_t shutdown_to (5 * 60);
+ // Wait periodically making sure the machine is still alive.
+ //
+ for (to = startup_to; to != 0; )
+ {
+ if (tftpd.serve (to, 2))
+ break;
+
+ if (!check_machine ())
+ return r;
+ }
+
// This can mean two things: machine mis-configuration or what we
// euphemistically call a "mis-boot": the VM failed to boot for some
// unknown/random reason. Mac OS is particularly know for suffering
// from this. So the strategy is to retry it a couple of times and
// then suspend for investigation.
//
- if (!tftpd.serve ((to = startup_to)))
+ if (to == 0)
{
if (retry > ops.bootstrap_retries ())
return soft_fail ("bootstrap startup timeout");
@@ -256,8 +289,11 @@ bootstrap_machine (const dir_path& md,
// file's mtime/size is updated several seconds later; maybe tmpfs
// issue?), we periodically re-check.
//
- for (to = bootstrap_to; to != 0; tftpd.serve (to, 2))
+ for (to = bootstrap_to; to != 0; )
{
+ if (tftpd.serve (to, 2))
+ continue;
+
bool old (false);
if (file_not_empty (mf) || (old = file_not_empty (mfo)))
{
@@ -270,6 +306,9 @@ bootstrap_machine (const dir_path& md,
if (!tftpd.serve (to, 5))
break;
}
+
+ if (!check_machine ())
+ return nullopt;
}
if (to == 0)
@@ -865,6 +904,22 @@ try
return r;
};
+ auto check_machine = [&xp, &m] () // False (with diagnostics) if machine exited.
+ {
+ try
+ {
+ size_t t (0); // Zero-second timeout.
+ if (!m->wait (t /* seconds */, false /* fail_hard */))
+ return true; // Still running.
+ } // Otherwise exited successfully.
+ catch (const failed&)
+ {
+ } // Failed, exit code diagnostics have already been issued.
+
+ error << "machine " << xp << " exited unexpectedly";
+ return false;
+ };
+
// The first request should be the task manifest download. Wait for up
// to 90 seconds for that to arrive. In a sense we use it as an
// indication that the machine has booted and the worker process has
@@ -874,7 +929,18 @@ try
const size_t startup_to (90);
const size_t build_to (ops.build_timeout ());
- if (!tftpd.serve ((to = startup_to)))
+ // Wait periodically making sure the machine is still alive.
+ //
+ for (to = startup_to; to != 0; )
+ {
+ if (tftpd.serve (to, 2))
+ break;
+
+ if (!check_machine ())
+ return r;
+ }
+
+ if (to == 0)
{
if (retry > ops.build_retries ())
return soft_fail ("build startup timeout");
@@ -894,13 +960,19 @@ try
// file's mtime/size is updated several seconds later; maybe tmpfs
// issue?), we periodically re-check.
//
- for (to = build_to; to != 0; tftpd.serve (to, 2))
+ for (to = build_to; to != 0; )
{
+ if (tftpd.serve (to, 2))
+ continue;
+
if (file_not_empty (rf))
{
if (!tftpd.serve (to, 5))
break;
}
+
+ if (!check_machine ())
+ return r;
}
if (to == 0)
@@ -932,6 +1004,11 @@ try
// shutdown since the next step is to drop the snapshot). Also fail
// softly if things go badly.
//
+ // One thing to keep in mind is DHCP leases: with this approach
+ // they will not be released. However, because we have been reusing
+ // the same MAC address since bootstrap, on the next build we should
+ // get the same lease instead of a new one.
+ //
try {m->forcedown (false);} catch (const failed&) {}
}
}
diff --git a/bbot/agent/tftp.hxx b/bbot/agent/tftp.hxx
index 71f32b1..57d14dd 100644
--- a/bbot/agent/tftp.hxx
+++ b/bbot/agent/tftp.hxx
@@ -29,7 +29,7 @@ namespace bbot
uint16_t
port () const;
- // Wait for a TFTP request for up to the specified number of seconds. If
+ // Wait for a TFTP request for up to the specified number of seconds. If
// increment is not 0, then wait in the specified incremenets (i.e., wait
// for up to that number of seconds; useful when one needs to also
// periodically check for something else). Update the timeout value as