From 54ca39e390e6471c451e72b5796fb91ffdea0d2b Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 31 Jan 2019 13:20:49 +0200 Subject: Periodically check whether machine is still running --- bbot/agent/agent.cxx | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--- bbot/agent/tftp.hxx | 2 +- 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index 5fecd0e..2378fc2 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -197,7 +197,7 @@ bootstrap_machine (const dir_path& md, // that won't be very helpful for investigating the cause. So instead // the plan is to suspend it after some timeout, issue diagnostics // (without failing and which Build OS monitor will relay to the - // admin), and wait for the external intervention. + // operator), and wait for the external intervention. // auto soft_fail = [&md, &m] (const char* msg) { @@ -218,6 +218,28 @@ bootstrap_machine (const dir_path& md, return nullopt; }; + // Check whether the machine is still running issuing diagnostics and + // returning false if it unexpectedly terminated. + // + auto check_machine = [&md, &m] () + { + try + { + size_t t (0); + if (!m->wait (t /* seconds */, false /* fail_hard */)) + return true; // Still running. + + // Exited successfully. + } + catch (const failed&) + { + // Failed, exit code diagnostics has already been issued. + } + + error << "machine " << md << " exited unexpectedly"; + return false; + }; + // The first request should be the toolchain download. Wait for up to // 5 minutes for that to arrive. In a sense we use it as an indication // that the machine has booted and the bootstrap process has started. @@ -229,13 +251,24 @@ bootstrap_machine (const dir_path& md, const size_t bootstrap_to (ops.bootstrap_timeout ()); const size_t shutdown_to (5 * 60); + // Wait periodically making sure the machine is still alive. 
+ // + for (to = startup_to; to != 0; ) + { + if (tftpd.serve (to, 2)) + break; + + if (!check_machine ()) + return r; + } + // This can mean two things: machine mis-configuration or what we // euphemistically call a "mis-boot": the VM failed to boot for some // unknown/random reason. Mac OS is particularly known for suffering // from this. So the strategy is to retry it a couple of times and // then suspend for investigation. // - if (!tftpd.serve ((to = startup_to))) + if (to == 0) { if (retry > ops.bootstrap_retries ()) return soft_fail ("bootstrap startup timeout"); @@ -256,8 +289,11 @@ bootstrap_machine (const dir_path& md, // file's mtime/size is updated several seconds later; maybe tmpfs // issue?), we periodically re-check. // - for (to = bootstrap_to; to != 0; tftpd.serve (to, 2)) + for (to = bootstrap_to; to != 0; ) { + if (tftpd.serve (to, 2)) + continue; + bool old (false); if (file_not_empty (mf) || (old = file_not_empty (mfo))) { @@ -270,6 +306,9 @@ bootstrap_machine (const dir_path& md, if (!tftpd.serve (to, 5)) break; } + + if (!check_machine ()) + return nullopt; } if (to == 0) @@ -865,6 +904,22 @@ try return r; }; + auto check_machine = [&xp, &m] () + { + try + { + size_t t (0); + if (!m->wait (t /* seconds */, false /* fail_hard */)) + return true; + } + catch (const failed&) + { + } + + error << "machine " << xp << " exited unexpectedly"; + return false; + }; + // The first request should be the task manifest download. Wait for up // to 90 seconds for that to arrive. In a sense we use it as an // indication that the machine has booted and the worker process has @@ -874,7 +929,18 @@ try const size_t startup_to (90); const size_t build_to (ops.build_timeout ()); - if (!tftpd.serve ((to = startup_to))) + // Wait periodically making sure the machine is still alive. 
+ // + for (to = startup_to; to != 0; ) + { + if (tftpd.serve (to, 2)) + break; + + if (!check_machine ()) + return r; + } + + if (to == 0) { if (retry > ops.build_retries ()) return soft_fail ("build startup timeout"); @@ -894,13 +960,19 @@ try // file's mtime/size is updated several seconds later; maybe tmpfs // issue?), we periodically re-check. // - for (to = build_to; to != 0; tftpd.serve (to, 2)) + for (to = build_to; to != 0; ) { + if (tftpd.serve (to, 2)) + continue; + if (file_not_empty (rf)) { if (!tftpd.serve (to, 5)) break; } + + if (!check_machine ()) + return r; } if (to == 0) @@ -932,6 +1004,11 @@ try // shutdown since the next step is to drop the snapshot). Also fail // softly if things go badly. // + // One thing to keep in mind is DHCP leases: with this approach + // they will not be released. However, since we reuse the same MAC + // address since bootstrap, on the next build we should get the same + // lease instead of a new one. + // try {m->forcedown (false);} catch (const failed&) {} } } diff --git a/bbot/agent/tftp.hxx b/bbot/agent/tftp.hxx index 71f32b1..57d14dd 100644 --- a/bbot/agent/tftp.hxx +++ b/bbot/agent/tftp.hxx @@ -29,7 +29,7 @@ namespace bbot uint16_t port () const; - // Wait for a TFTP request for up to the specified number of seconds. If + // Wait for a TFTP request for up to the specified number of seconds. If // increment is not 0, then wait in the specified increments (i.e., wait // for up to that number of seconds; useful when one needs to also // periodically check for something else). Update the timeout value as -- cgit v1.1