aboutsummaryrefslogtreecommitdiff
path: root/bbot/agent.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'bbot/agent.cxx')
-rw-r--r--bbot/agent.cxx128
1 files changed, 97 insertions, 31 deletions
diff --git a/bbot/agent.cxx b/bbot/agent.cxx
index 4d5cc4a..634a94d 100644
--- a/bbot/agent.cxx
+++ b/bbot/agent.cxx
@@ -101,7 +101,11 @@ btrfs_exit (tracer& t, A&&... a)
: run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...);
}
-static bootstrapped_machine_manifest
+// Bootstrap the machine. Return the bootstrapped machine manifest if
+// successful and nullopt otherwise (in which case the machine directory
+// should be cleaned and the machine ignored for now).
+//
+static optional<bootstrapped_machine_manifest>
bootstrap_machine (const dir_path& md,
const machine_manifest& mm,
optional<bootstrapped_machine_manifest> obmm)
@@ -128,7 +132,7 @@ bootstrap_machine (const dir_path& md,
else
try
{
- string br ("br1"); // Use private bridge for now.
+ string br ("br1"); // Using private bridge for now.
// Start the TFTP server (server chroot is /build/tftp). Map:
//
@@ -138,6 +142,11 @@ bootstrap_machine (const dir_path& md,
auto_rmdir arm (dir_path ("/build/tftp/bootstrap/" + tc_name));
try_mkdir_p (arm.path ());
+ // Bootstrap result manifest.
+ //
+ path mf (arm.path () / "manifest");
+ try_rmfile (mf);
+
tftp_server tftpd ("Gr ^/?(.+)$ /toolchain/" + tc_name + "/\\1\n" +
"Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n");
@@ -152,32 +161,85 @@ bootstrap_machine (const dir_path& md,
br,
tftpd.port ()));
- r.machine.mac = m->mac;
+ {
+ // If we are terminating with an exception then force the machine down.
+ // Failing that, the machine's destructor will block waiting for its
+ // completion.
+ //
+ auto mg (
+ make_exception_guard (
+ [&m, &md] ()
+ {
+ info << "trying to force machine " << md << " down";
+ try {m->forcedown ();} catch (const failed&) {}
+ }));
+
+ // What happens if the bootstrap process hangs? The simple thing would
+ // be to force the machine down after some timeout and then fail. But
+ // that won't be very helpful for investigating the cause. So instead
+ // the plan is to suspend it after some timeout, issue diagnostics
+ // (without failing; the Build OS monitor will relay them to the admin),
+ // and wait for external intervention.
+ //
+ auto soft_fail = [&md, &m] (const char* msg)
+ {
+ {
+ diag_record dr (error);
+ dr << msg << " for machine " << md << ", suspending";
+ m->print_info (dr);
+ }
+ m->suspend ();
+ m->wait ();
+ return nullopt;
+ };
+
+ // The first request should be the toolchain download. Wait for up to 60
+ // seconds for that to arrive. In a sense we use it as an indication
+ // that the machine has booted and the bootstrap process has started.
+ //
+ size_t to;
+ const size_t startup_to (60);
+ const size_t bootstrap_to (ops.bootstrap_timeout ());
+ const size_t shutdown_to (60);
+
+ if (!tftpd.serve ((to = startup_to)))
+ return soft_fail ("bootstrap startup timeout");
+
+ l2 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+
+ // Next the bootstrap process may download additional toolchain
+ // archives, build things, and then upload the result manifest. So on
+ // our side we serve TFTP requests while periodically checking for the
+ // manifest file.
+ //
+ for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ;
+
+ if (to == 0)
+ return soft_fail ("bootstrap timeout");
+
+ l2 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
+
+ // Shut the machine down cleanly.
+ //
+ if (!m->shutdown ((to = shutdown_to)))
+ return soft_fail ("bootstrap shutdown timeout");
+
+ l2 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
+ }
- // The first request should be the toolchain download. Wait for up to 60
- // seconds for that to arrive. In a sense we use it as an indication that
- // the machine has booted and the bootstrap process has started.
+ // Parse the result manifest.
//
- size_t timeout (60);
- if (tftpd.serve (timeout))
+ try
{
- l2 ([&]{trace << "received first request in " << 60 - timeout << "s";});
+ r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
}
- else
+ catch (const failed&)
{
- // @@ What should be do here? Non-fatal? Mark the machine as failed?
- //
- error << "bootstrap timeout during first request for machine " << md;
- m->forcedown ();
- throw failed ();
+ error << "invalid bootstrap manifest for machine " << md;
+ return nullopt;
}
- if (!m->shutdown ())
- {
- error << "forcing machine " << md << " down";
- m->forcedown ();
- throw failed ();
- }
+ r.machine.mac = m->mac; // Save the MAC address.
}
catch (const system_error& e)
{
@@ -352,25 +414,25 @@ try
(r = cmp ("libbutl", LIBBUTL_VERSION)) != 0 ? r : 0;
};
- optional<bootstrapped_machine_manifest> obmm;
+ optional<bootstrapped_machine_manifest> bmm;
if (te)
{
- obmm = parse_manifest<bootstrapped_machine_manifest> (
+ bmm = parse_manifest<bootstrapped_machine_manifest> (
tp / "manifest", "bootstrapped machine");
- if (obmm->machine.id != mm.id)
+ if (bmm->machine.id != mm.id)
{
l2 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
te = false;
}
- if (obmm->toolchain.id != tc_id)
+ if (bmm->toolchain.id != tc_id)
{
l2 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
te = false;
}
- if (int i = compare_bbot (obmm->bootstrap))
+ if (int i = compare_bbot (bmm->bootstrap))
{
if (i < 0)
{
@@ -397,8 +459,14 @@ try
// bootstrap the new machine. Then atomically rename it to
// <name>-<toolchain>.
//
- bootstrapped_machine_manifest bmm (
- bootstrap_machine (xp, mm, move (obmm)));
+ bmm = bootstrap_machine (xp, mm, move (bmm));
+
+ if (!bmm)
+ {
+ l2 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
+ btrfs (trace, "subvolume", "delete", xp);
+ break;
+ }
try
{
@@ -409,12 +477,10 @@ try
fail << "unable to rename " << xp << " to " << tp;
}
- te = true;
-
// Check the boostrapped bbot version as above and ignore this
// machine if it's newer than us.
//
- if (int i = compare_bbot (bmm.bootstrap))
+ if (int i = compare_bbot (bmm->bootstrap))
{
assert (i > 0);
l2 ([&]{trace << "ignoring " << tp << ": old bbot";});