aboutsummaryrefslogtreecommitdiff
path: root/bbot/agent.cxx
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2017-07-13 22:50:15 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2017-07-14 19:10:22 +0300
commit c8ace1ee0a6cab5fd4ea2f084ea436cfa513637d (patch)
tree a8db884a665fbf14797393a3b2ff95438c338bb9 /bbot/agent.cxx
parent 8e8d599b129d35f638f2c1957c869b054a38b021 (diff)
Make use of wildcards in buildfiles
Diffstat (limited to 'bbot/agent.cxx')
-rw-r--r-- bbot/agent.cxx 1247
1 files changed, 0 insertions, 1247 deletions
diff --git a/bbot/agent.cxx b/bbot/agent.cxx
deleted file mode 100644
index d71f7b4..0000000
--- a/bbot/agent.cxx
+++ /dev/null
@@ -1,1247 +0,0 @@
-// file : bbot/agent.cxx -*- C++ -*-
-// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
-// license : TBC; see accompanying LICENSE file
-
-#include <bbot/agent.hxx>
-
-#include <pwd.h> // getpwuid()
-#include <limits.h> // PATH_MAX
-#include <signal.h> // signal()
-#include <stdlib.h> // rand_r()
-#include <unistd.h> // sleep(), readlink(), getuid(), fsync()
-
-#include <net/if.h> // ifreq
-#include <netinet/in.h> // sockaddr_in
-#include <arpa/inet.h> // inet_ntop()
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-
-#include <chrono>
-#include <iostream>
-
-#include <libbutl/pager.hxx>
-#include <libbutl/sha256.hxx>
-#include <libbutl/openssl.hxx>
-#include <libbutl/filesystem.hxx> // dir_iterator
-
-#include <libbbot/manifest.hxx>
-
-#include <bbot/types.hxx>
-#include <bbot/utility.hxx>
-#include <bbot/diagnostics.hxx>
-
-#include <bbot/tftp.hxx>
-#include <bbot/machine.hxx>
-#include <bbot/machine-manifest.hxx>
-#include <bbot/bootstrap-manifest.hxx>
-
-using namespace std;
-using namespace butl;
-using namespace bbot;
-
-namespace bbot
-{
- // Agent command line options (parsed in main()).
- //
- agent_options ops;
-
- // Bootstrap protocol version. Used as the <P> component of the machine
- // subvolume link name (<name>-<P>).
- //
- const string bs_prot ("1");
-
- // Toolchain name, number, version, and id. Set in main() from the
- // corresponding toolchain_*() option accessors.
- //
- string tc_name;
- uint16_t tc_num;
- standard_version tc_ver;
- string tc_id;
-
- // Host name as well as the id and name of the user we are running as. All
- // three are obtained at the beginning of main().
- //
- string hname;
- uid_t uid;
- string uname;
-}
-
-// Flush the specified file to stable storage by opening it for reading and
-// calling fsync() on the resulting descriptor. Throw system_error if
-// fsync() fails.
-//
-static void
-file_sync (const path& f)
-{
- auto_fd d (fdopen (f, fdopen_mode::in));
-
- const int rc (fsync (d.get ()));
-
- if (rc != 0)
- throw_system_error (errno);
-}
-
-// Run the btrfs tool with the specified arguments.
-//
-// The tool is rather chatty, printing informational messages (for example,
-// "Created snapshot such and such") to stdout and proper diagnostics to
-// stderr. So unless we are running at verbosity level 4 or higher, send its
-// stdout to the null device. Otherwise, redirect it to our stderr.
-//
-template <typename... A>
-inline void
-run_btrfs (tracer& t, A&&... a)
-{
- if (verb < 4)
- run_io (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...);
- else
- run_io (t, fdnull (), 2, 2, "btrfs", forward<A> (a)...);
-}
-
-// As run_btrfs() above but return the exit code of the btrfs process
-// (as returned by run_io_exit()) instead of using run_io().
-//
-template <typename... A>
-inline butl::process_exit::code_type
-btrfs_exit (tracer& t, A&&... a)
-{
- // Only pass the tool's stdout through (to our stderr) when tracing.
- //
- if (verb >= 4)
- return run_io_exit (t, fdnull (), 2, 2, "btrfs", forward<A> (a)...);
-
- return run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...);
-}
-
-// Bootstrap the machine. Return the bootstrapped machine manifest if
-// successful and nullopt otherwise (in which case the machine directory
-// should be cleaned and the machine ignored for now).
-//
-// The md argument is the machine directory to start the machine from, mm is
-// the (original) machine manifest, and obmm, if present, is the manifest of
-// the previous bootstrap of this machine (only its MAC address is reused).
-//
-// A system_error is translated to a hard failure (fail). Timeouts, on the
-// other hand, are soft failures: the machine is suspended, diagnostics is
-// issued, and nullopt is returned once the wait on the machine returns.
-//
-static optional<bootstrapped_machine_manifest>
-bootstrap_machine (const dir_path& md,
- const machine_manifest& mm,
- optional<bootstrapped_machine_manifest> obmm)
-{
- tracer trace ("bootstrap_machine", md.string ().c_str ());
-
- // Start with the result that records our own versions. The bootstrap
- // part is replaced with the manifest uploaded by the machine unless we
- // are faking the bootstrap.
- //
- bootstrapped_machine_manifest r {
- mm,
- toolchain_manifest {tc_id.empty () ? "bogus" : tc_id},
- bootstrap_manifest {
- bootstrap_manifest::versions_type {
- {"bbot", standard_version (BBOT_VERSION_STR)},
- {"libbbot", standard_version (LIBBBOT_VERSION_STR)},
- {"libbpkg", standard_version (LIBBPKG_VERSION_STR)},
- {"libbutl", standard_version (LIBBUTL_VERSION_STR)}
- }
- }
- };
-
- if (ops.fake_bootstrap ())
- {
- // No machine is started so use a dummy MAC address.
- //
- r.machine.mac = "de:ad:be:ef:de:ad";
- }
- else
- try
- {
- string br ("br1"); // Using private bridge for now.
-
- // Start the TFTP server (server chroot is --tftp). Map:
- //
- // GET requests to .../toolchains/<name>/*
- // PUT requests to .../bootstrap/<name>/*
- //
- auto_rmdir arm ((dir_path (ops.tftp ()) /= "bootstrap") /= tc_name);
- try_mkdir_p (arm.path ());
-
- // Bootstrap result manifest.
- //
- path mf (arm.path () / "manifest");
- try_rmfile (mf);
-
- // Note that unlike build, here we use the same VM snapshot for retries,
- // which is not ideal.
- //
- for (size_t retry (0);; ++retry)
- {
- tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" +
- "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n",
- ops.tftp_port () + tc_num);
-
- l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
-
- // Start the machine.
- //
- unique_ptr<machine> m (
- start_machine (md,
- mm,
- obmm ? obmm->machine.mac : nullopt,
- br,
- tftpd.port ()));
-
- {
- // If we are terminating with an exception then force the machine down.
- // Failed that, the machine's destructor will block waiting for its
- // completion.
- //
- auto mg (
- make_exception_guard (
- [&m, &md] ()
- {
- info << "trying to force machine " << md << " down";
- try {m->forcedown ();} catch (const failed&) {}
- }));
-
- // What happens if the bootstrap process hangs? The simple thing would
- // be to force the machine down after some timeout and then fail. But
- // that won't be very helpful for investigating the cause. So instead
- // the plan is to suspend it after some timeout, issue diagnostics
- // (without failing and which Build OS monitor will relay to the
- // admin), and wait for the external intervention.
- //
- auto soft_fail = [&md, &m] (const char* msg)
- {
- {
- diag_record dr (error);
- dr << msg << " for machine " << md << ", suspending";
- m->print_info (dr);
- }
- m->suspend ();
- m->wait ();
- info << "resuming after machine suspension";
- return nullopt;
- };
-
- // The first request should be the toolchain download. Wait for up to
- // 5 minutes for that to arrive. In a sense we use it as an indication
- // that the machine has booted and the bootstrap process has started.
- // Why wait so long you may wonder? Well, we may be using a new MAC
- // address and operating systems like Windows may need to digest that.
- //
- // Note that to is passed to serve()/shutdown() and is then used to
- // calculate the elapsed time, so it is presumably updated by these
- // calls with the remaining time (TODO: confirm against tftp_server
- // and machine).
- //
- size_t to;
- const size_t startup_to (5 * 60);
- const size_t bootstrap_to (ops.bootstrap_timeout ());
- const size_t shutdown_to (5 * 60);
-
- // This can mean two things: machine mis-configuration or what we
- // euphemistically call a "mis-boot": the VM failed to boot for some
- // unknown/random reason. Mac OS is particularly known for suffering
- // from this. So the strategy is to retry it a couple of times and
- // then suspend for investigation.
- //
- // NOTE(review): since retry starts from 0 and is compared with >, the
- // machine is started up to bootstrap_retries + 2 times before we give
- // up. Confirm this is the intended meaning of the option.
- //
- if (!tftpd.serve ((to = startup_to)))
- {
- if (retry > ops.bootstrap_retries ())
- return soft_fail ("bootstrap startup timeout");
-
- warn << "machine " << mm.name << " appears to have "
- << "mis-booted, retrying";
-
- try {m->forcedown (false);} catch (const failed&) {}
- continue;
- }
-
- l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
-
- // Next the bootstrap process may download additional toolchain
- // archives, build things, and then upload the result manifest. So on
- // our side we serve TFTP requests while periodically checking for the
- // manifest file. To workaround some obscure filesystem races (the
- // file's mtime/size is updated several seconds later; maybe tmpfs
- // issue?), we periodically re-check.
- //
- for (to = bootstrap_to; to != 0; tftpd.serve (to, 2))
- {
- if (file_exists (mf))
- {
- file_sync (mf);
- if (!file_empty (mf))
- break;
- }
- }
-
- if (to == 0)
- return soft_fail ("bootstrap timeout");
-
- l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
-
- // Shut the machine down cleanly.
- //
- if (!m->shutdown ((to = shutdown_to)))
- return soft_fail ("bootstrap shutdown timeout");
-
- l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
- }
-
- // Parse the result manifest.
- //
- r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
-
- r.machine.mac = m->mac; // Save the MAC address.
-
- break;
- }
- }
- catch (const system_error& e)
- {
- fail << "bootstrap error: " << e;
- }
-
- // Save the bootstrapped machine manifest in the machine directory.
- //
- serialize_manifest (r, md / "manifest", "bootstrapped machine");
- return r;
-}
-
-// Return available machines and their directories as a parallel array.
-//
-// The machines argument is the directory containing machine volumes (first
-// level) which in turn contain machine directories (second level). Entries
-// that are not directories or whose names start with a dot are ignored at
-// both levels. Machines are (re-)bootstrapped along the way if necessary.
-//
-// If --fake-machine is specified, then a single machine described by the
-// specified machine header manifest is returned without touching the
-// filesystem.
-//
-// Filesystem iteration and subvolume errors are hard failures (fail).
-//
-static pair<bootstrapped_machine_manifests, dir_paths>
-enumerate_machines (const dir_path& machines)
-try
-{
- tracer trace ("enumerate_machines", machines.string ().c_str ());
-
- bootstrapped_machine_manifests rm;
- dir_paths rd;
-
- if (ops.fake_machine_specified ())
- {
- auto mh (
- parse_manifest<machine_header_manifest> (
- ops.fake_machine (), "machine header"));
-
- rm.push_back (
- bootstrapped_machine_manifest {
- machine_manifest {
- mh.id,
- mh.name,
- mh.summary,
- machine_type::kvm,
- string ("de:ad:be:ef:de:ad"),
- nullopt},
- toolchain_manifest {tc_id},
- bootstrap_manifest {}
- });
-
- rd.push_back (dir_path (ops.machines ()) /= mh.name); // For diagnostics.
-
- return make_pair (move (rm), move (rd));
- }
-
- // The first level are machine volumes.
- //
- for (const dir_entry& ve: dir_iterator (machines))
- {
- const string vn (ve.path ().string ());
-
- // Ignore hidden directories.
- //
- if (ve.type () != entry_type::directory || vn[0] == '.')
- continue;
-
- const dir_path vd (dir_path (machines) /= vn);
-
- // Inside we have machines.
- //
- try
- {
- for (const dir_entry& me: dir_iterator (vd))
- {
- const string mn (me.path ().string ());
-
- if (me.type () != entry_type::directory || mn[0] == '.')
- continue;
-
- const dir_path md (dir_path (vd) /= mn);
-
- // Our endgoal here is to obtain a bootstrapped snapshot of this
- // machine while watching out for potential race conditions (machines
- // being added/upgraded/removed; see the manual for details).
- //
- // So here is our overall plan:
- //
- // 1. Resolve current subvolume link for our bootstrap protocol.
- //
- // 2. If there is no link, cleanup and ignore this machine.
- //
- // 3. Try to create a snapshot of current subvolume (this operation is
- // atomic). If failed (e.g., someone changed the link and removed
- // the subvolume in the meantime), retry from #1.
- //
- // 4. Compare the snapshot to the already bootstrapped version (if
- // any) and see if we need to re-bootstrap. If so, use the snapshot
- // as a starting point. Rename to bootstrapped at the end (atomic).
- //
- dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P>
- dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain>
- bool te (dir_exists (tp));
-
- // Delete the <name>-<toolchain> subvolume, first clearing its
- // read-only property.
- //
- auto delete_t = [&tp, &trace] ()
- {
- run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false");
- run_btrfs (trace, "subvolume", "delete", tp);
- };
-
- for (size_t retry (0);; ++retry)
- {
- if (retry != 0)
- sleep (1);
-
- // Resolve the link to subvolume path.
- //
- dir_path sp; // <name>-<P>.<R>
- try
- {
- char b [PATH_MAX + 1];
- ssize_t r (readlink (lp.string ().c_str (), b, sizeof (b)));
-
- if (r == -1)
- {
- // No link is not an error (sp stays empty).
- //
- if (errno != ENOENT)
- throw_generic_error (errno);
- }
- else if (static_cast<size_t> (r) >= sizeof (b))
- throw_generic_error (EINVAL);
- else
- {
- b[r] = '\0';
- sp = dir_path (b);
- if (sp.relative ())
- sp = md / sp;
- }
- }
- catch (const system_error& e)
- {
- fail << "unable to read subvolume link " << lp << ": " << e;
- }
-
- // If the resolution fails, then this means there is no current
- // machine subvolume (for this bootstrap protocol). In this case we
- // clean up our toolchain subvolume (<name>-<toolchain>) and ignore
- // this machine.
- //
- if (sp.empty ())
- {
- if (te)
- delete_t ();
-
- l3 ([&]{trace << "skipping " << md << ": no subvolume link";});
- break;
- }
-
- // <name>-<toolchain>-<xxx>
- //
- const dir_path xp (
- dir_path (md) /= path::traits::temp_name (mn + '-' + tc_name));
-
- if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0)
- {
- if (retry >= 10)
- fail << "unable to snapshot subvolume " << sp;
-
- continue;
- }
-
- // Load the (original) machine manifest.
- //
- auto mm (
- parse_manifest<machine_manifest> (sp / "manifest", "machine"));
-
- // If we already have <name>-<toolchain>, see if it needs to be re-
- // bootstrapped. Things that render it obsolete:
- //
- // 1. New machine revision (compare machine ids).
- // 2. New toolchain (compare toolchain ids).
- // 3. New bbot/libbbot (compare versions).
- //
- // The last case has a complication: what should we do if we have
- // bootstrapped a newer version of bbot? This would mean that we are
- // about to be stopped and upgraded (and the upgraded version will
- // probably be able to use the result). So we simply ignore this
- // machine for this run.
-
- // Return -1 if older, 0 if the same, and +1 if newer.
- //
- auto compare_bbot = [] (const bootstrap_manifest& m) -> int
- {
- // Note that a missing version entry is treated as older.
- //
- auto cmp = [&m] (const string& n, const char* v) -> int
- {
- standard_version sv (v);
- auto i = m.versions.find (n);
-
- return (i == m.versions.end () || i->second < sv
- ? -1
- : i->second > sv ? 1 : 0);
- };
-
- // Start from the top assuming a new dependency cannot be added
- // without changing the dependent's version.
- //
- int r;
- return
- (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r :
- (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0;
- };
-
- optional<bootstrapped_machine_manifest> bmm;
- if (te)
- {
- bmm = parse_manifest<bootstrapped_machine_manifest> (
- tp / "manifest", "bootstrapped machine");
-
- if (bmm->machine.id != mm.id)
- {
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";});
- te = false;
- }
-
- if (!tc_id.empty () && bmm->toolchain.id != tc_id)
- {
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";});
- te = false;
- }
-
- if (int i = compare_bbot (bmm->bootstrap))
- {
- if (i < 0)
- {
- l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";});
- te = false;
- }
- else
- {
- l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
- }
- }
-
- if (!te)
- delete_t ();
- }
- else
- l3 ([&]{trace << "bootstrapping " << tp;});
-
- if (!te)
- {
- // Use the <name>-<toolchain>-<xxx> snapshot that we have made to
- // bootstrap the new machine. Then atomically rename it to
- // <name>-<toolchain>.
- //
- bmm = bootstrap_machine (xp, mm, move (bmm));
-
- if (!bmm)
- {
- l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";});
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
- }
-
- try
- {
- mvdir (xp, tp);
- }
- catch (const system_error& e)
- {
- // Include the error description, as in other diagnostics here.
- //
- fail << "unable to rename " << xp << " to " << tp << ": " << e;
- }
-
- l2 ([&]{trace << "bootstrapped " << bmm->machine.name;});
-
- // Check the bootstrapped bbot version as above and ignore this
- // machine if it's newer than us.
- //
- if (int i = compare_bbot (bmm->bootstrap))
- {
- if (i > 0)
- {
- l3 ([&]{trace << "ignoring " << tp << ": old bbot";});
- break;
- }
- else
- warn << "bootstrapped " << tp << " bbot worker is older "
- << "than agent; assuming test setup";
- }
- }
- else
- run_btrfs (trace, "subvolume", "delete", xp);
-
- // Add the machine to the lists.
- //
- rm.push_back (move (*bmm));
- rd.push_back (move (tp));
-
- break;
- }
- }
- }
- catch (const system_error& e)
- {
- fail << "unable to iterate over " << vd << ": " << e << endf;
- }
- }
-
- return make_pair (move (rm), move (rd));
-}
-catch (const system_error& e)
-{
- fail << "unable to iterate over " << machines << ": " << e << endf;
-}
-
-// Perform the build task on the specified bootstrapped machine and return
-// the result manifest.
-//
-// The md argument is the bootstrapped machine directory (which is
-// snapshotted before being used), mm is its bootstrapped machine manifest,
-// and tm is the task manifest to pass to the worker.
-//
-// The result starts out with the abort status which is what is returned if
-// --fake-build is specified or if the build fails softly (timeouts, in
-// which case the machine is suspended for investigation). A system_error
-// is translated to a hard failure (fail).
-//
-static result_manifest
-perform_task (const dir_path& md,
- const bootstrapped_machine_manifest& mm,
- const task_manifest& tm)
-try
-{
- tracer trace ("perform_task", md.string ().c_str ());
-
- result_manifest r {
- tm.name,
- tm.version,
- result_status::abort,
- operation_results {}};
-
- if (ops.fake_build ())
- return r;
-
- // The overall plan is as follows:
- //
- // 1. Snapshot the (bootstrapped) machine.
- //
- // 2. Save the task manifest to the TFTP directory (to be accessed by the
- // worker).
- //
- // 3. Start the TFTP server and the machine.
- //
- // 4. Serve TFTP requests while watching out for the result manifest.
- //
- // 5. Clean up (force the machine down and delete the snapshot).
- //
-
- // TFTP server mapping (server chroot is --tftp):
- //
- // GET requests to .../build/<name>/get/*
- // PUT requests to .../build/<name>/put/*
- //
- auto_rmdir arm ((dir_path (ops.tftp ()) /= "build") /= tc_name);
-
- dir_path gd (dir_path (arm.path ()) /= "get");
- dir_path pd (dir_path (arm.path ()) /= "put");
-
- try_mkdir_p (gd);
- try_mkdir_p (pd);
-
- path tf (gd / "manifest"); // Task manifest file.
- path rf (pd / "manifest"); // Result manifest file.
-
- serialize_manifest (tm, tf, "task");
-
- if (ops.fake_machine_specified ())
- {
- // Simply wait for the file to appear.
- //
- for (size_t i (0); !file_exists (rf); sleep (1))
- if (i++ % 10 == 0)
- l3 ([&]{trace << "waiting for result manifest";});
-
- r = parse_manifest<result_manifest> (rf, "result");
- }
- else
- {
- try_rmfile (rf);
-
- // <name>-<toolchain>-<xxx>
- //
- const dir_path xp (
- md.directory () /= path::traits::temp_name (md.leaf ().string ()));
-
- string br ("br1"); // Using private bridge for now.
-
- for (size_t retry (0);; ++retry)
- {
- // On retries start from a fresh snapshot.
- //
- if (retry != 0)
- run_btrfs (trace, "subvolume", "delete", xp);
-
- run_btrfs (trace, "subvolume", "snapshot", md, xp);
-
- // Start the TFTP server.
- //
- tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" +
- "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n",
- ops.tftp_port () + tc_num);
-
- l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
-
- // Start the machine.
- //
- unique_ptr<machine> m (
- start_machine (xp,
- mm.machine,
- mm.machine.mac,
- br,
- tftpd.port ()));
-
- // Note: the machine handling logic is similar to bootstrap.
- //
- {
- // If we are terminating with an exception then try to force the
- // machine down.
- //
- auto mg (
- make_exception_guard (
- [&m, &xp] ()
- {
- info << "trying to force machine " << xp << " down";
- try {m->forcedown ();} catch (const failed&) {}
- }));
-
- // Issue diagnostics, suspend the machine, wait on it, and return the
- // current result. Note that the snapshot is not deleted in this case.
- //
- auto soft_fail = [&xp, &m, &r] (const char* msg)
- {
- {
- diag_record dr (error);
- dr << msg << " for machine " << xp << ", suspending";
- m->print_info (dr);
- }
- m->suspend ();
- m->wait ();
- info << "resuming after machine suspension";
- return r;
- };
-
- // The first request should be the task manifest download. Wait for up
- // to 60 seconds for that to arrive. In a sense we use it as an
- // indication that the machine has booted and the worker process has
- // started.
- //
- size_t to;
- const size_t startup_to (60);
- const size_t build_to (ops.build_timeout ());
-
- if (!tftpd.serve ((to = startup_to)))
- {
- if (retry > ops.build_retries ())
- return soft_fail ("build startup timeout");
-
- warn << "machine " << mm.machine.name << " appears to have "
- << "mis-booted, retrying";
-
- try {m->forcedown (false);} catch (const failed&) {}
- continue;
- }
-
- l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
-
- // Next the worker builds things and then uploads the result manifest.
- // So on our side we serve TFTP requests while checking for the
- // manifest file. To workaround some obscure filesystem races (the
- // file's mtime/size is updated several seconds later; maybe tmpfs
- // issue?), we periodically re-check.
- //
- for (to = build_to; to != 0; tftpd.serve (to, 2))
- {
- if (file_exists (rf))
- {
- file_sync (rf);
- if (!file_empty (rf))
- break;
- }
- }
-
- if (to == 0)
- return soft_fail ("build timeout");
-
- l3 ([&]{trace << "completed build in " << build_to - to << "s";});
-
- // Parse the result manifest.
- //
- try
- {
- r = parse_manifest<result_manifest> (rf, "result", false);
- }
- catch (const failed&)
- {
- r.status = result_status::abnormal; // Soft-fail below.
- }
-
- if (r.status == result_status::abnormal)
- {
- // If the build terminated abnormally, suspend the machine for
- // investigation.
- //
- return soft_fail ("build terminated abnormally");
- }
- else
- {
- // Force the machine down (there is no need wasting time on clean
- // shutdown since the next step is to drop the snapshot). Also fail
- // softly if things go badly.
- //
- try {m->forcedown (false);} catch (const failed&) {}
- }
- }
-
- run_btrfs (trace, "subvolume", "delete", xp);
- break;
- }
- }
-
- // Update package name/version if the returned value is "unknown".
- //
- if (r.version == bpkg::version ("0"))
- {
- assert (r.status == result_status::abnormal);
-
- r.name = tm.name;
- r.version = tm.version;
- }
-
- return r;
-}
-catch (const system_error& e)
-{
- fail << "build error: " << e << endf;
-}
-
-// Handler for the signals installed in main(): exit with code 3 on SIGHUP
-// and with code 0 on SIGTERM. No other signal is expected.
-//
-extern "C" void
-handle_signal (int sig)
-{
- if (sig == SIGHUP)
- exit (3); // Unimplemented feature.
-
- if (sig == SIGTERM)
- exit (0);
-
- assert (false);
-}
-
-// The agent entry point: parse the options, handle --version and --help,
-// set up signal handling, calculate the authentication key fingerprint (if
-// any), and then run the work loop (enumerate machines, request a task,
-// perform it, upload the result).
-//
-// Return 0 on success (including after the --dump-* options) and 1 on
-// failure (in which case the diagnostics has been issued).
-//
-int
-main (int argc, char* argv[])
-try
-{
- cli::argv_scanner scan (argc, argv, true);
- ops.parse (scan);
-
- verb = ops.verbose ();
-
- if (ops.systemd_daemon ())
- systemd_diagnostics (true); // With critical errors.
-
- tracer trace ("main");
-
- uid = getuid ();
-
- // Note that getpwuid() returns NULL if there is no matching entry or on
- // error, so check before dereferencing.
- //
- if (const passwd* pw = getpwuid (uid))
- uname = pw->pw_name;
- else
- fail << "unable to obtain user name for uid " << uid;
-
- {
- char buf[HOST_NAME_MAX + 1];
-
- if (gethostname (buf, sizeof (buf)) == -1)
- fail << "unable to obtain hostname: "
- << system_error (errno, generic_category ()); // Sanitize.
-
- hname = buf;
- }
-
- // On POSIX ignore SIGPIPE which is signaled to a pipe-writing process if
- // the pipe reading end is closed. Note that by default this signal
- // terminates a process. Also note that there is no way to disable this
- // behavior on a file descriptor basis or for the write() function call.
- //
- if (signal (SIGPIPE, SIG_IGN) == SIG_ERR)
- fail << "unable to ignore broken pipe (SIGPIPE) signal: "
- << system_error (errno, generic_category ()); // Sanitize.
-
- // Version.
- //
- if (ops.version ())
- {
- cout << "bbot-agent " << BBOT_VERSION_ID << endl
- << "libbbot " << LIBBBOT_VERSION_ID << endl
- << "libbpkg " << LIBBPKG_VERSION_ID << endl
- << "libbutl " << LIBBUTL_VERSION_ID << endl
- << "Copyright (c) 2014-2017 Code Synthesis Ltd" << endl
- << "TBC; All rights reserved" << endl;
-
- return 0;
- }
-
- // Help.
- //
- if (ops.help ())
- {
- pager p ("bbot-agent help", false);
- print_bbot_agent_usage (p.stream ());
-
- // If the pager failed, assume it has issued some diagnostics.
- //
- return p.wait () ? 0 : 1;
- }
-
- tc_name = ops.toolchain_name ();
- tc_num = ops.toolchain_num ();
- tc_ver = (ops.toolchain_ver_specified ()
- ? ops.toolchain_ver ()
- : standard_version (BBOT_VERSION_STR));
- tc_id = ops.toolchain_id ();
-
-
- // Controller URLs.
- //
- if (argc < 2 &&
- !ops.dump_machines () &&
- !ops.fake_request_specified ())
- {
- fail << "controller url expected" <<
- info << "run " << argv[0] << " --help for details";
- }
-
- strings controllers;
-
- for (int i (1); i != argc; ++i)
- controllers.push_back (argv[i]);
-
- // Handle SIGHUP and SIGTERM.
- //
- if (signal (SIGHUP, &handle_signal) == SIG_ERR ||
- signal (SIGTERM, &handle_signal) == SIG_ERR)
- fail << "unable to set signal handler: "
- << system_error (errno, generic_category ()); // Sanitize.
-
- // SHA256 fingerprint of the authentication public key (DER), if any.
- //
- optional<string> fingerprint;
-
- if (ops.auth_key_specified ())
- try
- {
- // Note that the process always prints to STDERR, so we redirect it to the
- // null device. We also check for the key file existence to print more
- // meaningful error message if that's not the case.
- //
- if (!file_exists (ops.auth_key ()))
- throw_generic_error (ENOENT);
-
- openssl os (trace,
- ops.auth_key (), path ("-"), fdnull (),
- ops.openssl (), "rsa",
- ops.openssl_option (), "-pubout", "-outform", "DER");
-
- vector<char> k (os.in.read_binary ());
- os.in.close ();
-
- if (!os.wait ())
- throw_generic_error (EIO);
-
- fingerprint = sha256 (k.data (), k.size ()).string ();
- }
- catch (const system_error& e)
- {
- fail << "unable to obtain authentication public key: " << e;
- }
-
- // When running as a systemd daemon, print the startup information.
- //
- if (ops.systemd_daemon ())
- {
- diag_record dr;
-
- dr << info << "bbot agent " << BBOT_VERSION_ID;
-
- if (fingerprint)
- dr << info << "auth key fp " << *fingerprint;
-
- dr <<
- info << "toolchain name " << tc_name <<
- info << "toolchain num " << tc_num <<
- info << "toolchain ver " << tc_ver.string () <<
- info << "toolchain id " << tc_id <<
- info << "CPU(s) " << ops.cpu () <<
- info << "RAM(kB) " << ops.ram ();
-
- for (const string& u: controllers)
- dr << info << "controller url " << u;
- }
-
- // The work loop. The steps we go through are:
- //
- // 1. Enumerate the available machines, (re-)bootstrapping any if necessary.
- //
- // 2. Poll controller(s) for build tasks.
- //
- // 3. If no build tasks are available, go to #1 (after sleeping a bit).
- //
- // 4. If a build task is returned, do it, upload the result, and go to #1
- // (immediately).
- //
- for (bool sleep (false);; ::sleep (sleep ? 60 : 0), sleep = false)
- {
- // Enumerate the machines.
- //
- auto mp (enumerate_machines (ops.machines ()));
- bootstrapped_machine_manifests& ms (mp.first);
- dir_paths& ds (mp.second);
-
- // Prepare task request.
- //
- task_request_manifest tq {
- hname,
- tc_name,
- tc_ver,
- fingerprint,
- machine_header_manifests {}
- };
-
- // Note: same order as ms/ds (relied upon below to find the machine).
- //
- for (const bootstrapped_machine_manifest& m: ms)
- tq.machines.emplace_back (m.machine.id,
- m.machine.name,
- m.machine.summary);
-
- if (ops.dump_machines ())
- {
- for (const machine_header_manifest& m: tq.machines)
- serialize_manifest (m, cout, "stdout", "machine");
-
- return 0;
- }
-
- if (tq.machines.empty ())
- {
- warn << "no build machines for toolchain " << tc_name;
- sleep = true;
- continue;
- }
-
- // Send task requests.
- //
- //
- string url;
- task_response_manifest tr;
-
- if (ops.fake_request_specified ())
- {
- auto t (parse_manifest<task_manifest> (ops.fake_request (), "task"));
-
- tr = task_response_manifest {
- "fake-session", // Dummy session.
- nullopt, // No challenge.
- url, // Empty result URL.
- move (t)};
-
- url = "http://example.org";
- }
- else
- {
- // Try the controllers in order until one of them returns a task.
- //
- for (const string& u: controllers)
- {
- try
- {
- http_curl c (trace,
- path ("-"),
- path ("-"),
- curl::post,
- u,
- "--header", "Content-Type: text/manifest",
- "--max-time", ops.request_timeout ());
-
- // This is tricky/hairy: we may fail hard parsing the output before
- // seeing that curl exited with an error and failing softly.
- //
- bool f (false);
-
- try
- {
- serialize_manifest (tq, c.out, u, "task request", false);
- }
- catch (const failed&) {f = true;}
-
- c.out.close ();
-
- if (!f)
- try
- {
- tr = parse_manifest<task_response_manifest> (
- c.in, u, "task response", false);
- }
- catch (const failed&) {f = true;}
-
- c.in.close ();
-
- if (!c.wait () || f)
- throw_generic_error (EIO);
- }
- catch (const system_error& e)
- {
- error << "unable to request task from " << u << ": " << e;
- continue;
- }
-
- if (tr.challenge && !fingerprint) // Controller misbehaves.
- {
- error << "unexpected challenge from " << u << ": " << *tr.challenge;
- continue;
- }
-
- if (!tr.session.empty ()) // Got a task.
- {
- url = u;
-
- task_manifest& t (*tr.task);
- l2 ([&]{trace << "task for " << t.name << '/' << t.version << " "
- << "on " << t.machine << " "
- << "from " << url;});
- break;
- }
- }
- }
-
- if (tr.session.empty ()) // No task from any of the controllers.
- {
- l2 ([&]{trace << "no tasks from any controllers, sleeping";});
- sleep = true;
- continue;
- }
-
- // We have a build task.
- //
- // First find the index of the machine we were asked to use (and also
- // verify it is one of those we sent).
- //
- size_t i (0);
- for (const machine_header_manifest& m: tq.machines)
- {
- if (m.name == tr.task->machine)
- break;
-
- ++i;
- }
-
- if (i == ms.size ())
- {
- error << "task from " << url << " for unknown machine "
- << tr.task->machine;
-
- if (ops.dump_task ())
- return 0;
-
- continue;
- }
-
- task_manifest& t (*tr.task);
-
- if (ops.dump_task ())
- {
- serialize_manifest (t, cout, "stdout", "task");
- return 0;
- }
-
- // If we have our own repository certificate fingerprints, then use them
- // to replace what we have received from the controller.
- //
- if (!ops.trust ().empty ())
- t.trust = ops.trust ();
-
- const dir_path& d (ds[i]); // The -<toolchain> directory.
- const bootstrapped_machine_manifest& m (ms[i]);
-
- result_manifest r (perform_task (d, m, t));
-
- if (ops.dump_result ())
- {
- serialize_manifest (r, cout, "stdout", "result");
- return 0;
- }
-
- // Prepare answer to the private key challenge.
- //
- optional<vector<char>> challenge;
-
- if (tr.challenge)
- try
- {
- assert (ops.auth_key_specified ());
-
- openssl os (trace,
- fdstream_mode::text, path ("-"), 2,
- ops.openssl (), "rsautl",
- ops.openssl_option (), "-sign", "-inkey", ops.auth_key ());
-
- os.out << *tr.challenge;
- os.out.close ();
-
- challenge = os.in.read_binary ();
- os.in.close ();
-
- if (!os.wait ())
- throw_generic_error (EIO);
- }
- catch (const system_error& e)
- {
- // The task response challenge is valid (verified by manifest parser),
- // so there is something wrong with setup, and so the failure is fatal.
- //
- fail << "unable to sign task response challenge: " << e;
- }
-
- // Upload the result.
- //
- result_request_manifest rq {tr.session, move (challenge), move (r)};
- {
- const string& u (*tr.result_url);
-
- try
- {
- http_curl c (trace,
- path ("-"),
- nullfd, // Not expecting any data in response.
- curl::post,
- u,
- "--header", "Content-Type: text/manifest",
- "--max-time", ops.request_timeout ());
-
- // This is tricky/hairy: we may fail hard writing the input before
- // seeing that curl exited with an error and failing softly.
- //
- bool f (false);
-
- try
- {
- serialize_manifest (rq, c.out, u, "result request");
- }
- catch (const failed&) {f = true;}
-
- c.out.close ();
-
- if (!c.wait () || f)
- throw_generic_error (EIO);
- }
- catch (const system_error& e)
- {
- error << "unable to upload result to " << u << ": " << e;
- continue;
- }
- }
-
- l2 ([&]{trace << "built " << t.name << '/' << t.version << " "
- << "on " << t.machine << " "
- << "for " << url;});
- }
-}
-catch (const failed&)
-{
- return 1; // Diagnostics has already been issued.
-}
-catch (const cli::exception& e)
-{
- error << e;
- return 1;
-}
-
-namespace bbot
-{
- static unsigned int rand_seed; // Seed for rand_r();
-
- // Return a pseudo-random number generated with rand_r(). The seed is
- // initialized from the system clock on the first call (or, more precisely,
- // whenever it is 0).
- //
- size_t
- genrand ()
- {
- if (rand_seed == 0)
- rand_seed = static_cast<unsigned int> (
- chrono::system_clock::now ().time_since_epoch ().count ());
-
- return static_cast<size_t> (rand_r (&rand_seed));
- }
-
- // Return the IPv4 address of the specified network interface in the
- // dotted-decimal form. Throw invalid_argument if the interface name is too
- // long to fit into ifreq and system_error if any of the underlying calls
- // fail.
- //
- // Note: Linux-specific implementation.
- //
- string
- iface_addr (const string& i)
- {
- // The name is copied below with strcpy() so it must fit including the
- // terminating NUL.
- //
- if (i.size () >= IFNAMSIZ)
- throw invalid_argument ("interface name too long");
-
- auto_fd fd (socket (AF_INET, SOCK_DGRAM | SOCK_CLOEXEC, 0));
-
- if (fd.get () == -1)
- throw_system_error (errno);
-
- ifreq ifr;
- ifr.ifr_addr.sa_family = AF_INET;
- strcpy (ifr.ifr_name, i.c_str ());
-
- if (ioctl (fd.get (), SIOCGIFADDR, &ifr) == -1)
- throw_system_error (errno);
-
- char buf[3 * 4 + 3 + 1]; // IPv4 address.
- if (inet_ntop (AF_INET,
- &reinterpret_cast<sockaddr_in*> (&ifr.ifr_addr)->sin_addr,
- buf,
- sizeof (buf)) == nullptr)
- throw_system_error (errno);
-
- return buf;
- }
-}