diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2017-07-13 22:50:15 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2017-07-14 19:10:22 +0300 |
commit | c8ace1ee0a6cab5fd4ea2f084ea436cfa513637d (patch) | |
tree | a8db884a665fbf14797393a3b2ff95438c338bb9 /bbot/agent.cxx | |
parent | 8e8d599b129d35f638f2c1957c869b054a38b021 (diff) |
Make use of wildcards in buildfiles
Diffstat (limited to 'bbot/agent.cxx')
-rw-r--r-- | bbot/agent.cxx | 1247 |
1 files changed, 0 insertions, 1247 deletions
diff --git a/bbot/agent.cxx b/bbot/agent.cxx deleted file mode 100644 index d71f7b4..0000000 --- a/bbot/agent.cxx +++ /dev/null @@ -1,1247 +0,0 @@ -// file : bbot/agent.cxx -*- C++ -*- -// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd -// license : TBC; see accompanying LICENSE file - -#include <bbot/agent.hxx> - -#include <pwd.h> // getpwuid() -#include <limits.h> // PATH_MAX -#include <signal.h> // signal() -#include <stdlib.h> // rand_r() -#include <unistd.h> // sleep(), realink(), getuid(), fsync() - -#include <net/if.h> // ifreq -#include <netinet/in.h> // sockaddr_in -#include <arpa/inet.h> // inet_ntop() -#include <sys/ioctl.h> -#include <sys/socket.h> - -#include <chrono> -#include <iostream> - -#include <libbutl/pager.hxx> -#include <libbutl/sha256.hxx> -#include <libbutl/openssl.hxx> -#include <libbutl/filesystem.hxx> // dir_iterator - -#include <libbbot/manifest.hxx> - -#include <bbot/types.hxx> -#include <bbot/utility.hxx> -#include <bbot/diagnostics.hxx> - -#include <bbot/tftp.hxx> -#include <bbot/machine.hxx> -#include <bbot/machine-manifest.hxx> -#include <bbot/bootstrap-manifest.hxx> - -using namespace std; -using namespace butl; -using namespace bbot; - -namespace bbot -{ - agent_options ops; - - const string bs_prot ("1"); - - string tc_name; - uint16_t tc_num; - standard_version tc_ver; - string tc_id; - - string hname; - uid_t uid; - string uname; -} - -static void -file_sync (const path& f) -{ - auto_fd fd (fdopen (f, fdopen_mode::in)); - if (fsync (fd.get ()) != 0) - throw_system_error (errno); -} - -// The btrfs tool likes to print informational messages, like "Created -// snapshot such and such". Luckily, it writes them to stdout while proper -// diagnostics to stderr. -// -template <typename... A> -inline void -run_btrfs (tracer& t, A&&... a) -{ - if (verb >= 4) - run_io (t, fdnull (), 2, 2, "btrfs", forward<A> (a)...); - else - run_io (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...); -} - -template <typename... A> -inline butl::process_exit::code_type -btrfs_exit (tracer& t, A&&... a) -{ - return verb >= 4 - ? run_io_exit (t, fdnull (), 2, 2, "btrfs", forward<A> (a)...) - : run_io_exit (t, fdnull (), fdnull (), 2, "btrfs", forward<A> (a)...); -} - -// Bootstrap the machine. Return the bootstrapped machine manifest if -// successful and nullopt otherwise (in which case the machine directory -// should be cleaned and the machine ignored for now). -// -static optional<bootstrapped_machine_manifest> -bootstrap_machine (const dir_path& md, - const machine_manifest& mm, - optional<bootstrapped_machine_manifest> obmm) -{ - tracer trace ("bootstrap_machine", md.string ().c_str ()); - - bootstrapped_machine_manifest r { - mm, - toolchain_manifest {tc_id.empty () ? "bogus" : tc_id}, - bootstrap_manifest { - bootstrap_manifest::versions_type { - {"bbot", standard_version (BBOT_VERSION_STR)}, - {"libbbot", standard_version (LIBBBOT_VERSION_STR)}, - {"libbpkg", standard_version (LIBBPKG_VERSION_STR)}, - {"libbutl", standard_version (LIBBUTL_VERSION_STR)} - } - } - }; - - if (ops.fake_bootstrap ()) - { - r.machine.mac = "de:ad:be:ef:de:ad"; - } - else - try - { - string br ("br1"); // Using private bridge for now. - - // Start the TFTP server (server chroot is --tftp). Map: - // - // GET requests to .../toolchains/<name>/* - // PUT requests to .../bootstrap/<name>/* - // - auto_rmdir arm ((dir_path (ops.tftp ()) /= "bootstrap") /= tc_name); - try_mkdir_p (arm.path ()); - - // Bootstrap result manifest. - // - path mf (arm.path () / "manifest"); - try_rmfile (mf); - - // Note that unlike build, here we use the same VM snapshot for retries, - // which is not ideal. - // - for (size_t retry (0);; ++retry) - { - tftp_server tftpd ("Gr ^/?(.+)$ /toolchains/" + tc_name + "/\\1\n" + - "Pr ^/?(.+)$ /bootstrap/" + tc_name + "/\\1\n", - ops.tftp_port () + tc_num); - - l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); - - // Start the machine. - // - unique_ptr<machine> m ( - start_machine (md, - mm, - obmm ? obmm->machine.mac : nullopt, - br, - tftpd.port ())); - - { - // If we are terminating with an exception then force the machine down. - // Failed that, the machine's destructor will block waiting for its - // completion. - // - auto mg ( - make_exception_guard ( - [&m, &md] () - { - info << "trying to force machine " << md << " down"; - try {m->forcedown ();} catch (const failed&) {} - })); - - // What happens if the bootstrap process hangs? The simple thing would - // be to force the machine down after some timeout and then fail. But - // that won't be very helpful for investigating the cause. So instead - // the plan is to suspend it after some timeout, issue diagnostics - // (without failing and which Build OS monitor will relay to the - // admin), and wait for the external intervention. - // - auto soft_fail = [&md, &m] (const char* msg) - { - { - diag_record dr (error); - dr << msg << " for machine " << md << ", suspending"; - m->print_info (dr); - } - m->suspend (); - m->wait (); - info << "resuming after machine suspension"; - return nullopt; - }; - - // The first request should be the toolchain download. Wait for up to - // 5 minutes for that to arrive. In a sense we use it as an indication - // that the machine has booted and the bootstrap process has started. - // Why wait so long you may wonder? Well, we may be using a new MAC - // address and operating systems like Windows may need to digest that. - // - size_t to; - const size_t startup_to (5 * 60); - const size_t bootstrap_to (ops.bootstrap_timeout ()); - const size_t shutdown_to (5 * 60); - - // This can mean two things: machine mis-configuration or what we - // euphemistically call a "mis-boot": the VM failed to boot for some - // unknown/random reason. Mac OS is particularly know for suffering - // from this. So the strategy is to retry it a couple of times and - // then suspend for investigation. - // - if (!tftpd.serve ((to = startup_to))) - { - if (retry > ops.bootstrap_retries ()) - return soft_fail ("bootstrap startup timeout"); - - warn << "machine " << mm.name << " appears to have " - << "mis-booted, retrying"; - - try {m->forcedown (false);} catch (const failed&) {} - continue; - } - - l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); - - // Next the bootstrap process may download additional toolchain - // archives, build things, and then upload the result manifest. So on - // our side we serve TFTP requests while periodically checking for the - // manifest file. To workaround some obscure filesystem races (the - // file's mtime/size is updated several seconds later; maybe tmpfs - // issue?), we periodically re-check. - // - for (to = bootstrap_to; to != 0; tftpd.serve (to, 2)) - { - if (file_exists (mf)) - { - file_sync (mf); - if (!file_empty (mf)) - break; - } - } - - if (to == 0) - return soft_fail ("bootstrap timeout"); - - l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";}); - - // Shut the machine down cleanly. - // - if (!m->shutdown ((to = shutdown_to))) - return soft_fail ("bootstrap shutdown timeout"); - - l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";}); - } - - // Parse the result manifest. - // - r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap"); - - r.machine.mac = m->mac; // Save the MAC address. - - break; - } - } - catch (const system_error& e) - { - fail << "bootstrap error: " << e; - } - - serialize_manifest (r, md / "manifest", "bootstrapped machine"); - return r; -} - -// Return available machines and their directories as a parallel array. -// -static pair<bootstrapped_machine_manifests, dir_paths> -enumerate_machines (const dir_path& machines) -try -{ - tracer trace ("enumerate_machines", machines.string ().c_str ()); - - bootstrapped_machine_manifests rm; - dir_paths rd; - - if (ops.fake_machine_specified ()) - { - auto mh ( - parse_manifest<machine_header_manifest> ( - ops.fake_machine (), "machine header")); - - rm.push_back ( - bootstrapped_machine_manifest { - machine_manifest { - mh.id, - mh.name, - mh.summary, - machine_type::kvm, - string ("de:ad:be:ef:de:ad"), - nullopt}, - toolchain_manifest {tc_id}, - bootstrap_manifest {} - }); - - rd.push_back (dir_path (ops.machines ()) /= mh.name); // For diagnostics. - - return make_pair (move (rm), move (rd)); - } - - // The first level are machine volumes. - // - for (const dir_entry& ve: dir_iterator (machines)) - { - const string vn (ve.path ().string ()); - - // Ignore hidden directories. - // - if (ve.type () != entry_type::directory || vn[0] == '.') - continue; - - const dir_path vd (dir_path (machines) /= vn); - - // Inside we have machines. - // - try - { - for (const dir_entry& me: dir_iterator (vd)) - { - const string mn (me.path ().string ()); - - if (me.type () != entry_type::directory || mn[0] == '.') - continue; - - const dir_path md (dir_path (vd) /= mn); - - // Our endgoal here is to obtain a bootstrapped snapshot of this - // machine while watching out for potential race conditions (machines - // being added/upgraded/removed; see the manual for details). - // - // So here is our overall plan: - // - // 1. Resolve current subvolume link for our bootstrap protocol. - // - // 2. If there is no link, cleanup and ignore this machine. - // - // 3. Try to create a snapshot of current subvolume (this operation is - // atomic). If failed (e.g., someone changed the link and removed - // the subvolume in the meantime), retry from #1. - // - // 4. Compare the snapshot to the already bootstrapped version (if - // any) and see if we need to re-bootstrap. If so, use the snapshot - // as a starting point. Rename to bootstrapped at the end (atomic). - // - dir_path lp (dir_path (md) /= (mn + '-' + bs_prot)); // -<P> - dir_path tp (dir_path (md) /= (mn + '-' + tc_name)); // -<toolchain> - bool te (dir_exists (tp)); - - auto delete_t = [&tp, &trace] () - { - run_btrfs (trace, "property", "set", "-ts", tp, "ro", "false"); - run_btrfs (trace, "subvolume", "delete", tp); - }; - - for (size_t retry (0);; ++retry) - { - if (retry != 0) - sleep (1); - - // Resolve the link to subvolume path. - // - dir_path sp; // <name>-<P>.<R> - try - { - char b [PATH_MAX + 1]; - ssize_t r (readlink (lp.string ().c_str (), b, sizeof (b))); - - if (r == -1) - { - if (errno != ENOENT) - throw_generic_error (errno); - } - else if (static_cast<size_t> (r) >= sizeof (b)) - throw_generic_error (EINVAL); - else - { - b[r] = '\0'; - sp = dir_path (b); - if (sp.relative ()) - sp = md / sp; - } - } - catch (const system_error& e) - { - fail << "unable to read subvolume link " << lp << ": " << e; - } - - // If the resolution fails, then this means there is no current - // machine subvolume (for this bootstrap protocol). In this case we - // clean up our toolchain subvolume (<name>-<toolchain>) and ignore - // this machine. - // - if (sp.empty ()) - { - if (te) - delete_t (); - - l3 ([&]{trace << "skipping " << md << ": no subvolume link";}); - break; - } - - // <name>-<toolchain>-<xxx> - // - const dir_path xp ( - dir_path (md) /= path::traits::temp_name (mn + '-' + tc_name)); - - if (btrfs_exit (trace, "subvolume", "snapshot", sp, xp) != 0) - { - if (retry >= 10) - fail << "unable to snapshot subvolume " << sp; - - continue; - } - - // Load the (original) machine manifest. - // - auto mm ( - parse_manifest<machine_manifest> (sp / "manifest", "machine")); - - // If we already have <name>-<toolchain>, see if it needs to be re- - // bootstrapped. Things that render it obsolete: - // - // 1. New machine revision (compare machine ids). - // 2. New toolchain (compare toolchain ids). - // 3. New bbot/libbbot (compare versions). - // - // The last case has a complication: what should we do if we have - // bootstrapped a newer version of bbot? This would mean that we are - // about to be stopped and upgraded (and the upgraded version will - // probably be able to use the result). So we simply ignore this - // machine for this run. - - // Return -1 if older, 0 if the same, and +1 if newer. - // - auto compare_bbot = [] (const bootstrap_manifest& m) -> int - { - auto cmp = [&m] (const string& n, const char* v) -> int - { - standard_version sv (v); - auto i = m.versions.find (n); - - return (i == m.versions.end () || i->second < sv - ? -1 - : i->second > sv ? 1 : 0); - }; - - // Start from the top assuming a new dependency cannot be added - // without changing the dependent's version. - // - int r; - return - (r = cmp ("bbot", BBOT_VERSION_STR)) != 0 ? r : - (r = cmp ("libbbot", LIBBBOT_VERSION_STR)) != 0 ? r : - (r = cmp ("libbpkg", LIBBPKG_VERSION_STR)) != 0 ? r : - (r = cmp ("libbutl", LIBBUTL_VERSION_STR)) != 0 ? r : 0; - }; - - optional<bootstrapped_machine_manifest> bmm; - if (te) - { - bmm = parse_manifest<bootstrapped_machine_manifest> ( - tp / "manifest", "bootstrapped machine"); - - if (bmm->machine.id != mm.id) - { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new machine";}); - te = false; - } - - if (!tc_id.empty () && bmm->toolchain.id != tc_id) - { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new toolchain";}); - te = false; - } - - if (int i = compare_bbot (bmm->bootstrap)) - { - if (i < 0) - { - l3 ([&]{trace << "re-bootstrapping " << tp << ": new bbot";}); - te = false; - } - else - { - l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); - run_btrfs (trace, "subvolume", "delete", xp); - break; - } - } - - if (!te) - delete_t (); - } - else - l3 ([&]{trace << "bootstrapping " << tp;}); - - if (!te) - { - // Use the <name>-<toolchain>-<xxx> snapshot that we have made to - // bootstrap the new machine. Then atomically rename it to - // <name>-<toolchain>. - // - bmm = bootstrap_machine (xp, mm, move (bmm)); - - if (!bmm) - { - l3 ([&]{trace << "ignoring " << tp << ": failed to bootstrap";}); - run_btrfs (trace, "subvolume", "delete", xp); - break; - } - - try - { - mvdir (xp, tp); - } - catch (const system_error& e) - { - fail << "unable to rename " << xp << " to " << tp; - } - - l2 ([&]{trace << "bootstrapped " << bmm->machine.name;}); - - // Check the bootstrapped bbot version as above and ignore this - // machine if it's newer than us. - // - if (int i = compare_bbot (bmm->bootstrap)) - { - if (i > 0) - { - l3 ([&]{trace << "ignoring " << tp << ": old bbot";}); - break; - } - else - warn << "bootstrapped " << tp << " bbot worker is older " - << "than agent; assuming test setup"; - } - } - else - run_btrfs (trace, "subvolume", "delete", xp); - - // Add the machine to the lists. - // - rm.push_back (move (*bmm)); - rd.push_back (move (tp)); - - break; - } - } - } - catch (const system_error& e) - { - fail << "unable to iterate over " << vd << ": " << e << endf; - } - } - - return make_pair (move (rm), move (rd)); -} -catch (const system_error& e) -{ - fail << "unable to iterate over " << machines << ": " << e << endf; -} - -static result_manifest -perform_task (const dir_path& md, - const bootstrapped_machine_manifest& mm, - const task_manifest& tm) -try -{ - tracer trace ("perform_task", md.string ().c_str ()); - - result_manifest r { - tm.name, - tm.version, - result_status::abort, - operation_results {}}; - - if (ops.fake_build ()) - return r; - - // The overall plan is as follows: - // - // 1. Snapshot the (bootstrapped) machine. - // - // 2. Save the task manifest to the TFTP directory (to be accessed by the - // worker). - // - // 3. Start the TFTP server and the machine. - // - // 4. Serve TFTP requests while watching out for the result manifest. - // - // 5. Clean up (force the machine down and delete the snapshot). - // - - // TFTP server mapping (server chroot is --tftp): - // - // GET requests to .../build/<name>/get/* - // PUT requests to .../build/<name>/put/* - // - auto_rmdir arm ((dir_path (ops.tftp ()) /= "build") /= tc_name); - - dir_path gd (dir_path (arm.path ()) /= "get"); - dir_path pd (dir_path (arm.path ()) /= "put"); - - try_mkdir_p (gd); - try_mkdir_p (pd); - - path tf (gd / "manifest"); // Task manifest file. - path rf (pd / "manifest"); // Result manifest file. - - serialize_manifest (tm, tf, "task"); - - if (ops.fake_machine_specified ()) - { - // Simply wait for the file to appear. - // - for (size_t i (0); !file_exists (rf); sleep (1)) - if (i++ % 10 == 0) - l3 ([&]{trace << "waiting for result manifest";}); - - r = parse_manifest<result_manifest> (rf, "result"); - } - else - { - try_rmfile (rf); - - // <name>-<toolchain>-<xxx> - // - const dir_path xp ( - md.directory () /= path::traits::temp_name (md.leaf ().string ())); - - string br ("br1"); // Using private bridge for now. - - for (size_t retry (0);; ++retry) - { - if (retry != 0) - run_btrfs (trace, "subvolume", "delete", xp); - - run_btrfs (trace, "subvolume", "snapshot", md, xp); - - // Start the TFTP server. - // - tftp_server tftpd ("Gr ^/?(.+)$ /build/" + tc_name + "/get/\\1\n" + - "Pr ^/?(.+)$ /build/" + tc_name + "/put/\\1\n", - ops.tftp_port () + tc_num); - - l3 ([&]{trace << "tftp server on port " << tftpd.port ();}); - - // Start the machine. - // - unique_ptr<machine> m ( - start_machine (xp, - mm.machine, - mm.machine.mac, - br, - tftpd.port ())); - - // Note: the machine handling logic is similar to bootstrap. - // - { - auto mg ( - make_exception_guard ( - [&m, &xp] () - { - info << "trying to force machine " << xp << " down"; - try {m->forcedown ();} catch (const failed&) {} - })); - - auto soft_fail = [&xp, &m, &r] (const char* msg) - { - { - diag_record dr (error); - dr << msg << " for machine " << xp << ", suspending"; - m->print_info (dr); - } - m->suspend (); - m->wait (); - info << "resuming after machine suspension"; - return r; - }; - - // The first request should be the task manifest download. Wait for up - // to 60 seconds for that to arrive. In a sense we use it as an - // indication that the machine has booted and the worker process has - // started. - // - size_t to; - const size_t startup_to (60); - const size_t build_to (ops.build_timeout ()); - - if (!tftpd.serve ((to = startup_to))) - { - if (retry > ops.build_retries ()) - return soft_fail ("build startup timeout"); - - warn << "machine " << mm.machine.name << " appears to have " - << "mis-booted, retrying"; - - try {m->forcedown (false);} catch (const failed&) {} - continue; - } - - l3 ([&]{trace << "completed startup in " << startup_to - to << "s";}); - - // Next the worker builds things and then uploads the result manifest. - // So on our side we serve TFTP requests while checking for the - // manifest file. To workaround some obscure filesystem races (the - // file's mtime/size is updated several seconds later; maybe tmpfs - // issue?), we periodically re-check. - // - for (to = build_to; to != 0; tftpd.serve (to, 2)) - { - if (file_exists (rf)) - { - file_sync (rf); - if (!file_empty (rf)) - break; - } - } - - if (to == 0) - return soft_fail ("build timeout"); - - l3 ([&]{trace << "completed build in " << build_to - to << "s";}); - - // Parse the result manifest. - // - try - { - r = parse_manifest<result_manifest> (rf, "result", false); - } - catch (const failed&) - { - r.status = result_status::abnormal; // Soft-fail below. - } - - if (r.status == result_status::abnormal) - { - // If the build terminated abnormally, suspend the machine for - // investigation. - // - return soft_fail ("build terminated abnormally"); - } - else - { - // Force the machine down (there is no need wasting time on clean - // shutdown since the next step is to drop the snapshot). Also fail - // softly if things go badly. - // - try {m->forcedown (false);} catch (const failed&) {} - } - } - - run_btrfs (trace, "subvolume", "delete", xp); - break; - } - } - - // Update package name/version if the returned value as "unknown". - // - if (r.version == bpkg::version ("0")) - { - assert (r.status == result_status::abnormal); - - r.name = tm.name; - r.version = tm.version; - } - - return r; -} -catch (const system_error& e) -{ - fail << "build error: " << e << endf; -} - -extern "C" void -handle_signal (int sig) -{ - switch (sig) - { - case SIGHUP: exit (3); // Unimplemented feature. - case SIGTERM: exit (0); - default: assert (false); - } -} - -int -main (int argc, char* argv[]) -try -{ - cli::argv_scanner scan (argc, argv, true); - ops.parse (scan); - - verb = ops.verbose (); - - if (ops.systemd_daemon ()) - systemd_diagnostics (true); // With critical errors. - - tracer trace ("main"); - - uid = getuid (); - uname = getpwuid (uid)->pw_name; - - { - char buf[HOST_NAME_MAX + 1]; - - if (gethostname (buf, sizeof (buf)) == -1) - fail << "unable to obtain hostname: " - << system_error (errno, generic_category ()); // Sanitize. - - hname = buf; - } - - // On POSIX ignore SIGPIPE which is signaled to a pipe-writing process if - // the pipe reading end is closed. Note that by default this signal - // terminates a process. Also note that there is no way to disable this - // behavior on a file descriptor basis or for the write() function call. - // - if (signal (SIGPIPE, SIG_IGN) == SIG_ERR) - fail << "unable to ignore broken pipe (SIGPIPE) signal: " - << system_error (errno, generic_category ()); // Sanitize. - - // Version. - // - if (ops.version ()) - { - cout << "bbot-agent " << BBOT_VERSION_ID << endl - << "libbbot " << LIBBBOT_VERSION_ID << endl - << "libbpkg " << LIBBBOT_VERSION_ID << endl - << "libbutl " << LIBBUTL_VERSION_ID << endl - << "Copyright (c) 2014-2017 Code Synthesis Ltd" << endl - << "TBC; All rights reserved" << endl; - - return 0; - } - - // Help. - // - if (ops.help ()) - { - pager p ("bbot-agent help", false); - print_bbot_agent_usage (p.stream ()); - - // If the pager failed, assume it has issued some diagnostics. - // - return p.wait () ? 0 : 1; - } - - tc_name = ops.toolchain_name (); - tc_num = ops.toolchain_num (); - tc_ver = (ops.toolchain_ver_specified () - ? ops.toolchain_ver () - : standard_version (BBOT_VERSION_STR)); - tc_id = ops.toolchain_id (); - - - // Controller URLs. - // - if (argc < 2 && - !ops.dump_machines () && - !ops.fake_request_specified ()) - { - fail << "controller url expected" << - info << "run " << argv[0] << " --help for details"; - } - - strings controllers; - - for (int i (1); i != argc; ++i) - controllers.push_back (argv[i]); - - // Handle SIGHUP and SIGTERM. - // - if (signal (SIGHUP, &handle_signal) == SIG_ERR || - signal (SIGTERM, &handle_signal) == SIG_ERR) - fail << "unable to set signal handler: " - << system_error (errno, generic_category ()); // Sanitize. - - optional<string> fingerprint; - - if (ops.auth_key_specified ()) - try - { - // Note that the process always prints to STDERR, so we redirect it to the - // null device. We also check for the key file existence to print more - // meaningful error message if that's not the case. - // - if (!file_exists (ops.auth_key ())) - throw_generic_error (ENOENT); - - openssl os (trace, - ops.auth_key (), path ("-"), fdnull (), - ops.openssl (), "rsa", - ops.openssl_option (), "-pubout", "-outform", "DER"); - - vector<char> k (os.in.read_binary ()); - os.in.close (); - - if (!os.wait ()) - throw_generic_error (EIO); - - fingerprint = sha256 (k.data (), k.size ()).string (); - } - catch (const system_error& e) - { - fail << "unable to obtain authentication public key: " << e; - } - - if (ops.systemd_daemon ()) - { - diag_record dr; - - dr << info << "bbot agent " << BBOT_VERSION_ID; - - if (fingerprint) - dr << info << "auth key fp " << *fingerprint; - - dr << - info << "toolchain name " << tc_name << - info << "toolchain num " << tc_num << - info << "toolchain ver " << tc_ver.string () << - info << "toolchain id " << tc_id << - info << "CPU(s) " << ops.cpu () << - info << "RAM(kB) " << ops.ram (); - - for (const string& u: controllers) - dr << info << "controller url " << u; - } - - // The work loop. The steps we go through are: - // - // 1. Enumerate the available machines, (re-)bootstrapping any if necessary. - // - // 2. Poll controller(s) for build tasks. - // - // 3. If no build tasks are available, go to #1 (after sleeping a bit). - // - // 4. If a build task is returned, do it, upload the result, and go to #1 - // (immediately). - // - for (bool sleep (false);; ::sleep (sleep ? 60 : 0), sleep = false) - { - // Enumerate the machines. - // - auto mp (enumerate_machines (ops.machines ())); - bootstrapped_machine_manifests& ms (mp.first); - dir_paths& ds (mp.second); - - // Prepare task request. - // - task_request_manifest tq { - hname, - tc_name, - tc_ver, - fingerprint, - machine_header_manifests {} - }; - - for (const bootstrapped_machine_manifest& m: ms) - tq.machines.emplace_back (m.machine.id, - m.machine.name, - m.machine.summary); - - if (ops.dump_machines ()) - { - for (const machine_header_manifest& m: tq.machines) - serialize_manifest (m, cout, "stdout", "machine"); - - return 0; - } - - if (tq.machines.empty ()) - { - warn << "no build machines for toolchain " << tc_name; - sleep = true; - continue; - } - - // Send task requests. - // - // - string url; - task_response_manifest tr; - - if (ops.fake_request_specified ()) - { - auto t (parse_manifest<task_manifest> (ops.fake_request (), "task")); - - tr = task_response_manifest { - "fake-session", // Dummy session. - nullopt, // No challenge. - url, // Empty result URL. - move (t)}; - - url = "http://example.org"; - } - else - { - for (const string& u: controllers) - { - try - { - http_curl c (trace, - path ("-"), - path ("-"), - curl::post, - u, - "--header", "Content-Type: text/manifest", - "--max-time", ops.request_timeout ()); - - // This is tricky/hairy: we may fail hard parsing the output before - // seeing that curl exited with an error and failing softly. - // - bool f (false); - - try - { - serialize_manifest (tq, c.out, u, "task request", false); - } - catch (const failed&) {f = true;} - - c.out.close (); - - if (!f) - try - { - tr = parse_manifest<task_response_manifest> ( - c.in, u, "task response", false); - } - catch (const failed&) {f = true;} - - c.in.close (); - - if (!c.wait () || f) - throw_generic_error (EIO); - } - catch (const system_error& e) - { - error << "unable to request task from " << u << ": " << e; - continue; - } - - if (tr.challenge && !fingerprint) // Controller misbehaves. - { - error << "unexpected challenge from " << u << ": " << *tr.challenge; - continue; - } - - if (!tr.session.empty ()) // Got a task. - { - url = u; - - task_manifest& t (*tr.task); - l2 ([&]{trace << "task for " << t.name << '/' << t.version << " " - << "on " << t.machine << " " - << "from " << url;}); - break; - } - } - } - - if (tr.session.empty ()) // No task from any of the controllers. - { - l2 ([&]{trace << "no tasks from any controllers, sleeping";}); - sleep = true; - continue; - } - - // We have a build task. - // - // First find the index of the machine we were asked to use (and also - // verify it is one of those we sent). - // - size_t i (0); - for (const machine_header_manifest& m: tq.machines) - { - if (m.name == tr.task->machine) - break; - - ++i; - } - - if (i == ms.size ()) - { - error << "task from " << url << " for unknown machine " - << tr.task->machine; - - if (ops.dump_task ()) - return 0; - - continue; - } - - task_manifest& t (*tr.task); - - if (ops.dump_task ()) - { - serialize_manifest (t, cout, "stdout", "task"); - return 0; - } - - // If we have our own repository certificate fingerprints, then use them - // to replace what we have received from the controller. - // - if (!ops.trust ().empty ()) - t.trust = ops.trust (); - - const dir_path& d (ds[i]); // The -<toolchain> directory. - const bootstrapped_machine_manifest& m (ms[i]); - - result_manifest r (perform_task (d, m, t)); - - if (ops.dump_result ()) - { - serialize_manifest (r, cout, "stdout", "result"); - return 0; - } - - // Prepare answer to the private key challenge. - // - optional<vector<char>> challenge; - - if (tr.challenge) - try - { - assert (ops.auth_key_specified ()); - - openssl os (trace, - fdstream_mode::text, path ("-"), 2, - ops.openssl (), "rsautl", - ops.openssl_option (), "-sign", "-inkey", ops.auth_key ()); - - os.out << *tr.challenge; - os.out.close (); - - challenge = os.in.read_binary (); - os.in.close (); - - if (!os.wait ()) - throw_generic_error (EIO); - } - catch (const system_error& e) - { - // The task response challenge is valid (verified by manifest parser), - // so there is something wrong with setup, and so the failure is fatal. - // - fail << "unable to sign task response challenge: " << e; - } - - // Upload the result. - // - result_request_manifest rq {tr.session, move (challenge), move (r)}; - { - const string& u (*tr.result_url); - - try - { - http_curl c (trace, - path ("-"), - nullfd, // Not expecting any data in response. - curl::post, - u, - "--header", "Content-Type: text/manifest", - "--max-time", ops.request_timeout ()); - - // This is tricky/hairy: we may fail hard writing the input before - // seeing that curl exited with an error and failing softly. - // - bool f (false); - - try - { - serialize_manifest (rq, c.out, u, "task request"); - } - catch (const failed&) {f = true;} - - c.out.close (); - - if (!c.wait () || f) - throw_generic_error (EIO); - } - catch (const system_error& e) - { - error << "unable to upload result to " << u << ": " << e; - continue; - } - } - - l2 ([&]{trace << "built " << t.name << '/' << t.version << " " - << "on " << t.machine << " " - << "for " << url;}); - } -} -catch (const failed&) -{ - return 1; // Diagnostics has already been issued. -} -catch (const cli::exception& e) -{ - error << e; - return 1; -} - -namespace bbot -{ - static unsigned int rand_seed; // Seed for rand_r(); - - size_t - genrand () - { - if (rand_seed == 0) - rand_seed = static_cast<unsigned int> ( - chrono::system_clock::now ().time_since_epoch ().count ()); - - return static_cast<size_t> (rand_r (&rand_seed)); - } - - // Note: Linux-specific implementation. - // - string - iface_addr (const string& i) - { - if (i.size () >= IFNAMSIZ) - throw invalid_argument ("interface nama too long"); - - auto_fd fd (socket (AF_INET, SOCK_DGRAM | SOCK_CLOEXEC, 0)); - - if (fd.get () == -1) - throw_system_error (errno); - - ifreq ifr; - ifr.ifr_addr.sa_family = AF_INET; - strcpy (ifr.ifr_name, i.c_str ()); - - if (ioctl (fd.get (), SIOCGIFADDR, &ifr) == -1) - throw_system_error (errno); - - char buf[3 * 4 + 3 + 1]; // IPv4 address. - if (inet_ntop (AF_INET, - &reinterpret_cast<sockaddr_in*> (&ifr.ifr_addr)->sin_addr, - buf, - sizeof (buf)) == nullptr) - throw_system_error (errno); - - return buf; - } -} |