aboutsummaryrefslogtreecommitdiff
path: root/bbot/agent/machine.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'bbot/agent/machine.cxx')
-rw-r--r--bbot/agent/machine.cxx474
1 files changed, 474 insertions, 0 deletions
diff --git a/bbot/agent/machine.cxx b/bbot/agent/machine.cxx
new file mode 100644
index 0000000..422c623
--- /dev/null
+++ b/bbot/agent/machine.cxx
@@ -0,0 +1,474 @@
+// file : bbot/agent/machine.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : TBC; see accompanying LICENSE file
+
+#include <bbot/agent/machine.hxx>
+
+#include <unistd.h> // sleep()
+
+#include <sys/un.h> // sockaddr_un
+#include <sys/socket.h>
+
+#include <cstdio> // snprintf()
+#include <cstring> // strcpy()
+
+#include <bbot/agent/agent.hxx>
+#include <bbot/agent/machine-manifest.hxx>
+
+using namespace std;
+using namespace butl;
+
+namespace bbot
+{
+  // Redirect TFTP requests (UDP port 69) that arrive from the machine's tap
+  // interface to the specified port on the bridge interface address.
+  //
+  // With this in place the machine can send its TFTP requests to any
+  // "unknown" IP (e.g., link-local 169.254.111.222), port 69, and they will
+  // end up at our TFTP server.
+  //
+  // The a argument is the iptables rule command (the callers pass -A to add
+  // and -D to delete the rules). If ignore_errors is true, then the iptables
+  // stdout/stderr are redirected to fdnull() and a non-zero exit code is not
+  // treated as a failure.
+  //
+  static void
+  iptables (tracer& t,
+            const char* a,
+            const string& tap,
+            const string& br,
+            uint16_t port,
+            bool ignore_errors = false)
+  {
+    string addr (iface_addr (br));
+
+    // Where to send the iptables output: fdnull() if we don't care about
+    // errors and our own stderr otherwise.
+    //
+    auto_fd null (ignore_errors ? fdnull () : nullfd);
+    int out (ignore_errors ? null.get () : 2);
+
+    // Common exit code check for both invocations below.
+    //
+    auto check = [ignore_errors] (process_exit::code_type c)
+    {
+      if (!ignore_errors && c != 0)
+        fail << "process iptables terminated with non-zero exit code";
+    };
+
+    // The DNAT rule that performs the actual redirection.
+    //
+    check (run_io_exit (t, 0, out, out,
+                        "sudo", "iptables",
+                        "-t", "nat",
+                        a, "PREROUTING",
+                        "-m", "udp",
+                        "-p", "udp",
+                        "-m", "physdev",
+                        "-i", br,
+                        "--physdev-in", tap,
+                        "--dport", 69,
+                        "-j", "DNAT",
+                        "--to-destination", addr + ':' + to_string (port)));
+
+    // The rule that accepts the redirected traffic. It is not clear whether
+    // this one is really necessary.
+    //
+    check (run_io_exit (t, 0, out, out,
+                        "sudo", "iptables",
+                        a, "FORWARD",
+                        "-m", "udp",
+                        "-p", "udp",
+                        "-m", "physdev",
+                        "-o", br,
+                        "--physdev-out", tap,
+                        "-d", addr,
+                        "--dport", port,
+                        "-m", "state",
+                        "--state", "NEW,ESTABLISHED,RELATED",
+                        "-j", "ACCEPT"));
+  }
+
+  // Create the tap interface (named "tap" followed by tc_num), bring it up,
+  // attach it to bridge br, and add the TFTP forwarding rules for it. Return
+  // the interface name.
+  //
+  static string
+  create_tap (const string& br, uint16_t port)
+  {
+    string name ("tap");
+    name += to_string (tc_num);
+
+    tracer trace ("create_tap", name.c_str ());
+
+    // There could be an interface (and rules) left over from a previous run
+    // so try to get rid of them first. The outcome of either step is not
+    // checked.
+    //
+    iptables (trace, "-D", name, br, port, true /* ignore_errors */);
+    run_exit (trace, "sudo", "ip", "tuntap", "delete", name, "mode", "tap");
+
+    // Now create the interface afresh.
+    //
+    run (trace,
+         "sudo", "ip", "tuntap", "add", name, "mode", "tap", "user", uid);
+    run (trace, "sudo", "ip", "link", "set", name, "up");
+    run (trace, "sudo", "ip", "link", "set", name, "master", br);
+
+    iptables (trace, "-A", name, br, port); // Add the forwarding rules.
+
+    return name;
+  }
+
+  // Undo create_tap(): delete the TFTP forwarding rules and then the tap
+  // interface t itself. Unlike the cleanup in create_tap(), errors are not
+  // ignored here.
+  //
+  static void
+  destroy_tap (const string& t, const string& br, uint16_t port)
+  {
+    tracer trace ("destroy_tap", t.c_str ());
+
+    iptables (trace, "-D", t, br, port); // Delete the forwarding rules.
+
+    run (trace, "sudo", "ip", "tuntap", "delete", t, "mode", "tap");
+  }
+
+  // Tap network interface that is created on construction and destroyed
+  // either explicitly with destroy() or, failing that, by the destructor.
+  //
+  class tap
+  {
+  public:
+    string iface;    // Tap interface name (empty once destroyed).
+
+    string   bridge; // Bridge interface to which this tap belongs.
+    uint16_t port;   // UDP port to forward TFTP traffic to.
+
+    // Note that iface is declared (and therefore initialized) before bridge
+    // so b is still intact when it is passed to create_tap().
+    //
+    tap (string b, uint16_t p)
+        : iface (create_tap (b, p)), bridge (move (b)), port (p) {}
+
+    // A copy would refer to the same system interface and both objects
+    // would then try to destroy it, so disallow copying.
+    //
+    tap (const tap&) = delete;
+    tap& operator= (const tap&) = delete;
+
+    // Destroy the interface unless this has already been done. Any failure
+    // is suppressed here since it must not escape the destructor; call
+    // destroy() explicitly to have errors reported.
+    //
+    ~tap ()
+    {
+      if (!iface.empty ())
+      {
+        try {destroy ();} catch (...) {}
+      }
+    }
+
+    // Destroy the interface and mark this object as no longer owning it. If
+    // destroy_tap() throws, iface is left intact so the destructor will
+    // retry.
+    //
+    void
+    destroy ()
+    {
+      destroy_tap (iface, bridge, port);
+      iface.clear ();
+    }
+  };
+
+  // Generate a random MAC address in the xx:xx:xx:xx:xx:xx (lower-case hex)
+  // form.
+  //
+  static string
+  generate_mac ()
+  {
+    // The two least significant bits of the first byte are special: bit 0
+    // indicates a multicast address (which we don't want) while bit 1 --
+    // local assignment (which we do want). So clear the former and set the
+    // latter.
+    //
+    const int first ((genrand<uint8_t> () & 0xFE) | 0x02);
+
+    char buf[sizeof ("xx:xx:xx:xx:xx:xx")]; // Includes the terminating NUL.
+
+    snprintf (buf, sizeof (buf),
+              "%02x:%02x:%02x:%02x:%02x:%02x",
+              first,
+              genrand<uint8_t> (),
+              genrand<uint8_t> (),
+              genrand<uint8_t> (),
+              genrand<uint8_t> (),
+              genrand<uint8_t> ());
+
+    return string (buf);
+  }
+
+  // Machine implementation that runs the VM with the kvm hypervisor binary
+  // and controls it via the QEMU monitor UNIX socket.
+  //
+  class kvm_machine: public machine
+  {
+  public:
+    // Create the tap interface and start the VM from the machine directory.
+    //
+    kvm_machine (const dir_path&,
+                 const machine_manifest&,
+                 const optional<string>& mac,
+                 const string& br_iface,
+                 uint16_t tftp_port);
+
+    // Send system_powerdown and wait for up to the specified number of
+    // seconds for the machine to go down.
+    //
+    bool
+    shutdown (size_t& seconds) override;
+
+    // Send system_reset and wait for the machine to go down.
+    //
+    void
+    forcedown (bool fail_hard) override;
+
+    // Send the monitor stop command.
+    //
+    void
+    suspend () override;
+
+    // Wait for up to the specified number of seconds for the hypervisor
+    // process to terminate.
+    //
+    bool
+    wait (size_t& seconds, bool fail_hard) override;
+
+    using machine::wait; // Keep the base overloads visible.
+
+    // Add the QEMU process id, VNC address, and monitor socket path.
+    //
+    void
+    print_info (diag_record&) override;
+
+  private:
+    // Send a command to the QEMU monitor.
+    //
+    void
+    monitor_command (const string&, bool fail_hard = true);
+
+    // Note: the constructor relies on the members being initialized in this
+    // order.
+    //
+    path    kvm;     // Hypervisor binary.
+    tap     net;     // Tap network interface.
+    string  vnc;     // QEMU VNC TCP addr:port.
+    path    monitor; // QEMU monitor UNIX socket.
+    process proc;    // Hypervisor process.
+  };
+
+  // Create the tap network interface and start the VM, running the
+  // hypervisor from the machine directory md.
+  //
+  // The MAC address is selected in the following order of preference: the
+  // one fixed in the machine manifest, the one passed in omac, and, finally,
+  // a newly generated one. The tap interface is attached to bridge br with
+  // the TFTP traffic forwarded to port.
+  //
+  // Throw invalid_argument if the monitor socket path does not fit into
+  // sockaddr_un::sun_path.
+  //
+  kvm_machine::
+  kvm_machine (const dir_path& md,
+               const machine_manifest& mm,
+               const optional<string>& omac,
+               const string& br,
+               uint16_t port)
+      : machine (mm.mac ? *mm.mac : // Fixed mac from machine manifest.
+                 omac   ? *omac   : // Generated mac from previous bootstrap.
+                 generate_mac ()),
+        kvm ("kvm"),
+        net (br, port),
+        vnc ("127.0.0.1:" + to_string (5900 + tc_num)),
+        monitor ("/tmp/" + tc_name + "-monitor")
+  {
+    tracer trace ("kvm_machine", md.string ().c_str ());
+
+    // The path is later copied into sun_path with strcpy() (see
+    // monitor_command()) so make sure it fits, including the terminating
+    // NUL.
+    //
+    if (sizeof (sockaddr_un::sun_path) <= monitor.size ())
+      throw invalid_argument ("monitor unix socket path too long");
+
+    // Map logical CPUs to sockets/cores/threads. Failed that, QEMU just makes
+    // it a machine with that number of sockets and some operating systems
+    // (like Windows) only can do two.
+    //
+    // Note that the result must multiply back to exactly the requested
+    // number of CPUs. So we halve the number of sockets until it divides cpu
+    // evenly and only use two threads per core if the per-socket count is
+    // even. Without these checks, for example, cpu=7 would yield 1*3*2=6 and
+    // cpu=10 -- 2*2*2=8.
+    //
+    size_t cpu (ops.cpu ());
+
+    size_t sockets (cpu <= 8 ? 1 : cpu <= 64 ? 2 : 4);
+
+    while (cpu % sockets != 0) // Terminates at 1 at the latest.
+      sockets /= 2;
+
+    size_t cores (cpu / sockets);
+    size_t threads (cores > 4 && cores % 2 == 0 ? 2 : 1);
+    cores /= threads;
+
+    // We probably don't want to commit all the available RAM to the VM since
+    // some of it could be used on the host side for caching, etc. So the
+    // heuristics that we will use is 4G or 1G per CPU, whichever is greater
+    // and the rest divide equally between the host and the VM.
+    //
+    size_t ram ((cpu < 4 ? 4 : cpu) * 1024 * 1024); // Kb.
+
+    if (ram > ops.ram ())
+      ram = ops.ram ();
+    else
+      ram += (ops.ram () - ram) / 2;
+
+    // If we have options, use that instead of the default network and
+    // disk configuration.
+    //
+    strings os;
+
+    if (mm.options)
+    {
+      os = mm.unquoted_options ();
+
+      // Pre-process ifname=? and mac=?.
+      //
+      // If option o contains s (which ends with '?'), then replace that '?'
+      // with r. Only the first occurrence is handled.
+      //
+      auto sub = [] (string& o, const char* s, const string& r)
+      {
+        size_t p (o.find (s));
+
+        if (p != string::npos)
+        {
+          p = o.find ('?', p + 1);
+          assert (p != string::npos);
+          o.replace (p, 1, r);
+        }
+      };
+
+      for (string& o: os)
+      {
+        sub (o, "ifname=?", net.iface);
+        sub (o, "mac=?", mac);
+      }
+    }
+    else
+    {
+      // Append an option and its value as two separate arguments.
+      //
+      auto add = [&os] (string o, string v)
+      {
+        os.push_back (move (o));
+        os.push_back (move (v));
+      };
+
+      // Network.
+      //
+      add ("-netdev", "tap,id=net0,script=no,ifname=" + net.iface);
+      add ("-device", "virtio-net-pci,netdev=net0,mac=" + mac);
+
+      // Disk.
+      //
+      add ("-drive", "if=none,id=disk0,file=disk.img,format=raw");
+      add ("-device", "virtio-blk-pci,scsi=off,drive=disk0");
+
+      //"-drive", "if=none,id=disk0,format=raw,file=disk.img"
+      //"-device", "virtio-scsi-pci,id=scsi"
+      //"-device", "scsi-hd,drive=disk0"
+    }
+
+    // Start the VM.
+    //
+    // Notes:
+    //
+    // 1. echo system_powerdown | socat - UNIX-CONNECT:.../monitor
+    //
+    proc = run_io_start (
+      trace,
+      fdnull (),
+      2,
+      2,
+      md, // Run from the machine's directory.
+      kvm,
+      "-boot", "c", // Boot from disk.
+      "-no-reboot", // Exit on VM reboot.
+      "-m", to_string (ram / 1024) + "M",
+      "-cpu", "host",
+      "-smp", (to_string (cpu) +
+               ",sockets=" + to_string (sockets) +
+               ",cores=" + to_string (cores) +
+               ",threads=" + to_string (threads)),
+      os,
+      "-vnc", "127.0.0.1:" + to_string (tc_num), // 5900 + tc_num
+      "-monitor", "unix:" + monitor.string () + ",server,nowait");
+  }
+
+  // Connect to the QEMU monitor via the UNIX socket and send system_reset.
+  // You may be wondering why not system_powerdown? The reason is that while
+  // not every OS knows how to power down the machine, pretty much all of
+  // them can reboot. So, combined with the -no-reboot option above, we get
+  // the same result in a more robust way.
+  //
+  // Note that this setup has one side effect: if the VM decides to reboot,
+  // say, during bootstrap, then we will interpret it as a shutdown. The
+  // current thinking is that this is a good thing since we don't want our
+  // VMs to reboot uncontrollably for security and predictability reasons
+  // (e.g., we don't want Windows to decide to install updates -- this stuff
+  // should all be disabled during the VM preparation).
+  //
+  // Actually, this turned out not to be entirely accurate: reset appears to
+  // be a "hard reset" while powerdown causes a clean shutdown. So we use
+  // powerdown to implement shutdown() and reset/-no-reboot to implement
+  // forcedown().
+  //
+  bool kvm_machine::
+  shutdown (size_t& seconds)
+  {
+    monitor_command ("system_powerdown", true /* fail_hard */);
+
+    // Give the machine up to the specified number of seconds to shut down.
+    //
+    return wait (seconds);
+  }
+
+  // Force the machine down by sending system_reset to the QEMU monitor and
+  // then waiting for the machine. The fh (fail hard) flag is passed through
+  // to both steps.
+  //
+  void kvm_machine::
+  forcedown (bool fh)
+  {
+    monitor_command ("system_reset", fh);
+
+    wait (fh);
+  }
+
+  // Suspend the machine by sending the stop command to the QEMU monitor.
+  //
+  void kvm_machine::
+  suspend ()
+  {
+    monitor_command ("stop", true /* fail_hard */);
+  }
+
+  // Add to the diagnostics record the information that may help examine the
+  // machine: the QEMU process id, the VNC address, and the monitor socket.
+  //
+  void kvm_machine::
+  print_info (diag_record& dr)
+  {
+    dr << info << "qemu pid: "           << proc.id ()
+       << info << "qemu vnc: "           << vnc
+       << info << "qemu monitor: unix:"  << monitor;
+  }
+
+  // Wait for the hypervisor process to terminate, polling once a second for
+  // up to sec seconds. The sec value is decremented for every second waited
+  // so on return it contains the time remaining.
+  //
+  // Return true if the process has terminated, in which case also finish
+  // the process, destroy the tap interface, and remove the monitor socket.
+  // Return false if it is still running after the time is up.
+  //
+  bool kvm_machine::
+  wait (size_t& sec, bool fh)
+  {
+    try
+    {
+      tracer trace ("kvm_machine::wait");
+
+      // Note that we check one more time after the last second has expired.
+      //
+      bool done (proc.try_wait ());
+
+      while (!done && sec != 0)
+      {
+        sleep (1);
+        --sec;
+        done = proc.try_wait ();
+      }
+
+      if (!done)
+        return false;
+
+      run_io_finish (trace, proc, kvm, fh);
+      net.destroy ();             //@@ Always fails hard.
+      try_rmfile (monitor, true); // QEMU doesn't seem to remove it.
+
+      return true;
+    }
+    catch (const process_error& e)
+    {
+      fail (fh) << "unable to execute " << kvm << ": " << e << endf;
+    }
+  }
+
+  // Send command c to the QEMU monitor over its UNIX socket.
+  //
+  // The exchange is: connect, read the QEMU welcome, write the command
+  // followed by a newline, and read the reply. The text read is discarded.
+  // A system_error during any of these steps is reported with fail(fh).
+  //
+  void kvm_machine::
+  monitor_command (const string& c, bool fh)
+  {
+    try
+    {
+      sockaddr_un addr;
+      addr.sun_family = AF_LOCAL;
+      strcpy (addr.sun_path, monitor.string ().c_str ()); // Size check in ctor
+
+      auto_fd sock (socket (AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0));
+
+      if (sock.get () == -1)
+        throw_system_error (errno);
+
+      if (connect (sock.get (),
+                   reinterpret_cast<sockaddr*> (&addr),
+                   sizeof (addr)) == -1)
+        throw_system_error (errno);
+
+      // Read until we get something: keep polling the non-blocking stream
+      // until some data has arrived and then until no more is immediately
+      // available.
+      //
+      // Also stop if we hit eof: once readsome() has set eofbit it keeps
+      // returning 0, so if the peer closes the connection before sending
+      // anything we would otherwise spin here forever.
+      //
+      // The socket is temporarily moved into the stream and handed back at
+      // the end.
+      //
+      auto readsome = [&sock] ()
+      {
+        ifdstream ifs (move (sock),
+                       fdstream_mode::non_blocking,
+                       ostream::badbit);
+
+        char buf[256];
+        for (streamsize n (0), m (0);
+             (n == 0 || m != 0) && !ifs.eof ();
+             m = ifs.readsome (buf, sizeof (buf) - 1))
+        {
+          if (m != 0)
+          {
+            n += m;
+
+            //buf[m] = '\0';
+            //text << buf;
+          }
+        }
+
+        sock = ifs.release ();
+      };
+
+      // Read QEMU welcome.
+      //
+      readsome ();
+
+      // Write our command.
+      //
+      {
+        ofdstream ofs (move (sock), fdstream_mode::blocking);
+        ofs << c << endl;
+        sock = ofs.release ();
+      }
+
+      // Read QEMU reply (may hit eof).
+      //
+      readsome ();
+    }
+    catch (const system_error& e)
+    {
+      fail (fh) << "unable to communicate with qemu monitor: " << e;
+    }
+  }
+
+  // Start a machine of the type specified in the machine manifest. Currently
+  // only kvm is implemented (nspawn trips an assertion and, if assertions
+  // are disabled, results in NULL being returned).
+  //
+  unique_ptr<machine>
+  start_machine (const dir_path& md,
+                 const machine_manifest& mm,
+                 const optional<string>& mac,
+                 const string& br_iface,
+                 uint16_t tftp_port)
+  {
+    unique_ptr<machine> r;
+
+    switch (mm.type)
+    {
+    case machine_type::kvm:
+      {
+        r = make_unique<kvm_machine> (md, mm, mac, br_iface, tftp_port);
+        break;
+      }
+    case machine_type::nspawn:
+      {
+        assert (false); //@@ TODO
+        break;
+      }
+    }
+
+    return r;
+  }
+}