// file : bbot/agent/machine.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file #include #include // sleep(), usleep() #include // sockaddr_un #include #include // snprintf() #include // strcpy() #include #include using namespace std; using namespace butl; namespace bbot { // Forward TFTP requests (UDP/69) coming from the machine to the specified // port. // // This allows the machine to connect to any "unknown" IP (e.g., link-local // 196.254.111.222) port 69 and end up being redirected to out TFTP server. // static void iptables (tracer& t, const char* a, const string& tap, const string& br, uint16_t port, bool ignore_errors = false) { string addr (iface_addr (br)); auto_fd fdn (ignore_errors ? fdopen_null () : nullfd); int ofd (ignore_errors ? fdn.get () : 2); process_exit::code_type e; // It seems the order of options is significant when it comes to deleting // the entries (this order is as printed by iptables -S). // e = run_io_exit (t, 0, ofd, ofd, "sudo", "iptables", "-w", // Wait for xtables lock. "-t", "nat", a, "PREROUTING", "-i", br, "-p", "udp", "-m", "udp", "--dport", 69, "-m", "physdev", "--physdev-in", tap, "-j", "DNAT", "--to-destination", addr + ':' + to_string (port)); if (e != 0 && !ignore_errors) fail << "process iptables exited with non-zero code"; // Nobody really knows whether this is really needed (really)... // e = run_io_exit (t, 0, ofd, ofd, "sudo", "iptables", "-w", a, "FORWARD", "-d", addr, "-o", br, "-p", "udp", "-m", "udp", "--dport", port, "-m", "physdev", "--physdev-out", tap, "-m", "state", "--state", "NEW,ESTABLISHED,RELATED", "-j", "ACCEPT"); if (e != 0 && !ignore_errors) fail << "process iptables exited with non-zero code"; } static string create_tap (const string& br, uint16_t machine_num, uint16_t port) { string t ("tap" + to_string (offset + machine_num)); tracer trace ("create_tap", t.c_str ()); // First try to delete it in case there is one from a previous run. // iptables (trace, "-D", t, br, port, true); // Ignore errors. run_exit (trace, "sudo", "ip", "tuntap", "delete", t, "mode", "tap"); run (trace, "sudo", "ip", "tuntap", "add", t, "mode", "tap", "user", uid); // Increase the transmission queue from default 1000 to prevent dropping // packets under high load. See also rx/tx_queue_size in the QEMU // networking setup (it's fuzzy how this is all related to each other). // run (trace, "sudo", "ip", "link", "set", t, "txqueuelen", "4000"); run (trace, "sudo", "ip", "link", "set", t, "up"); run (trace, "sudo", "ip", "link", "set", t, "master", br); iptables (trace, "-A", t, br, port); // Add. return t; } static void destroy_tap (const string& t, const string& br, uint16_t port) { tracer trace ("destroy_tap", t.c_str ()); iptables (trace, "-D", t, br, port); // Delete. run (trace, "sudo", "ip", "tuntap", "delete", t, "mode", "tap"); } class tap { public: string iface; string bridge; // Bridge interface to which this tap belongs uint16_t port; // UDP port to forward TFTP traffic to. tap (string b, uint16_t machine_num, uint16_t p) : iface (create_tap (b, machine_num, p)), bridge (move (b)), port (p) {} ~tap () { if (!iface.empty ()) { try {destroy ();} catch (...) {} } } void destroy () { string i (move (iface)); // No need trying again if below fails. destroy_tap (i, bridge, port); } }; static string generate_mac () { // The last two bits of the first byte are special: bit 1 indicates a // multicast address (which we don't want) while bit 2 -- local assignment // (which we do want). // char r[6 * 2 + 5 + 1]; snprintf (r, sizeof (r), "%02x:%02x:%02x:%02x:%02x:%02x", (genrand () & 0xFE) | 0x02, genrand (), genrand (), genrand (), genrand (), genrand ()); return r; } class kvm_machine: public machine { public: kvm_machine (const dir_path&, const machine_manifest&, uint16_t machine_num, size_t cpus, size_t ram, const optional& mac, const string& br_iface, uint16_t tftp_port, bool pub_vnc); virtual bool shutdown (size_t& seconds) override; virtual void forcedown (bool fail_hard) override; virtual void suspend (bool fail_hard) override; bool wait (size_t& seconds, bool fail_hard) override; using machine::wait; virtual void cleanup () override; virtual void print_info (diag_record&) override; private: // Throw system_error in case of communication errors. // void monitor_command (const string&); private: path kvm; // Hypervisor binary. tap net; // Tap network interface. string vnc; // QEMU VNC TCP addr:port. path monitor; // QEMU monitor UNIX socket. path log; // QEMU log (QMP read end). auto_fd qmp; // QMP write end. process proc; }; kvm_machine:: kvm_machine (const dir_path& md, const machine_manifest& mm, uint16_t m_num, size_t cpus, size_t ram, const optional& omac, const string& br, uint16_t tftp_port, bool pub_vnc) : machine (mm.mac ? *mm.mac : // Fixed mac from machine manifest. omac ? *omac : // Generated mac from previous bootstrap. generate_mac ()), kvm ("kvm"), net (br, m_num, tftp_port), vnc (machine_vnc (m_num, pub_vnc)), monitor ("/tmp/monitor-" + tc_name + '-' + to_string (inst)) { tracer trace ("kvm_machine", md.string ().c_str ()); // Monitor path. // if (m_num != 0) { monitor += '-'; monitor += to_string (m_num); } if (sizeof (sockaddr_un::sun_path) <= monitor.size ()) throw invalid_argument ("monitor unix socket path too long"); // Machine name. // // While we currently can only have one running machine per toolchain, add // the instance number and non-0 machine number for debuggability. // string name (mm.name + '-' + tc_name + '-' + to_string (inst)); if (m_num != 0) { name += '-'; name += to_string (m_num); } // Machine log. Note that it is only removed with an explicit cleanup() // call. // log = path ("/tmp/" + path::traits_type::temp_name (name) + ".log"); // Map logical CPUs to sockets/cores/threads keeping the number of sockets // and cores even. Failed that, QEMU just makes it a machine with that // number of sockets and some operating systems (like Windows) can only do // two. // // Note that for best results you may want to adjust (e.g., by over- // committing) the number of CPUs to be power of 2. // size_t cores (cpus); size_t sockets (cores >= 256 && cores % 8 == 0 ? 4 : cores >= 128 && cores % 4 == 0 ? 2 : 1); cores /= sockets; size_t threads (cores >= 16 && cores % 4 == 0 ? 2 : 1); cores /= threads; // If we have options, use that instead of the default network and // disk configuration. // strings os; if (mm.options) { os = mm.unquoted_options (); // Pre-process ifname=? and mac=?. // auto sub = [] (string& o, const char* s, const string& r) { size_t p (o.find (s)); if (p != string::npos) { p = o.find ('?', p + 1); assert (p != string::npos); o.replace (p, 1, r); } }; for (string& o: os) { sub (o, "ifname=?", net.iface); sub (o, "mac=?", mac); } } else { // @@ TMP: libstud-optional issue #1. // #if 0 auto add = [&os] (string o, optional v = {}) { os.push_back (move (o)); if (v) os.push_back (move (*v)); }; #else auto add = [&os] (string o, string v = {}) { os.push_back (move (o)); if (!v.empty ()) os.push_back (move (v)); }; #endif // Network. // // The rx/tx queue size is between 256 (default) and 1024 and must be a // power of 2. Also, maximum (1024) requires some extra support from the // guest driver failed that it falls back to 256. // add ("-netdev", "tap,id=net0,script=no,ifname=" + net.iface); add ("-device", ("virtio-net-pci,netdev=net0,mac=" + mac + ",tx_queue_size=1024" + ",rx_queue_size=1024")); // Disk. // add ("-drive", "if=none,id=disk0,file=disk.img,format=raw"); add ("-device", "virtio-blk-pci,scsi=off,drive=disk0"); //"-drive", "if=none,id=disk0,format=raw,file=disk.img" //"-device", "virtio-scsi-pci,id=scsi" //"-device", "scsi-hd,drive=disk0" // USB settings. // // These options should make graphical VMs usable from VNC. // // Note that the "standard" USB bus may not be available on // architectures other than x86 (e.g., aarch64). // add ("-usb"); add ("-device", "usb-kbd"); add ("-device", "usb-tablet"); } // Setup QMP (QEMU Machine Protocol) monitor to act as a log. // // Note that we still have to tell it our "capabilities" so while it will // write to a log file, we need a pipe it will read from. // fdpipe qmp_in; try { qmp_in = fdopen_pipe (); } catch (const io_error& e) { fail << "unable to create QMP input pipe: " << e; } auto_fd qmp_out; try { qmp_out = fdopen (log, (fdopen_mode::out | fdopen_mode::create | fdopen_mode::exclusive)); } catch (const io_error& e) { fail << "unable to create QMP output file: " << e; } // Start the VM. // const char* env[] = {"QEMU_AUDIO_DRV=none", // Disable audio output. nullptr}; proc = run_io_start ( trace, qmp_in, 2, // 1>&2 (QMP goes to stdout) qmp_out, process_env (kvm, md, env), // Run from the machine's directory. "-enable-kvm", "-name", name + ",debug-threads=on", "-S", // Start suspended. "-boot", "c", // Boot from disk. "-no-reboot", // Exit on VM reboot. "-cpu", "host", // RTC settings. // "-rtc", "clock=vm,driftfix=slew", #ifdef __x86_64__ "-no-hpet", "-global", "kvm-pit.lost_tick_policy=discard", #endif // These can override the above but not below. // os, // RAM and CPU configuration. // "-m", to_string (ram / 1024) + 'M', "-smp", (to_string (cpus) + ",sockets=" + to_string (sockets) + ",cores=" + to_string (cores) + ",threads=" + to_string (threads)), // VNC. // // We listen on all IPs for a public VNC session and only on localhost // for private. // // QEMU's -vnc option expects the port offset from 5900 rather than the // absolute value. The low 5901+, 6001+, and 6101+ ports all look good // collision-wise with anything useful. // "-vnc", (pub_vnc ? ":" : "127.0.0.1:") + to_string (offset + m_num), // 5900-base // QMP. // "-chardev", "stdio,id=qmp", "-mon", "chardev=qmp,mode=control,pretty=on", // Monitor. // "-chardev", "socket,id=mon,path=" + monitor.string () + ",server=on,wait=off", "-mon", "chardev=mon,mode=readline"); qmp_out.close (); qmp_in.in.close (); qmp = move (qmp_in.out); // Wait for the QMP greeting. One day we will find a better way. // sleep (1); try { ofdstream os (move (qmp)); os << "{ \"execute\": \"qmp_capabilities\" }" << endl; qmp = os.release (); } catch (const io_error& e) { fail << "unable to initialize QMP: " << e << info << "see " << log; } // Start execution. // try { monitor_command ("cont"); } catch (const system_error& e) { fail << "unable to communicate with qemu monitor: " << e << info << "see " << log; } } void kvm_machine:: cleanup () { try_rmfile (log, true /* ignore_errors */); } // Connect to the QEMU monitor via the UNIX socket and send system_reset. // You may be wondering why not system_powerdown? The reason is that while // not all OS know how to power-down the machine, pretty much all of them // can reboot. So combined with the -no-reboot option above, we get the // same result in a more robust way. // // Note that this setup has one side effect: if the VM decided to reboot, // say, during bootstrap, then we will interpret it as a shutdown. Current // thinking saying this is good since we don't want our VMs to reboot // uncontrollably for security and predictability reasons (e.g., we don't // want Windows to decide to install updates -- this stuff should all be // disabled during the VM preparation). // // Actually, this turned out not to be entirely accurate: reset appears to // be a "hard reset" while powerdown causes a clean shutdown. So we use // powerdown to implement shutdown() and reset/-no-reboot for implement // forcedown(). // bool kvm_machine:: shutdown (size_t& seconds) { // Wait for up to the specified number if seconds for the machine to // shutdown. And handle the case where it was shutdown from within. // try { monitor_command ("system_powerdown"); } catch (const system_error& e) { // There is a window between QEMU closing the monitor socket and exiting // so we wait but only briefly. // size_t t (seconds > 0 ? 1 : 0); seconds -= t; bool r (wait (t)); seconds += t; if (r) return true; fail << "unable to communicate with qemu monitor: " << e << info << "see " << log; } return wait (seconds); } void kvm_machine:: forcedown (bool fh) { // Similar logic to shutdown(). // try { monitor_command ("system_reset"); } catch (const system_error& e) { size_t t (1); if (wait (t, fh)) return; fail (fh) << "unable to communicate with qemu monitor: " << e << info << "see " << log; } wait (fh); } void kvm_machine:: suspend (bool fh) { try { monitor_command ("stop"); } catch (const system_error& e) { fail (fh) << "unable to communicate with qemu monitor: " << e << info << "see " << log; } } void kvm_machine:: print_info (diag_record& dr) { dr << info << "qemu pid: " << proc.id () << info << "qemu log: " << log << info << "qemu vnc: " << vnc << info << "qemu monitor: unix:" << monitor; } bool kvm_machine:: wait (size_t& sec, bool fh) { try { tracer trace ("kvm_machine::wait"); bool t; for (; !(t = proc.try_wait ().has_value ()) && sec != 0; --sec) sleep (1); if (t) { run_io_finish (trace, proc, kvm, fh); net.destroy (); //@@ Always fails hard. try_rmfile (monitor, true /* ignore_errors */); // QEMU doesn't do it. } return t; } catch (const process_error& e) { fail (fh) << "unable to wait for " << kvm << ": " << e << info << "see " << log << endf; } } void kvm_machine:: monitor_command (const string& c) { tracer trace ("kvm_machine::monitor_command", monitor.string ().c_str ()); sockaddr_un addr; addr.sun_family = AF_LOCAL; strcpy (addr.sun_path, monitor.string ().c_str ()); // Size check in ctor auto_fd sock (socket (AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0)); if (sock.get () == -1) throw_system_error (errno); if (connect (sock.get (), reinterpret_cast (&addr), sizeof (addr)) == -1) throw_system_error (errno); // Read until we get something. // auto readsome = [&trace, &sock] () { ifdstream ifs (move (sock), fdstream_mode::non_blocking, ostream::badbit); char buf[256]; for (streamsize n (0), m (0); n == 0 || m != 0; m = ifs.readsome (buf, sizeof (buf) - 1)) { if (m != 0) { n += m; buf[m] = '\0'; l5 ([&]{trace << buf;}); } usleep (100000); // 0.1s } sock = ifs.release (); }; // Read QEMU welcome. // readsome (); // Write our command. // { ofdstream ofs (move (sock), fdstream_mode::blocking); ofs << c << endl; sock = ofs.release (); } // Read QEMU reply (may hit eof). // readsome (); } unique_ptr start_machine (const dir_path& md, const machine_manifest& mm, uint16_t machine_num, size_t cpus, size_t ram, const optional& mac, const string& br_iface, uint16_t tftp_port, bool pub_vnc) { assert (machine_num < 10); switch (mm.type) { case machine_type::kvm: return make_unique ( md, mm, machine_num, cpus, ram, mac, br_iface, tftp_port, pub_vnc); case machine_type::nspawn: assert (false); // @@ TODO } return nullptr; } string machine_vnc (uint16_t num, bool pub) { assert (num < 10); string r (pub ? hip : "127.0.0.1"); r += ':'; r += to_string (5900 + offset + num); return r; } }