diff options
author | Boris Kolpackov <boris@codesynthesis.com> | 2017-05-01 11:53:04 +0200 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2017-05-01 11:53:04 +0200 |
commit | 8b896055d4d90b538211784cf9ad1bf335b20dc6 (patch) | |
tree | 755ca33df640a4a29e1d00c7e2e0265a4243bb94 | |
parent | c06600bc5e24389453d9a486c69d9746ae4fef23 (diff) |
Soft-fail on VM forcedown failures
Since we already have the result, this should be harmless. It should also help
with spurious KVM crashes with Mac OS guests.
-rw-r--r-- | bbot/agent.cxx | 5 | ||||
-rw-r--r-- | bbot/machine.cxx | 145 | ||||
-rw-r--r-- | bbot/machine.hxx | 10 | ||||
-rw-r--r-- | bbot/utility.hxx | 4 | ||||
-rw-r--r-- | bbot/utility.txx | 37 |
5 files changed, 113 insertions, 88 deletions
diff --git a/bbot/agent.cxx b/bbot/agent.cxx index 6b59092..096bafe 100644 --- a/bbot/agent.cxx +++ b/bbot/agent.cxx @@ -659,9 +659,10 @@ try soft_fail ("build terminated abnormally", false); // Force the machine down (there is no need wasting time on clean - // shutdown since the next step is to drop the snapshot). + // shutdown since the next step is to drop the snapshot). Also fail + // softly if things go badly. // - m->forcedown (); + try {m->forcedown (false);} catch (const failed&) {} } run_btrfs (trace, "subvolume", "delete", xp); diff --git a/bbot/machine.cxx b/bbot/machine.cxx index bf0c0bf..2f1e56b 100644 --- a/bbot/machine.cxx +++ b/bbot/machine.cxx @@ -165,13 +165,13 @@ namespace bbot shutdown (size_t& seconds) override; virtual void - forcedown () override; + forcedown (bool fail_hard) override; virtual void suspend () override; bool - wait (size_t& seconds) override; + wait (size_t& seconds, bool fail_hard) override; using machine::wait; @@ -180,7 +180,7 @@ namespace bbot private: void - monitor_command (const string&); + monitor_command (const string&, bool fail_hard = true); private: path kvm; // Hypervisor binary. @@ -341,10 +341,10 @@ namespace bbot } void kvm_machine:: - forcedown () + forcedown (bool fh) { - monitor_command ("system_reset"); - wait (); + monitor_command ("system_reset", fh); + wait (fh); } void kvm_machine:: @@ -362,91 +362,102 @@ namespace bbot } bool kvm_machine:: - wait (size_t& sec) - try + wait (size_t& sec, bool fh) { - tracer trace ("kvm_machine::wait"); + try + { + tracer trace ("kvm_machine::wait"); + + bool t; + for (; !(t = proc.try_wait ()) && sec != 0; --sec) + sleep (1); - bool t; - for (; !(t = proc.try_wait ()) && sec != 0; --sec) - sleep (1); + if (t) + { + run_io_finish (trace, proc, kvm, fh); + net.destroy (); //@@ Always fails hard. + try_rmfile (monitor, true); // QEMU doesn't seem to remove it. 
+ } - if (t) + return t; + } + catch (const process_error& e) { - run_io_finish (trace, proc, kvm); - net.destroy (); - try_rmfile (monitor, true); // QEMU doesn't seem to remove it. + diag_record dr; if (fh) dr << fail; else dr << error; + dr << "unable to execute " << kvm << ": " << e; } - return t; - } - catch (const process_error& e) - { - fail << "unable to execute " << kvm << ": " << e << endf; + throw failed (); } void kvm_machine:: - monitor_command (const string& c) - try + monitor_command (const string& c, bool fh) { - sockaddr_un addr; - addr.sun_family = AF_LOCAL; - strcpy (addr.sun_path, monitor.string ().c_str ()); // Size check in ctor + try + { + sockaddr_un addr; + addr.sun_family = AF_LOCAL; + strcpy (addr.sun_path, monitor.string ().c_str ()); // Size check in ctor - auto_fd sock (socket (AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0)); + auto_fd sock (socket (AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0)); - if (sock.get () == -1) - throw_system_error (errno); + if (sock.get () == -1) + throw_system_error (errno); - if (connect (sock.get (), - reinterpret_cast<sockaddr*> (&addr), - sizeof (addr)) == -1) - throw_system_error (errno); + if (connect (sock.get (), + reinterpret_cast<sockaddr*> (&addr), + sizeof (addr)) == -1) + throw_system_error (errno); - // Read until we get something. - // - auto readsome = [&sock] () - { - ifdstream ifs (move (sock), - fdstream_mode::non_blocking, - ostream::badbit); - - char buf[256]; - for (streamsize n (0), m (0); - n == 0 || m != 0; - m = ifs.readsome (buf, sizeof (buf) - 1)) + // Read until we get something. 
+ // + auto readsome = [&sock] () { - if (m != 0) + ifdstream ifs (move (sock), + fdstream_mode::non_blocking, + ostream::badbit); + + char buf[256]; + for (streamsize n (0), m (0); + n == 0 || m != 0; + m = ifs.readsome (buf, sizeof (buf) - 1)) { - n += m; + if (m != 0) + { + n += m; - //buf[m] = '\0'; - //text << buf; + //buf[m] = '\0'; + //text << buf; + } } - } - sock = ifs.release (); - }; + sock = ifs.release (); + }; - // Read QEMU welcome. - // - readsome (); + // Read QEMU welcome. + // + readsome (); - // Write our command. - // + // Write our command. + // + { + ofdstream ofs (move (sock), fdstream_mode::blocking); + ofs << c << endl; + sock = ofs.release (); + } + + // Read QEMU reply (may hit eof). + // + readsome (); + return; + } + catch (const system_error& e) { - ofdstream ofs (move (sock), fdstream_mode::blocking); - ofs << c << endl; - sock = ofs.release (); + diag_record dr; if (fh) dr << fail; else dr << error; + dr << "unable to communicate with qemu monitor: " << e; } - // Read QEMU reply (may hit eof). - // - readsome (); - } - catch (const system_error& e) - { - fail << "unable to communicate with qemu monitor: " << e; + throw failed (); } unique_ptr<machine> diff --git a/bbot/machine.hxx b/bbot/machine.hxx index 9ea0d48..c15d618 100644 --- a/bbot/machine.hxx +++ b/bbot/machine.hxx @@ -15,6 +15,8 @@ namespace bbot // Note that if the machine is destroyed while it is still running, the // destructor will block until the machine process terminates. // + // Some functions can fail softly if the fail_hard argument is false. + // class machine { public: @@ -29,7 +31,7 @@ namespace bbot // Force the machine down. // virtual void - forcedown () = 0; + forcedown (bool fail_hard = true) = 0; // Suspend the machine. // @@ -42,13 +44,13 @@ namespace bbot // otherwise. // virtual bool - wait (size_t& seconds) = 0; + wait (size_t& seconds, bool fail_hard = true) = 0; bool - wait () + wait (bool fail_hard = true) { size_t sec (~0); // Wait indefinitely. 
- return wait (sec); + return wait (sec, fail_hard); } // Print information about the machine (as info diagnostics) that can be diff --git a/bbot/utility.hxx b/bbot/utility.hxx index 50756dd..f6f0349 100644 --- a/bbot/utility.hxx +++ b/bbot/utility.hxx @@ -80,11 +80,11 @@ namespace bbot template <typename P> void - run_io_finish (tracer&, process&, const P&); + run_io_finish (tracer&, process&, const P&, bool fail_hard = true); template <typename P> process_exit::code_type - run_io_finish_exit (tracer&, process&, const P&); + run_io_finish_exit (tracer&, process&, const P&, bool fail_hard = true); template <typename P, typename... A> inline void diff --git a/bbot/utility.txx b/bbot/utility.txx index c35db33..519762b 100644 --- a/bbot/utility.txx +++ b/bbot/utility.txx @@ -44,7 +44,7 @@ namespace bbot template <typename P> process_exit::code_type - run_io_finish_exit (tracer&, process& pr, const P& p) + run_io_finish_exit (tracer&, process& pr, const P& p, bool fh) { try { @@ -55,21 +55,32 @@ namespace bbot if (e.normal ()) return e.code (); - fail << "process " << p << " terminated abnormally: " - << e.description () << (e.core () ? " (core dumped)" : "") << endf; + diag_record dr; if (fh) dr << fail; else dr << error; + dr << "process " << p << " terminated abnormally: " + << e.description () << (e.core () ? 
" (core dumped)" : ""); } catch (const process_error& e) { - fail << "unable to execute " << p << ": " << e << endf; + diag_record dr; if (fh) dr << fail; else dr << error; + dr << "unable to execute " << p << ": " << e; } + + throw failed (); } template <typename P> inline void - run_io_finish (tracer& t, process& pr, const P& p) + run_io_finish (tracer& t, process& pr, const P& p, bool fh) { - if (run_io_finish_exit (t, pr, p) != 0) - fail << "process " << p << " terminated with non-zero exit code"; + if (run_io_finish_exit (t, pr, p, fh) == 0) + return; + + { + diag_record dr; if (fh) dr << fail; else dr << error; + dr << "process " << p << " terminated with non-zero exit code"; + } + + throw failed (); } template <typename I, typename O, typename E, typename P, typename... A> @@ -171,7 +182,7 @@ namespace bbot parse_manifest (istream& is, const string& name, const char* what, - bool hard, + bool fh, bool ignore_unknown) { using namespace butl; @@ -183,14 +194,14 @@ namespace bbot } catch (const manifest_parsing& e) { - diag_record dr; if (hard) dr << fail; else dr << error; + diag_record dr; if (fh) dr << fail; else dr << error; dr << "invalid " << what << " manifest: " << name << ':' << e.line << ':' << e.column << ": " << e.description; } catch (const io_error& e) { - diag_record dr; if (hard) dr << fail; else dr << error; + diag_record dr; if (fh) dr << fail; else dr << error; dr << "unable to read " << what << " manifest " << name << ": " << e; } @@ -227,7 +238,7 @@ namespace bbot ostream& os, const string& name, const char* what, - bool hard) + bool fh) { using namespace butl; @@ -239,13 +250,13 @@ namespace bbot } catch (const manifest_serialization& e) { - diag_record dr; if (hard) dr << fail; else dr << error; + diag_record dr; if (fh) dr << fail; else dr << error; dr << "invalid " << what << " manifest: " << e.description; } catch (const io_error& e) { - diag_record dr; if (hard) dr << fail; else dr << error; + diag_record dr; if (fh) dr << fail; else 
dr << error; fail << "unable to write " << what << " manifest " << name << ": " << e; } |