diff options
author | Boris Kolpackov <boris@codesynthesis.com> | 2019-01-28 13:10:22 +0200 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2019-01-28 13:10:22 +0200 |
commit | 0a2c63de3c508b90d168604f2a5bc1345a12f4c9 (patch) | |
tree | c02166932bf7bbce3e9c3701780fe18c51a52c61 | |
parent | 3fd741756f8b1f75c3051c4c8ba36b56c5175a48 (diff) |
Fix race in QEMU shutdown
-rw-r--r-- | bbot/agent/machine.cxx | 18 | ||||
-rw-r--r-- | bbot/agent/machine.hxx | 8 |
2 files changed, 18 insertions, 8 deletions
```diff
diff --git a/bbot/agent/machine.cxx b/bbot/agent/machine.cxx
index 69ef3c7..fdc11c0 100644
--- a/bbot/agent/machine.cxx
+++ b/bbot/agent/machine.cxx
@@ -378,8 +378,16 @@ namespace bbot
     }
     catch (const system_error& e)
     {
-      size_t t (0);
-      if (wait (t))
+      // There is a window between QEMU closing the monitor socket and exiting
+      // so we wait but only briefly.
+      //
+      size_t t (seconds > 0 ? 1 : 0);
+
+      seconds -= t;
+      bool r (wait (t));
+      seconds += t;
+
+      if (r)
         return true;
 
       fail << "unable to communicate with qemu monitor: " << e;
@@ -391,13 +399,15 @@ namespace bbot
   void kvm_machine::
   forcedown (bool fh)
   {
+    // Similar logic to shutdown().
+    //
     try
     {
       monitor_command ("system_reset");
     }
     catch (const system_error& e)
     {
-      size_t t (0);
+      size_t t (1);
       if (wait (t, fh))
         return;
 
@@ -450,7 +460,7 @@ namespace bbot
     }
     catch (const process_error& e)
     {
-      fail (fh) << "unable to execute " << kvm << ": " << e << endf;
+      fail (fh) << "unable to wait for " << kvm << ": " << e << endf;
     }
   }
 
diff --git a/bbot/agent/machine.hxx b/bbot/agent/machine.hxx
index b1ad874..04da80e 100644
--- a/bbot/agent/machine.hxx
+++ b/bbot/agent/machine.hxx
@@ -38,10 +38,10 @@ namespace bbot
     virtual void
     suspend (bool fail_hard = true) = 0;
 
-    // Wait for the machine to terminate up to the specified number of
-    // seconds. Update the timeout and return false if the machine is still
-    // running, true if the machine exited successfully, and throw failed
-    // otherwise.
+    // Wait for the machine to terminate up to the specified number of seconds
+    // (with 0 meaning don't wait). Update the timeout and return false if the
+    // machine is still running, true if the machine exited successfully, and
+    // throw failed otherwise.
     //
     virtual bool
     wait (size_t& seconds, bool fail_hard = true) = 0;
```