aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2019-01-28 13:10:22 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2019-01-28 13:10:22 +0200
commit 0a2c63de3c508b90d168604f2a5bc1345a12f4c9 (patch)
tree c02166932bf7bbce3e9c3701780fe18c51a52c61
parent 3fd741756f8b1f75c3051c4c8ba36b56c5175a48 (diff)
Fix race in QEMU shutdown
-rw-r--r-- bbot/agent/machine.cxx 18
-rw-r--r-- bbot/agent/machine.hxx 8
2 files changed, 18 insertions, 8 deletions
diff --git a/bbot/agent/machine.cxx b/bbot/agent/machine.cxx
index 69ef3c7..fdc11c0 100644
--- a/bbot/agent/machine.cxx
+++ b/bbot/agent/machine.cxx
@@ -378,8 +378,16 @@ namespace bbot
}
catch (const system_error& e)
{
- size_t t (0);
- if (wait (t))
+ // There is a window between QEMU closing the monitor socket and exiting
+ // so we wait but only briefly.
+ //
+ size_t t (seconds > 0 ? 1 : 0);
+
+ seconds -= t;
+ bool r (wait (t));
+ seconds += t;
+
+ if (r)
return true;
fail << "unable to communicate with qemu monitor: " << e;
@@ -391,13 +399,15 @@ namespace bbot
void kvm_machine::
forcedown (bool fh)
{
+ // Similar logic to shutdown().
+ //
try
{
monitor_command ("system_reset");
}
catch (const system_error& e)
{
- size_t t (0);
+ size_t t (1);
if (wait (t, fh))
return;
@@ -450,7 +460,7 @@ namespace bbot
}
catch (const process_error& e)
{
- fail (fh) << "unable to execute " << kvm << ": " << e << endf;
+ fail (fh) << "unable to wait for " << kvm << ": " << e << endf;
}
}
diff --git a/bbot/agent/machine.hxx b/bbot/agent/machine.hxx
index b1ad874..04da80e 100644
--- a/bbot/agent/machine.hxx
+++ b/bbot/agent/machine.hxx
@@ -38,10 +38,10 @@ namespace bbot
virtual void
suspend (bool fail_hard = true) = 0;
- // Wait for the machine to terminate up to the specified number of
- // seconds. Update the timeout and return false if the machine is still
- // running, true if the machine exited successfully, and throw failed
- // otherwise.
+ // Wait for the machine to terminate up to the specified number of seconds
+ // (with 0 meaning don't wait). Update the timeout and return false if the
+ // machine is still running, true if the machine exited successfully, and
+ // throw failed otherwise.
//
virtual bool
wait (size_t& seconds, bool fail_hard = true) = 0;