From 6cabe4d21b49ea149e6694c7290ff3e1fa9e08d2 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Mon, 1 May 2017 15:46:25 +0200
Subject: Retry mis-booted machines

---
 bbot/agent.cli           |  12 ++
 bbot/agent.cxx           | 329 ++++++++++++++++++++++++++---------------------
 bbot/bbot-agent@.service |   4 +
 3 files changed, 201 insertions(+), 144 deletions(-)

(limited to 'bbot')
diff --git a/bbot/agent.cli b/bbot/agent.cli
index 497b841..49563f8 100644
--- a/bbot/agent.cli
+++ b/bbot/agent.cli
@@ -106,6 +106,12 @@ namespace bbot
        600 (10 minutes) by default."
     }
 
+    size_t --bootstrap-retries = 2
+    {
+      "<num>",
+      "Number of time to retry a mis-booted bootstrap, 2 by default."
+    }
+
     size_t --build-timeout = 1800
     {
       "<sec>",
@@ -113,6 +119,12 @@ namespace bbot
        minutes) by default."
     }
 
+    size_t --build-retries = 2
+    {
+      "<num>",
+      "Number of time to retry a mis-booted build, 2 by default."
+    }
+
     size_t --request-timeout = 300
     {
       "<sec>",
diff --git a/bbot/agent.cxx b/bbot/agent.cxx
index 096bafe..d30799f 100644
--- a/bbot/agent.cxx
+++ b/bbot/agent.cxx
@@ -121,94 +121,119 @@ bootstrap_machine (const dir_path& md,
     path mf (arm.path () / "manifest");
     try_rmfile (mf);
 
-    tftp_server tftpd ("Gr  ^/?(.+)$  /toolchains/" + tc_name + "/\\1\n" +
-                       "Pr  ^/?(.+)$  /bootstrap/" + tc_name + "/\\1\n",
-                       ops.tftp_port () + tc_num);
-
-    l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
-
-    // Start the machine.
+    // Note that unlike build, here we use the same VM snapshot for retries,
+    // which is not ideal.
     //
-    unique_ptr<machine> m (
-      start_machine (md,
-                     mm,
-                     obmm ? obmm->machine.mac : nullopt,
-                     br,
-                     tftpd.port ()));
-
+    for (size_t retry (0);; ++retry)
     {
-      // If we are terminating with an exception then force the machine down.
-      // Failed that, the machine's destructor will block waiting for its
-      // completion.
-      //
-      auto mg (
-        make_exception_guard (
-          [&m, &md] ()
-          {
-            info << "trying to force machine " << md << " down";
-            try {m->forcedown ();} catch (const failed&) {}
-          }));
-
-      // What happens if the bootstrap process hangs? The simple thing would
-      // be to force the machine down after some timeout and then fail. But
-      // that won't be very helpful for investigating the cause. So instead
-      // the plan is to suspend it after some timeout, issue diagnostics
-      // (without failing and which Build OS monitor will relay to the admin),
-      // and wait for the external intervention.
+      tftp_server tftpd ("Gr  ^/?(.+)$  /toolchains/" + tc_name + "/\\1\n" +
+                         "Pr  ^/?(.+)$  /bootstrap/" + tc_name + "/\\1\n",
+                         ops.tftp_port () + tc_num);
+
+      l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
+
+      // Start the machine.
       //
-      auto soft_fail = [&md, &m] (const char* msg)
+      unique_ptr<machine> m (
+        start_machine (md,
+                       mm,
+                       obmm ? obmm->machine.mac : nullopt,
+                       br,
+                       tftpd.port ()));
+
       {
+        // If we are terminating with an exception then force the machine down.
+        // Failed that, the machine's destructor will block waiting for its
+        // completion.
+        //
+        auto mg (
+          make_exception_guard (
+            [&m, &md] ()
+            {
+              info << "trying to force machine " << md << " down";
+              try {m->forcedown ();} catch (const failed&) {}
+            }));
+
+        // What happens if the bootstrap process hangs? The simple thing would
+        // be to force the machine down after some timeout and then fail. But
+        // that won't be very helpful for investigating the cause. So instead
+        // the plan is to suspend it after some timeout, issue diagnostics
+        // (without failing and which Build OS monitor will relay to the
+        // admin), and wait for the external intervention.
+        //
+        auto soft_fail = [&md, &m] (const char* msg)
         {
-          diag_record dr (error);
-          dr << msg << " for machine " << md << ", suspending";
-          m->print_info (dr);
+          {
+            diag_record dr (error);
+            dr << msg << " for machine " << md << ", suspending";
+            m->print_info (dr);
+          }
+          m->suspend ();
+          m->wait ();
+          return nullopt;
+        };
+
+        // The first request should be the toolchain download. Wait for up to
+        // 5 minutes for that to arrive. In a sense we use it as an indication
+        // that the machine has booted and the bootstrap process has started.
+        // Why wait so long you may wonder? Well, we may be using a new MAC
+        // address and operating systems like Windows may need to digest that.
+        //
+        size_t to;
+        const size_t startup_to   (5 * 60);
+        const size_t bootstrap_to (ops.bootstrap_timeout ());
+        const size_t shutdown_to  (5 * 60);
+
+        // This can mean two things: machine mis-configuration or what we
+        // euphemistically call a "mis-boot": the VM failed to boot for some
+        // unknown/random reason. Mac OS is particularly know for suffering
+        // from this. So the strategy is to retry it a couple of times and
+        // then suspend for investigation.
+        //
+        if (!tftpd.serve ((to = startup_to)))
+        {
+          if (retry > ops.bootstrap_retries ())
+            return soft_fail ("bootstrap startup timeout");
+
+          warn << "machine " << mm.name << " appears to have "
+               << "mis-booted, retrying";
+
+          try {m->forcedown (false);} catch (const failed&) {}
+          continue;
         }
-        m->suspend ();
-        m->wait ();
-        return nullopt;
-      };
-
-      // The first request should be the toolchain download. Wait for up to 5
-      // minutes for that to arrive. In a sense we use it as an indication
-      // that the machine has booted and the bootstrap process has started.
-      // Why wait so long you may wonder? Well, we may be using a new MAC
-      // address and operating systems like Windows may need to digest that.
-      //
-      size_t to;
-      const size_t startup_to   (5 * 60);
-      const size_t bootstrap_to (ops.bootstrap_timeout ());
-      const size_t shutdown_to  (5 * 60);
 
-      if (!tftpd.serve ((to = startup_to)))
-        return soft_fail ("bootstrap startup timeout");
+        l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
 
-      l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+        // Next the bootstrap process may download additional toolchain
+        // archives, build things, and then upload the result manifest. So on
+        // our side we serve TFTP requests while periodically checking for the
+        // manifest file.
+        //
+        for (to = bootstrap_to;
+             to != 0 && !file_exists (mf);
+             tftpd.serve (to)) ;
 
-      // Next the bootstrap process may download additional toolchain
-      // archives, build things, and then upload the result manifest. So on
-      // our side we serve TFTP requests while periodically checking for the
-      // manifest file.
-      //
-      for (to = bootstrap_to; to != 0 && !file_exists (mf); tftpd.serve (to)) ;
+        if (to == 0)
+          return soft_fail ("bootstrap timeout");
 
-      if (to == 0)
-        return soft_fail ("bootstrap timeout");
+        l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
 
-      l3 ([&]{trace << "completed bootstrap in " << bootstrap_to - to << "s";});
+        // Shut the machine down cleanly.
+        //
+        if (!m->shutdown ((to = shutdown_to)))
+          return soft_fail ("bootstrap shutdown timeout");
 
-      // Shut the machine down cleanly.
-      //
-      if (!m->shutdown ((to = shutdown_to)))
-        return soft_fail ("bootstrap shutdown timeout");
+        l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
+      }
 
-      l3 ([&]{trace << "completed shutdown in " << shutdown_to - to << "s";});
-    }
+      // Parse the result manifest.
+      //
+      r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
 
-    // Parse the result manifest.
-    //
-    r.bootstrap = parse_manifest<bootstrap_manifest> (mf, "bootstrap");
+      r.machine.mac = m->mac; // Save the MAC address.
 
-    r.machine.mac = m->mac; // Save the MAC address.
+      break;
+    }
   }
   catch (const system_error& e)
   {
@@ -576,96 +601,112 @@ try
     const dir_path xp (
       md.directory () /= path::traits::temp_name (md.leaf ().string ()));
 
-    run_btrfs (trace, "subvolume", "snapshot", md, xp);
-
     string br ("br1"); // Using private bridge for now.
 
-    // Start the TFTP server.
-    //
-    tftp_server tftpd ("Gr  ^/?(.+)$  /build/" + tc_name + "/get/\\1\n" +
-                       "Pr  ^/?(.+)$  /build/" + tc_name + "/put/\\1\n",
-                       ops.tftp_port () + tc_num);
+    for (size_t retry (0);; ++retry)
+    {
+      if (retry != 0)
+        run_btrfs (trace, "subvolume", "delete", xp);
 
-    l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
+      run_btrfs (trace, "subvolume", "snapshot", md, xp);
 
-    // Start the machine.
-    //
-    unique_ptr<machine> m (
-      start_machine (xp,
-                     mm.machine,
-                     mm.machine.mac,
-                     br,
-                     tftpd.port ()));
-
-    // Note: the machine handling logic is similar to bootstrap.
-    //
-    {
-      auto mg (
-        make_exception_guard (
-          [&m, &xp] ()
-          {
-            info << "trying to force machine " << xp << " down";
-            try {m->forcedown ();} catch (const failed&) {}
-          }));
+      // Start the TFTP server.
+      //
+      tftp_server tftpd ("Gr  ^/?(.+)$  /build/" + tc_name + "/get/\\1\n" +
+                         "Pr  ^/?(.+)$  /build/" + tc_name + "/put/\\1\n",
+                         ops.tftp_port () + tc_num);
+
+      l3 ([&]{trace << "tftp server on port " << tftpd.port ();});
 
-      auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true)
+      // Start the machine.
+      //
+      unique_ptr<machine> m (
+        start_machine (xp,
+                       mm.machine,
+                       mm.machine.mac,
+                       br,
+                       tftpd.port ()));
+
+      // Note: the machine handling logic is similar to bootstrap.
+      //
       {
+        auto mg (
+          make_exception_guard (
+            [&m, &xp] ()
+            {
+              info << "trying to force machine " << xp << " down";
+              try {m->forcedown ();} catch (const failed&) {}
+            }));
+
+        auto soft_fail = [&xp, &m, &r] (const char* msg, bool wait = true)
         {
-          diag_record dr (error);
-          dr << msg << " for machine " << xp << ", suspending";
-          m->print_info (dr);
-        }
-        m->suspend ();
+          {
+            diag_record dr (error);
+            dr << msg << " for machine " << xp << ", suspending";
+            m->print_info (dr);
+          }
+          m->suspend ();
 
-        if (wait)
-          m->wait ();
+          if (wait)
+            m->wait ();
 
-        return r;
-      };
+          return r;
+        };
 
-      // The first request should be the task manifest download. Wait for up
-      // to 60 seconds for that to arrive. In a sense we use it as an
-      // indication that the machine has booted and the worker process has
-      // started.
-      //
-      size_t to;
-      const size_t startup_to (60);
-      const size_t build_to   (ops.build_timeout ());
+        // The first request should be the task manifest download. Wait for up
+        // to 60 seconds for that to arrive. In a sense we use it as an
+        // indication that the machine has booted and the worker process has
+        // started.
+        //
+        size_t to;
+        const size_t startup_to (60);
+        const size_t build_to   (ops.build_timeout ());
 
-      if (!tftpd.serve ((to = startup_to)))
-        return soft_fail ("build startup timeout");
+        if (!tftpd.serve ((to = startup_to)))
+        {
+          if (retry > ops.build_retries ())
+            return soft_fail ("build startup timeout");
 
-      l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
+          warn << "machine " << mm.machine.name << " appears to have "
+               << "mis-booted, retrying";
 
-      // Next the worker builds things and then uploads the result manifest.
-      // So on our side we serve TFTP requests while checking for the manifest
-      // file.
-      //
-      for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ;
+          try {m->forcedown (false);} catch (const failed&) {}
+          continue;
+        }
 
-      if (to == 0)
-        return soft_fail ("build timeout");
+        l3 ([&]{trace << "completed startup in " << startup_to - to << "s";});
 
-      l3 ([&]{trace << "completed build in " << build_to - to << "s";});
+        // Next the worker builds things and then uploads the result manifest.
+        // So on our side we serve TFTP requests while checking for the
+        // manifest file.
+        //
+        for (to = build_to; to != 0 && !file_exists (rf); tftpd.serve (to)) ;
 
-      // Parse the result manifest.
-      //
-      r = parse_manifest<result_manifest> (rf, "result");
+        if (to == 0)
+          return soft_fail ("build timeout");
 
-      // If the build terminated abnormally, suspent the machine for
-      // investigation (note that here we don't wait or return).
-      //
-      if (r.status == result_status::abnormal)
-        soft_fail ("build terminated abnormally", false);
+        l3 ([&]{trace << "completed build in " << build_to - to << "s";});
 
-      // Force the machine down (there is no need wasting time on clean
-      // shutdown since the next step is to drop the snapshot). Also fail
-      // softly if things go badly.
-      //
-      try {m->forcedown (false);} catch (const failed&) {}
-    }
+        // Parse the result manifest.
+        //
+        r = parse_manifest<result_manifest> (rf, "result");
+
+        // If the build terminated abnormally, suspent the machine for
+        // investigation (note that here we don't wait or return).
+        //
+        if (r.status == result_status::abnormal)
+          soft_fail ("build terminated abnormally", false);
 
-    run_btrfs (trace, "subvolume", "delete", xp);
+        // Force the machine down (there is no need wasting time on clean
+        // shutdown since the next step is to drop the snapshot). Also fail
+        // softly if things go badly.
+        //
+        try {m->forcedown (false);} catch (const failed&) {}
+      }
+
+      run_btrfs (trace, "subvolume", "delete", xp);
+      break;
+    }
   }
 
   // Update package name/version if the returned value as "unknown".
diff --git a/bbot/bbot-agent@.service b/bbot/bbot-agent@.service
index b15108f..dd7ca21 100644
--- a/bbot/bbot-agent@.service
+++ b/bbot/bbot-agent@.service
@@ -11,7 +11,9 @@ Environment=CPU=1
 Environment=RAM=1048576
 
 Environment=BOOTSTRAP_TIMEOUT=600
+Environment=BOOTSTRAP_RETRIES=2
 Environment=BUILD_TIMEOUT=1800
+Environment=BUILD_RETRIES=2
 Environment=REQUEST_TIMEOUT=300
 
 Environment=TOOLCHAIN_NAME=%i
@@ -26,7 +28,9 @@ ExecStart=/build/bots/%i/bin/bbot-agent --systemd-daemon \
   --cpu ${CPU} \
   --ram ${RAM} \
   --bootstrap-timeout ${BOOTSTRAP_TIMEOUT} \
+  --bootstrap-retries ${BOOTSTRAP_RETRIES} \
   --build-timeout ${BUILD_TIMEOUT} \
+  --build-retries ${BUILD_RETRIES} \
   --request-timeout ${REQUEST_TIMEOUT} \
   --toolchain-name ${TOOLCHAIN_NAME} \
   --toolchain-num ${TOOLCHAIN_NUM} \
-- 
cgit v1.1