From 1fd86636b0d70d754caf205d8048893a9c9793c3 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Thu, 7 Feb 2019 08:02:47 +0200
Subject: Various improvements and fixes

---
 buildos | 538 +++++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 326 insertions(+), 212 deletions(-)

(limited to 'buildos')
diff --git a/buildos b/buildos
index 012869b..52e984e 100755
--- a/buildos
+++ b/buildos
@@ -51,7 +51,7 @@ info "starting build os monitor..."
 #
 # First we separete quoted variables and arguments with newlines (giving
 # priority to assignments). Then we replace whitespaces with newline on
-# lines that don't contain quites. Finally, clean up by removing blank
+# lines that don't contain quotes. Finally, we clean up by removing blank
 # lines.
 #
 # Note: the same code as in init.
@@ -137,7 +137,9 @@ function toolchain_value () # <toolchain-prefix> <variable>
   echo "${!n}"
 }
 
+instances=0        # Number of bbot instances across all toolchains.
 toolchain_names=()
+
 for tn in "${!toolchains[@]}"; do
   tp="${toolchains["$tn"]}"
   tu="$(toolchain_value "$tp" toolchain_url)"
@@ -161,8 +163,28 @@ for tn in "${!toolchains[@]}"; do
   declare "${tp}toolchain_ver="
   declare "${tp}toolchain_fver=" # Full version (with snapshot).
 
-  # If buildos.toolchain_trust was not specified, set it to "no" so that
-  # we don't prompt if the repository happens to be signed.
+  # Default to 1 bbot agent instance.
+  #
+  if [ -z "$(toolchain_value "$tp" instances)" ]; then
+    declare "${tp}instances=1"
+  fi
+
+  instances=$(($instances + $(toolchain_value "$tp" instances)))
+
+  # Default to 0 nice value.
+  #
+  if [ -z "$(toolchain_value "$tp" nice)" ]; then
+    declare "${tp}nice=0"
+  fi
+
+  # Default to br1 (private/NAT bridge).
+  #
+  if [ -z "$(toolchain_value "$tp" bridge)" ]; then
+    declare "${tp}bridge=br1"
+  fi
+
+  # If toolchain_trust was not specified, set it to "no" so that we don't
+  # prompt if the repository happens to be signed.
   #
   if [ -z "$(toolchain_value "$tp" toolchain_trust)" ]; then
     declare "${tp}toolchain_trust=no"
@@ -180,27 +202,48 @@ if [ "${#toolchain_names[@]}" -eq 0 ]; then
   info "no buildos.toolchain_url specified, not bootstrapping"
 fi
 
-# Divide CPUs and RAM (in kB) among the toolchains.
+# Divide CPUs and RAM (in KB) among the instances.
+#
+# By default reserve 4G of RAM for ourselves (rootfs, tmpfs).
 #
-# Reserve 4G of RAM for ourselves (rootfs, tmpfs).
+# Note that MemTotal in /proc/meminfo is the available memory, not physical.
+# And to make it easier to provision memory it's really helpful to base it
+# on the physical value.
 #
-ram_total="$(sed -n -re 's/^MemTotal: *([0-9]+) *kB$/\1/p' </proc/meminfo)"
+ram_total=0
+for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do
+  ram_total=$(($ram_total + $i * 1024))
+done
+
+if [ "$ram_total" -eq 0 ]; then
+  error "unable to determine physical memory size"
+fi
+
 cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')"
 
+if [ -z "$ram_reserved" ]; then
+  ram_reserved=4
+fi
+ram_reserved=$(($ram_reserved * 1024 * 1024))
+
 if [ -z "$ram_overcommit" ]; then
   ram_overcommit=1
 fi
 
+if [ -z "$cpu_reserved" ]; then
+  cpu_reserved=0
+fi
+
 if [ -z "$cpu_overcommit" ]; then
   cpu_overcommit=1
 fi
 
-ram_slice=$(("$ram_total" - 4 * 1024 * 1024))
-cpu_slice="$cpu_total"
+ram_slice=$(($ram_total - $ram_reserved))
+cpu_slice=$(($cpu_total - $cpu_reserved))
 
-if [ "${#toolchain_names[@]}" -gt 1 ]; then
-  ram_slice=$(("$ram_slice" * "$ram_overcommit" / "${#toolchain_names[@]}"))
-  cpu_slice=$(("$cpu_slice" * "$cpu_overcommit" / "${#toolchain_names[@]}"))
+if [ "$instances" -gt 1 ]; then
+  ram_slice=$(($ram_slice * $ram_overcommit / $instances))
+  cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances))
 
   if [ "$cpu_slice" -eq 0 ]; then
     cpu_slice=1
@@ -212,13 +255,15 @@ fi
 function print ()
 {
   echo "cpu_total:      $cpu_total"
+  echo "cpu_reserved:   $cpu_reserved"
   echo "cpu_overcommit: $cpu_overcommit"
   echo "cpu_slice:      $cpu_slice"
   echo
 
-  echo "ram_total:      $ram_total kB"
+  echo "ram_total:      $ram_total KB"
+  echo "ram_reserved:   $ram_reserved KB"
   echo "ram_overcommit: $ram_overcommit"
-  echo "ram_slice:      $ram_slice kB"
+  echo "ram_slice:      $ram_slice KB"
   echo
 
   echo "buildid:        $buildid"
@@ -228,9 +273,15 @@ function print ()
   local n i tn tp tu tt
   for tn in "${toolchain_names[@]}"; do
     tp="${toolchains["$tn"]}"
+    tc="$(toolchain_value "$tp" nice)"
+    tb="$(toolchain_value "$tp" bridge)"
+    ti="$(toolchain_value "$tp" instances)"
     tu="$(toolchain_value "$tp" toolchain_url)"
     tt="$(toolchain_value "$tp" toolchain_trust)"
 
+    echo "$tn.nice:              $tc"
+    echo "$tn.bridge:            $tb"
+    echo "$tn.instances:         $ti"
     echo "$tn.toolchain_url:     $tu"
     echo "$tn.toolchain_trust:   $tt"
 
@@ -283,7 +334,7 @@ function machines_for () # <function> <function-args>...
   for v in /build/machines/*; do
     if [ ! -d "$v" ]; then
       diag+=("$v: error: invalid volume")
-      fail="true"
+      fail=true
       continue
     fi
 
@@ -292,7 +343,7 @@ function machines_for () # <function> <function-args>...
     for m in *; do
       if [ ! -d "$m" ]; then
 	diag+=("$v/$m: error: invalid machine")
-	fail="true"
+	fail=true
 	continue
       fi
 
@@ -308,19 +359,29 @@ function machines_clean_subvolume () # <subvolume-path>
 {
   if ! btrfs property set -ts "$1" ro false; then
     diag+=("$1: error: unable to change subvolume property")
-    fail="true"
+    fail=true
     return 1
   fi
 
   if ! btrfs subvolume delete "$1"; then
     diag+=("$1: error: unable to delete subvolume")
-    fail="true"
+    fail=true
+    return 1
+  fi
+}
+
+function machines_clean_lockfile () # <lockfile-path>
+{
+  if ! rm -f "$1"; then
+    diag+=("$1: error: unable to delete lockfile")
+    fail=true
     return 1
   fi
 }
 
 # Cleanup the <name>-<toolchain>-<xxx> entries for the specified toolchain
-# called before starting each toolchain.
+# (all instances) as well as <name>-<toolchain>.lock file. Called before
+# starting bbot instances for each toolchain.
 #
 function machines_clean_toolchain () # <volume-dir> <machine> <toolchain>
 {
@@ -330,24 +391,33 @@ function machines_clean_toolchain () # <volume-dir> <machine> <toolchain>
 
   cd "$m"
 
-  local s
-  for s in "$m"-"$tn"-*; do
+  local i
+  for i in "$m"-"$tn"-*; do
 
-    if [ ! -d "$s" ]; then
-      diag+=("$v/$m/$s: error: invalid machine subvolume")
-      fail="true"
+    if [ ! -d "$i" ]; then
+      diag+=("$v/$m/$i: error: invalid machine subvolume")
+      fail=true
       continue
     fi
 
-    if machines_clean_subvolume "$v/$m/$s"; then
-      diag+=("$v/$m/$s: info: deleted stray toolchain working subvolume")
+    if machines_clean_subvolume "$v/$m/$i"; then
+      diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume")
     fi
   done
 
+  i="$m-$tn.lock"
+  if [ -f "$i" ]; then
+
+    if machines_clean_lockfile "$v/$m/$i"; then
+      diag+=("$v/$m/$i: info: deleted stray lockfile")
+    fi
+  fi
+
   cd "$v"
 }
 
-# Cleanup stray snapshots or deleted machines. Called once during startup.
+# Cleanup stray snapshots and lockfiles as well as deleted machines. Called
+# once during startup.
 #
 function machines_clean_stray () # <volume-dir> <machine>
 {
@@ -359,35 +429,48 @@ function machines_clean_stray () # <volume-dir> <machine>
   # Collect current machine symlink's bootstrap protocol numbers. If there
   # are no current machine symlinks, then we delete the whole thing.
   #
-  local s ps=()
-  for s in "$m"-*; do
-    if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then
+  local i ps=()
+  for i in "$m"-*; do
+    if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
 
-      if [ ! -L "$s" ]; then
-	diag+=("$v/$m/$s: error: not a symlink")
-	fail="true"
+      if [ ! -L "$i" ]; then
+	diag+=("$v/$m/$i: error: not a symlink")
+	fail=true
       fi
 
       # Treat it as if it were a symlink even if its not. Failed that we
       # may try to delete the whole thing.
       #
-      ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$s")")
+      ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")")
     fi
   done
 
   # Examine each machine subvolume.
   #
-  for s in "$m"-*; do
+  for i in "$m"-*; do
 
     # <name>-<P> (current machine symlink)
     #
-    if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then
+    if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
       continue
     fi
 
-    if [ ! -d "$s" ]; then
-      diag+=("$v/$m/$s: error: invalid machine subvolume")
-      fail="true"
+    # Lockfile.
+    #
+    if [ -f "$i" ]; then
+
+      if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then
+
+	if machines_clean_lockfile "$v/$m/$i"; then
+	  diag+=("$v/$m/$i: info: deleted lockfile")
+	fi
+	continue
+      fi
+    fi
+
+    if [ ! -d "$i" ]; then
+      diag+=("$v/$m/$i: error: invalid machine subvolume")
+      fail=true
       continue
     fi
 
@@ -400,8 +483,8 @@ function machines_clean_stray () # <volume-dir> <machine>
       #
       local p f=
       for p in "${ps[@]}"; do
-	if [[ "$s" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then
-	  f="true"
+	if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then
+	  f=true
 	  break
 	fi
       done
@@ -415,8 +498,8 @@ function machines_clean_stray () # <volume-dir> <machine>
       f=
       local tn
       for tn in "${toolchain_names[@]}"; do
-	if [[ "$s" =~ ^"$m"-"$tn"$ ]]; then
-	  f="true"
+	if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then
+	  f=true
 	  break
 	fi
       done
@@ -426,11 +509,11 @@ function machines_clean_stray () # <volume-dir> <machine>
       fi
     fi
 
-    # This is either a stray working submodule or a bootsrapped subvolume
+    # This is either a stray working subvolume or a bootstrapped subvolume
     # for a toolchain that was deleted (or we are deleting everything).
     #
-    if machines_clean_subvolume "$v/$m/$s"; then
-      diag+=("$v/$m/$s: info: deleted subvolume")
+    if machines_clean_subvolume "$v/$m/$i"; then
+      diag+=("$v/$m/$i: info: deleted subvolume")
     fi
   done
 
@@ -443,7 +526,7 @@ function machines_clean_stray () # <volume-dir> <machine>
       diag+=("$v/$m: info: deleted machine directory")
     else
       diag+=("$v/$m: error: unable to delete machine directory")
-      fail="true"
+      fail=true
     fi
   fi
 }
@@ -463,8 +546,7 @@ if [ "${#diag[@]}" -gt 0 ]; then
   info "$s" && print_diag 1>&2
 
   if [ -n "$fail" ]; then
-    info "correct and restart the monitor (systemctl restart buildos)"
-    exit 1
+    error "correct and restart the monitor (systemctl restart buildos)"
   fi
 fi
 
@@ -513,7 +595,7 @@ function toolchain_fetch () # <toolchain-name> <line>
       return 1
     fi
 
-    info "toolchain '$tn' version $tv"
+    info "toolchain $tn version $tv"
 
     declare -g "${tp}toolchain_fver=$tv" # Full version.
     echo "$tv" >"$tr/version-full"
@@ -713,9 +795,12 @@ function bbot_check () # <toolchain-name>
 function bbot_start () # <toolchain-name> <toolchain-index>
 {
   local tn="$1"
-  local ti="$2"
+  local tx="$2"
 
   local tp="${toolchains["$tn"]}"
+  local tc="$(toolchain_value "$tp" nice)"
+  local tb="$(toolchain_value "$tp" bridge)"
+  local ti="$(toolchain_value "$tp" instances)"
   local tv="$(toolchain_value "$tp" toolchain_fver)"
   local ts="$(toolchain_value "$tp" toolchain_file_csum)"
 
@@ -741,9 +826,13 @@ function bbot_start () # <toolchain-name> <toolchain-index>
     #
     if [ "$b_word" = "configured" ]; then
 
-      if ! sudo systemctl stop "bbot-agent@$tn"; then
-	info "failed to stop bbot-agent@$tn service, assuming not running"
-      fi
+      for ((i=1; i <= ti; i++)); do
+	if ! sudo systemctl stop "bbot-agent-$tn@$i"; then
+	  info "failed to stop bbot-agent-$tn@$i service, assuming not running"
+	  continue
+	fi
+	info "stopped bbot-agent-$tn@$i service"
+      done
 
       # We may not be able to uninstall if we previously failed to build.
       #
@@ -752,37 +841,45 @@ function bbot_start () # <toolchain-name> <toolchain-index>
       fi
     fi
 
-    # Build and install the bbot agent. Since other agents might already
-    # be running, limit the number of jobs to our slice.
+    # Build and install the bbot agent. Since other agents might already be
+    # running, limit the number of jobs to our slice.
     #
-    if ! bpkg --fetch-timeout "$timeout"                        \
-	      --build-option --jobs --build-option "$cpu_slice" \
+    if ! bpkg --fetch-timeout "$timeout" \
+	      --build-option --jobs=$(($ti * $cpu_slice)) \
 	      build --yes libbbot bbot; then
-      info "failed to build bbot-agent@$tn"
+      info "failed to build bbot-agent for $tn"
       break
     fi
 
     if ! bpkg install "${vars[@]}" bbot; then
-      info "failed to install bbot-agent@$tn"
+      info "failed to install bbot-agent for $tn"
       break
     fi
 
-    # Post-process and install systemd .service file. Note that we cannot use
-    # the systemd pattern machinery since each version of bbot can have its
-    # own version of the .service file.
+    # Post-process and install the systemd .service file. Since we may have
+    # multiple toolchains, we embed the toolchain name into the service name
+    # with the systemd pattern machinery used to run multiple bbot instances
+    # per toolchain.
+    #
+    # We assume `%I` is only used in Description and similar and rewrite it
+    # as `<name>/%i` (e.g., `stage/1`).
     #
     sed -i -r \
-	-e "s/%[iI]/$tn/g" \
-	-e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
+	-e "s#%I#$tn/%i#g" \
 	-e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \
 	-e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \
+	-e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \
+	-e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
 	-e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \
-	-e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$ti/" \
+	-e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \
 	-e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \
+	-e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \
+	-e "s/^(Nice)=.*/\1=$tc/" \
+	-e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \
 	"$id/lib/systemd/system/bbot-agent@.service"
 
     # Patch in the controller URLs. These can contain special characters
-    # like & so we have to escape them.
+    # like `&` so we have to escape them.
     #
     n="${tp}controller_url[@]"
     for i in "${!n}"; do
@@ -801,8 +898,11 @@ function bbot_start () # <toolchain-name> <toolchain-index>
 	  "$id/lib/systemd/system/bbot-agent@.service"
     done
 
-    sudo ln -sf "$id/lib/systemd/system/bbot-agent@.service" \
-      "/usr/lib/systemd/system/bbot-agent@$tn.service"
+    # Note: using a hard link to prevent systemd from being too clever and
+    # calling the service bbot-agent@.
+    #
+    sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \
+      "/usr/lib/systemd/system/bbot-agent-$tn@.service"
 
     # Clean up any machine snapshots that might have been left behind.
     #
@@ -818,25 +918,28 @@ function bbot_start () # <toolchain-name> <toolchain-index>
       print_diag 1>&2
 
       if [ -n "$fail" ]; then
-	info "correct and start bbot-agent@$tn (systemctl start bbot-agent@$tn)"
+	info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)"
 	break
       fi
     fi
 
-    # Start the service. With Type=simple start returns as soon as the process
-    # has forked. To see if the service actually started is done as part of
-    # service monitoring.
+    # Start each service instance. With Type=simple start returns as soon as
+    # the process has forked. Making sure the service has actually started is
+    # done as part of the service monitoring.
     #
-    if ! sudo systemctl start "bbot-agent@$tn"; then
-      info "failed to start bbot-agent@$tn service"
-      break
-    fi
-
     r=0
+    for ((i=1; i <= ti; i++)); do
+      if ! sudo systemctl start "bbot-agent-$tn@$i"; then
+	info "failed to start bbot-agent-$tn@$i service instance"
+	r=1
+	break
+      fi
+    done
+
     break
   done
-  cd "$owd"
 
+  cd "$owd"
   return "$r"
 }
 
@@ -855,6 +958,27 @@ while true; do
 
   count=$(($count + 1))
 
+  # Check for OS changes. Do this first in case of any issues in the following
+  # checks.
+  #
+  if [ -n "$buildid_url" ]; then
+    # Fetch the current id. While normally it will be a TFTP URL, it could also
+    # be HTTP(S) so we configure sensible behavior for that.
+    #
+    if id="$("${curl[@]}" "$buildid_url")"; then
+      if [ "$id" != "$buildid" ]; then
+	email "rebooting because of new os build" <<EOF
+old_buildid: $buildid
+new_buildid: $id
+EOF
+	info "new os build ($id), rebooting..."
+	restart
+      fi
+    else
+      info "unable to fetch $buildid_url, will try again"
+    fi
+  fi
+
   # Check for toolchain changes. If this is the first run, bootstrap them.
   #
   for tn in "${toolchain_names[@]}"; do
@@ -886,11 +1010,11 @@ while true; do
 	cs="$(toolchain_checksum "$tp" "$f")"
 
 	if [ "$ts" != "$cs" ]; then
-	  email "rebooting because of new '$tn' toolchain" <<EOF
+	  email "rebooting because of new $tn toolchain" <<EOF
 old_checksum: $ts
 new_checksum: $cs
 EOF
-	  info "new '$tn' toolchain ($cs), rebooting..."
+	  info "new $tn toolchain ($cs), rebooting..."
 	  restart
 	fi
       else
@@ -905,7 +1029,7 @@ EOF
 	# subshell and any variables it sets (like toolchain_ver) won't be
 	# visible to us.
 	#
-	info "bootstrapping '$tn' toolchain..."
+	info "bootstrapping $tn toolchain..."
 
 	toolchain_bootstrap "$tn" 2>&1 | tee "$tr/toolchain-$count.log" 1>&2
 
@@ -917,15 +1041,15 @@ EOF
 	    tv="$(cat $tr/version-full)"
 	    declare "${tp}toolchain_fver=$tv"
 
-	    s="bootstrapped '$tn' toolchain $tv"
+	    s="bootstrapped $tn toolchain $tv"
 	    toolchain_boots+=("$tn")
 	    ;;
 	  1)
-	    s="skipping disabled '$tn' toolchain, waiting for new version"
+	    s="skipping disabled $tn toolchain, waiting for new version"
 	    toolchain_boots+=("") # Skip.
 	    ;;
 	  *)
-	    s="failed to bootstrap '$tn' toolchain, waiting for new version"
+	    s="failed to bootstrap $tn toolchain, waiting for new version"
 	    toolchain_boots+=("") # Skip.
 	    ;;
 	esac
@@ -946,10 +1070,10 @@ EOF
   #
   if [ "${#toolchain_names[@]}" -eq "${#toolchain_boots[@]}" ]; then
 
-    ti=0 # Toolchain index.
+    tx=0 # Toolchain index.
     for tn in "${toolchain_boots[@]}"; do
 
-      ti=$(($ti + 1))
+      tx=$(($tx + 1))
 
       # Skip those that failed to bootstrap.
       #
@@ -958,6 +1082,7 @@ EOF
       fi
 
       tp="${toolchains["$tn"]}"
+      ti="$(toolchain_value "$tp" instances)"
       tr="$(toolchain_value "$tp" toolchain_root)"
 
       # Or those that have no controllers (maybe it would have been better
@@ -969,171 +1094,160 @@ EOF
       fi
 
       s=
-      bbot_check "$tn" 2>&1 | tee "$tr/bbot-$count.log" 1>&2
+      bbot_check "$tn" 2>&1 | tee "$tr/bbot-agent-$count.log" 1>&2
 
       case "${PIPESTATUS[0]}" in
 	0)
-	  rm -f "$tr/bbot-$count.log"
+	  rm -f "$tr/bbot-agent-$count.log"
 
-	  # Check if the service has failed.
+	  # For each service instance check if it has failed.
 	  #
-	  if sudo systemctl is-failed --quiet "bbot-agent@$tn"; then
-	    s="bbot-agent@$tn service has failed, stopping"
-
-	    # Note: ignore errors.
-	    #
-	    sudo systemctl status "bbot-agent@$tn" 2>&1 | \
-	      tee "$tr/bbot-$count.log" 1>&2
-
-	    # Reset it so that we don't keep sending the log on each
-	    # iteration. Note: ignore errors.
-	    #
-	    sudo systemctl reset-failed "bbot-agent@$tn" 2>&1 | \
-	      tee -a "$tr/bbot-$count.log" 1>&2
-	  else
-	    # See if there is any diagnostics in the systemd journal. We
-	    # notify about warning and up.
-	    #
-	    # The old versions journalctl behavior is to not output anything
-	    # (not even the cursor) if there are no new entries. The new
-	    # versions output the old cursor.
-	    #
-	    # Plus, it sometimes changes the cursor even without any errors in
-	    # it (journal rewind/truncation maybe?) so we have to detect that.
-	    #
-	    c=(sudo journalctl --no-pager --quiet --output short-full \
-              --unit "bbot-agent@$tn")
-
-	    # Get the last cursor if any.
-	    #
-	    oc="${toolchain_cursors["$tn"]}"
-	    if [ -n "$oc" ]; then
-	      c+=("--after-cursor" "$oc")
-	    fi
+	  for ((i=1; i <= ti; i++)); do
+
+	    if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then
+	      s="bbot-agent-$tn@$i service has failed, stopping"
+
+	      # Note: ignore errors.
+	      #
+	      sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \
+		tee "$tr/bbot-agent-$i-$count.log" 1>&2
+
+	      # Reset it so that we don't keep sending the log on each
+	      # iteration. Note: ignore errors.
+	      #
+	      sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \
+		tee -a "$tr/bbot-agent-$i-$count.log" 1>&2
+
+	      info "$s"
+	      email "$s" <<EOF
+$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$i-$count.log
+EOF
+	    else
+	      # See if there are any diagnostics in the systemd journal. We
+	      # notify about warnings and up.
+	      #
+	      # The old versions journalctl behavior is to not output anything
+	      # (not even the cursor) if there are no new entries. The new
+	      # versions output the old cursor.
+	      #
+	      # Plus, it sometimes changes the cursor even without any errors
+	      # in it (journal rewind/truncation maybe?) so we have to detect
+	      # that.
+	      #
+	      c=(sudo journalctl --no-pager --quiet --output short-full \
+                   --unit "bbot-agent-$tn@$i")
+
+	      # Get the last cursor if any.
+	      #
+	      oc="${toolchain_cursors["$tn/$i"]}"
+	      if [ -n "$oc" ]; then
+		c+=("--after-cursor" "$oc")
+	      fi
 
-	    # Get the "log range": the first line is the date of the first
-	    # error, the second line is the date of the last error, and the
-	    # third line is the end cursor. It can also be just one line in
-	    # which case it is the new cursor (that rewind stuff).
-	    #
-	    # Here is what's going on in that sed script:
-	    #
-	    # The first chunk matches the first line. We first put it into
-	    # the hold space (in case that's the only line) and then extract
-	    # and print the date.
-	    #
-	    # The second chunk matches the last line. We first handle the hold
-	    # space which by now should contain the last error line and then
-	    # the cursor.
-	    #
-	    # The last chunk matches every other line. We simply replace the
-	    # hold space with the next line so that at the end we have the
-	    # last line there.
-	    #
-	    lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \
+	      # Get the "log range": the first line is the date of the first
+	      # error, the second line is the date of the last error, and the
+	      # third line is the end cursor. It can also be just one line in
+	      # which case it is the new cursor (that rewind stuff).
+	      #
+	      # Here is what's going on in that sed script:
+	      #
+	      # The first chunk matches the first line. We first put it into
+	      # the hold space (in case that's the only line) and then extract
+	      # and print the date.
+	      #
+	      # The second chunk matches the last line. We first handle the
+	      # hold space which by now should contain the last error line and
+	      # then the cursor.
+	      #
+	      # The last chunk matches every other line. We simply replace the
+	      # hold space with the next line so that at the end we have the
+	      # last line there.
+	      #
+	      lr="$("${c[@]}" --priority 4 --show-cursor | sed -n -r \
 -e '1{h;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;t}' \
 -e '${x;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;x;s/^-- cursor: (.+)$/\1/p;t}' \
 -e 'h')"
-	    lc="$(wc -l <<<"$lr")"
-	    nc="$(sed -n -e "${lc}p" <<<"$lr")"
-
-	    # If we have no new entries, then nothing to do.
-	    #
-	    if [ "$nc" != "$oc" ]; then
+	      lc="$(wc -l <<<"$lr")"
+	      nc="$(sed -n -e "${lc}p" <<<"$lr")"
 
-	      # We may have no actual entries (cursor rewind).
+	      # If we have no new entries, then nothing to do.
 	      #
-	      if [ "$lc" -ne 1 ]; then
+	      if [ "$nc" != "$oc" ]; then
 
-		# Try to get some context before the first error and after the
-		# last. This is unexpectedly hard in systemd.
-		#
-		# This can be a lot of output which makes it hard to spot the
-		# error so we are going to print just the error summary first.
-		# Quite a mess, I agree.
+		# We may have no actual entries (cursor rewind).
 		#
-		sd="$(sed -n -e '1p' <<<"$lr")"
-		sd="$(date '+%s' -d "$sd")"                           # sec
-		sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec
-
-		ed="$(sed -n -e '2p' <<<"$lr")"
-		ed="$(date '+%s' -d "$ed")"                           # sec
-		ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec
-
-		s="bbot-agent@$tn service issued new diagnostics"
-
-		info "$s"
-		{
-		  echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
-		  echo;
-		  echo "summary:";
-		  echo;
-		  "${c[@]}" --priority 4 | head -n 200;
-		  echo;
-		  echo "context:";
-		  echo;
-		  if [ -n "$oc" ]; then
-		    unset 'c[-1]' # Pop cursor (for --since/--until).
-		    unset 'c[-1]'
-		  fi;
-		  "${c[@]}" --since "$sd" --until "$ed" | head -n 200
-		} | email "$s"
+		if [ "$lc" -ne 1 ]; then
+
+		  # Try to get some context before the first error and after
+		  # the last. This is unexpectedly hard in systemd.
+		  #
+		  # This can be a lot of output which makes it hard to spot
+		  # the error so we are going to print just the error summary
+		  # first. Quite a mess, I agree.
+		  #
+		  sd="$(sed -n -e '1p' <<<"$lr")"
+		  sd="$(date '+%s' -d "$sd")"                           # sec
+		  sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec
+
+		  ed="$(sed -n -e '2p' <<<"$lr")"
+		  ed="$(date '+%s' -d "$ed")"                           # sec
+		  ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec
+
+		  s="bbot-agent-$tn@$i service issued new diagnostics"
+
+		  info "$s"
+		  {
+		    echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
+		    echo;
+		    echo "summary:";
+		    echo;
+		    "${c[@]}" --priority 4 | head -n 200;
+		    echo;
+		    echo "context:";
+		    echo;
+		    if [ -n "$oc" ]; then
+		      unset 'c[-1]' # Pop cursor (for --since/--until).
+		      unset 'c[-1]'
+		    fi;
+		    "${c[@]}" --since "$sd" --until "$ed" | head -n 200
+		  } | email "$s"
+		fi
+
+		toolchain_cursors["$tn/$i"]="$nc"
 	      fi
-
-	      toolchain_cursors["$tn"]="$nc"
 	    fi
+	  done
 
-	    continue
-	  fi
+	  continue # We have already issued diagnostics, if any.
 	  ;;
 	1)
 	  s="re"
 	  ;&
 	2)
-	  info "${s}starting bbot-agent@$tn..."
+	  info "${s}starting bbot-agent for $tn..."
 
           # Note: appending to the same log.
 	  #
-	  bbot_start "$tn" "$ti" 2>&1 | tee -a "$tr/bbot-$count.log" 1>&2
+	  bbot_start "$tn" "$tx" 2>&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2
 
 	  if [ "${PIPESTATUS[0]}" -eq 0 ]; then
-	    s="${s}started bbot-agent@$tn"
+	    s="${s}started bbot-agent for $tn, $ti instances"
 	  else
-	    s="failed to ${s}start bbot-agent@$tn, waiting for new version"
+	    s="failed to ${s}start bbot-agent for $tn, waiting for new version"
 	  fi
 	  ;;
 	*)
-	  s="failed to fetch package information for '$tn' toolchain, will try again"
+	  s="failed to fetch package information for $tn toolchain, will try again"
 	  ;;
       esac
 
       info "$s"
       email "$s" <<EOF
-$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-$count.log
+$tn.bbot_log: tftp://$hname/toolchains/$tn/bbot-agent-$count.log
 EOF
     done
   fi
 
-  # Check for OS changes.
-  #
-  if [ -n "$buildid_url" ]; then
-    # Fetch the current id. While normally it will be a TFTP URL, it could also
-    # be HTTP(S) so we configure sensible behavior for that.
-    #
-    if id="$("${curl[@]}" "$buildid_url")"; then
-      if [ "$id" != "$buildid" ]; then
-	email "rebooting because of new os build" <<EOF
-old_buildid: $buildid
-new_buildid: $id
-EOF
-	info "new os build ($id), rebooting..."
-	restart
-      fi
-    else
-      info "unable to fetch $buildid_url, will try again"
-    fi
-  fi
-
   sensors -A
   info "monitoring..."
   sleep 60
-- 
cgit v1.1