1 files changed, 157 insertions, 38 deletions
diff --git a/buildos b/buildos
index 85d1d12..dc4cd55 100755
--- a/buildos
+++ b/buildos
@@ -29,6 +29,16 @@ function error ()
   exit 1
 }
 
+# Note: the arch_variant value below is patched in by the bootstrap script.
+#
+arch="$(uname -m)"
+arch_variant=
+
+arch_with_variant="$arch"
+if [ -n "$arch_variant" ]; then
+  arch_with_variant="$arch_with_variant-$arch_variant"
+fi
+
 # Network timeouts: 60 seconds to connect, 10 minutes to complete, 4 retries
 # (5 attempts total). These are similar to bbot timeouts. Note that the
 # toolchain archives can be quite sizable.
@@ -49,6 +59,11 @@ info "starting build os monitor..."
 # foo='foo fox'
 # bar="bar 'box'"
 #
+# Or (as rewritten by GRUB):
+#
+# 'foo=foo fox'
+# "bar=bar 'box'"
+#
 # First we separete quoted variables and arguments with newlines (giving
 # priority to assignments). Then we replace whitespaces with newline on
 # lines that don't contain quites. Finally, we clean up by removing blank
@@ -71,11 +86,24 @@ declare -A toolchains
 toolchains["default"]=""
 
 for v in "${cmdline[@]}"; do
-  var="$(sed -n -re 's/^buildos\.([^=]+)=.*$/\1/p' <<<"$v")" # Extract name.
+
+  # Rewrite "x=y" as x="y" (as well as the single-quote variant).
+  #
+  v1="$(sed -n -re "s/^\"([^= ]+)=(.*)\"\$/\1=\"\2\"/p" <<<"$v")"
+  if [ -n "$v1" ]; then
+    v="$v1"
+  else
+    v1="$(sed -n -re "s/^'([^= ]+)=(.*)'\$/\1='\2'/p" <<<"$v")"
+    if [ -n "$v1" ]; then
+      v="$v1"
+    fi
+  fi
+
+  var="$(sed -n -re 's/^buildos\.([^= ]+)=.*$/\1/p' <<<"$v")" # Extract name.
 
   if [ -n "$var" ]; then
-    val="$(sed -re 's/^[^=]+=(.*)$/\1/' <<<"$v")"            # Extract value.
-    val="$(sed -re "s/^('(.*)'|\"(.*)\")$/\2\3/" <<<"$val")" # Strip quoted.
+    val="$(sed -re 's/^[^= ]+=(.*)$/\1/' <<<"$v")"            # Extract value.
+    val="$(sed -re "s/^('(.*)'|\"(.*)\")\$/\2\3/" <<<"$val")" # Strip quoted.
 
     # Recognize some variables as arrays.
     #
@@ -122,14 +150,17 @@ function restart ()
   sudo systemctl reboot
 }
 
-if [ -z "$buildid_url" ]; then
+if [ -n "$buildid_url" ]; then
+  buildid_url="$buildid_url-$arch_with_variant"
+else
   info "no buildos.buildid_url specified, not monitoring for new os builds"
 fi
 
 # Process toolchains.
 #
 
-# Return the value of one of the toolchain_* variables for this toolchain.
+# Return the value of one of the <variable>.<toolchain> variables for this
+# toolchain.
 #
 function toolchain_value () # <toolchain-prefix> <variable>
 {
@@ -171,6 +202,12 @@ for tn in "${!toolchains[@]}"; do
 
   instances=$(($instances + $(toolchain_value "$tp" instances)))
 
+  # Default to non-interactive only (interactive=false).
+  #
+  if [ -z "$(toolchain_value "$tp" interactive)" ]; then
+    declare "${tp}interactive=false"
+  fi
+
   # Default to 0 nice value.
   #
   if [ -z "$(toolchain_value "$tp" nice)" ]; then
@@ -208,27 +245,36 @@ fi
 #
 # Note that MemTotal in /proc/meminfo is the available memory, not physical.
 # And to make it easier to provision memory it's really helpful to base it
-# in the physical value.
+# on the physical value.
 #
-ram_total=0
-for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do
-  ram_total=$(($ram_total + $i * 1024))
-done
+if [ -z "$ram_total" ]; then
+  ram_total=0
+  for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*GB.*$/\1/p'); do
+    ram_total=$(($ram_total + $i * 1024 * 1024))
+  done
 
-if [ "$ram_total" -eq 0 ]; then
-  error "unable to determine physical memory size"
+  if [ "$ram_total" -eq 0 ]; then
+    error "unable to determine physical memory size, use buildos.ram_total to specify"
+  fi
+else
+  ram_total=$(($ram_total * 1024 * 1024))
 fi
 
 cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')"
 
+# RAM reserved for the host (specified in GiB, converted to KiB).
+#
 if [ -z "$ram_reserved" ]; then
   ram_reserved=4
 fi
 ram_reserved=$(($ram_reserved * 1024 * 1024))
 
-if [ -z "$ram_overcommit" ]; then
-  ram_overcommit=1
+# RAM reserved for auxiliary machines (specified in GiB, converted to KiB).
+#
+if [ -z "$ram_auxiliary" ]; then
+  ram_auxiliary=0
 fi
+ram_auxiliary=$(($ram_auxiliary * 1024 * 1024))
 
 if [ -z "$cpu_reserved" ]; then
   cpu_reserved=0
@@ -238,11 +284,13 @@ if [ -z "$cpu_overcommit" ]; then
   cpu_overcommit=1
 fi
 
-ram_slice=$(($ram_total - $ram_reserved))
+ram_build_slice=$(($ram_total - $ram_reserved - $ram_auxiliary))
+ram_auxil_slice=$ram_auxiliary
 cpu_slice=$(($cpu_total - $cpu_reserved))
 
 if [ "$instances" -gt 1 ]; then
-  ram_slice=$(($ram_slice * $ram_overcommit / $instances))
+  ram_build_slice=$(($ram_build_slice / $instances))
+  ram_auxil_slice=$(($ram_auxil_slice / $instances))
   cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances))
 
   if [ "$cpu_slice" -eq 0 ]; then
@@ -254,20 +302,24 @@ fi
 #
 function print ()
 {
-  echo "cpu_total:      $cpu_total"
-  echo "cpu_reserved:   $cpu_reserved"
-  echo "cpu_overcommit: $cpu_overcommit"
-  echo "cpu_slice:      $cpu_slice"
+  echo "cpu_total:       $cpu_total"
+  echo "cpu_reserved:    $cpu_reserved"
+  echo "cpu_overcommit:  $cpu_overcommit"
+  echo "cpu_slice:       $cpu_slice"
+  if [ -n "$cpu_affinity" ]; then
+    echo "cpu_affinity:    $cpu_affinity"
+  fi
   echo
 
-  echo "ram_total:      $ram_total KB"
-  echo "ram_reserved:   $ram_reserved KB"
-  echo "ram_overcommit: $ram_overcommit"
-  echo "ram_slice:      $ram_slice KB"
+  echo "ram_total:       $ram_total KiB"
+  echo "ram_reserved:    $ram_reserved KiB"
+  echo "ram_auxiliary:   $ram_auxiliary KiB"
+  echo "ram_build_slice: $ram_build_slice KiB"
+  echo "ram_auxil_slice: $ram_auxil_slice KiB"
   echo
 
-  echo "buildid:        $buildid"
-  echo "buildid_url:    $buildid_url"
+  echo "buildid:         $buildid"
+  echo "buildid_url:     $buildid_url"
   echo
 
   local n i tn tp tu tt
@@ -276,23 +328,41 @@ function print ()
     tc="$(toolchain_value "$tp" nice)"
     tb="$(toolchain_value "$tp" bridge)"
     ti="$(toolchain_value "$tp" instances)"
+    ta="$(toolchain_value "$tp" interactive)"
     tu="$(toolchain_value "$tp" toolchain_url)"
     tt="$(toolchain_value "$tp" toolchain_trust)"
 
-    echo "$tn.nice:              $tc"
-    echo "$tn.bridge:            $tb"
-    echo "$tn.instances:         $ti"
-    echo "$tn.toolchain_url:     $tu"
-    echo "$tn.toolchain_trust:   $tt"
+    tbt="$(toolchain_value "$tp" build_timeout)"
+    tst="$(toolchain_value "$tp" bootstrap_timeout)"
+    tat="$(toolchain_value "$tp" interactive_timeout)"
+
+    echo "$tn.nice:                $tc"
+    echo "$tn.bridge:              $tb"
+    echo "$tn.instances:           $ti"
+    echo "$tn.interactive:         $ta"
+    echo "$tn.toolchain_url:       $tu"
+    echo "$tn.toolchain_trust:     $tt"
+
+    if [ -n "$tbt" ]; then
+      echo "$tn.build_timeout:       $tbt"
+    fi
+
+    if [ -n "$tst" ]; then
+      echo "$tn.bootstrap_timeout:   $tst"
+    fi
+
+    if [ -n "$tat" ]; then
+      echo "$tn.interactive_timeout: $tat"
+    fi
 
     n="${tp}controller_url[@]"
     for i in "${!n}"; do
-      echo "$tn.controller_url:    $i"
+      echo "$tn.controller_url:      $i"
     done
 
     n="${tp}controller_trust[@]"
     for i in "${!n}"; do
-      echo "$tn.controller_trust:  $i"
+      echo "$tn.controller_trust:    $i"
     done
 
     echo
@@ -801,9 +871,14 @@ function bbot_start () # <toolchain-name> <toolchain-index>
   local tc="$(toolchain_value "$tp" nice)"
   local tb="$(toolchain_value "$tp" bridge)"
   local ti="$(toolchain_value "$tp" instances)"
+  local ta="$(toolchain_value "$tp" interactive)"
   local tv="$(toolchain_value "$tp" toolchain_fver)"
   local ts="$(toolchain_value "$tp" toolchain_file_csum)"
 
+  local tbt="$(toolchain_value "$tp" build_timeout)"
+  local tst="$(toolchain_value "$tp" bootstrap_timeout)"
+  local tat="$(toolchain_value "$tp" interactive_timeout)"
+
   local id="/build/bots/$tn"
   mkdir -p "$id"
 
@@ -826,7 +901,9 @@ function bbot_start () # <toolchain-name> <toolchain-index>
     #
     if [ "$b_word" = "configured" ]; then
 
-      for ((i=1; i <= ti; i++)); do
+      # Note: also stop the extra instance (ti + 1).
+      #
+      for ((i=1; i <= ti + 1; i++)); do
 	if ! sudo systemctl stop "bbot-agent-$tn@$i"; then
 	  info "failed to stop bbot-agent-$tn@$i service, assuming not running"
 	  continue
@@ -867,17 +944,44 @@ function bbot_start () # <toolchain-name> <toolchain-index>
     sed -i -r \
 	-e "s#%I#$tn/%I#g" \
 	-e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \
-	-e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \
+	-e "s/^(Environment=RAM_BUILD)=.*/\1=$ram_build_slice/" \
+	-e "s/^(Environment=RAM_AUXIL)=.*/\1=$ram_auxil_slice/" \
 	-e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \
 	-e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
+	-e "s/^(Environment=INTERACTIVE)=.*/\1=$ta/" \
 	-e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \
 	-e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \
 	-e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \
 	-e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \
+	-e "s/^(Environment=INSTANCE_MAX)=.*/\1=$ti/" \
 	-e "s/^(Nice)=.*/\1=$tc/" \
 	-e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \
 	"$id/lib/systemd/system/bbot-agent@.service"
 
+    # Patch in CPU affinity.
+    #
+    if [ -n "$cpu_affinity" ]; then
+      sed -i -r -e "s/^(CPUAffinity)=.*/\1=$cpu_affinity/" \
+	  "$id/lib/systemd/system/bbot-agent@.service"
+    fi
+
+    # Patch in build/bootstrap/interactive timeouts.
+    #
+    if [ -n "$tbt" ]; then
+      sed -i -r -e "s/^(Environment=BUILD_TIMEOUT)=.*/\1=$tbt/" \
+	  "$id/lib/systemd/system/bbot-agent@.service"
+    fi
+
+    if [ -n "$tst" ]; then
+      sed -i -r -e "s/^(Environment=BOOTSTRAP_TIMEOUT)=.*/\1=$tst/" \
+	  "$id/lib/systemd/system/bbot-agent@.service"
+    fi
+
+    if [ -n "$tat" ]; then
+      sed -i -r -e "s/^(Environment=INTERACTIVE_TIMEOUT)=.*/\1=$tat/" \
+	  "$id/lib/systemd/system/bbot-agent@.service"
+    fi
+
     # Patch in the controller URLs. These can contain special characters
     # like `&` so we have to escape them.
     #
@@ -927,8 +1031,10 @@ function bbot_start () # <toolchain-name> <toolchain-index>
     # the process has forked. Making sure the service has actually started is
     # done as part of the service monitoring.
     #
+    # Note: also start the extra instance (ti + 1).
+    #
     r=0
-    for ((i=1; i <= ti; i++)); do
+    for ((i=1; i <= ti + 1; i++)); do
       if ! sudo systemctl start "bbot-agent-$tn@$i"; then
 	info "failed to start bbot-agent-$tn@$i service instance"
 	r=1
@@ -953,6 +1059,7 @@ declare -A toolchain_cursors # Latest systemd journal cursor.
 
 # Monitoring loop.
 #
+sensors=true
 count=0
 while true; do
 
@@ -1102,7 +1209,9 @@ EOF
 
 	  # For each service instance check if it has failed.
 	  #
-	  for ((i=1; i <= ti; i++)); do
+	  # Note: also check the extra instance (ti + 1).
+	  #
+	  for ((i=1; i <= ti + 1; i++)); do
 
 	    if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then
 	      s="bbot-agent-$tn@$i service has failed, stopping"
@@ -1198,6 +1307,8 @@ EOF
 		  # subject line (note that there can be a mix so we have to
 		  # try in the priority order).
 		  #
+		  # @@ pipefail
+		  #
 		  p=2
 		  s="$("${c[@]}" --output cat --priority 2 | head -n 1)"
 		  if [ -z "$s" ]; then
@@ -1214,6 +1325,8 @@ EOF
 
 		  s="bbot-agent-$tn@$i: $s"
 
+		  # @@ pipefail
+		  #
 		  info "$s"
 		  {
 		    echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
@@ -1267,7 +1380,13 @@ EOF
     done
   fi
 
-  sensors -A
+  if [ "$sensors" ]; then
+    if ! sensors -A; then
+      info "unable to query sensors, disabling"
+      sensors=
+    fi
+  fi
+
   info "monitoring..."
   sleep 60
 done