about summary refs log tree commit diff
path: root/buildos
diff options
context:
space:
mode:
Diffstat (limited to 'buildos')
-rwxr-xr-x buildos 195
1 files changed, 157 insertions, 38 deletions
diff --git a/buildos b/buildos
index 85d1d12..dc4cd55 100755
--- a/buildos
+++ b/buildos
@@ -29,6 +29,16 @@ function error ()
exit 1
}
+# Note: the arch variant is patched in by the bootstrap script.
+#
+arch="$(uname -m)"
+arch_variant=
+
+arch_with_variant="$arch"
+if [ -n "$arch_variant" ]; then
+ arch_with_variant="$arch_with_variant-$arch_variant"
+fi
+
# Network timeouts: 60 seconds to connect, 10 minutes to complete, 4 retries
# (5 attempts total). These are similar to bbot timeouts. Note that the
# toolchain archives can be quite sizable.
@@ -49,6 +59,11 @@ info "starting build os monitor..."
# foo='foo fox'
# bar="bar 'box'"
#
+# Or (as rewritten by GRUB):
+#
+# 'foo=foo fox'
+# "bar=bar 'box'"
+#
# First we separate quoted variables and arguments with newlines (giving
# priority to assignments). Then we replace whitespace with newlines on
# lines that don't contain quotes. Finally, we clean up by removing blank
@@ -71,11 +86,24 @@ declare -A toolchains
toolchains["default"]=""
for v in "${cmdline[@]}"; do
- var="$(sed -n -re 's/^buildos\.([^=]+)=.*$/\1/p' <<<"$v")" # Extract name.
+
+ # Rewrite "x=y" as x="y" (as well as the single-quote variant).
+ #
+ v1="$(sed -n -re "s/^\"([^= ]+)=(.*)\"\$/\1=\"\2\"/p" <<<"$v")"
+ if [ -n "$v1" ]; then
+ v="$v1"
+ else
+ v1="$(sed -n -re "s/^'([^= ]+)=(.*)'\$/\1='\2'/p" <<<"$v")"
+ if [ -n "$v1" ]; then
+ v="$v1"
+ fi
+ fi
+
+ var="$(sed -n -re 's/^buildos\.([^= ]+)=.*$/\1/p' <<<"$v")" # Extract name.
if [ -n "$var" ]; then
- val="$(sed -re 's/^[^=]+=(.*)$/\1/' <<<"$v")" # Extract value.
- val="$(sed -re "s/^('(.*)'|\"(.*)\")$/\2\3/" <<<"$val")" # Strip quoted.
+ val="$(sed -re 's/^[^= ]+=(.*)$/\1/' <<<"$v")" # Extract value.
+ val="$(sed -re "s/^('(.*)'|\"(.*)\")\$/\2\3/" <<<"$val")" # Strip quoted.
# Recognize some variables as arrays.
#
@@ -122,14 +150,17 @@ function restart ()
sudo systemctl reboot
}
-if [ -z "$buildid_url" ]; then
+if [ -n "$buildid_url" ]; then
+ buildid_url="$buildid_url-$arch_with_variant"
+else
info "no buildos.buildid_url specified, not monitoring for new os builds"
fi
# Process toolchains.
#
-# Return the value of one of the toolchain_* variables for this toolchain.
+# Return the value of one of the <variable>.<toolchain> variables for this
+# toolchain.
#
function toolchain_value () # <toolchain-prefix> <variable>
{
@@ -171,6 +202,12 @@ for tn in "${!toolchains[@]}"; do
instances=$(($instances + $(toolchain_value "$tp" instances)))
+ # Default to non-interactive-only.
+ #
+ if [ -z "$(toolchain_value "$tp" interactive)" ]; then
+ declare "${tp}interactive=false"
+ fi
+
# Default to 0 nice value.
#
if [ -z "$(toolchain_value "$tp" nice)" ]; then
@@ -208,27 +245,36 @@ fi
#
# Note that MemTotal in /proc/meminfo is the available memory, not physical.
# And to make it easier to provision memory it's really helpful to base it
-# in the physical value.
+# on the physical value.
#
-ram_total=0
-for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do
- ram_total=$(($ram_total + $i * 1024))
-done
+if [ -z "$ram_total" ]; then
+ ram_total=0
+ for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*GB.*$/\1/p'); do
+ ram_total=$(($ram_total + $i * 1024 * 1024))
+ done
-if [ "$ram_total" -eq 0 ]; then
- error "unable to determine physical memory size"
+ if [ "$ram_total" -eq 0 ]; then
+ error "unable to determine physical memory size, use buildos.ram_total to specify"
+ fi
+else
+ ram_total=$(($ram_total * 1024 * 1024))
fi
cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')"
+# RAM reserved to the host.
+#
if [ -z "$ram_reserved" ]; then
ram_reserved=4
fi
ram_reserved=$(($ram_reserved * 1024 * 1024))
-if [ -z "$ram_overcommit" ]; then
- ram_overcommit=1
+# RAM reserved for auxiliary machines.
+#
+if [ -z "$ram_auxiliary" ]; then
+ ram_auxiliary=0
fi
+ram_auxiliary=$(($ram_auxiliary * 1024 * 1024))
if [ -z "$cpu_reserved" ]; then
cpu_reserved=0
@@ -238,11 +284,13 @@ if [ -z "$cpu_overcommit" ]; then
cpu_overcommit=1
fi
-ram_slice=$(($ram_total - $ram_reserved))
+ram_build_slice=$(($ram_total - $ram_reserved - $ram_auxiliary))
+ram_auxil_slice=$ram_auxiliary
cpu_slice=$(($cpu_total - $cpu_reserved))
if [ "$instances" -gt 1 ]; then
- ram_slice=$(($ram_slice * $ram_overcommit / $instances))
+ ram_build_slice=$(($ram_build_slice / $instances))
+ ram_auxil_slice=$(($ram_auxil_slice / $instances))
cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances))
if [ "$cpu_slice" -eq 0 ]; then
@@ -254,20 +302,24 @@ fi
#
function print ()
{
- echo "cpu_total: $cpu_total"
- echo "cpu_reserved: $cpu_reserved"
- echo "cpu_overcommit: $cpu_overcommit"
- echo "cpu_slice: $cpu_slice"
+ echo "cpu_total: $cpu_total"
+ echo "cpu_reserved: $cpu_reserved"
+ echo "cpu_overcommit: $cpu_overcommit"
+ echo "cpu_slice: $cpu_slice"
+ if [ -n "$cpu_affinity" ]; then
+ echo "cpu_affinity: $cpu_affinity"
+ fi
echo
- echo "ram_total: $ram_total KB"
- echo "ram_reserved: $ram_reserved KB"
- echo "ram_overcommit: $ram_overcommit"
- echo "ram_slice: $ram_slice KB"
+ echo "ram_total: $ram_total KiB"
+ echo "ram_reserved: $ram_reserved KiB"
+ echo "ram_auxiliary: $ram_auxiliary KiB"
+ echo "ram_build_slice: $ram_build_slice KiB"
+ echo "ram_auxil_slice: $ram_auxil_slice KiB"
echo
- echo "buildid: $buildid"
- echo "buildid_url: $buildid_url"
+ echo "buildid: $buildid"
+ echo "buildid_url: $buildid_url"
echo
local n i tn tp tu tt
@@ -276,23 +328,41 @@ function print ()
tc="$(toolchain_value "$tp" nice)"
tb="$(toolchain_value "$tp" bridge)"
ti="$(toolchain_value "$tp" instances)"
+ ta="$(toolchain_value "$tp" interactive)"
tu="$(toolchain_value "$tp" toolchain_url)"
tt="$(toolchain_value "$tp" toolchain_trust)"
- echo "$tn.nice: $tc"
- echo "$tn.bridge: $tb"
- echo "$tn.instances: $ti"
- echo "$tn.toolchain_url: $tu"
- echo "$tn.toolchain_trust: $tt"
+ tbt="$(toolchain_value "$tp" build_timeout)"
+ tst="$(toolchain_value "$tp" bootstrap_timeout)"
+ tat="$(toolchain_value "$tp" interactive_timeout)"
+
+ echo "$tn.nice: $tc"
+ echo "$tn.bridge: $tb"
+ echo "$tn.instances: $ti"
+ echo "$tn.interactive: $ta"
+ echo "$tn.toolchain_url: $tu"
+ echo "$tn.toolchain_trust: $tt"
+
+ if [ -n "$tbt" ]; then
+ echo "$tn.build_timeout: $tbt"
+ fi
+
+ if [ -n "$tst" ]; then
+ echo "$tn.bootstrap_timeout: $tst"
+ fi
+
+ if [ -n "$tat" ]; then
+ echo "$tn.interactive_timeout: $tat"
+ fi
n="${tp}controller_url[@]"
for i in "${!n}"; do
- echo "$tn.controller_url: $i"
+ echo "$tn.controller_url: $i"
done
n="${tp}controller_trust[@]"
for i in "${!n}"; do
- echo "$tn.controller_trust: $i"
+ echo "$tn.controller_trust: $i"
done
echo
@@ -801,9 +871,14 @@ function bbot_start () # <toolchain-name> <toolchain-index>
local tc="$(toolchain_value "$tp" nice)"
local tb="$(toolchain_value "$tp" bridge)"
local ti="$(toolchain_value "$tp" instances)"
+ local ta="$(toolchain_value "$tp" interactive)"
local tv="$(toolchain_value "$tp" toolchain_fver)"
local ts="$(toolchain_value "$tp" toolchain_file_csum)"
+ local tbt="$(toolchain_value "$tp" build_timeout)"
+ local tst="$(toolchain_value "$tp" bootstrap_timeout)"
+ local tat="$(toolchain_value "$tp" interactive_timeout)"
+
local id="/build/bots/$tn"
mkdir -p "$id"
@@ -826,7 +901,9 @@ function bbot_start () # <toolchain-name> <toolchain-index>
#
if [ "$b_word" = "configured" ]; then
- for ((i=1; i <= ti; i++)); do
+ # Note: stop extra instance.
+ #
+ for ((i=1; i <= ti + 1; i++)); do
if ! sudo systemctl stop "bbot-agent-$tn@$i"; then
info "failed to stop bbot-agent-$tn@$i service, assuming not running"
continue
@@ -867,17 +944,44 @@ function bbot_start () # <toolchain-name> <toolchain-index>
sed -i -r \
-e "s#%I#$tn/%I#g" \
-e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \
- -e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \
+ -e "s/^(Environment=RAM_BUILD)=.*/\1=$ram_build_slice/" \
+ -e "s/^(Environment=RAM_AUXIL)=.*/\1=$ram_auxil_slice/" \
-e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \
-e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
+ -e "s/^(Environment=INTERACTIVE)=.*/\1=$ta/" \
-e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \
-e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \
-e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \
-e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \
+ -e "s/^(Environment=INSTANCE_MAX)=.*/\1=$ti/" \
-e "s/^(Nice)=.*/\1=$tc/" \
-e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \
"$id/lib/systemd/system/bbot-agent@.service"
+ # Patch in CPU affinity.
+ #
+ if [ -n "$cpu_affinity" ]; then
+ sed -i -r -e "s/^(CPUAffinity)=.*/\1=$cpu_affinity/" \
+ "$id/lib/systemd/system/bbot-agent@.service"
+ fi
+
+ # Patch in build/bootstrap/interactive timeouts.
+ #
+ if [ -n "$tbt" ]; then
+ sed -i -r -e "s/^(Environment=BUILD_TIMEOUT)=.*/\1=$tbt/" \
+ "$id/lib/systemd/system/bbot-agent@.service"
+ fi
+
+ if [ -n "$tst" ]; then
+ sed -i -r -e "s/^(Environment=BOOTSTRAP_TIMEOUT)=.*/\1=$tst/" \
+ "$id/lib/systemd/system/bbot-agent@.service"
+ fi
+
+ if [ -n "$tat" ]; then
+ sed -i -r -e "s/^(Environment=INTERACTIVE_TIMEOUT)=.*/\1=$tat/" \
+ "$id/lib/systemd/system/bbot-agent@.service"
+ fi
+
# Patch in the controller URLs. These can contain special characters
# like `&` so we have to escape them.
#
@@ -927,8 +1031,10 @@ function bbot_start () # <toolchain-name> <toolchain-index>
# the process has forked. Making sure the service has actually started is
# done as part of the service monitoring.
#
+ # Note: start extra instance.
+ #
r=0
- for ((i=1; i <= ti; i++)); do
+ for ((i=1; i <= ti + 1; i++)); do
if ! sudo systemctl start "bbot-agent-$tn@$i"; then
info "failed to start bbot-agent-$tn@$i service instance"
r=1
@@ -953,6 +1059,7 @@ declare -A toolchain_cursors # Latest systemd journal cursor.
# Monitoring loop.
#
+sensors=true
count=0
while true; do
@@ -1102,7 +1209,9 @@ EOF
# For each service instance check if it has failed.
#
- for ((i=1; i <= ti; i++)); do
+ # Note: check extra instance.
+ #
+ for ((i=1; i <= ti + 1; i++)); do
if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then
s="bbot-agent-$tn@$i service has failed, stopping"
@@ -1198,6 +1307,8 @@ EOF
# subject line (note that there can be a mix so we have to
# try in the priority order).
#
+ # @@ pipefail
+ #
p=2
s="$("${c[@]}" --output cat --priority 2 | head -n 1)"
if [ -z "$s" ]; then
@@ -1214,6 +1325,8 @@ EOF
s="bbot-agent-$tn@$i: $s"
+ # @@ pipefail
+ #
info "$s"
{
echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}";
@@ -1267,7 +1380,13 @@ EOF
done
fi
- sensors -A
+ if [ "$sensors" ]; then
+ if ! sensors -A; then
+ info "unable to query sensors, disabling"
+ sensors=
+ fi
+ fi
+
info "monitoring..."
sleep 60
done