diff options
Diffstat (limited to 'buildos')
-rwxr-xr-x | buildos | 195 |
1 files changed, 157 insertions, 38 deletions
@@ -29,6 +29,16 @@ function error () exit 1 } +# Note: the arch variant is patched in by the bootstrap script. +# +arch="$(uname -m)" +arch_variant= + +arch_with_variant="$arch" +if [ -n "$arch_variant" ]; then + arch_with_variant="$arch_with_variant-$arch_variant" +fi + # Network timeouts: 60 seconds to connect, 10 minutes to complete, 4 retries # (5 attempts total). These are similar to bbot timeouts. Note that the # toolchain archives can be quite sizable. @@ -49,6 +59,11 @@ info "starting build os monitor..." # foo='foo fox' # bar="bar 'box'" # +# Or (as rewritten by GRUB): +# +# 'foo=foo fox' +# "bar=bar 'box'" +# # First we separate quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on # lines that don't contain quotes. Finally, we clean up by removing blank @@ -71,11 +86,24 @@ declare -A toolchains toolchains["default"]="" for v in "${cmdline[@]}"; do - var="$(sed -n -re 's/^buildos\.([^=]+)=.*$/\1/p' <<<"$v")" # Extract name. + + # Rewrite "x=y" as x="y" (as well as the single-quote variant). + # + v1="$(sed -n -re "s/^\"([^= ]+)=(.*)\"\$/\1=\"\2\"/p" <<<"$v")" + if [ -n "$v1" ]; then + v="$v1" + else + v1="$(sed -n -re "s/^'([^= ]+)=(.*)'\$/\1='\2'/p" <<<"$v")" + if [ -n "$v1" ]; then + v="$v1" + fi + fi + + var="$(sed -n -re 's/^buildos\.([^= ]+)=.*$/\1/p' <<<"$v")" # Extract name. if [ -n "$var" ]; then - val="$(sed -re 's/^[^=]+=(.*)$/\1/' <<<"$v")" # Extract value. - val="$(sed -re "s/^('(.*)'|\"(.*)\")$/\2\3/" <<<"$val")" # Strip quoted. + val="$(sed -re 's/^[^= ]+=(.*)$/\1/' <<<"$v")" # Extract value. + val="$(sed -re "s/^('(.*)'|\"(.*)\")\$/\2\3/" <<<"$val")" # Strip quoted. # Recognize some variables as arrays. 
# @@ -122,14 +150,17 @@ function restart () sudo systemctl reboot } -if [ -z "$buildid_url" ]; then +if [ -n "$buildid_url" ]; then + buildid_url="$buildid_url-$arch_with_variant" +else info "no buildos.buildid_url specified, not monitoring for new os builds" fi # Process toolchains. # -# Return the value of one of the toolchain_* variables for this toolchain. +# Return the value of one of the <variable>.<toolchain> variables for this +# toolchain. # function toolchain_value () # <toolchain-prefix> <variable> { @@ -171,6 +202,12 @@ for tn in "${!toolchains[@]}"; do instances=$(($instances + $(toolchain_value "$tp" instances))) + # Default to non-interactive-only. + # + if [ -z "$(toolchain_value "$tp" interactive)" ]; then + declare "${tp}interactive=false" + fi + # Default to 0 nice value. # if [ -z "$(toolchain_value "$tp" nice)" ]; then @@ -208,27 +245,36 @@ fi # # Note that MemTotal in /proc/meminfo is the available memory, not physical. # And to make it easier to provision memory it's really helpful to base it -# in the physical value. +# on the physical value. # -ram_total=0 -for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*MB.*$/\1/p'); do - ram_total=$(($ram_total + $i * 1024)) -done +if [ -z "$ram_total" ]; then + ram_total=0 + for i in $(sudo dmidecode -t 17 | sed -n -re 's/^\s*Size:\s*([0-9]+)\s*GB.*$/\1/p'); do + ram_total=$(($ram_total + $i * 1024 * 1024)) + done -if [ "$ram_total" -eq 0 ]; then - error "unable to determine physical memory size" + if [ "$ram_total" -eq 0 ]; then + error "unable to determine physical memory size, use buildos.ram_total to specify" + fi +else + ram_total=$(($ram_total * 1024 * 1024)) fi cpu_total="$(lscpu | sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p')" +# RAM reserved to the host. +# if [ -z "$ram_reserved" ]; then ram_reserved=4 fi ram_reserved=$(($ram_reserved * 1024 * 1024)) -if [ -z "$ram_overcommit" ]; then - ram_overcommit=1 +# RAM reserved for auxiliary machines. 
+# +if [ -z "$ram_auxiliary" ]; then + ram_auxiliary=0 fi +ram_auxiliary=$(($ram_auxiliary * 1024 * 1024)) if [ -z "$cpu_reserved" ]; then cpu_reserved=0 @@ -238,11 +284,13 @@ if [ -z "$cpu_overcommit" ]; then cpu_overcommit=1 fi -ram_slice=$(($ram_total - $ram_reserved)) +ram_build_slice=$(($ram_total - $ram_reserved - $ram_auxiliary)) +ram_auxil_slice=$ram_auxiliary cpu_slice=$(($cpu_total - $cpu_reserved)) if [ "$instances" -gt 1 ]; then - ram_slice=$(($ram_slice * $ram_overcommit / $instances)) + ram_build_slice=$(($ram_build_slice / $instances)) + ram_auxil_slice=$(($ram_auxil_slice / $instances)) cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances)) if [ "$cpu_slice" -eq 0 ]; then @@ -254,20 +302,24 @@ fi # function print () { - echo "cpu_total: $cpu_total" - echo "cpu_reserved: $cpu_reserved" - echo "cpu_overcommit: $cpu_overcommit" - echo "cpu_slice: $cpu_slice" + echo "cpu_total: $cpu_total" + echo "cpu_reserved: $cpu_reserved" + echo "cpu_overcommit: $cpu_overcommit" + echo "cpu_slice: $cpu_slice" + if [ -n "$cpu_affinity" ]; then + echo "cpu_affinity: $cpu_affinity" + fi echo - echo "ram_total: $ram_total KB" - echo "ram_reserved: $ram_reserved KB" - echo "ram_overcommit: $ram_overcommit" - echo "ram_slice: $ram_slice KB" + echo "ram_total: $ram_total KiB" + echo "ram_reserved: $ram_reserved KiB" + echo "ram_auxiliary: $ram_auxiliary KiB" + echo "ram_build_slice: $ram_build_slice KiB" + echo "ram_auxil_slice: $ram_auxil_slice KiB" echo - echo "buildid: $buildid" - echo "buildid_url: $buildid_url" + echo "buildid: $buildid" + echo "buildid_url: $buildid_url" echo local n i tn tp tu tt @@ -276,23 +328,41 @@ function print () tc="$(toolchain_value "$tp" nice)" tb="$(toolchain_value "$tp" bridge)" ti="$(toolchain_value "$tp" instances)" + ta="$(toolchain_value "$tp" interactive)" tu="$(toolchain_value "$tp" toolchain_url)" tt="$(toolchain_value "$tp" toolchain_trust)" - echo "$tn.nice: $tc" - echo "$tn.bridge: $tb" - echo "$tn.instances: $ti" - echo 
"$tn.toolchain_url: $tu" - echo "$tn.toolchain_trust: $tt" + tbt="$(toolchain_value "$tp" build_timeout)" + tst="$(toolchain_value "$tp" bootstrap_timeout)" + tat="$(toolchain_value "$tp" interactive_timeout)" + + echo "$tn.nice: $tc" + echo "$tn.bridge: $tb" + echo "$tn.instances: $ti" + echo "$tn.interactive: $ta" + echo "$tn.toolchain_url: $tu" + echo "$tn.toolchain_trust: $tt" + + if [ -n "$tbt" ]; then + echo "$tn.build_timeout: $tbt" + fi + + if [ -n "$tst" ]; then + echo "$tn.bootstrap_timeout: $tst" + fi + + if [ -n "$tat" ]; then + echo "$tn.interactive_timeout: $tat" + fi n="${tp}controller_url[@]" for i in "${!n}"; do - echo "$tn.controller_url: $i" + echo "$tn.controller_url: $i" done n="${tp}controller_trust[@]" for i in "${!n}"; do - echo "$tn.controller_trust: $i" + echo "$tn.controller_trust: $i" done echo @@ -801,9 +871,14 @@ function bbot_start () # <toolchain-name> <toolchain-index> local tc="$(toolchain_value "$tp" nice)" local tb="$(toolchain_value "$tp" bridge)" local ti="$(toolchain_value "$tp" instances)" + local ta="$(toolchain_value "$tp" interactive)" local tv="$(toolchain_value "$tp" toolchain_fver)" local ts="$(toolchain_value "$tp" toolchain_file_csum)" + local tbt="$(toolchain_value "$tp" build_timeout)" + local tst="$(toolchain_value "$tp" bootstrap_timeout)" + local tat="$(toolchain_value "$tp" interactive_timeout)" + local id="/build/bots/$tn" mkdir -p "$id" @@ -826,7 +901,9 @@ function bbot_start () # <toolchain-name> <toolchain-index> # if [ "$b_word" = "configured" ]; then - for ((i=1; i <= ti; i++)); do + # Note: stop extra instance. + # + for ((i=1; i <= ti + 1; i++)); do if ! 
sudo systemctl stop "bbot-agent-$tn@$i"; then info "failed to stop bbot-agent-$tn@$i service, assuming not running" continue @@ -867,17 +944,44 @@ function bbot_start () # <toolchain-name> <toolchain-index> sed -i -r \ -e "s#%I#$tn/%I#g" \ -e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \ - -e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \ + -e "s/^(Environment=RAM_BUILD)=.*/\1=$ram_build_slice/" \ + -e "s/^(Environment=RAM_AUXIL)=.*/\1=$ram_auxil_slice/" \ -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \ -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ + -e "s/^(Environment=INTERACTIVE)=.*/\1=$ta/" \ -e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \ -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \ -e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \ -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \ + -e "s/^(Environment=INSTANCE_MAX)=.*/\1=$ti/" \ -e "s/^(Nice)=.*/\1=$tc/" \ -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \ "$id/lib/systemd/system/bbot-agent@.service" + # Patch in CPU affinity. + # + if [ -n "$cpu_affinity" ]; then + sed -i -r -e "s/^(CPUAffinity)=.*/\1=$cpu_affinity/" \ + "$id/lib/systemd/system/bbot-agent@.service" + fi + + # Patch in build/bootstrap/interactive timeouts. + # + if [ -n "$tbt" ]; then + sed -i -r -e "s/^(Environment=BUILD_TIMEOUT)=.*/\1=$tbt/" \ + "$id/lib/systemd/system/bbot-agent@.service" + fi + + if [ -n "$tst" ]; then + sed -i -r -e "s/^(Environment=BOOTSTRAP_TIMEOUT)=.*/\1=$tst/" \ + "$id/lib/systemd/system/bbot-agent@.service" + fi + + if [ -n "$tat" ]; then + sed -i -r -e "s/^(Environment=INTERACTIVE_TIMEOUT)=.*/\1=$tat/" \ + "$id/lib/systemd/system/bbot-agent@.service" + fi + # Patch in the controller URLs. These can contain special characters # like `&` so we have to escape them. # @@ -927,8 +1031,10 @@ function bbot_start () # <toolchain-name> <toolchain-index> # the process has forked. Making sure the service has actually started is # done as part of the service monitoring. # + # Note: start extra instance. 
+ # r=0 - for ((i=1; i <= ti; i++)); do + for ((i=1; i <= ti + 1; i++)); do if ! sudo systemctl start "bbot-agent-$tn@$i"; then info "failed to start bbot-agent-$tn@$i service instance" r=1 @@ -953,6 +1059,7 @@ declare -A toolchain_cursors # Latest systemd journal cursor. # Monitoring loop. # +sensors=true count=0 while true; do @@ -1102,7 +1209,9 @@ EOF # For each service instance check if it has failed. # - for ((i=1; i <= ti; i++)); do + # Note: check extra instance. + # + for ((i=1; i <= ti + 1; i++)); do if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then s="bbot-agent-$tn@$i service has failed, stopping" @@ -1198,6 +1307,8 @@ EOF # subject line (note that there can be a mix so we have to # try in the priority order). # + # @@ pipefail + # p=2 s="$("${c[@]}" --output cat --priority 2 | head -n 1)" if [ -z "$s" ]; then @@ -1214,6 +1325,8 @@ EOF s="bbot-agent-$tn@$i: $s" + # @@ pipefail + # info "$s" { echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}"; @@ -1267,7 +1380,13 @@ EOF done fi - sensors -A + if [ "$sensors" ]; then + if ! sensors -A; then + info "unable to query sensors, disabling" + sensors= + fi + fi + info "monitoring..." sleep 60 done |