#!/bin/bash

# Build OS monitor. It starts as a systemd service and performs the following
# steps:
#
# 1. Bootstrap the build2 toolchain.
# 2. Build and start bbot.
# 3. Build and start bslave.
# 4. Monitor for OS and toolchain changes and reboot if detected.
#
# @@ What will systemd do if we fail? Perhaps configure it to restart
#    us? Or not since we may hose the logs.
#

# Any command that fails outside of a condition returns us to the original
# directory and terminates the monitor with exit code 1.
#
owd="$(pwd)"
trap "{ cd '$owd'; exit 1; }" ERR
set -o errtrace     # Trap in functions.
shopt -s nullglob   # Expand patterns that don't match to empty.

# Print the arguments (joined with spaces) as a single diagnostics line.
#
# Note: diagnostics goes to stderr. We use printf rather than echo so that a
# message that starts with something like -n or -e is printed verbatim
# instead of being treated as an echo option.
#
function info () { printf '%s\n' "$*" 1>&2; }

# Print the arguments (if any) as diagnostics and exit with code 1.
#
function error ()
{
  if [ "$#" -gt 0 ]; then
    info "$*"
  fi

  exit 1
}

# Note: the arch variant is patched in by the bootstrap script.
#
arch="$(uname -m)"
arch_variant=
arch_with_variant="$arch"
if [ -n "$arch_variant" ]; then
  arch_with_variant="$arch_with_variant-$arch_variant"
fi

# Network timeouts: 60 seconds to connect, 10 minutes to complete, 4 retries
# (5 attempts total). These are similar to bbot timeouts. Note that the
# toolchain archives can be quite sizable.
#
timeout=600
curl=(curl -f -L -s -S \
      --retry 4 \
      --retry-max-time "$timeout" \
      --max-time "$timeout" \
      --connect-timeout 60)

info "starting build os monitor..."

# Parse the kernel command line. This is complicated by the fact that the
# values can be quoted, for example:
#
# foo='foo fox'
# bar="bar 'box'"
#
# Or (as rewritten by GRUB):
#
# 'foo=foo fox'
# "bar=bar 'box'"
#
# First we separate quoted variables and arguments with newlines (giving
# priority to assignments). Then we replace whitespaces with newlines on
# lines that don't contain quotes. Finally, we clean up by removing blank
# lines.
#
# Note: the same code as in init.
#
readarray -t cmdline < <(cat /proc/cmdline | \
  sed -r -e "s/([^ ]+=)?('[^']*'|\"[^\"]*\")/\n\1\2\n/g" | \
  sed -r -e "/['\"]/!s/ /\n/g" | sed -r -e '/^\s*$/d')

# Enter all buildos variables as bash variables.
#
# Map of toolchain names (as specified in buildos.<variable>.<toolchain>)
# to the corresponding bash variable prefix.
#
declare -A toolchains
toolchains["default"]=""

for v in "${cmdline[@]}"; do

  # Rewrite "x=y" as x="y" (as well as the single-quote variant).
  #
  v1="$(sed -n -re "s/^\"([^= ]+)=(.*)\"\$/\1=\"\2\"/p" <<<"$v")"
  if [ -n "$v1" ]; then
    v="$v1"
  else
    v1="$(sed -n -re "s/^'([^= ]+)=(.*)'\$/\1='\2'/p" <<<"$v")"
    if [ -n "$v1" ]; then
      v="$v1"
    fi
  fi

  var="$(sed -n -re 's/^buildos\.([^= ]+)=.*$/\1/p' <<<"$v")" # Extract name.

  if [ -n "$var" ]; then
    val="$(sed -re 's/^[^= ]+=(.*)$/\1/' <<<"$v")"            # Extract value.
    val="$(sed -re "s/^('(.*)'|\"(.*)\")\$/\2\3/" <<<"$val")"  # Strip quoted.

    # Recognize some variables as arrays.
    #
    a=

    # If the variable contains a dot, then it is a toolchain name-specific
    # variable: everything after the first dot is the toolchain name and
    # everything before it is the variable name proper.
    #
    if [[ "$var" == *.* ]]; then
      tn="$(sed -re 's/^[^.]+\.(.+)$/\1/' <<<"$var")"
      var="$(sed -re 's/^([^.]+)\..+$/\1/' <<<"$var")"

      if [[ "$var" == "controller_url" || "$var" == "controller_trust" ]]; then
        a=true
      fi

      var="${tn}_$var"
      toolchains["$tn"]="${tn}_"
    fi

    # Note that the array variant appends so that the variable can be
    # specified multiple times. The value is re-parsed by declare inside
    # single quotes, so it must not itself contain a single quote.
    #
    if [ -n "$a" ]; then
      declare -a "$var+=('$val')"
    else
      declare "$var=$val"
    fi
  fi
done

hname="$(hostname)"

# Get the build id.
#
buildid="$(sed -n -re 's/^BUILD_ID="(.+)"$/\1/p' /etc/os-release)"

# Send an email to $admin_email with the specified subject (prefixed with the
# host name) and the body read from stdin.
#
# Note: the subject is printed verbatim (no backslash escape interpretation)
# so that it cannot break out of the Subject header.
#
function email () # <subject> < <body>
{
  (printf 'Subject: [%s] %s\n\n' "$hname" "$1"; cat -) | \
    sendmail -i "$admin_email"
}

# Flush the mail queue and reboot the machine.
#
function restart ()
{
  sendmail -q # Flush mail queue.
  sleep 10    # Give any remaining mail chance to go through.
  sudo systemctl reboot
}

if [ -n "$buildid_url" ]; then
  buildid_url="$buildid_url-$arch_with_variant"
else
  info "no buildos.buildid_url specified, not monitoring for new os builds"
fi

# Process toolchains.
#

# Return (print to stdout) the value of the bash variable named
# <toolchain-prefix><variable> for this toolchain. The result is empty if
# the variable is not set.
#
function toolchain_value () # <toolchain-prefix> <variable>
{
  local n="${1}${2}"
  echo "${!n}"
}

instances=0 # Number of bbot instances across all toolchains.
toolchain_names=()

# Set the <toolchain-prefix><variable> variable to <value> unless it already
# has a non-empty value.
#
function toolchain_default () # <toolchain-prefix> <variable> <value>
{
  if [ -z "$(toolchain_value "$1" "$2")" ]; then
    declare -g "${1}${2}=$3"
  fi
}

for tn in "${!toolchains[@]}"; do
  tp="${toolchains["$tn"]}"
  tu="$(toolchain_value "$tp" toolchain_url)"

  # Only toolchains that have a URL are processed further.
  #
  [ -n "$tu" ] || continue

  toolchain_names+=("$tn")

  # The toolchain "sums" file (a list of SHA sums and relative file names, as
  # produced by shaNNNsum). The first entry should always be build2-toolchain
  # tar archive itself (which we use to figure out the version). Blank lines
  # and lines that start with '#' are ignored.
  #
  # The file name is the last component of the URL and the checksum type is
  # the file name extension.
  #
  tf="$(sed -n -re 's%^.+/([^/]+)$%\1%p' <<<"$tu")"

  declare "${tp}toolchain_file=$tf"
  declare "${tp}toolchain_csum=$(sed -n -re 's%^.+\.([^.]+)$%\1%p' <<<"$tf")"
  declare "${tp}toolchain_root=/build/tftp/toolchains/$tn"
  declare "${tp}toolchain_ver="
  declare "${tp}toolchain_fver=" # Full version (with snapshot).

  # Fill in the defaults for the values that were not specified:
  #
  # instances        1 bbot agent instance
  # interactive      non-interactive-only
  # nice             0 nice value
  # bridge           br1 (private/NAT bridge)
  # toolchain_trust  "no" so that we don't prompt if the repository happens
  #                  to be signed
  #
  toolchain_default "$tp" instances       1
  toolchain_default "$tp" interactive     false
  toolchain_default "$tp" nice            0
  toolchain_default "$tp" bridge          br1
  toolchain_default "$tp" toolchain_trust no

  instances=$((instances + $(toolchain_value "$tp" instances)))

  # Warn if we have no controller URLs for this toolchain.
  #
  n="${tp}controller_url[0]"
  if [ -z "${!n}" ]; then
    info "no buildos.controller_url.$tn specified, not starting bbot agent"
  fi
done

if [ "${#toolchain_names[@]}" -eq 0 ]; then
  info "no buildos.toolchain_url specified, not bootstrapping"
fi

# Divide CPUs and RAM (in KB) among the instances.
#
# By default reserve 4G of RAM for ourselves (rootfs, tmpfs).
#
# Note that MemTotal in /proc/meminfo is the available memory, not physical.
# And to make it easier to provision memory it's really helpful to base it
# on the physical value.
#

# Sum the memory device sizes found in the `dmidecode -t 17` output read
# from stdin and print the total in KiB (0 if no sizes are found). Both the
# GB and MB units are recognized. Lines such as "Size: No Module Installed"
# or "Volatile Size: ..." are ignored.
#
function dmi_ram_total () # < <dmidecode-output>
{
  local total=0 n u

  while read -r n u; do
    case "$u" in
      GB) total=$(($total + $n * 1024 * 1024)) ;;
      MB) total=$(($total + $n * 1024))        ;;
    esac
  done <<<"$(sed -n -re 's/^\s*Size:\s*([0-9]+)\s*([GM]B).*$/\1 \2/p')"

  echo "$total"
}

# Extract the value of the "CPU(s):" line from the lscpu output read from
# stdin. Prints nothing if there is no such line.
#
function lscpu_cpu_total () # < <lscpu-output>
{
  # Note: the parentheses must be escaped to be matched literally with -r.
  #
  sed -n -re 's/^CPU\(s\): *([0-9]+)$/\1/p'
}

# Total RAM in KiB: either detected or specified with buildos.ram_total (in
# GiB).
#
if [ -z "$ram_total" ]; then
  ram_total="$(sudo dmidecode -t 17 | dmi_ram_total)"

  if [ "$ram_total" -eq 0 ]; then
    error "unable to determine physical memory size, use buildos.ram_total to specify"
  fi
else
  ram_total=$(($ram_total * 1024 * 1024))
fi

cpu_total="$(lscpu | lscpu_cpu_total)"

# Without the CPU count the slice calculation below and the -j option passed
# to the toolchain build are meaningless, so fail early.
#
if [ -z "$cpu_total" ]; then
  error "unable to determine the number of CPUs"
fi

# RAM reserved to the host.
#
if [ -z "$ram_reserved" ]; then
  ram_reserved=4
fi

ram_reserved=$(($ram_reserved * 1024 * 1024))

# RAM reserved for auxiliary machines.
#
if [ -z "$ram_auxiliary" ]; then
  ram_auxiliary=0
fi

ram_auxiliary=$(($ram_auxiliary * 1024 * 1024))

if [ -z "$cpu_reserved" ]; then
  cpu_reserved=0
fi

if [ -z "$cpu_overcommit" ]; then
  cpu_overcommit=1
fi

ram_build_slice=$(($ram_total - $ram_reserved - $ram_auxiliary))
ram_auxil_slice=$ram_auxiliary
cpu_slice=$(($cpu_total - $cpu_reserved))

# With a single instance it gets everything that is not reserved. Otherwise
# divide between the instances making sure each gets at least one CPU.
#
if [ "$instances" -gt 1 ]; then
  ram_build_slice=$(($ram_build_slice / $instances))
  ram_auxil_slice=$(($ram_auxil_slice / $instances))
  cpu_slice=$(($cpu_slice * $cpu_overcommit / $instances))

  if [ "$cpu_slice" -eq 0 ]; then
    cpu_slice=1
  fi
fi

# Print monitor configuration as email body.
#
# Note that all the temporaries are local so that calling this function does
# not clobber the global tn/tp/etc.
#
function print ()
{
  echo "cpu_total: $cpu_total"
  echo "cpu_reserved: $cpu_reserved"
  echo "cpu_overcommit: $cpu_overcommit"
  echo "cpu_slice: $cpu_slice"
  if [ -n "$cpu_affinity" ]; then
    echo "cpu_affinity: $cpu_affinity"
  fi
  echo
  echo "ram_total: $ram_total KiB"
  echo "ram_reserved: $ram_reserved KiB"
  echo "ram_auxiliary: $ram_auxiliary KiB"
  echo "ram_build_slice: $ram_build_slice KiB"
  echo "ram_auxil_slice: $ram_auxil_slice KiB"
  echo
  echo "buildid: $buildid"
  echo "buildid_url: $buildid_url"
  echo

  local n i tn tp tc tb ti ta tu tt tbt tst tat
  for tn in "${toolchain_names[@]}"; do
    tp="${toolchains["$tn"]}"

    tc="$(toolchain_value "$tp" nice)"
    tb="$(toolchain_value "$tp" bridge)"
    ti="$(toolchain_value "$tp" instances)"
    ta="$(toolchain_value "$tp" interactive)"
    tu="$(toolchain_value "$tp" toolchain_url)"
    tt="$(toolchain_value "$tp" toolchain_trust)"
    tbt="$(toolchain_value "$tp" build_timeout)"
    tst="$(toolchain_value "$tp" bootstrap_timeout)"
    tat="$(toolchain_value "$tp" interactive_timeout)"

    echo "$tn.nice: $tc"
    echo "$tn.bridge: $tb"
    echo "$tn.instances: $ti"
    echo "$tn.interactive: $ta"
    echo "$tn.toolchain_url: $tu"
    echo "$tn.toolchain_trust: $tt"

    # The timeouts are optional and are only printed if specified.
    #
    if [ -n "$tbt" ]; then
      echo "$tn.build_timeout: $tbt"
    fi

    if [ -n "$tst" ]; then
      echo "$tn.bootstrap_timeout: $tst"
    fi

    if [ -n "$tat" ]; then
      echo "$tn.interactive_timeout: $tat"
    fi

    n="${tp}controller_url[@]"
    for i in "${!n}"; do
      echo "$tn.controller_url: $i"
    done

    n="${tp}controller_trust[@]"
    for i in "${!n}"; do
      echo "$tn.controller_trust: $i"
    done

    echo
  done

  echo "host key:"
  echo
  openssl rsa -pubout -in /state/etc/host-key.pem 2>/dev/null
}

print | email "starting build os monitor"

# Machines cleanup (/build/machines/).
#
# The cleanup functions below accumulate their diagnostics in the diag array
# and set fail to true if any errors were encountered.
#
diag=()
fail=

# Print the accumulated diagnostics, one (indented) entry per line.
#
function print_diag ()
{
  local p
  for p in "${diag[@]}"; do
    echo " $p"
  done
}

# Iterate over all the machines and call a function (one of the below
# machines_clean_*()) for each. The function is called with the volume
# directory, the machine name, and any extra arguments, with the volume
# being the current directory. The diag and fail variables are reset first.
#
function machines_for () # <function> <function-args>...
{
  local f="$1"
  shift

  diag=()
  fail=

  local v m
  for v in /build/machines/*; do
    if [ ! -d "$v" ]; then
      diag+=("$v: error: invalid volume")
      fail=true
      continue
    fi

    cd "$v"

    for m in *; do
      if [ ! -d "$m" ]; then
        diag+=("$v/$m: error: invalid machine")
        fail=true
        continue
      fi

      "$f" "$v" "$m" "$@"
    done

    cd "$owd"
  done
}

# Make the btrfs subvolume writable and delete it. Return 1 (and register
# the error) if either step fails.
#
function machines_clean_subvolume () # <subvolume-path>
{
  if ! btrfs property set -ts "$1" ro false; then
    diag+=("$1: error: unable to change subvolume property")
    fail=true
    return 1
  fi

  if ! btrfs subvolume delete "$1"; then
    diag+=("$1: error: unable to delete subvolume")
    fail=true
    return 1
  fi
}

# Delete the lockfile. Return 1 (and register the error) on failure.
#
function machines_clean_lockfile () # <lockfile-path>
{
  if ! rm -f "$1"; then
    diag+=("$1: error: unable to delete lockfile")
    fail=true
    return 1
  fi
}

# Cleanup the <machine>-<toolchain>-* entries for the specified toolchain
# (all instances) as well as the <machine>-<toolchain>.lock file. Called
# before starting bbot instances for each toolchain.
#
function machines_clean_toolchain () # <volume> <machine> <toolchain>
{
  local v="$1"
  local m="$2"
  local tn="$3"

  cd "$m"

  local i
  for i in "$m"-"$tn"-*; do
    if [ ! -d "$i" ]; then
      diag+=("$v/$m/$i: error: invalid machine subvolume")
      fail=true
      continue
    fi

    if machines_clean_subvolume "$v/$m/$i"; then
      diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume")
    fi
  done

  i="$m-$tn.lock"
  if [ -f "$i" ]; then
    if machines_clean_lockfile "$v/$m/$i"; then
      diag+=("$v/$m/$i: info: deleted stray lockfile")
    fi
  fi

  cd "$v"
}

# Cleanup stray snapshots and lockfiles as well as deleted machines. Called
# once during startup.
#
function machines_clean_stray () # <volume> <machine>
{
  local v="$1"
  local m="$2"

  cd "$m"

  # Collect current machine symlink's bootstrap protocol numbers. If there
  # are no current machine symlinks, then we delete the whole thing.
  #
  local i ps=()
  for i in "$m"-*; do
    if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
      if [ ! -L "$i" ]; then
        diag+=("$v/$m/$i: error: not a symlink")
        fail=true
      fi

      # Treat it as if it were a symlink even if it's not. Failing that we
      # may try to delete the whole thing.
      #
      ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")")
    fi
  done

  # Examine each machine subvolume.
  #
  for i in "$m"-*; do

    # <machine>-<protocol> (current machine symlink)
    #
    if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then
      continue
    fi

    # Lockfile.
    #
    if [ -f "$i" ]; then
      if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then
        if machines_clean_lockfile "$v/$m/$i"; then
          diag+=("$v/$m/$i: info: deleted lockfile")
        fi
        continue
      fi
    fi

    if [ ! -d "$i" ]; then
      diag+=("$v/$m/$i: error: invalid machine subvolume")
      fail=true
      continue
    fi

    # Unless we are deleting the whole thing, keep initial and bootstrapped
    # (for known toolchains) subvolumes.
    #
    # Note that f is set (to false) if the subvolume should be kept.
    #
    if [ "${#ps[@]}" -gt 0 ]; then

      # <machine>-<protocol>.<number> (initial image)
      #
      local p f=
      for p in "${ps[@]}"; do
        if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then
          f=false
          break
        fi
      done

      if [ -n "$f" ]; then
        continue
      fi

      # <machine>-<toolchain> (bootstrapped image)
      #
      f=
      local tn
      for tn in "${toolchain_names[@]}"; do
        if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then
          f=false
          break
        fi
      done

      if [ -n "$f" ]; then
        continue
      fi
    fi

    # This is either a stray working subvolume or a bootstrapped subvolume
    # for a toolchain that was deleted (or we are deleting everything).
    #
    if machines_clean_subvolume "$v/$m/$i"; then
      diag+=("$v/$m/$i: info: deleted subvolume")
    fi
  done

  cd "$v"

  # Delete the machine directory (which we expect to be now empty).
  #
  if [ "${#ps[@]}" -eq 0 ]; then
    if rmdir "$m"; then
      diag+=("$v/$m: info: deleted machine directory")
    else
      diag+=("$v/$m: error: unable to delete machine directory")
      fail=true
    fi
  fi
}

# Do the initial cleanup.
#
machines_for machines_clean_stray

if [ "${#diag[@]}" -gt 0 ]; then
  if [ -z "$fail" ]; then
    s="cleaned up entries in /build/machines/"
  else
    s="invalid entries in /build/machines/, halting"
  fi

  print_diag | email "$s"
  info "$s" && print_diag 1>&2

  if [ -n "$fail" ]; then
    error "correct and restart the monitor (systemctl restart buildos)"
  fi
fi

# Toolchain-related functions.
#

# Calculate the file checksum using the shaNNNsum utility.
#
# Print the checksum of <file> as calculated by the utility named after the
# toolchain's checksum type (the toolchain_csum value followed by "sum").
#
function toolchain_checksum () # <toolchain-prefix> <file>
{
  "$(toolchain_value "$1" toolchain_csum)sum" -b "$2" | \
    sed -n -re 's/^([^ ]+) .+$/\1/p'
}

# Escape the extended regular expression special characters (as well as the
# `/` delimiter) in the argument so that it can be embedded into a sed -r
# s/// pattern and matched literally.
#
function regex_escape () # <string>
{
  sed -e 's/[][(){}.*+?^$|\/\\]/\\&/g' <<<"$1"
}

# Fetch a file from the sums file into $toolchain_root, verify its checksum,
# and make a predictable name (without version) symlink.
#
# The first file fetched for a toolchain must be the build2-toolchain archive:
# its name is used to set the toolchain_fver (full) and toolchain_ver
# (snapshot stripped) variables, which are also saved to the version-full
# and version files in $toolchain_root.
#
# Return 0 on success and 1 on failure (with diagnostics issued).
#
function toolchain_fetch () # <toolchain-name> <line>
{
  local s p f u l tn tp tu tr tv tve

  tn="$1"
  tp="${toolchains["$tn"]}"
  tu="$(toolchain_value "$tp" toolchain_url)"
  tr="$(toolchain_value "$tp" toolchain_root)"

  s="$(sed -n -re 's/^([^ ]+) .+$/\1/p' <<<"$2")"          # Checksum.
  p="$(sed -n -re 's/^[^ ]+ \*([^ ]+)$/\1/p' <<<"$2")"     # File path (relative).
  f="$(sed -n -re 's%^(.+/)?([^/]+)$%\2%p' <<<"$p")"       # File name.
  u="$(sed -n -re 's%^(.+)/[^/]+$%\1%p' <<<"$tu")/$p"      # File URL.

  if [ -z "$s" ] || [ -z "$p" ] || [ -z "$f" ] || [ -z "$u" ]; then
    info "invalid sum line '$2'"
    return 1
  fi

  # Extract the version and derive a predictable name link.
  #
  # Note that the version (and the toolchain name) can contain characters
  # that are special in regular expressions (`.`, `+`), so they are escaped
  # before being embedded into the patterns.
  #
  tv="$(toolchain_value "$tp" toolchain_ver)"
  if [ -z "$tv" ]; then
    tv="$(sed -n -re 's/^build2-toolchain-(.+)\.tar.*/\1/p' <<<"$f")"

    if [ -z "$tv" ]; then
      info "unable to extract toolchain version from '$f'"
      return 1
    fi

    info "toolchain $tn version $tv"

    declare -g "${tp}toolchain_fver=$tv" # Full version.
    echo "$tv" >"$tr/version-full"

    tve="$(regex_escape "$tv")"
    l="$(sed -n -re "s/^(.+)-$tve(.*)$/\1\2/p" <<<"$f")" # Use full version.

    # Strip snapshot.
    #
    tv="$(sed -n -re 's/^([^.]+\.[^.]+\.[^-]+(-[ab]\.[^.+]+)?).*/\1/p' <<<"$tv")"

    declare -g "${tp}toolchain_ver=$tv"
    echo "$tv" >"$tr/version"
  else
    # For files other than build2-toolchain we expect the version component
    # to be in the <version>[-<toolchain>] form, for example 1.2.3-stage.
    #
    tve="$(regex_escape "$tv")"
    l="$(sed -n -re "s/^(.+)-$tve(-$(regex_escape "$tn"))?(.*)$/\1\3/p" <<<"$f")"
  fi

  if [ -z "$l" ]; then
    info "unable to derive predicatable name from '$f' and '$tv'"
    return 1
  fi

  # Fetch the file.
  #
  info "fetching $u [$l]"

  if ! "${curl[@]}" -o "$tr/$f" "$u"; then
    info "unable to fetch $u"
    return 1
  fi

  # Verify the checksum.
  #
  info "verifying checksum for $f"

  local cs
  cs="$(toolchain_checksum "$tp" "$tr/$f")"

  if [ "$cs" != "$s" ]; then
    info "checksum mismatch for $u"
    info " expected: $s"
    info " calculated: $cs"
    return 1
  fi

  # Make the link.
  #
  # Note that the target must be just the file for TFTP chroot to work.
  #
  ln -s "$f" "$tr/$l"
}

# Bootstrap the toolchain.
#
# Return 0 on success, 1 if the toolchain is disabled, and 2 in case of
# an error.
#
function toolchain_bootstrap () # <toolchain-name>
{
  local tn="$1"
  local tp="${toolchains["$tn"]}"
  local tr="$(toolchain_value "$tp" toolchain_root)"
  local tf="$(toolchain_value "$tp" toolchain_file)"

  # Fetch files according to the sums file. Skip empty lines and those that
  # start with '#'.
  #
  local l ls=()
  readarray -t ls < <(sed -e '/^\s*#/d;/^\s*$/d' "$tr/$tf")

  if [ "${#ls[@]}" -eq 0 ]; then
    info "empty $tr/$tf"
    return 2
  fi

  # Check if this toolchain is disabled.
  #
  if [ "${ls[0]}" = "disabled" ]; then
    return 1
  fi

  for l in "${ls[@]}"; do
    if ! toolchain_fetch "$tn" "$l"; then
      return 2 # Diagnostics has already been issued.
    fi
  done

  local tv="$(toolchain_value "$tp" toolchain_fver)" # Set by fetch().
  local tt="$(toolchain_value "$tp" toolchain_trust)"

  # Save the repository certificate fingerprint into the trust file (used
  # by machine bootstrap).
  #
  echo "$tt" >"$tr/trust"

  # Bootstrap in /tmp/toolchains/$tn/, install to /build/toolchains/$tn/.
  #
  local wd="/tmp/toolchains/$tn"
  local id="/build/toolchains/$tn"

  mkdir -p "$wd"
  mkdir -p "$id"

  local r=2
  cd "$wd"

  while true; do # The "breakout loop".

    # Extract the toolchain.
    #
    if ! tar -xf "$tr/build2-toolchain.tar.xz"; then
      info "unable to extract $tr/build2-toolchain.tar.xz"
      break
    fi

    cd "build2-toolchain-$tv"*/

    # Bootstrap, stage, and install using the provided build.sh script. Do
    # parallel bootstrap using make.
    #
    if ! ./build.sh --make make \
                    --make "-j$cpu_total" \
                    --timeout "$timeout" \
                    --install-dir "$id" \
                    --trust "$tt" \
                    g++; then
      info "failed to build $(pwd)"
      break
    fi

    # Remove the extracted directory and rename what is left that matches
    # build2-toolchain-* to the version-less name.
    #
    cd "$wd"
    rm -r "build2-toolchain-$tv"*/
    mv -T build2-toolchain-* build2-toolchain # Strip version.

    r=0
    break
  done

  cd "$owd"
  return "$r"
}

# Check if we need to build/start or rebuild/restart the bbot agent. Return
# 0 if nothing to do, 1 for upgrades, 2 for first build, and 3 for failure.
#
function bbot_check () # <toolchain-name>
{
  local tn="$1"

  export PATH="/build/toolchains/$tn/bin:$PATH" # Running in subshell.
  cd "/tmp/toolchains/$tn/build2-toolchain"

  local r=3
  local l_stat b_stat

  while true; do # The "breakout loop".

    # Save the status before the fetch in order to compare it to the status
    # after.
    #
    l_stat="$(bpkg status libbbot)"
    b_stat="$(bpkg status bbot)"

    if ! bpkg --fetch-timeout "$timeout" fetch -q; then
      info "failed to fetch package information"
      break
    fi

    # See if this is the first time or if we need to upgrade.
    #
    if [ "$(cut -d ' ' -f 2 <<<"$b_stat")" = "configured" ]; then

      # We assume that if anything has changed in the status line, then we
      # have a new version.
      #
      if [ "$b_stat" = "$(bpkg status bbot)" ] && \
         [ "$l_stat" = "$(bpkg status libbbot)" ]; then
        r=0
        break
      fi

      r=1
      break
    fi

    r=2
    break
  done

  cd "$owd"
  return "$r"
}

# Build and start bbot agent using the bpkg configuration created by
# toolchain_bootstrap().
#
# Return 0 on success and 1 on failure (with diagnostics issued).
#
function bbot_start () # <toolchain-name> <toolchain-num>
{
  local tn="$1"
  local tx="$2"
  local tp="${toolchains["$tn"]}"

  local tc="$(toolchain_value "$tp" nice)"
  local tb="$(toolchain_value "$tp" bridge)"
  local ti="$(toolchain_value "$tp" instances)"
  local ta="$(toolchain_value "$tp" interactive)"
  local tv="$(toolchain_value "$tp" toolchain_fver)"

  # NOTE(review): toolchain_file_csum is not set by any of the code above,
  # presumably it is set by the monitoring loop -- confirm.
  #
  local ts="$(toolchain_value "$tp" toolchain_file_csum)"

  local tbt="$(toolchain_value "$tp" build_timeout)"
  local tst="$(toolchain_value "$tp" bootstrap_timeout)"
  local tat="$(toolchain_value "$tp" interactive_timeout)"

  local id="/build/bots/$tn"

  mkdir -p "$id"

  # Install/uninstall vars.
  #
  local vars=(config.install.root="$id" config.bin.rpath="$id/lib")

  export PATH="/build/toolchains/$tn/bin:$PATH" # Running in subshell.
  cd "/tmp/toolchains/$tn/build2-toolchain"

  local r=1
  local i n b_word

  while true; do # The "breakout loop".

    b_word="$(bpkg status bbot | cut -d ' ' -f 2)"

    # If upgrading, stop the service and uninstall.
    #
    if [ "$b_word" = "configured" ]; then

      # Note: stop extra instance.
      #
      for ((i=1; i <= ti + 1; i++)); do
        if ! sudo systemctl stop "bbot-agent-$tn@$i"; then
          info "failed to stop bbot-agent-$tn@$i service, assuming not running"
          continue
        fi

        info "stopped bbot-agent-$tn@$i service"
      done

      # We may not be able to uninstall if we previously failed to build.
      #
      if ! bpkg uninstall "${vars[@]}" bbot; then
        info "failed to uninstall bbot agent, assuming not installed"
      fi
    fi

    # Build and install the bbot agent. Since other agents might already be
    # running, limit the number of jobs to our slice.
    #
    if ! bpkg --fetch-timeout "$timeout" \
              --build-option --jobs=$(($ti * $cpu_slice)) \
              build --yes libbbot bbot; then
      info "failed to build bbot-agent for $tn"
      break
    fi

    if ! bpkg install "${vars[@]}" bbot; then
      info "failed to install bbot-agent for $tn"
      break
    fi

    # Post-process and install the systemd .service file. Since we may have
    # multiple toolchains, we embed the toolchain name into the service name
    # with the systemd pattern machinery used to run multiple bbot instances
    # per toolchain.
    #
    # We assume `%I` is only used in Description and similar and rewrite it
    # as `<toolchain>/%I` (e.g., `stage/1`).
    #
    sed -i -r \
        -e "s#%I#$tn/%I#g" \
        -e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \
        -e "s/^(Environment=RAM_BUILD)=.*/\1=$ram_build_slice/" \
        -e "s/^(Environment=RAM_AUXIL)=.*/\1=$ram_auxil_slice/" \
        -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \
        -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \
        -e "s/^(Environment=INTERACTIVE)=.*/\1=$ta/" \
        -e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \
        -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \
        -e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \
        -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \
        -e "s/^(Environment=INSTANCE_MAX)=.*/\1=$ti/" \
        -e "s/^(Nice)=.*/\1=$tc/" \
        -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \
        "$id/lib/systemd/system/bbot-agent@.service"

    # Patch in CPU affinity.
    #
    if [ -n "$cpu_affinity" ]; then
      sed -i -r -e "s/^(CPUAffinity)=.*/\1=$cpu_affinity/" \
          "$id/lib/systemd/system/bbot-agent@.service"
    fi

    # Patch in build/bootstrap/interactive timeouts.
    #
    if [ -n "$tbt" ]; then
      sed -i -r -e "s/^(Environment=BUILD_TIMEOUT)=.*/\1=$tbt/" \
          "$id/lib/systemd/system/bbot-agent@.service"
    fi

    if [ -n "$tst" ]; then
      sed -i -r -e "s/^(Environment=BOOTSTRAP_TIMEOUT)=.*/\1=$tst/" \
          "$id/lib/systemd/system/bbot-agent@.service"
    fi

    if [ -n "$tat" ]; then
      sed -i -r -e "s/^(Environment=INTERACTIVE_TIMEOUT)=.*/\1=$tat/" \
          "$id/lib/systemd/system/bbot-agent@.service"
    fi

    # Patch in the controller URLs. These can contain special characters
    # like `&` so we have to escape them. Note that this includes `#` since
    # it is used as the delimiter in the substitution below.
    #
    n="${tp}controller_url[@]"
    for i in "${!n}"; do
      i="$(sed -e 's/[&#\/\\]/\\&/g' <<<"$i")"
      sed -i -r \
          -e "s#^(Environment=\"CONTROLLER_URL=[^\"]*)\"\$#\1 $i\"#" \
          "$id/lib/systemd/system/bbot-agent@.service"
    done

    # Patch in the controller trust fingerprints.
    #
    n="${tp}controller_trust[@]"
    for i in "${!n}"; do
      sed -i -r \
          -e "s#^(Environment=\"CONTROLLER_TRUST=[^\"]*)\"\$#\1 --trust $i\"#" \
          "$id/lib/systemd/system/bbot-agent@.service"
    done

    # Note: using a hard link to prevent systemd from being too clever and
    # calling the service bbot-agent@.
    #
    sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \
         "/usr/lib/systemd/system/bbot-agent-$tn@.service"

    # Clean up any machine snapshots that might have been left behind.
    #
    machines_for machines_clean_toolchain "$tn"

    if [ "${#diag[@]}" -gt 0 ]; then
      if [ -z "$fail" ]; then
        info "cleaned up entries in /build/machines/"
      else
        info "invalid entries in /build/machines/, not starting"
      fi

      print_diag 1>&2

      if [ -n "$fail" ]; then
        info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)"
        break
      fi
    fi

    # Start each service instance. With Type=simple start returns as soon as
    # the process has forked. Making sure the service has actually started is
    # done as part of the service monitoring.
    #
    # Note: start extra instance.
    #
    r=0
    for ((i=1; i <= ti + 1; i++)); do
      if ! sudo systemctl start "bbot-agent-$tn@$i"; then
        info "failed to start bbot-agent-$tn@$i service instance"
        r=1
        break
      fi
    done

    break
  done

  cd "$owd"
  return "$r"
}

# Array of bootstrapped toolchains.
#
# The idea is to collect them until we bootstrap all of them and only then
# start their bbot agents.
#
toolchain_boots=()

declare -A toolchain_cursors # Latest systemd journal cursor.

# Monitoring loop. # sensors=true count=0 while true; do count=$(($count + 1)) # Check for OS changes. Do this first in case of any issues in the following # checks. # if [ -n "$buildid_url" ]; then # Fetch the current id. While normally it will be a TFTP URL, it could also # be HTTP(S) so we configure sensible behavior for that. # if id="$("${curl[@]}" "$buildid_url")"; then if [ "$id" != "$buildid" ]; then email "rebooting because of new os build" <&1 | tee "$tr/toolchain-$count.log" 1>&2 case "${PIPESTATUS[0]}" in 0) tv="$(cat $tr/version)" declare "${tp}toolchain_ver=$tv" tv="$(cat $tr/version-full)" declare "${tp}toolchain_fver=$tv" s="bootstrapped $tn toolchain $tv" toolchain_boots+=("$tn") ;; 1) s="skipping disabled $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. 
;; *) s="failed to bootstrap $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; esac info "$s" email "$s" <&1 | tee "$tr/bbot-agent-$count.log" 1>&2 case "${PIPESTATUS[0]}" in 0) rm -f "$tr/bbot-agent-$count.log" # For each service instance check if it has failed. # # Note: check extra instance. # for ((i=1; i <= ti + 1; i++)); do if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then s="bbot-agent-$tn@$i service has failed, stopping" # Note: ignore errors. # sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \ tee "$tr/bbot-agent-$i-$count.log" 1>&2 # Reset it so that we don't keep sending the log on each # iteration. Note: ignore errors. # sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \ tee -a "$tr/bbot-agent-$i-$count.log" 1>&2 info "$s" email "$s" <&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2 if [ "${PIPESTATUS[0]}" -eq 0 ]; then s="${s}started bbot-agent for $tn, $ti instances" else s="failed to ${s}start bbot-agent for $tn, waiting for new version" fi ;; *) s="failed to fetch package information for $tn toolchain, will try again" ;; esac info "$s" email "$s" <