From 1fd86636b0d70d754caf205d8048893a9c9793c3 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 7 Feb 2019 08:02:47 +0200 Subject: Various improvements and fixes --- buildos | 538 +++++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 326 insertions(+), 212 deletions(-) (limited to 'buildos') diff --git a/buildos b/buildos index 012869b..52e984e 100755 --- a/buildos +++ b/buildos @@ -51,7 +51,7 @@ info "starting build os monitor..." # # First we separete quoted variables and arguments with newlines (giving # priority to assignments). Then we replace whitespaces with newline on -# lines that don't contain quites. Finally, clean up by removing blank +# lines that don't contain quites. Finally, we clean up by removing blank # lines. # # Note: the same code as in init. @@ -137,7 +137,9 @@ function toolchain_value () # echo "${!n}" } +instances=0 # Number of bbot instances across all toolchains. toolchain_names=() + for tn in "${!toolchains[@]}"; do tp="${toolchains["$tn"]}" tu="$(toolchain_value "$tp" toolchain_url)" @@ -161,8 +163,28 @@ for tn in "${!toolchains[@]}"; do declare "${tp}toolchain_ver=" declare "${tp}toolchain_fver=" # Full version (with snapshot). - # If buildos.toolchain_trust was not specified, set it to "no" so that - # we don't prompt if the repository happens to be signed. + # Default to 1 bbot agent instance. + # + if [ -z "$(toolchain_value "$tp" instances)" ]; then + declare "${tp}instances=1" + fi + + instances=$(($instances + $(toolchain_value "$tp" instances))) + + # Default to 0 nice value. + # + if [ -z "$(toolchain_value "$tp" nice)" ]; then + declare "${tp}nice=0" + fi + + # Default to br1 (private/NAT bridge). + # + if [ -z "$(toolchain_value "$tp" bridge)" ]; then + declare "${tp}bridge=br1" + fi + + # If toolchain_trust was not specified, set it to "no" so that we don't + # prompt if the repository happens to be signed. # if [ -z "$(toolchain_value "$tp" toolchain_trust)" ]; then declare "${tp}toolchain_trust=no" @@ -180,27 +202,48 @@ if [ "${#toolchain_names[@]}" -eq 0 ]; then info "no buildos.toolchain_url specified, not bootstrapping" fi -# Divide CPUs and RAM (in kB) among the toolchains. +# Divide CPUs and RAM (in KB) among the instances. +# +# By default reserve 4G of RAM for ourselves (rootfs, tmpfs). # -# Reserve 4G of RAM for ourselves (rootfs, tmpfs). +# Note that MemTotal in /proc/meminfo is the available memory, not physical. +# And to make it easier to provision memory it's really helpful to base it +# in the physical value. # -ram_total="$(sed -n -re 's/^MemTotal: *([0-9]+) *kB$/\1/p' ... for v in /build/machines/*; do if [ ! -d "$v" ]; then diag+=("$v: error: invalid volume") - fail="true" + fail=true continue fi @@ -292,7 +343,7 @@ function machines_for () # ... for m in *; do if [ ! -d "$m" ]; then diag+=("$v/$m: error: invalid machine") - fail="true" + fail=true continue fi @@ -308,19 +359,29 @@ function machines_clean_subvolume () # { if ! btrfs property set -ts "$1" ro false; then diag+=("$1: error: unable to change subvolume property") - fail="true" + fail=true return 1 fi if ! btrfs subvolume delete "$1"; then diag+=("$1: error: unable to delete subvolume") - fail="true" + fail=true + return 1 + fi +} + +function machines_clean_lockfile () # +{ + if ! rm -f "$1"; then + diag+=("$1: error: unable to delete lockfile") + fail=true return 1 fi } # Cleanup the -- entries for the specified toolchain -# called before starting each toolchain. +# (all instances) as well as -.lock file. Called before +# starting bbot instances for each toolchain. # function machines_clean_toolchain () # { @@ -330,24 +391,33 @@ function machines_clean_toolchain () # cd "$m" - local s - for s in "$m"-"$tn"-*; do + local i + for i in "$m"-"$tn"-*; do - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted stray toolchain working subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray toolchain working subvolume") fi done + i="$m-$tn.lock" + if [ -f "$i" ]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted stray lockfile") + fi + fi + cd "$v" } -# Cleanup stray snapshots or deleted machines. Called once during startup. +# Cleanup stray snapshots and lockfiles as well as deleted machines. Called +# once during startup. # function machines_clean_stray () # { @@ -359,35 +429,48 @@ function machines_clean_stray () # # Collect current machine symlink's bootstrap protocol numbers. If there # are no current machine symlinks, then we delete the whole thing. # - local s ps=() - for s in "$m"-*; do - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + local i ps=() + for i in "$m"-*; do + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then - if [ ! -L "$s" ]; then - diag+=("$v/$m/$s: error: not a symlink") - fail="true" + if [ ! -L "$i" ]; then + diag+=("$v/$m/$i: error: not a symlink") + fail=true fi # Treat it as if it were a symlink even if its not. Failed that we # may try to delete the whole thing. # - ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$s")") + ps+=("$(sed -n -re 's/^.+-([0-9]+)$/\1/p' <<<"$i")") fi done # Examine each machine subvolume. # - for s in "$m"-*; do + for i in "$m"-*; do # -

(current machine symlink) # - if [[ "$s" =~ ^"$m"-[0-9]+$ ]]; then + if [[ "$i" =~ ^"$m"-[0-9]+$ ]]; then continue fi - if [ ! -d "$s" ]; then - diag+=("$v/$m/$s: error: invalid machine subvolume") - fail="true" + # Lockfile. + # + if [ -f "$i" ]; then + + if [[ "$i" =~ ^"$m"-.+\.lock$ ]]; then + + if machines_clean_lockfile "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted lockfile") + fi + continue + fi + fi + + if [ ! -d "$i" ]; then + diag+=("$v/$m/$i: error: invalid machine subvolume") + fail=true continue fi @@ -400,8 +483,8 @@ function machines_clean_stray () # # local p f= for p in "${ps[@]}"; do - if [[ "$s" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$p"\.[0-9]+$ ]]; then + f=false break fi done @@ -415,8 +498,8 @@ function machines_clean_stray () # f= local tn for tn in "${toolchain_names[@]}"; do - if [[ "$s" =~ ^"$m"-"$tn"$ ]]; then - f="true" + if [[ "$i" =~ ^"$m"-"$tn"$ ]]; then + f=false break fi done @@ -426,11 +509,11 @@ function machines_clean_stray () # fi fi - # This is either a stray working submodule or a bootsrapped subvolume + # This is either a stray working subvolume or a bootsrapped subvolume # for a toolchain that was deleted (or we are deleting everything). # - if machines_clean_subvolume "$v/$m/$s"; then - diag+=("$v/$m/$s: info: deleted subvolume") + if machines_clean_subvolume "$v/$m/$i"; then + diag+=("$v/$m/$i: info: deleted subvolume") fi done @@ -443,7 +526,7 @@ function machines_clean_stray () # diag+=("$v/$m: info: deleted machine directory") else diag+=("$v/$m: error: unable to delete machine directory") - fail="true" + fail=true fi fi } @@ -463,8 +546,7 @@ if [ "${#diag[@]}" -gt 0 ]; then info "$s" && print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and restart the monitor (systemctl restart buildos)" - exit 1 + error "correct and restart the monitor (systemctl restart buildos)" fi fi @@ -513,7 +595,7 @@ function toolchain_fetch () # return 1 fi - info "toolchain '$tn' version $tv" + info "toolchain $tn version $tv" declare -g "${tp}toolchain_fver=$tv" # Full version. echo "$tv" >"$tr/version-full" @@ -713,9 +795,12 @@ function bbot_check () # function bbot_start () # { local tn="$1" - local ti="$2" + local tx="$2" local tp="${toolchains["$tn"]}" + local tc="$(toolchain_value "$tp" nice)" + local tb="$(toolchain_value "$tp" bridge)" + local ti="$(toolchain_value "$tp" instances)" local tv="$(toolchain_value "$tp" toolchain_fver)" local ts="$(toolchain_value "$tp" toolchain_file_csum)" @@ -741,9 +826,13 @@ function bbot_start () # # if [ "$b_word" = "configured" ]; then - if ! sudo systemctl stop "bbot-agent@$tn"; then - info "failed to stop bbot-agent@$tn service, assuming not running" - fi + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl stop "bbot-agent-$tn@$i"; then + info "failed to stop bbot-agent-$tn@$i service, assuming not running" + continue + fi + info "stopped bbot-agent-$tn@$i service" + done # We may not be able to uninstall if we previously failed to build. # @@ -752,37 +841,45 @@ function bbot_start () # fi fi - # Build and install the bbot agent. Since other agents might already - # be running, limit the number of jobs to our slice. + # Build and install the bbot agent. Since other agents might already be + # running, limit the number of jobs to our slice. # - if ! bpkg --fetch-timeout "$timeout" \ - --build-option --jobs --build-option "$cpu_slice" \ + if ! bpkg --fetch-timeout "$timeout" \ + --build-option --jobs=$(($ti * $cpu_slice)) \ build --yes libbbot bbot; then - info "failed to build bbot-agent@$tn" + info "failed to build bbot-agent for $tn" break fi if ! bpkg install "${vars[@]}" bbot; then - info "failed to install bbot-agent@$tn" + info "failed to install bbot-agent for $tn" break fi - # Post-process and install systemd .service file. Note that we cannot use - # the systemd pattern machinery since each version of bbot can have its - # own version of the .service file. + # Post-process and install the systemd .service file. Since we may have + # multiple toolchains, we embed the toolchain name into the service name + # with the systemd pattern machinery used to run multiple bbot instances + # per toolchain. + # + # We assume `%I` is only used in Description and similar and rewrite it + # as `/%i` (e.g., `stage/1`). # sed -i -r \ - -e "s/%[iI]/$tn/g" \ - -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ + -e "s#%I#$tn/%I#g" \ -e "s/^(Environment=CPU)=.*/\1=$cpu_slice/" \ -e "s/^(Environment=RAM)=.*/\1=$ram_slice/" \ + -e "s/^(Environment=BRIDGE)=.*/\1=$tb/" \ + -e "s#^(Environment=AUTH_KEY)=.*#\1=/state/etc/host-key.pem#" \ -e "s/^(Environment=TOOLCHAIN_ID)=.*/\1=$ts/" \ - -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$ti/" \ + -e "s/^(Environment=TOOLCHAIN_NUM)=.*/\1=$tx/" \ -e "s/^(Environment=TOOLCHAIN_VER)=.*/\1=$tv/" \ + -e "s/^(Environment=TOOLCHAIN_NAME)=.*/\1=$tn/" \ + -e "s/^(Nice)=.*/\1=$tc/" \ + -e "s#^ExecStart=[^ ]+(.*)#ExecStart=$id/bin/bbot-agent\1#" \ "$id/lib/systemd/system/bbot-agent@.service" # Patch in the controller URLs. These can contain special characters - # like & so we have to escape them. + # like `&` so we have to escape them. # n="${tp}controller_url[@]" for i in "${!n}"; do @@ -801,8 +898,11 @@ function bbot_start () # "$id/lib/systemd/system/bbot-agent@.service" done - sudo ln -sf "$id/lib/systemd/system/bbot-agent@.service" \ - "/usr/lib/systemd/system/bbot-agent@$tn.service" + # Note: using a hard link to prevent systemd from being too clever and + # calling the service bbot-agent@. + # + sudo ln -f "$id/lib/systemd/system/bbot-agent@.service" \ + "/usr/lib/systemd/system/bbot-agent-$tn@.service" # Clean up any machine snapshots that might have been left behind. # @@ -818,25 +918,28 @@ function bbot_start () # print_diag 1>&2 if [ -n "$fail" ]; then - info "correct and start bbot-agent@$tn (systemctl start bbot-agent@$tn)" + info "correct and start bbot-agent for $tn (systemctl start bbot-agent-$tn@N)" break fi fi - # Start the service. With Type=simple start returns as soon as the process - # has forked. To see if the service actually started is done as part of - # service monitoring. + # Start each service instance. With Type=simple start returns as soon as + # the process has forked. Making sure the service has actually started is + # done as part of the service monitoring. # - if ! sudo systemctl start "bbot-agent@$tn"; then - info "failed to start bbot-agent@$tn service" - break - fi - r=0 + for ((i=1; i <= ti; i++)); do + if ! sudo systemctl start "bbot-agent-$tn@$i"; then + info "failed to start bbot-agent-$tn@$i service instance" + r=1 + break + fi + done + break done - cd "$owd" + cd "$owd" return "$r" } @@ -855,6 +958,27 @@ while true; do count=$(($count + 1)) + # Check for OS changes. Do this first in case of any issues in the following + # checks. + # + if [ -n "$buildid_url" ]; then + # Fetch the current id. While normally it will be a TFTP URL, it could also + # be HTTP(S) so we configure sensible behavior for that. + # + if id="$("${curl[@]}" "$buildid_url")"; then + if [ "$id" != "$buildid" ]; then + email "rebooting because of new os build" <&1 | tee "$tr/toolchain-$count.log" 1>&2 @@ -917,15 +1041,15 @@ EOF tv="$(cat $tr/version-full)" declare "${tp}toolchain_fver=$tv" - s="bootstrapped '$tn' toolchain $tv" + s="bootstrapped $tn toolchain $tv" toolchain_boots+=("$tn") ;; 1) - s="skipping disabled '$tn' toolchain, waiting for new version" + s="skipping disabled $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; *) - s="failed to bootstrap '$tn' toolchain, waiting for new version" + s="failed to bootstrap $tn toolchain, waiting for new version" toolchain_boots+=("") # Skip. ;; esac @@ -946,10 +1070,10 @@ EOF # if [ "${#toolchain_names[@]}" -eq "${#toolchain_boots[@]}" ]; then - ti=0 # Toolchain index. + tx=0 # Toolchain index. for tn in "${toolchain_boots[@]}"; do - ti=$(($ti + 1)) + tx=$(($tx + 1)) # Skip those that failed to bootstrap. # @@ -958,6 +1082,7 @@ EOF fi tp="${toolchains["$tn"]}" + ti="$(toolchain_value "$tp" instances)" tr="$(toolchain_value "$tp" toolchain_root)" # Or those that have no controllers (maybe it would have been better @@ -969,171 +1094,160 @@ EOF fi s= - bbot_check "$tn" 2>&1 | tee "$tr/bbot-$count.log" 1>&2 + bbot_check "$tn" 2>&1 | tee "$tr/bbot-agent-$count.log" 1>&2 case "${PIPESTATUS[0]}" in 0) - rm -f "$tr/bbot-$count.log" + rm -f "$tr/bbot-agent-$count.log" - # Check if the service has failed. + # For each service instance check if it has failed. # - if sudo systemctl is-failed --quiet "bbot-agent@$tn"; then - s="bbot-agent@$tn service has failed, stopping" - - # Note: ignore errors. - # - sudo systemctl status "bbot-agent@$tn" 2>&1 | \ - tee "$tr/bbot-$count.log" 1>&2 - - # Reset it so that we don't keep sending the log on each - # iteration. Note: ignore errors. - # - sudo systemctl reset-failed "bbot-agent@$tn" 2>&1 | \ - tee -a "$tr/bbot-$count.log" 1>&2 - else - # See if there is any diagnostics in the systemd journal. We - # notify about warning and up. - # - # The old versions journalctl behavior is to not output anything - # (not even the cursor) if there are no new entries. The new - # versions output the old cursor. - # - # Plus, it sometimes changes the cursor even without any errors in - # it (journal rewind/truncation maybe?) so we have to detect that. - # - c=(sudo journalctl --no-pager --quiet --output short-full \ - --unit "bbot-agent@$tn") - - # Get the last cursor if any. - # - oc="${toolchain_cursors["$tn"]}" - if [ -n "$oc" ]; then - c+=("--after-cursor" "$oc") - fi + for ((i=1; i <= ti; i++)); do + + if sudo systemctl is-failed --quiet "bbot-agent-$tn@$i"; then + s="bbot-agent-$tn@$i service has failed, stopping" + + # Note: ignore errors. + # + sudo systemctl status "bbot-agent-$tn@$i" 2>&1 | \ + tee "$tr/bbot-agent-$i-$count.log" 1>&2 + + # Reset it so that we don't keep sending the log on each + # iteration. Note: ignore errors. + # + sudo systemctl reset-failed "bbot-agent-$tn@$i" 2>&1 | \ + tee -a "$tr/bbot-agent-$i-$count.log" 1>&2 + + info "$s" + email "$s" <&1 | tee -a "$tr/bbot-$count.log" 1>&2 + bbot_start "$tn" "$tx" 2>&1 | tee -a "$tr/bbot-agent-$count.log" 1>&2 if [ "${PIPESTATUS[0]}" -eq 0 ]; then - s="${s}started bbot-agent@$tn" + s="${s}started bbot-agent for $tn, $ti instances" else - s="failed to ${s}start bbot-agent@$tn, waiting for new version" + s="failed to ${s}start bbot-agent for $tn, waiting for new version" fi ;; *) - s="failed to fetch package information for '$tn' toolchain, will try again" + s="failed to fetch package information for $tn toolchain, will try again" ;; esac info "$s" email "$s" <