From 45c9fbacc4b21efd54173bad11998ceffd175676 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 24 May 2017 15:51:50 +0200 Subject: Improve bbot diagnostics email --- buildos | 86 +++++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 23 deletions(-) (limited to 'buildos') diff --git a/buildos b/buildos index 78c16c2..0845ed1 100755 --- a/buildos +++ b/buildos @@ -976,6 +976,9 @@ EOF # (not even the cursor) if there are no new entries. The new # versions output the old cursor. # + # Plus, it sometimes changes the cursor even without any errors in + # it (journal rewind/truncation maybe?) so we have to detect that. + # c=(sudo journalctl --unit "bbot-agent@$tn") # Get the last cursor if any. @@ -985,36 +988,73 @@ EOF c+=("--after-cursor" "$oc") fi - # Get the log range: the first line is the date of the first error - # and the second line is the end cursor. + # Get the "log range": the first line is the date of the first + # error, the second line is the date of the last error, and the + # third line is the end cursor. It can also be just one line in + # which case it is the new cursor (that rewind stuff). + # + # Here is what's going on in that sed script: + # + # The first chunk matches the first line. We first put it into + # the hold space (in case that's the only line) and then extract + # and print the date. + # + # The second chunk matches the last line. We first handle the hold + # space which by now should contain the last error line and then + # the cursor. + # + # The last chunk matches every other line. We simply replace the + # hold space with the next line so that at the end we have the + # last line there. # lr="$("${c[@]}" --no-pager --quiet --output short-full \ ---priority 4 --show-cursor | \ -sed -n -re '1s/^... ([^ ]+ [^ ]+) .*$/\1/p;s/^-- cursor: (.+)$/\1/p')" - - nc="$(sed -n -e '2p' <<<"$lr")" +--priority 4 --show-cursor | sed -n -r \ +-e '1{h;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;t}' \ +-e '${x;s/^[MTWFS].. ([^ ]+ [^ ]+) .*$/\1/p;x;s/^-- cursor: (.+)$/\1/p;t}' \ +-e 'h')" + lc="$(wc -l <<<"$lr")" + nc="$(sed -n -e "${lc}p" <<<"$lr")" # If we have no new entries, then nothing to do. # - if [ -n "$nc" -a "$nc" != "$oc" ]; then + if [ "$nc" != "$oc" ]; then - # Try to get some context for the first error entry. This is - # unexpectedly hard in systemd. So we are going to get all the - # entries that start a minute ago. + # We may have no actual entries (cursor rewind). # - sd="$(sed -n -e '1p' <<<"$lr")" - sd="$(date '+%Y-%m-%d %H:%M:%S' -d "$sd +00:01")" - - s="bbot-agent@$tn service issued new diagnostics" - - info "$s" - { - echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}"; - echo; - echo; - "${c[@]}" --no-pager --quiet --output short-full \ - --since "$sd" --lines 100 - } | email "$s" + if [ "$lc" -ne 1 ]; then + + # Try to get some context before the first error and after the + # last. This is unexpectedly hard in systemd. + # + # This can be a lot of output which makes it hard to spot the + # error so we are going to print just the error summary first. + # Quite a mess, I agree. + # + sd="$(sed -n -e '1p' <<<"$lr")" + sd="$(date '+%s' -d "$sd")" # sec + sd="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($sd - 10))")" # -10sec + + ed="$(sed -n -e '2p' <<<"$lr")" + ed="$(date '+%s' -d "$ed")" # sec + ed="$(date '+%Y-%m-%d %H:%M:%S' -d "@$(($ed + 10))")" # +10sec + + s="bbot-agent@$tn service issued new diagnostics" + + info "$s" + { + echo "$tn.bbot_cmd: ssh build@$hname ${c[@]}"; + echo; + echo "summary:"; + echo; + "${c[@]}" --no-pager --quiet --output short-full \ + --priority 4 | head -n 200; + echo; + echo "context [$sd -- $ed]:"; + echo; + "${c[@]}" --no-pager --quiet --output short-full \ + --since "$sd" --until "$ed" | head -n 200 + } | email "$s" + fi toolchain_cursors["$tn"]="$nc" fi -- cgit v1.1