Skip to content

Commit

Permalink
Emit error codes on execution status
Browse files Browse the repository at this point in the history
  • Loading branch information
iranzo committed May 3, 2021
1 parent 7cc3eca commit c430a09
Show file tree
Hide file tree
Showing 32 changed files with 238 additions and 9 deletions.
10 changes: 8 additions & 2 deletions checks/alertmanager
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
#!/usr/bin/env bash

# https://access.redhat.com/solutions/4250221
# kb: https://access.redhat.com/solutions/4250221

[ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils")

# Report warning/critical alerts returned by Alertmanager's /api/v1/alerts.
# Exit status: ${OCERROR} when matching alerts exist, ${OCOK} when none are
# returned, ${OCSKIP} when routes in openshift-monitoring cannot be read.
if oc auth can-i get routes -n openshift-monitoring >/dev/null 2>&1; then
  # Host name of the alertmanager-main route, used to build the API URL.
  alert_url=$(oc -n openshift-monitoring get routes/alertmanager-main -o json | jq -r .spec.host)
  # Query the alerts API with the prometheus-k8s service account token and
  # keep only alerts whose severity is "warning" or "critical".
  # The URL is quoted so the host value is never word-split or globbed.
  alerts=$(curl -s -k -H "Authorization: Bearer $(oc -n openshift-monitoring sa get-token prometheus-k8s)" "https://${alert_url}/api/v1/alerts" | jq '.data[] | {alert:.labels.alertname, severity:.labels.severity, namespace:.labels.namespace, instance:.labels.instance, message:.annotations.message} | select((.severity == "warning") or (.severity == "critical"))')
  if [[ -n ${alerts} ]]; then
    # Render the alerts as an aligned table, one alert per row.
    ALERTS=$(echo "${alerts}" | jq -r '. | "\(.severity)\t\(.alert)\t\(.namespace)\t\(.instance)\t\(.message)"' | column -t -s $'\t' -N "SEVERITY,ALERT,NAMESPACE,INSTANCE,MESSAGE")
    msg "Alerts currently firing:\n${RED}${ALERTS}${NOCOLOR}\n"
    errors=$(("${errors}" + 1))
    # Persist the error count only when the caller supplied ERRORFILE; the
    # redirect target is quoted so paths with spaces do not break it.
    if [ -n "${ERRORFILE}" ]; then
      echo "${errors}" >"${ERRORFILE}"
    fi
    exit ${OCERROR}
  fi
  exit ${OCOK}
else
  msg "Couldn't get routes, check permissions"
  exit ${OCSKIP}
fi
exit ${OCUNKOWN}
14 changes: 14 additions & 0 deletions checks/bz1948052
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env bash
# long_name: Checks for BZ 1948052
# description: Checks for BZ 1948052 based on kernel version
# bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1948052
# priority: 600

[ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils")

Expand All @@ -12,6 +16,16 @@ if oc auth can-i get nodes >/dev/null 2>&1; then
errors=$(("${errors}" + 1))
fi
done
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/chronyc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ if oc auth can-i debug node >/dev/null 2>&1; then
fi
fi
done
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/clusterversion_errors
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ if oc auth can-i get clusterversion >/dev/null 2>&1; then
msg "Clusterversion error status message: ${RED}${final}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get clusterversion, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/csr
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ if oc auth can-i get csr >/dev/null 2>&1; then
msg "Pending CSRs (${pending_csr}): ${PCSR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get csr, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/ctrlnodes
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ if oc auth can-i get nodes >/dev/null 2>&1; then
msg "Controllers ${RED}Scheduable${NOCOLOR}: ${SCHEDCTRL}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/entropy
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ if oc auth can-i debug node >/dev/null 2>&1; then
fi
fi
done
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
13 changes: 12 additions & 1 deletion checks/iptables-22623-22624
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# https://access.redhat.com/solutions/5709711
# kb: https://access.redhat.com/solutions/5709711
#
# To check if the rule exist, we use iptables -C, it returns 0 if the rule exist
# and if it doesn't exist, it exits 1 with the following message:
Expand Down Expand Up @@ -36,6 +36,17 @@ if oc auth can-i debug node >/dev/null 2>&1; then
errors=$(("${errors}" + 1))
fi
done
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi

else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
11 changes: 11 additions & 0 deletions checks/mcp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ if oc auth can-i get mcp >/dev/null 2>&1; then
# "mcp" is the short name for MachineConfigPools, so the message names pools
# rather than the non-existent "MachineConfigProfiles".
msg "MachineConfigPools in Degraded State: ${RED}${DEGRADED}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi

else
msg "Couldn't get mcp, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
11 changes: 11 additions & 0 deletions checks/nodes
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ if oc auth can-i get nodes >/dev/null 2>&1; then
# The colour reset must be written ${NOCOLOR}; the earlier {$NOCOLOR} form
# printed literal braces around the reset sequence.
msg "Nodes ${RED}Disabled${NOCOLOR}: ${NODESDISABLED}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi

else
msg "Couldn't get nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
11 changes: 11 additions & 0 deletions checks/notrunningpods
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then
msg "Pods not running ($(echo "${PODS}" | wc -l)):\n${HEADER}\n${RED}${PODS}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi

else
msg "Couldn't get all pods, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
11 changes: 11 additions & 0 deletions checks/operators
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ if oc auth can-i get co >/dev/null 2>&1; then
msg "Operators in Bad State (${bad_operators}):\n${RED}${BADOPS}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi

else
msg "Couldn't get co, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
8 changes: 8 additions & 0 deletions checks/port-thrasing
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ NAMESPACE="openshift-ovn-kubernetes"

if [[ $(oc get network/cluster -o jsonpath={.spec.networkType}) != "OVNKubernetes" ]]; then
msg "This check only works for OVNKubernetes SDN"
exit ${OCSKIP}
else
if oc auth can-i get pods -n ${NAMESPACE} >/dev/null 2>&1; then
if oc auth can-i get pods --subresource=log -n ${NAMESPACE} >/dev/null 2>&1; then
Expand All @@ -16,11 +17,18 @@ else
msg "${RED}${pod} port thrasing errors detected${NOCOLOR}"
errors=$(("${errors}" + 1))
fi

done
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Report the outcome explicitly, as the other checks do. Without this the
# successful path fell through to the final "unknown" exit at the end of the
# script. An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get pods logs, check permissions"
exit ${OCSKIP}
fi
else
msg "Couldn't get pods, check permissions"
exit ${OCSKIP}
fi
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/restarts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then
msg "Pods that have a high restart count (> $RESTART_THRESHOLD):\n${RED}${RESTARTS}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get all pods, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
10 changes: 10 additions & 0 deletions checks/terminating
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then
msg "Pods in Terminating state ($terminating_pods):\n${RED}${TERMPODS}${NOCOLOR}"
errors=$(("${errors}" + 1))
fi
# Persist the error count when the caller supplied ERRORFILE.
if [ -n "${ERRORFILE}" ]; then
  echo "${errors}" >"${ERRORFILE}"
fi
# Compare the VALUE of ${errors}: the previous test used the literal word
# "errors", which never equals "0", so the check always exited with OCERROR.
# An unset or empty counter is treated as 0.
if [ "${errors:-0}" != "0" ]; then
  exit ${OCERROR}
else
  exit ${OCOK}
fi
else
msg "Couldn't get all pods, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/00-clusterversion
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

# Print the cluster version. Without permission to read clusterversion the
# script reports that and exits with ${OCSKIP}; otherwise it exits ${OCINFO}.
if ! oc auth can-i get clusterversion >/dev/null 2>&1; then
  msg "Couldn't get clusterversion, check permissions"
  exit ${OCSKIP}
fi
msg "Cluster version:\n$(oc get clusterversion/version)"
exit ${OCINFO}
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/01-clusteroperators
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

# Print the cluster operators listing. Without permission to read
# clusteroperators the script reports that and exits with ${OCSKIP};
# otherwise it exits ${OCINFO}.
if ! oc auth can-i get clusteroperators >/dev/null 2>&1; then
  msg "Couldn't get clusteroperators, check permissions"
  exit ${OCSKIP}
fi
msg "Cluster operators:\n$(oc get clusteroperators)"
exit ${OCINFO}
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/02-nodes
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ if oc auth can-i get nodes -A >/dev/null 2>&1; then
msg "Workers: $(oc get nodes -o name --no-headers --selector='node-role.kubernetes.io/worker' | wc -l)"
msg "Others: $(oc get nodes -o name --no-headers --selector='!node-role.kubernetes.io/worker,!node-role.kubernetes.io/master' | wc -l)"
msg "Total nodes: $(oc get nodes -o name --no-headers | wc -l)"
exit ${OCINFO}
else
msg "Couldn't get nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/03-pods
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

# Print the number of pods across all namespaces. Without permission to list
# pods the script reports that and exits with ${OCSKIP}; otherwise ${OCINFO}.
if ! oc auth can-i get pods -A >/dev/null 2>&1; then
  msg "Couldn't get pods, check permissions"
  exit ${OCSKIP}
fi
msg "Total pods: $(oc get pods -A --no-headers | wc -l)"
exit ${OCINFO}
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/biosversion
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ if oc auth can-i debug node >/dev/null 2>&1; then
fi
fi
done
exit ${OCINFO}
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/container-images-running
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Print the sorted, de-duplicated list of container images referenced by pods
# in all namespaces. Without permission to list pods the script reports that
# and exits with ${OCSKIP}; otherwise it exits ${OCINFO}.
if ! oc auth can-i get pods -A >/dev/null 2>&1; then
  msg "Couldn't get pods, check permissions"
  exit ${OCSKIP}
fi
# One image per line, taken from every container of every pod.
IMAGES=$(oc get pods -A -o go-template --template='{{range .items}}{{range .spec.containers}}{{printf "%s\n" .image -}} {{end}}{{end}}' | sort -u)
msg "Images:\n${IMAGES}"
exit ${OCINFO}
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/container-images-stored
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ if oc auth can-i debug node >/dev/null 2>&1; then
for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do
oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "crictl images -o json" 2>/dev/null | jq -r .images[].repoTags[]
done | sort -u
exit ${OCINFO}
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/ethtool-firmware-version
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ if oc auth can-i debug node >/dev/null 2>&1; then
fi
fi
done
exit ${OCINFO}
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/intel-firmware-version
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ if oc auth can-i debug node >/dev/null 2>&1; then
fi
fi
done
exit ${OCINFO}
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
3 changes: 3 additions & 0 deletions info/mellanox-firmware-version
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ if oc auth can-i debug node >/dev/null 2>&1; then
msg "Couldn't find Mellanox firmware version in ${node}"
fi
fi
done
# Exit only after the loop has finished: with the exit placed inside the loop
# body the script stopped after the first iteration.
exit ${OCINFO}
else
msg "Couldn't debug nodes, check permissions"
exit ${OCSKIP}
fi
exit ${OCUNKOWN}
Loading

0 comments on commit c430a09

Please sign in to comment.