diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..7a34bf7 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +known_third_party = diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d8f644..6d75373 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,68 +1,102 @@ +--- fail_fast: true repos: -- repo: meta - hooks: - - id: check-useless-excludes -- repo: https://github.com/pre-commit/mirrors-prettier - rev: v2.2.1 - hooks: - - id: prettier - files: \.(css|js|md|markdown|json) -- repo: https://github.com/python/black - rev: 20.8b1 - hooks: - - id: black -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 - hooks: - - id: check-added-large-files - - id: check-ast - - id: check-case-conflict - - id: check-executables-have-shebangs - - id: check-json - - id: check-merge-conflict - - id: check-symlinks - - id: check-vcs-permalinks - - id: check-xml - - id: check-yaml + - hooks: + - id: commitizen + stages: + - commit-msg + repo: https://github.com/commitizen-tools/commitizen + rev: v2.35.0 + - hooks: + - id: check-useless-excludes + repo: meta + - hooks: + - files: \.(css|js|md|markdown|json) + id: prettier + repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0-alpha.3 + - hooks: + - id: seed-isort-config + repo: https://github.com/asottile/seed-isort-config + rev: v2.2.0 + - hooks: + - id: isort + repo: https://github.com/pre-commit/mirrors-isort + rev: v5.10.1 + - hooks: + - id: black + repo: https://github.com/python/black + rev: 22.10.0 + - hooks: + - id: check-added-large-files + - id: check-ast + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-json + - id: check-merge-conflict + - id: check-symlinks + - id: check-vcs-permalinks + - id: debug-statements + - id: check-xml + - args: + - --unsafe + id: check-yaml + - id: end-of-file-fixer + - id: forbid-new-submodules + - args: + - --branch + - gh-pages + id: no-commit-to-branch + - id: requirements-txt-fixer + - id: sort-simple-yaml + - id: trailing-whitespace + - id: mixed-line-ending + - id: detect-private-key + - id: check-byte-order-marker + - id: check-docstring-first + repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + - hooks: + - id: flake8 + repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + - hooks: + - additional_dependencies: + - mvdan.cc/sh/v3/cmd/shfmt@v3.1.1 args: - - --unsafe - - id: end-of-file-fixer - - id: fix-encoding-pragma - - id: forbid-new-submodules - - id: no-commit-to-branch - args: - - --branch - - gh-pages - - id: requirements-txt-fixer - - id: sort-simple-yaml - - id: trailing-whitespace -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.0 - hooks: - - id: flake8 -- repo: local - hooks: - - id: shfmt - name: shfmt - minimum_pre_commit_version: 2.4.0 - language: golang - additional_dependencies: - - mvdan.cc/sh/v3/cmd/shfmt@v3.1.1 + - -w + - -i + - "2" + - -s entry: shfmt - args: - - -w - - -i - - '2' + id: shfmt + language: golang + minimum_pre_commit_version: 2.4.0 + name: shfmt types: - - shell -- repo: https://github.com/asottile/blacken-docs - rev: v1.10.0 + - shell + repo: local + - hooks: + - id: blacken-docs + repo: https://github.com/asottile/blacken-docs + rev: v1.12.1 + + # - repo: https://github.com/asottile/pyupgrade + # rev: v2.38.0 + # hooks: + # - id: pyupgrade + # args: [--py39-plus] + + - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt + rev: 0.2.2 # or other specific tag hooks: - - id: blacken-docs -- repo: https://github.com/hcodes/yaspeller.git - rev: v7.0.0 + - id: yamlfmt + args: [--mapping, '2', --sequence, '4', --offset, '2', '--preserve-quotes'] + + + - repo: https://github.com/hcodes/yaspeller.git + rev: v8.0.1 hooks: - - id: yaspeller + - id: yaspeller types: - - markdown + - markdown diff --git a/README.md b/README.md index e4cb0fb..3e0c403 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,6 @@ Using default/api-foobar-example-com:6443/system:admin context > If your kubeconfig file doesn't have the proper permissions you may get the error "KUBECONFIG not set". > In that case verify that the kubeconfig file has read permissions for the user that is used inside the container or just `chmod o+r kubeconfig` in your host. - ### Build your own container You can build your own container with the included @@ -96,29 +95,29 @@ in the [info](./info), [checks](./checks) or [ssh](./ssh) directories. ### Checks -| Script | Description | -| ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| [alertmanager](checks/alertmanager) | Checks if there are warning or error alerts firing | -| [bz1948052](checks/bz1948052) | Checks if the node is using a kernel version affected by [BZ1948052](https://bugzilla.redhat.com/show_bug.cgi?id=1948052) | -| [chronyc](checks/chronyc) | Checks if the worker clocks are synced using chronyc | -| [clusterversion_errors](checks/clusterversion_errors) | Checks if there are clusterversion errors | -| [csr](checks/csr) | Checks if there are pending csr | -| [ctrlnodes](checks/ctrlnodes) | Checks if any controller nodes have had the NoSchedule taint removed | -| [entropy](checks/entropy) | Checks if the workers have enough entropy | -| [iptables-22623-22624](checks/iptables-22623-22624) | Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp | -| [mcp](checks/mcp) | Checks if there are degraded mcp | -| [mellanox-firmware-version](checks/mellanox-firmware-version) | Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version. | -| [nodes](checks/nodes) | Checks if there are not ready or not schedulable nodes | -| [notrunningpods](checks/notrunningpods) | Checks if there are not running pods | -| [operators](checks/operators) | Checks if there are operators in 'bad' state | -| [pdb](checks/pdb) | Checks if there are PodDisruptionBudgets with 0 disruptions allowed | -| [port-thrasing](checks/port-thrasing) | Checks if there are OVN pods thrasing | -| [pvc](checks/pvc) | Checks if there are persistent volume claims that are not bound | -| [restarts](checks/restarts) | Checks if there are pods restarted > `n` times (10 by default) | -| [sriov](checks/sriov) | Checks if the SR-IOV network state is synced | -| [terminating](checks/terminating) | Checks if there are pods terminating | -| [ovn-pods-memory-usage](checks/ovn-pods-memory-usage) | Checks if the memory usage of the OVN pods is under the LIMIT threshold | -| [zombies](checks/zombies) | Checks if more than 5 zombie processes exist on the hosts | +| Script | Description | +| ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| [alertmanager](checks/alertmanager) | Checks if there are warning or error alerts firing | +| [bz1948052](checks/bz1948052) | Checks if the node is using a kernel version affected by [BZ1948052](https://bugzilla.redhat.com/show_bug.cgi?id=1948052) | +| [chronyc](checks/chronyc) | Checks if the worker clocks are synced using chronyc | +| [clusterversion_errors](checks/clusterversion_errors) | Checks if there are clusterversion errors | +| [csr](checks/csr) | Checks if there are pending csr | +| [ctrlnodes](checks/ctrlnodes) | Checks if any controller nodes have had the NoSchedule taint removed | +| [entropy](checks/entropy) | Checks if the workers have enough entropy | +| [iptables-22623-22624](checks/iptables-22623-22624) | Checks if the nodes iptables rules are blocking 22623/tpc or 22624/tcp | +| [mcp](checks/mcp) | Checks if there are degraded mcp | +| [mellanox-firmware-version](checks/mellanox-firmware-version) | Checks if the nodes' Mellanox Connect-4 firmware version is below the recommended version. | +| [nodes](checks/nodes) | Checks if there are not ready or not schedulable nodes | +| [notrunningpods](checks/notrunningpods) | Checks if there are not running pods | +| [operators](checks/operators) | Checks if there are operators in 'bad' state | +| [pdb](checks/pdb) | Checks if there are PodDisruptionBudgets with 0 disruptions allowed | +| [port-thrasing](checks/port-thrasing) | Checks if there are OVN pods thrasing | +| [pvc](checks/pvc) | Checks if there are persistent volume claims that are not bound | +| [restarts](checks/restarts) | Checks if there are pods restarted > `n` times (10 by default) | +| [sriov](checks/sriov) | Checks if the SR-IOV network state is synced | +| [terminating](checks/terminating) | Checks if there are pods terminating | +| [ovn-pods-memory-usage](checks/ovn-pods-memory-usage) | Checks if the memory usage of the OVN pods is under the LIMIT threshold | +| [zombies](checks/zombies) | Checks if more than 5 zombie processes exist on the hosts | ### SSH Checks @@ -156,14 +155,14 @@ in the [info](./info), [checks](./checks) or [ssh](./ssh) directories. ### Environment variables -| Environment variable | Default value | Description | -| -------------------- | ---------------------------------------------------- | --------------------------------------------------------------------------------- | -| INTEL_IDS | 8086:158b | Intel device IDs to check for firmware. Can be overridden for non-supported NICs. | -| OCDEBUGIMAGE | registry.redhat.io/rhel8/support-tools:latest | Used by `oc debug`. | -| OSETOOLSIMAGE | registry.redhat.io/openshift4/ose-tools-rhel8:latest | Used by `oc debug` in [ethtool-firmware-version](info/ethtool-firmware-version) | -| RESTART_THRESHOLD | 10 | Used by the [restarts](checks/restarts) script. | -| THRASING_THRESHOLD | 10 | Used by the [port-thrashing](checks/port-thrashing) script. | -| PARALLELJOBS | 1 | By default, all the `oc debug` commands run in a serial fashion, unless this variable is set >1 | +| Environment variable | Default value | Description | +| -------------------- | ---------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| INTEL_IDS | 8086:158b | Intel device IDs to check for firmware. Can be overridden for non-supported NICs. | +| OCDEBUGIMAGE | registry.redhat.io/rhel8/support-tools:latest | Used by `oc debug`. | +| OSETOOLSIMAGE | registry.redhat.io/openshift4/ose-tools-rhel8:latest | Used by `oc debug` in [ethtool-firmware-version](info/ethtool-firmware-version) | +| RESTART_THRESHOLD | 10 | Used by the [restarts](checks/restarts) script. | +| THRASING_THRESHOLD | 10 | Used by the [port-thrashing](checks/port-thrashing) script. | +| PARALLELJOBS | 1 | By default, all the `oc debug` commands run in a serial fashion, unless this variable is set >1 | | OVN_MEMORY_LIMIT | 5000 | Used by the [ovn-pods-memory-usage](checks/ovn-pods-memory-usage) script to set the maximum memory LIMIT (in Mi) to trigger the warning. | ### About firmware version diff --git a/checks/bz1948052 b/checks/bz1948052 index 5fbaf91..6b634bb 100755 --- a/checks/bz1948052 +++ b/checks/bz1948052 @@ -21,7 +21,7 @@ if oc auth can-i get nodes >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/chronyc b/checks/chronyc index 58b7594..1187343 100755 --- a/checks/chronyc +++ b/checks/chronyc @@ -3,14 +3,15 @@ [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") tmperrorfile=$(mktemp) -echo 0 > $tmperrorfile +echo 0 >$tmperrorfile if oc auth can-i debug node >/dev/null 2>&1; then msg "Collecting NTP data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( # shellcheck disable=2016 if ! SOURCES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'chronyc activity' 2>/dev/null | awk '/sources online/ { print $1 }'); then @@ -18,7 +19,7 @@ if oc auth can-i debug node >/dev/null 2>&1; then else if [ -n "${SOURCES}" ] && [ "${SOURCES}" -lt 1 ]; then msg "${RED}Clock doesn't seem to be synced in ${node}${NOCOLOR}" - echo 1 > $tmperrorfile + echo 1 >$tmperrorfile fi fi ) & diff --git a/checks/clusterversion_errors b/checks/clusterversion_errors index a4220d3..a57b747 100755 --- a/checks/clusterversion_errors +++ b/checks/clusterversion_errors @@ -23,7 +23,7 @@ if oc auth can-i get clusterversion >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/csr b/checks/csr index 43df68e..384540a 100755 --- a/checks/csr +++ b/checks/csr @@ -15,7 +15,7 @@ if oc auth can-i get csr >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/ctrlnodes b/checks/ctrlnodes index f0814d9..8beb38c 100755 --- a/checks/ctrlnodes +++ b/checks/ctrlnodes @@ -18,7 +18,7 @@ if oc auth can-i get nodes >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/entropy b/checks/entropy index ea7ecaf..0f57268 100755 --- a/checks/entropy +++ b/checks/entropy @@ -3,21 +3,22 @@ [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") tmperrorfile=$(mktemp) -echo 0 > $tmperrorfile +echo 0 >$tmperrorfile if oc auth can-i debug node >/dev/null 2>&1; then msg "Collecting entropy data... (${BLUE}using oc debug, it can take a while${NOCOLOR})" # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( if ! ENTROPY=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'cat /proc/sys/kernel/random/entropy_avail' 2>/dev/null); then msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" else if [ -n "${ENTROPY}" ] && [ "${ENTROPY}" -lt 200 ]; then msg "${RED}Low entropy in ${node}${NOCOLOR}" - echo 1 > $tmperrorfile + echo 1 >$tmperrorfile fi fi ) & diff --git a/checks/iptables-22623-22624 b/checks/iptables-22623-22624 index 81ced69..d8d7a7a 100755 --- a/checks/iptables-22623-22624 +++ b/checks/iptables-22623-22624 @@ -12,14 +12,15 @@ [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") tmperrorfile=$(mktemp) -echo 0 > $tmperrorfile +echo 0 >$tmperrorfile if oc auth can-i debug node >/dev/null 2>&1; then msg "Checking if ports 22623/tcp and 22624/tcp are blocked (${BLUE}using oc debug, it can take a while${NOCOLOR})" # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( # shellcheck disable=2016 OUTPUT=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c \ @@ -40,7 +41,7 @@ if oc auth can-i debug node >/dev/null 2>&1; then msg "${ORANGE}Unable to create debug pod in ${node}${NOCOLOR}" else msg "${RED}iptables rules for 22623/tcp or 22624/tcp found in ${node}${NOCOLOR}" - echo 1 > $tmperrorfile + echo 1 >$tmperrorfile fi ) & done diff --git a/checks/mcp b/checks/mcp index 53b796e..33dd1e0 100755 --- a/checks/mcp +++ b/checks/mcp @@ -15,7 +15,7 @@ if oc auth can-i get mcp >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/mellanox-firmware-version b/checks/mellanox-firmware-version index 0574a0f..f76fc30 100755 --- a/checks/mellanox-firmware-version +++ b/checks/mellanox-firmware-version @@ -15,7 +15,7 @@ MIN_VERS=16.28 [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") if oc auth can-i debug node >/dev/null 2>&1; then msg "Checking Mellanox firmware version (${BLUE}using oc debug, it can take a while${NOCOLOR})" - fw_errors=0 + fw_errors=0 # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # shellcheck disable=SC1083 @@ -27,7 +27,7 @@ if oc auth can-i debug node >/dev/null 2>&1; then dev=$(echo ${result} | awk -F, '{print $1}') fw=$(echo ${result} | awk -F, '{print $2}' | awk -F. '{print $1"."$2}') if [[ $(expr ${fw} \< ${MIN_VERS}) -eq 1 ]]; then - msg "Firmware for Mellanox card ${RED}${dev}${NOCOLOR} on ${RED}${node}${NOCOLOR} is below the minimum recommended version. Please upgrade to at least ${GREEN}${MIN_VERS}${NOCOLOR}." + msg "Firmware for Mellanox card ${RED}${dev}${NOCOLOR} on ${RED}${node}${NOCOLOR} is below the minimum recommended version. Please upgrade to at least ${GREEN}${MIN_VERS}${NOCOLOR}." errors=$(("${errors}" + 1)) fw_errors=$(("${fw_errors}" + 1)) if [ ! -z "${ERRORFILE}" ]; then diff --git a/checks/nodes b/checks/nodes index 98a9941..58249ec 100755 --- a/checks/nodes +++ b/checks/nodes @@ -28,7 +28,7 @@ if oc auth can-i get nodes >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/notrunningpods b/checks/notrunningpods index c744d2a..e1636ba 100755 --- a/checks/notrunningpods +++ b/checks/notrunningpods @@ -17,7 +17,7 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/operators b/checks/operators index 18d2564..2a86393 100755 --- a/checks/operators +++ b/checks/operators @@ -21,7 +21,7 @@ if oc auth can-i get co >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/ovn-pods-memory-usage b/checks/ovn-pods-memory-usage index f604dd9..7aa97c2 100755 --- a/checks/ovn-pods-memory-usage +++ b/checks/ovn-pods-memory-usage @@ -7,21 +7,21 @@ error=false if oc auth can-i adm top -A >/dev/null 2>&1; then LIMIT="${OVN_MEMORY_LIMIT:=5000}" FLAG=0 - pods_memory_usage=$(oc adm top pod -n openshift-ovn-kubernetes -l app=ovnkube-node --no-headers| awk '{ print $1 " " $3 }' | awk '{$2 = substr($2,0,length($2)-2)} 1' ) + pods_memory_usage=$(oc adm top pod -n openshift-ovn-kubernetes -l app=ovnkube-node --no-headers | awk '{ print $1 " " $3 }' | awk '{$2 = substr($2,0,length($2)-2)} 1') MESSAGE="" OLDIFS=${IFS} IFS=$'\n' for pod_line in ${pods_memory_usage}; do - pod_name=$(echo $pod_line | awk '{ print $1 }') - pod_size=$(echo $pod_line | awk '{ print $2 }') - if [[ ${pod_size} -ge ${LIMIT} ]]; then - MESSAGE="${MESSAGE}The OVN pod memory usage for ${pod_name} is extremely high: ${RED}${pod_size}${NOCOLOR}Mi\n" - FLAG=1 - fi + pod_name=$(echo $pod_line | awk '{ print $1 }') + pod_size=$(echo $pod_line | awk '{ print $2 }') + if [[ ${pod_size} -ge ${LIMIT} ]]; then + MESSAGE="${MESSAGE}The OVN pod memory usage for ${pod_name} is extremely high: ${RED}${pod_size}${NOCOLOR}Mi\n" + FLAG=1 + fi done IFS=${OLDIFS} - + if [[ ${FLAG} -ne 0 ]]; then MESSAGE="${MESSAGE}For more information you can check the KCS https://access.redhat.com/solutions/6493321\n" msg "${MESSAGE}" @@ -33,7 +33,7 @@ if oc auth can-i adm top -A >/dev/null 2>&1; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/pdb b/checks/pdb index 07035ee..ab68662 100755 --- a/checks/pdb +++ b/checks/pdb @@ -12,13 +12,13 @@ if oc auth can-i get pdb >/dev/null 2>&1; then if [[ -n $wrong_pdb ]]; then DEGRADED=$(echo "${wrong_pdb}" | jq .) msg "PodDisruptionBudget with 0 disruptions allowed: ${RED}${DEGRADED}${NOCOLOR}" - errors=$((${errors} + 1)) + errors=$((errors + 1)) error=true fi if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/port-thrasing b/checks/port-thrasing index cb4d381..c8d56e0 100755 --- a/checks/port-thrasing +++ b/checks/port-thrasing @@ -25,7 +25,7 @@ else if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/pvc b/checks/pvc index e64da18..f281808 100755 --- a/checks/pvc +++ b/checks/pvc @@ -15,7 +15,7 @@ if oc auth can-i get pvc -A >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/restarts b/checks/restarts index abc2153..2e17a7e 100755 --- a/checks/restarts +++ b/checks/restarts @@ -15,7 +15,7 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/sriov b/checks/sriov index 0aff541..4b90b72 100755 --- a/checks/sriov +++ b/checks/sriov @@ -4,7 +4,7 @@ error=false -oc get subs sriov-network-operator-subscription -n openshift-sriov-network-operator &> /dev/null +oc get subs sriov-network-operator-subscription -n openshift-sriov-network-operator &>/dev/null if [ $? -ne 0 ]; then # SR-IOV operator is not installed exit ${OCSKIP} @@ -21,7 +21,7 @@ if oc auth can-i get SriovNetworkNodeState >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/terminating b/checks/terminating index efbf3a7..c908524 100755 --- a/checks/terminating +++ b/checks/terminating @@ -15,7 +15,7 @@ if oc auth can-i get pods -A >/dev/null 2>&1; then if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/checks/zombies b/checks/zombies index b7b5d11..9d955b7 100755 --- a/checks/zombies +++ b/checks/zombies @@ -3,21 +3,22 @@ [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") tmperrorfile=$(mktemp) -echo 0 > $tmperrorfile +echo 0 >$tmperrorfile if oc auth can-i debug node >/dev/null 2>&1; then msg "Collecting zombie processes... (${BLUE}using oc debug, it can take a while${NOCOLOR})" # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( ZOMBIES=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c 'ps -ef | grep -c "[d]efunct"' 2>/dev/null) if [ -n "${ZOMBIES}" ] && [ "${ZOMBIES}" -gt 0 ]; then - msg "${ORANGE}${ZOMBIES}${NOCOLOR} zombie processes found in ${node}" - if [ "${ZOMBIES}" -ge 5 ]; then - echo 1 > $tmperrorfile - fi + msg "${ORANGE}${ZOMBIES}${NOCOLOR} zombie processes found in ${node}" + if [ "${ZOMBIES}" -ge 5 ]; then + echo 1 >$tmperrorfile + fi fi ) & done diff --git a/cronjob.yaml b/cronjob.yaml index 78b2e1f..282c750 100644 --- a/cronjob.yaml +++ b/cronjob.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Namespace metadata: @@ -18,9 +19,9 @@ roleRef: kind: ClusterRole name: cluster-admin subjects: -- kind: ServiceAccount - name: checks-openshift - namespace: checks-openshift + - kind: ServiceAccount + name: checks-openshift + namespace: checks-openshift --- apiVersion: batch/v1beta1 kind: CronJob @@ -35,23 +36,19 @@ spec: template: spec: tolerations: - - effect: NoSchedule - key: node-role.kubernetes.io/master - operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists affinity: {} containers: - - name: checks-openshift - image: quay.io/rhsysdeseng/openshift-checks:latest - imagePullPolicy: IfNotPresent - command: [ - "/bin/sh", - "-c", - "/opt/openshift-checks/openshift-checks.sh", - ] - resources: - requests: - cpu: 100m - memory: 256Mi + - name: checks-openshift + image: quay.io/rhsysdeseng/openshift-checks:latest + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c", "/opt/openshift-checks/openshift-checks.sh"] + resources: + requests: + cpu: 100m + memory: 256Mi serviceAccountName: checks-openshift restartPolicy: Never terminationGracePeriodSeconds: 30 diff --git a/info/biosversion b/info/biosversion index c13afc3..3702fc7 100755 --- a/info/biosversion +++ b/info/biosversion @@ -8,7 +8,8 @@ if oc auth can-i debug node >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( if ! BIOSVER=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "cat /sys/class/dmi/id/bios_version" 2>/dev/null); then msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" diff --git a/info/bmh-machine-node b/info/bmh-machine-node index f1cadf7..1e9c681 100755 --- a/info/bmh-machine-node +++ b/info/bmh-machine-node @@ -7,12 +7,12 @@ NS="openshift-machine-api" if oc auth can-i get nodes -A >/dev/null 2>&1; then if oc auth can-i get bmh -n ${NS} >/dev/null 2>&1; then if oc auth can-i get machines -n ${NS} >/dev/null 2>&1; then - for bmh in $(oc get bmh -n openshift-machine-api -o jsonpath='{.items[*].metadata.name}'); do - MACHINE=$(oc get -n openshift-machine-api bmh/${bmh} -o jsonpath='{.spec.consumerRef.name}') - NODE=$(oc get -n openshift-machine-api machine/${MACHINE} -o jsonpath='{.status.nodeRef.name}') - msg "Node ${NODE} => Machine: ${MACHINE}, BMH: ${bmh}" - done - exit ${OCINFO} + for bmh in $(oc get bmh -n openshift-machine-api -o jsonpath='{.items[*].metadata.name}'); do + MACHINE=$(oc get -n openshift-machine-api bmh/${bmh} -o jsonpath='{.spec.consumerRef.name}') + NODE=$(oc get -n openshift-machine-api machine/${MACHINE} -o jsonpath='{.status.nodeRef.name}') + msg "Node ${NODE} => Machine: ${MACHINE}, BMH: ${bmh}" + done + exit ${OCINFO} else msg "Couldn't get machines, check permissions" exit ${OCSKIP} diff --git a/info/container-images-stored b/info/container-images-stored index 5321765..44e37ac 100755 --- a/info/container-images-stored +++ b/info/container-images-stored @@ -7,7 +7,8 @@ if oc auth can-i debug node >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "crictl images -o json" 2>/dev/null | jq -r .images[].repoTags[] ) & diff --git a/info/ethtool-firmware-version b/info/ethtool-firmware-version index 2f37148..3b79c92 100755 --- a/info/ethtool-firmware-version +++ b/info/ethtool-firmware-version @@ -7,7 +7,8 @@ if oc auth can-i debug node >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( if ! FIRMWAREVERS=$(oc debug --image="${OSETOOLSIMAGE}" "${node}" -- sh -c "for interface in \$(ls -d /sys/class/net/*/device | cut -d/ -f5); do echo -n \"\${interface} => \"; ethtool -i \${interface} | awk '/firmware-version/ { print substr(\$0, index(\$0,\$2)) }';done" 2>/dev/null); then msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" diff --git a/info/intel-firmware-version b/info/intel-firmware-version index e2952b8..d4485a7 100755 --- a/info/intel-firmware-version +++ b/info/intel-firmware-version @@ -11,7 +11,8 @@ if oc auth can-i debug node >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( if ! FIRMWAREVERS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "for id in ${INTEL_IDS}; do for device in \$(lspci -D -d "\${id}" | awk '{ print \$1 }'); do echo -n \"\${device} => \"; lspci -vv -D -s "\${device}" | egrep \"\[V0\]\" | awk '{print \$NF}' ;done;done" 2>/dev/null); then msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" diff --git a/info/locks b/info/locks index 303243d..031a252 100755 --- a/info/locks +++ b/info/locks @@ -8,7 +8,7 @@ SCRIPT64=$(cat ./scripts/locks.sh | base64 -w 0) [ -z ${UTILSFILE} ] && source $(echo "$(dirname ${0})/../utils") if oc auth can-i debug node >/dev/null 2>&1; then msg "Checking for locks by pod, per node (${BLUE}using oc debug, it can take a while${NOCOLOR})" - fw_errors=0 + fw_errors=0 # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # shellcheck disable=SC1083 @@ -16,10 +16,10 @@ if oc auth can-i debug node >/dev/null 2>&1; then msg "${ORANGE}Error running oc debug in ${node}${NOCOLOR}" else if [ -n "${FILE_LOCKS}" ]; then - msg "File locks found on ${RED}${node}${NOCOLOR}" - for line in ${FILE_LOCKS}; do - echo $line - done + msg "File locks found on ${RED}${node}${NOCOLOR}" + for line in ${FILE_LOCKS}; do + echo $line + done else msg "Couldn't check for locks on ${node}" fi diff --git a/info/mellanox-firmware-version b/info/mellanox-firmware-version index 389417c..d066bb0 100755 --- a/info/mellanox-firmware-version +++ b/info/mellanox-firmware-version @@ -18,7 +18,8 @@ if oc auth can-i debug node >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( # shellcheck disable=SC1083 if ! FIRMWAREVERS=$(oc debug --image="${OCDEBUGIMAGE}" "${node}" -- chroot /host sh -c "for id in ${IDS}; do for device in \$(lspci -D -d "\${id}" | awk '{ print \$1 }'); do echo -n \"\${device} => \"; grep -aoP '(?<=FFV)[0-9,.]{8}' /sys/bus/pci/devices/\${device}/vpd;done;done" 2>/dev/null); then diff --git a/info/mtu b/info/mtu index fc485f3..72a6e80 100755 --- a/info/mtu +++ b/info/mtu @@ -29,7 +29,8 @@ if oc auth can-i get network/cluster >/dev/null 2>&1; then # shellcheck disable=SC2016 for node in $(oc get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if eq .type "Ready"}}{{if eq .status "True"}}node/{{$node.metadata.name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'); do # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait ( # Get all the information in a single debug to avoid rescheduling unneeded pods # then convert the output into an array for easily consumption diff --git a/openshift-checks.sh b/openshift-checks.sh index b50e5bd..bdfa6d0 100755 --- a/openshift-checks.sh +++ b/openshift-checks.sh @@ -92,13 +92,13 @@ main() { export errors=$(expr $(cat ${ERRORFILE}) + 0) # shellcheck disable=SC1090,SC1091 if [ "${RESULTS_ONLY}" -gt 0 ]; then - "${check}" &> /dev/null + "${check}" &>/dev/null case $? in - 0 | 1) msg "${check:2} ${GREEN}PASS${NOCOLOR}" ;; - 2) msg "${check:2} ${RED}FAIL${NOCOLOR}" ;; - 3) msg "${check:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; - 4) msg "${check:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; - *) msg "${check:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; + 0 | 1) msg "${check:2} ${GREEN}PASS${NOCOLOR}" ;; + 2) msg "${check:2} ${RED}FAIL${NOCOLOR}" ;; + 3) msg "${check:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; + 4) msg "${check:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; + *) msg "${check:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; esac else "${check}" @@ -113,13 +113,13 @@ main() { export errors=$(expr $(cat ${ERRORFILE}) + 0) # shellcheck disable=SC1090,SC1091 if [ "${RESULTS_ONLY}" -gt 0 ]; then - "${ssh}" &> /dev/null + "${ssh}" &>/dev/null case $? in - 0 | 1) msg "${ssh:2} ${GREEN}PASS${NOCOLOR}" ;; - 2) msg "${ssh:2} ${RED}FAIL${NOCOLOR}" ;; - 3) msg "${ssh:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; - 4) msg "${ssh:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; - *) msg "${ssh:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; + 0 | 1) msg "${ssh:2} ${GREEN}PASS${NOCOLOR}" ;; + 2) msg "${ssh:2} ${RED}FAIL${NOCOLOR}" ;; + 3) msg "${ssh:2} ${ORANGE}SKIPPED${NOCOLOR}" ;; + 4) msg "${ssh:2} ${YELLOW}UNKNOWN${NOCOLOR}" ;; + *) msg "${ssh:2} ${RED}UNKNOWN RETURN CODE${NOCOLOR}" ;; esac else "${ssh}" diff --git a/pre/dns-hostnames b/pre/dns-hostnames index 8bf3e26..1ea5036 100755 --- a/pre/dns-hostnames +++ b/pre/dns-hostnames @@ -65,7 +65,7 @@ fi if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi -if [[ "$error" == true ]]; then +if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK} diff --git a/scripts/README.md b/scripts/README.md index 70027f2..73ba2c9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,6 +1,5 @@ # openshift-check tools - A set of scripts to run basic checks on an OpenShift cluster. PRs welcome! > :warning: This is an unofficial tool, don't blame us if it breaks your cluster @@ -32,25 +31,30 @@ After the execution a logfile will be generated with the name ovn_cleanConntrack ### Examples Saving extra debug lines in the log file: + ```bash $ ./ovn_cleanConntrack.sh -d ``` + Single node execution: + ```bash $ ./ovn_cleanConntrack.sh -s my.node.com ``` For the -k parameter, the original behavior is still the same but if you want to analyse different clusters from the same bastion you can do it using the -k parameter to pass the kubeconfig file to the script, for example: + ```bash $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster1/auth/kubeconfig $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster2/auth/kubeconfig $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster3/auth/kubeconfig ``` -In the previous example the script will analyse the clusters indicated by the kubeconfig files on /home/kni/clusterconfigs/cluster1/kubeconfig, /home/kni/clusterconfigs/cluster2/kubeconfig and /home/kni/clusterconfigs/cluster3/kubeconfig +In the previous example the script will analyse the clusters indicated by the kubeconfig files on /home/kni/clusterconfigs/cluster1/kubeconfig, /home/kni/clusterconfigs/cluster2/kubeconfig and /home/kni/clusterconfigs/cluster3/kubeconfig If no -k is indicated the script expects to have the KUBECONFIG variable exported in the system otherwise it will give an error because it can't connect. For the -q parameter, instead of printing the output to the standard output now you can indicate the file were to save the output of the script, to cover the commented use case for running on batch mode: + ```bash $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster1/auth/kubeconfig -q /tmp/cluster1.output $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster2/auth/kubeconfig -q /tmp/cluster2.output @@ -59,8 +63,9 @@ $ ./ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster3/auth/kubeconfig - If no conntracks with issues are found the files /tmp/cluster?.output won't be created. If no -q is indicated, the script will print the results in the standard output. -Here is an example on how to configure a cronjob to run the script every hour (you can place it on /etc/cron.d/1conntracks). +Here is an example on how to configure a cronjob to run the script every hour (you can place it on /etc/cron.d/1conntracks). This example uses the parameters -k and -q indicating the kubeconfig and the file to save the output: + ```bash # Run hourly SHELL=/bin/bash @@ -70,13 +75,14 @@ MAILTO=root 10 * * * * kni /usr/local/bin/ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster2/auth/kubeconfig -q /tmp/ovnconntracks_cluster2.log 20 * * * * kni /usr/local/bin/ovn_cleanConntrack.sh -k /home/kni/clusterconfigs/cluster3/auth/kubeconfig -q /tmp/ovnconntracks_cluster3.log ``` + In that example the debug log is still being generated using the LOG var inside the script, but that is a debug log file in case we need to debug the script behaviour, and it can be modified according to bastion space and needs. ## recover-northd.sh ### Usage -```bash +````bash $ ./recover-northd.sh -h This script checks if northd is stuck and optionally intervene @@ -93,20 +99,21 @@ After the execution a logfile will be generated with the name recover-northd.DAT Saving extra debug lines in the log file: ```bash $ ./recover-northd.sh -d -``` +```` For the -k parameter, the original behavior is still the same but if you want to analyse different clusters from the same bastion you can do it using the -k parameter to pass the kubeconfig file to the script, for example: + ```bash $ ./recover-northd.sh -k /home/kni/clusterconfigs/cluster1/auth/kubeconfig $ ./recover-northd.sh -k /home/kni/clusterconfigs/cluster2/auth/kubeconfig $ ./recover-northd.sh -k /home/kni/clusterconfigs/cluster3/auth/kubeconfig ``` -In the previous example the script will analyse the clusters indicated by the kubeconfig files on /home/kni/clusterconfigs/cluster1/kubeconfig, /home/kni/clusterconfigs/cluster2/kubeconfig and /home/kni/clusterconfigs/cluster3/kubeconfig +In the previous example the script will analyse the clusters indicated by the kubeconfig files on /home/kni/clusterconfigs/cluster1/kubeconfig, /home/kni/clusterconfigs/cluster2/kubeconfig and /home/kni/clusterconfigs/cluster3/kubeconfig If no -k is indicated the script expects to have the KUBECONFIG variable exported in the system otherwise it will give an error because it can't connect. For the -r parameter, the script will send an exit to the northd container for OVN to elect a new leader: + ```bash $ ./recover-northd.sh -k /home/kni/clusterconfigs/cluster1/auth/kubeconfig -r ``` - diff --git a/scripts/locks.sh b/scripts/locks.sh index 1f4fcf5..aa14d09 100755 --- a/scripts/locks.sh +++ b/scripts/locks.sh @@ -5,8 +5,8 @@ ORIG_IFS=$IFS IFS=$(echo -en "\n\b") for line in $(sudo lslocks | egrep -v '(unknown)' | awk '{print $2}' | sort -nr | uniq -c | sort -nr | egrep -v 'unknown|-1' | grep -v PID); do - count=$(echo $line | awk '{print $1}'); - pid=$(echo $line | awk '{print $2}'); + count=$(echo $line | awk '{print $1}') + pid=$(echo $line | awk '{print $2}') orig_pid=$pid ppid=$(grep PPid /proc/${pid}/status | awk '{print $2}') while [[ $ppid -gt 1 ]]; do @@ -20,7 +20,7 @@ for line in $(sudo lslocks | egrep -v '(unknown)' | awk '{print $2}' | sort -nr ns=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $1}') pod=$(ps -hp $ppid -o cmd | grep conmon | awk '{print $9}' | awk -F/ '{print $5}' | awk -F_ '{print $2}') if [ ${ns_pods["${ns}/${pod}"]} ]; then - ns_pods["${ns}/${pod}"]=`expr ${ns_pods["${ns}/${pod}"]} + $count` + ns_pods["${ns}/${pod}"]=$(expr ${ns_pods["${ns}/${pod}"]} + $count) else ns_pods["${ns}/${pod}"]=$count fi @@ -28,6 +28,6 @@ for line in $(sudo lslocks | egrep -v '(unknown)' | awk '{print $2}' | sort -nr done for pod in "${!ns_pods[@]}"; do echo $pod ${ns_pods[$pod]} -done | sort -nr -k2 | column -t +done | sort -nr -k2 | column -t IFS=$ORIG_IFS diff --git a/scripts/ovn_cleanConntrack.sh b/scripts/ovn_cleanConntrack.sh index 5e85ed8..c4d6e21 100755 --- a/scripts/ovn_cleanConntrack.sh +++ b/scripts/ovn_cleanConntrack.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash ########################################################### # ovn_cleanConntrack.sh script to remove udp conntrack # # lines persistent in a cluster hitted by BZ 2043094 # @@ -23,210 +23,210 @@ OUTPUTLOG='' # usage(): prints the usage of the script ########################################################### function usage() { - echo "This script gives the potential list of commands to clean up wrong conntracks" - echo "It only supports UDP stale entries" - echo "It only considers clusterIP services" - echo "It only works on IPV4 single stack env" - echo "Assumes node subnet is the default /24 cidr" - echo "Assumes Cluster CIDR is /16" - echo "Checks for the Service CIDR to have one of the networks /8 /16 or /24" - echo -e - echo -e "\tUsage: $(basename "$0")" - echo -e "\tHelp: $(basename "$0") -h" - echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" - echo -e "\tLimit the execution to a single node: $(basename "$0") -n node" - echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" - echo -e "\tSet the mode to quiet and save the output to /tmp/output.file: $(basename "$0") -q /tmp/output.file" - echo -e - echo "After the execution a logfile will be generated with the name ovn_cleanConntrack.DATE.log" + echo "This script gives the potential list of commands to clean up wrong conntracks" + echo "It only supports UDP stale entries" + echo "It only considers clusterIP services" + echo "It only works on IPV4 single stack env" + echo "Assumes node subnet is the default /24 cidr" + echo "Assumes Cluster CIDR is /16" + echo "Checks for the Service CIDR to have one of the networks /8 /16 or /24" + echo -e + echo -e "\tUsage: $(basename "$0")" + echo -e "\tHelp: $(basename "$0") -h" + echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" + echo -e "\tLimit the execution to a single node: $(basename "$0") -n node" + echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" + echo -e "\tSet the mode to quiet and save the output to /tmp/output.file: $(basename "$0") -q /tmp/output.file" + echo -e + echo "After the execution a logfile will be generated with the name ovn_cleanConntrack.DATE.log" } ########################################################### # setup(): initializes some variables after setting up # the KUBECONFIG ########################################################### -function setup(){ - # ServiceNetwork of the cluster - svcnetwork=$(oc get network cluster -o jsonpath='{ .spec.serviceNetwork[] }') - # Clusternetwork of the cluster - clusternetwork=$(oc get network cluster -o jsonpath='{ .spec.clusterNetwork[].cidr }' | cut -d'/' -f1 | sed -e 's/.$/2/') +function setup() { + # ServiceNetwork of the cluster + svcnetwork=$(oc get network cluster -o jsonpath='{ .spec.serviceNetwork[] }') + # Clusternetwork of the cluster + clusternetwork=$(oc get network cluster -o jsonpath='{ .spec.clusterNetwork[].cidr }' | cut -d'/' -f1 | sed -e 's/.$/2/') } ########################################################### # getServices(): prepares a list of services -# NOTE: We only care about services of type clusterIP that +# NOTE: We only care about services of type clusterIP that # use UDP protocol ########################################################### -function getServices(){ - #filter by protocol=udp and only clusterips - if [[ -z "${OUTPUTLOG}" ]]; then - echo "# Collecting service info..." - fi - OLDIFS=$IFS - IFS=$'\n' - for line in $(oc get services -A -o jsonpath='{range .items[?(@.spec.type=="ClusterIP")]}{@.spec.ports[*].protocol}{";"}{@.spec.clusterIP}{";"}{@.spec.ports[*].port}{";"}{"\n"}{end}' | grep -v 'None' | grep UDP ); do - words=$(echo "${line}" | wc -w) - protos=$(echo "${line}" | cut -d';' -f1) - ip=$(echo "${line}" | cut -d';' -f2) - port1=$(echo "${line}" | cut -d';' -f3) - if [ "${words}" -gt 1 ]; then - ports=$(echo "${line}" | cut -d';' -f3) - cports=$(echo "${ports}" | wc -w) - while [ "${cports}" -gt 0 ]; do - port=$(echo "${ports}" | cut -d' ' -f"${cports}") - proto=$(echo "${protos}" | cut -d' ' -f"${cports}") - if [ "${proto}" = "UDP" ]; then - services="${services}\n${ip};${port}" - fi - cports=$((( cports - 1 ))) - done - else - if [ "${protos}" = "UDP" ]; then - services="\n${ip};${port1}" - fi - fi - done - IFS=$OLDIFS - echo -e "Services\n-----------------${services}" >> "${LOG}" +function getServices() { + #filter by protocol=udp and only clusterips + if [[ -z ${OUTPUTLOG} ]]; then + echo "# Collecting service info..." + fi + OLDIFS=$IFS + IFS=$'\n' + for line in $(oc get services -A -o jsonpath='{range .items[?(@.spec.type=="ClusterIP")]}{@.spec.ports[*].protocol}{";"}{@.spec.clusterIP}{";"}{@.spec.ports[*].port}{";"}{"\n"}{end}' | grep -v 'None' | grep UDP); do + words=$(echo "${line}" | wc -w) + protos=$(echo "${line}" | cut -d';' -f1) + ip=$(echo "${line}" | cut -d';' -f2) + port1=$(echo "${line}" | cut -d';' -f3) + if [ "${words}" -gt 1 ]; then + ports=$(echo "${line}" | cut -d';' -f3) + cports=$(echo "${ports}" | wc -w) + while [ "${cports}" -gt 0 ]; do + port=$(echo "${ports}" | cut -d' ' -f"${cports}") + proto=$(echo "${protos}" | cut -d' ' -f"${cports}") + if [ "${proto}" = "UDP" ]; then + services="${services}\n${ip};${port}" + fi + cports=$((cports - 1)) + done + else + if [ "${protos}" = "UDP" ]; then + services="\n${ip};${port1}" + fi + fi + done + IFS=$OLDIFS + echo -e "Services\n-----------------${services}" >>"${LOG}" } ########################################################### # getEndpoints(): prepares a list of endpoints ########################################################### -function getEndpoints(){ - if [[ -z "${OUTPUTLOG}" ]]; then - echo "# Collecting endpoints info..." - fi - endpoints="" - #filter by protocol=udp and only clusterips - OLDIFS=$IFS - IFS=$'\n' - for line in $(oc get endpoints -A -o jsonpath='{range .items[*].subsets[*]}{@.addresses[*].ip}{";"}{@.addresses[*].nodeName}{";"}{@.ports[*].port}{";"}{@.ports[*].protocol}{";"}{"\n"}{end}' | grep UDP ); do - ips=$(echo "${line}" | cut -d';' -f1) - cips=$(echo "${ips}" | wc -w) - nodes=$(echo "${line}" | cut -d';' -f2) - ports=$(echo "${line}" | cut -d';' -f3) - cports=$(echo "${ports}" | wc -w) - protocols=$(echo "${line}"| cut -d';' -f4) +function getEndpoints() { + if [[ -z ${OUTPUTLOG} ]]; then + echo "# Collecting endpoints info..." + fi + endpoints="" + #filter by protocol=udp and only clusterips + OLDIFS=$IFS + IFS=$'\n' + for line in $(oc get endpoints -A -o jsonpath='{range .items[*].subsets[*]}{@.addresses[*].ip}{";"}{@.addresses[*].nodeName}{";"}{@.ports[*].port}{";"}{@.ports[*].protocol}{";"}{"\n"}{end}' | grep UDP); do + ips=$(echo "${line}" | cut -d';' -f1) + cips=$(echo "${ips}" | wc -w) + nodes=$(echo "${line}" | cut -d';' -f2) + ports=$(echo "${line}" | cut -d';' -f3) + cports=$(echo "${ports}" | wc -w) + protocols=$(echo "${line}" | cut -d';' -f4) - if [ "${cips}" -gt 1 ]; then - #ep multiple ip multiple ports - if [ "${cports}" -gt 1 ]; then - count=1 - while [ ${count} -le "${cips}" ]; do - ip=$(echo "${ips}" | cut -d' ' -f"${count}") - countports=1 - node=$(echo "${nodes}" | cut -d' ' -f"${count}") - while [ ${countports} -le "${cports}" ]; do - port=$(echo "${ports}" | cut -d' ' -f${countports}) - protocol=$(echo "${protocols}" | cut -d' ' -f${countports}) - if [ "${protocol}" = "UDP" ]; then - endpoints="${endpoints}\n${ip};${node};${port}" - fi - countports=$((( countports + 1 ))) - done - count=$((( count + 1 ))) - done - #ep multiple ip 1 port - else - count=1 - while [ ${count} -le "${cips}" ]; do - ip=$(echo "${ips}" | cut -d' ' -f${count}) - node=$(echo "${nodes}" | cut -d' ' -f${count}) - if [ "${protocols}" = "UDP" ]; then - endpoints="${endpoints}\n${ip};${node};${ports}" - fi - count=$((( count + 1 ))) - done + if [ "${cips}" -gt 1 ]; then + #ep multiple ip multiple ports + if [ "${cports}" -gt 1 ]; then + count=1 + while [ ${count} -le "${cips}" ]; do + ip=$(echo "${ips}" | cut -d' ' -f"${count}") + countports=1 + node=$(echo "${nodes}" | cut -d' ' -f"${count}") + while [ ${countports} -le "${cports}" ]; do + port=$(echo "${ports}" | cut -d' ' -f${countports}) + protocol=$(echo "${protocols}" | cut -d' ' -f${countports}) + if [ "${protocol}" = "UDP" ]; then + endpoints="${endpoints}\n${ip};${node};${port}" + fi + countports=$((countports + 1)) + done + count=$((count + 1)) + done + #ep multiple ip 1 port + else + count=1 + while [ ${count} -le "${cips}" ]; do + ip=$(echo "${ips}" | cut -d' ' -f${count}) + node=$(echo "${nodes}" | cut -d' ' -f${count}) + if [ "${protocols}" = "UDP" ]; then + endpoints="${endpoints}\n${ip};${node};${ports}" + fi + count=$((count + 1)) + done - fi - else - #ep 1 ip multiple ports - if [ "${cports}" -gt 1 ]; then - count=1 - while [ ${count} -le "${cports}" ]; do - port=$(echo "${ports}" | cut -d' ' -f${count}) - protocol=$(echo "${protocols}" | cut -d' ' -f${count}) - if [ "${protocol}" = "UDP" ]; then - endpoints="${endpoints}\n${ips};${nodes};${port}" - fi - count=$((( count + 1 ))) - done - #ep 1 ip 1 port - else - if [ "${protocols}" = "UDP" ]; then - endpoints="${endpoints}\n${ips};${nodes};${ports}" - fi - fi - fi - done - IFS=$OLDIFS - echo -e "\nEndpoints\n-----------------${endpoints}\n" >> "${LOG}" + fi + else + #ep 1 ip multiple ports + if [ "${cports}" -gt 1 ]; then + count=1 + while [ ${count} -le "${cports}" ]; do + port=$(echo "${ports}" | cut -d' ' -f${count}) + protocol=$(echo "${protocols}" | cut -d' ' -f${count}) + if [ "${protocol}" = "UDP" ]; then + endpoints="${endpoints}\n${ips};${nodes};${port}" + fi + count=$((count + 1)) + done + #ep 1 ip 1 port + else + if [ "${protocols}" = "UDP" ]; then + endpoints="${endpoints}\n${ips};${nodes};${ports}" + fi + fi + fi + done + IFS=$OLDIFS + echo -e "\nEndpoints\n-----------------${endpoints}\n" >>"${LOG}" } ########################################################### # isContrackInSvcNetwork(): checks if a contrack line fits # the service network of the cluster ########################################################### -function isContrackInSvcNetwork(){ - line=$1; - node=$2; - dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') - dst1O1=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f1) - dst1O2=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f2) - dst1O3=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f3) - netO1=$(echo "${svcnetwork}" | cut -d'.' -f1) - netO2=$(echo "${svcnetwork}" | cut -d'.' -f2) - netO3=$(echo "${svcnetwork}" | cut -d'.' -f3) - mask=$(echo "${svcnetwork}" | cut -d'/' -f2) - if [[ "${mask}" == "8" ]]; then - if [[ "${dst1O1}" == "${netO1}" && "${dst1O2}" == "${netO2}" && "${dst1O3}" == "${netO3}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >> "${LOG}"; fi - return 0 - else - return 1 - fi - fi - if [[ "${mask}" == "16" ]]; then - if [[ "${dst1O1}" == "${netO1}" && "${dst1O2}" == "${netO2}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >> "${LOG}"; fi - return 0 - else - return 1 - fi - fi - if [[ "${mask}" == "24" ]]; then - if [[ "${dst1O1}" == "${netO1}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >> "${LOG}"; fi - return 0 - else - return 1 - fi - fi +function isContrackInSvcNetwork() { + line=$1 + node=$2 + dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') + dst1O1=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f1) + dst1O2=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f2) + dst1O3=$(echo "${dst1}" | cut -d';' -f1 | cut -d'.' -f3) + netO1=$(echo "${svcnetwork}" | cut -d'.' -f1) + netO2=$(echo "${svcnetwork}" | cut -d'.' -f2) + netO3=$(echo "${svcnetwork}" | cut -d'.' -f3) + mask=$(echo "${svcnetwork}" | cut -d'/' -f2) + if [[ ${mask} == "8" ]]; then + if [[ ${dst1O1} == "${netO1}" && ${dst1O2} == "${netO2}" && ${dst1O3} == "${netO3}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >>"${LOG}"; fi + return 0 + else + return 1 + fi + fi + if [[ ${mask} == "16" ]]; then + if [[ ${dst1O1} == "${netO1}" && ${dst1O2} == "${netO2}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >>"${LOG}"; fi + return 0 + else + return 1 + fi + fi + if [[ ${mask} == "24" ]]; then + if [[ ${dst1O1} == "${netO1}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInSvcNetwork] ${svcnetwork}: ${line}" >>"${LOG}"; fi + return 0 + else + return 1 + fi + fi } ########################################################### # isContrackInServices(): checks if a contrack line fits # one of the services ########################################################### -function isContrackInServices(){ - line=$1; - node=$2; - dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') - dstport1=$(echo "${line}" | awk -F"dport=" '{sub(/ .*/,"",$2);print $2}') - OLDIFS=$IFS - IFS=$'\n' - services=$(echo -e "${services}" | xargs | sed -e 's/ /\n/g') - for service in ${services}; do - srvip=$(echo "${service}" | cut -d';' -f1) - srvport=$(echo "${service}" | cut -d';' -f2) - if [[ "${dst1}" == "${srvip}" && "${dstport1}" == "${srvport}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInServices] ${dst1}:${dstport1}: ${srvip}:${srvport}" >> "${LOG}"; fi - return 0 - fi - done - IFS=${OLDIFS} - return 1 +function isContrackInServices() { + line=$1 + node=$2 + dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') + dstport1=$(echo "${line}" | awk -F"dport=" '{sub(/ .*/,"",$2);print $2}') + OLDIFS=$IFS + IFS=$'\n' + services=$(echo -e "${services}" | xargs | sed -e 's/ /\n/g') + for service in ${services}; do + srvip=$(echo "${service}" | cut -d';' -f1) + srvport=$(echo "${service}" | cut -d';' -f2) + if [[ ${dst1} == "${srvip}" && ${dstport1} == "${srvport}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInServices] ${dst1}:${dstport1}: ${srvip}:${srvport}" >>"${LOG}"; fi + return 0 + fi + done + IFS=${OLDIFS} + return 1 } ########################################################### @@ -234,46 +234,45 @@ function isContrackInServices(){ # one of the endpoints source IP # and source port ########################################################### -function isContrackInEndPoints(){ - line=$1 - node=$2 - src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') - srcport2=$(echo "${line}" | awk -F"sport=" '{sub(/ .*/,"",$3);print $3}') - endpoints=$(echo -e "${endpoints}" | xargs | sed -e 's/ /\n/g') - for endpoint in ${endpoints}; do - epip=$(echo "${endpoint}" | cut -d';' -f1) - epport=$(echo "${endpoint}" | cut -d';' -f3) - if [[ "${epip}" == "${src2}" && "${epport}" == "${srcport2}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInEndPoints] ${epip}:${epport}: ${src2}:${srcport2}" >> "${LOG}"; fi - return 0 - fi - done - if eval "${DEBUG}"; then echo "[${node}:isContrackInEndPoints] NOT found ${epip}:${epport}: ${src2}:${srcport2}" >> "${LOG}"; fi - return 1 +function isContrackInEndPoints() { + line=$1 + node=$2 + src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') + srcport2=$(echo "${line}" | awk -F"sport=" '{sub(/ .*/,"",$3);print $3}') + endpoints=$(echo -e "${endpoints}" | xargs | sed -e 's/ /\n/g') + for endpoint in ${endpoints}; do + epip=$(echo "${endpoint}" | cut -d';' -f1) + epport=$(echo "${endpoint}" | cut -d';' -f3) + if [[ ${epip} == "${src2}" && ${epport} == "${srcport2}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInEndPoints] ${epip}:${epport}: ${src2}:${srcport2}" >>"${LOG}"; fi + return 0 + fi + done + if eval "${DEBUG}"; then echo "[${node}:isContrackInEndPoints] NOT found ${epip}:${epport}: ${src2}:${srcport2}" >>"${LOG}"; fi + return 1 } ############################################################ # isContrackInClusterCIDR: checks if the conntrack src # (2nd tuple) is in the clusterCIDR ############################################################ -function isContrackInClusterCIDR(){ - line=$1 - node=$2 - src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') - srcoc1=$(echo "${src2}" | cut -d. -f1) - srcoc2=$(echo "${src2}" | cut -d. -f2) - cnoc1=$(echo "${clusternetwork}" | cut -d. -f1) - cnoc2=$(echo "${clusternetwork}" | cut -d. -f2) - if [[ "${srcoc1}" == "${cnoc1}" && "${srcoc2}" == "${cnoc2}" ]]; then - if eval "${DEBUG}"; then echo "[${node}:isContrackInClusterCIDR] ${clusternetwork}: ${src2}" >> "${LOG}"; fi - if eval "${DEBUG}"; then echo "[${node}:isContrackInClusterCIDR] ${line}" >> "${LOG}"; fi - return 0 - else - return 1 - fi +function isContrackInClusterCIDR() { + line=$1 + node=$2 + src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') + srcoc1=$(echo "${src2}" | cut -d. -f1) + srcoc2=$(echo "${src2}" | cut -d. -f2) + cnoc1=$(echo "${clusternetwork}" | cut -d. -f1) + cnoc2=$(echo "${clusternetwork}" | cut -d. -f2) + if [[ ${srcoc1} == "${cnoc1}" && ${srcoc2} == "${cnoc2}" ]]; then + if eval "${DEBUG}"; then echo "[${node}:isContrackInClusterCIDR] ${clusternetwork}: ${src2}" >>"${LOG}"; fi + if eval "${DEBUG}"; then echo "[${node}:isContrackInClusterCIDR] ${line}" >>"${LOG}"; fi + return 0 + else + return 1 + fi } - ########################################################### # generateCommands(): generates the conntrack lines to # remove the faulty line @@ -288,110 +287,118 @@ function isContrackInClusterCIDR(){ # # D.D.D.D is the ovn-k8s-mp0 interface IP. ########################################################### -function generateCommands(){ - node=$1 - line=$2 - pod=$3 - src1=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$2);print $2}') - dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') - src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') - nodesubnet=$(oc get node "${node}" -o jsonpath='{.metadata.annotations.k8s\.ovn\.org/node-subnets}' | jq .default | xargs | cut -d'/' -f1) - # shellcheck disable=SC2001 - nodesubnet=$(echo "${nodesubnet}" | sed -e "s/.$/${NODESUBNETIP}/") - clustername=$(oc whoami --show-console | cut -d. -f3-) - if [[ -n "${OUTPUTLOG}" ]]; then - # shellcheck disable=SC2129 - echo "# Cluster: ${clustername}" >> "${OUTPUTLOG}" - echo "# Generating lines for node (${node}) subnet:${nodesubnet}" >> "${OUTPUTLOG}" - echo "# OVN Pod: ${pod}" >> "${OUTPUTLOG}" - echo "# Raw line: ${line}" >> "${OUTPUTLOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" >> "${OUTPUTLOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" >> "${OUTPUTLOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" >> "${OUTPUTLOG}" - else - echo "# Cluster: ${clustername}" - echo "# Generating lines for node (${node}) subnet:${nodesubnet}" - echo "# OVN Pod: ${pod}" - echo "# Raw line: ${line}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" - fi - # Saving the commands into the log - # shellcheck disable=SC2129 - echo "# Generating lines for node (${node}) subnet:${nodesubnet}" >> "${LOG}" - echo "# OVN Pod: ${pod}" >> "${LOG}" - echo "# Raw line: ${line}" >> "${LOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" >> "${LOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" >> "${LOG}" - echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" >> "${LOG}" +function generateCommands() { + node=$1 + line=$2 + pod=$3 + src1=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$2);print $2}') + dst1=$(echo "${line}" | awk -F"dst=" '{sub(/ .*/,"",$2);print $2}') + src2=$(echo "${line}" | awk -F"src=" '{sub(/ .*/,"",$3);print $3}') + nodesubnet=$(oc get node "${node}" -o jsonpath='{.metadata.annotations.k8s\.ovn\.org/node-subnets}' | jq .default | xargs | cut -d'/' -f1) + # shellcheck disable=SC2001 + nodesubnet=$(echo "${nodesubnet}" | sed -e "s/.$/${NODESUBNETIP}/") + clustername=$(oc whoami --show-console | cut -d. -f3-) + if [[ -n ${OUTPUTLOG} ]]; then + # shellcheck disable=SC2129 + echo "# Cluster: ${clustername}" >>"${OUTPUTLOG}" + echo "# Generating lines for node (${node}) subnet:${nodesubnet}" >>"${OUTPUTLOG}" + echo "# OVN Pod: ${pod}" >>"${OUTPUTLOG}" + echo "# Raw line: ${line}" >>"${OUTPUTLOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" >>"${OUTPUTLOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" >>"${OUTPUTLOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" >>"${OUTPUTLOG}" + else + echo "# Cluster: ${clustername}" + echo "# Generating lines for node (${node}) subnet:${nodesubnet}" + echo "# OVN Pod: ${pod}" + echo "# Raw line: ${line}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" + fi + # Saving the commands into the log + # shellcheck disable=SC2129 + echo "# Generating lines for node (${node}) subnet:${nodesubnet}" >>"${LOG}" + echo "# OVN Pod: ${pod}" >>"${LOG}" + echo "# Raw line: ${line}" >>"${LOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${dst1} -r ${src2} -q ${src1}" >>"${LOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${src1} -d ${src2}" >>"${LOG}" + echo "oc -n openshift-ovn-kubernetes exec pod/${pod} -c ovnkube-node -- conntrack -D -s ${nodesubnet} -d ${src2} -r ${src2} -q ${nodesubnet}" >>"${LOG}" } ########################################################### # getConntrack(): loops over the nodes using the # # ovnkube-node pods, gets the udp # # conntrackts, validates them and # -# generates the lines to remove it # +# generates the lines to remove it # ########################################################### -function getConntrack(){ - if [[ -n "${SINGLENODE}" ]]; then - nodes=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-node -o jsonpath='{range .items[*]}{@.metadata.name}{";"}{@..nodeName}{"\n"}{end}' | grep "${SINGLENODE}") - else - nodes=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-node -o jsonpath='{range .items[*]}{@.metadata.name}{";"}{@..nodeName}{"\n"}{end}') - fi - if [[ -z "${OUTPUTLOG}" ]]; then - echo "# Building cache for clusterIP services..." - fi - if eval "${DEBUG}"; then echo -e "\nConntracks\n-----------------" >> "${LOG}"; fi - for line in ${nodes}; do - # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 - ((i=i%PARALLELJOBS)); ((i++==0)) && wait - ( - OLDIFS=$IFS - IFS=$'\n' - pod=$(echo "${line}" | cut -d';' -f1) - node=$(echo "${line}" | cut -d';' -f2) - conntracks=$(oc -n openshift-ovn-kubernetes exec pod/"${pod}" -c ovnkube-node -- conntrack -L -p udp 2> /dev/null) - for conntrack in $(echo "${conntracks}" | sed 's/udp/\nudp/g' | sed 's/\[UNREPLIED\]//g' | sed 's/\[ASSURED\]//g' | tr -s ' '); do - # if not found in the service network or found in services or if not found in clusterCIDR or - # if found in endpoints, ignore it - # otherwise generate the commands to remove it - if isContrackInSvcNetwork "${conntrack}" "${node}"; then - if isContrackInClusterCIDR "${conntrack}" "${node}"; then - if isContrackInServices "${conntrack}" "${node}"; then - if ! isContrackInEndPoints "${conntrack}" "${node}"; then - echo -e "===> Generating conntrack lines for (${node}:${pod}): $conntrack}" >> "${LOG}" - generateCommands "${node}" "${line}" "${pod}" - fi - fi - fi - fi - done - wait - IFS=$OLDIFS - ) & - done +function getConntrack() { + if [[ -n ${SINGLENODE} ]]; then + nodes=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-node -o jsonpath='{range .items[*]}{@.metadata.name}{";"}{@..nodeName}{"\n"}{end}' | grep "${SINGLENODE}") + else + nodes=$(oc get pods -n openshift-ovn-kubernetes -l app=ovnkube-node -o jsonpath='{range .items[*]}{@.metadata.name}{";"}{@..nodeName}{"\n"}{end}') + fi + if [[ -z ${OUTPUTLOG} ]]; then + echo "# Building cache for clusterIP services..." + fi + if eval "${DEBUG}"; then echo -e "\nConntracks\n-----------------" >>"${LOG}"; fi + for line in ${nodes}; do + # See https://medium.com/@robert.i.sandor/getting-started-with-parallelization-in-bash-e114f4353691 + ((i = i % PARALLELJOBS)) + ((i++ == 0)) && wait + ( + OLDIFS=$IFS + IFS=$'\n' + pod=$(echo "${line}" | cut -d';' -f1) + node=$(echo "${line}" | cut -d';' -f2) + conntracks=$(oc -n openshift-ovn-kubernetes exec pod/"${pod}" -c ovnkube-node -- conntrack -L -p udp 2>/dev/null) + for conntrack in $(echo "${conntracks}" | sed 's/udp/\nudp/g' | sed 's/\[UNREPLIED\]//g' | sed 's/\[ASSURED\]//g' | tr -s ' '); do + # if not found in the service network or found in services or if not found in clusterCIDR or + # if found in endpoints, ignore it + # otherwise generate the commands to remove it + if isContrackInSvcNetwork "${conntrack}" "${node}"; then + if isContrackInClusterCIDR "${conntrack}" "${node}"; then + if isContrackInServices "${conntrack}" "${node}"; then + if ! isContrackInEndPoints "${conntrack}" "${node}"; then + echo -e "===> Generating conntrack lines for (${node}:${pod}): $conntrack}" >>"${LOG}" + generateCommands "${node}" "${line}" "${pod}" + fi + fi + fi + fi + done + wait + IFS=$OLDIFS + ) & + done } - # Main while getopts "dhq:k:n:" flag; do case "${flag}" in - n) SINGLENODE=${OPTARG} - ;; - q) OUTPUTLOG=${OPTARG} - echo "Quiet mode enabled saving output into ${OUTPUTLOG}" >> "${LOG}" - ;; - d) DEBUG=true - ;; - h) usage - exit 1 - ;; - k) export KUBECONFIG="${OPTARG}" - echo "Exported KUBECONFIG=${KUBECONFIG}" >> "${LOG}" - ;; - *) echo >&2 "Invalid option: $*"; usage; exit 1 - ;; + n) + SINGLENODE=${OPTARG} + ;; + q) + OUTPUTLOG=${OPTARG} + echo "Quiet mode enabled saving output into ${OUTPUTLOG}" >>"${LOG}" + ;; + d) + DEBUG=true + ;; + h) + usage + exit 1 + ;; + k) + export KUBECONFIG="${OPTARG}" + echo "Exported KUBECONFIG=${KUBECONFIG}" >>"${LOG}" + ;; + *) + echo >&2 "Invalid option: $*" + usage + exit 1 + ;; esac done @@ -404,6 +411,6 @@ getEndpoints # Loop over the conntrack to find persistent conntracks # and generate the conntrackt commands to remove it getConntrack -if [[ -z "${OUTPUTLOG}" ]]; then - echo "# Logged operations into the file ${LOG}" +if [[ -z ${OUTPUTLOG} ]]; then + echo "# Logged operations into the file ${LOG}" fi diff --git a/scripts/recover-northd.sh b/scripts/recover-northd.sh index fcd7d9d..35b66c0 100755 --- a/scripts/recover-northd.sh +++ b/scripts/recover-northd.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash ########################################################### # recover-northd.sh script to unwedge northd in the event # # of a node failure # @@ -17,18 +17,17 @@ REMDIATE=false # usage(): prints the usage of the script ########################################################### function usage() { - echo "This script checks if northd is stuck and optionally intervene" - echo -e - echo -e "\tUsage: $(basename "$0")" - echo -e "\tHelp: $(basename "$0") -h" - echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" - echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" - echo -e "\tRemediate the issue: $(basename "$0") -r" - echo -e - echo "After the execution a logfile will be generated with the name recover-northd.DATE.log" + echo "This script checks if northd is stuck and optionally intervene" + echo -e + echo -e "\tUsage: $(basename "$0")" + echo -e "\tHelp: $(basename "$0") -h" + echo -e "\tSave extra DEBUG lines into the log: $(basename "$0") -d" + echo -e "\tSet the KUBECONFIG env var to /kubeconfig/file: $(basename "$0") -k /kubeconfig/file" + echo -e "\tRemediate the issue: $(basename "$0") -r" + echo -e + echo "After the execution a logfile will be generated with the name recover-northd.DATE.log" } - ########################################################### # check_northd(): check the current status of northd ########################################################### @@ -41,59 +40,66 @@ function check_northd() { active_pod=${pod} node=$(oc get pod/"$active_pod" -n openshift-ovn-kubernetes -o json | jq .spec.nodeName | sed -e 's/\"//g') date=$(date +"%Y-%m-%d %H:%M:%S") - if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} is active" >> "${LOG}"; fi - else + if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} is active" >>"${LOG}"; fi + else date=$(date +"%Y-%m-%d %H:%M:%S") - if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} NOT active, status:${pod_status}" >> "${LOG}"; fi + if eval "${DEBUG}"; then echo "[check_northd:${date}] pod ${pod} NOT active, status:${pod_status}" >>"${LOG}"; fi fi done - if [[ -z "${active_pod}" ]]; then + if [[ -z ${active_pod} ]]; then date=$(date +"%Y-%m-%d %H:%M:%S") - if eval "${DEBUG}"; then echo "[check_northd:${date}] no active northd leader found" >> "${LOG}"; else + if eval "${DEBUG}"; then echo "[check_northd:${date}] no active northd leader found" >>"${LOG}"; else echo "no active northd leader found..." - fi + fi if eval "${REMDIATE}"; then - if eval "${DEBUG}"; then echo "[check_northd:${date}] ...recovering northd" >> "${LOG}"; else + if eval "${DEBUG}"; then echo "[check_northd:${date}] ...recovering northd" >>"${LOG}"; else echo "...recovering northd" fi for pod in ${pods}; do oc exec -n openshift-ovn-kubernetes -c northd "${pod}" -- ovn-appctl -t ovn-northd exit date=$(date +"%Y-%m-%d %H:%M:%S") - if eval "${DEBUG}"; then echo "[check_northd:${date}] recovering pod ${pod}" >> "${LOG}"; else + if eval "${DEBUG}"; then echo "[check_northd:${date}] recovering pod ${pod}" >>"${LOG}"; else echo "recovering pod ${pod}" - fi + fi done fi else date=$(date +"%Y-%m-%d %H:%M:%S") - if eval "${DEBUG}"; then echo "[check_northd:${date}] found active northd leader (${active_pod}) on ${node}" >> "${LOG}"; else + if eval "${DEBUG}"; then echo "[check_northd:${date}] found active northd leader (${active_pod}) on ${node}" >>"${LOG}"; else echo "found active northd leader (${active_pod}) on ${node}" fi fi - + } # Main while getopts "dhk:r" flag; do case "${flag}" in - d) DEBUG=true - ;; - h) usage - exit 1 - ;; - k) export KUBECONFIG="${OPTARG}" - echo "Exported KUBECONFIG=${KUBECONFIG}" >> "${LOG}" - ;; - r) REMDIATE=true - ;; - *) echo >&2 "Invalid option: $*"; usage; exit 1 - ;; + d) + DEBUG=true + ;; + h) + usage + exit 1 + ;; + k) + export KUBECONFIG="${OPTARG}" + echo "Exported KUBECONFIG=${KUBECONFIG}" >>"${LOG}" + ;; + r) + REMDIATE=true + ;; + *) + echo >&2 "Invalid option: $*" + usage + exit 1 + ;; esac done check_northd -if [[ -f "${LOG}" ]]; then - echo "# Logged operations into the file ${LOG}" +if [[ -f ${LOG} ]]; then + echo "# Logged operations into the file ${LOG}" fi diff --git a/ssh/bz1941840 b/ssh/bz1941840 index ee72163..dd3389c 100755 --- a/ssh/bz1941840 +++ b/ssh/bz1941840 @@ -10,7 +10,7 @@ if oc auth can-i get pods -n openshift-authentication-operator >/dev/null 2>&1; msg "Checking for a hung kubelet..." # shellcheck disable=SC2016 node=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].spec.nodeName) - container_id=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].status.containerStatuses[0].containerID | awk -F// '{print $2}' | cut -c-13) + container_id=$(oc get pods -n openshift-authentication-operator -l app=authentication-operator -o json | jq -r .items[0].status.containerStatuses[0].containerID | awk -F// '{print $2}' | cut -c-13) if ! AUTH_OPERATOR_MEMORY=$(ssh -q core@$node "sudo crictl stats --id ${container_id} -o json | jq -r .stats[0].memory.workingSetBytes.value"); then msg "${ORANGE}Error running crictl stats openshift-authentication-operator/${pod}${NOCOLOR}" else @@ -23,7 +23,7 @@ if oc auth can-i get pods -n openshift-authentication-operator >/dev/null 2>&1; if [ ! -z "${ERRORFILE}" ]; then echo $errors >${ERRORFILE} fi - if [[ "$error" == true ]]; then + if [[ $error == true ]]; then exit ${OCERROR} else exit ${OCOK}