From ed43fcb04067febac443b02daf64a1daf08c3904 Mon Sep 17 00:00:00 2001 From: Laszlo Magyar Date: Sat, 11 Nov 2023 21:01:16 +0100 Subject: [PATCH 1/4] use healthcheck --- tailscale/Dockerfile | 5 +++++ tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tailscale/Dockerfile b/tailscale/Dockerfile index 5a06cefc..09a1c552 100755 --- a/tailscale/Dockerfile +++ b/tailscale/Dockerfile @@ -35,6 +35,11 @@ RUN \ # Copy root filesystem COPY rootfs / +HEALTHCHECK \ + CMD /opt/tailscale status --json --self=false --peers=false | jq --exit-status '.BackendState != "Stopped"' > /dev/null + # Note: Plain (non-json) tailscale status returns error when status is not Running or Starting, so NeedsLogin and NeedsMachineAuth would make it unhealthy + # Note: The .Health json filter returns any problems, so even temporary health problems would make it unhealthy + # S6 Overlay stage 2 hook ENV S6_STAGE2_HOOK=/etc/s6-overlay/scripts/stage2_hook.sh diff --git a/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run b/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run index 58490395..22819225 100755 --- a/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run +++ b/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run @@ -90,7 +90,7 @@ unset IFS # Wait for the network to be available and logged in while ! bashio::fs.socket_exists "/var/run/tailscale/tailscaled.sock" || \ ! 
/opt/tailscale status --json --peers=false --self=false \ - | jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin"' > /dev/null; + | jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin" or .BackendState == "Stopped"' > /dev/null; do sleep 2 done From 800afa7e4c9b1cc63ce7dcd5e57013805f79ce97 Mon Sep 17 00:00:00 2001 From: Laszlo Magyar Date: Fri, 18 Oct 2024 16:12:36 +0200 Subject: [PATCH 2/4] fix healthcheck to recover from offline state --- tailscale/DOCS.md | 18 +++++++++++++++++ tailscale/Dockerfile | 4 +--- tailscale/config.yaml | 1 + tailscale/rootfs/usr/bin/healthcheck | 30 ++++++++++++++++++++++++++++ tailscale/translations/en.yaml | 5 +++++ 5 files changed, 55 insertions(+), 3 deletions(-) create mode 100755 tailscale/rootfs/usr/bin/healthcheck diff --git a/tailscale/DOCS.md b/tailscale/DOCS.md index aa7bdee6..62282116 100644 --- a/tailscale/DOCS.md +++ b/tailscale/DOCS.md @@ -70,6 +70,7 @@ advertise_routes: - 192.168.1.0/24 - fd12:3456:abcd::/64 funnel: false +healthcheck_timeout: 110 log_level: info login_server: "https://controlplane.tailscale.com" proxy: false @@ -189,6 +190,23 @@ port 443 (or the port configured in option `proxy_and_funnel_port`)._ **Note:** _If you encounter strange browser behaviour or strange error messages, try to clear all site related cookies, clear all browser cache, restart browser._ +### Option: `healthcheck_timeout` + +This option allows you to set timeout in seconds for Tailscale to be offline. + +Tailscale is quite resilient and can recover from nearly any network change. But +in case it fails to recover and remains offline longer than healthcheck_timeout +seconds, the add-on can be restarted. The check happens only when Tailscale is +running, ie. it won't have any effect when Tailscale's status is eg. Starting, +NeedsLogin or NeedsMachineAuth. + +The Stopped status is deemed unhealthy by default. + +Note: The add-on's health is checked by Home Assistant in each 30s, ie. 
the +effective resolution of this option is 30s, not 1s. + +When not set, this option is disabled by default. + ### Option: `log_level` Optionally enable tailscaled debug messages in the add-on's log. Turn it on only diff --git a/tailscale/Dockerfile b/tailscale/Dockerfile index 09a1c552..aea47661 100755 --- a/tailscale/Dockerfile +++ b/tailscale/Dockerfile @@ -36,9 +36,7 @@ RUN \ COPY rootfs / HEALTHCHECK \ - CMD /opt/tailscale status --json --self=false --peers=false | jq --exit-status '.BackendState != "Stopped"' > /dev/null - # Note: Plain (non-json) tailscale status returns error when status is not Running or Starting, so NeedsLogin and NeedsMachineAuth would make it unhealthy - # Note: The .Health json filter returns any problems, so even temporary health problems would make it unhealthy + CMD healthcheck # S6 Overlay stage 2 hook ENV S6_STAGE2_HOOK=/etc/s6-overlay/scripts/stage2_hook.sh diff --git a/tailscale/config.yaml b/tailscale/config.yaml index ef761850..ca24f4f2 100644 --- a/tailscale/config.yaml +++ b/tailscale/config.yaml @@ -35,6 +35,7 @@ schema: advertise_routes: - "match(^(((25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\.){3}(25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\/(3[0-2]|[12]?\\d)|[a-fA-F\\d.:]+:[a-fA-F\\d.:]+\\/(12[0-8]|(1[01]|[1-9]?)\\d))$)?" funnel: bool? + healthcheck_timeout: int? log_level: list(trace|debug|info|notice|warning|error|fatal)? login_server: url? proxy: bool? diff --git a/tailscale/rootfs/usr/bin/healthcheck b/tailscale/rootfs/usr/bin/healthcheck new file mode 100755 index 00000000..e1d02123 --- /dev/null +++ b/tailscale/rootfs/usr/bin/healthcheck @@ -0,0 +1,30 @@ +#!/command/with-contenv bashio +# shellcheck shell=bash + +# Plain (non-json) tailscale status returns error when status is not Running or Starting, so eg. 
NeedsLogin and NeedsMachineAuth would make it unhealthy +# The .Health json filter returns any problems, so even temporary health problems would make it unhealthy +# This script treats the following situations as unhealthy (bash gives && and || equal precedence, hence the { } grouping in the final test): +# - always: .BackendState == "Stopped" +# - optionally: .BackendState == "Running" && .Self.Online == false for more than healthcheck_timeout seconds (configurable) +# This can handle internal TS bugs, like https://github.com/tailscale/tailscale/issues/12021 where TS fails to recover from rerouting traffic from normal WAN to failover WAN + +declare status_json +declare backend_state self_online +# LAST_ONLINE_TIMESTAMP is in contenv at /var/run/s6/container_environment + +status_json=$(/opt/tailscale status --json --self=true --peers=false) +backend_state=$(jq -r '.BackendState' <<< "${status_json}") +self_online=$(jq -r '.Self.Online' <<< "${status_json}") + +if bashio::var.equals "${backend_state}" "Running" && bashio::var.equals "${self_online}" "true"; then + LAST_ONLINE_TIMESTAMP=$(date +"%s") + printf '%s' "${LAST_ONLINE_TIMESTAMP}" > /var/run/s6/container_environment/LAST_ONLINE_TIMESTAMP +fi + +if [[ "${backend_state}" == "Stopped" ]] || { \ + bashio::config.has_value "healthcheck_timeout" && \ + bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \ + (( $(date +"%s") - ${LAST_ONLINE_TIMESTAMP} > $(bashio::config "healthcheck_timeout") )); } +then + exit 1 +fi diff --git a/tailscale/translations/en.yaml b/tailscale/translations/en.yaml index 252ca219..4ec0a040 100644 --- a/tailscale/translations/en.yaml +++ b/tailscale/translations/en.yaml @@ -41,6 +41,11 @@ configuration: Home Assistant instance on the wider internet using your Tailscale domain. This requires Tailscale Proxy to be enabled. When not set, this option is disabled by default. + healthcheck_timeout: + name: Healthcheck timeout [s] + description: >- + This option allows you to set timeout in seconds for Tailscale to be offline. + When not set, this option is disabled by default. 
log_level: name: Log level description: >- From 6e9bef55dc4b227181949f156279039fc054d134 Mon Sep 17 00:00:00 2001 From: Laszlo Magyar Date: Wed, 23 Oct 2024 01:30:36 +0200 Subject: [PATCH 3/4] health docs update --- tailscale/DOCS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tailscale/DOCS.md b/tailscale/DOCS.md index 62282116..00a122f4 100644 --- a/tailscale/DOCS.md +++ b/tailscale/DOCS.md @@ -202,6 +202,8 @@ NeedsLogin or NeedsMachineAuth. The Stopped status is deemed unhealthy by default. +Note: If the network is down, the add-on will be restarted only once. + Note: The add-on's health is checked by Home Assistant in each 30s, ie. the effective resolution of this option is 30s, not 1s. From bc2ba3a03df6dc0eb79afb018d2c27dc002dcf84 Mon Sep 17 00:00:00 2001 From: Laszlo Magyar Date: Thu, 24 Oct 2024 19:17:28 +0200 Subject: [PATCH 4/4] fine tune healthcheck docs --- tailscale/DOCS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tailscale/DOCS.md b/tailscale/DOCS.md index 00a122f4..f7bc2ba9 100644 --- a/tailscale/DOCS.md +++ b/tailscale/DOCS.md @@ -194,6 +194,8 @@ try to clear all site related cookies, clear all browser cache, restart browser. This option allows you to set timeout in seconds for Tailscale to be offline. +When not set, this option is disabled by default. + Tailscale is quite resilient and can recover from nearly any network change. But in case it fails to recover and remains offline longer than healthcheck_timeout seconds, the add-on can be restarted. The check happens only when Tailscale is @@ -202,12 +204,10 @@ NeedsLogin or NeedsMachineAuth. The Stopped status is deemed unhealthy by default. -Note: If the network is down, the add-on will be restarted only once. +**Note:** _If the network is down, the add-on will be restarted only once._ -Note: The add-on's health is checked by Home Assistant in each 30s, ie. the -effective resolution of this option is 30s, not 1s. 
- -When not set, this option is disabled by default. +**Note:** _The add-on's health is checked by Home Assistant every 30s, i.e. the +effective resolution of this option is 30s, not 1s._ ### Option: `log_level`