diff --git a/tailscale/DOCS.md b/tailscale/DOCS.md index aa7bdee6..f7bc2ba9 100644 --- a/tailscale/DOCS.md +++ b/tailscale/DOCS.md @@ -70,6 +70,7 @@ advertise_routes: - 192.168.1.0/24 - fd12:3456:abcd::/64 funnel: false +healthcheck_timeout: 110 log_level: info login_server: "https://controlplane.tailscale.com" proxy: false @@ -189,6 +190,25 @@ port 443 (or the port configured in option `proxy_and_funnel_port`)._ **Note:** _If you encounter strange browser behaviour or strange error messages, try to clear all site related cookies, clear all browser cache, restart browser._ +### Option: `healthcheck_timeout` + +This option sets the maximum time, in seconds, that Tailscale may stay offline before the add-on is reported as unhealthy. + +When not set, this option is disabled by default. + +Tailscale is quite resilient and can recover from nearly any network change. But +in case it fails to recover and remains offline longer than `healthcheck_timeout` +seconds, the add-on can be restarted. The check happens only when Tailscale is +running, i.e. it won't have any effect when Tailscale's status is e.g. Starting, +NeedsLogin or NeedsMachineAuth. + +The Stopped status is always deemed unhealthy, regardless of this option. + +**Note:** _If the network is down, the add-on will be restarted only once._ + +**Note:** _The add-on's health is checked by Home Assistant every 30s, i.e. the +effective resolution of this option is 30s, not 1s._ + ### Option: `log_level` Optionally enable tailscaled debug messages in the add-on's log. 
Turn it on only diff --git a/tailscale/Dockerfile b/tailscale/Dockerfile index 5a06cefc..aea47661 100755 --- a/tailscale/Dockerfile +++ b/tailscale/Dockerfile @@ -35,6 +35,9 @@ RUN \ # Copy root filesystem COPY rootfs / +HEALTHCHECK \ + CMD healthcheck + # S6 Overlay stage 2 hook ENV S6_STAGE2_HOOK=/etc/s6-overlay/scripts/stage2_hook.sh diff --git a/tailscale/config.yaml b/tailscale/config.yaml index ef761850..ca24f4f2 100644 --- a/tailscale/config.yaml +++ b/tailscale/config.yaml @@ -35,6 +35,7 @@ schema: advertise_routes: - "match(^(((25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\.){3}(25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\/(3[0-2]|[12]?\\d)|[a-fA-F\\d.:]+:[a-fA-F\\d.:]+\\/(12[0-8]|(1[01]|[1-9]?)\\d))$)?" funnel: bool? + healthcheck_timeout: int? log_level: list(trace|debug|info|notice|warning|error|fatal)? login_server: url? proxy: bool? diff --git a/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run b/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run index 58490395..22819225 100755 --- a/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run +++ b/tailscale/rootfs/etc/s6-overlay/s6-rc.d/post-tailscaled/run @@ -90,7 +90,7 @@ unset IFS # Wait for the network to be available and logged in while ! bashio::fs.socket_exists "/var/run/tailscale/tailscaled.sock" || \ ! /opt/tailscale status --json --peers=false --self=false \ - | jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin"' > /dev/null; + | jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin" or .BackendState == "Stopped"' > /dev/null; do sleep 2 done diff --git a/tailscale/rootfs/usr/bin/healthcheck b/tailscale/rootfs/usr/bin/healthcheck new file mode 100755 index 00000000..e1d02123 --- /dev/null +++ b/tailscale/rootfs/usr/bin/healthcheck @@ -0,0 +1,31 @@ +#!/command/with-contenv bashio +# shellcheck shell=bash + +# Plain (non-json) tailscale status returns error when status is not Running or Starting, so e.g. 
NeedsLogin and NeedsMachineAuth would make it unhealthy +# The .Health json filter returns any problems, so even temporary health problems would make it unhealthy +# This script treats the following situations as unhealthy: +# - always: .BackendState == "Stopped" +# - optionally: .BackendState == "Running" && .Self.Online == false for more than healthcheck_timeout seconds (configurable) +# This can handle internal TS bugs, like https://github.com/tailscale/tailscale/issues/12021 where TS fails to recover from rerouting traffic from normal WAN to failover WAN + +declare status_json +declare backend_state self_online +# LAST_ONLINE_TIMESTAMP is in contenv at /var/run/s6/container_environment + +status_json=$(/opt/tailscale status --json --self=true --peers=false) +backend_state=$(jq -r '.BackendState' <<< "${status_json}") +self_online=$(jq -r '.Self.Online' <<< "${status_json}") + +if bashio::var.equals "${backend_state}" "Running" && bashio::var.equals "${self_online}" "true"; then + LAST_ONLINE_TIMESTAMP=$(date +"%s") + printf '%s' "${LAST_ONLINE_TIMESTAMP}" > /var/run/s6/container_environment/LAST_ONLINE_TIMESTAMP +fi + +# The braces are required: bash gives || and && equal precedence (left to right), so without them "Stopped" would also have to pass the timeout checks +if [[ "${backend_state}" == "Stopped" ]] || { + bashio::config.has_value "healthcheck_timeout" && \ + bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \ + (( $(date +"%s") - ${LAST_ONLINE_TIMESTAMP} > $(bashio::config "healthcheck_timeout") )); } +then + exit 1 +fi diff --git a/tailscale/translations/en.yaml b/tailscale/translations/en.yaml index 252ca219..4ec0a040 100644 --- a/tailscale/translations/en.yaml +++ b/tailscale/translations/en.yaml @@ -41,6 +41,11 @@ configuration: Home Assistant instance on the wider internet using your Tailscale domain. This requires Tailscale Proxy to be enabled. When not set, this option is disabled by default. + healthcheck_timeout: + name: Healthcheck timeout [s] + description: >- + This option allows you to set timeout in seconds for Tailscale to be offline. + When not set, this option is disabled by default. 
log_level: name: Log level description: >-