diff --git a/backup/pvc/VERSION.txt b/backup/pvc/VERSION.txt index 268b0334e..fb7a04cff 100644 --- a/backup/pvc/VERSION.txt +++ b/backup/pvc/VERSION.txt @@ -1 +1 @@ -v0.3.0 +v0.4.0 diff --git a/backup/pvc/bin/backup.sh b/backup/pvc/bin/backup.sh index eaa648f98..13d2f0963 100644 --- a/backup/pvc/bin/backup.sh +++ b/backup/pvc/bin/backup.sh @@ -1,39 +1,63 @@ #!/usr/bin/env bash set -eo pipefail +source "$(dirname "$0")/utils.sh" + +[[ ! $# -eq 1 ]] && _log "ERROR" "Usage: $0 BACKUP_NUMBER" && exit 1 +[[ -z "${BACKUP_DIR}" ]] && _log "ERROR" "Required 'BACKUP_DIR' env not set" && exit 1 +[[ -z "${JENKINS_HOME}" ]] && _log "ERROR" "Required 'JENKINS_HOME' env not set" && exit 1 +BACKUP_RETRY_COUNT=${BACKUP_RETRY_COUNT:-3} +BACKUP_RETRY_INTERVAL=${BACKUP_RETRY_INTERVAL:-60} +BACKUP_NUMBER=$1 +TRAP_FILE="/tmp/_backup_${BACKUP_NUMBER}_is_running" + +# --> Check if another backup process is running (operator restart/crash) +for ((i=0; i /dev/null 2>&1 - # Store the exit status of the ls command local ls_exit_status=$? - # Restore the previous value of 'set -e' [ "$previous_e" = "0" ] && set -e - # Return true if ls command succeeded (no files found), otherwise return false [ $ls_exit_status -ne 0 ] } -[[ -z "${BACKUP_DIR}" ]] && { echo "Required 'BACKUP_DIR' env not set"; exit 1; } +[[ -z "${BACKUP_DIR}" ]] && { _log "ERROR" "Required 'BACKUP_DIR' env not set"; exit 1; } # Check if we have any backup if is_backup_not_exist "${BACKUP_DIR}"; then diff --git a/backup/pvc/bin/restore.sh b/backup/pvc/bin/restore.sh index 994208d33..3a692625d 100644 --- a/backup/pvc/bin/restore.sh +++ b/backup/pvc/bin/restore.sh @@ -1,29 +1,47 @@ #!/usr/bin/env bash set -eo pipefail +source "$(dirname "$0")/utils.sh" -[[ ! $# -eq 1 ]] && echo "Usage: $0 backup_number" && exit 1 -[[ -z "${BACKUP_DIR}" ]] && echo "Required 'BACKUP_DIR' env not set" && exit 1; -[[ -z "${JENKINS_HOME}" ]] && echo "Required 'JENKINS_HOME' env not set" && exit 1; +[[ ! 
$# -eq 1 ]] && _log "ERROR" "Usage: $0 " && exit 1 +[[ -z "${BACKUP_DIR}" ]] && _log "ERROR" "Required 'BACKUP_DIR' env not set" && exit 1 +[[ -z "${JENKINS_HOME}" ]] && _log "ERROR" "Required 'JENKINS_HOME' env not set" && exit 1 +BACKUP_NUMBER=$1 +RESTORE_RETRY_COUNT=${RESTORE_RETRY_COUNT:-10} +RESTORE_RETRY_INTERVAL=${RESTORE_RETRY_INTERVAL:-10} -backup_number=$1 -backup_file="${BACKUP_DIR}/${backup_number}" -echo "Running restore backup with backup number #${backup_number}" +# --> Check if another restore process is running (operator restart/crash) +TRAP_FILE="/tmp/_restore_${BACKUP_NUMBER}_is_running" +trap "rm -f ${TRAP_FILE}" SIGINT SIGTERM -if [[ -f "$backup_file.tar.gz" ]]; then - echo "Old format tar.gz found, restoring it" +for ((i=0; i&2 + _log "ERROR" "[run] backups not found in ${backup_dir}" return fi find "${backup_dir}"/*.tar.zstd -maxdepth 0 -exec basename {} \; | sort -gr | tail -n +$((backup_count +1)) @@ -51,9 +52,9 @@ check_env_var "BACKUP_DIR" check_env_var "JENKINS_HOME" if [[ -z "${BACKUP_COUNT}" ]]; then - echo "ATTENTION! 
No BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script" + _log "WARNING" "[run] no BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script" else - echo "Retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds" + _log "INFO" "[run] retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds" fi while true; @@ -62,7 +63,7 @@ do if [[ -n "${BACKUP_COUNT}" ]]; then exceeding_backups=$(find_exceeding_backups "${BACKUP_DIR}" "${BACKUP_COUNT}") if [[ -n "$exceeding_backups" ]]; then - echo "Removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')" + _log "INFO" "[run] removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')" echo "$exceeding_backups" | while read -r file; do rm "${BACKUP_DIR}/${file}" done
diff --git a/backup/pvc/bin/utils.sh b/backup/pvc/bin/utils.sh
new file mode 100644
index 000000000..718dfcf1a
--- /dev/null
+++ b/backup/pvc/bin/utils.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Common utils
+
+# _log LEVEL MESSAGE
+#   Emit "<timestamp> - LEVEL - MESSAGE".
+#   Levels ERROR|ERR|error|err are written to /proc/1/fd/2 (fd 2 of PID 1);
+#   every other level is written to /proc/1/fd/1 and to this process's stderr.
+#   Always returns 0: callers chain it as `cond && _log ... && exit 1`, so a
+#   failed log write must never prevent the `exit 1` that follows.
+_log() {
+    local level="$1"
+    local message="$2"
+    local timestamp line
+    # Assigned separately from `local` so the declaration does not mask date's status.
+    timestamp=$(date '+%Y-%m-%d %H:%M:%S') || true
+    line="${timestamp} - ${level} - ${message}"
+    if [[ "$level" =~ ^(ERROR|ERR|error|err)$ ]]; then
+        # Fall back to our own stderr when /proc/1/fd/2 cannot be opened.
+        { echo "${line}" > /proc/1/fd/2; } 2>/dev/null || echo "${line}" >&2 || true
+    else
+        # Best effort: /proc/1/fd/1 may not be writable from this process.
+        { echo "${line}" > /proc/1/fd/1; } 2>/dev/null || true
+        echo "${line}" >&2 || true
+    fi
+    return 0
+}
diff --git a/chart/jenkins-operator/README.md b/chart/jenkins-operator/README.md index 2c2b2fc11..e61ec2f6f 100644 --- a/chart/jenkins-operator/README.md +++ b/chart/jenkins-operator/README.md @@ -30,7 +30,7 @@ Kubernetes native operator which fully manages Jenkins on Kubernetes | jenkins.backup.env[2].name | string | `"BACKUP_COUNT"` | | | jenkins.backup.env[2].value | string | `"3"` | | | jenkins.backup.getLatestAction[0] | string | `"/home/user/bin/get-latest.sh"` | | -| jenkins.backup.image | string | 
`"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6"` | | +| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1"` | | | jenkins.backup.interval | int | `30` | | | jenkins.backup.makeBackupBeforePodDeletion | bool | `true` | | | jenkins.backup.pvc.className | string | `""` | | diff --git a/chart/jenkins-operator/values.yaml b/chart/jenkins-operator/values.yaml index cc878194b..2ecdb5796 100644 --- a/chart/jenkins-operator/values.yaml +++ b/chart/jenkins-operator/values.yaml @@ -214,7 +214,7 @@ jenkins: # image used by backup feature # By default using prebuilt backup PVC image - image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6 + image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1 # containerName is backup container name containerName: backup @@ -262,6 +262,11 @@ jenkins: # BACKUP_DIR - path for storing backup files (default: "/backup") # JENKINS_HOME - path to jenkins home (default: "/jenkins-home") # BACKUP_COUNT - define how much recent backups will be kept + # Optional in case you want to modify the backup and restore retry logic + # BACKUP_RETRY_COUNT + # BACKUP_RETRY_INTERVAL + # RESTORE_RETRY_COUNT + # RESTORE_RETRY_INTERVAL env: - name: BACKUP_DIR value: /backup @@ -269,6 +274,15 @@ jenkins: value: /jenkins-home - name: BACKUP_COUNT value: "3" # keep only the 3 most recent backups + #- name: BACKUP_RETRY_COUNT + # value: "3" + #- name: BACKUP_RETRY_INTERVAL + # value: "60" + #- name: RESTORE_RETRY_COUNT + # value: "10" + #- name: RESTORE_RETRY_INTERVAL + # value: "10" + # volumeMounts holds the mount points for volumes volumeMounts: diff --git a/nix/website-shell.nix b/nix/website-shell.nix index cda6748cb..725425855 100644 --- a/nix/website-shell.nix +++ b/nix/website-shell.nix @@ -4,6 +4,7 @@ let devShellPackages = [ hugo_099_pkgs.hugo #hugo pre-v100 pkgs.nodejs_21 #Node 1.21 + pkgs.helm-docs ]; baseUrl = ((builtins.fromTOML (builtins.readFile ../website/config.toml)).baseURL); in diff --git 
a/website/content/en/docs/Getting Started/latest/configuring-backup-and-restore.md b/website/content/en/docs/Getting Started/latest/configuring-backup-and-restore.md index 35acf8660..86cccde8d 100644 --- a/website/content/en/docs/Getting Started/latest/configuring-backup-and-restore.md +++ b/website/content/en/docs/Getting Started/latest/configuring-backup-and-restore.md @@ -2,7 +2,7 @@ title: "Configuring backup and restore" linkTitle: "Configuring backup and restore" weight: 5 -date: 2023-01-08 +date: 2024-06-25 description: > Prevent loss of job history --- @@ -115,3 +115,19 @@ spec: command: - /home/user/bin/get-latest.sh # this command is invoked on "backup" container to get last backup number before pod deletion; not having it in the CR may cause loss of data ``` + +#### Customizing PVC backup behaviour + +To handle situations where the operator crashes or is killed during a backup or restore process, retry logic has been implemented. + +This logic can be customized by adjusting the following environment variables: + +* **Backup**: by default, waits up to 180s in total before giving up + * `BACKUP_RETRY_COUNT`: number of retries, defaults to `3` + * `BACKUP_RETRY_INTERVAL`: seconds between retries, defaults to `60` + +* **Restore**: by default, waits up to 100s in total before giving up + * `RESTORE_RETRY_COUNT`: number of retries, defaults to `10` + * `RESTORE_RETRY_INTERVAL`: seconds between retries, defaults to `10` + +You can adjust the retry logic based on the size of your backup and the duration of the restore process.