From 1e6eb628ce398691254e726789ff0cbf89139825 Mon Sep 17 00:00:00 2001 From: librelad Date: Sat, 23 May 2026 16:55:58 +0100 Subject: [PATCH] fix(backup): survive DB engine first-init restart on restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live-restore of a server DB (MariaDB/Postgres) raced the engine's first-init: it starts a throwaway temp server, runs setup, then restarts the real one. The old ping-based readiness passed against the temp server and the load hit the restart, failing once. - _backupDbWaitReady now requires a real query to succeed on two consecutive checks, so the restart breaks the streak and we only proceed once the real server is stably up. - The dump load is retried (idempotent — the dump drops+recreates each object) to ride past a final init bounce. Co-Authored-By: Claude Opus 4.7 Signed-off-by: librelad --- scripts/backup/db/backup_db.sh | 73 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/scripts/backup/db/backup_db.sh b/scripts/backup/db/backup_db.sh index 35a8a9b..cd8a67c 100644 --- a/scripts/backup/db/backup_db.sh +++ b/scripts/backup/db/backup_db.sh @@ -133,24 +133,50 @@ _backupDbDumpName() esac } -# Wait until a server database accepts connections (it has just been started -# fresh on restore, or is mid-load on a busy host). +# Wait until a server database is genuinely ready for a load. On a fresh init +# (the restore case) the engine starts a throwaway temp server, runs its setup, +# then stops it and starts the real one — a simple ping passes against the temp +# server and the load then races the restart. So require a real query to +# succeed on two consecutive checks: the restart drops the streak, so we only +# return once the real server is stably up. 
_backupDbWaitReady() { - local kind="$1" container="$2" tries=30 - local i + local kind="$1" container="$2" tries="${3:-45}" + local i ok=0 good for ((i = 0; i < tries; i++)); do + good=0 case "$kind" in postgres) - docker exec "$container" sh -c 'pg_isready -U "${POSTGRES_USER:-postgres}" -q' >/dev/null 2>&1 && return 0 ;; + docker exec "$container" sh -c 'export PGPASSWORD="${POSTGRES_PASSWORD:-}"; psql -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}" -tAc "SELECT 1"' >/dev/null 2>&1 && good=1 ;; *) - docker exec "$container" sh -c 'mariadb-admin ping -uroot -p"${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}" 2>/dev/null || mysqladmin ping -uroot -p"${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"' >/dev/null 2>&1 && return 0 ;; + docker exec "$container" sh -c 'RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; mariadb -uroot -p"$RP" -N -e "SELECT 1" 2>/dev/null || mysql -uroot -p"$RP" -N -e "SELECT 1"' >/dev/null 2>&1 && good=1 ;; esac + if [[ $good -eq 1 ]]; then + ok=$((ok + 1)) + [[ $ok -ge 2 ]] && return 0 + else + ok=0 + fi sleep 2 done return 1 } +# Load one gzip-compressed server dump ($3) into container $2; the dump is streamed to the DB client's stdin. Kind $1 is postgres, anything else is treated as MariaDB/MySQL. +# Returns non-zero if either decompression or the load fails (pipefail) so the caller can retry; the client is picked up front so a failed load is never resumed mid-stream by a second client. +_backupDbImport() +{ + local kind="$1" container="$2" dump="$3" + case "$kind" in + postgres) + ( set -o pipefail; sudo gzip -dc "$dump" | docker exec -i "$container" sh -c \ + 'export PGPASSWORD="${POSTGRES_PASSWORD:-}"; psql -v ON_ERROR_STOP=1 -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}"' >/dev/null 2>&1 ) ;; + *) + ( set -o pipefail; sudo gzip -dc "$dump" | docker exec -i "$container" sh -c \ + 'RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; if command -v mariadb >/dev/null 2>&1; then mariadb -uroot -p"$RP"; else mysql -uroot -p"$RP"; fi' >/dev/null 2>&1 ) ;; + esac +} + # Dump every declared database for an app to consistent files inside the app # dir, while the containers keep running. Called on the live path only.
backupDbDump() @@ -309,23 +335,22 @@ restoreDbReplayPostStart() continue fi - case "$kind" in - postgres) - if sudo gzip -dc "$dump" | docker exec -i "$container" sh -c \ - 'export PGPASSWORD="${POSTGRES_PASSWORD:-}"; psql -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}"' >/dev/null 2>&1; then - isSuccessful "postgres dump loaded into $container" - else - isError "Loading postgres dump into $container failed" - fi - ;; - mysql|mariadb) - if sudo gzip -dc "$dump" | docker exec -i "$container" sh -c \ - 'RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; (mariadb -uroot -p"$RP" 2>/dev/null || mysql -uroot -p"$RP")' >/dev/null 2>&1; then - isSuccessful "$kind dump loaded into $container" - else - isError "Loading $kind dump into $container failed" - fi - ;; - esac + # Retry the load: even after the readiness streak, a fresh engine can + # bounce once more as it finishes init. The dump drops+recreates each + # object, so re-running is idempotent. + local attempt loaded=1 + for attempt in 1 2 3 4 5; do + if _backupDbImport "$kind" "$container" "$dump"; then + loaded=0; break + fi + isNotice "$container not ready for load yet (attempt $attempt) — retrying" + sleep 5 + _backupDbWaitReady "$kind" "$container" >/dev/null 2>&1 + done + if [[ $loaded -eq 0 ]]; then + isSuccessful "$kind dump loaded into $container" + else + isError "Loading $kind dump into $container failed after retries; data left for manual recovery" + fi done < <(backupDbDescriptors "$app") }