librelad a3afb2aeae feat(model-a): run app as manager; route bare docker calls through runFileOp
Model A prototype (run start.sh AS the manager, escalate only via helpers):
- check_root.sh: accept the manager user, not root-only (init.sh keeps its own
  install-time root check).
- init.sh: guard the top-level root-check + installer entrypoint with
  BASH_SOURCE!=$0 so it runs ONLY when init.sh is executed directly; when
  start.sh sources it as the manager the entrypoint (and its root check) no
  longer fires.

Also: convert bare daemon-touching 'docker' calls (no helper -> hit the
nonexistent /var/run socket in rootless) to runFileOp docker across
app_status, app_health_*, network_prune, ip_is_available, check_docker_network,
backup_db (db dumps) and crontab_check_processor. cd&&compose rooted-branches
and 'docker compose --version' checks left as-is (rooted-only / no daemon).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Signed-off-by: librelad <librelad@digitalangels.vip>
2026-05-24 16:53:37 +01:00

358 lines
15 KiB
Bash

#!/bin/bash
# Live, consistent database backups.
#
# A file-level snapshot of a running database is "torn" — pages can be
# half-written when restic reads them, so the restored copy may not even
# mount. The fix is a *logical* dump taken while the service keeps running:
# mysqldump --single-transaction / pg_dump / sqlite3 .backup all produce a
# transactionally-consistent file with zero downtime. We snapshot that dump
# (and exclude the raw data dir, which is now redundant and unreliable), then
# replay it on restore.
#
# Apps declare their databases as compose labels so the metadata travels with
# the app (the compose is always copied to the install dir and always lives in
# the snapshot). One label per database:
#
# labels:
# libreportal.backup.db: "<kind>:<container>:<datadir>:<path>"
#
#   kind       - mysql | mariadb | postgres | sqlite
#   container  - container_name of the service to `docker exec` into (server
#                engines only; empty for sqlite)
#   datadir    - app-dir-relative folder holding the raw DB files; excluded
#                from live snapshots
#   path       - app-dir-relative path to the sqlite file (sqlite only)
#
# Examples:
#   "mysql:nextcloud-db:db_data:"          -> MariaDB/MySQL in nextcloud-db; raw db_data/ is excluded
#   "postgres:mastodon-db:postgres_data:"  -> Postgres in mastodon-db; raw postgres_data/ is excluded
#   "sqlite:::data/gitea.db"               -> sqlite file at data/gitea.db
#
# An app with no database can still opt into live snapshots (its files are
# static enough to capture safely) with:
# labels:
# libreportal.backup.live: "true"
# Where the consistent dumps are written, relative to the app dir. Because it
# sits at the app root it can never end up inside an excluded datadir, so the
# dumps always travel with the snapshot.
backup_db_dump_subdir='.lp-backup/db'
# Emit one "kind:container:datadir:path" line per declared database, read from
# the *installed* compose so it reflects what is actually deployed.
#
# Arguments: $1 - app name (folder under $containers_dir)
# Outputs:   one descriptor per `libreportal.backup.db` label on stdout, with
#            the label key, any trailing "# comment", surrounding whitespace
#            and one pair of wrapping quotes removed
# Returns:   0 when the compose file is missing; otherwise the status of the
#            final sed
backupDbDescriptors()
{
    local app="$1"
    local compose="$containers_dir$app/docker-compose.yml"
    [[ -f "$compose" ]] || return 0

    # Trailing whitespace (including the CR of a CRLF-edited file) must be
    # removed *before* the quotes: otherwise the closing quote is not at
    # end-of-line, is not stripped, and leaks into the last descriptor field.
    # The final trim handles whitespace that sat inside the quotes.
    grep -E '^[[:space:]]*libreportal\.backup\.db[[:space:]]*:' "$compose" 2>/dev/null \
        | sed -E \
            -e 's/^[[:space:]]*libreportal\.backup\.db[[:space:]]*:[[:space:]]*//' \
            -e 's/[[:space:]]*#.*$//' \
            -e 's/[[:space:]]+$//' \
            -e 's/^["'\'']//' \
            -e 's/["'\'']$//' \
            -e 's/[[:space:]]+$//'
}
# Succeeds (0) when backupDbDescriptors prints anything for the app, i.e. the
# installed compose declares at least one database; returns 1 otherwise.
backupDbHasDescriptors()
{
    local app="$1"
    local declared
    declared="$(backupDbDescriptors "$app")"
    [[ -n "$declared" ]]
}
# Succeeds when the installed compose carries a `libreportal.backup.live` label
# whose value starts with true (optionally quoted) — the app's opt-in that its
# data is safe to snapshot while running even without a database to dump.
# Returns 1 when the compose file is missing or the label is absent.
backupAppIsLiveSafe()
{
    local app="$1"
    local compose="$containers_dir$app/docker-compose.yml"
    local live_label_re='^[[:space:]]*libreportal\.backup\.live[[:space:]]*:[[:space:]]*["'\'']?true'

    if [[ ! -f "$compose" ]]; then
        return 1
    fi
    grep -qE "$live_label_re" "$compose" 2>/dev/null || return 1
    return 0
}
# Decide whether an app can be backed up live, without downtime. It qualifies
# when any one of these holds (checked in this order):
#   - it declares a dumpable database,
#   - backupFilesHasDescriptors exists as a function and accepts the app,
#   - it is explicitly labelled live-safe.
# Returns 0 when live-capable, 1 otherwise.
backupAppLiveCapable()
{
    local app="$1"

    backupDbHasDescriptors "$app" && return 0

    # Only consulted when a function of that name is actually defined.
    if declare -f backupFilesHasDescriptors >/dev/null 2>&1; then
        backupFilesHasDescriptors "$app" && return 0
    fi

    backupAppIsLiveSafe "$app" && return 0
    return 1
}
# Print the strategy choices valid for one app in .config "[a:b|c:d]" syntax
# (value:label pairs joined by "|"). The "live" choice is appended only when
# the app is live-capable, so the UI never offers an option that would just
# fall back to stop. Always returns 0.
backupAppStrategyOptions()
{
    local app="$1"
    local -a choices=(
        "auto:Automatic — recommended"
        "stop-snapshot-start:Stop → snapshot → start"
        "pause-snapshot-unpause:Pause → snapshot → unpause"
    )

    if backupAppLiveCapable "$app"; then
        choices+=("live:Live — no downtime")
    fi

    # "${choices[*]}" joins on the first character of IFS.
    local IFS='|'
    echo "${choices[*]}"
    return 0
}
# Resolve the effective strategy for one app. Order of precedence:
#   1. per-app override CFG_<APP>_BACKUP_STRATEGY (advanced, defaults to auto)
#   2. global default   CFG_BACKUP_STRATEGY      (auto)
# An explicit stop/pause/live is honoured as-is; "auto" (or any unrecognised
# value) goes live only where the app is live-capable and otherwise uses the
# always-safe stop-snapshot-start.
#
# Arguments: $1 - app name
# Outputs:   live | pause-snapshot-unpause | stop-snapshot-start on stdout
# Returns:   0 always
backupResolveStrategy()
{
    local app="$1"

    # The override key is a shell variable name, so every character that is
    # not valid in one is mapped to "_". Without this an app such as "my-app"
    # makes the indirect expansion below fail with "invalid variable name".
    # NOTE(review): assumes the config writer uses the same "_" mapping for
    # such app names — confirm.
    local key_app="${app^^}"
    key_app="${key_app//[^A-Za-z0-9_]/_}"
    local override_key="CFG_${key_app}_BACKUP_STRATEGY"

    # ":-" keeps an unset override harmless even under `set -u`.
    local s="${!override_key:-}"
    if [[ -z "$s" || "$s" == "auto" ]]; then
        s="${CFG_BACKUP_STRATEGY:-auto}"
    fi

    case "$s" in
        live|pause-snapshot-unpause|stop-snapshot-start)
            echo "$s"; return 0 ;;
    esac

    if backupAppLiveCapable "$app"; then
        echo "live"
    else
        echo "stop-snapshot-start"
    fi
    return 0
}
# Deterministic dump filename for a descriptor — backup writes it, restore reads
# it, both deriving the same name from the descriptor with no side metadata.
#
# Arguments: $1 - kind, $2 - container, $3 - sqlite path (app-dir-relative)
# Outputs:   sqlite -> "sqlite-<path with / replaced by _>.sqlite.gz"
#            others -> "db-<container>.sql.gz"
_backupDbDumpName()
{
    local kind="$1" container="$2" path="$3"
    case "$kind" in
        # Parameter expansion instead of `echo | tr`: no forks, and a path that
        # happens to look like an echo option (e.g. "-n") is kept verbatim.
        sqlite) printf 'sqlite-%s.sqlite.gz\n' "${path//\//_}" ;;
        *)      printf 'db-%s.sql.gz\n' "$container" ;;
    esac
}
# Block until a server database is genuinely ready to take a load. During a
# fresh init (the restore case) the engine first runs a throwaway temp server,
# does its setup, stops it and only then starts the real one; a plain ping
# would pass against the temp server and the load would race the restart. So a
# real "SELECT 1" has to succeed on two consecutive probes — the restart breaks
# the streak, and we return only once the real server is stably up.
#
# Arguments: $1 - kind ("postgres"; anything else is probed as MariaDB/MySQL)
#            $2 - container name
#            $3 - maximum number of probes (default 45), 2 s apart
# Returns:   0 once two probes in a row succeed, 1 when the probes run out
_backupDbWaitReady()
{
    local kind="$1" container="$2" tries="${3:-45}"
    local attempt=0 streak=0 probe

    # The probe runs inside the container and uses its own credentials env.
    if [[ "$kind" == "postgres" ]]; then
        probe='export PGPASSWORD="${POSTGRES_PASSWORD:-}"; psql -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}" -tAc "SELECT 1"'
    else
        probe='RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; mariadb -uroot -p"$RP" -N -e "SELECT 1" 2>/dev/null || mysql -uroot -p"$RP" -N -e "SELECT 1"'
    fi

    while (( attempt < tries )); do
        if runFileOp docker exec "$container" sh -c "$probe" >/dev/null 2>&1; then
            streak=$((streak + 1))
            if (( streak >= 2 )); then
                return 0
            fi
        else
            # Any failure restarts the streak.
            streak=0
        fi
        sleep 2
        attempt=$((attempt + 1))
    done
    return 1
}
# Load one server dump into its (freshly initialised) container. The gzipped
# dump is decompressed and piped to the client's stdin inside the container.
#
# Arguments: $1 - kind ("postgres"; anything else is loaded as MariaDB/MySQL)
#            $2 - container name
#            $3 - path to the gzipped dump
# Returns:   0 only when both the decompression and the load succeed, so the
#            caller can retry on any failure
_backupDbImport()
{
    local kind="$1" container="$2" dump="$3"
    local loader

    case "$kind" in
        postgres)
            loader='export PGPASSWORD="${POSTGRES_PASSWORD:-}"; psql -v ON_ERROR_STOP=1 -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}"' ;;
        *)
            loader='RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; (mariadb -uroot -p"$RP" 2>/dev/null || mysql -uroot -p"$RP")' ;;
    esac

    # docker goes through runFileOp like every other daemon-touching call in
    # this file; a bare `docker` would bypass the helper.
    # NOTE(review): assumes runFileOp forwards stdin to the wrapped command —
    # confirm.
    runFileOp gzip -dc "$dump" \
        | runFileOp docker exec -i "$container" sh -c "$loader" >/dev/null 2>&1

    # Without pipefail only the last stage's status would be seen; a corrupt
    # or unreadable dump must count as a failed load too.
    local -a stage=("${PIPESTATUS[@]}")
    [[ ${stage[0]} -eq 0 && ${stage[1]} -eq 0 ]]
}
# Dump every declared database for an app to consistent files inside the app
# dir, while the containers keep running. Called on the live path only.
#
# Arguments: $1 - app name
# Globals:   containers_dir, backup_db_dump_subdir, docker_install_user (read)
# Returns:   0 when every declared database was dumped (or none is declared),
#            1 when any dump failed or a descriptor had an unknown kind
#
# Every pipeline stage is checked: a dump whose producer failed is reported as
# a failure and its (empty or partial) file is removed, so it cannot be
# snapshotted and later replayed in place of real data.
backupDbDump()
{
    local app="$1"
    local app_dir="$containers_dir$app"
    local dump_dir="$app_dir/$backup_db_dump_subdir"
    local desc kind container datadir path dump dump_cmd src tmp rc=0

    backupDbHasDescriptors "$app" || return 0
    runFileOp mkdir -p "$dump_dir"

    while IFS= read -r desc; do
        [[ -z "$desc" ]] && continue
        IFS=':' read -r kind container datadir path <<< "$desc"
        dump="$dump_dir/$(_backupDbDumpName "$kind" "$container" "$path")"

        case "$kind" in
            postgres|mysql|mariadb)
                # The dump command runs inside the container and picks up the
                # credentials from the container's own environment.
                if [[ "$kind" == "postgres" ]]; then
                    dump_cmd='export PGPASSWORD="${POSTGRES_PASSWORD:-}"; pg_dump --clean --if-exists -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-${POSTGRES_USER:-postgres}}"'
                else
                    dump_cmd='RP="${MARIADB_ROOT_PASSWORD:-$MYSQL_ROOT_PASSWORD}"; DB="${MARIADB_DATABASE:-$MYSQL_DATABASE}"; (mariadb-dump -uroot -p"$RP" --single-transaction --routines --triggers --databases "$DB" 2>/dev/null || mysqldump -uroot -p"$RP" --single-transaction --routines --triggers --databases "$DB")'
                fi
                isNotice "Dumping $kind ($container) — live, consistent"
                # pipefail is scoped to a subshell so the caller's shell
                # options stay untouched; without it only runFileWrite's
                # status would be seen and a failed dump would look fine.
                if (
                    set -o pipefail
                    runFileOp docker exec "$container" sh -c "$dump_cmd" 2>/dev/null \
                        | gzip | runFileWrite "$dump"
                ); then
                    isSuccessful "$kind dump written ($container)"
                else
                    runFileOp rm -f "$dump"
                    isError "$kind dump failed ($container)"; rc=1
                fi
                ;;
            sqlite)
                isNotice "Dumping sqlite ($path) — live, consistent"
                src="$app_dir/$path"
                if [[ ! -f "$src" ]]; then
                    # Declared but not found — could be a fresh app, or a wrong
                    # path. Treat as a dump failure so the caller falls back to
                    # the safe stop-snapshot-start rather than snapshotting a
                    # live, possibly torn sqlite file.
                    isError "sqlite file $path not found — cannot dump"
                    rc=1
                    continue
                fi
                # .backup takes a consistent copy even while the app writes.
                tmp="$dump_dir/.$(basename "$path").tmp"
                if runFileOp sqlite3 "$src" ".backup '$tmp'" 2>/dev/null \
                    && (
                        set -o pipefail
                        runFileOp gzip -c "$tmp" | runFileWrite "$dump"
                    ); then
                    runFileOp rm -f "$tmp"
                    isSuccessful "sqlite dump written ($path)"
                else
                    runFileOp rm -f "$tmp" "$dump"
                    isError "sqlite dump failed ($path)"; rc=1
                fi
                ;;
            *)
                isError "Unknown db kind '$kind' for $app — skipping"; rc=1 ;;
        esac
    done < <(backupDbDescriptors "$app")

    # Best effort: hand the dump dir to the install user; failure is ignored.
    runFileOp chown -R "$docker_install_user":"$docker_install_user" "$dump_dir" 2>/dev/null
    return $rc
}
# List the paths a live snapshot should leave out because a dump supersedes
# them: the sqlite file plus its -wal/-shm companions, or the raw data dir of a
# server engine. One path per line on stdout ($containers_dir + app + the
# relative entry), for the engine adapters. Descriptors with an empty
# path/datadir contribute nothing.
backupDbExcludePaths()
{
    local app="$1"
    local app_dir="$containers_dir$app"
    local desc kind container datadir path suffix

    while IFS= read -r desc; do
        if [[ -z "$desc" ]]; then
            continue
        fi
        IFS=':' read -r kind container datadir path <<< "$desc"

        if [[ "$kind" == "sqlite" ]]; then
            if [[ -n "$path" ]]; then
                for suffix in "" "-wal" "-shm"; do
                    printf '%s\n' "$app_dir/$path$suffix"
                done
            fi
        elif [[ -n "$datadir" ]]; then
            printf '%s\n' "$app_dir/$datadir"
        fi
    done < <(backupDbDescriptors "$app")
}
# Pre-start restore step. Runs after the snapshot is laid down but before the
# containers come up:
#   server  remove the (absent or stale) raw data dir so the engine's first-run
#           init builds a clean, empty database for us to load into.
#   sqlite  put the consistent dump back at the real path so the app opens it.
# A descriptor without a dump in the snapshot is left alone.
#
# Arguments: $1 - app name
# Globals:   containers_dir, backup_db_dump_subdir, docker_install_user (read)
restoreDbRehydratePreStart()
{
    local app="$1"
    local app_dir="$containers_dir$app"
    local dump_dir="$app_dir/$backup_db_dump_subdir"
    local desc kind container datadir path dump target

    backupDbHasDescriptors "$app" || return 0

    while IFS= read -r desc; do
        [[ -z "$desc" ]] && continue
        IFS=':' read -r kind container datadir path <<< "$desc"
        dump="$dump_dir/$(_backupDbDumpName "$kind" "$container" "$path")"

        case "$kind" in
            sqlite)
                [[ -f "$dump" ]] || { isNotice "No sqlite dump for $path — leaving app to initialise"; continue; }
                target="$app_dir/$path"
                # Stale -wal/-shm files must not be replayed onto the restored copy.
                runFileOp rm -f "$target" "$target-wal" "$target-shm"
                runFileOp mkdir -p "$(dirname "$target")"
                # Check both stages (pipefail scoped to a subshell): a corrupt
                # dump must not be announced as a successful rehydrate.
                if (
                    set -o pipefail
                    runFileOp gzip -dc "$dump" | runFileWrite "$target"
                ); then
                    runFileOp chown -R "$docker_install_user":"$docker_install_user" "$(dirname "$target")"
                    isSuccessful "sqlite $path rehydrated from dump"
                else
                    runFileOp chown -R "$docker_install_user":"$docker_install_user" "$(dirname "$target")"
                    isError "sqlite $path could not be rehydrated from dump"
                fi
                ;;
            *)
                [[ -f "$dump" ]] || { isNotice "No dump for $container — keeping restored data dir as-is"; continue; }
                if [[ -n "$datadir" ]]; then
                    # ":?" aborts instead of running rm -rf against "/<datadir>".
                    runFileOp rm -rf "${app_dir:?}/$datadir"
                    isNotice "Cleared $datadir — $container will init fresh, then load the dump"
                else
                    isNotice "No data dir declared for $container — nothing cleared before loading the dump"
                fi
                ;;
        esac
    done < <(backupDbDescriptors "$app")
}
# Post-start restore step. Server engines load their dump into the freshly
# initialised database once it is accepting connections. sqlite is already in
# place from the pre-start step, so it is skipped here, as is any descriptor
# whose dump file is not present.
#
# Arguments: $1 - app name
# Globals:   containers_dir, backup_db_dump_subdir (read)
restoreDbReplayPostStart()
{
    local app="$1"
    local app_dir="$containers_dir$app"
    local dump_dir="$app_dir/$backup_db_dump_subdir"
    local desc kind container datadir path dump
    local attempt loaded max_attempts=5

    backupDbHasDescriptors "$app" || return 0

    while IFS= read -r desc; do
        [[ -z "$desc" ]] && continue
        IFS=':' read -r kind container datadir path <<< "$desc"
        dump="$dump_dir/$(_backupDbDumpName "$kind" "$container" "$path")"
        [[ "$kind" == "sqlite" ]] && continue
        [[ -f "$dump" ]] || continue

        isNotice "Waiting for $container to accept connections"
        if ! _backupDbWaitReady "$kind" "$container"; then
            isError "$container never became ready — dump not loaded; data dir left for manual recovery"
            continue
        fi

        # Retry the load: even after the readiness streak, a fresh engine can
        # bounce once more as it finishes init. The dump drops+recreates each
        # object, so re-running is idempotent.
        loaded=1   # shell convention: 0 = loaded, non-zero = not loaded
        for ((attempt = 1; attempt <= max_attempts; attempt++)); do
            if _backupDbImport "$kind" "$container" "$dump"; then
                loaded=0
                break
            fi
            # Nothing follows the last attempt, so don't announce a retry or
            # spend another sleep + readiness wait before reporting failure.
            (( attempt < max_attempts )) || break
            isNotice "$container not ready for load yet (attempt $attempt) — retrying"
            sleep 5
            _backupDbWaitReady "$kind" "$container" >/dev/null 2>&1
        done

        if [[ $loaded -eq 0 ]]; then
            isSuccessful "$kind dump loaded into $container"
        else
            isError "Loading $kind dump into $container failed after retries; data left for manual recovery"
        fi
    done < <(backupDbDescriptors "$app")
}