feat(reliability): continue-on-error config + honest checkSuccess

checkSuccess silently reported '✓ Success' for failed commands, which is how
the de-sudo write gaps (throttle stamp, passwords, updater) stayed hidden. Rework it:

- Capture the real exit code up front; success path unchanged.
- On failure, ALWAYS append to a greppable $logs_dir/error_report.log tagged
  with the caller's script:line + exit code — a failure can't hide behind a
  green check anymore.
- New CFG_REQUIREMENT_CONTINUE_ON_ERROR (default true): log + continue so one
  failure doesn't abort the run and we surface EVERY issue in a single pass.
  Flip it off later for strict abort/prompt (the prior behaviour, preserved).

Documents the 'local VAR=$(cmd); checkSuccess' footgun (local resets $?), which
the next commit fixes across the tree.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: librelad <librelad@digitalangels.vip>
This commit is contained in:
librelad 2026-05-31 03:05:37 +01:00
parent bc1969dd20
commit eecc5d29ba
2 changed files with 56 additions and 40 deletions

View File

@@ -13,5 +13,6 @@ CFG_REQUIREMENT_CONFIGS_AUTO_UPDATE=true # Auto Config Updates -
CFG_REQUIREMENT_CONFIGS_AUTO_DELETE=true # Auto Config Deletes - Remove config options no longer present in the template
CFG_REQUIREMENT_MISSING_IPS=false # IP Configuration Check - Check for and alert about missing IP configurations
CFG_REQUIREMENT_CONTINUE_PROMPT=false # Continue Prompts - Show continue prompts during installation for user confirmation
CFG_REQUIREMENT_CONTINUE_ON_ERROR=true # Continue On Error - Log failures to error_report.log and continue instead of aborting (on by default to surface issues; turn off for strict abort once clean)
CFG_REQUIREMENT_SUGGEST_INSTALLS=false # Install Suggestions - Enable application suggestions and recommendations during installation
CFG_REQUIREMENT_SUGGEST_METRICS=true # Metrics Suggestions - Offer Prometheus and Grafana during first install (requires Install Suggestions enabled)

View File

@@ -1,53 +1,68 @@
#!/bin/bash
# checkSuccess "message" — report on the exit status of the PRECEDING command.
#
# IMPORTANT for callers: $? must still be the command's exit when this is called.
# `local VAR=$(cmd); checkSuccess ...` is a BUG — the `local`/`declare` builtin
# resets $? to 0, masking the command's real failure. Use `local VAR; VAR=$(cmd)`
# (split declaration from assignment) so $? survives.
#
# On failure this ALWAYS records the failure to a greppable error report
# ($logs_dir/error_report.log) with the caller's script:line + exit code, so a
# failure can never hide behind a green check again. Then it either continues
# (CFG_REQUIREMENT_CONTINUE_ON_ERROR=true, the default — surface every issue in a
# single pass) or falls back to the strict abort/prompt behaviour when that's off.
function checkSuccess()
{
if [ $? -eq 0 ]; then
isSuccessful "$1"
local rc=$?
local msg="$1"
if [ "$rc" -eq 0 ]; then
isSuccessful "$msg"
if [ -f "$logs_dir/$docker_log_file" ]; then
echo "✓ Success $1" | runInstallWrite -a "$logs_dir/$docker_log_file" >/dev/null
echo "✓ Success $msg" | runInstallWrite -a "$logs_dir/$docker_log_file" >/dev/null
fi
else
isError "$1"
return 0
fi
# Non-interactive (task processor / cron / piped): bail instead of
# blocking on read.
if [[ "$LIBREPORTAL_NONINTERACTIVE" == "1" ]] || [ ! -t 0 ]; then
if [ -f "$logs_dir/$docker_log_file" ]; then
isError " $1" | runInstallWrite -a "$logs_dir/$docker_log_file" >/dev/null
echo "===================================" | runInstallWrite -a "$logs_dir/$docker_log_file" >/dev/null
fi
isNotice "Non-interactive mode: aborting on error."
exit 1
fi
# ---- failure ----
isError "$msg"
while true; do
isQuestion "An error has occurred. Do you want to continue, exit or go to back to the Menu? (c/x/m) "
read -rp "" error_occurred
if [[ -n "$error_occurred" ]]; then
break
fi
isNotice "Please provide a valid input."
done
# Record EVERY failure to a dedicated, greppable report (manager-owned logs),
# tagged with the caller's script:line + exit code. Best-effort; never aborts.
local _where="${BASH_SOURCE[1]##*/}:${BASH_LINENO[0]}"
local _stamp; _stamp="$(date '+%F %T' 2>/dev/null || echo now)"
printf '%s\t[exit %s]\t%s\t(%s)\n' "$_stamp" "$rc" "$msg" "$_where" \
| runInstallWrite -a "$logs_dir/error_report.log" 2>/dev/null || true
if [ -f "$logs_dir/$docker_log_file" ]; then
isError " $msg (exit $rc, $_where)" | runInstallWrite -a "$logs_dir/$docker_log_file" >/dev/null 2>&1 || true
fi
if [[ "$error_occurred" == [cC] ]]; then
isNotice "Continuing after error has occurred."
fi
# Continue-on-error (default true): log and carry on so a single failure
# doesn't abort the whole run and we surface EVERY issue in one pass. Turn
# CFG_REQUIREMENT_CONTINUE_ON_ERROR off for strict abort once things are clean.
if [[ "${CFG_REQUIREMENT_CONTINUE_ON_ERROR:-true}" == "true" ]]; then
isNotice "continue-on-error: logged to $logs_dir/error_report.log — continuing."
return 0
fi
if [[ "$error_occurred" == [xX] ]]; then
# Log the error output to the log file
isError " $1" | runInstallWrite -a "$logs_dir/$docker_log_file"
echo "===================================" | runInstallWrite -a "$logs_dir/$docker_log_file"
exit 1 # Exit the script with a non-zero status to stop the current action
fi
# ---- strict mode (continue-on-error off) ----
# Non-interactive (task processor / cron / piped): bail instead of blocking.
if [[ "$LIBREPORTAL_NONINTERACTIVE" == "1" ]] || [ ! -t 0 ]; then
isNotice "Non-interactive mode: aborting on error."
exit 1
fi
if [[ "$error_occurred" == [mM] ]]; then
# Log the error output to the log file
isError " $1" | runInstallWrite -a "$logs_dir/$docker_log_file"
echo "===================================" | runInstallWrite -a "$logs_dir/$docker_log_file"
if [[ "$initial_command2" == "terminal" ]]; then
resetToMenu;
fi
fi
while true; do
isQuestion "An error has occurred. Do you want to continue, exit or go to back to the Menu? (c/x/m) "
read -rp "" error_occurred
[[ -n "$error_occurred" ]] && break
isNotice "Please provide a valid input."
done
[[ "$error_occurred" == [cC] ]] && isNotice "Continuing after error has occurred."
[[ "$error_occurred" == [xX] ]] && exit 1
if [[ "$error_occurred" == [mM] && "$initial_command2" == "terminal" ]]; then
resetToMenu
fi
}