Compare commits: f150c2bbd2...main (15 commits)
RESTORE.md (new file, 219 lines)
@@ -0,0 +1,219 @@
# Restore Procedure Runbook

## Overview

This runbook documents the tested disaster recovery procedure for restoring namespaces from OADP backups. The strategy is to perform in-place restores (replacing the original namespace) rather than parallel restores, which avoids resource conflicts and is simpler to validate.

**Tested Applications:**

- **n8n** (monthly): Stateful app with PostgreSQL database. Has independent flow backups as safety net.
- **mailhog** (monthly): Stateless app for SMTP testing.
- **gitea** (one-shot validation): Full production restore to validate entire strategy.

## Prerequisites

- kubectl/oc access to cluster
- Velero CLI installed (optional but helpful)
- ArgoCD access to pause reconciliation
- Recent backup available for target namespace
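
A quick preflight for these prerequisites (a minimal sketch; the `infrastructure` AppProject and `openshift-adp` namespace match the commands used later in this runbook):

```bash
# Confirm cluster access, the optional Velero CLI, ArgoCD object access, and recent backups
oc whoami
velero version --client-only 2>/dev/null || echo "Velero CLI not installed (optional)"
oc -n openshift-gitops get appproject infrastructure -o name
oc -n openshift-adp get backup --sort-by=.metadata.creationTimestamp | tail -5
```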

## Set Variables

```bash
VELERO_NS=openshift-adp
SRC_NS=n8n                    # Namespace to restore (n8n, mailhog, etc.)
TS=$(date +%Y%m%d-%H%M%S)
RESTORE_NAME=${SRC_NS}-restore-${TS}
BACKUP_NAME=daily-stateful-*  # Placeholder - the Restore needs an exact backup name (set in Step 3)
```

## Step 1: Pause GitOps Reconciliation

Pause ArgoCD to prevent it from recreating resources while the restore test is in progress:

```bash
oc patch appproject infrastructure -n openshift-gitops \
  -p '{"spec": {"sourceNamespaces": []}}' --type merge

# Or via the ArgoCD UI: edit the Application and set Auto-Sync to Manual
```

> **Why**: GitOps will try to recreate namespaces/resources as they're deleted, interfering with the restore test.

## Step 2: Delete Target Namespace

```bash
echo "Deleting namespace: $SRC_NS"
oc delete ns $SRC_NS --wait=true

# Verify it's gone
oc get ns $SRC_NS 2>&1 | grep -i "not found" && echo "✓ Namespace deleted"
```

> **Note**: PersistentVolumes and backups remain intact.

## Step 3: Get Latest Backup Name

```bash
# Full list of backups (the Velero CLI has no namespace filter flag)
velero backup get

# Filter to backups whose spec includes the namespace
oc -n $VELERO_NS get backup -o json \
  | jq -r --arg ns "$SRC_NS" \
      '.items[] | select(.spec.includedNamespaces // [] | index($ns)) | .metadata.name'

# Pick the newest backup (confirm it actually covers $SRC_NS)
BACKUP_NAME=$(oc -n $VELERO_NS get backup \
  --sort-by=.metadata.creationTimestamp \
  -o jsonpath='{.items[-1].metadata.name}')
echo "Using backup: $BACKUP_NAME"
```

## Step 4: Create Restore Resource

```bash
cat <<EOF | oc apply -f -
apiVersion: velero.io/v1
kind: Restore
metadata:
  name: $RESTORE_NAME
  namespace: $VELERO_NS
spec:
  backupName: $BACKUP_NAME
  includeClusterResources: false
  includedNamespaces:
    - $SRC_NS
  restorePVs: true
  excludedResources:
    - routes.route.openshift.io  # Routes are environment-specific
EOF
```

## Step 5: Monitor Restore Progress

```bash
# Watch restore status
watch -n 5 "oc -n $VELERO_NS get restore $RESTORE_NAME -o jsonpath='{.status.phase}{\"\\n\"}'"

# When complete, check for errors
oc -n $VELERO_NS describe restore $RESTORE_NAME
```

**Expected phases:** New → InProgress → Completed (or PartiallyFailed)
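
For unattended runs, a small polling loop can stand in for `watch` (a sketch; these are the standard Velero restore phases, though newer Velero releases add intermediate ones):

```bash
# Poll until the restore reaches a terminal phase
until phase=$(oc -n $VELERO_NS get restore $RESTORE_NAME -o jsonpath='{.status.phase}'); \
      [[ "$phase" =~ ^(Completed|PartiallyFailed|Failed)$ ]]; do
  echo "Restore phase: ${phase:-New}"
  sleep 10
done
echo "Final phase: $phase"
```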

## Step 6: Validate Application Functionality

### For n8n:

```bash
# Wait for pods to be ready
oc -n $SRC_NS rollout status statefulset/postgres --timeout=5m
oc -n $SRC_NS rollout status deployment/n8n --timeout=5m

# Check if data is intact
oc -n $SRC_NS logs -l app.kubernetes.io/name=n8n -c n8n --tail=50 | grep -i "started\|error\|failed"

# Port-forward and test UI
oc -n $SRC_NS port-forward svc/n8n 5678:5678 &
sleep 2
curl -s http://localhost:5678/healthz | jq .
kill %1
```
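
For a deeper check than grepping logs, querying the restored database directly also works. A sketch: the `postgres` statefulset, `n8n` database, and `root` user match the n8n checkpoint hook in this repo's backup schedule, while `workflow_entity` is an assumption about n8n's table names:

```bash
# Count restored workflows straight from PostgreSQL
# (workflow_entity is an assumed n8n table name - adjust if the schema differs)
oc -n $SRC_NS exec statefulset/postgres -- \
  psql -d n8n -U root -c 'SELECT count(*) FROM workflow_entity;'
```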

### For mailhog:

```bash
oc -n $SRC_NS rollout status deployment/mailhog --timeout=5m

# Verify the service is responding
oc -n $SRC_NS port-forward svc/mailhog 1025:1025 8025:8025 &
sleep 2
curl -s http://localhost:8025/ | head -20
kill %1
```

### For any application:

```bash
# General validation
oc -n $SRC_NS get all
oc -n $SRC_NS get pvc
oc -n $SRC_NS get secrets
oc -n $SRC_NS get configmap

# Check Velero labels (proof of restore)
oc -n $SRC_NS get deployment -o jsonpath='{.items[0].metadata.labels.velero\.io/restore-name}'
```

## Step 7: Resume GitOps Reconciliation

```bash
# Re-enable ArgoCD
oc patch appproject infrastructure -n openshift-gitops \
  -p '{"spec": {"sourceNamespaces": ["*"]}}' --type merge

# Or via the ArgoCD UI: re-enable Auto-Sync

# Monitor for reconciliation
watch -n 5 "oc -n openshift-gitops get applications.argoproj.io -l argocd.argoproj.io/instance=infrastructure"
```

## Step 8: Monitor for Reconciliation Flapping

```bash
# Watch for any conflicts or drift
oc -n $SRC_NS get events --sort-by='.lastTimestamp' | tail -20

# Check if deployments are stable
oc -n $SRC_NS rollout status deployment/$SRC_NS --timeout=5m

# Verify no pending changes in ArgoCD
argocd app get infrastructure-$SRC_NS  # Check sync status
```

> If you see repeated reconciliation or conflicts, check:
> - Are there immutable fields that changed?
> - Did Velero inject labels that conflict with Helm? (see the label dump below)
> - Is GitOps trying to scale/restart pods?
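
For the Helm label question, dumping the Velero-injected labels next to the Helm chart metadata makes conflicts easy to spot (a sketch using the standard `velero.io/` and `helm.sh/` label keys):

```bash
# Show Velero-injected labels alongside Helm chart metadata for each deployment
oc -n $SRC_NS get deployment -o json | jq '
  .items[] | {
    name: .metadata.name,
    velero: ((.metadata.labels // {}) | with_entries(select(.key | startswith("velero.io/")))),
    helm_chart: ((.metadata.labels // {})["helm.sh/chart"] // "n/a")
  }'
```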

## Step 9: Cleanup

```bash
# Delete the restore resource
oc -n $VELERO_NS delete restore $RESTORE_NAME

# (Namespace stays running - that's the point!)
echo "✓ Restore test complete. $SRC_NS is now running from backup."
```

## Troubleshooting

### Restore shows PartiallyFailed

```bash
oc -n $VELERO_NS describe restore $RESTORE_NAME | grep -A 50 "Status:"
velero restore logs $RESTORE_NAME  # If the Velero CLI is installed
```

### Pods stuck in Pending

```bash
oc -n $SRC_NS describe pod <pod-name>
oc -n $SRC_NS get pvc  # Check if PVCs are bound
oc get pv | grep $SRC_NS
```

### Data looks wrong

- Check that you restored the correct backup
- For databases (n8n, postgres): check the logs for corruption warnings
- If corrupted: re-delete the namespace and restore from an earlier backup (see the listing below)
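
To pick an earlier backup, list them oldest first with their phases (standard kubectl sorting; rerun Steps 2-5 with the chosen name):

```bash
# List backups oldest first so an earlier known-good one is easy to spot
oc -n $VELERO_NS get backup \
  --sort-by=.metadata.creationTimestamp \
  -o custom-columns=NAME:.metadata.name,PHASE:.status.phase,COMPLETED:.status.completionTimestamp
```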

## Testing Schedule

- **Monthly**: n8n and mailhog (in-place, validated)
- **One-shot after major changes**: full application restores to validate the entire strategy
- **After backup retention policy changes**: restore the oldest available backup to verify

## Success Criteria

✅ Namespace deleted cleanly
✅ Restore completes without PartiallyFailed
✅ All pods reach Running state
✅ Application data is intact and queryable
✅ UI/APIs respond correctly
✅ GitOps reconciliation completes without conflicts
✅ velero.io/restore-name label visible on resources

@@ -6,7 +6,8 @@ metadata:
   name: monthly-restore-test
   namespace: openshift-adp
 spec:
+  timeZone: "Australia/Sydney"
   schedule: "0 06 15 * *"  # 15th of month, 6 AM
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 3
   failedJobsHistoryLimit: 3
@@ -20,44 +21,45 @@ spec:
           serviceAccountName: velero
           restartPolicy: OnFailure
           containers:
             - name: restore-test
               image: quay.io/konveyor/velero:latest
               env:
                 - name: VELERO_NAMESPACE
                   value: openshift-adp
               command:
                 - /bin/bash
                 - -c
                 - |
                   set -e

                   echo "=== Velero Restore Test ==="
                   echo "Date: $(date)"

                   # Get latest daily-config backup
                   CONFIG_BACKUP=$(velero backup get --selector="backup-type=config" \
                     -o json | jq -r '.items[0].metadata.name')

                   # Get latest daily-stateful backup
                   STATEFUL_BACKUP=$(velero backup get --selector="backup-type=stateful" \
                     -o json | jq -r '.items[0].metadata.name')

                   echo "Latest config backup: $CONFIG_BACKUP"
                   echo "Latest stateful backup: $STATEFUL_BACKUP"

                   # Verify backups are successful
                   CONFIG_STATUS=$(velero backup get $CONFIG_BACKUP -o json | \
                     jq -r '.status.phase')
                   STATEFUL_STATUS=$(velero backup get $STATEFUL_BACKUP -o json | \
                     jq -r '.status.phase')

                   echo "Config backup status: $CONFIG_STATUS"
                   echo "Stateful backup status: $STATEFUL_STATUS"

                   if [ "$CONFIG_STATUS" != "Completed" ] || [ "$STATEFUL_STATUS" != "Completed" ]; then
                     echo "ERROR: Backups not in Completed state"
                     exit 1
                   fi
+
                   echo "=== Test Passed ==="
                   echo "All backups verified successfully"

@@ -6,7 +6,7 @@ metadata:
   name: daily-config
   namespace: openshift-adp
 spec:
-  schedule: "0 02 * * *"  # 2 AM daily
+  schedule: "CRON_TZ=Australia/Sydney 0 02 * * *"  # 2 AM daily

   # Make backups readable, sortable, unique
   #nameTemplate: "{{ .ScheduleName }}-{{ .Timestamp }}"

@@ -33,11 +33,11 @@ spec:
     podConfig:
       resourceAllocations:
         limits:
-          cpu: "1"       # Increased for database compression
-          memory: "1Gi"  # Increased for larger chunks
+          cpu: 1
+          memory: 1Gi
         requests:
-          cpu: "200m"
-          memory: "512Mi"
+          cpu: 200m
+          memory: 512Mi
   velero:
     defaultPlugins:
       - openshift

@@ -49,10 +49,10 @@ spec:
     podConfig:
       resourceAllocations:
         limits:
-          cpu: "500m"
-          memory: "512Mi"
+          cpu: 1
+          memory: 2Gi
         requests:
-          cpu: "100m"
-          memory: "256Mi"
+          cpu: 100m
+          memory: 512Mi

   logFormat: text

@@ -6,7 +6,7 @@ metadata:
   name: daily-stateful
   namespace: openshift-adp
 spec:
-  schedule: "0 03 * * *"  # 3 AM daily (after config backup)
+  schedule: "CRON_TZ=Australia/Sydney 0 03 * * *"  # 3 AM daily (after config backup)

   #nameTemplate: "{{ .ScheduleName }}-{{ .Timestamp }}"

@@ -20,6 +20,8 @@ spec:
     - n8n
     - apim
     - gitea-ci
+    - openclaw
+    - clawdbox

   #labels:
   #  backup-type: stateful

@@ -32,8 +34,6 @@ spec:
     - events.events.k8s.io
     - pipelineruns.tekton.dev
     - taskruns.tekton.dev
-    - replicasets.apps
-    - pods

   # Use Kopia for volume backups
   snapshotVolumes: false

@@ -63,23 +63,23 @@ spec:
             onError: Continue

     # Gitea PostgreSQL: checkpoint before backup
-    - name: gitea-postgres-checkpoint
-      includedNamespaces:
-        - gitea
-      labelSelector:
-        matchLabels:
-          app.kubernetes.io/name: postgresql
-          app.kubernetes.io/instance: gitea
-      pre:
-        - exec:
-            container: postgresql
-            command:
-              - /bin/bash
-              - -c
-              - psql -U postgres -c 'CHECKPOINT;'
-            timeout: 2m
-            onError: Continue
+    #- name: gitea-postgres-checkpoint
+    #  includedNamespaces:
+    #    - gitea
+    #  labelSelector:
+    #    matchLabels:
+    #      app.kubernetes.io/name: postgresql
+    #      app.kubernetes.io/instance: gitea
+    #  pre:
+    #    - exec:
+    #        container: postgresql
+    #        command:
+    #          - /bin/bash
+    #          - -c
+    #          - PGPASSWORD=spVTpND34K psql -U postgres -c 'CHECKPOINT;'
+    #        timeout: 2m
+    #        onError: Continue

     # Authentik PostgreSQL: checkpoint before backup
     - name: authentik-postgres-checkpoint
       includedNamespaces:

@@ -94,6 +94,23 @@ spec:
             command:
               - /bin/bash
              - -c
-              - psql -U postgres -c 'CHECKPOINT;'
+              - PGPASSWORD=th1rt33nletterS. psql -U authentik -c 'CHECKPOINT;'
+            timeout: 2m
+            onError: Continue
+
+    # n8n PostgreSQL: checkpoint before backup
+    - name: n8n-postgres-checkpoint
+      includedNamespaces:
+        - n8n
+      labelSelector:
+        matchLabels:
+          app.kubernetes.io/service: postgres-n8n
+      pre:
+        - exec:
+            container: postgres
+            command:
+              - /bin/bash
+              - -c
+              - psql -d n8n -U root -c 'CHECKPOINT;'
             timeout: 2m
             onError: Continue