From c1e8b9bf35c3d5e8209c6d9c7e77275cdd045f7b Mon Sep 17 00:00:00 2001
From: Conan Scott
Date: Tue, 30 Dec 2025 17:38:06 +1100
Subject: [PATCH] completely reworked

Rework the OADP/Velero backup manifests:

- add an Argo CD Application (argo-git.yaml) that syncs gitops/oadp/base
- add a monthly CronJob that checks the newest config/stateful backups
- label the backups of both Schedules and exclude ephemeral resources
- set node-agent resource requests/limits in the DPA
- add pre-backup exec hooks (Cassandra flush, PostgreSQL CHECKPOINT)

---
Review notes (this section is ignored by git-am):

- Line breaks, indentation and blank context lines were reconstructed from
  a whitespace-collapsed copy of this patch. Context lines may not match
  the target files byte-for-byte; verify before applying.
- The blob ids on the "index" lines predate the content fixes below.
- Backup labels now live under spec.template.metadata.labels, which is
  where Velero reads them; the CronJob selects backups by those labels.
- The CronJob picks the newest backup by creationTimestamp instead of
  .items[0] and fails when no labelled backup exists.
- All files now end with a newline, so the former dpa.yaml hunk that only
  removed the trailing newline is gone.
- argo-git.yaml: spec.source.repoURL is empty and still has to be set.

 argo-git.yaml            | 21 ++++++++++
 backup-test-cronjob.yaml | 78 +++++++++++++++++++++++++++++++++++++
 config-backups.yaml      | 34 ++++++++++++++---
 dpa.yaml                 | 10 +++++
 stateful-backups.yaml    | 86 ++++++++++++++++++++++++++++++++++++++---
 5 files changed, 217 insertions(+), 12 deletions(-)
 create mode 100644 argo-git.yaml
 create mode 100644 backup-test-cronjob.yaml

diff --git a/argo-git.yaml b/argo-git.yaml
new file mode 100644
index 0000000..242b35a
--- /dev/null
+++ b/argo-git.yaml
@@ -0,0 +1,21 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: oadp-backups
+  namespace: openshift-gitops
+spec:
+  project: default
+  source:
+    repoURL:  # TODO(review): empty here - set the Git repository URL
+    targetRevision: main
+    path: gitops/oadp/base
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: openshift-adp
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=false
+      - ServerSideApply=true
diff --git a/backup-test-cronjob.yaml b/backup-test-cronjob.yaml
new file mode 100644
index 0000000..2a69069
--- /dev/null
+++ b/backup-test-cronjob.yaml
@@ -0,0 +1,78 @@
+---
+# Optional: Monthly restore test automation
+# NOTE: the script only checks that the newest labelled backups reached
+# phase Completed; it does not run a restore.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: monthly-restore-test
+  namespace: openshift-adp
+spec:
+  schedule: "0 06 15 * *"  # 15th of month, 6 AM
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            app: restore-test
+        spec:
+          serviceAccountName: velero
+          restartPolicy: OnFailure
+          containers:
+            - name: restore-test
+              image: quay.io/konveyor/velero:latest
+              env:
+                - name: VELERO_NAMESPACE
+                  value: openshift-adp
+              command:
+                - /bin/bash
+                - -c
+                - |
+                  set -euo pipefail
+
+                  echo "=== Velero Restore Test ==="
+                  echo "Date: $(date)"
+
+                  # Print the name of the newest backup labelled
+                  # backup-type=$1 (prints nothing when none exists). The
+                  # filter accepts both a list and a bare Backup object.
+                  latest_backup() {
+                    velero backup get --selector="backup-type=$1" -o json |
+                      jq -r '(if has("items") then (.items // []) else [.] end)
+                        | sort_by(.metadata.creationTimestamp)
+                        | last | .metadata.name // empty'
+                  }
+
+                  # Get latest daily-config backup
+                  CONFIG_BACKUP=$(latest_backup config)
+
+                  # Get latest daily-stateful backup
+                  STATEFUL_BACKUP=$(latest_backup stateful)
+
+                  if [ -z "$CONFIG_BACKUP" ] || [ -z "$STATEFUL_BACKUP" ]; then
+                    echo "ERROR: No backup found for one or both backup types"
+                    exit 1
+                  fi
+
+                  echo "Latest config backup: $CONFIG_BACKUP"
+                  echo "Latest stateful backup: $STATEFUL_BACKUP"
+
+                  # Verify backups are successful
+                  CONFIG_STATUS=$(velero backup get "$CONFIG_BACKUP" -o json | \
+                    jq -r '.status.phase')
+                  STATEFUL_STATUS=$(velero backup get "$STATEFUL_BACKUP" -o json | \
+                    jq -r '.status.phase')
+
+                  echo "Config backup status: $CONFIG_STATUS"
+                  echo "Stateful backup status: $STATEFUL_STATUS"
+
+                  if [ "$CONFIG_STATUS" != "Completed" ] || [ "$STATEFUL_STATUS" != "Completed" ]; then
+                    echo "ERROR: Backups not in Completed state"
+                    exit 1
+                  fi
+
+                  echo "=== Test Passed ==="
+                  echo "All backups verified successfully"
diff --git a/config-backups.yaml b/config-backups.yaml
index ee30c6a..fba0785 100644
--- a/config-backups.yaml
+++ b/config-backups.yaml
@@ -1,20 +1,42 @@
+---
+# Schedule 1: Daily config-only backup (fast, all namespaces)
 apiVersion: velero.io/v1
 kind: Schedule
 metadata:
   name: daily-config
   namespace: openshift-adp
 spec:
-  schedule: "0 02 * * *"
-
+  schedule: "0 02 * * *"  # 2 AM daily
+
   # Make backups readable, sortable, unique
   nameTemplate: "{{ .ScheduleName }}-{{ .Timestamp }}"

   template:
+    # Backup all namespaces
     includedNamespaces:
       - "*"
-
-    # No PV data — just manifests
+
+    # Labels for filtering and reporting
+    metadata:
+      labels:
+        backup-type: config
+        schedule: daily
+        retention: short
+
+    # Exclude ephemeral/generated resources
+    excludedResources:
+      - events
+      - events.events.k8s.io
+      - pipelineruns.tekton.dev  # Completed pipeline runs
+      - taskruns.tekton.dev  # Completed task runs
+      - replicasets.apps  # Managed by deployments
+      - pods  # Recreated by controllers
+      - endpoints  # Auto-generated
+      - endpointslices.discovery.k8s.io
+
+    # No volume data - manifests only
     snapshotVolumes: false
     defaultVolumesToFsBackup: false
-
-    ttl: 336h
+
+    # 14 days retention
+    ttl: 336h
diff --git a/dpa.yaml b/dpa.yaml
index 7b08fd2..7b45e3a 100644
--- a/dpa.yaml
+++ b/dpa.yaml
@@ -1,3 +1,5 @@
+---
+# DPA Configuration - optimized for SNO with Kopia
 apiVersion: oadp.openshift.io/v1alpha1
 kind: DataProtectionApplication
 metadata:
@@ -28,6 +30,14 @@ spec:
     nodeAgent:
       enable: true
       uploaderType: kopia
+      podConfig:
+        resourceAllocations:
+          limits:
+            cpu: "1"  # Increased for database compression
+            memory: "1Gi"  # Increased for larger chunks
+          requests:
+            cpu: "200m"
+            memory: "512Mi"
     velero:
       defaultPlugins:
         - openshift
diff --git a/stateful-backups.yaml b/stateful-backups.yaml
index 3347e77..e761168 100644
--- a/stateful-backups.yaml
+++ b/stateful-backups.yaml
@@ -1,14 +1,17 @@
+---
+# Schedule 2: Daily stateful backup (with volume data)
 apiVersion: velero.io/v1
 kind: Schedule
 metadata:
   name: daily-stateful
   namespace: openshift-adp
 spec:
-  schedule: "0 03 * * *"
-
+  schedule: "0 03 * * *"  # 3 AM daily (after config backup)
+
   nameTemplate: "{{ .ScheduleName }}-{{ .Timestamp }}"

   template:
+    # Only namespaces with persistent data
     includedNamespaces:
       - gitea
       - authentik
@@ -17,10 +20,81 @@ spec:
       - n8n
       - apim
       - gitea-ci
-
-
-    # No CSI snapshots; use nodeAgent/kopia only
+
+    metadata:
+      labels:
+        backup-type: stateful
+        schedule: daily
+        retention: short
+
+    # Exclude ephemeral resources
+    excludedResources:
+      - events
+      - events.events.k8s.io
+      - pipelineruns.tekton.dev
+      - taskruns.tekton.dev
+      - replicasets.apps
+      - pods
+
+    # Use Kopia for volume backups
     snapshotVolumes: false
     defaultVolumesToFsBackup: true
-
+
+    # 14 days retention
     ttl: 336h
+
+    # Pre-backup hooks for data consistency
+    hooks:
+      resources:
+        # Cassandra: flush memtables to disk before backup
+        - name: cassandra-flush
+          includedNamespaces:
+            - cassandra
+          labelSelector:
+            matchLabels:
+              app.kubernetes.io/name: cassandra
+          pre:
+            - exec:
+                container: cassandra
+                command:
+                  - /bin/bash
+                  - -c
+                  - nodetool flush
+                timeout: 5m
+                onError: Continue
+
+        # Gitea PostgreSQL: checkpoint before backup
+        - name: gitea-postgres-checkpoint
+          includedNamespaces:
+            - gitea
+          labelSelector:
+            matchLabels:
+              app.kubernetes.io/name: postgresql
+              app.kubernetes.io/instance: gitea
+          pre:
+            - exec:
+                container: postgresql
+                command:
+                  - /bin/bash
+                  - -c
+                  - psql -U postgres -c 'CHECKPOINT;'
+                timeout: 2m
+                onError: Continue
+
+        # Authentik PostgreSQL: checkpoint before backup
+        - name: authentik-postgres-checkpoint
+          includedNamespaces:
+            - authentik
+          labelSelector:
+            matchLabels:
+              app.kubernetes.io/name: postgresql
+              app.kubernetes.io/instance: authentik
+          pre:
+            - exec:
+                container: postgresql
+                command:
+                  - /bin/bash
+                  - -c
+                  - psql -U postgres -c 'CHECKPOINT;'
+                timeout: 2m
+                onError: Continue