This comprehensive troubleshooting guide covers common issues with Neo4j backup and restore operations when using the Neo4j Kubernetes Operator.
The Neo4j Kubernetes Operator provides comprehensive backup and restore capabilities including:
- Automated backups with scheduling and retention policies
- Point-in-Time Recovery (PITR) for Neo4j 2025.x
- Multi-cloud storage support (S3, GCS, Azure Blob)
- Backup sidecars automatically added to all pods
- Automatic RBAC management for backup operations
kubectl get jobs -l app.kubernetes.io/component=backup
# STATUS: Failed or no jobs created

Diagnosis:
# Check backup resource status
kubectl get neo4jbackup
kubectl describe neo4jbackup production-backup
# Check operator logs for backup controller errors
kubectl logs -n neo4j-operator-system deployment/neo4j-operator-controller-manager | grep -i backup
# Verify RBAC permissions
kubectl auth can-i create jobs --as=system:serviceaccount:default:production-cluster-backup

Common Causes & Solutions:
-
Missing RBAC Permissions:
# The operator automatically creates RBAC - check if it exists
kubectl get serviceaccount production-cluster-backup
kubectl get role production-cluster-backup-role
kubectl get rolebinding production-cluster-backup-binding
# If missing, trigger operator reconciliation
kubectl annotate neo4jenterprisecluster production-cluster operator.neo4j.com/force-reconcile="$(date +%s)"
-
Storage Configuration Issues:
# Verify storage configuration in backup spec
spec:
  storage:
    s3:
      bucket: "valid-bucket-name"  # Must exist
      region: "us-west-2"          # Correct region
      # Credentials must be valid
-
Cluster Reference Problems:
# Verify cluster exists and is ready
kubectl get neo4jenterprisecluster production-cluster
kubectl get pods -l neo4j.com/cluster=production-cluster
Diagnosis:
# Check backup job logs
kubectl logs job/production-backup-$(date +%Y%m%d)-001
# Check backup sidecar logs
kubectl logs production-cluster-server-0 -c backup-sidecar
# Check Neo4j server logs for backup-related errors
kubectl logs production-cluster-server-0 -c neo4j | grep -i backup

Common Solutions:
-
Insufficient Disk Space:
# Check available storage
kubectl exec production-cluster-server-0 -c backup-sidecar -- df -h /backup-staging
# Solution: Increase backup sidecar storage or cleanup old backups
-
Database Lock Issues:
# Check for long-running transactions
kubectl exec production-cluster-server-0 -- cypher-shell -u neo4j -p password \
  "CALL db.listTransactions() YIELD transactionId, elapsedTimeMillis WHERE elapsedTimeMillis > 30000"
# Solution: Wait for transactions to complete or consider using secondary for backup
-
Memory Issues in Backup Process:
# Increase backup sidecar resources
spec:
  backups:
    sidecar:
      resources:
        requests:
          memory: "1Gi"  # Increase from default 512Mi
        limits:
          memory: "2Gi"  # Increase from default 1Gi
Authentication Issues:
# Check AWS credentials
kubectl exec production-cluster-server-0 -c backup-sidecar -- aws sts get-caller-identity
# Test S3 access
kubectl exec production-cluster-server-0 -c backup-sidecar -- aws s3 ls s3://your-backup-bucket/

Solutions:
-
IAM Role Issues:
# Use IAM roles for service accounts (IRSA)
spec:
  serviceAccount:
    name: production-cluster-backup
    annotations:
      eks.amazonaws.com/role-arn: "arn:aws:iam::123456789:role/Neo4jBackupRole"
-
Bucket Policy Problems:
{ "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Principal": { "AWS": "arn:aws:iam::123456789:role/Neo4jBackupRole" }, "Action": [ "s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:ListBucket" ], "Resource": [ "arn:aws:s3:::your-backup-bucket", "arn:aws:s3:::your-backup-bucket/*" ] } ] }
Service Account Problems:
# Check GCP credentials
kubectl exec production-cluster-server-0 -c backup-sidecar -- gcloud auth list
# Test GCS access
kubectl exec production-cluster-server-0 -c backup-sidecar -- gsutil ls gs://your-backup-bucket/

Solutions:
# Use Workload Identity
spec:
  serviceAccount:
    name: production-cluster-backup
    annotations:
      iam.gke.io/gcp-service-account: "neo4j-backup@project.iam.gserviceaccount.com"

Authentication Problems:
# Check Azure credentials
kubectl exec production-cluster-server-0 -c backup-sidecar -- az account show
# Test storage access
kubectl exec production-cluster-server-0 -c backup-sidecar -- az storage blob list --account-name storageaccount --container-name backups

Diagnosis:
# Check CronJob status
kubectl get cronjob
kubectl describe cronjob production-backup-schedule
# Check backup schedule configuration
kubectl get neo4jbackup production-backup -o yaml | grep -A 10 schedule

Common Solutions:
-
Invalid Cron Expression:
# Correct cron syntax
spec:
  schedule: "0 2 * * *"   # Daily at 2 AM
  # NOT: "0 2 * * * *"    # Invalid - too many fields
-
Timezone Issues:
spec: schedule: "0 2 * * *" timezone: "UTC" # Explicitly set timezone
-
Backup Window Conflicts:
# Check for overlapping backup jobs
kubectl get jobs -l app.kubernetes.io/component=backup --sort-by=.metadata.creationTimestamp
Diagnosis:
# Check restore resource status
kubectl get neo4jrestore
kubectl describe neo4jrestore production-restore
# Check operator logs
kubectl logs -n neo4j-operator-system deployment/neo4j-operator-controller-manager | grep -i restore

Common Solutions:
-
Invalid Backup Reference:
# Verify backup exists
kubectl get neo4jbackup production-backup
# Check backup completion status
kubectl get neo4jbackup production-backup -o jsonpath='{.status.phase}'
-
Target Cluster Issues:
# Ensure target cluster is ready
kubectl get neo4jenterprisecluster target-cluster
kubectl get pods -l neo4j.com/cluster=target-cluster
-
Storage Access Problems:
# Test access to backup storage location
kubectl exec target-cluster-server-0 -c backup-sidecar -- \
  aws s3 ls s3://backup-bucket/path/to/backup/
Diagnosis:
# Check restore job logs
kubectl logs job/production-restore-$(date +%Y%m%d)
# Check target cluster logs during restore
kubectl logs target-cluster-server-0 | grep -i restore

Common Solutions:
-
Insufficient Storage Space:
# Check available space on target cluster
kubectl exec target-cluster-server-0 -- df -h /data
# Solution: Increase PVC size before restore
-
Database Already Exists:
# Use force option to overwrite
spec:
  options:
    force: true
-
Version Incompatibility:
# Check Neo4j versions
kubectl exec source-cluster-server-0 -- neo4j version
kubectl exec target-cluster-server-0 -- neo4j version
Diagnosis:
# Check backup logs for transaction timestamps
kubectl logs job/production-backup-latest | grep -i "restore-until"
# Verify PITR capability
kubectl exec production-cluster-server-0 -- neo4j-admin database info system

Solutions:
-
Invalid Timestamp Format:
# Correct ISO 8601 format
spec:
  restoreUntil: "2025-01-15T14:30:00Z"
  # NOT: "2025-01-15 14:30:00"
-
Timestamp Outside Backup Range:
# Check backup time range
kubectl logs job/production-backup-20250115 | grep -E "(start|end).*time"
-
Neo4j Version Compatibility:
# PITR only available in Neo4j 2025.x
spec:
  image:
    repository: "neo4j"
    tag: "2025.01.0-enterprise"
Diagnosis:
# Check sidecar status
kubectl get pods -l neo4j.com/cluster=production-cluster -o wide
kubectl describe pod production-cluster-server-0
# Check sidecar logs
kubectl logs production-cluster-server-0 -c backup-sidecar

Common Solutions:
-
Resource Constraints:
# Increase sidecar resources
spec:
  backups:
    sidecar:
      resources:
        requests:
          memory: "512Mi"
          cpu: "200m"
        limits:
          memory: "1Gi"
          cpu: "500m"
-
Storage Mount Issues:
# Check volume mounts
kubectl describe pod production-cluster-server-0 | grep -A 10 "Mounts:"
-
Permission Problems:
# Check file permissions
kubectl exec production-cluster-server-0 -c backup-sidecar -- ls -la /backup-requests
kubectl exec production-cluster-server-0 -c backup-sidecar -- id
Diagnosis:
# Check backup request queue
kubectl exec production-cluster-server-0 -c backup-sidecar -- ls -la /backup-requests/
# Test manual backup request
kubectl exec production-cluster-server-0 -c backup-sidecar -- sh -c \
'echo "{\"path\":\"/data/backups/manual-test\",\"type\":\"FULL\"}" > /backup-requests/test.request'

Solutions:
-
Request Format Issues:
// Correct format
{
  "path": "/data/backups/test",
  "type": "FULL",
  "databases": ["neo4j", "system"]
}
-
Sidecar Communication Problems:
# Check shared volume
kubectl exec production-cluster-server-0 -c neo4j -- ls -la /backup-requests/
kubectl exec production-cluster-server-0 -c backup-sidecar -- ls -la /backup-requests/
Diagnosis:
# Monitor backup progress
kubectl logs job/production-backup-latest -f
# Check resource utilization during backup
kubectl top pod production-cluster-server-0

Optimization Strategies:
-
Use Secondary Servers for Backup:
spec: backupSource: "secondary" # Backup from secondary to reduce primary load
-
Parallel Backup Processing:
spec:
  backups:
    parallelism: 2  # Multiple backup jobs can run simultaneously
-
Storage Performance Tuning:
# Use high-performance storage for backup staging
spec:
  backups:
    sidecar:
      storage:
        className: "fast-ssd"
        size: "100Gi"
-
Network Optimization:
spec:
  config:
    # Increase buffer sizes for backup operations
    dbms.memory.off_heap.max_size: "2g"
    dbms.memory.pagecache.size: "4g"
Optimization:
-
Target Cluster Resources:
spec:
  resources:
    requests:
      memory: "8Gi"
      cpu: "4"
    limits:
      memory: "16Gi"
      cpu: "8"
-
Storage Configuration:
spec:
  storage:
    className: "fast-ssd"
    size: "1Ti"
Prometheus Metrics:
# Monitor backup success rate
neo4j_backup_success_total
neo4j_backup_failure_total
neo4j_backup_duration_seconds
# Alert rules
groups:
  - name: neo4j-backup
    rules:
      - alert: BackupFailure
        expr: increase(neo4j_backup_failure_total[24h]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Neo4j backup failed"
          description: "Backup for cluster {{ $labels.cluster }} failed"

Log Monitoring:
# Monitor backup logs
kubectl logs -f job/production-backup-latest | grep -E "(ERROR|WARN|SUCCESS)"
# Set up log alerts
kubectl logs -f -n neo4j-operator-system deployment/neo4j-operator-controller-manager | \
grep -i "backup.*failed" --line-buffered | \
while read line; do
echo "BACKUP ALERT: $line"
# Send to alerting system
done

Automated Validation Script:
#!/bin/bash
# Validate backup completeness
BACKUP_NAME="production-backup"
NAMESPACE="default"
validate_backup() {
  local backup_status=$(kubectl get neo4jbackup $BACKUP_NAME -n $NAMESPACE -o jsonpath='{.status.phase}')
  if [ "$backup_status" != "Succeeded" ]; then
    echo "❌ Backup failed or incomplete: $backup_status"
    return 1
  fi
  # Check backup size
  local backup_size=$(kubectl get neo4jbackup $BACKUP_NAME -n $NAMESPACE -o jsonpath='{.status.backupSize}')
  if [ "$backup_size" -lt 1000000 ]; then  # Less than 1MB
    echo "⚠️ Backup size suspiciously small: $backup_size bytes"
  fi
  echo "✅ Backup validation passed"
  return 0
}
# Run validation
validate_backup

Scenario: Primary database corrupted, need complete restore
# 1. Create new cluster for restoration
kubectl apply -f - <<EOF
apiVersion: neo4j.neo4j.com/v1alpha1
kind: Neo4jEnterpriseCluster
metadata:
  name: recovery-cluster
spec:
  topology:
    servers: 3
  # Use same configuration as original cluster
  storage:
    className: "fast-ssd"
    size: "1Ti"
EOF
# 2. Wait for cluster to be ready
kubectl wait --for=condition=Ready neo4jenterprisecluster/recovery-cluster --timeout=600s
# 3. Restore from latest backup
kubectl apply -f - <<EOF
apiVersion: neo4j.neo4j.com/v1alpha1
kind: Neo4jRestore
metadata:
  name: emergency-restore
spec:
  targetCluster: recovery-cluster
  source:
    backupName: production-backup-latest
  databaseName: neo4j
  force: true
EOF
# 4. Monitor restore progress
kubectl logs -f job/emergency-restore
# 5. Verify data integrity
kubectl exec recovery-cluster-server-0 -- cypher-shell -u neo4j -p password \
  "MATCH (n) RETURN count(n) as total_nodes"

# Restore to specific point before corruption
kubectl apply -f - <<EOF
apiVersion: neo4j.neo4j.com/v1alpha1
kind: Neo4jRestore
metadata:
  name: pitr-emergency-restore
spec:
  targetCluster: recovery-cluster
  source:
    backupName: production-backup-latest
  databaseName: neo4j
  options:
    restoreUntil: "2025-01-15T10:30:00Z"  # Before corruption occurred
    force: true
EOF

- Regular Testing: Test backup and restore procedures regularly
- Multiple Storage Locations: Store backups in multiple locations/regions
- Retention Policies: Implement appropriate retention policies
- Monitoring: Set up comprehensive backup monitoring and alerting
- Documentation: Document recovery procedures and test them
- Security: Encrypt backups and use secure storage access
- Validation: Always validate restored data integrity
- Staging Environment: Test restores in staging before production
- Downtime Planning: Plan for service interruption during restore
- Data Consistency: Ensure cluster consistency after restore
- Application Testing: Test applications after database restore
- Resource Allocation: Adequate resources for backup/restore operations
- Storage Performance: Use high-performance storage for operations
- Network Optimization: Optimize network for data transfer
- Scheduling: Schedule backups during low-activity periods
- Parallel Operations: Use parallelism where possible
For additional help, see: