From a79bf36e5340411ecb98a92f9724966fc58cc3c2 Mon Sep 17 00:00:00 2001
From: Viktor
Date: Sat, 25 May 2024 13:35:04 +0200
Subject: [PATCH] backup and cleanup on s3

---
 charts/dremio_v2/templates/backup.yaml  |  62 ++++++++
 charts/dremio_v2/templates/cleanup.yaml | 213 +++++++++++++++++++++++
 charts/dremio_v2/values.yaml            |   6 +
 3 files changed, 281 insertions(+)
 create mode 100644 charts/dremio_v2/templates/backup.yaml
 create mode 100644 charts/dremio_v2/templates/cleanup.yaml

Review notes (this section is ignored when the patch is applied):
 - The hunks below were revised during review, so the blob ids on the
   "index" lines still refer to the pre-review content.
 - The cleanup job still receives the S3 credentials as plain env values and
   the backup job still uses an unpinned kubectl image; consider a Secret
   reference and a pinned tag.

diff --git a/charts/dremio_v2/templates/backup.yaml b/charts/dremio_v2/templates/backup.yaml
new file mode 100644
index 00000000..1a7e0e3f
--- /dev/null
+++ b/charts/dremio_v2/templates/backup.yaml
@@ -0,0 +1,62 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: dremio-jobs-role
+  namespace: {{.Release.Namespace}}
+rules:
+  # Read-only access to pods is enough to resolve the exec target.
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+  # kubectl exec is authorized against the pods/exec subresource; the job
+  # never creates pods, so "create" is granted on the subresource only.
+  - apiGroups: [""]
+    resources: ["pods/exec"]
+    verbs: ["get", "create"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: dremio-jobs
+  namespace: {{.Release.Namespace}}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: dremio-jobs
+  namespace: {{.Release.Namespace}}
+subjects:
+  - kind: ServiceAccount
+    name: dremio-jobs
+    namespace: {{.Release.Namespace}}
+roleRef:
+  kind: Role
+  name: dremio-jobs-role
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: dremio-backup-s3
+  namespace: {{.Release.Namespace}}
+spec:
+  schedule: "0 */3 * * *"
+  failedJobsHistoryLimit: 1
+  successfulJobsHistoryLimit: 1
+  concurrencyPolicy: Forbid
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+            - name: backup
+              image: bitnami/kubectl:latest
+              imagePullPolicy: IfNotPresent
+              # The backup directory is taken from the same value the cleanup
+              # job scans, so both jobs always agree on the location.
+              command:
+                - /bin/sh
+                - -c
+                - kubectl exec dremio-master-0 --container dremio-master-coordinator -- /opt/dremio/bin/dremio-admin backup -l -d dremioS3:///{{ $.Values.distStorage.aws.bucketName }}/{{ $.Values.distStorage.aws.backup.path | trimAll "/" }}
+          restartPolicy: Never
+          serviceAccountName: dremio-jobs
diff --git a/charts/dremio_v2/templates/cleanup.yaml b/charts/dremio_v2/templates/cleanup.yaml
new file mode 100644
index 00000000..b8668c5d
--- /dev/null
+++ b/charts/dremio_v2/templates/cleanup.yaml
@@ -0,0 +1,213 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cleanup-backups
+  namespace: {{.Release.Namespace}}
+data:
+  cleanup.py: |
+    """Delete expired Dremio backups from an S3 bucket.
+
+    Configuration is read from the variables listed in REQUIRED_ENV_VARS.
+    """
+    import os
+    import sys
+    import boto3
+    import logging
+    from datetime import datetime, timedelta
+    from botocore.client import Config
+
+    # Configure logging
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger()
+
+    REQUIRED_ENV_VARS = (
+        'ENDPOINT',
+        'S3_ACCESS_KEY_ID',
+        'S3_SECRET_ACCESS_KEY',
+        'BUCKET_NAME',
+        'FOLDER_PREFIX',
+        'REGION_NAME',
+        'NUMBER_DAYS_MAX_TO_KEEP',
+        'NUMBER_BACKUPS_TO_KEEP',
+    )
+
+    def create_s3_client(endpoint, access_key, secret_key, region):
+        """Return an S3 client for the given endpoint, credentials and region."""
+        try:
+            session = boto3.session.Session()
+            return session.client(
+                's3',
+                endpoint_url=endpoint,
+                aws_access_key_id=access_key,
+                aws_secret_access_key=secret_key,
+                config=Config(signature_version='s3v4'),
+                region_name=region,
+            )
+        except Exception as e:
+            logger.error(f"Failed to create S3 client: {e}")
+            raise
+
+    def generate_s3_keys(bucket, s3_client, prefix='/', delimiter='/', start_after=''):
+        """Yield the key of every object in the bucket that starts with prefix."""
+        try:
+            s3_paginator = s3_client.get_paginator('list_objects_v2')
+            prefix = prefix.lstrip(delimiter)
+            start_after = start_after or prefix
+            for page in s3_paginator.paginate(Bucket=bucket, Prefix=prefix, StartAfter=start_after):
+                for content in page.get('Contents', []):
+                    yield content['Key']
+        except Exception as e:
+            logger.error(f"Failed to list objects: {e}")
+            raise
+
+    def backup_folder_name(key, prefix):
+        """Return the first path segment of key below prefix, or None if there is none."""
+        folder, separator, _ = key[len(prefix):].partition('/')
+        return folder if folder and separator else None
+
+    def backup_date(folder_name):
+        """Parse the YYYY-MM-DD field that is second to last when splitting on '_'.
+
+        Returns None when the folder name does not follow that layout.
+        """
+        try:
+            return datetime.strptime(folder_name.split('_')[-2], '%Y-%m-%d')
+        except (IndexError, ValueError):
+            return None
+
+    def select_expired_folders(folder_dates, threshold_date, number_backups_to_keep):
+        """Return the folders to delete.
+
+        The newest number_backups_to_keep folders are always retained; of the
+        remaining ones, only those dated before threshold_date are selected.
+        """
+        newest_first = sorted(folder_dates, key=lambda name: (folder_dates[name], name), reverse=True)
+        surplus = newest_first[number_backups_to_keep:]
+        return [name for name in surplus if folder_dates[name] < threshold_date]
+
+    def delete_backup(s3_client, bucket, folder, keys):
+        """Delete every object that belongs to one backup folder."""
+        for key in keys:
+            s3_client.delete_object(Bucket=bucket, Key=key)
+        logger.info(f"Deleted backup {folder} ({len(keys)} objects)")
+
+    def main():
+        """Run one cleanup pass and return the process exit code."""
+        # Check presence before converting, so an unset variable is reported
+        # instead of crashing in int(), and a legitimate value of 0 is accepted.
+        missing = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
+        if missing:
+            logger.error(f"Missing required environment variables: {', '.join(missing)}")
+            return 1
+
+        try:
+            number_days_max_to_keep = int(os.environ['NUMBER_DAYS_MAX_TO_KEEP'])
+            number_backups_to_keep = int(os.environ['NUMBER_BACKUPS_TO_KEEP'])
+        except ValueError as e:
+            logger.error(f"Retention settings must be integers: {e}")
+            return 1
+        if number_days_max_to_keep < 0 or number_backups_to_keep < 0:
+            logger.error("Retention settings must not be negative")
+            return 1
+
+        bucket_name = os.environ['BUCKET_NAME']
+        # Normalise to "a/b/" (or "" for the bucket root) so that folder names can
+        # be derived relative to the prefix, whatever its depth.
+        folder_prefix = os.environ['FOLDER_PREFIX'].strip('/')
+        if folder_prefix:
+            folder_prefix += '/'
+
+        # Calculate the threshold date
+        threshold_date = datetime.now() - timedelta(days=number_days_max_to_keep)
+
+        # Create S3 client
+        s3_client = create_s3_client(
+            os.environ['ENDPOINT'],
+            os.environ['S3_ACCESS_KEY_ID'],
+            os.environ['S3_SECRET_ACCESS_KEY'],
+            os.environ['REGION_NAME'],
+        )
+
+        # Group object keys by backup folder
+        keys_by_folder = {}
+        for key in generate_s3_keys(bucket_name, s3_client, prefix=folder_prefix):
+            folder = backup_folder_name(key, folder_prefix)
+            if folder is None:
+                logger.warning(f"Skipping object outside a backup folder: {key}")
+                continue
+            keys_by_folder.setdefault(folder, []).append(key)
+
+        folder_dates = {}
+        for folder in keys_by_folder:
+            created = backup_date(folder)
+            if created is None:
+                logger.warning(f"Skipping invalid folder name: {folder}")
+            else:
+                folder_dates[folder] = created
+
+        folder_count = len(folder_dates)
+        logger.info(f'Total number of backups in {folder_prefix}: {folder_count}')
+
+        expired = select_expired_folders(folder_dates, threshold_date, number_backups_to_keep)
+        if not expired:
+            logger.info(f'No backups in {folder_prefix} have been deleted')
+            return 0
+
+        for folder in expired:
+            delete_backup(s3_client, bucket_name, folder, keys_by_folder[folder])
+        return 0
+
+    if __name__ == "__main__":
+        sys.exit(main())
+
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: dremio-backup-cleanup
+  namespace: {{.Release.Namespace}}
+spec:
+  # Once a day at 22:00. The minute field has to be pinned: with "*" a run
+  # would be started every minute from 22:00 to 22:59.
+  schedule: "0 22 * * *"
+  failedJobsHistoryLimit: 1
+  successfulJobsHistoryLimit: 1
+  concurrencyPolicy: Forbid
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+            - name: cleanup-job
+              image: python:3.12-alpine
+              env:
+                - name: ENDPOINT
+                  value: "{{ $.Values.distStorage.aws.backup.host }}"
+                - name: S3_ACCESS_KEY_ID
+                  value: "{{ $.Values.distStorage.aws.credentials.accessKey }}"
+                - name: S3_SECRET_ACCESS_KEY
+                  value: "{{ $.Values.distStorage.aws.credentials.secret }}"
+                - name: BUCKET_NAME
+                  value: "{{ $.Values.distStorage.aws.bucketName }}"
+                - name: FOLDER_PREFIX
+                  value: "{{ $.Values.distStorage.aws.backup.path }}"
+                - name: REGION_NAME
+                  value: "{{ $.Values.distStorage.aws.backup.region }}"
+                - name: NUMBER_DAYS_MAX_TO_KEEP
+                  value: "{{ $.Values.distStorage.aws.backup.numberDaysMaxToKeep }}"
+                - name: NUMBER_BACKUPS_TO_KEEP
+                  value: "{{ $.Values.distStorage.aws.backup.numberBackupsToKeep }}"
+              command: ["/bin/sh", "-c"]
+              args:
+                [
+                  "pip install -q boto3 botocore && python /tmp/python/cleanup.py",
+                ]
+              volumeMounts:
+                - name: scripts
+                  mountPath: /tmp/python
+          restartPolicy: Never
+          volumes:
+            - name: scripts
+              configMap:
+                name: cleanup-backups
diff --git a/charts/dremio_v2/values.yaml b/charts/dremio_v2/values.yaml
index acb0a740..5cff5f7e 100644
--- a/charts/dremio_v2/values.yaml
+++ b/charts/dremio_v2/values.yaml
@@ -404,6 +404,12 @@ distStorage:
     bucketName: "AWS Bucket Name"
     path: "/"
     authentication: "metadata"
+    backup:
+      path: "backup/daily/"
+      host: "https://s3.amazonaws.com"
+      region: "eu-central-1"
+      numberDaysMaxToKeep: "30" # backups older than this many days are eligible for cleanup
+      numberBackupsToKeep: "56" # newest backups that are never deleted; 56 is a week of backups taken every 3 hours
     # If using accessKeySecret for authentication against S3, uncomment the lines below and use the values
     # to configure the appropriate credentials.
     #