coreos-assembler/src/cmd-coreos-prune at 1e8ac9111e450eddfc06c205d2572854a36d1f38 · jbtrystram/coreos-assembler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
#!/usr/bin/python3 -u

# This script parses a policy.yaml file, which outlines the specific
# pruning actions required for each stream and the age threshold for
# deleting artifacts within them.
# Example of policy.yaml
# rawhide:
#     # all cloud images
#     cloud-uploads: 2y
#     # artifacts in meta.json's `images` key
#     images: 2y
#     images-keep: [qemu, live-iso]
#     build: 3y
#     containers: 2w
# The script also records the pruning actions completed for each build
# under a `policy-cleanup` key in the local builds.json of the respective
# stream. Running the script again with the --upload-builds-json flag
# pushes that local builds.json to the S3 bucket.
# For example:
#     "builds": [
#         {
#             "id": "40.20240425.dev.1",
#             "arches": [
#                 "x86_64"
#             ],
#             "policy-cleanup": {
#                 "cloud-uploads": true,
#                 "images": true,
#                 "images-kept": ["qemu", "live-iso"]
#             }
#         }
#
# We should also prune unreferenced build directories here. See also
# `get_unreferenced_s3_builds()` in the git log

import argparse
import json
import subprocess
from urllib.parse import urlparse
import requests
import yaml
import collections
import datetime
import os
import boto3
import botocore
from dateutil.relativedelta import relativedelta
from cosalib.gcp import remove_gcp_image
from cosalib.aws import deregister_aws_resource
from cosalib.builds import BUILDFILES
from cosalib.s3 import s3_copy
from cosalib.cmdlib import parse_fcos_version_to_timestamp
from cosalib.cmdlib import convert_duration_to_days

# One (build, architecture) pair being processed: the build id, the supported
# cloud images extracted from its meta.json, the architecture name and the
# parsed meta.json itself.
Build = collections.namedtuple("Build", ["id", "images", "arch", "meta_json"])
# set metadata caching to 5m (value is in seconds; passed to s3_copy when
# uploading builds.json and its backup)
CACHE_MAX_AGE_METADATA = 60 * 5
# These lists are up to date as of schema hash
# 1711aa3ce997c644130139e1318d2683e2a5f11538e2b0528771a8658d62ee64. If changing
# this hash, ensure that the list of SUPPORTED and UNSUPPORTED artifacts below
# is up to date.
# meta.json keys whose cloud images this script collects for pruning.
SUPPORTED = ["amis", "gcp"]
# meta.json keys that make get_supported_images() raise, so the run stops
# rather than leaving those cloud resources behind.
UNSUPPORTED = ["aliyun", "azure", "ibmcloud", "powervs"]


def parse_args():
    """Parse the coreos-prune command line from ``sys.argv``.

    ``--policy`` and ``--stream`` are mandatory. The defaults for the GCP
    key, the AWS config file and the registry auth file are taken from the
    GCP_JSON_AUTH, AWS_CONFIG_FILE and REGISTRY_AUTH_FILE environment
    variables at call time.

    Returns:
        The populated ``argparse.Namespace``.
    """
    env = os.environ
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = [
        ("--policy", dict(required=True, type=str, help="Path to policy YAML file")),
        ("--dry-run", dict(help="Don't actually delete anything", action='store_true')),
        ("--upload-builds-json", dict(help="Push builds.json", action='store_true')),
        ("--stream", dict(type=str, help="CoreOS stream", required=True)),
        ("--gcp-json-key", dict(help="GCP Service Account JSON Auth", default=env.get("GCP_JSON_AUTH"))),
        ("--acl", dict(help="ACL for objects", action='store', default='private')),
        ("--aws-config-file", dict(default=env.get("AWS_CONFIG_FILE"), help="Path to AWS config file")),
        ("--registry-auth-file", dict(default=env.get("REGISTRY_AUTH_FILE"),
                                      help="Path to docker registry auth file. Directly passed to skopeo.")),
    ]

    cli = argparse.ArgumentParser(prog="coreos-assembler coreos-prune")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def main():
    # Parse arguments and initialize variables
    args = parse_args()
    with open(BUILDFILES['sourceurl'], "r") as file:
        builds_source_data_url = file.read()
    bucket, prefix = get_s3_bucket_and_prefix(builds_source_data_url)
    cloud_config = get_cloud_config(args)
    stream = args.stream
    today_date = datetime.datetime.now()

    # Boto3 loads credentials from ~/.aws/config by default and we can change
    # this default location by setting the AWS_CONFIG_FILE environment variable.
    # The Python bindings don't support passing a config file.
    # The alternative is to manually pass ACCESS_KEY and SECRET_KEY which isn't favourable.
    if args.aws_config_file:
        os.environ["AWS_CONFIG_FILE"] = args.aws_config_file
    s3_client = boto3.client("s3")

    # Upload builds.json to s3 bucket
    if args.upload_builds_json:
        # This copies the local builds.json and updates the S3 bucket version.
        return handle_upload_builds_json(s3_client, bucket, prefix, args.dry_run, args.acl)

    with open(args.policy, "r") as f:
        policy = yaml.safe_load(f)
    if stream in policy:
        validate_policy(stream, policy)
    else:
        print(f"There is no policy defined in gc-policy.yaml for {stream}")
        return

    with open(BUILDFILES['list'], "r") as f:
        builds_json_data = json.load(f)
    # Original list of builds
    builds = builds_json_data["builds"]
    builds_to_prune = set()
    images_to_keep = policy.get(stream, {}).get("images-keep", [])
    barrier_releases = set()
    # Get the update graph for stable streams
    if stream in ['stable', 'testing', 'next']:
        update_graph = get_update_graph(stream)['releases']
        # Keep only the barrier releases
        barrier_releases = set([release["version"] for release in update_graph if "barrier" in release])

    # Iterate through builds from oldest to newest
    for build in reversed(builds):
        build_id = build["id"]
        build_date = parse_fcos_version_to_timestamp(build_id)
        actions_completed = []
        print(f"Processing build {build_id}")

        # For each build, iterate over arches first to minimize downloads of meta.json per arch
        for arch in build["arches"]:
            print(f"\tProcessing {arch} for build {build_id}")
            meta_prefix = os.path.join(prefix, f"{build_id}/{arch}/meta.json")
            meta_json = get_json_from_s3(s3_client, bucket, meta_prefix)  # Download meta.json once per arch
            images = get_supported_images(meta_json)
            current_build = Build(id=build_id, images=images, arch=arch, meta_json=meta_json)

            # Iterate over actions (policy types) to apply pruning
            for action in ['cloud-uploads', 'images', 'containers', 'build']:
                if action not in policy[stream]:
                    continue
                action_duration = convert_duration_to_days(policy[stream][action])
                ref_date = today_date - relativedelta(days=int(action_duration))

                # Check if build date is beyond the reference date
                if build_date < ref_date:
                    previous_cleanup = build.get("policy-cleanup", {})

                    # Skip if the action has been handled previously for the build
                    if action in previous_cleanup:
                        print(f"\t\tBuild {build_id} has already had {action} pruning completed")
                        continue
                    # We have to handle the "images" action separately because the key in the previous
                    # cleanup is "images-kept" and not "images"
                    if action == 'images' and 'images-kept' in previous_cleanup:
                        # OK `images` has been pruned before, but we need to check
                        # that all the images were pruned that match the current policy.
                        # i.e. there may be additional images we need prune
                        if set(images_to_keep) == set(previous_cleanup.get("images-kept", [])):
                            print(f"\t\tBuild {build_id} has already had {action} pruning completed")
                            continue

                    # Pruning actions based on type
                    print(f"\t\t{arch} {action} for {build_id}")
                    match action:
                        case "cloud-uploads":
                            prune_cloud_uploads(current_build, cloud_config, args.dry_run)
                        # Prune through images that are not mentioned in images-keep
                        case "images":
                            prune_images(s3_client, current_build, images_to_keep, args.dry_run, bucket, prefix)
                        # Fully prune releases that are very old including deleting the directory in s3 for that build.
                        case "build":
                            # Since pruning a build prunes s3 for all architectures
                            # we'll prune later so that we do it only once and we
                            # make sure we've completed any architecture specific
                            # operations, (i.e. pruning aarch64 AMIs).
                            builds_to_prune.add(build_id)
                        case "containers":
                            # Our containers are manifest listed, which means deleting the container tag
                            # for one architecture deletes it for all of them. We'll choose to only prune
                            # for x86_64 since it is the one architecture that exists for all builds.
                            if arch == "x86_64":
                                if build_id in barrier_releases:
                                    # Since containers are used for updates we need to keep around containers for barrier releases.
                                    print(f"\t\t\tRelease {build_id} is a barrier release. Skipping container prune.")
                                    continue
                                # Retrieve container tags excluding the stream name since it updates with each release.
                                containers = get_container_tags(meta_json, exclude=[stream])
                                for (container_repo, container_tags) in containers:
                                    if container_tags:
                                        for tag in container_tags:
                                            prune_container(tag, args.dry_run, container_repo, args.registry_auth_file)
                                    else:
                                        print(f"\t\t\tNo container tags to prune for build {build_id}.")
                    actions_completed.append(action)  # Append action to completed list
        # Only add policy-cleanup for the build in builds.json if any
        # of the cleanup actions were completed.
        if actions_completed:
            policy_cleanup = build.setdefault("policy-cleanup", {})
            # Update policy-cleanup for completed actions
            for action in actions_completed:
                if action == "images":
                    policy_cleanup["images-kept"] = images_to_keep
                else:
                    policy_cleanup[action] = True

    if builds_to_prune:
        for build_id in builds_to_prune:
            prune_build(s3_client, bucket, prefix, build_id, args.dry_run)
        if "tombstone-builds" not in builds_json_data:
            builds_json_data["tombstone-builds"] = []
        # Separate the builds into remaining builds and tombstone builds
        remaining_builds = [build for build in builds if build["id"] not in builds_to_prune]
        tombstone_builds = [build for build in builds if build["id"] in builds_to_prune]
        # Update the data structure
        builds_json_data["builds"] = remaining_builds
        builds_json_data["tombstone-builds"].extend(tombstone_builds)

    # Save the updated builds.json to local builds/builds.json
    save_builds_json(builds_json_data, BUILDFILES['list'])


def get_s3_bucket_and_prefix(builds_source_data_url):
    """Split an ``s3://bucket/prefix`` URL into its bucket and key prefix.

    Args:
        builds_source_data_url: URL of the builds location. Surrounding
            whitespace (such as the trailing newline of a file the URL was
            read from) is ignored.

    Returns:
        A ``(bucket, prefix)`` tuple; the prefix has no leading ``/``.

    Raises:
        Exception: if the URL scheme is not ``s3``.
    """
    # The caller reads the URL verbatim from a file; strip it so stray
    # whitespace can neither leak into the prefix nor hide the scheme.
    parsed_url = urlparse(builds_source_data_url.strip())
    if parsed_url.scheme != "s3":
        raise Exception("Invalid scheme: only s3:// supported")
    return parsed_url.netloc, parsed_url.path.lstrip("/")


def get_cloud_config(args):
    """Collect the per-cloud credential settings from the parsed arguments.

    Returns:
        A dict with a ``gcp`` entry holding the ``json-key`` value and an
        ``aws`` entry holding the ``credentials`` value, taken from
        ``args.gcp_json_key`` and ``args.aws_config_file`` respectively.
    """
    gcp_settings = {"json-key": args.gcp_json_key}
    aws_settings = {"credentials": args.aws_config_file}
    return {"gcp": gcp_settings, "aws": aws_settings}


def validate_policy(stream, policy):
    """Check that the ``build`` rule of a stream is consistent with ``cloud-uploads``.

    When the stream's policy prunes whole builds, it must also prune
    cloud-uploads, and must do so no later than the builds themselves.
    Policies without a ``build`` key are accepted as-is.

    Raises:
        Exception: if ``build`` is set without ``cloud-uploads``, or if the
            cloud-uploads duration is longer than the build duration.
    """
    actions = policy[stream]
    # Nothing to cross-check unless whole builds get pruned.
    if "build" not in actions:
        return

    if 'cloud-uploads' not in actions:
        raise Exception("Pruning for cloud-uploads must be set before we prune the builds")

    uploads_days = convert_duration_to_days(actions["cloud-uploads"])
    build_days = convert_duration_to_days(actions["build"])
    if uploads_days > build_days:
        raise Exception("Duration of pruning cloud-uploads must be less than or equal to pruning a build")


def get_supported_images(meta_json):
    """Extract the cloud-image entries of meta.json that can be pruned.

    Returns:
        A dict holding the entries of ``meta_json`` whose key is listed in
        SUPPORTED.

    Raises:
        Exception: if ``meta_json`` contains a key listed in UNSUPPORTED.
    """
    # Refuse to continue as soon as an unsupported platform shows up.
    for platform in meta_json:
        if platform in UNSUPPORTED:
            raise Exception(f"The platform {platform} is not supported")
    return {
        platform: meta_json[platform]
        for platform in meta_json
        if platform in SUPPORTED
    }


def get_json_from_s3(s3, bucket, key):
    """Download an object from S3 and parse it as JSON.

    Args:
        s3: client object providing ``get_object(Bucket=..., Key=...)``.
        bucket: bucket name.
        key: object key.

    Returns:
        The decoded JSON document.

    Raises:
        Exception: wrapping any failure while fetching, decoding or parsing.
    """
    try:
        body = s3.get_object(Bucket=bucket, Key=key)["Body"]
        text = body.read().decode("utf-8")
        return json.loads(text)
    except Exception as err:
        raise Exception(f"Error fetching the JSON file from S3 {bucket}/{key}: {err}")


def save_builds_json(builds_json_data, location):
    """Stamp ``builds_json_data`` with the current UTC time and write it out.

    The ``timestamp`` key of the passed dict is overwritten in place, then
    the dict is written to ``location`` as JSON indented by four spaces.
    """
    stamp = datetime.datetime.now(tz=datetime.timezone.utc)
    builds_json_data["timestamp"] = stamp.strftime("%Y-%m-%dT%H:%M:%SZ")
    with open(location, "w") as out:
        json.dump(builds_json_data, out, indent=4)


def handle_upload_builds_json(s3_client, bucket, prefix, dry_run, acl):
    """Upload the local builds.json to S3, merging concurrent remote changes.

    The function compares three copies of builds.json: the remote one in S3,
    the source copy saved when the builds were last downloaded
    (``BUILDFILES['sourcedata']``) and the current local one
    (``BUILDFILES['list']``). If the local ``builds`` list is unchanged
    nothing is uploaded. If the remote copy changed in the meantime, the
    local pruning results are merged into it with update_policy_cleanup()
    and the merged result becomes the local builds.json. Before the upload a
    backup of the source copy is stored as ``builds.json.bak``.

    Args:
        s3_client: S3 client used for the download and handed to s3_copy.
        bucket: bucket name.
        prefix: key prefix under which builds.json lives.
        dry_run: forwarded to s3_copy.
        acl: ACL forwarded to s3_copy.

    Returns:
        None when there is nothing to upload, otherwise the return value of
        the final s3_copy call.

    Raises:
        Exception: if the number of builds plus tombstone builds differs
            between the source copy and the file about to be uploaded.
    """
    # Build the key once so the download and both uploads address the same
    # location, including when the prefix is empty or ends with a slash.
    builds_json_key = os.path.join(prefix, "builds.json")
    remote_builds_json = get_json_from_s3(s3_client, bucket, builds_json_key)
    # This is the copy of builds.json from what we last downloaded from the source
    with open(BUILDFILES['sourcedata'], "r") as f:
        builds_json_source_data = json.load(f)
    # This is the current list of builds at builds/builds.json
    with open(BUILDFILES['list'], "r") as f:
        current_builds_json = json.load(f)

    # If there are no changes to the local builds/builds.json we won't need to upload
    # anything to the s3 bucket. Will return in this scenario.
    if builds_json_source_data.get('builds') == current_builds_json.get('builds'):
        print("There are no changes to the local builds/builds.json. No upload needed")
        return

    # Check if there are any changes that were made to remote(s3 version) builds.json
    # while the pruning was in progress
    if remote_builds_json != builds_json_source_data:
        # Before we merge the changes, let's update the local tmp/builds-source.json with the latest remote_builds_json
        save_builds_json(remote_builds_json, BUILDFILES['sourcedata'])
        print("Detected remote updates to builds.json. Merging it to the local builds.json file")
        remote_builds_json = update_policy_cleanup(current_builds_json, remote_builds_json)
        # Make sure we have the merged json as local builds/builds.json
        save_builds_json(remote_builds_json, BUILDFILES['list'])

    # Print the updated builds.json before the s3 update
    with open(BUILDFILES['list'], 'r') as file:
        updated_builds_json = json.load(file)
    print("----")
    print(json.dumps(updated_builds_json, indent=4))
    print("----")

    with open(BUILDFILES['sourcedata'], 'r') as file:
        builds_json_source_data = json.load(file)
    # Make sure the size of the builds+tombstone-builds array is the same in the original and our modified builds.json
    source_builds_count = len(builds_json_source_data.get('builds', [])) + len(builds_json_source_data.get('tombstone-builds', []))
    updated_builds_count = len(updated_builds_json.get('builds', [])) + len(updated_builds_json.get('tombstone-builds', []))
    # Explicit check instead of `assert`: this guards a destructive upload and
    # must not disappear when Python runs with optimizations enabled.
    if source_builds_count != updated_builds_count:
        raise Exception(
            "Refusing to upload builds.json: the source copy lists "
            f"{source_builds_count} builds (including tombstones) but the updated copy lists {updated_builds_count}"
        )

    # Before uploading builds.json, copy the updated tmp/builds-source.json as builds.json.bak as a backup
    s3_copy(s3_client, BUILDFILES['sourcedata'], bucket, f'{builds_json_key}.bak', CACHE_MAX_AGE_METADATA, acl, extra_args={}, dry_run=dry_run)

    # Upload the local builds.json to s3
    return s3_copy(s3_client, BUILDFILES['list'], bucket, builds_json_key, CACHE_MAX_AGE_METADATA, acl, extra_args={}, dry_run=dry_run)


def update_policy_cleanup(current_builds, remote_builds):
    """Merge local pruning results into a newer remote builds.json.

    For every build listed in ``remote_builds['builds']``:

    * if the local copy moved it to ``tombstone-builds`` (its build
      directory was pruned), it is removed from the remote ``builds`` list
      and the local tombstone entry is appended to the remote
      ``tombstone-builds`` list, so the merged file does not keep listing a
      build that no longer exists;
    * otherwise, if the local copy of the build carries a ``policy-cleanup``
      key, that value is copied onto the remote build.

    Builds known only to the remote copy are left untouched.

    Args:
        current_builds: the local builds.json content.
        remote_builds: the remote builds.json content; modified in place.

    Returns:
        ``remote_builds``, updated.
    """
    local_builds = {build['id']: build for build in current_builds.get('builds', [])}
    local_tombstones = {build['id']: build for build in current_builds.get('tombstone-builds', [])}
    already_tombstoned = {build['id'] for build in remote_builds.get('tombstone-builds', [])}

    merged_builds = []
    for remote_build in remote_builds['builds']:
        build_id = remote_build['id']
        if build_id in local_tombstones:
            # Pruned locally: carry the tombstone over instead of keeping the
            # build in the active list.
            if build_id not in already_tombstoned:
                remote_builds.setdefault('tombstone-builds', []).append(local_tombstones[build_id])
                already_tombstoned.add(build_id)
            continue
        local_build = local_builds.get(build_id)
        if local_build is not None and 'policy-cleanup' in local_build:
            remote_build['policy-cleanup'] = local_build['policy-cleanup']
        merged_builds.append(remote_build)

    remote_builds['builds'] = merged_builds
    return remote_builds


def prune_cloud_uploads(build, cloud_config, dry_run):
    """Remove the AWS AMIs and the GCP image that belong to ``build``.

    Both clouds are always attempted; the errors they report are collected
    and printed together.

    Raises:
        Exception: if either cloud reported at least one error.
    """
    failures = [
        *deregister_aws_amis(build, cloud_config, dry_run),
        *delete_gcp_image(build, cloud_config, dry_run),
    ]
    if not failures:
        return

    print(f"\t\t\tFound errors when removing cloud-uploads for {build.id}:")
    for failure in failures:
        print(failure)
    raise Exception("Some errors were encountered")


def deregister_aws_amis(build, cloud_config, dry_run):
    """Deregister every AMI (and its snapshot) recorded for ``build``.

    Args:
        build: object exposing ``id``, ``arch`` and an ``images`` dict.
        cloud_config: dict whose ``aws`` entry may hold a ``credentials`` value.
        dry_run: when true, only print what would be deleted.

    Returns:
        A list of the problems encountered: exceptions raised by the
        deregistration call and messages about entries lacking required
        fields. Empty when everything went fine.
    """
    credentials_file = cloud_config.get("aws", {}).get("credentials")
    ami_entries = build.images.get("amis")
    if not ami_entries:
        print(f"\t\t\tNo AMI/Snapshot to prune for {build.id} for {build.arch}")
        return []

    problems = []
    for entry in ami_entries:
        region = entry.get("name")
        image_id = entry.get("hvm")
        # If we are dealing with an old manifest where the snapshot ID isn't stored
        # then let's instruct ore to detect the snapshot ID from the AMI.
        snapshot = entry.get("snapshot", "detectFromAMI")

        if dry_run:
            print(f"\t\t\tWould delete {image_id} and {snapshot} for {build.id}")
            continue

        # A region plus at least one of AMI id / snapshot id is required.
        if not region or not (image_id or snapshot):
            problems.append(f"Missing parameters to remove {image_id} and {snapshot}")
            continue

        try:
            deregister_aws_resource(image_id, snapshot, region=region, credentials_file=credentials_file)
        except Exception as err:
            problems.append(err)
    return problems


def delete_gcp_image(build, cloud_config, dry_run):
    """Delete the GCP image recorded for ``build``, if there is one.

    Args:
        build: object exposing ``id``, ``arch`` and an ``images`` dict.
        cloud_config: dict whose ``gcp`` entry may hold a ``json-key`` value.
        dry_run: when true, only print what would be deleted.

    Returns:
        A list with at most one problem (an exception raised by the removal
        call, or a message about missing parameters); empty on success.
    """
    gcp_meta = build.images.get("gcp")
    if not gcp_meta:
        print(f"\t\t\tNo GCP image to prune for {build.id} for {build.arch}")
        return []

    image_name = gcp_meta.get("image")
    # Fall back to this project name when the metadata does not name one.
    project = gcp_meta.get("project") or "fedora-coreos-cloud"
    key_file = cloud_config.get("gcp", {}).get("json-key")

    if dry_run:
        print(f"\t\t\tWould delete {image_name} GCP image for {build.id}")
        return []

    if not (image_name and key_file and project):
        return [f"Missing parameters to remove {image_name}"]

    try:
        remove_gcp_image(image_name, key_file, project)
    except Exception as err:
        return [err]
    return []


def prune_images(s3, build, images_to_keep, dry_run, bucket, prefix):
    """Delete from S3 the image artifacts of ``build`` that are not kept.

    The artifacts are those listed under the ``images`` key of the build's
    meta.json; every one whose name is not in ``images_to_keep`` is deleted
    from ``<prefix>/<build id>/<arch>/<path>``.

    Args:
        s3: client object providing ``delete_object(Bucket=..., Key=...)``.
        build: object exposing ``id``, ``arch`` and a ``meta_json`` dict.
        images_to_keep: names of the images that must not be deleted.
        dry_run: when true, only print what would be deleted.
        bucket: bucket name.
        prefix: key prefix of the builds directory.

    Raises:
        Exception: if at least one deletion failed with a client error.
    """
    # Default to an empty mapping: a meta.json without an `images` key (or
    # with a null one) simply has nothing to prune.
    images_from_meta_json = build.meta_json.get("images") or {}
    errors = []

    for name, data in images_from_meta_json.items():
        if name in images_to_keep:
            continue
        path = data.get("path")
        if not path:
            # Without a path there is no object key to delete.
            print(f"\t\t\tNo path recorded for {name} image for {build.id} for {build.arch}. Skipping.")
            continue
        image_prefix = os.path.join(prefix, f"{build.id}/{build.arch}/{path}")
        if dry_run:
            print(f"\t\t\tWould prune {bucket}/{image_prefix}")
            continue
        try:
            s3.delete_object(Bucket=bucket, Key=image_prefix)
            print(f"\t\t\tPruned {name} image for {build.id} for {build.arch}")
        except botocore.exceptions.ClientError as e:
            # Note that even if the object doesn't exist the delete_object()
            # will still return Success (`204 Success (No Content)`) so we
            # don't need to handle that error case here.
            # https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html#API_DeleteObject_RequestSyntax
            errors.append(e)
    if errors:
        print(f"\t\t\tFound errors when pruning images for {build.id}:")
        for e in errors:
            print(e)
        raise Exception("Some errors were encountered")


def prune_build(s3_client, bucket, prefix, build_id, dry_run):
    """Delete every S3 object stored under the directory of ``build_id``.

    Objects are listed page by page and each page is deleted with one batch
    request, so builds with more objects than a single listing returns are
    removed completely.

    Args:
        s3_client: client object providing ``list_objects_v2`` and
            ``delete_objects``.
        bucket: bucket name.
        prefix: key prefix of the builds directory.
        build_id: id of the build whose directory is removed.
        dry_run: when true, only print what would be deleted.

    Raises:
        Exception: if S3 reports a client error other than ``NoSuchKey``, or
            if the batch deletion reports objects it could not delete.
    """
    build_prefix = os.path.join(prefix, f"{build_id}/")
    if dry_run:
        print(f"\t\t\tWould delete all resources in {bucket}/{build_prefix}.")
        return

    failed = []
    deleted_any = False
    try:
        list_kwargs = {"Bucket": bucket, "Prefix": build_prefix}
        while True:
            # A single listing is capped in size; follow the continuation
            # token until the listing is no longer truncated.
            page = s3_client.list_objects_v2(**list_kwargs)
            # Extract the object keys and format them for deletion
            delete_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
            if delete_keys:
                response = s3_client.delete_objects(Bucket=bucket, Delete={'Objects': delete_keys})
                # The batch call can succeed overall while individual keys fail.
                failed.extend((response or {}).get('Errors') or [])
                deleted_any = True
            if not page.get('IsTruncated'):
                break
            list_kwargs['ContinuationToken'] = page['NextContinuationToken']
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            print(f"\t\t\t{bucket}/{build_prefix} already pruned or doesn't exist.")
            return
        raise Exception(f"Error pruning {build_id}: {e.response['Error']['Message']}")

    if failed:
        first = failed[0]
        raise Exception(
            f"Error pruning {build_id}: {len(failed)} object(s) could not be deleted, "
            f"e.g. {first.get('Key')}: {first.get('Message')}"
        )
    if deleted_any:
        print(f"\t\t\tPruned {build_id} completely from {bucket}/{build_prefix}.")
    else:
        print(f"\t\t\tNo objects found to delete in {bucket}/{build_prefix}.")


def get_container_tags(meta_json, exclude):
    """List the container repositories of a build with their prunable tags.

    Args:
        meta_json: parsed meta.json of the build.
        exclude: tags that must be left out of the result.

    Returns:
        A list of ``(image, tags)`` tuples: first the ``base-oscontainer``
        entry itself, then each of its ``additional-images``. A missing
        ``image`` becomes ``""`` and missing ``tags`` become ``[]``. Empty
        when meta.json has no (or an empty) ``base-oscontainer``.
    """
    oscontainer = meta_json.get("base-oscontainer")
    if not oscontainer:
        return []

    # The primary image comes first, followed by any additional images.
    sources = [oscontainer, *oscontainer.get("additional-images", [])]
    return [
        (source.get("image", ""), [tag for tag in source.get("tags", []) if tag not in exclude])
        for source in sources
    ]


def prune_container(tag, dry_run, container_repo, registry_auth_file):
    """Delete one container tag from its repository, or report it on a dry run."""
    if not dry_run:
        skopeo_delete(container_repo, tag, registry_auth_file)
        return
    print(f"\t\t\tWould prune image {container_repo}:{tag}")


def get_update_graph(stream):
    """Download and return the parsed update graph JSON of ``stream``.

    Raises:
        Exception: if the server answers with anything but HTTP 200.
    """
    response = requests.get(
        f"https://builds.coreos.fedoraproject.org/updates/{stream}.json",
        timeout=5,
    )
    if response.status_code == 200:
        return response.json()
    raise Exception(f"Could not download update graph for {stream}. HTTP {response.status_code}")


def skopeo_inspect(repo, tag, auth):
    """Check with ``skopeo inspect`` whether ``repo:tag`` exists.

    Args:
        repo: container repository.
        tag: tag to look up.
        auth: optional auth file passed to skopeo via ``--authfile``.

    Returns:
        True when the inspection succeeds, False when skopeo exits with
        code 2 (treated here as "the tag does not exist").

    Raises:
        Exception: if skopeo fails with any other exit code.
    """
    command = ["skopeo", "inspect", "--no-tags", "--retry-times=10", f"docker://{repo}:{tag}"]
    if auth:
        command += ["--authfile", auth]

    try:
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as failure:
        output = failure.output.decode("utf-8")
        if failure.returncode != 2:
            # Handle other types of errors
            raise Exception(f"Inspection failed for {repo}:{tag} with exit code {failure.returncode}: {output}")
        # Exit code 2 indicates the image tag does not exist. We will consider it as pruned.
        print(f"\t\t\tSkipping deletion for {repo}:{tag} since the tag does not exist.")
        return False
    return True  # Inspection succeeded


def skopeo_delete(repo, image, auth):
    """Delete the tag ``image`` from ``repo`` with ``skopeo delete``.

    The tag is inspected first; if the inspection reports that it does not
    exist, nothing is deleted.

    Args:
        repo: container repository.
        image: tag to delete.
        auth: optional auth file passed to skopeo via ``--authfile``.

    Raises:
        Exception: if the inspection fails, or if the delete command fails
            even though the tag exists.
    """
    if not skopeo_inspect(repo, image, auth):  # Only proceed if inspection succeeds
        return

    skopeo_args = ["skopeo", "delete", f"docker://{repo}:{image}"]
    if auth:
        skopeo_args.extend(["--authfile", auth])
    try:
        subprocess.check_output(skopeo_args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # Throw an exception in case the delete command fails despite the
        # image existing. A single formatted message keeps the repo, tag and
        # skopeo output readable instead of rendering as a tuple.
        error_message = e.output.decode("utf-8")
        raise Exception(
            f"Deletion failed for {repo}:{image} with exit code {e.returncode}: {error_message}"
        ) from e
    print(f"\t\t\tImage {repo}:{image} deleted successfully.")


# Run the pruner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()