Commit 6a56e53

rzvoncek, maburadeh, aburadeh and adejanovski authored

Allow specifying storage classes (#777)

Co-authored-by: aburadeh <[email protected]>
Co-authored-by: mohammad-aburadeh <[email protected]>
Co-authored-by: Alexander Dejanovski <[email protected]>
1 parent 3da2655 · commit 6a56e53

File tree: 14 files changed (+138 −16 lines)


docs/Configuration.md

Lines changed: 12 additions & 0 deletions

```diff
@@ -52,6 +52,18 @@ storage_provider = <Storage system used for backups>
 ; storage_provider should be either of "local", "google_storage" or "s3"
 region = <Region hosting the storage>
 
+; Storage class to use when uploading objects.
+; Use a value specific to chosen `storage_provider` that supports both reads and writes (eg S3's GLACIER and Azure's ARCHIVE won't work).
+; If not specified, we default to the 'hottest' class (STANDARD, STANDARD, HOT for GCP, AWS, AZURE respectively).
+; Supported values:
+; AWS S3: STANDARD | REDUCED_REDUNDANCY | STANDARD_IA | ONEZONE_IA | INTELLIGENT_TIERING
+; GCP: STANDARD | Unsupported | Unsupported | Unsupported
+; AZURE: HOT | COOL | COLD
+; https://aws.amazon.com/s3/storage-classes/
+; https://cloud.google.com/storage/docs/storage-classes
+; https://learn.microsoft.com/en-us/azure/storage/blobs/access-tiers-overview
+; storage_class = <Storage Class Name used to store backups>
+
 ; Name of the bucket used for storing backups
 bucket_name = cassandra_backups
 
```
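
To see the new option in context, here is a minimal sketch of how a `[storage]` section with `storage_class` could be read. The ini excerpt and the `read_storage_class` helper are illustrative only (they are not Medusa's actual config loader), but the `fallback=None` mirrors the documented behaviour: leaving the option out keeps the provider's 'hottest' class.

```python
import configparser

# Hypothetical excerpt of a medusa.ini; option names follow the docs above,
# but this is not a complete or official configuration.
EXAMPLE_INI = """
[storage]
storage_provider = s3_us_west_oregon
bucket_name = cassandra_backups
storage_class = STANDARD_IA
"""

def read_storage_class(ini_text):
    """Return the configured storage class, or None to keep the provider default."""
    parser = configparser.ConfigParser()
    parser.read_string(ini_text)
    # No storage_class line -> None -> 'hottest' class (STANDARD / STANDARD / HOT).
    return parser.get("storage", "storage_class", fallback=None)

print(read_storage_class(EXAMPLE_INI))  # STANDARD_IA
```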

medusa-example.ini

Lines changed: 12 additions & 0 deletions

```diff
@@ -60,6 +60,18 @@ storage_provider = <Storage system used for backups>
 ; storage_provider should be either of "local", "google_storage", "azure_blobs" or the s3_* values from
 ; https://github.com/apache/libcloud/blob/trunk/libcloud/storage/types.py
 
+; Storage class to use when uploading objects.
+; Use a value specific to chosen `storage_provider` that supports both reads and writes (eg S3's GLACIER and Azure's ARCHIVE won't work).
+; If not specified, we default to the 'hottest' class (STANDARD, STANDARD, HOT for GCP, AWS, AZURE respectively).
+; Supported values:
+; AWS S3: STANDARD | REDUCED_REDUNDANCY | STANDARD_IA | ONEZONE_IA | INTELLIGENT_TIERING
+; GCP: STANDARD | Unsupported | Unsupported | Unsupported
+; AZURE: HOT | COOL | COLD
+; https://aws.amazon.com/s3/storage-classes/
+; https://cloud.google.com/storage/docs/storage-classes
+; https://learn.microsoft.com/en-us/azure/storage/blobs/access-tiers-overview
+; storage_class = <Storage Class Name used to store backups>
+
 ; Name of the bucket used for storing backups
 bucket_name = cassandra_backups
 
```

medusa/config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -28,7 +28,7 @@
 
 StorageConfig = collections.namedtuple(
     'StorageConfig',
-    ['bucket_name', 'key_file', 'prefix', 'fqdn', 'host_file_separator', 'storage_provider',
+    ['bucket_name', 'key_file', 'prefix', 'fqdn', 'host_file_separator', 'storage_provider', 'storage_class',
     'base_path', 'max_backup_age', 'max_backup_count', 'api_profile', 'transfer_max_bandwidth',
     'concurrent_transfers', 'multi_part_upload_threshold', 'host', 'region', 'port', 'secure', 'ssl_verify',
     'aws_cli_path', 'kms_id', 'backup_grace_period_in_days', 'use_sudo_for_restore', 'k8s_mode', 'read_timeout']
```
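
Because `StorageConfig` is a plain namedtuple, every site that builds it must now supply `storage_class` as well. A trimmed-down sketch (not Medusa's real loader) of how the optional field can be filled from a parsed section, falling back to `None` when absent:

```python
import collections

# Stand-in with only a few of StorageConfig's fields, for illustration.
MiniStorageConfig = collections.namedtuple(
    'MiniStorageConfig', ['bucket_name', 'storage_provider', 'storage_class']
)

def build_storage_config(section):
    """A missing storage_class simply becomes None, i.e. the provider default."""
    return MiniStorageConfig(
        bucket_name=section['bucket_name'],
        storage_provider=section['storage_provider'],
        storage_class=section.get('storage_class'),
    )

cfg = build_storage_config({'bucket_name': 'cassandra_backups', 'storage_provider': 's3'})
assert cfg.storage_class is None
```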

medusa/storage/abstract_storage.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -35,7 +35,7 @@
 MAX_UP_DOWN_LOAD_RETRIES = 5
 
 
-AbstractBlob = collections.namedtuple('AbstractBlob', ['name', 'size', 'hash', 'last_modified'])
+AbstractBlob = collections.namedtuple('AbstractBlob', ['name', 'size', 'hash', 'last_modified', 'storage_class'])
 
 AbstractBlobMetadata = collections.namedtuple('AbstractBlobMetadata', ['name', 'sse_enabled', 'sse_key_id'])
 
@@ -442,6 +442,12 @@ def additional_upload_headers(self):
         """
         return {}
 
+    def get_storage_class(self):
+        if self.config.storage_class is not None:
+            return self.config.storage_class.upper()
+        else:
+            return None
+
     @staticmethod
     def human_readable_size(size, decimal_places=3):
         for unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
```
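
Two things change here: `AbstractBlob` gains a `storage_class` field that every backend must now populate (with `None` where the concept does not apply, as local storage does below), and `get_storage_class()` normalises the configured value to upper case. A self-contained sketch of that contract, using stand-ins rather than the real classes:

```python
import collections

AbstractBlob = collections.namedtuple(
    'AbstractBlob', ['name', 'size', 'hash', 'last_modified', 'storage_class']
)

class StorageSketch:
    """Stand-in for AbstractStorage; only the storage-class handling is shown."""

    def __init__(self, storage_class=None):
        self.storage_class = storage_class  # the real code reads self.config.storage_class

    def get_storage_class(self):
        # Normalise to upper case so provider code can compare or forward it safely.
        return self.storage_class.upper() if self.storage_class is not None else None

assert StorageSketch('standard_ia').get_storage_class() == 'STANDARD_IA'
assert StorageSketch().get_storage_class() is None

# Backends without storage classes (e.g. local storage) pass None explicitly.
blob = AbstractBlob('index/backup1/manifest.json', 1024, 'abc123', None, None)
assert blob.storage_class is None
```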

medusa/storage/azure_storage.py

Lines changed: 9 additions & 3 deletions

```diff
@@ -25,7 +25,7 @@
 
 from azure.core.credentials import AzureNamedKeyCredential
 from azure.storage.blob.aio import BlobServiceClient
-from azure.storage.blob import BlobProperties
+from azure.storage.blob import BlobProperties, StandardBlobTier
 from medusa.storage.abstract_storage import AbstractStorage, AbstractBlob, AbstractBlobMetadata, ObjectDoesNotExistError
 from pathlib import Path
 from retrying import retry
@@ -95,7 +95,8 @@ async def _list_blobs(self, prefix=None) -> t.List[AbstractBlob]:
                 b_props.name,
                 b_props.size,
                 self._get_blob_hash(b_props),
-                b_props.last_modified)
+                b_props.last_modified,
+                b_props.blob_tier)
             )
         return blobs
 
@@ -116,17 +117,20 @@ async def _upload_object(self, data: io.BytesIO, object_key: str, headers: t.Dic
                 self.config.bucket_name, object_key
             )
         )
+        storage_class = self.get_storage_class()
         blob_client = await self.azure_container_client.upload_blob(
             name=object_key,
             data=data,
             overwrite=True,
+            standard_blob_tier=StandardBlobTier(storage_class.capitalize()) if storage_class else None,
         )
         blob_properties = await blob_client.get_blob_properties()
         return AbstractBlob(
             blob_properties.name,
             blob_properties.size,
             self._get_blob_hash(blob_properties),
             blob_properties.last_modified,
+            blob_properties.blob_tier
         )
 
     @retry(stop_max_attempt_number=MAX_UP_DOWN_LOAD_RETRIES, wait_fixed=5000)
@@ -173,6 +177,7 @@ async def _stat_blob(self, object_key: str) -> AbstractBlob:
             blob_properties.size,
             self._get_blob_hash(blob_properties),
             blob_properties.last_modified,
+            blob_properties.blob_tier
         )
 
     @retry(stop_max_attempt_number=MAX_UP_DOWN_LOAD_RETRIES, wait_fixed=5000)
@@ -188,13 +193,14 @@ async def _upload_blob(self, src: str, dest: str) -> ManifestObject:
                 src, self.human_readable_size(file_size), self.config.bucket_name, object_key
             )
         )
-
+        storage_class = self.get_storage_class()
         with open(src, "rb") as data:
             blob_client = await self.azure_container_client.upload_blob(
                 name=object_key,
                 data=data,
                 overwrite=True,
                 max_concurrency=16,
+                standard_blob_tier=StandardBlobTier(storage_class.capitalize()) if storage_class else None,
             )
             blob_properties = await blob_client.get_blob_properties()
             mo = ManifestObject(
```
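
The Azure path has one subtlety: Medusa's setting is upper-cased (`HOT`, `COOL`, `COLD`), while the azure-storage-blob `StandardBlobTier` enum uses capitalised names (`Hot`, `Cool`, `Cold`), hence the `.capitalize()` before the enum is constructed; `None` leaves the account's default access tier in charge. A small sketch of just that translation, kept free of the Azure SDK so it runs anywhere:

```python
def to_blob_tier_name(storage_class):
    """Translate Medusa's upper-cased setting into the name Azure's enum expects."""
    # 'HOT'.capitalize() -> 'Hot'; None means: do not set a tier on upload.
    return storage_class.capitalize() if storage_class else None

assert to_blob_tier_name('HOT') == 'Hot'
assert to_blob_tier_name('COOL') == 'Cool'
assert to_blob_tier_name('COLD') == 'Cold'
assert to_blob_tier_name(None) is None
# In the diff above this value is wrapped as StandardBlobTier(...) and passed to
# upload_blob(..., standard_blob_tier=...).
```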

medusa/storage/google_storage.py

Lines changed: 15 additions & 3 deletions

```diff
@@ -82,7 +82,8 @@ async def _list_blobs(self, prefix=None) -> t.List[AbstractBlob]:
                 int(o['size']),
                 o['md5Hash'],
                 # datetime comes as a string like 2023-08-31T14:23:24.957Z
-                datetime.datetime.strptime(o['timeCreated'], '%Y-%m-%dT%H:%M:%S.%fZ')
+                datetime.datetime.strptime(o['timeCreated'], '%Y-%m-%dT%H:%M:%S.%fZ'),
+                o['storageClass']
             )
             async for o in objects
         ]
@@ -125,14 +126,20 @@ async def _upload_object(self, data: io.BytesIO, object_key: str, headers: t.Dic
                 self.config.bucket_name, object_key
             )
         )
+
+        storage_class = self.get_storage_class()
+        ex_header = {"storageClass": storage_class} if storage_class else {}
         resp = await self.gcs_storage.upload(
             bucket=self.bucket_name,
             object_name=object_key,
             file_data=data,
             force_resumable_upload=True,
             timeout=-1,
+            headers=ex_header,
+        )
+        return AbstractBlob(
+            resp['name'], int(resp['size']), resp['md5Hash'], resp['timeCreated'], storage_class.upper()
         )
-        return AbstractBlob(resp['name'], int(resp['size']), resp['md5Hash'], resp['timeCreated'])
 
     @retry(stop_max_attempt_number=MAX_UP_DOWN_LOAD_RETRIES, wait_fixed=5000)
     async def _download_blob(self, src: str, dest: str):
@@ -181,7 +188,8 @@ async def _stat_blob(self, object_key: str) -> AbstractBlob:
             int(blob['size']),
             blob['md5Hash'],
             # datetime comes as a string like 2023-08-31T14:23:24.957Z
-            datetime.datetime.strptime(blob['timeCreated'], '%Y-%m-%dT%H:%M:%S.%fZ')
+            datetime.datetime.strptime(blob['timeCreated'], '%Y-%m-%dT%H:%M:%S.%fZ'),
+            blob['storageClass']
         )
 
     @retry(stop_max_attempt_number=MAX_UP_DOWN_LOAD_RETRIES, wait_fixed=5000)
@@ -197,12 +205,16 @@ async def _upload_blob(self, src: str, dest: str) -> ManifestObject:
                     src, self.config.bucket_name, object_key
                 )
            )
+
+            storage_class = self.get_storage_class()
+            ex_header = {"storageClass": storage_class} if storage_class else {}
            resp = await self.gcs_storage.copy(
                bucket=self.bucket_name,
                object_name=f'{src}'.replace(f'gs://{self.bucket_name}/', ''),
                destination_bucket=self.bucket_name,
                new_name=object_key,
                timeout=-1,
+               headers=ex_header,
            )
            resp = resp['resource']
        else:
```
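
For Google Cloud Storage the class travels as an extra `storageClass` header on the upload/copy call, and the header is only added when something is configured, so the bucket's default class keeps applying otherwise. A tiny sketch of that header construction (the helper name is made up for illustration):

```python
def gcs_upload_headers(storage_class):
    """Extra headers for a GCS upload; an empty dict keeps the bucket's default class."""
    return {"storageClass": storage_class} if storage_class else {}

assert gcs_upload_headers("STANDARD") == {"storageClass": "STANDARD"}
assert gcs_upload_headers(None) == {}
```

As the integration test file further down notes, only STANDARD is exercised for GCS at the moment because the underlying library does not yet propagate these custom storage class headers.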

medusa/storage/local_storage.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -60,7 +60,8 @@ async def _list_blobs(self, prefix=None):
                 str(p.relative_to(self.root_dir)),
                 os.stat(self.root_dir / p).st_size,
                 self._md5(self.root_dir / p),
-                datetime.datetime.fromtimestamp(os.stat(self.root_dir / p).st_mtime)
+                datetime.datetime.fromtimestamp(os.stat(self.root_dir / p).st_mtime),
+                None
             )
             for p in paths if not p.is_dir()
         ]
@@ -92,7 +93,8 @@ async def _upload_object(self, data: io.BytesIO, object_key: str, headers: t.Dic
             object_key,
             os.stat(object_path).st_size,
             md5.hexdigest(),
-            datetime.datetime.fromtimestamp(os.stat(object_path).st_mtime)
+            datetime.datetime.fromtimestamp(os.stat(object_path).st_mtime),
+            None
         )
 
     async def _download_blob(self, src: str, dest: str):
@@ -160,7 +162,8 @@ async def _get_object(self, object_key: t.Union[Path, str]) -> AbstractBlob:
             str(object_key),
             os.stat(object_path).st_size,
             self._md5(object_path),
-            datetime.datetime.fromtimestamp(os.stat(object_path).st_mtime)
+            datetime.datetime.fromtimestamp(os.stat(object_path).st_mtime),
+            None
         )
 
     async def _read_blob_as_bytes(self, blob: AbstractBlob) -> bytes:
```

medusa/storage/s3_base_storage.py

Lines changed: 11 additions & 3 deletions

```diff
@@ -252,7 +252,7 @@ async def _list_blobs(self, prefix=None) -> t.List[AbstractBlob]:
 
         for o in response.get('Contents', []):
             obj_hash = o['ETag'].replace('"', '')
-            blobs.append(AbstractBlob(o['Key'], o['Size'], obj_hash, o['LastModified']))
+            blobs.append(AbstractBlob(o['Key'], o['Size'], obj_hash, o['LastModified'], o['StorageClass']))
 
         return blobs
 
@@ -264,6 +264,10 @@ async def _upload_object(self, data: io.BytesIO, object_key: str, headers: t.Dic
             kms_args['ServerSideEncryption'] = 'aws:kms'
             kms_args['SSEKMSKeyId'] = self.kms_id
 
+        storage_class = self.get_storage_class()
+        if storage_class is not None:
+            kms_args['StorageClass'] = storage_class
+
         logging.debug(
             '[S3 Storage] Uploading object from stream -> s3://{}/{}'.format(
                 self.bucket_name, object_key
@@ -326,7 +330,7 @@ async def _stat_blob(self, object_key: str) -> AbstractBlob:
         try:
             resp = self.s3_client.head_object(Bucket=self.bucket_name, Key=object_key)
             item_hash = resp['ETag'].replace('"', '')
-            return AbstractBlob(object_key, int(resp['ContentLength']), item_hash, resp['LastModified'])
+            return AbstractBlob(object_key, int(resp['ContentLength']), item_hash, resp['LastModified'], None)
         except ClientError as e:
             if e.response['Error']['Code'] == 'NoSuchKey' or e.response['Error']['Code'] == '404':
                 logging.debug("[S3 Storage] Object {} not found".format(object_key))
@@ -339,7 +343,7 @@ async def _stat_blob(self, object_key: str) -> AbstractBlob:
     def __stat_blob(self, key):
         resp = self.s3_client.head_object(Bucket=self.bucket_name, Key=key)
         item_hash = resp['ETag'].replace('"', '')
-        return AbstractBlob(key, int(resp['ContentLength']), item_hash, resp['LastModified'])
+        return AbstractBlob(key, int(resp['ContentLength']), item_hash, resp['LastModified'], None)
 
     @retry(stop_max_attempt_number=MAX_UP_DOWN_LOAD_RETRIES, wait_fixed=5000)
     async def _upload_blob(self, src: str, dest: str) -> ManifestObject:
@@ -353,6 +357,10 @@ async def _upload_blob(self, src: str, dest: str) -> ManifestObject:
             kms_args['ServerSideEncryption'] = 'aws:kms'
             kms_args['SSEKMSKeyId'] = self.kms_id
 
+        storage_class = self.get_storage_class()
+        if storage_class is not None:
+            kms_args['StorageClass'] = storage_class
+
         file_size = os.stat(src).st_size
         logging.debug(
             '[S3 Storage] Uploading {} ({}) -> {}'.format(
```
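
On S3 the class rides in the same keyword-argument dict that already carries the optional SSE-KMS settings, and boto3's `put_object` accepts it directly as `StorageClass`. A hedged, standalone sketch of such a call; the client setup, bucket name and key below are placeholders for illustration, not values from the commit:

```python
import io

import boto3  # assumes boto3 is installed and AWS credentials are configured

def build_upload_args(kms_id=None, storage_class=None):
    """Optional put_object arguments, mirroring the kms_args dict in the diff."""
    args = {}
    if kms_id is not None:
        args['ServerSideEncryption'] = 'aws:kms'
        args['SSEKMSKeyId'] = kms_id
    if storage_class is not None:
        args['StorageClass'] = storage_class  # e.g. STANDARD_IA, INTELLIGENT_TIERING
    return args

s3 = boto3.client('s3')
extra = build_upload_args(storage_class='STANDARD_IA')
s3.put_object(Bucket='my-example-bucket',        # placeholder bucket
              Key='medusa/example-object',       # placeholder key
              Body=io.BytesIO(b'example payload'),
              **extra)
```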

tests/integration/features/integration_tests.feature

Lines changed: 38 additions & 0 deletions

```diff
@@ -1146,3 +1146,41 @@ Feature: Integration tests
     Examples: Local storage
       | storage | client encryption      |
       | local   | with_client_encryption |
+
+  @32
+  Scenario Outline: Perform a differential backup with explicit storage class, then verify it
+    Given I have a fresh ccm cluster "<client encryption>" running named "scenario32"
+    Given I will use "<storage class>" as storage class in the storage
+    Given I am using "<storage>" as storage provider in ccm cluster "<client encryption>" with gRPC server
+    When I create the "test" table with secondary index in keyspace "medusa"
+    When I load 100 rows in the "medusa.test" table
+    When I run a "ccm node1 nodetool -- -Dcom.sun.jndi.rmiURLParsing=legacy flush" command
+    When I perform a backup in "differential" mode of the node named "first_backup" with md5 checks "disabled"
+    Then I can see the backup named "first_backup" when I list the backups
+    Then I can verify the backup named "first_backup" with md5 checks "disabled" successfully
+    Then I can see 2 SSTables with "<storage class>" in the SSTable pool for the "test" table in keyspace "medusa"
+
+    @s3
+    Examples: S3 storage
+      | storage           | client encryption         | storage class       |
+      | s3_us_west_oregon | without_client_encryption | STANDARD            |
+      | s3_us_west_oregon | without_client_encryption | REDUCED_REDUNDANCY  |
+      | s3_us_west_oregon | without_client_encryption | STANDARD_IA         |
+      | s3_us_west_oregon | without_client_encryption | ONEZONE_IA          |
+      | s3_us_west_oregon | without_client_encryption | INTELLIGENT_TIERING |
+
+    @gcs
+    Examples: Google Cloud Storage
+      | storage        | client encryption         | storage class |
+      | google_storage | without_client_encryption | STANDARD      |
+      # this is buggy for now, the library does not propagate the custom storage class headers
+      # | google_storage | without_client_encryption | NEARLINE |
+      # | google_storage | without_client_encryption | COLDLINE |
+      # | google_storage | without_client_encryption | ARCHIVE  |
+
+    @azure
+    Examples: Azure Blob Storage
+      | storage     | client encryption         | storage class |
+      | azure_blobs | without_client_encryption | HOT           |
+      | azure_blobs | without_client_encryption | COOL          |
+      | azure_blobs | without_client_encryption | COLD          |
```

tests/integration/features/steps/integration_steps.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -390,6 +390,11 @@ def _i_run_a_dse_command(context, command):
     ])
 
 
+@given(r'I will use "{storage_class}" as storage class in the storage')
+def i_will_use_storage_class(context, storage_class):
+    context.storage_class = storage_class
+
+
 @given(r'I am using "{storage_provider}" as storage provider in ccm cluster "{client_encryption}"')
 def i_am_using_storage_provider(context, storage_provider, client_encryption):
     context.storage_provider = storage_provider
@@ -531,6 +536,9 @@ def get_args(context, storage_provider, client_encryption, cassandra_url, use_mg
         stop_cmd = f'resources/dse/stop-dse.sh {context.dse_version}'
 
     storage_args = {"prefix": storage_prefix}
+    if hasattr(context, "storage_class"):
+        storage_args.update({"storage_class": context.storage_class})
+
     cassandra_args = {
         "is_ccm": str(is_ccm),
         "stop_cmd": stop_cmd,
@@ -1268,6 +1276,15 @@ def _the_backup_index_exists(context):
 )
 def _i_can_see_nb_sstables_in_the_sstable_pool(
     context, nb_sstables, table_name, keyspace
+):
+    _i_can_see_nb_sstables_with_storage_class_in_the_sstable_pool(context, nb_sstables, None, table_name, keyspace)
+
+
+# Then I can see 2 SSTables with "<storage class>" in the SSTable pool for the "test" table in keyspace "medusa"
+@then(r'I can see {nb_sstables} SSTables with "{storage_class}" in the SSTable pool '
+      r'for the "{table_name}" table in keyspace "{keyspace}"')
+def _i_can_see_nb_sstables_with_storage_class_in_the_sstable_pool(
+    context, nb_sstables, storage_class, table_name, keyspace
 ):
     with Storage(config=context.medusa_config.storage) as storage:
         path = os.path.join(
@@ -1280,6 +1297,11 @@ def _i_can_see_nb_sstables_in_the_sstable_pool(
             logging.error("Was expecting {} SSTables".format(nb_sstables))
         assert len(sstables) == int(nb_sstables)
 
+        if storage_class is not None:
+            for sstable in sstables:
+                logging.info(f'{storage_class.upper()} vs {sstable.storage_class.upper()}')
+                assert storage_class.upper() == sstable.storage_class.upper()
+
 
 @then(
     r'backup named "{backup_name}" has {nb_files} files '
```
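
The new step checks that every SSTable Medusa lists reports the expected class. Outside the behave suite, a quick way to double-check this on S3 is to list the uploaded keys and compare their `StorageClass`; note that object listings report `STANDARD` explicitly, whereas `head_object` generally omits the field for STANDARD objects, which is why the `_stat_blob` paths above store `None`. Bucket and prefix below are placeholders:

```python
import boto3  # assumes credentials and access to the test bucket

def assert_storage_class(bucket, prefix, expected):
    """Assert every object under the prefix carries the expected storage class."""
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            # ListObjectsV2 includes StorageClass for every object, STANDARD included.
            assert obj['StorageClass'].upper() == expected.upper(), obj['Key']

# Placeholders, not values from the commit:
# assert_storage_class('medusa-it-bucket', 'localhost/data/medusa/test', 'STANDARD_IA')
```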
