Skip to content

Commit 38575ab

Browse files
authored
[nexus] Decommission disks in reconfigurator, clean their DB state (#6059)
This PR adds the following step to the reconfigurator's execution process: for all disks that are marked **expunged**, mark them **decommissioned**. This notably happens after the `deploy_disks` step of execution. This PR also adds a background task that looks for all disks that are **decommissioned** but still have **zpools**; it deletes these zpools (and their **datasets**) as long as no regions or region snapshots are referencing the contained datasets. Fixes #6051
1 parent e346fd1 commit 38575ab

File tree

21 files changed

+1013
-51
lines changed

21 files changed

+1013
-51
lines changed

dev-tools/omdb/tests/env.out

+15
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ task: "crdb_node_id_collector"
4747
Collects node IDs of running CockroachDB zones
4848

4949

50+
task: "decommissioned_disk_cleaner"
51+
deletes DB records for decommissioned disks, after regions and region
52+
snapshots have been replaced
53+
54+
5055
task: "dns_config_external"
5156
watches external DNS data stored in CockroachDB
5257

@@ -187,6 +192,11 @@ task: "crdb_node_id_collector"
187192
Collects node IDs of running CockroachDB zones
188193

189194

195+
task: "decommissioned_disk_cleaner"
196+
deletes DB records for decommissioned disks, after regions and region
197+
snapshots have been replaced
198+
199+
190200
task: "dns_config_external"
191201
watches external DNS data stored in CockroachDB
192202

@@ -314,6 +324,11 @@ task: "crdb_node_id_collector"
314324
Collects node IDs of running CockroachDB zones
315325

316326

327+
task: "decommissioned_disk_cleaner"
328+
deletes DB records for decommissioned disks, after regions and region
329+
snapshots have been replaced
330+
331+
317332
task: "dns_config_external"
318333
watches external DNS data stored in CockroachDB
319334

dev-tools/omdb/tests/successes.out

+12
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,11 @@ task: "crdb_node_id_collector"
248248
Collects node IDs of running CockroachDB zones
249249

250250

251+
task: "decommissioned_disk_cleaner"
252+
deletes DB records for decommissioned disks, after regions and region
253+
snapshots have been replaced
254+
255+
251256
task: "dns_config_external"
252257
watches external DNS data stored in CockroachDB
253258

@@ -453,6 +458,13 @@ task: "crdb_node_id_collector"
453458
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
454459
last completion reported error: no blueprint
455460

461+
task: "decommissioned_disk_cleaner"
462+
configured period: every 1m
463+
currently executing: no
464+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
465+
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
466+
warning: unknown background task: "decommissioned_disk_cleaner" (don't know how to interpret details: Object {"deleted": Number(0), "error": Null, "error_count": Number(0), "found": Number(0), "not_ready_to_be_deleted": Number(0)})
467+
456468
task: "external_endpoints"
457469
configured period: every 1m
458470
currently executing: no

nexus-config/src/nexus_config.rs

+23
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,8 @@ pub struct BackgroundTaskConfig {
361361
pub inventory: InventoryConfig,
362362
/// configuration for physical disk adoption tasks
363363
pub physical_disk_adoption: PhysicalDiskAdoptionConfig,
364+
/// configuration for decommissioned disk cleaner task
365+
pub decommissioned_disk_cleaner: DecommissionedDiskCleanerConfig,
364366
/// configuration for phantom disks task
365367
pub phantom_disks: PhantomDiskConfig,
366368
/// configuration for blueprint related tasks
@@ -444,6 +446,20 @@ pub struct PhysicalDiskAdoptionConfig {
444446
pub disable: bool,
445447
}
446448

449+
#[serde_as]
450+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
451+
pub struct DecommissionedDiskCleanerConfig {
452+
/// period (in seconds) for periodic activations of this background task
453+
#[serde_as(as = "DurationSeconds<u64>")]
454+
pub period_secs: Duration,
455+
456+
/// A toggle to disable automated disk cleanup
457+
///
458+
/// Default: Off
459+
#[serde(default)]
460+
pub disable: bool,
461+
}
462+
447463
#[serde_as]
448464
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
449465
pub struct NatCleanupConfig {
@@ -822,6 +838,7 @@ mod test {
822838
inventory.nkeep = 11
823839
inventory.disable = false
824840
physical_disk_adoption.period_secs = 30
841+
decommissioned_disk_cleaner.period_secs = 30
825842
phantom_disks.period_secs = 30
826843
blueprints.period_secs_load = 10
827844
blueprints.period_secs_execute = 60
@@ -947,6 +964,11 @@ mod test {
947964
period_secs: Duration::from_secs(30),
948965
disable: false,
949966
},
967+
decommissioned_disk_cleaner:
968+
DecommissionedDiskCleanerConfig {
969+
period_secs: Duration::from_secs(30),
970+
disable: false,
971+
},
950972
phantom_disks: PhantomDiskConfig {
951973
period_secs: Duration::from_secs(30),
952974
},
@@ -1049,6 +1071,7 @@ mod test {
10491071
inventory.nkeep = 3
10501072
inventory.disable = false
10511073
physical_disk_adoption.period_secs = 30
1074+
decommissioned_disk_cleaner.period_secs = 30
10521075
phantom_disks.period_secs = 30
10531076
blueprints.period_secs_load = 10
10541077
blueprints.period_secs_execute = 60

nexus/db-queries/src/db/datastore/physical_disk.rs

+34-40
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use omicron_common::api::external::LookupType;
3737
use omicron_common::api::external::ResourceType;
3838
use omicron_uuid_kinds::CollectionUuid;
3939
use omicron_uuid_kinds::GenericUuid;
40+
use omicron_uuid_kinds::PhysicalDiskUuid;
4041
use uuid::Uuid;
4142

4243
impl DataStore {
@@ -278,23 +279,36 @@ impl DataStore {
278279
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
279280
}
280281

282+
/// Decommissions all expunged disks.
283+
pub async fn physical_disk_decommission_all_expunged(
284+
&self,
285+
opctx: &OpContext,
286+
) -> Result<(), Error> {
287+
opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
288+
use db::schema::physical_disk::dsl;
289+
290+
let conn = &*self.pool_connection_authorized(&opctx).await?;
291+
diesel::update(dsl::physical_disk)
292+
.filter(dsl::time_deleted.is_null())
293+
.physical_disk_filter(DiskFilter::ExpungedButActive)
294+
.set(dsl::disk_state.eq(PhysicalDiskState::Decommissioned))
295+
.execute_async(conn)
296+
.await
297+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
298+
Ok(())
299+
}
300+
281301
/// Deletes a disk from the database.
282302
pub async fn physical_disk_delete(
283303
&self,
284304
opctx: &OpContext,
285-
vendor: String,
286-
serial: String,
287-
model: String,
288-
sled_id: Uuid,
305+
id: PhysicalDiskUuid,
289306
) -> DeleteResult {
290307
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
291308
let now = Utc::now();
292309
use db::schema::physical_disk::dsl;
293310
diesel::update(dsl::physical_disk)
294-
.filter(dsl::vendor.eq(vendor))
295-
.filter(dsl::serial.eq(serial))
296-
.filter(dsl::model.eq(model))
297-
.filter(dsl::sled_id.eq(sled_id))
311+
.filter(dsl::id.eq(id.into_untyped_uuid()))
298312
.filter(dsl::time_deleted.is_null())
299313
.set(dsl::time_deleted.eq(now))
300314
.execute_async(&*self.pool_connection_authorized(opctx).await?)
@@ -451,8 +465,9 @@ mod test {
451465
let sled = create_test_sled(&datastore).await;
452466

453467
// Insert a disk
468+
let disk_id = PhysicalDiskUuid::new_v4();
454469
let disk = PhysicalDisk::new(
455-
Uuid::new_v4(),
470+
disk_id.into_untyped_uuid(),
456471
String::from("Oxide"),
457472
String::from("123"),
458473
String::from("FakeDisk"),
@@ -472,13 +487,7 @@ mod test {
472487

473488
// Delete the inserted disk
474489
datastore
475-
.physical_disk_delete(
476-
&opctx,
477-
disk.vendor.clone(),
478-
disk.serial.clone(),
479-
disk.model.clone(),
480-
disk.sled_id,
481-
)
490+
.physical_disk_delete(&opctx, disk_id)
482491
.await
483492
.expect("Failed to delete disk");
484493
let disks = datastore
@@ -489,13 +498,7 @@ mod test {
489498

490499
// Deleting again should not throw an error
491500
datastore
492-
.physical_disk_delete(
493-
&opctx,
494-
disk.vendor,
495-
disk.serial,
496-
disk.model,
497-
disk.sled_id,
498-
)
501+
.physical_disk_delete(&opctx, disk_id)
499502
.await
500503
.expect("Failed to delete disk");
501504

@@ -520,8 +523,9 @@ mod test {
520523
let sled_b = create_test_sled(&datastore).await;
521524

522525
// Insert a disk
526+
let disk_id = PhysicalDiskUuid::new_v4();
523527
let disk = PhysicalDisk::new(
524-
Uuid::new_v4(),
528+
disk_id.into_untyped_uuid(),
525529
String::from("Oxide"),
526530
String::from("123"),
527531
String::from("FakeDisk"),
@@ -546,13 +550,7 @@ mod test {
546550

547551
// Delete the inserted disk
548552
datastore
549-
.physical_disk_delete(
550-
&opctx,
551-
disk.vendor,
552-
disk.serial,
553-
disk.model,
554-
disk.sled_id,
555-
)
553+
.physical_disk_delete(&opctx, disk_id)
556554
.await
557555
.expect("Failed to delete disk");
558556
let disks = datastore
@@ -567,8 +565,9 @@ mod test {
567565
assert!(disks.is_empty());
568566

569567
// Attach the disk to the second sled
568+
let disk_id = PhysicalDiskUuid::new_v4();
570569
let disk = PhysicalDisk::new(
571-
Uuid::new_v4(),
570+
disk_id.into_untyped_uuid(),
572571
String::from("Oxide"),
573572
String::from("123"),
574573
String::from("FakeDisk"),
@@ -613,8 +612,9 @@ mod test {
613612
let sled_b = create_test_sled(&datastore).await;
614613

615614
// Insert a disk
615+
let disk_id = PhysicalDiskUuid::new_v4();
616616
let disk = PhysicalDisk::new(
617-
Uuid::new_v4(),
617+
disk_id.into_untyped_uuid(),
618618
String::from("Oxide"),
619619
String::from("123"),
620620
String::from("FakeDisk"),
@@ -639,13 +639,7 @@ mod test {
639639

640640
// Remove the disk from the first sled
641641
datastore
642-
.physical_disk_delete(
643-
&opctx,
644-
disk.vendor.clone(),
645-
disk.serial.clone(),
646-
disk.model.clone(),
647-
disk.sled_id,
648-
)
642+
.physical_disk_delete(&opctx, disk_id)
649643
.await
650644
.expect("Failed to delete disk");
651645

0 commit comments

Comments
 (0)