Skip to content

Commit 4676653

Browse files
[K9VULN-14660] fix(agentless-azure): harden agentless azure destroy command (#187)
* fix(agentless-azure): display manual destroy command only if RG exists * fix(agentless-azure): harden destroy cleanup output and key vault soft-delete handling * build(agentless-azure): re-build dist scripts * fix(agentless-azure): support collaborative incremental setup between multiple users * fix(agentless-azure): purge key vault on destroy * build(agentless-azure): re-build dist scripts * fix(azure-agentless): fix keyvault redeploy after destroy * fix(azure-agentless): pass scanner subscription to blob data role grant * fix(azure-agentless): thread scanner subscription through Key Vault and destroy paths * fix(azure-agentless): stop masking RBAC failures and skip ssh-keygen on destroy * refactor(agentless-azure): remove unused ensure_state_storage / ensure_api_key_secret helpers * refactor(agentless-azure): unify storage blob existence probes behind a shared primitive * refactor(agentless-azure): consolidate current-user RBAC grants into a shared helper * refactor(agentless-azure): reuse destroy metadata read instead of re-fetching for terraform regen * refactor(agentless-azure): introduce InfoReporter protocol and lift shared destroy utilities * refactor(agentless-azure): replace KeyVault/StorageAccount error boilerplate with a decorator * fix(agentless-azure): include inherited and group RBAC assignments in idempotency check * build(agentless-azure): re-build dist scripts
1 parent 8d65f26 commit 4676653

14 files changed

Lines changed: 1518 additions & 594 deletions

File tree

26.7 KB
Binary file not shown.

azure/agentless/src/azure_agentless_setup/config.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,40 @@
2525
INSTALL_ID_LEN = 12
2626

2727

28+
def parse_credentials() -> tuple[str, str, str]:
29+
"""Read and validate Datadog credentials from environment variables.
30+
31+
Returns ``(api_key, app_key, site)``.
32+
33+
Used by the destroy command, which only needs credentials and not
34+
the full deploy-time configuration. The deploy path goes through
35+
:func:`parse_config` instead so it can aggregate credential errors
36+
with the rest of the env-var validation in a single message.
37+
38+
Raises:
39+
ConfigurationError: If any of the three credentials is missing.
40+
"""
41+
api_key = os.environ.get("DD_API_KEY", "").strip()
42+
app_key = os.environ.get("DD_APP_KEY", "").strip()
43+
site = os.environ.get("DD_SITE", "").strip()
44+
45+
errors = []
46+
if not api_key:
47+
errors.append("DD_API_KEY is required")
48+
if not app_key:
49+
errors.append("DD_APP_KEY is required")
50+
if not site:
51+
errors.append("DD_SITE is required")
52+
53+
if errors:
54+
raise ConfigurationError(
55+
"Missing credentials",
56+
"\n".join(f" - {e}" for e in errors),
57+
)
58+
59+
return api_key, app_key, site
60+
61+
2862
def compute_install_id(scanner_subscription: str, resource_group: str) -> str:
2963
"""Derive a stable per-install identifier from ``(subscription, RG)``.
3064

azure/agentless/src/azure_agentless_setup/destroy.py

Lines changed: 142 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -10,29 +10,37 @@
1010
from pathlib import Path
1111
from typing import Optional, Tuple
1212

13-
from az_shared.execute_cmd import execute
14-
from common.shell import Cmd
15-
1613
from .agentless_api import deactivate_scan_options
1714
from .config import (
1815
CONFIG_BASE_DIR,
1916
Config,
2017
DEFAULT_RESOURCE_GROUP,
2118
compute_install_id,
2219
get_config_dir,
20+
parse_credentials,
2321
)
2422
from .errors import ConfigurationError, SetupError
2523
from .metadata import (
24+
MetadataReadResult,
2625
MetadataReadStatus,
2726
delete_metadata,
2827
read_metadata,
2928
rg_mismatch_detail,
3029
terraform_state_exists,
3130
)
32-
from .secrets import API_KEY_SECRET_NAME, get_key_vault_name
31+
from .secrets import (
32+
API_KEY_SECRET_NAME,
33+
get_key_vault_name,
34+
key_vault_exists,
35+
purge_key_vault,
36+
soft_deleted_key_vault_exists,
37+
)
38+
from .reporter import PrintReporter
3339
from .state_storage import (
40+
ensure_current_user_blob_data_access,
3441
find_agentless_resource_groups,
3542
get_storage_account_name,
43+
resource_group_exists,
3644
storage_account_exists,
3745
)
3846
from .terraform import generate_ssh_key, generate_terraform_config, generate_tfvars
@@ -154,47 +162,23 @@ def _resolve_destroy_resource_group(
154162
return env_rg or DEFAULT_RESOURCE_GROUP
155163

156164

157-
def get_credentials_from_env() -> tuple:
158-
"""Get Datadog credentials from environment variables.
159-
160-
Returns:
161-
Tuple of (api_key, app_key, site)
162-
163-
Raises:
164-
SetupError: If any required credentials are missing.
165-
"""
166-
api_key = os.environ.get("DD_API_KEY", "").strip()
167-
app_key = os.environ.get("DD_APP_KEY", "").strip()
168-
site = os.environ.get("DD_SITE", "").strip()
169-
170-
errors = []
171-
if not api_key:
172-
errors.append("DD_API_KEY is required")
173-
if not app_key:
174-
errors.append("DD_APP_KEY is required")
175-
if not site:
176-
errors.append("DD_SITE is required")
177-
178-
if errors:
179-
raise SetupError(
180-
"Missing credentials",
181-
"\n".join(f" - {e}" for e in errors),
182-
)
183-
184-
return api_key, app_key, site
185-
186-
187165
def regenerate_terraform_config(
188166
scanner_subscription: str,
189167
storage_account: str,
190168
resource_group: str,
191169
config_folder: Path,
170+
metadata_result: Optional[MetadataReadResult] = None,
192171
) -> Path:
193172
"""Regenerate Terraform configuration when local folder doesn't exist.
194173
195-
Reads deployment metadata from Azure Blob Storage to discover all
196-
locations and subscriptions. Falls back to requiring environment
197-
variables if metadata is not available.
174+
When ``metadata_result`` is provided, reuses the caller's metadata
175+
read instead of issuing a second ``az storage blob show`` round-trip:
176+
``cmd_destroy`` already reads the blob to recover the scan
177+
subscription list before deciding to call this helper, so a fresh
178+
read here would only duplicate work and risk inconsistency on
179+
transient failures. Falls back to environment variables if metadata
180+
is not available (the typical "user wiped local config and we have
181+
no blob to read" path).
198182
199183
Generates a throwaway SSH key pair with :func:`terraform.generate_ssh_key` (same as
200184
deploy): Azure only needs a decodable public key in ``terraform.tfvars`` for refresh/destroy;
@@ -206,9 +190,9 @@ def regenerate_terraform_config(
206190
"""
207191
print("Local config not found, but Terraform state exists in storage account.")
208192

209-
api_key, app_key, site = get_credentials_from_env()
193+
api_key, app_key, site = parse_credentials()
210194

211-
result = read_metadata(storage_account)
195+
result = metadata_result if metadata_result is not None else read_metadata(storage_account)
212196
metadata = result.metadata if result.status == MetadataReadStatus.PRESENT else None
213197
if metadata:
214198
print(
@@ -279,9 +263,14 @@ def get_working_directory(
279263
scanner_subscription: str,
280264
storage_account: str,
281265
resource_group: str,
266+
metadata_result: Optional[MetadataReadResult] = None,
282267
) -> Tuple[Path, Optional[Path]]:
283268
"""Get the working directory for Terraform, regenerating config if needed.
284269
270+
``metadata_result`` is forwarded to :func:`regenerate_terraform_config`
271+
so the caller can pass a previously-read ``MetadataReadResult`` and
272+
avoid re-reading the deployment metadata blob on the regen path.
273+
285274
Returns:
286275
``(work_dir, ssh_key_temp_dir)``. The second item is a directory to remove
287276
after destroy (only set when config was regenerated and contains a
@@ -317,7 +306,11 @@ def get_working_directory(
317306
sys.exit(1)
318307

319308
ssh_tmp = regenerate_terraform_config(
320-
scanner_subscription, storage_account, resource_group, config_folder
309+
scanner_subscription,
310+
storage_account,
311+
resource_group,
312+
config_folder,
313+
metadata_result,
321314
)
322315
return config_folder, ssh_tmp
323316

@@ -362,64 +355,88 @@ def run_terraform_destroy(work_dir: Path) -> None:
362355
os.chdir(original_dir)
363356

364357

365-
def delete_key_vault(vault_name: str) -> bool:
366-
"""Delete the Key Vault.
367-
368-
Returns:
369-
True if deleted, False if the az CLI reported failure.
358+
def cleanup_key_vault(
359+
install_id: str, resource_group: str, subscription: str
360+
) -> None:
361+
"""Purge the Key Vault left behind by Terraform.
362+
363+
Terraform does not manage the vault itself (the API-key secret is
364+
written by the wizard before any plan runs), so destroy never
365+
cleans it up automatically. Without a purge, Azure keeps the
366+
soft-deleted vault reserved for its retention window (7 days for
367+
wizard-created vaults) - the recurring root cause of
368+
``VaultAlreadyExists`` on re-deploy.
369+
370+
Always purges: ``terraform destroy`` has already confirmed the
371+
user's intent by the time we get here. Triggers purge when either
372+
a live vault or a soft-deleted descriptor exists; skips only when
373+
Azure has neither. The soft-deleted check is essential because a
374+
previous destroy may have run ``az keyvault delete`` and then
375+
failed at the purge step - ``key_vault_exists`` then returns False
376+
on retry and would otherwise leave the name reserved for the full
377+
retention window.
370378
"""
371-
try:
372-
execute(
373-
Cmd(["az", "keyvault", "delete"])
374-
.param("--name", vault_name)
375-
.flag("--no-wait"),
376-
)
377-
return True
378-
except Exception:
379-
return False
379+
vault_name = get_key_vault_name(install_id)
380380

381+
live = key_vault_exists(vault_name, resource_group, subscription)
382+
soft_deleted = (
383+
not live and soft_deleted_key_vault_exists(vault_name, subscription)
384+
)
381385

382-
def prompt_key_vault_cleanup(install_id: str) -> None:
383-
"""Ask user if they want to delete the Key Vault."""
384-
vault_name = get_key_vault_name(install_id)
386+
if not live and not soft_deleted:
387+
print("Key Vault already removed:")
388+
print(f" {vault_name}")
389+
print()
390+
return
385391

386-
print("=" * 60)
387-
print(" Cleanup Options")
388-
print("=" * 60)
389-
print()
390-
print("The Key Vault was NOT deleted by Terraform:")
391-
print(f" {vault_name}")
392-
print()
393-
print("This vault holds the Datadog API key and may be reused for future deployments.")
392+
if soft_deleted:
393+
print(f"Purging soft-deleted Key Vault {vault_name}...")
394+
else:
395+
print(f"Purging Key Vault {vault_name}...")
396+
if purge_key_vault(vault_name, subscription):
397+
print(f"✅ Key Vault purged: {vault_name}")
398+
else:
399+
print(f"⚠️ Failed to purge Key Vault {vault_name}.")
394400
print()
395401

396-
try:
397-
response = input("Do you want to delete the Key Vault? (y/N): ").strip().lower()
398-
if response in ("y", "yes"):
399-
print("Deleting Key Vault...")
400-
if delete_key_vault(vault_name):
401-
print("✅ Key Vault deleted.")
402-
print(" Note: Azure retains soft-deleted vaults for the configured")
403-
print(" retention period. It will be auto-purged after that.")
404-
else:
405-
print("⚠️ Failed to delete Key Vault (it may not exist or you lack permissions).")
406-
else:
407-
print("Key Vault kept.")
408-
except EOFError:
409-
print("Key Vault kept (non-interactive mode).")
410402

403+
def print_final_notes(
404+
storage_account: str, resource_group: str, scanner_subscription: str
405+
) -> None:
406+
"""Print final notes about resources that Terraform did not delete.
411407
412-
def print_final_notes(storage_account: str, resource_group: str) -> None:
413-
"""Print final notes about resources not deleted."""
408+
The state Storage Account, Key Vault, and Resource Group are created
409+
by the Python wizard rather than Terraform, so ``terraform destroy``
410+
leaves them in place by design. When the user has already removed
411+
them out-of-band (typically via ``az group delete``), telling them to
412+
run it again would only surface a ``ResourceGroupNotFound`` error,
413+
so we adjust the notes to match the actual state.
414+
"""
414415
print()
416+
rg_present = resource_group_exists(resource_group, scanner_subscription)
417+
sa_present = rg_present and storage_account_exists(
418+
storage_account, resource_group, scanner_subscription
419+
)
420+
421+
if not rg_present:
422+
print("=" * 60)
423+
print(" Cleanup Complete")
424+
print("=" * 60)
425+
print()
426+
print(f"Resource group already removed: {resource_group}")
427+
print("No further manual cleanup needed.")
428+
print()
429+
return
430+
415431
print("=" * 60)
416432
print(" Notes")
417433
print("=" * 60)
418434
print()
419435
print("The following resources were NOT deleted:")
420436
print()
421437
print(f" Resource Group: {resource_group}")
422-
print(f" Storage Account: {storage_account} (contains Terraform state)")
438+
if sa_present:
439+
print(f" Storage Account: {storage_account} (contains Terraform state)")
423440
print()
424441
print("You can delete the resource group manually if no longer needed:")
425442
print(f" az group delete --name {resource_group} --yes --no-wait")
@@ -443,7 +460,7 @@ def cmd_destroy() -> None:
443460
# Fail fast if Datadog credentials aren't set — they're required
444461
# below for scan options cleanup and for regenerate_terraform_config.
445462
# The returned values are re-read by the callers that actually use them.
446-
get_credentials_from_env()
463+
parse_credentials()
447464

448465
scanner_subscription = get_scanner_subscription()
449466

@@ -463,15 +480,48 @@ def cmd_destroy() -> None:
463480
install_id = compute_install_id(scanner_subscription, resource_group)
464481
storage_account = get_storage_account(install_id)
465482

466-
# Metadata is no longer authoritative for the resource group, but
467-
# we still read it to recover the list of scan subscriptions for
468-
# the Datadog-API cleanup at the end.
469-
result = read_metadata(storage_account)
470-
metadata = result.metadata if result.status == MetadataReadStatus.PRESENT else None
471-
subscriptions_to_scan = metadata.subscriptions_to_scan if metadata else []
483+
# Storage Blob Data Contributor is a *data-plane* role; Owner on
484+
# the resource group does not include it. Grant it to the current
485+
# user before the metadata read so a user destroying a deployment
486+
# created by someone else doesn't trip over an opaque
487+
# "permissions" error from `az storage blob show`. When the SA
488+
# does not exist (RG already nuked out-of-band, or a partial
489+
# deploy never created it) the metadata read is also skipped:
490+
# ``az storage blob show`` would DNS-fail on the missing account
491+
# endpoint and produce noisy stderr that ``_classify_blob_show_failure``
492+
# cannot recognise as a clean MISSING.
493+
sa_present = storage_account_exists(
494+
storage_account, resource_group, scanner_subscription
495+
)
496+
497+
metadata_result: Optional[MetadataReadResult] = None
498+
subscriptions_to_scan: list[str] = []
499+
if sa_present:
500+
ensure_current_user_blob_data_access(
501+
storage_account,
502+
resource_group,
503+
scanner_subscription,
504+
PrintReporter(),
505+
)
506+
507+
# Metadata is no longer authoritative for the resource group, but
508+
# we still read it to recover the list of scan subscriptions for
509+
# the Datadog-API cleanup at the end. The result is also handed
510+
# down to ``get_working_directory`` so the regen path doesn't
511+
# round-trip the same blob again.
512+
metadata_result = read_metadata(storage_account)
513+
metadata = (
514+
metadata_result.metadata
515+
if metadata_result.status == MetadataReadStatus.PRESENT
516+
else None
517+
)
518+
subscriptions_to_scan = metadata.subscriptions_to_scan if metadata else []
472519

473520
work_dir, destroy_ssh_key_dir = get_working_directory(
474-
scanner_subscription, storage_account, resource_group
521+
scanner_subscription,
522+
storage_account,
523+
resource_group,
524+
metadata_result,
475525
)
476526

477527
try:
@@ -498,8 +548,8 @@ def cmd_destroy() -> None:
498548
else:
499549
print("⚠️ Keeping deployment metadata so you can re-run `destroy` to retry.")
500550

501-
prompt_key_vault_cleanup(install_id)
502-
print_final_notes(storage_account, resource_group)
551+
cleanup_key_vault(install_id, resource_group, scanner_subscription)
552+
print_final_notes(storage_account, resource_group, scanner_subscription)
503553

504554
except SetupError as e:
505555
print()

0 commit comments

Comments
 (0)