Skip to content

Commit 768962e

Browse files
authored
feat: support provider v0.12.0 — Gateway API migration + cert-manager automation (#66)
* docs: add Gateway API migration implementation plan (#65) Step-by-step plan for adding v0.12.0 Gateway API support to provider-console-api. Covers new install flow, dedicated migration endpoint, cert-manager + DNS-01 (Cloudflare/CloudDNS) automation, helm values backup, and manual verification checklist. * config: bump provider defaults to v0.12.0 and add Gateway API/cert-manager pins (#65) * config: clarify version-prefix difference between GATEWAY_API_CRD_REF and NGINX_GATEWAY_FABRIC_VERSION (#65) * model: add CertManagerInput for cert-manager DNS-01 config (#65) * model: use Pydantic v2 ConfigDict + field_validator and reject malformed clouddns SA (#65) * model: require cert_manager block on ProviderBuildInput, default acme_email from provider.config.email (#65) * fix: handle empty loc on root validators and avoid in-place dict mutation (#65) * service: add GatewayApiService for NGF + akash-gateway install (#65) * service: ensure akash-default-tls temp key cleanup runs on failure (#65) * service: add CertManagerService with DNS-01 cloudflare/clouddns + wait-for-Ready (#65) * fix(certmgr): drop cloudflare solver email, suppress streaming on redact, harden ClusterIssuer + Certificate (#65) * service: drop ingress-nginx install from ProviderService and clean up NGF values file (#65) * service: drop redundant NGF values cleanup entry, ~/provider/ already covers it (#65) * service: wire Gateway API + cert-manager into build-provider task flow (#65) * service: add MigrationService for v0.11.x to v0.12.0 Gateway API migration (#65) * fix(migrate): catch InvalidVersion, refresh price script, drop dead __init__, use 404 for missing provider (#65) * api: add POST /provider/migrate-gateway-api endpoint with 14-task orchestrator (#65) * service: close SSH client and log errors in migrate_gateway_api (#65) * fix: heartbeat during cert wait + validate domain in migrate endpoint (#65) - Add periodic heartbeat to certificate readiness wait to prevent operator seeing 
timeout/hang appearance for 600s wait window (Fix C1) - Validate domain is non-empty string in migrate_gateway_api endpoint before attempting ACME challenge creation (Fix I3) * service: enable nginx-gateway-fabric snippets in NGF values (#65) Adds nginxGateway.snippets.enable=true to the values file written by install_nginx_gateway_fabric, which is shared by both the new-provider build flow and the v0.11.x to v0.12.0 migration flow. Providers stood up or migrated through the console now get NGF with snippets support enabled by default. * chore: ignore docs/ directory Keeps local design notes and planning docs (e.g. superpowers specs) out of the repo. * fix: preserve exception chains and tighten keyfile lookup (#65) Address reviewer nits across the migration code path: - provider_build: use dict.get for control_machine keyfile; suppress chaining on the ValidationError/KeyError/Exception handlers via `raise ... from None`. - cert_manager_service, gateway_api_service: chain ApplicationError with `from e` and use `{e!s}` formatting so the original traceback survives. - migration_service: explicitly suppress the InvalidVersion context with `raise ... from None`. * chore: bump AKASH_VERSION to v2.0.1 and node helm chart to 15.0.0 * fix: preserve traceback and exception chain in migration error handler
1 parent ce1bd4e commit 768962e

13 files changed

Lines changed: 2475 additions & 83 deletions

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,6 @@ __pycache__/
1616
bin/
1717
lib/
1818
Projects/
19-
pyvenv.cfg
19+
pyvenv.cfg
20+
21+
docs/*

application/api/provider_build.py

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from application.exception.application_error import ApplicationError
1111
from application.model.provider_build_input import ProviderBuildInput
1212
from application.model.machine_input import ControlMachineInput
13+
from application.model.cert_manager_input import CertManagerInput
1314
from application.service.akash_cluster_service import AkashClusterService
1415
from application.service.provider_service import ProviderService
1516
from application.utils.ssh_utils import get_ssh_client
@@ -52,7 +53,7 @@ def process_provider_build_input(data: Dict) -> ProviderBuildInput:
5253
"message": "The provided configuration is invalid.",
5354
"error_code": "VAL_005",
5455
"details": [
55-
{"field": error["loc"][0], "message": error["msg"]}
56+
{"field": error["loc"][0] if error["loc"] else "__root__", "message": error["msg"]}
5657
for error in e.errors()
5758
],
5859
},
@@ -134,7 +135,7 @@ async def build_provider(
134135
"message": "Invalid provider build input",
135136
"error_code": "VAL_006",
136137
"details": [
137-
{"field": error["loc"][0], "message": error["msg"]}
138+
{"field": error["loc"][0] if error["loc"] else "__root__", "message": error["msg"]}
138139
for error in ve.errors()
139140
],
140141
},
@@ -476,6 +477,100 @@ async def upgrade_provider(
476477
)
477478

478479

480+
def _migration_bad_request(message: str, error_code: str, details=None) -> HTTPException:
    """Build the 400 response used for every rejected migration request.

    Args:
        message: Human-readable description of what is wrong with the request.
        error_code: Application error code reported to the client.
        details: Optional list of per-field ``{"field", "message"}`` entries;
            the ``details`` key is omitted from the response when ``None``.

    Returns:
        An ``HTTPException`` ready to be raised by the endpoint.
    """
    error = {"message": message, "error_code": error_code}
    if details is not None:
        error["details"] = details
    return HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail={"status": "error", "error": error},
    )


@router.post("/provider/migrate-gateway-api", include_in_schema=False)
async def migrate_gateway_api(
    background_tasks: BackgroundTasks,
    machine_input: Dict,
    wallet_address: str = Depends(verify_token),
) -> Dict:
    """Validate a Gateway API migration request and schedule it in the background.

    The payload must contain ``control_machine`` (fields of
    ``ControlMachineInput``; a truthy ``keyfile`` is first passed through
    ``decode_keyfile_to_uploadfile``), ``cert_manager`` (fields of
    ``CertManagerInput``) and a non-empty string ``domain``.

    Args:
        background_tasks: FastAPI task queue the migration is added to.
        machine_input: Raw request body.
        wallet_address: Value produced by the ``verify_token`` dependency.

    Returns:
        A dict with a confirmation ``message`` and the generated ``action_id``.

    Raises:
        HTTPException: 400 for a missing field (``VAL_009``), model validation
            failure or wrongly-typed section (``VAL_008``), invalid ``domain``
            (``VAL_010``) or missing ``cert_manager.acme_email`` (``VAL_007``);
            500 (``PRV_009``) for any other failure while starting the task.
    """
    try:
        control_machine = machine_input["control_machine"]
        if not isinstance(control_machine, dict):
            # Without this guard the .get() below raises AttributeError and the
            # client receives a 500 for what is really a malformed request.
            raise _migration_bad_request(
                "Invalid migration input",
                "VAL_008",
                [{"field": "control_machine", "message": "control_machine must be an object"}],
            )
        # Work on a copy so the request payload is not mutated when the decoded
        # keyfile replaces the encoded one.
        control_machine = dict(control_machine)
        keyfile_value = control_machine.get("keyfile")
        if keyfile_value:
            control_machine["keyfile"] = decode_keyfile_to_uploadfile(keyfile_value)
        control_machine_input = ControlMachineInput(**control_machine)

        cert_manager = machine_input["cert_manager"]
        if not isinstance(cert_manager, dict):
            # ``**`` on a non-mapping raises TypeError, which would otherwise
            # fall through to the generic 500 handler.
            raise _migration_bad_request(
                "Invalid migration input",
                "VAL_008",
                [{"field": "cert_manager", "message": "cert_manager must be an object"}],
            )
        cert_manager_input = CertManagerInput(**cert_manager)

        domain = machine_input["domain"]
        if not isinstance(domain, str) or not domain.strip():
            raise _migration_bad_request(
                "domain is required and must be a non-empty string",
                "VAL_010",
            )
        # Hand the background task the same normalised value that was validated,
        # so surrounding whitespace never reaches the migration.
        domain = domain.strip()

        if not cert_manager_input.acme_email:
            raise _migration_bad_request(
                "cert_manager.acme_email is required for migration",
                "VAL_007",
            )

        action_id = str(uuid4())
        akash_cluster_service = AkashClusterService()
        background_tasks.add_task(
            akash_cluster_service.migrate_gateway_api,
            action_id,
            control_machine_input,
            cert_manager_input,
            domain,
            wallet_address,
        )
        return {
            "message": "Gateway API migration started successfully",
            "action_id": action_id,
        }
    except HTTPException:
        # Already shaped for the client; let it through untouched.
        raise
    except ValidationError as ve:
        raise _migration_bad_request(
            "Invalid migration input",
            "VAL_008",
            [
                # Root-level validator errors have an empty ``loc`` tuple.
                {"field": err["loc"][0] if err["loc"] else "__root__", "message": err["msg"]}
                for err in ve.errors()
            ],
        ) from None
    except KeyError as ke:
        raise _migration_bad_request(
            f"Missing required field: {ke}",
            "VAL_009",
        ) from None
    except Exception as e:
        log.exception("Error starting Gateway API migration")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail={
                "status": "error",
                "error": {
                    "message": f"An error occurred while starting migration: {e}",
                    "error_code": "PRV_009",
                },
            },
        ) from e
572+
573+
479574
@router.post("/restart-provider", include_in_schema=False)
480575
async def restart_provider(
481576
data: Dict,

application/config/config.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,32 @@ class Config:
2525
"https://raw.githubusercontent.com/akash-network/provider-configs/main/devices/pcie/gpus.json",
2626
)
2727
KEYRING_BACKEND = environ.get("KEYRING_BACKEND", "file")
28-
AKASH_VERSION = environ.get("AKASH_VERSION", "v1.0.0")
28+
AKASH_VERSION = environ.get("AKASH_VERSION", "v2.0.1")
2929
AKASH_NODE_HELM_CHART_VERSION = environ.get(
30-
"AKASH_NODE_HELM_CHART_VERSION", "14.0.0"
30+
"AKASH_NODE_HELM_CHART_VERSION", "15.0.0"
3131
)
32-
INGRESS_NGINX_VERSION = environ.get("INGRESS_NGINX_VERSION", "4.11.3")
33-
PROVIDER_SERVICES_VERSION = environ.get("PROVIDER_SERVICES_VERSION", "v0.10.1")
32+
PROVIDER_SERVICES_VERSION = environ.get("PROVIDER_SERVICES_VERSION", "v0.12.0")
3433
PROVIDER_SERVICES_HELM_CHART_VERSION = environ.get(
35-
"PROVIDER_SERVICES_HELM_CHART_VERSION", "14.0.3"
34+
"PROVIDER_SERVICES_HELM_CHART_VERSION", "16.0.0"
3635
)
37-
PROVIDER_HOSTNAME_OPERATOR_VERSION = environ.get("PROVIDER_HOSTNAME_OPERATOR_VERSION", "v0.10.0")
38-
PROVIDER_INVENTORY_OPERATOR_VERSION = environ.get("PROVIDER_INVENTORY_OPERATOR_VERSION", "v0.10.0")
36+
PROVIDER_HOSTNAME_OPERATOR_VERSION = environ.get("PROVIDER_HOSTNAME_OPERATOR_VERSION", "v0.12.0")
37+
PROVIDER_INVENTORY_OPERATOR_VERSION = environ.get("PROVIDER_INVENTORY_OPERATOR_VERSION", "v0.12.0")
3938
PROVIDER_PRICE_SCRIPT_URL = environ.get(
4039
"PROVIDER_PRICE_SCRIPT_URL",
4140
"https://raw.githubusercontent.com/akash-network/helm-charts/main/charts/akash-provider/scripts/price_script_generic.sh",
4241
)
4342
NVIDIA_DEVICE_PLUGIN_VERSION = environ.get("NVIDIA_DEVICE_PLUGIN_VERSION", "0.14.5")
4443
ROOK_CEPH_VERSION = environ.get("ROOK_CEPH_VERSION", "1.15.3")
4544

45+
# Gateway API + cert-manager (v0.12.0 Gateway API migration)
46+
CERT_MANAGER_VERSION = environ.get("CERT_MANAGER_VERSION", "v1.19.1")
47+
GATEWAY_API_CRD_REF = environ.get("GATEWAY_API_CRD_REF", "v2.5.1") # git tag ref for kubectl kustomize ?ref= — v-prefix required
48+
NGINX_GATEWAY_FABRIC_VERSION = environ.get("NGINX_GATEWAY_FABRIC_VERSION", "2.5.1") # helm --version, no v-prefix
49+
AKASH_GATEWAY_HELM_CHART_VERSION = environ.get("AKASH_GATEWAY_HELM_CHART_VERSION", "1.0.0")
50+
CERT_READY_TIMEOUT_SECONDS = int(environ.get("CERT_READY_TIMEOUT_SECONDS", "600"))
51+
LETSENCRYPT_PROD_SERVER = "https://acme-v02.api.letsencrypt.org/directory"
52+
LETSENCRYPT_STAGING_SERVER = "https://acme-staging-v02.api.letsencrypt.org/directory"
53+
4654
# Authentication
4755
HOST_NAME = environ.get("HOST_NAME")
4856
SECURITY_HOST = environ.get("SECURITY_HOST")
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from base64 import b64decode
2+
from typing import Literal, Optional
3+
4+
from pydantic import BaseModel, ConfigDict, EmailStr, SecretStr, field_validator, model_validator
5+
6+
7+
class CloudflareConfig(BaseModel):
    """Cloudflare settings block carried by ``CertManagerInput.cloudflare``."""

    # Unknown keys in the incoming block are a validation error rather than
    # being silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Cloudflare API token; typed as SecretStr rather than a plain string.
    api_token: SecretStr
11+
12+
13+
class CloudDnsConfig(BaseModel):
    """Cloud DNS settings block carried by ``CertManagerInput.clouddns``.

    ``service_account_json`` may be supplied as raw JSON text or as
    base64-encoded JSON. Either form is normalised to the raw JSON text before
    it is stored as a ``SecretStr``.
    """

    # Unknown keys in the incoming block are a validation error.
    model_config = ConfigDict(extra="forbid")

    project: str
    service_account_json: SecretStr  # raw JSON or base64-encoded JSON

    @field_validator("service_account_json", mode="before")
    @classmethod
    def _normalize_b64(cls, v):
        """Return the service-account key as stripped raw JSON text.

        Non-string input is returned unchanged so the regular field validation
        reports the type error.

        Raises:
            ValueError: If the value is neither JSON text nor base64 that
                decodes to JSON text, or if the resulting text does not parse
                as a JSON object.
        """
        import json  # local import keeps this block self-contained

        if not isinstance(v, str):
            return v

        bad_format = (
            "service_account_json must be raw JSON (starting with '{') "
            "or base64-encoded JSON"
        )

        raw = v.strip()
        if raw.startswith("{"):
            candidate = raw
        else:
            try:
                # b64decode stays in its lenient default mode so line-wrapped
                # base64 (embedded newlines) is still accepted. The decoded
                # text is stripped just like the raw path above.
                candidate = b64decode(raw).decode().strip()
            except ValueError as exc:
                # binascii.Error and UnicodeDecodeError are both ValueError
                # subclasses, so this covers bad base64 and non-UTF-8 bytes.
                raise ValueError(bad_format) from exc
            if not candidate.startswith("{"):
                raise ValueError(bad_format)

        # A leading "{" alone does not make the payload JSON; parse it so a
        # truncated or corrupted key is rejected here.
        try:
            parsed = json.loads(candidate)
        except ValueError as exc:
            raise ValueError("service_account_json is not valid JSON") from exc
        if not isinstance(parsed, dict):
            raise ValueError("service_account_json is not valid JSON")
        return candidate
40+
41+
42+
class CertManagerInput(BaseModel):
    """cert-manager request block: ACME e-mail, staging flag and DNS provider.

    ``dns_provider`` selects which of the two provider blocks must be present;
    the other one must be absent.
    """

    # Unknown keys in the incoming block are a validation error.
    model_config = ConfigDict(extra="forbid")

    acme_email: Optional[EmailStr] = None
    use_staging: bool = False
    dns_provider: Literal["cloudflare", "clouddns"]
    cloudflare: Optional[CloudflareConfig] = None
    clouddns: Optional[CloudDnsConfig] = None

    @model_validator(mode="after")
    def _exactly_one_provider_block(self):
        """Require the block matching ``dns_provider`` and forbid the other.

        Raises:
            ValueError: If the selected provider's block is missing or the
                other provider's block is also supplied.
        """
        if self.dns_provider == "cloudflare":
            selected, other = self.cloudflare, self.clouddns
            problem = "dns_provider=cloudflare requires `cloudflare` block and no `clouddns` block"
        else:
            selected, other = self.clouddns, self.cloudflare
            problem = "dns_provider=clouddns requires `clouddns` block and no `cloudflare` block"

        if selected is None or other is not None:
            raise ValueError(problem)
        return self

application/model/provider_build_input.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from pydantic import BaseModel, model_validator, field_validator
22
from typing import List, Optional, Literal
3-
from typing import Optional
43
from fastapi import UploadFile
54
from base64 import b64decode
65
import io
76

7+
from application.model.cert_manager_input import CertManagerInput
8+
89

910
class Node(BaseModel):
1011
hostname: str
@@ -93,3 +94,24 @@ class ProviderBuildInput(BaseModel):
9394
wallet: Wallet
9495
nodes: List[Node]
9596
provider: Provider
97+
cert_manager: CertManagerInput
98+
99+
@model_validator(mode="before")
@classmethod
def _default_acme_email_from_provider_email(cls, data):
    """Fill ``cert_manager.acme_email`` from ``provider.config.email`` when unset.

    Runs before field validation on the raw input. The input is never mutated:
    when a default is applied, shallow copies of the outer dict and of the
    ``cert_manager`` dict are returned instead.

    Args:
        data: Raw model input; anything that is not a dict is passed through.

    Returns:
        ``data`` unchanged, or a copy with ``cert_manager.acme_email`` set.
    """
    if not isinstance(data, dict):
        return data

    cm = data.get("cert_manager")
    if not isinstance(cm, dict) or cm.get("acme_email"):
        # Nothing to default: no cert_manager dict, or an e-mail is already set.
        return data

    # Guard each level: a wrongly-typed ``provider`` / ``config`` must reach the
    # normal field validation instead of raising AttributeError here.
    provider = data.get("provider")
    config = provider.get("config") if isinstance(provider, dict) else None
    email = config.get("email") if isinstance(config, dict) else None
    if not email:
        return data

    return {**data, "cert_manager": {**cm, "acme_email": email}}
112+
113+
@model_validator(mode="after")
def _require_acme_email(self):
    """Reject the model when ``cert_manager.acme_email`` is still ``None``.

    Raises:
        ValueError: If no ACME e-mail is set on the ``cert_manager`` block.
    """
    acme_email = self.cert_manager.acme_email
    if acme_email is not None:
        return self
    raise ValueError("cert_manager.acme_email is required (or set provider.config.email)")

0 commit comments

Comments
 (0)