-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprogram.py
More file actions
282 lines (257 loc) · 8.73 KB
/
program.py
File metadata and controls
282 lines (257 loc) · 8.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""Pulumi inline program for keras-remote infrastructure.
Defines all GCP resources needed for keras-remote: API services,
Artifact Registry, GKE cluster, and optional accelerator node pools.
"""
import pulumi
import pulumi_gcp as gcp
from keras_remote.cli.constants import (
MAX_CLUSTER_CPU,
MAX_CLUSTER_MEMORY_GB,
NODE_MAX_RUN_DURATION_SECONDS,
REQUIRED_APIS,
RESOURCE_NAME_PREFIX,
)
from keras_remote.constants import zone_to_ar_location, zone_to_region
from keras_remote.core.accelerators import GpuConfig, TpuConfig
# OAuth scopes required by all node pools (including accelerator pools).
_BASE_OAUTH_SCOPES = [
# Read/write access to GCS for storing checkpoints, datasets, and logs.
"https://www.googleapis.com/auth/devstorage.read_write",
# Write application logs to Cloud Logging.
"https://www.googleapis.com/auth/logging.write",
# Export metrics to Cloud Monitoring.
"https://www.googleapis.com/auth/monitoring",
]
# Additional scopes for the default (system) node pool, which runs GKE
# control-plane components that need deeper platform integration.
_DEFAULT_POOL_OAUTH_SCOPES = _BASE_OAUTH_SCOPES + [
# Report service status to Google Service Control.
"https://www.googleapis.com/auth/servicecontrol",
# Read managed-service configuration from Service Management.
"https://www.googleapis.com/auth/service.management.readonly",
# Send distributed traces to Cloud Trace.
"https://www.googleapis.com/auth/trace.append",
]
def create_program(config):
    """Create a Pulumi inline program function closed over the config.

    Args:
        config: InfraConfig instance.

    Returns:
        A callable suitable for pulumi.automation.create_or_select_stack().
    """

    def pulumi_program():
        # Pull everything we need out of the closed-over config up front.
        project_id = config.project
        zone = config.zone
        ar_location = zone_to_ar_location(zone)
        cluster_name = config.cluster_name
        node_pools = config.node_pools

        # 1. Enable GCP APIs
        enabled_apis = []
        for api in REQUIRED_APIS:
            svc = gcp.projects.Service(
                # Resource name derived from the service's first label,
                # e.g. "container.googleapis.com" -> "api-container".
                f"api-{api.split('.')[0]}",
                service=api,
                project=project_id,
                # Leave the APIs (and anything depending on them) enabled
                # when the stack is destroyed — other workloads may use them.
                disable_on_destroy=False,
                disable_dependent_services=False,
            )
            enabled_apis.append(svc)

        # 2. Artifact Registry docker repository
        repo = gcp.artifactregistry.Repository(
            "keras-remote-repo",
            repository_id=f"kr-{cluster_name}",
            location=ar_location,
            format="DOCKER",
            description="keras-remote container images",
            project=project_id,
            # All resources below wait for API enablement before creation.
            opts=pulumi.ResourceOptions(depends_on=enabled_apis),
        )

        # 3. Cloud Storage buckets
        region = zone_to_region(zone)
        gcp.storage.Bucket(
            "keras-remote-jobs-bucket",
            # Bucket names are global, so prefix with the project id.
            name=f"{project_id}-kr-{cluster_name}-jobs",
            location=region,
            project=project_id,
            # Allow `pulumi destroy` to delete the bucket even if non-empty.
            force_destroy=True,
            opts=pulumi.ResourceOptions(depends_on=enabled_apis),
        )
        gcp.storage.Bucket(
            "keras-remote-builds-bucket",
            name=f"{project_id}-kr-{cluster_name}-builds",
            location=ar_location,
            project=project_id,
            force_destroy=True,
            opts=pulumi.ResourceOptions(depends_on=enabled_apis),
        )

        # 4. GKE Cluster
        cluster = gcp.container.Cluster(
            "keras-remote-cluster",
            name=cluster_name,
            location=zone,
            project=project_id,
            initial_node_count=1,
            remove_default_node_pool=False,
            # Config for the default (system) node pool.
            node_config=gcp.container.ClusterNodeConfigArgs(
                machine_type="e2-standard-4",
                disk_size_gb=50,
                oauth_scopes=_DEFAULT_POOL_OAUTH_SCOPES,
            ),
            # Match setup.sh: --no-enable-autoupgrade
            release_channel=gcp.container.ClusterReleaseChannelArgs(
                channel="UNSPECIFIED",
            ),
            deletion_protection=False,
            # Node auto-provisioning: GKE may create pools on demand, capped
            # by the cluster-wide CPU/memory limits below.
            cluster_autoscaling=gcp.container.ClusterClusterAutoscalingArgs(
                enabled=True,
                autoscaling_profile="OPTIMIZE_UTILIZATION",
                auto_provisioning_defaults=gcp.container.ClusterClusterAutoscalingAutoProvisioningDefaultsArgs(
                    oauth_scopes=_DEFAULT_POOL_OAUTH_SCOPES,
                    management=gcp.container.ClusterClusterAutoscalingAutoProvisioningDefaultsManagementArgs(
                        auto_upgrade=True,
                        auto_repair=True,
                    ),
                ),
                resource_limits=[
                    gcp.container.ClusterClusterAutoscalingResourceLimitArgs(
                        resource_type="cpu",
                        maximum=MAX_CLUSTER_CPU,
                    ),
                    gcp.container.ClusterClusterAutoscalingResourceLimitArgs(
                        resource_type="memory",
                        maximum=MAX_CLUSTER_MEMORY_GB,
                    ),
                ],
            ),
            opts=pulumi.ResourceOptions(depends_on=enabled_apis),
        )

        # 5. Accelerator node pools (zero or more)
        pool_entries = []
        for np in node_pools:
            accel = np.accelerator
            pool_name = np.name
            if isinstance(accel, GpuConfig):
                pool = _create_gpu_node_pool(
                    cluster, accel, zone, project_id, pool_name
                )
            elif isinstance(accel, TpuConfig):
                pool = _create_tpu_node_pool(
                    cluster, accel, zone, project_id, pool_name
                )
            else:
                # Unknown accelerator type: skip rather than fail the deploy.
                continue
            pool_entries.append((accel, pool))

        # 6. Stack exports
        # Exports that reference resource outputs (e.g. cluster.name,
        # repo.name, pool.name) create Pulumi dependencies — the export
        # only resolves when the underlying resource is successfully created.
        pulumi.export("project", project_id)
        pulumi.export("zone", zone)
        pulumi.export("cluster_name", cluster.name)
        pulumi.export("cluster_endpoint", cluster.endpoint)
        pulumi.export(
            "ar_registry",
            # The registry URL is statically known, but routing it through
            # repo.name.apply makes the export depend on repo creation.
            repo.name.apply(
                lambda _: f"{ar_location}-docker.pkg.dev/{project_id}/kr-{cluster_name}"
            ),
        )

        # 7. Accelerator node pool exports (list of dicts)
        if not pool_entries:
            pulumi.export("accelerators", [])
        else:
            export_outputs = []
            for accel, pool in pool_entries:
                if isinstance(accel, GpuConfig):
                    # `a=accel` binds the loop variable at definition time,
                    # avoiding the classic late-binding closure bug.
                    entry = pool.name.apply(
                        lambda pn, a=accel: {
                            "type": "GPU",
                            "name": a.name,
                            "count": a.count,
                            "machine_type": a.machine_type,
                            "node_pool": pn,
                            "node_count": 1,
                        }
                    )
                else:  # TpuConfig
                    entry = pool.name.apply(
                        lambda pn, a=accel: {
                            "type": "TPU",
                            "name": a.name,
                            "chips": a.chips,
                            "topology": a.topology,
                            "machine_type": a.machine_type,
                            "node_pool": pn,
                            "node_count": a.num_nodes,
                        }
                    )
                export_outputs.append(entry)
            # Output.all collapses the per-pool Outputs into one list Output.
            pulumi.export("accelerators", pulumi.Output.all(*export_outputs))

    return pulumi_program
def _create_gpu_node_pool(cluster, gpu: GpuConfig, zone, project_id, pool_name):
    """Create a GPU-accelerated GKE node pool.

    The pool scales from zero: nodes are provisioned only when workloads
    request them, up to a cap of 10 nodes.
    """
    accelerator = gcp.container.NodePoolNodeConfigGuestAcceleratorArgs(
        type=gpu.gke_label,
        count=gpu.count,
    )
    node_config = gcp.container.NodePoolNodeConfigArgs(
        machine_type=gpu.machine_type,
        oauth_scopes=_BASE_OAUTH_SCOPES,
        guest_accelerators=[accelerator],
        labels={RESOURCE_NAME_PREFIX: "true"},
        # Nodes are recycled after 24 hours.
        max_run_duration=f"{NODE_MAX_RUN_DURATION_SECONDS}s",
    )
    return gcp.container.NodePool(
        pool_name,
        name=pool_name,
        cluster=cluster.name,
        location=zone,
        project=project_id,
        initial_node_count=0,
        autoscaling=gcp.container.NodePoolAutoscalingArgs(
            min_node_count=0,
            max_node_count=10,
        ),
        management=gcp.container.NodePoolManagementArgs(
            auto_repair=True,
            auto_upgrade=True,
        ),
        node_config=node_config,
    )
def _create_tpu_node_pool(cluster, tpu: TpuConfig, zone, project_id, pool_name):
    """Create a TPU GKE node pool.

    The pool scales from zero up to the number of hosts in the TPU slice.
    """
    # Multi-host slices require COMPACT placement with an explicit topology;
    # single-host slices (one node) must not specify placement_policy at all.
    if tpu.num_nodes > 1:
        placement_policy = gcp.container.NodePoolPlacementPolicyArgs(
            type="COMPACT",
            tpu_topology=tpu.topology,
        )
    else:
        placement_policy = None
    return gcp.container.NodePool(
        pool_name,
        name=pool_name,
        cluster=cluster.name,
        location=zone,
        project=project_id,
        # Autoscaling is enabled, so start (and allow scaling down to) zero.
        initial_node_count=0,
        autoscaling=gcp.container.NodePoolAutoscalingArgs(
            min_node_count=0,
            max_node_count=tpu.num_nodes,
        ),
        management=gcp.container.NodePoolManagementArgs(
            auto_repair=True,
            auto_upgrade=True,
        ),
        node_config=gcp.container.NodePoolNodeConfigArgs(
            machine_type=tpu.machine_type,
            oauth_scopes=_BASE_OAUTH_SCOPES,
            labels={RESOURCE_NAME_PREFIX: "true"},
            # Nodes are recycled after 24 hours.
            max_run_duration=f"{NODE_MAX_RUN_DURATION_SECONDS}s",
        ),
        placement_policy=placement_policy,
    )