-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathmojo_host_platform.bzl
More file actions
234 lines (196 loc) · 9 KB
/
mojo_host_platform.bzl
File metadata and controls
234 lines (196 loc) · 9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""Setup a host platform that takes into account current GPU hardware"""
def _verbose_log(rctx, msg):
    """Prints a debug message when verbose GPU detection is enabled.

    Args:
        rctx: The repository context, used to read the environment.
        msg: The message to print.
    """

    # Stay silent unless MOJO_VERBOSE_GPU_DETECT is set to a non-empty value.
    if not rctx.getenv("MOJO_VERBOSE_GPU_DETECT"):
        return

    # buildifier: disable=print
    print(msg)
def _log_result(rctx, binary, result):
    """Logs the outcome of an executed command through the verbose logger.

    Args:
        rctx: The repository context.
        binary: Label identifying the command that was run, used in the banner.
        result: The execution result; its `return_code`, `stdout` and `stderr`
            fields are included in the report.
    """
    template = "\n------ {binary}:\nexit status: {exit_status}\nstdout: {stdout}\nstderr: {stderr}\n------ end {binary} info"
    report = template.format(
        binary = binary,
        exit_status = result.return_code,
        stdout = result.stdout,
        stderr = result.stderr,
    )
    _verbose_log(rctx, report)
def _fail(rctx, msg):
    """Reports an unrecognized GPU, either as a hard failure or as a warning.

    Args:
        rctx: The repository context, used to read the environment.
        msg: Description of the unrecognized GPU.
    """

    # Only the exact value "1" downgrades the failure to a warning.
    ignore_unknown = rctx.getenv("MOJO_IGNORE_UNKNOWN_GPUS") == "1"
    if not ignore_unknown:
        fail(msg)
    else:
        # buildifier: disable=print
        print("WARNING: ignoring unknown GPU, to support it, add it to the gpu_mapping in the MODULE.bazel: {}".format(msg))
def _get_amdgpu_constraint(rctx, series, gpu_mapping):
    """Maps an AMD product/series string to a GPU constraint label.

    Args:
        rctx: The repository context, used when reporting an unknown GPU.
        series: The product name reported by amd-smi or rocm-smi.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        The constraint label for the first mapping key contained in `series`,
        or None when that key maps to an empty value or nothing matches.
    """

    # Keys are matched as substrings, in the mapping's iteration order.
    matches = [target for name, target in gpu_mapping.items() if name in series]
    if matches:
        target = matches[0]

        # An empty value marks a known GPU that yields no constraint.
        return "@mojo_gpu_toolchains//:{}_gpu".format(target) if target else None

    _fail(rctx, "Unrecognized amd-smi/rocm-smi output, please add it to your gpu_mapping in the MODULE.bazel file: {}".format(series))
    return None
def _get_rocm_constraint(rctx, blob, gpu_mapping):
    """Derives the GPU constraint from decoded rocm-smi JSON output.

    Args:
        rctx: The repository context.
        blob: Decoded rocm-smi JSON, a dict whose values carry a "Card Series" entry.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        The constraint label for the first card, or None when it maps to nothing.
    """
    cards = blob.values()
    if len(cards) == 0:
        fail("Unrecognized rocm-smi output, please report: {}".format(blob))

    # Only the first reported card decides the constraint.
    return _get_amdgpu_constraint(rctx, cards[0]["Card Series"], gpu_mapping)
def _get_amd_constraint(rctx, blob, gpu_mapping):
    """Derives the GPU constraint from decoded amd-smi JSON output.

    Args:
        rctx: The repository context.
        blob: Decoded amd-smi GPU entries; each entry is read as
            `entry["board"]["product_name"]`.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        The constraint label for the first entry, or None when it maps to nothing.
    """
    entries = list(blob)
    if len(entries) == 0:
        fail("Unrecognized amd-smi output, please report: {}".format(blob))

    # Only the first reported GPU decides the constraint.
    product_name = entries[0]["board"]["product_name"]
    return _get_amdgpu_constraint(rctx, product_name, gpu_mapping)
def _get_nvidia_constraint(rctx, lines, gpu_mapping):
    """Maps nvidia-smi GPU names to a GPU constraint label.

    Args:
        rctx: The repository context, used when reporting an unknown GPU.
        lines: Non-empty list of GPU names, one per GPU, as printed by nvidia-smi.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        The constraint label for the first mapping key contained in the first
        line, or None when that key maps to an empty value or nothing matches.
    """

    # Only the first reported GPU decides the constraint.
    first_gpu = lines[0]
    for name in gpu_mapping:
        if name not in first_gpu:
            continue

        target = gpu_mapping[name]
        if not target:
            # An empty value marks a known GPU that yields no constraint.
            return None
        return "@mojo_gpu_toolchains//:{}_gpu".format(target)

    _fail(rctx, "Unrecognized nvidia-smi output, please add it to your gpu_mapping in the MODULE.bazel file: {}".format(lines))
    return None
def _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping):
    """Detects AMD GPUs with rocm-smi and returns the matching constraints.

    Args:
        rctx: The repository context.
        rocm_smi: Path to the rocm-smi binary, or a falsy value when absent.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        A list of constraint labels; empty when rocm-smi is missing, fails,
        prints nothing, or reports a GPU that maps to no constraint.
    """
    if not rocm_smi:
        return []

    result = rctx.execute([rocm_smi, "--json", "--showproductname"])
    _log_result(rctx, rocm_smi, result)

    # stdout is empty when the driver is not initialized.
    if result.return_code != 0 or len(result.stdout) == 0:
        return []

    blob = json.decode(result.stdout)
    gpu_count = len(blob.keys())
    if gpu_count == 0:
        fail("rocm-smi succeeded but didn't actually have any GPUs, please report this issue")

    rocm_constraint = _get_rocm_constraint(rctx, blob, gpu_mapping)
    if not rocm_constraint:
        return []

    detected = [
        rocm_constraint,
        "@mojo_gpu_toolchains//:amd_gpu",
        "@mojo_gpu_toolchains//:has_gpu",
    ]
    if gpu_count > 1:
        detected.append("@mojo_gpu_toolchains//:has_multi_gpu")
    if gpu_count >= 4:
        detected.append("@mojo_gpu_toolchains//:has_4_gpus")
    return detected
def _get_apple_constraint(rctx, gpu_mapping):
    """Detects the Apple GPU on a macOS host and maps it to a constraint label.

    Args:
        rctx: The repository context.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        The constraint label for the first mapping key contained in the
        "Metal Support:" line of system_profiler, or None when the macOS major
        version is below 15, system_profiler fails, no such line exists, the
        key maps to an empty value, or nothing matches.
    """
    sw_vers = "/usr/bin/sw_vers"
    result = rctx.execute([sw_vers, "--productVersion"])

    # Log under the path that was actually executed.
    _log_result(rctx, sw_vers + " --productVersion", result)
    if result.return_code != 0:
        fail("sw_vers failed, please report this issue: {}".format(result.stderr))

    # Strip the trailing newline so a version without a "." still parses.
    major_version = int(result.stdout.strip().split(".")[0])
    if major_version < 15:
        return None  # Metal < 3.2 is not supported

    result = rctx.execute(["/usr/sbin/system_profiler", "SPDisplaysDataType"])

    # Log before bailing out so a failing system_profiler shows up in verbose output.
    _log_result(rctx, "/usr/sbin/system_profiler SPDisplaysDataType", result)
    if result.return_code != 0:
        return None  # TODO: Should we fail instead?

    metal_version = None
    for line in result.stdout.splitlines():
        if "Metal Support:" in line:
            metal_version = line
            break

    if not metal_version:  # macOS VMs may not have GPUs attached
        return None

    # Keys are matched as substrings, in the mapping's iteration order.
    for gpu_name, constraint in gpu_mapping.items():
        if gpu_name in metal_version:
            if constraint:
                return "@mojo_gpu_toolchains//:{}_gpu".format(constraint)
            else:
                # An empty value marks a known GPU that yields no constraint.
                return None

    _fail(rctx, "Unrecognized system_profiler output, please add it to your gpu_mapping in the MODULE.bazel file: {}".format(result.stdout))
    return None
def _gpu_count_constraints(gpu_count):
    """Returns the extra constraints implied by the number of detected GPUs.

    Args:
        gpu_count: Number of GPUs reported by the vendor tool.

    Returns:
        A list containing `has_multi_gpu` when more than one GPU is present
        and `has_4_gpus` when at least four are present.
    """
    count_constraints = []
    if gpu_count > 1:
        count_constraints.append("@mojo_gpu_toolchains//:has_multi_gpu")
    if gpu_count >= 4:
        count_constraints.append("@mojo_gpu_toolchains//:has_4_gpus")
    return count_constraints

def _detect_nvidia_constraints(rctx, nvidia_smi, gpu_mapping):
    """Detects NVIDIA GPUs with nvidia-smi and returns the matching constraints.

    Args:
        rctx: The repository context.
        nvidia_smi: Path to the nvidia-smi binary, or a falsy value when absent.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        A list of constraint labels; empty when nvidia-smi is missing, fails,
        or reports a GPU that maps to no constraint.
    """
    if not nvidia_smi:
        return []

    result = rctx.execute([nvidia_smi, "--query-gpu=gpu_name", "--format=csv,noheader"])
    _log_result(rctx, nvidia_smi, result)
    if result.return_code != 0:
        return []

    lines = result.stdout.splitlines()
    if len(lines) == 0:
        fail("nvidia-smi succeeded but had no GPUs, please report this issue")

    constraint = _get_nvidia_constraint(rctx, lines, gpu_mapping)
    if not constraint:
        return []

    return [
        "@mojo_gpu_toolchains//:nvidia_gpu",
        "@mojo_gpu_toolchains//:has_gpu",
        constraint,
    ] + _gpu_count_constraints(len(lines))

def _detect_amd_constraints(rctx, amd_smi, rocm_smi, gpu_mapping):
    """Detects AMD GPUs with amd-smi, falling back to rocm-smi.

    Args:
        rctx: The repository context.
        amd_smi: Path to the amd-smi binary, or a falsy value when absent.
        rocm_smi: Path to the rocm-smi binary, or a falsy value when absent.
        gpu_mapping: Dict of GPU name substrings to supported GPU names.

    Returns:
        A list of constraint labels; empty when no supported AMD GPU is found.
    """
    if not amd_smi:
        return _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping)

    result = rctx.execute([amd_smi, "static", "--json"])
    _log_result(rctx, amd_smi, result)
    if result.return_code != 0:
        # amd-smi can fail when rocm-smi succeeds, fallback accordingly
        return _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping)

    # amd-smi outputs warnings to stdout, filter them out
    json_lines = [
        line
        for line in result.stdout.splitlines()
        if not line.startswith("WARNING:")
    ]

    # A sentinel default lets a decode error become a readable failure.
    failure_sentinel = {"DECODE": "FAILED"}
    blob = json.decode("\n".join(json_lines), default = failure_sentinel)
    if blob == failure_sentinel:
        fail("amd-smi output was not valid json, please report this issue: {}".format(result.stdout))

    # The GPU entries may be wrapped under a "gpu_data" key; unwrap when present.
    if "gpu_data" in blob:
        blob = blob["gpu_data"]

    if len(blob) == 0:
        fail("amd-smi succeeded but didn't actually have any GPUs, please report this issue")

    amd_constraint = _get_amd_constraint(rctx, blob, gpu_mapping)
    if not amd_constraint:
        return []

    return [
        amd_constraint,
        "@mojo_gpu_toolchains//:amd_gpu",
        "@mojo_gpu_toolchains//:has_gpu",
    ] + _gpu_count_constraints(len(blob))

def _impl(rctx):
    """Generates a repository holding the `mojo_host_platform` platform target.

    On Linux (amd64/aarch64) both NVIDIA and AMD GPUs are probed; on macOS
    (aarch64) the Apple GPU is probed. The detected constraint labels are
    written into the `constraint_values` of the generated platform.

    Args:
        rctx: The repository context.
    """
    constraints = []

    if rctx.os.name == "linux" and (rctx.os.arch == "amd64" or rctx.os.arch == "aarch64"):
        # A system may have both rocm-smi and nvidia-smi installed, check both.
        nvidia_smi = rctx.which("nvidia-smi")

        # amd-smi supersedes rocm-smi
        amd_smi = rctx.which("amd-smi")
        rocm_smi = rctx.which("rocm-smi")
        _verbose_log(rctx, "nvidia-smi path: {}, rocm-smi path: {}, amd-smi path: {}".format(nvidia_smi, rocm_smi, amd_smi))

        constraints.extend(_detect_nvidia_constraints(rctx, nvidia_smi, rctx.attr.gpu_mapping))
        constraints.extend(_detect_amd_constraints(rctx, amd_smi, rocm_smi, rctx.attr.gpu_mapping))
    elif rctx.os.name == "mac os x" and rctx.os.arch == "aarch64":
        apple_constraint = _get_apple_constraint(rctx, rctx.attr.gpu_mapping)
        if apple_constraint:
            constraints.extend([
                apple_constraint,
                "@mojo_gpu_toolchains//:apple_gpu",
                "@mojo_gpu_toolchains//:has_gpu",
            ])

    # When both vendors are detected, shared labels such as has_gpu are
    # collected twice; keep the first occurrence so each label is listed once.
    unique_constraints = []
    for constraint in constraints:
        if constraint not in unique_constraints:
            unique_constraints.append(constraint)

    # The name is quoted so the generated file is syntactically valid.
    rctx.file("WORKSPACE.bazel", "workspace(name = \"{}\")".format(rctx.attr.name))
    rctx.file("BUILD.bazel", """
platform(
    name = "mojo_host_platform",
    parents = ["@platforms//host"],
    visibility = ["//visibility:public"],
    constraint_values = [{constraints}],
    exec_properties = {{
        "no-remote-exec": "1",
    }},
)
""".format(constraints = ", ".join(['"{}"'.format(x) for x in unique_constraints])))
# Repository rule that probes the host for GPUs and generates a
# `mojo_host_platform` platform target carrying the matching constraint values.
mojo_host_platform = repository_rule(
    implementation = _impl,
    doc = "Generates a `mojo_host_platform` platform that extends `@platforms//host` with constraint values for the GPUs detected on the host machine.",
    configure = True,
    # Environment variables read by the implementation while detecting GPUs.
    environ = [
        "MOJO_IGNORE_UNKNOWN_GPUS",
        "MOJO_VERBOSE_GPU_DETECT",
    ],
    attrs = {
        "gpu_mapping": attr.string_dict(
            doc = "A dictionary of GPU strings from nvidia-smi, amd-smi, rocm-smi or system_profiler, mapped to supported GPUs defined by mojo.gpu_toolchains(). Keys are matched as substrings of the tool output; a key mapped to an empty string marks a known GPU that produces no GPU constraints.",
        ),
    },
)