Skip to content

Commit 401dd36

Browse files
apivovarov and tqchen authored
[Jenkins] Use 8 and 16 jobs to build for GPU and CPU (#254)
* [Jenkins] Use 8 and 16 jobs to build for GPU and CPU * [TEST] Refactor RPC test to isolate runs into a sub-function (apache#8656) We kill the RPC server in the del function. When a server co-exists with remote resources in the same function scope, the destruction order is not determined. This can cause the server to be destructed before the actual remote array. As a side effect, it can sometimes cause a test to time out while waiting on the socket. Co-authored-by: Tianqi Chen <[email protected]>
1 parent c4c4cd7 commit 401dd36

9 files changed

+152
-123
lines changed

Jenkinsfile

+6-6
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,11 @@ stage('Build') {
165165
ws(per_exec_ws("tvm/build-gpu")) {
166166
init_git()
167167
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh"
168-
make(ci_gpu, 'build', '-j2')
168+
make(ci_gpu, 'build', '-j8')
169169
pack_lib('gpu', tvm_multilib)
170170
// compiler test
171171
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu_vulkan.sh"
172-
make(ci_gpu, 'build2', '-j2')
172+
make(ci_gpu, 'build2', '-j8')
173173
}
174174
}
175175
},
@@ -178,7 +178,7 @@ stage('Build') {
178178
ws(per_exec_ws("tvm/build-cpu")) {
179179
init_git()
180180
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh"
181-
make(ci_cpu, 'build', '-j2')
181+
make(ci_cpu, 'build', '-j16')
182182
pack_lib('cpu', tvm_multilib)
183183
timeout(time: max_time, unit: 'MINUTES') {
184184
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh"
@@ -199,7 +199,7 @@ stage('Build') {
199199
ws(per_exec_ws("tvm/build-wasm")) {
200200
init_git()
201201
sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh"
202-
make(ci_wasm, 'build', '-j2')
202+
make(ci_wasm, 'build', '-j16')
203203
timeout(time: max_time, unit: 'MINUTES') {
204204
sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_setup.sh"
205205
sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh"
@@ -212,7 +212,7 @@ stage('Build') {
212212
ws(per_exec_ws("tvm/build-i386")) {
213213
init_git()
214214
sh "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh"
215-
make(ci_i386, 'build', '-j2')
215+
make(ci_i386, 'build', '-j16')
216216
pack_lib('i386', tvm_multilib)
217217
}
218218
}
@@ -232,7 +232,7 @@ stage('Build') {
232232
ws(per_exec_ws("tvm/build-qemu")) {
233233
init_git()
234234
sh "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh"
235-
make(ci_qemu, 'build', '-j2')
235+
make(ci_qemu, 'build', '-j16')
236236
timeout(time: max_time, unit: 'MINUTES') {
237237
sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_setup.sh"
238238
sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh"

tests/python/contrib/test_edgetpu_runtime.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def init_interpreter(model_path, target_edgetpu):
5151
interpreter = tflite.Interpreter(model_path=model_path)
5252
return interpreter
5353

54-
def check_remote(target_edgetpu=False):
54+
def check_remote(server, target_edgetpu=False):
5555
tflite_model_path = get_tflite_model_path(target_edgetpu)
5656

5757
# inference via tflite interpreter python apis
@@ -67,7 +67,6 @@ def check_remote(target_edgetpu=False):
6767
tflite_output = interpreter.get_tensor(output_details[0]["index"])
6868

6969
# inference via remote tvm tflite runtime
70-
server = rpc.Server("127.0.0.1")
7170
remote = rpc.connect(server.host, server.port)
7271
dev = remote.cpu(0)
7372
if target_edgetpu:
@@ -83,9 +82,9 @@ def check_remote(target_edgetpu=False):
8382
np.testing.assert_equal(out.numpy(), tflite_output)
8483

8584
# Target CPU on coral board
86-
check_remote()
85+
check_remote(rpc.Server("127.0.0.1"))
8786
# Target EdgeTPU on coral board
88-
check_remote(target_edgetpu=True)
87+
check_remote(rpc.Server("127.0.0.1"), target_edgetpu=True)
8988

9089

9190
if __name__ == "__main__":

tests/python/contrib/test_random.py

+12-9
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,20 @@ def test_rpc(dtype):
122122
return
123123

124124
np_ones = np.ones((512, 512), dtype=dtype)
125-
server = rpc.Server("127.0.0.1")
126-
remote = rpc.connect(server.host, server.port)
127-
value = tvm.nd.empty((512, 512), dtype, remote.cpu())
128-
random_fill = remote.get_function("tvm.contrib.random.random_fill")
129-
random_fill(value)
130125

131-
assert np.count_nonzero(value.numpy()) == 512 * 512
126+
def check_remote(server):
127+
remote = rpc.connect(server.host, server.port)
128+
value = tvm.nd.empty((512, 512), dtype, remote.cpu())
129+
random_fill = remote.get_function("tvm.contrib.random.random_fill")
130+
random_fill(value)
132131

133-
# make sure arithmentic doesn't overflow too
134-
np_values = value.numpy()
135-
assert np.isfinite(np_values * np_values + np_values).any()
132+
assert np.count_nonzero(value.numpy()) == 512 * 512
133+
134+
# make sure arithmentic doesn't overflow too
135+
np_values = value.numpy()
136+
assert np.isfinite(np_values * np_values + np_values).any()
137+
138+
check_remote(rpc.Server("127.0.0.1"))
136139

137140
for dtype in [
138141
"bool",

tests/python/contrib/test_tflite_runtime.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -128,18 +128,18 @@ def test_remote():
128128
tflite_output = interpreter.get_tensor(output_details[0]["index"])
129129

130130
# inference via remote tvm tflite runtime
131-
server = rpc.Server("127.0.0.1")
132-
remote = rpc.connect(server.host, server.port)
133-
a = remote.upload(tflite_model_path)
134-
135-
with open(tflite_model_path, "rb") as model_fin:
136-
runtime = tflite_runtime.create(model_fin.read(), remote.cpu(0))
137-
runtime.set_input(0, tvm.nd.array(tflite_input, remote.cpu(0)))
138-
runtime.invoke()
139-
out = runtime.get_output(0)
140-
np.testing.assert_equal(out.numpy(), tflite_output)
141-
142-
server.terminate()
131+
def check_remote(server):
132+
remote = rpc.connect(server.host, server.port)
133+
a = remote.upload(tflite_model_path)
134+
135+
with open(tflite_model_path, "rb") as model_fin:
136+
runtime = tflite_runtime.create(model_fin.read(), remote.cpu(0))
137+
runtime.set_input(0, tvm.nd.array(tflite_input, remote.cpu(0)))
138+
runtime.invoke()
139+
out = runtime.get_output(0)
140+
np.testing.assert_equal(out.numpy(), tflite_output)
141+
142+
check_remote(rpc.Server("127.0.0.1"))
143143

144144

145145
if __name__ == "__main__":

tests/python/relay/test_vm.py

+19-23
Original file line numberDiff line numberDiff line change
@@ -853,29 +853,25 @@ def test_vm_rpc():
853853
# Use local rpc server for testing.
854854
# Server must use popen so it doesn't inherit the current process state. It
855855
# will crash otherwise.
856-
server = rpc.Server("localhost", port=9120)
857-
remote = rpc.connect(server.host, server.port, session_timeout=10)
858-
859-
# Upload the serialized Executable.
860-
remote.upload(path)
861-
# Get a handle to remote Executable.
862-
rexec = remote.load_module("vm_library.so")
863-
864-
ctx = remote.cpu()
865-
# Build a VM out of the executable and context.
866-
vm_factory = runtime.vm.VirtualMachine(rexec, ctx)
867-
np_input = np.random.uniform(size=(10, 1)).astype("float32")
868-
input_tensor = tvm.nd.array(np_input, ctx)
869-
# Invoke its "main" function.
870-
out = vm_factory.invoke("main", input_tensor)
871-
# Check the result.
872-
np.testing.assert_allclose(out.numpy(), np_input + np_input)
873-
874-
# delete tensors before the server shuts down so we don't throw errors.
875-
del input_tensor
876-
del out
877-
878-
server.terminate()
856+
def check_remote(server):
857+
remote = rpc.connect(server.host, server.port, session_timeout=10)
858+
859+
# Upload the serialized Executable.
860+
remote.upload(path)
861+
# Get a handle to remote Executable.
862+
rexec = remote.load_module("vm_library.so")
863+
864+
ctx = remote.cpu()
865+
# Build a VM out of the executable and context.
866+
vm_factory = runtime.vm.VirtualMachine(rexec, ctx)
867+
np_input = np.random.uniform(size=(10, 1)).astype("float32")
868+
input_tensor = tvm.nd.array(np_input, ctx)
869+
# Invoke its "main" function.
870+
out = vm_factory.invoke("main", input_tensor)
871+
# Check the result.
872+
np.testing.assert_allclose(out.numpy(), np_input + np_input)
873+
874+
check_remote(rpc.Server("127.0.0.1"))
879875

880876

881877
def test_get_output_single():

tests/python/unittest/test_runtime_graph.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,8 @@ def check_verify():
6565
out = mod.get_output(0, tvm.nd.empty((n,)))
6666
np.testing.assert_equal(out.numpy(), a + 1)
6767

68-
def check_remote():
68+
def check_remote(server):
6969
mlib = tvm.build(s, [A, B], "llvm", name="myadd")
70-
server = rpc.Server("127.0.0.1")
7170
remote = rpc.connect(server.host, server.port)
7271
temp = utils.tempdir()
7372
dev = remote.cpu(0)
@@ -115,7 +114,7 @@ def check_sharing():
115114
del mod
116115

117116
check_verify()
118-
check_remote()
117+
check_remote(rpc.Server("127.0.0.1"))
119118
check_sharing()
120119

121120

tests/python/unittest/test_runtime_graph_debug.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333

3434
@tvm.testing.requires_llvm
35+
@tvm.testing.requires_rpc
3536
def test_graph_simple():
3637
n = 4
3738
A = te.placeholder((n,), name="A")
@@ -160,9 +161,8 @@ def split_debug_line(i):
160161
# verify dump root delete after cleanup
161162
assert not os.path.exists(directory)
162163

163-
def check_remote():
164+
def check_remote(server):
164165
mlib = tvm.build(s, [A, B], "llvm", name="myadd")
165-
server = rpc.Server("127.0.0.1")
166166
remote = rpc.connect(server.host, server.port)
167167
temp = utils.tempdir()
168168
dev = remote.cpu(0)
@@ -182,7 +182,7 @@ def check_remote():
182182
np.testing.assert_equal(out.numpy(), a + 1)
183183

184184
check_verify()
185-
check_remote()
185+
check_remote(rpc.Server("127.0.0.1"))
186186

187187

188188
if __name__ == "__main__":

tests/python/unittest/test_runtime_module_based_interface.py

+25-23
Original file line numberDiff line numberDiff line change
@@ -275,29 +275,31 @@ def verify_rpc_gpu_export(obj_format):
275275

276276
from tvm import rpc
277277

278-
server = rpc.Server("127.0.0.1", port=9094)
279-
remote = rpc.connect(server.host, server.port)
280-
remote.upload(path_lib)
281-
loaded_lib = remote.load_module(path_lib)
282-
data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
283-
dev = remote.cuda()
284-
285-
# raw api
286-
gmod = loaded_lib["default"](dev)
287-
set_input = gmod["set_input"]
288-
run = gmod["run"]
289-
get_output = gmod["get_output"]
290-
set_input("data", tvm.nd.array(data, device=dev))
291-
run()
292-
out = get_output(0).numpy()
293-
tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
294-
295-
# graph executor wrapper
296-
gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
297-
gmod.set_input("data", data)
298-
gmod.run()
299-
out = gmod.get_output(0).numpy()
300-
tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
278+
def check_remote(server):
279+
remote = rpc.connect(server.host, server.port)
280+
remote.upload(path_lib)
281+
loaded_lib = remote.load_module(path_lib)
282+
data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
283+
dev = remote.cuda()
284+
285+
# raw api
286+
gmod = loaded_lib["default"](dev)
287+
set_input = gmod["set_input"]
288+
run = gmod["run"]
289+
get_output = gmod["get_output"]
290+
set_input("data", tvm.nd.array(data, device=dev))
291+
run()
292+
out = get_output(0).numpy()
293+
tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
294+
295+
# graph executor wrapper
296+
gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
297+
gmod.set_input("data", data)
298+
gmod.run()
299+
out = gmod.get_output(0).numpy()
300+
tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
301+
302+
check_remote(rpc.Server("127.0.0.1"))
301303

302304
for obj_format in [".so", ".tar"]:
303305
verify_cpu_export(obj_format)

0 commit comments

Comments
 (0)