[Jenkins] Use 8 and 16 jobs to build for GPU and CPU (#254)

apivovarov · tqchen · web-flow · commit 401dd36e14e8 · 2022-05-08T11:40:47.000-07:00
* [Jenkins] Use 8 and 16 jobs to build for GPU and CPU * [TEST] Refactor RPC test to isolate runs into a sub-function (apache#8656) We kill the RPC server in the del function. When a server co-exists with remote resources in the same function scope, the destruction order is not determined. This can cause the server to be destroyed before the actual remote array. As a side effect, it can sometimes cause a test to time out while waiting on the socket. Co-authored-by: Tianqi Chen <tqchen@users.noreply.github.com>
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -165,11 +165,11 @@ stage('Build') {
       ws(per_exec_ws("tvm/build-gpu")) {
         init_git()
         sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh"
-        make(ci_gpu, 'build', '-j2')
+        make(ci_gpu, 'build', '-j8')
         pack_lib('gpu', tvm_multilib)
         // compiler test
         sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu_vulkan.sh"
-        make(ci_gpu, 'build2', '-j2')
+        make(ci_gpu, 'build2', '-j8')
       }
     }
   },
@@ -178,7 +178,7 @@ stage('Build') {
       ws(per_exec_ws("tvm/build-cpu")) {
         init_git()
         sh "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh"
-        make(ci_cpu, 'build', '-j2')
+        make(ci_cpu, 'build', '-j16')
         pack_lib('cpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh"
@@ -199,7 +199,7 @@ stage('Build') {
       ws(per_exec_ws("tvm/build-wasm")) {
         init_git()
         sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh"
-        make(ci_wasm, 'build', '-j2')
+        make(ci_wasm, 'build', '-j16')
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh"
@@ -212,7 +212,7 @@ stage('Build') {
       ws(per_exec_ws("tvm/build-i386")) {
         init_git()
         sh "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh"
-        make(ci_i386, 'build', '-j2')
+        make(ci_i386, 'build', '-j16')
         pack_lib('i386', tvm_multilib)
       }
     }
@@ -232,7 +232,7 @@ stage('Build') {
       ws(per_exec_ws("tvm/build-qemu")) {
         init_git()
         sh "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh"
-        make(ci_qemu, 'build', '-j2')
+        make(ci_qemu, 'build', '-j16')
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh"
diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py
@@ -51,7 +51,7 @@ def init_interpreter(model_path, target_edgetpu):
             interpreter = tflite.Interpreter(model_path=model_path)
         return interpreter
 
-    def check_remote(target_edgetpu=False):
+    def check_remote(server, target_edgetpu=False):
         tflite_model_path = get_tflite_model_path(target_edgetpu)
 
         # inference via tflite interpreter python apis
@@ -67,7 +67,6 @@ def check_remote(target_edgetpu=False):
         tflite_output = interpreter.get_tensor(output_details[0]["index"])
 
         # inference via remote tvm tflite runtime
-        server = rpc.Server("127.0.0.1")
         remote = rpc.connect(server.host, server.port)
         dev = remote.cpu(0)
         if target_edgetpu:
@@ -83,9 +82,9 @@ def check_remote(target_edgetpu=False):
             np.testing.assert_equal(out.numpy(), tflite_output)
 
     # Target CPU on coral board
-    check_remote()
+    check_remote(rpc.Server("127.0.0.1"))
     # Target EdgeTPU on coral board
-    check_remote(target_edgetpu=True)
+    check_remote(rpc.Server("127.0.0.1"), target_edgetpu=True)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py
@@ -122,17 +122,20 @@ def test_rpc(dtype):
             return
 
         np_ones = np.ones((512, 512), dtype=dtype)
-        server = rpc.Server("127.0.0.1")
-        remote = rpc.connect(server.host, server.port)
-        value = tvm.nd.empty((512, 512), dtype, remote.cpu())
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-        random_fill(value)
 
-        assert np.count_nonzero(value.numpy()) == 512 * 512
+        def check_remote(server):
+            remote = rpc.connect(server.host, server.port)
+            value = tvm.nd.empty((512, 512), dtype, remote.cpu())
+            random_fill = remote.get_function("tvm.contrib.random.random_fill")
+            random_fill(value)
 
-        # make sure arithmentic doesn't overflow too
-        np_values = value.numpy()
-        assert np.isfinite(np_values * np_values + np_values).any()
+            assert np.count_nonzero(value.numpy()) == 512 * 512
+
+            # make sure arithmentic doesn't overflow too
+            np_values = value.numpy()
+            assert np.isfinite(np_values * np_values + np_values).any()
+
+        check_remote(rpc.Server("127.0.0.1"))
 
     for dtype in [
         "bool",
diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py
@@ -128,18 +128,18 @@ def test_remote():
     tflite_output = interpreter.get_tensor(output_details[0]["index"])
 
     # inference via remote tvm tflite runtime
-    server = rpc.Server("127.0.0.1")
-    remote = rpc.connect(server.host, server.port)
-    a = remote.upload(tflite_model_path)
-
-    with open(tflite_model_path, "rb") as model_fin:
-        runtime = tflite_runtime.create(model_fin.read(), remote.cpu(0))
-        runtime.set_input(0, tvm.nd.array(tflite_input, remote.cpu(0)))
-        runtime.invoke()
-        out = runtime.get_output(0)
-        np.testing.assert_equal(out.numpy(), tflite_output)
-
-    server.terminate()
+    def check_remote(server):
+        remote = rpc.connect(server.host, server.port)
+        a = remote.upload(tflite_model_path)
+
+        with open(tflite_model_path, "rb") as model_fin:
+            runtime = tflite_runtime.create(model_fin.read(), remote.cpu(0))
+            runtime.set_input(0, tvm.nd.array(tflite_input, remote.cpu(0)))
+            runtime.invoke()
+            out = runtime.get_output(0)
+            np.testing.assert_equal(out.numpy(), tflite_output)
+
+    check_remote(rpc.Server("127.0.0.1"))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
@@ -853,29 +853,25 @@ def test_vm_rpc():
     # Use local rpc server for testing.
     # Server must use popen so it doesn't inherit the current process state. It
     # will crash otherwise.
-    server = rpc.Server("localhost", port=9120)
-    remote = rpc.connect(server.host, server.port, session_timeout=10)
-
-    # Upload the serialized Executable.
-    remote.upload(path)
-    # Get a handle to remote Executable.
-    rexec = remote.load_module("vm_library.so")
-
-    ctx = remote.cpu()
-    # Build a VM out of the executable and context.
-    vm_factory = runtime.vm.VirtualMachine(rexec, ctx)
-    np_input = np.random.uniform(size=(10, 1)).astype("float32")
-    input_tensor = tvm.nd.array(np_input, ctx)
-    # Invoke its "main" function.
-    out = vm_factory.invoke("main", input_tensor)
-    # Check the result.
-    np.testing.assert_allclose(out.numpy(), np_input + np_input)
-
-    # delete tensors before the server shuts down so we don't throw errors.
-    del input_tensor
-    del out
-
-    server.terminate()
+    def check_remote(server):
+        remote = rpc.connect(server.host, server.port, session_timeout=10)
+
+        # Upload the serialized Executable.
+        remote.upload(path)
+        # Get a handle to remote Executable.
+        rexec = remote.load_module("vm_library.so")
+
+        ctx = remote.cpu()
+        # Build a VM out of the executable and context.
+        vm_factory = runtime.vm.VirtualMachine(rexec, ctx)
+        np_input = np.random.uniform(size=(10, 1)).astype("float32")
+        input_tensor = tvm.nd.array(np_input, ctx)
+        # Invoke its "main" function.
+        out = vm_factory.invoke("main", input_tensor)
+        # Check the result.
+        np.testing.assert_allclose(out.numpy(), np_input + np_input)
+
+    check_remote(rpc.Server("127.0.0.1"))
 
 
 def test_get_output_single():
diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py
@@ -65,9 +65,8 @@ def check_verify():
         out = mod.get_output(0, tvm.nd.empty((n,)))
         np.testing.assert_equal(out.numpy(), a + 1)
 
-    def check_remote():
+    def check_remote(server):
         mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-        server = rpc.Server("127.0.0.1")
         remote = rpc.connect(server.host, server.port)
         temp = utils.tempdir()
         dev = remote.cpu(0)
@@ -115,7 +114,7 @@ def check_sharing():
             del mod
 
     check_verify()
-    check_remote()
+    check_remote(rpc.Server("127.0.0.1"))
     check_sharing()
 
 
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
@@ -32,6 +32,7 @@
 
 
 @tvm.testing.requires_llvm
+@tvm.testing.requires_rpc
 def test_graph_simple():
     n = 4
     A = te.placeholder((n,), name="A")
@@ -160,9 +161,8 @@ def split_debug_line(i):
         # verify dump root delete after cleanup
         assert not os.path.exists(directory)
 
-    def check_remote():
+    def check_remote(server):
         mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-        server = rpc.Server("127.0.0.1")
         remote = rpc.connect(server.host, server.port)
         temp = utils.tempdir()
         dev = remote.cpu(0)
@@ -182,7 +182,7 @@ def check_remote():
         np.testing.assert_equal(out.numpy(), a + 1)
 
     check_verify()
-    check_remote()
+    check_remote(rpc.Server("127.0.0.1"))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py
@@ -275,29 +275,31 @@ def verify_rpc_gpu_export(obj_format):
 
         from tvm import rpc
 
-        server = rpc.Server("127.0.0.1", port=9094)
-        remote = rpc.connect(server.host, server.port)
-        remote.upload(path_lib)
-        loaded_lib = remote.load_module(path_lib)
-        data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
-        dev = remote.cuda()
-
-        # raw api
-        gmod = loaded_lib["default"](dev)
-        set_input = gmod["set_input"]
-        run = gmod["run"]
-        get_output = gmod["get_output"]
-        set_input("data", tvm.nd.array(data, device=dev))
-        run()
-        out = get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
-
-        # graph executor wrapper
-        gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
-        gmod.set_input("data", data)
-        gmod.run()
-        out = gmod.get_output(0).numpy()
-        tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
+        def check_remote(server):
+            remote = rpc.connect(server.host, server.port)
+            remote.upload(path_lib)
+            loaded_lib = remote.load_module(path_lib)
+            data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
+            dev = remote.cuda()
+
+            # raw api
+            gmod = loaded_lib["default"](dev)
+            set_input = gmod["set_input"]
+            run = gmod["run"]
+            get_output = gmod["get_output"]
+            set_input("data", tvm.nd.array(data, device=dev))
+            run()
+            out = get_output(0).numpy()
+            tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
+
+            # graph executor wrapper
+            gmod = graph_executor.GraphModule(loaded_lib["default"](dev))
+            gmod.set_input("data", data)
+            gmod.run()
+            out = gmod.get_output(0).numpy()
+            tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
+
+        check_remote(rpc.Server("127.0.0.1"))
 
     for obj_format in [".so", ".tar"]:
         verify_cpu_export(obj_format)
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py