@@ -39,7 +39,6 @@
 import infer_util as iu
 import test_util as tu
 import threading
-import concurrent.futures
 
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
@@ -2626,122 +2625,6 @@ def test_load_gpu_limit(self):
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-    def test_concurrent_load_speedup(self):
-        # Initialize client
-        try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
-        except Exception as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        # Load both models concurrently
-        model_names = ["identity_zero_1_int32_1", "identity_zero_1_int32_2"]
-        threads = []
-        for model_name in model_names:
-            threads.append(
-                threading.Thread(target=triton_client.load_model,
-                                 args=(model_name,)))
-        start_time = time.time()
-        for thread in threads:
-            thread.start()
-        for thread in threads:
-            thread.join()
-        end_time = time.time()
-        loading_time = end_time - start_time
-        # Each of the two models has a minimum loading delay of 10 seconds
-        # Speedup is observed when the concurrent loading time < 20 seconds but
-        # use a tighter bound of 15 seconds
-        self.assertLess(loading_time, 15.0,
-                        "Concurrent loading speedup not observed")
-        # Concurrent loading time cannot be < 10 seconds
-        self.assertGreaterEqual(loading_time, 10.0,
-                                "Invalid concurrent loading time")
-        # Make sure the models are loaded
-        self.assertTrue(triton_client.is_server_live())
-        self.assertTrue(triton_client.is_server_ready())
-        for model_name in model_names:
-            self.assertTrue(triton_client.is_model_ready(model_name))
-
-    def test_concurrent_load(self):
-        # Initialize client
-        try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
-        except Exception as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        # Load same model concurrently
-        with concurrent.futures.ThreadPoolExecutor() as pool:
-            thread_1 = pool.submit(triton_client.load_model,
-                                   "identity_zero_1_int32")
-            time.sleep(2)  # wait between load and unload
-            thread_2 = pool.submit(triton_client.load_model,
-                                   "identity_zero_1_int32")
-            thread_1.result()
-            with self.assertRaises(Exception) as ex:
-                thread_2.result()
-            self.assertEqual(
-                str(ex.exception),
-                "[StatusCode.INVALID_ARGUMENT] a related model 'identity_zero_1_int32' to a load/unload request is currently loading or unloading"
-            )
-        self.assertTrue(triton_client.is_server_live())
-        self.assertTrue(triton_client.is_server_ready())
-        self.assertTrue(triton_client.is_model_ready("identity_zero_1_int32"))
-
-    def test_concurrent_load_unload(self):
-        # Initialize client
-        try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
-        except Exception as ex:
-            self.assertTrue(False, "unexpected error {}".format(ex))
-        # Load identity_zero_1_int32 and unload it while it is loading
-        # The unload operation should have no effect
-        with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "identity_zero_1_int32")
-            time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "identity_zero_1_int32")
-            load_thread.result()
-            with self.assertRaises(Exception) as ex:
-                unload_thread.result()
-            self.assertEqual(
-                str(ex.exception),
-                "[StatusCode.INVALID_ARGUMENT] a related model 'identity_zero_1_int32' to a load/unload request is currently loading or unloading"
-            )
-        self.assertTrue(triton_client.is_server_live())
-        self.assertTrue(triton_client.is_server_ready())
-        self.assertTrue(triton_client.is_model_ready("identity_zero_1_int32"))
-        # Load ensemble_zero_1_float32 and unload its dependency while it is loading
-        # The unload operation should have no effect
-        with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "ensemble_zero_1_float32")
-            time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "custom_zero_1_float32")
-            load_thread.result()
-            with self.assertRaises(Exception) as ex:
-                unload_thread.result()
-            self.assertEqual(
-                str(ex.exception),
-                "[StatusCode.INVALID_ARGUMENT] a related model 'custom_zero_1_float32' to a load/unload request is currently loading or unloading"
-            )
-        self.assertTrue(triton_client.is_server_live())
-        self.assertTrue(triton_client.is_server_ready())
-        self.assertTrue(triton_client.is_model_ready("ensemble_zero_1_float32"))
-        self.assertTrue(triton_client.is_model_ready("custom_zero_1_float32"))
-        # Unload models concurrently
-        model_names = ["identity_zero_1_int32", "ensemble_zero_1_float32"]
-        with concurrent.futures.ThreadPoolExecutor() as pool:
-            threads = []
-            for model_name in model_names:
-                threads.append(
-                    pool.submit(triton_client.unload_model, model_name))
-            for thread in concurrent.futures.as_completed(threads):
-                thread.result()
-        for model_name in model_names:
-            self.assertFalse(triton_client.is_model_ready(model_name))
-
 
 if __name__ == '__main__':
     unittest.main()
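For reference, the deleted tests all exercise the same pattern: overlapping load/unload requests are issued from a thread pool, and the request that arrives while the first is still in flight is expected to be rejected while the first completes normally. Below is a minimal standalone sketch of that pattern, not part of the file above; it assumes a Triton server started with explicit model control, reachable at localhost:8001, serving the identity_zero_1_int32 model named in the diff.

import concurrent.futures
import time

import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

# Endpoint and model name are taken from the tests above; adjust to your setup.
client = grpcclient.InferenceServerClient("localhost:8001")

with concurrent.futures.ThreadPoolExecutor() as pool:
    first = pool.submit(client.load_model, "identity_zero_1_int32")
    time.sleep(2)  # let the first load get underway before the second request
    second = pool.submit(client.load_model, "identity_zero_1_int32")
    first.result()  # the first load should complete normally
    try:
        second.result()  # the overlapping request is expected to be rejected
    except InferenceServerException as e:
        print("overlapping request rejected: {}".format(e))

# The model should still end up loaded by the first request.
assert client.is_model_ready("identity_zero_1_int32")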