diff --git a/docs/user_guide/install.md b/docs/user_guide/install.md index 87854649..02a30343 100644 --- a/docs/user_guide/install.md +++ b/docs/user_guide/install.md @@ -82,12 +82,10 @@ For Linux+GPU devices, Parallax provides a docker environment for quick setup. C Run a docker container as below. Please note that generally the argument ```--gpus all``` is necessary for the docker to run on GPUs. ```sh -# For Blackwell -docker run -it --gpus all --network host gradientservice/parallax:latest-blackwell bash -# For Ampere/Hopper -docker run -it --gpus all --network host gradientservice/parallax:latest-hopper bash +# For Blackwell/Ampere/Hopper +docker run -it --gpus all --network host gradientservice/parallax:latest bash # For DGX Spark -docker run -it --gpus all --network host gradientservice/parallax:spark-spark bash +docker run -it --gpus all --network host gradientservice/parallax:latest-spark bash ``` The container starts under parallax workspace and you should be able to run parallax directly. diff --git a/src/parallax/launch.py b/src/parallax/launch.py index f26f2e60..56589445 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -136,7 +136,9 @@ ) args.start_layer = gradient_server.block_start_index args.end_layer = gradient_server.block_end_index - args.model_path = gradient_server.model_name + # Only read model_name from scheduler if model_path is not set, so we can use local path as model_path + if args.model_path is None: + args.model_path = gradient_server.model_name args.tp_size = gradient_server.tp_size logger.debug( diff --git a/src/scheduling/request_routing.py b/src/scheduling/request_routing.py index e4d5e09b..b478607f 100644 --- a/src/scheduling/request_routing.py +++ b/src/scheduling/request_routing.py @@ -430,7 +430,7 @@ def find_optimal_path(self, nodes: List[Node], num_layers: int) -> Tuple[List[st prev = node self._rr_cursor += 1 attempts += 1 - if viable: + if viable and total_latency != float("inf"): return candidate_ids, total_latency # Attempt a one-shot repair if the selected pipeline is not viable repaired = self._attempt_repair_pipeline(candidate_ids, nodes, num_layers)