GradientHQ
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/scripts/download_model_shard.sh b/scripts/download_model_shard.sh
Lines changed: 0 additions & 45 deletions
diff --git a/scripts/download_shard.py b/scripts/download_shard.py
Lines changed: 0 additions & 81 deletions
diff --git a/src/backend/benchmark/backend_request_func.py b/src/backend/benchmark/backend_request_func.py
Lines changed: 4 additions & 5 deletions
diff --git a/src/parallax/server/shard_loader.py b/src/parallax/server/shard_loader.py
Lines changed: 5 additions & 5 deletions
diff --git a/src/parallax/sglang/model_runner.py b/src/parallax/sglang/model_runner.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/parallax/utils/model_download.py b/src/parallax/utils/model_download.py
Lines changed: 153 additions & 0 deletions
@@ -20,6 +20,7 @@ dependencies = [
   "msgpack>=1.0.7",
   "safetensors>=0.5.1",
   "huggingface-hub",
+  "modelscope",
   "transformers>=4.57.1",
   "jinja2>=3.1.0",
   "numpy>=1.26",
@@ -69,7 +70,6 @@ benchmark = [
   "tqdm",
   "datasets",
   "pillow",
-  "modelscope",
 ]
 
 dev = [
 
@@ -11,11 +11,11 @@
 from typing import List, Optional, Union
 
 import aiohttp
-import huggingface_hub.constants
-from huggingface_hub import snapshot_download
 from tqdm.asyncio import tqdm
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
+from parallax.utils.model_download import download_model_snapshot
+
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
 
@@ -268,12 +268,11 @@ async def async_request_openai_chat_completions(
 
 def get_model(pretrained_model_name_or_path: str) -> str:
 
-    model_path = snapshot_download(
+    model_path = download_model_snapshot(
         repo_id=pretrained_model_name_or_path,
-        local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
         ignore_patterns=[".*.pt", ".*.safetensors", ".*.bin"],
     )
-    return model_path
+    return str(model_path)
 
 
 def get_tokenizer(
 
@@ -10,7 +10,6 @@
 from typing import Any, Dict, Optional, Tuple
 
 import mlx.core as mx
-from huggingface_hub import snapshot_download
 from mlx import nn
 from mlx.utils import tree_unflatten
 from mlx_lm.models.switch_layers import QuantizedSwitchLinear, SwitchLinear
@@ -19,6 +18,7 @@
 from mlx_lm.utils import _download, load_config
 
 from parallax.server.model import ShardedModel
+from parallax.utils.model_download import download_model_snapshot
 from parallax.utils.tokenizer_utils import load_tokenizer
 from parallax.utils.utils import normalize_model_config
 from parallax_utils.logging_config import get_logger
@@ -195,7 +195,7 @@ def load_lora(self, base_model: nn.Module, adapter_path: str) -> nn.Module:
                 logger.info(
                     f"Adapter path {adapter_path} not found locally. Attempting to download from Hugging Face..."
                 )
-                downloaded_path = snapshot_download(
+                downloaded_path = download_model_snapshot(
                     repo_id=str(adapter_path), local_dir=str(adapter_path)
                 )
                 adapter_path = pathlib.Path(downloaded_path)
@@ -236,14 +236,14 @@ def load(
             A tuple containing the loaded sharded MLX model and its configuration dictionary.
         """
         if use_selective_download and self.start_layer is not None and self.end_layer is not None:
-            from parallax.utils.selective_download import (
-                get_model_path_with_selective_download,
+            from parallax.utils.model_download import (
+                selective_model_download,
             )
 
             logger.info(
                 f"Using selective download for layers [{self.start_layer}, {self.end_layer})"
             )
-            model_path = get_model_path_with_selective_download(
+            model_path = selective_model_download(
                 self.model_path_str,
                 start_layer=self.start_layer,
                 end_layer=self.end_layer,
 
@@ -303,12 +303,12 @@ def initialize_sgl_model_runner(
     use_hfcache = kwargs.get("use_hfcache", False)
     nccl_port = kwargs.get("nccl_port", None)
     # Use selective download for GPU models to save bandwidth and disk space
-    from parallax.utils.selective_download import get_model_path_with_selective_download
+    from parallax.utils.model_download import selective_model_download
 
     logger.info(
         f"Downloading model with selective weight files for layers [{start_layer}, {end_layer})"
     )
-    model_path = get_model_path_with_selective_download(
+    model_path = selective_model_download(
         model_repo, start_layer=start_layer, end_layer=end_layer, local_files_only=use_hfcache
     )
 
 
@@ -0,0 +1,153 @@
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+from huggingface_hub import hf_hub_download as _hf_hub_download
+from huggingface_hub import snapshot_download as _snapshot_download
+from modelscope import snapshot_download as _ms_snapshot_download
+from modelscope.hub.file_download import model_file_download as _ms_model_file_download
+
+from parallax.utils.weight_filter_utils import (
+    determine_needed_weight_files_for_download,
+)
+
+logger = logging.getLogger(__name__)
+_USE_MODELSCOPE_ENV = "USE_MODELSCOPE"
+
+__all__ = [
+    "download_model_file",
+    "download_model_snapshot",
+    "selective_model_download",
+]
+
+
+def download_model_snapshot(
+    repo_id: str,
+    allow_patterns: Optional[list[str] | str] = None,
+    ignore_patterns: Optional[list[str] | str] = None,
+    local_dir: Optional[str | Path] = None,
+    local_files_only: bool = False,
+) -> Path:
+    """Download a repository snapshot and return its local directory.
+
+    The snapshot is fetched through ModelScope when ``_use_modelscope()``
+    returns true, and through the Hugging Face Hub otherwise.
+
+    Args:
+        repo_id: Repository identifier; passed as ``model_id`` to ModelScope
+            or as ``repo_id`` to the Hugging Face Hub.
+        allow_patterns: Forwarded unchanged to the selected backend.
+        ignore_patterns: Forwarded unchanged to the selected backend.
+        local_dir: Optional target directory, forwarded to the backend.
+        local_files_only: Forwarded unchanged to the selected backend.
+
+    Returns:
+        The path reported by the backend, wrapped in a ``Path``.
+    """
+    if not _use_modelscope():
+        hf_snapshot_dir = _snapshot_download(
+            repo_id=repo_id,
+            allow_patterns=allow_patterns,
+            ignore_patterns=ignore_patterns,
+            local_dir=local_dir,
+            local_files_only=local_files_only,
+        )
+        return Path(hf_snapshot_dir)
+
+    # ModelScope is handed the target directory as a plain string (or None).
+    ms_local_dir = None if local_dir is None else str(local_dir)
+    ms_snapshot_dir = _ms_snapshot_download(
+        model_id=repo_id,
+        allow_patterns=allow_patterns,
+        ignore_patterns=ignore_patterns,
+        local_dir=ms_local_dir,
+        local_files_only=local_files_only,
+    )
+    return Path(ms_snapshot_dir)
+
+
+def download_model_file(
+    repo_id: str,
+    filename: str,
+    local_files_only: bool = False,
+) -> Path:
+    """Download a single file from a repository and return its local path.
+
+    The file is fetched through ModelScope when ``_use_modelscope()`` returns
+    true, and through the Hugging Face Hub otherwise.
+
+    Args:
+        repo_id: Repository identifier; passed as ``model_id`` to ModelScope
+            or as ``repo_id`` to the Hugging Face Hub.
+        filename: File to fetch; passed as ``file_path`` to ModelScope or as
+            ``filename`` to the Hugging Face Hub.
+        local_files_only: Forwarded unchanged to the selected backend.
+
+    Returns:
+        The path reported by the backend, wrapped in a ``Path``.
+    """
+    if not _use_modelscope():
+        hf_file = _hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_files_only=local_files_only,
+        )
+        return Path(hf_file)
+
+    ms_file = _ms_model_file_download(
+        model_id=repo_id,
+        file_path=filename,
+        local_files_only=local_files_only,
+    )
+    return Path(ms_file)
+
+
+def selective_model_download(
+    repo_id: str,
+    start_layer: Optional[int] = None,
+    end_layer: Optional[int] = None,
+    local_files_only: bool = False,
+) -> Path:
+    """Download model metadata plus only the weight files a layer range needs.
+
+    If ``repo_id`` is an existing local path it is returned unchanged and
+    nothing is downloaded. Otherwise a snapshot that skips
+    ``_EXCLUDE_WEIGHT_PATTERNS`` is fetched first; weight files are then added
+    either selectively (when both layer bounds are given and the needed files
+    can be determined) or by downloading the full snapshot.
+
+    Args:
+        repo_id: Hub repository identifier, or a path to a local model.
+        start_layer: First layer of the range (logged as ``[start, end)``).
+        end_layer: End of the layer range.
+        local_files_only: Forwarded to every download call.
+
+    Returns:
+        The local path itself when ``repo_id`` exists on disk, otherwise the
+        directory of the metadata snapshot.
+
+    Raises:
+        Exception: Whatever a single weight-file download raises is logged
+            and re-raised unchanged.
+    """
+    local_path = Path(repo_id)
+    if local_path.exists():
+        logger.debug(f"Using local model path: {local_path}")
+        return local_path
+
+    # Step 1: Download everything except the weight files
+    logger.debug(f"Downloading model metadata for {repo_id}")
+    model_path = download_model_snapshot(
+        repo_id=repo_id,
+        ignore_patterns=_EXCLUDE_WEIGHT_PATTERNS,
+        local_files_only=local_files_only,
+    )
+    logger.debug(f"Downloaded model metadata to {model_path}")
+
+    if start_layer is None or end_layer is None:
+        logger.debug("No layer range specified, downloading all model files")
+        download_model_snapshot(repo_id=repo_id, local_files_only=local_files_only)
+        return model_path
+
+    # Step 2: Work out which weight files cover the requested layers
+    logger.debug(f"Determining required weight files for layers [{start_layer}, {end_layer})")
+    needed_weight_files = determine_needed_weight_files_for_download(
+        model_path=model_path,
+        start_layer=start_layer,
+        end_layer=end_layer,
+    )
+
+    if not needed_weight_files:
+        logger.debug("Could not determine specific weight files, downloading all")
+        download_model_snapshot(repo_id=repo_id, local_files_only=local_files_only)
+        return model_path
+
+    # Step 3: Download only the needed weight files
+    logger.info(f"Downloading {len(needed_weight_files)} weight files")
+
+    for weight_file in needed_weight_files:
+        # Check if file already exists in local cache before downloading
+        weight_file_path = model_path / weight_file
+        if weight_file_path.exists():
+            continue
+
+        logger.debug(f"Downloading {weight_file}")
+        try:
+            download_model_file(
+                repo_id=repo_id,
+                filename=weight_file,
+                local_files_only=local_files_only,
+            )
+        except Exception as e:
+            logger.error(f"Failed to download {weight_file} for {repo_id}: {e}")
+            if local_files_only:
+                # No network request is expected in this mode, so a
+                # connectivity hint would point the operator the wrong way.
+                logger.error(
+                    "local_files_only is set, so weight files must already be "
+                    "available locally. Please pre-download the model."
+                )
+            else:
+                # Name the hub that was actually used for this download.
+                hub_name = "ModelScope" if _use_modelscope() else "Hugging Face Hub"
+                logger.error(
+                    f"This node cannot reach {hub_name} to download weight files. "
+                    "Please check network connectivity or pre-download the model."
+                )
+            raise
+
+    logger.debug(f"Downloaded weight files for layers [{start_layer}, {end_layer})")
+    return model_path
+
+
+# Patterns that selective_model_download passes as ``ignore_patterns`` for
+# its first (metadata) snapshot, so that weight files are left out.
+# NOTE(review): the last three entries look already covered by "*.bin" and
+# "*.safetensors" -- confirm against the hub pattern matching before trimming.
+_EXCLUDE_WEIGHT_PATTERNS = [
+    "*.safetensors",
+    "*.bin",
+    "*.pt",
+    "*.pth",
+    "pytorch_model*.bin",
+    "model*.safetensors",
+    "weight*.safetensors",
+]
+
+
+def _use_modelscope() -> bool:
+    """Return whether downloads should go through ModelScope.
+
+    The decision is read from the environment variable named by
+    ``_USE_MODELSCOPE_ENV`` on every call.
+
+    Returns:
+        ``False`` when the variable is unset or set to an explicit "off"
+        spelling (``0``, ``false``, ``no``, ``off``; case-insensitive,
+        surrounding whitespace ignored); ``True`` for any other value.
+    """
+    flag = os.environ.get(_USE_MODELSCOPE_ENV)
+    if flag is None:
+        return False
+    # Any other value (including an empty string) still enables ModelScope,
+    # so setups that only rely on the variable being present keep working.
+    return flag.strip().lower() not in {"0", "false", "no", "off"}
Original file line number	Diff line number	Diff line change
`@@ -303,12 +303,12 @@ def initialize_sgl_model_runner(`
`303`	`303`	`use_hfcache = kwargs.get("use_hfcache", False)`
`304`	`304`	`nccl_port = kwargs.get("nccl_port", None)`
`305`	`305`	`# Use selective download for GPU models to save bandwidth and disk space`
`306`		`- from parallax.utils.selective_download import get_model_path_with_selective_download`
	`306`	`+ from parallax.utils.model_download import selective_model_download`
`307`	`307`
`308`	`308`	`logger.info(`
`309`	`309`	`f"Downloading model with selective weight files for layers [{start_layer}, {end_layer})"`
`310`	`310`	`)`
`311`		`- model_path = get_model_path_with_selective_download(`
	`311`	`+ model_path = selective_model_download(`
`312`	`312`	`model_repo, start_layer=start_layer, end_layer=end_layer, local_files_only=use_hfcache`
`313`	`313`	`)`
`314`	`314`