Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion src/mmirage/core/process/processors/llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dataclasses import dataclass, field

import logging
import os
from typing import Dict, Optional, Sequence, Type, Any, List
from pydantic import BaseModel, create_model

Expand All @@ -15,6 +16,39 @@
env = Environment()


def _parse_tp_size_from_env() -> int:
    """Derive the default tensor-parallel size from ``SLURM_GPUS_ON_NODE``.

    The variable is looked up in the process environment on every call.
    Any value that is not a strictly positive integer falls back to 1:

    - unset or empty variable -> 1, without logging
    - text that ``int()`` rejects after whitespace is stripped -> 1, with
      a warning
    - zero or a negative number -> 1, with a warning

    Returns:
        The parsed value when it is >= 1, otherwise 1.
    """
    raw = os.environ.get("SLURM_GPUS_ON_NODE")
    if raw is None or raw == "":
        return 1

    try:
        gpu_count = int(raw.strip())
    except ValueError:
        # int() rejected the text (e.g. "abc", "2.5", or whitespace only).
        logger.warning(
            f"Invalid SLURM_GPUS_ON_NODE value '{raw}', defaulting tp_size to 1"
        )
        return 1

    if gpu_count >= 1:
        return gpu_count

    # Parsed fine but is zero or negative, which is not a usable size.
    logger.warning(
        f"Invalid SLURM_GPUS_ON_NODE value '{raw}' (must be > 0), defaulting tp_size to 1"
    )
    return 1


@dataclass
class SGLangServerArgs:
"""Server arguments for SGLang engine.
Expand All @@ -27,7 +61,7 @@ class SGLangServerArgs:
"""

model_path: str = "none"
tp_size: int = 1
tp_size: int = field(default_factory=_parse_tp_size_from_env)
trust_remote_code: bool = True
Comment on lines 63 to 65
Copy link

Copilot AI Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tp_size default parsing can raise ValueError if SLURM_GPUS_ON_NODE is set but not a plain integer (or includes whitespace). It can also result in tp_size=0 if the env var is "0", which is likely invalid for tensor parallelism and will fail later when constructing the SGLang engine. Consider using a small helper to parse the env var defensively (strip, try/except, and clamp to >=1), falling back to 1 on invalid values.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot open a new pull request to apply changes based on this feedback

disable_custom_all_reduce: bool = False

Expand Down
Loading