GradientHQ
diff --git a/‎AGENTS.md‎
Lines changed: 26 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎build_rust.sh‎
Lines changed: 142 additions & 0 deletions b/‎build_rust.sh‎
Lines changed: 142 additions & 0 deletions
diff --git a/‎scripts/generate.py‎
Lines changed: 73 additions & 27 deletions b/‎scripts/generate.py‎
Lines changed: 73 additions & 27 deletions
@@ -0,0 +1,26 @@
+mac env:
+    python: .venv/bin/python
+
+gpu env:
+    ssh H200 server, then use conda activate parallax
+
+
+start a local server:
+    python src/parallax/launch.py --model-path <MODEL_NAME> --log-level DEBUG
+
+test the server:
+    curl --location 'http://localhost:3000/v1/chat/completions' --header 'Content-Type: application/json' --data '{
+        "max_tokens": 1024,
+        "messages": [
+            {
+                "role": "user",
+                "content": "hello"
+            }
+        ],
+        "stream": true
+    }'
+
+final check:
+    run when user ask to git commit
+    pre-commit run --all-files
+    pytest
@@ -0,0 +1,142 @@
+#!/bin/bash
+# Build the vLLM Rust frontend binary and install it into Parallax's pip scripts dir.
+# Usage:
+#   ./build_rust.sh [--debug]
+#
+# By default builds in release mode. Pass --debug for faster compile times
+# during development.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+show_help() {
+    cat <<'EOF'
+Build the vLLM Rust frontend binary and install it into Parallax's pip scripts dir.
+
+Usage:
+  ./build_rust.sh [--debug]
+
+Environment:
+  VLLM_REF             vLLM git branch/tag to clone. Defaults to main.
+  PARALLAX_PYTHON      Python interpreter for the Parallax installation.
+EOF
+}
+
+resolve_parallax_python() {
+    if [[ -n "${PARALLAX_PYTHON:-}" ]]; then
+        if [[ ! -x "$PARALLAX_PYTHON" ]]; then
+            echo "PARALLAX_PYTHON is not executable: $PARALLAX_PYTHON" >&2
+            exit 1
+        fi
+        echo "$PARALLAX_PYTHON"
+        return
+    fi
+
+    if [[ -n "${VIRTUAL_ENV:-}" && -x "$VIRTUAL_ENV/bin/python" ]]; then
+        echo "$VIRTUAL_ENV/bin/python"
+        return
+    fi
+
+    if [[ -n "${CONDA_PREFIX:-}" && -x "$CONDA_PREFIX/bin/python" ]]; then
+        echo "$CONDA_PREFIX/bin/python"
+        return
+    fi
+
+    if command -v python &>/dev/null; then
+        command -v python
+        return
+    fi
+
+    echo "Unable to find Python for the Parallax installation." >&2
+    echo "Activate the Parallax environment or set PARALLAX_PYTHON." >&2
+    exit 1
+}
+
+resolve_parallax_bin_dir() {
+    local parallax_python
+    parallax_python="$(resolve_parallax_python)"
+    "$parallax_python" - <<'PY'
+import sysconfig
+
+print(sysconfig.get_path("scripts"))
+PY
+}
+
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    show_help
+    exit 0
+fi
+
+if [[ $# -gt 1 || ( $# -eq 1 && "${1:-}" != "--debug" ) ]]; then
+    show_help >&2
+    exit 2
+fi
+
+VLLM_REF="${VLLM_REF:-main}"
+CLONE_PARENT="$(mktemp -d "${TMPDIR:-/tmp}/parallax-vllm-rs.XXXXXX")"
+VLLM_CLONE_ROOT="$CLONE_PARENT/vllm"
+
+cleanup_clone() {
+    rm -rf "$CLONE_PARENT"
+}
+
+trap cleanup_clone EXIT
+
+if ! command -v git &>/dev/null; then
+    echo "git not found; install git and rerun this script." >&2
+    exit 1
+fi
+
+echo "Cloning vLLM from https://github.com/vllm-project/vllm.git (ref: $VLLM_REF)"
+git clone --depth 1 --branch "$VLLM_REF" \
+    https://github.com/vllm-project/vllm.git \
+    "$VLLM_CLONE_ROOT"
+
+RUST_DIR="$VLLM_CLONE_ROOT/rust"
+PARALLAX_SCRIPTS_DIR="$(resolve_parallax_bin_dir)"
+TARGET_PATH="$PARALLAX_SCRIPTS_DIR/vllm-rs"
+
+if [[ ! -f "$RUST_DIR/Cargo.toml" || ! -f "$VLLM_CLONE_ROOT/rust-toolchain.toml" ]]; then
+    echo "Cloned repository does not contain the expected vLLM Rust frontend sources." >&2
+    exit 1
+fi
+
+# Read the required toolchain from rust-toolchain.toml.
+TOOLCHAIN=$(grep '^channel' "$VLLM_CLONE_ROOT/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')
+
+if [[ -z "$TOOLCHAIN" ]]; then
+    echo "Unable to read Rust toolchain from $VLLM_CLONE_ROOT/rust-toolchain.toml" >&2
+    exit 1
+fi
+
+# Ensure rustup and the required toolchain are available.
+if ! command -v rustup &>/dev/null; then
+    echo "rustup not found, installing..."
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
+    # shellcheck disable=SC1091
+    source "$HOME/.cargo/env"
+fi
+
+if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
+    echo "Installing Rust toolchain: $TOOLCHAIN"
+    rustup toolchain install "$TOOLCHAIN"
+fi
+
+if [[ "${1:-}" == "--debug" ]]; then
+    PROFILE_ARGS=()
+    PROFILE_DIR="debug"
+else
+    PROFILE_ARGS=(--release)
+    PROFILE_DIR="release"
+fi
+
+cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
+    --manifest-path "$RUST_DIR/Cargo.toml" \
+    --bin vllm-rs \
+    --features native-tls-vendored
+
+mkdir -p "$(dirname "$TARGET_PATH")"
+cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
+chmod +x "$TARGET_PATH"
+echo "Installed vllm-rs to $TARGET_PATH"
@@ -25,11 +25,13 @@
 import time
 
 import mlx.core as mx
+from mlx_lm.server import convert_chat, process_message_content
 
 from parallax.server.cache_manager import CacheManager
 from parallax.server.request import InitialRequest
 from parallax.server.sampling.sampler import SamplingBatchInfo
 from parallax.server.sampling.sampling_params import SamplingParams
+from parallax.server.scheduler import _normalize_token_ids
 from parallax.server.shard_loader import MLXModelLoader
 from parallax.utils.utils import create_causal_mask, get_layer_types
 
@@ -44,6 +46,40 @@ def print_rank(message):
         print(f"[Rank {tp_rank}] {message}")
 
 
+def get_eos_token_ids(config, tokenizer):
+    eos_token_id = config.get("eos_token_id")
+    tokenizer_eos_token_id = getattr(tokenizer, "eos_token_id", None)
+    if eos_token_id is None:
+        eos_token_id = tokenizer_eos_token_id
+
+    eos_token_ids = _normalize_token_ids(eos_token_id)
+    eos_token_ids.update(_normalize_token_ids(tokenizer_eos_token_id))
+    return eos_token_ids
+
+
+def build_prompt(messages, tokenizer):
+    if tokenizer.chat_template:
+        process_message_content(messages)
+        prompt_tokens = tokenizer.apply_chat_template(
+            messages,
+            None,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=False,
+        )
+        full_prompt = tokenizer.apply_chat_template(
+            messages,
+            None,
+            tokenize=False,
+            add_generation_prompt=True,
+            return_dict=False,
+        )
+    else:
+        full_prompt = convert_chat(messages, None)
+        prompt_tokens = tokenizer.encode(full_prompt)
+    return full_prompt, prompt_tokens
+
+
 def main():
     parser = argparse.ArgumentParser(description="Simple offline inference script")
     parser.add_argument(
@@ -76,6 +112,8 @@ def main():
     # 2. Initialize CacheManager
     num_layers = config.get("num_hidden_layers")
     num_kv_heads = config.get("num_key_value_heads")
+    if num_kv_heads is None:
+        num_kv_heads = config.get("num_attention_groups")
     head_dim = config.get("head_dim") or config.get("hidden_size") // config.get(
         "num_attention_heads"
     )
@@ -88,6 +126,18 @@ def main():
 
     v_head_dim = config.get("v_head_dim")
     layer_types = get_layer_types(config, 0, num_layers)
+    linear_key_head_dim = config.get("linear_key_head_dim")
+    linear_value_head_dim = config.get("linear_value_head_dim")
+    linear_conv_kernel_dim = config.get("linear_conv_kernel_dim")
+    linear_num_key_heads = config.get("linear_num_key_heads")
+    linear_num_value_heads = config.get("linear_num_value_heads")
+    key_dim, value_dim, conv_dim = None, None, None
+    if linear_key_head_dim is not None and linear_num_key_heads is not None:
+        key_dim = linear_key_head_dim * linear_num_key_heads
+    if linear_value_head_dim is not None and linear_num_value_heads is not None:
+        value_dim = linear_value_head_dim * linear_num_value_heads
+    if key_dim is not None and value_dim is not None:
+        conv_dim = key_dim * 2 + value_dim
 
     cache_manager = CacheManager(
         num_layers=num_layers,
@@ -98,19 +148,17 @@ def main():
         cache_memory_fraction=0.1,
         head_dim_v=v_head_dim,
         layer_types=layer_types,
+        conv_dim=conv_dim,
+        conv_kernel_size=linear_conv_kernel_dim,
+        linear_k_dim=linear_key_head_dim,
+        linear_v_dim=linear_value_head_dim,
+        linear_num_k_heads=linear_num_key_heads,
+        linear_num_v_heads=linear_num_value_heads,
     )
 
     # 3. Tokenize and Create Request
     messages = [{"role": "user", "content": args.prompt}]
-
-    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
-        full_prompt = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-    else:
-        full_prompt = args.prompt
-
-    prompt_tokens = tokenizer.encode(full_prompt)
+    full_prompt, prompt_tokens = build_prompt(messages, tokenizer)
     sampling_params = SamplingParams(temperature=args.temp, top_k=args.topk)
     request = InitialRequest(
         prompt=full_prompt,
@@ -119,22 +167,9 @@ def main():
         max_new_tokens=args.max_tokens,
     )
 
-    eos_token_ids = []
-    if tokenizer.eos_token_id is not None:
-        if isinstance(tokenizer.eos_token_id, list):
-            eos_token_ids.extend(tokenizer.eos_token_id)
-        else:
-            eos_token_ids.append(tokenizer.eos_token_id)
-    config_eos = config.get("eos_token_id")
-    if config_eos is not None:
-        if isinstance(config_eos, list):
-            for e in config_eos:
-                if e not in eos_token_ids:
-                    eos_token_ids.append(e)
-        elif config_eos not in eos_token_ids:
-            eos_token_ids.append(config_eos)
-
-    eos_token_ids = set(eos_token_ids)
+    eos_token_ids = get_eos_token_ids(config, tokenizer)
+    if not eos_token_ids:
+        raise ValueError("EOS token ID must be set for generation.")
 
     # 4. Prefill
     print_rank(f"Full prompt:\n {full_prompt}")
@@ -151,6 +186,9 @@ def main():
     input_ids = mx.array([request.input_ids])
     block_table = mx.array([cache_manager.get_block_table(request.request_id)], dtype=mx.int32)
     context_lengths = mx.array([request.prompt_len], dtype=mx.int32)
+    state_slot_mapping = None
+    if cache_manager.needs_slots:
+        state_slot_mapping = mx.array([cache_manager.get_slot(request.request_id)], dtype=mx.int32)
 
     block_size = cache_manager.block_size
     slot_mapping = []
@@ -172,21 +210,27 @@ def main():
         block_tables=block_table,
         context_lengths=context_lengths,
         slot_mapping=slot_mapping,
+        state_slot_mapping=state_slot_mapping,
     )
 
     sampling_info = SamplingBatchInfo.from_reqs([request])
 
     next_token_id = model.logits_to_tokens(logits, context_lengths, sampling_info)
 
     token_id = int(next_token_id[0])
-    request.commit_new_token(token_id)
+    is_finished = token_id in eos_token_ids
+    if not is_finished:
+        request.commit_new_token(token_id)
 
     prefill_time = time.perf_counter() - prefill_start
     print_rank(f"Token 1 (Prefill) time: {prefill_time * 1000:.2f} ms")
 
     # 5. Decode Loop
     total_decode_time = 0
     for i in range(args.max_tokens - 1):
+        if is_finished:
+            break
+
         decode_step_start = time.perf_counter()
 
         success = cache_manager.append_slot(request.request_id)
@@ -204,12 +248,14 @@ def main():
             mask=None,
             block_tables=block_table,
             context_lengths=context_lengths,
+            state_slot_mapping=state_slot_mapping,
         )
 
         next_token_id = model.logits_to_tokens(logits, mx.array([1]), sampling_info)
 
         token_id = int(next_token_id[0])
-        if token_id in eos_token_ids:
+        is_finished = token_id in eos_token_ids
+        if is_finished:
             break
         request.commit_new_token(token_id)