
Commit ec40978

Merge pull request #142 from rebellions-sw/dev

fix(version): update release version to v0.9.2.post1

2 parents: c00260f + de96015

76 files changed: 5765 additions & 3333 deletions


.github/workflows/rbln_dispatch_trigger_on_pr_ci.yaml

Lines changed: 7 additions & 5 deletions

@@ -17,7 +17,7 @@ jobs:
     permissions:
       contents: write
     outputs:
-      sync_branch: ${{ steps.push.outputs.push.sync_branch }}
+      sync_branch: ${{ steps.push.outputs.sync_branch }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -82,7 +82,7 @@ jobs:
   cleanup:
     runs-on: runner-vllm-ci
     needs: [sync-and-call, check_code_quaility, check_device_run]
-    if: always() && needs.sync-and-call.outputs.sync_branch
+    if: always()
     permissions:
       contents: write
     steps:
@@ -99,6 +99,8 @@ jobs:
 
           git config --unset-all http.https://github.com/.extraheader || true
           git remote set-url origin "https://x-access-token:${{ secrets.GIT_PAT }}@github.com/${{ github.repository }}.git"
-
-          echo "Deleting branch: $SYNC_BRANCH"
-          git push origin --delete "$SYNC_BRANCH" || echo "Branch $SYNC_BRANCH may have already been deleted"
+
+          if [ -n "$SYNC_BRANCH" ]; then
+            echo "Deleting branch: $SYNC_BRANCH"
+            git push origin --delete "$SYNC_BRANCH" || echo "Branch $SYNC_BRANCH may have already been deleted"
+          fi

.github/workflows/rbln_optimum_ci.yaml

Lines changed: 25 additions & 10 deletions

@@ -58,6 +58,9 @@ jobs:
 
       - name: Install rebel-compiler
         run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+          export CXX=$(which g++)
           python3 -m pip uninstall rebel-compiler -y
           PYPI_URL=$(echo ${{ env.REBEL_PYPI_ENDPOINT }} | sed "s/\/\//\0${{ env.REBEL_PYPI_USERNAME }}:${{ env.REBEL_PYPI_PASSWORD }}@/")
           VERSION=${{ inputs.rebel_compiler_version || steps.get_latest_rebel_compiler.outputs.LATEST_COMPILER_VER }}
@@ -88,63 +91,75 @@ jobs:
 
       - name: Run decoder-only test (eager attn) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_decoder_only.py
+          python3 examples/optimum/run_decoder_only.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llama2-7b_batch2
           --prompt_txt ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/prompts/copy_prompts.txt
           --golden_json ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/golden/golden_llama7b_result_copy_prompts.json
 
       - name: Run decoder-only test (Flash-attention mode) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_decoder_only.py --batch_size 4 --max_seq_len 131072 --kvcache_block_size 16384
+          python3 examples/optimum/run_decoder_only.py --max_seq_len 131072
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llama3_2-3b-128k_kv16k_batch4
           --prompt_txt ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/prompts/copy_prompts.txt
           --golden_json ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/golden/golden_llama3_2_3b_instruct_128k_copy_prompts.json
 
       - name : Run Llava-next (Eager mode) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_llava.py --max_seq_len 32768 --kvcache_partition_len 32768
+          python3 examples/optimum/run_llava.py
+          --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
+          --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llava-v1.6-mistral-7b-hf-32k-b4/
+
+      - name : Run Llava-next (Eager mode) (V0)
+        run: >
+          VLLM_USE_V1=0 python3 examples/optimum/run_llava.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llava-v1.6-mistral-7b-hf-32k-b4/
 
       - name : Run Llava-next (Flash-attention mode) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_llava.py --max_seq_len 32768 --kvcache_partition_len 16384
+          python3 examples/optimum/run_llava.py
+          --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
+          --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llava-v1.6-mistral-7b-hf-32k-b4-kv16k
+
+      - name : Run Llava-next (Flash-attention mode) (V0)
+        run: >
+          VLLM_USE_V1=0 python3 examples/optimum/run_llava.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/llava-v1.6-mistral-7b-hf-32k-b4-kv16k
 
       - name : Run Idefics3 (Eager mode) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_idefics3.py --max_seq_len 32768 --kvcache_partition_len 32768
+          python3 examples/optimum/run_idefics3.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/idefics3-8b-llama3-32k-b4
 
       - name : Run Idefics3 (Flash-attention mode) (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_idefics3.py --max_seq_len 32768 --kvcache_partition_len 16384
+          python3 examples/optimum/run_idefics3.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/idefics3-8b-llama3-32k-b4-kv16k
 
       - name : Run Blip2 (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_blip2.py
+          python3 examples/optimum/run_blip2.py
          --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
          --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/blip2-opt-2.7b-2k-b4
 
       - name : Run Qwen2.5_VL (V1)
         run: >
-          VLLM_USE_V1=1 python3 examples/optimum/run_qwen_vl.py
+          python3 examples/optimum/run_qwen_vl.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/qwen2_5-vl-7b-32k-b4-kv16k
 
-      - name : Run encoder-decoder
+      - name : Run encoder-decoder (V1)
         run: >
           python3 examples/optimum/run_encoder_decoder.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
           --model_id ${{ env.REBEL_VLLM_PRE_COMPILED_DIR }}/rbln_bart-small_batch2
 
-      - name : Run text embedding model
+      - name : Run text embedding model (V1)
         run: >
           python3 examples/optimum/run_encoder_only.py
           --num_input_prompt ${{ env.NUM_INPUT_PROMPT }}
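The pattern across these steps: the explicit VLLM_USE_V1=1 prefixes disappear because recent vLLM releases default to the V1 engine, while the newly added (V0) steps pin VLLM_USE_V1=0 so the legacy engine stays covered. A minimal sketch of reproducing one such CI invocation from Python, assuming the example scripts above are present locally (the chosen script and flags are illustrative):

import os
import subprocess

# Recent vLLM defaults to the V1 engine, so no env override is needed
# for the V1 steps; VLLM_USE_V1=0 forces the legacy V0 engine, as the
# new (V0) CI steps do.
env_v0 = dict(os.environ, VLLM_USE_V1="0")

subprocess.run(
    ["python3", "examples/optimum/run_llava.py", "--num_input_prompt", "1"],
    env=env_v0,
    check=True,  # raise if the script exits non-zero, like a failing CI step
)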

.github/workflows/rbln_trigger_on_pr.yaml

Lines changed: 4 additions & 3 deletions

@@ -31,7 +31,7 @@ jobs:
     if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' }}
     outputs:
       is_team_member: ${{ steps.check_member.outputs.IS_TEAM_MEMBER }}
-      is_collaborator: ${{ steps.check_member.outputs.IS_COLLABORATOR }}
+      is_collaborator: ${{ steps.check_collaborator.outputs.IS_COLLABORATOR }}
     steps:
       - name: Fetch team members
         id: fetch_team
@@ -57,8 +57,9 @@ jobs:
             echo "❌ IS_TEAM_MEMBER set to: $result"
           fi
 
-      - name: Check if collaborator
-        if: ${{ needs.check-team-member.outputs.is_team_member != 'true' }}
+      - name: Check if PR author is a collaborator
+        id: check_collaborator
+        if: ${{ steps.check_member.outputs.IS_TEAM_MEMBER != 'true' }}
         run: |
           pr_author=${{ github.event.pull_request.user.login }}
           echo "Checking if PR author '$pr_author' is a collaborator..."

.github/workflows/rbln_vllm-rbln_pytest.yaml

Lines changed: 4 additions & 4 deletions

@@ -64,10 +64,10 @@ jobs:
           ref: ${{ inputs.pr_number && format('refs/pull/{0}/merge', inputs.pr_number) || inputs.ref || github.sha }}
           fetch-depth: 0
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.12"
 
       - name: Install build dependencies and build vllm-rbln wheel
         run: |
@@ -135,10 +135,10 @@ jobs:
 
           echo "skip=$SKIP" >> $GITHUB_OUTPUT
 
-      - name: Set up Python 3.9
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.12"
 
       - name: Install rebel-compiler
         run: |

examples/optimum/run_blip2.py

Lines changed: 5 additions & 18 deletions

@@ -61,17 +61,10 @@ async def generate(engine: AsyncLLMEngine, tokenizer, request_id, request):
 
 
 async def main(
-    batch_size: int,
-    max_seq_len: int,
     num_input_prompt: int,
     model_id: str,
 ):
-    engine_args = AsyncEngineArgs(model=model_id,
-                                  device="auto",
-                                  max_num_seqs=batch_size,
-                                  max_num_batched_tokens=max_seq_len,
-                                  max_model_len=max_seq_len,
-                                  block_size=max_seq_len)
+    engine_args = AsyncEngineArgs(model=model_id)
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -96,19 +89,13 @@ async def main(
 
 
 def entry_point(
-    batch_size: int = 4,
-    max_seq_len: int = 2048,
     num_input_prompt: int = 10,
     model_id: str = "/blip2-opt-2.7b-2k-b4",
 ):
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(
-        main(
-            batch_size=batch_size,
-            max_seq_len=max_seq_len,
-            num_input_prompt=num_input_prompt,
-            model_id=model_id,
-        ))
+    asyncio.run(main(
+        num_input_prompt=num_input_prompt,
+        model_id=model_id,
+    ))
 
 
 if __name__ == "__main__":
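The same cleanup recurs in every example script below: asyncio.get_event_loop().run_until_complete(...) becomes asyncio.run(...). asyncio.run() creates a fresh event loop, runs the coroutine to completion, and closes the loop afterwards, whereas calling get_event_loop() with no running loop has been deprecated since Python 3.10 and warns on the Python 3.12 toolchain the CI now uses. A minimal before/after sketch (the coroutine body is a stand-in, not repo code):

import asyncio


async def main(num_input_prompt: int, model_id: str) -> None:
    # Stand-in for the engine setup and generation loop in the examples.
    print(f"would run {num_input_prompt} prompts against {model_id}")


def entry_point_old() -> None:
    # Old style: deprecated when no loop is running, and the loop stays
    # open unless closed by hand.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(num_input_prompt=10, model_id="/blip2-opt-2.7b-2k-b4"))


def entry_point_new() -> None:
    # New style: asyncio.run() owns the loop's entire lifecycle.
    asyncio.run(main(num_input_prompt=10, model_id="/blip2-opt-2.7b-2k-b4"))


if __name__ == "__main__":
    entry_point_new()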

examples/optimum/run_decoder_only.py

Lines changed: 2 additions & 14 deletions

@@ -85,20 +85,13 @@ def compare_copy_prompt_task_result(
 
 
 async def main(
-    batch_size: int,
     max_seq_len: int,
-    kvcache_block_size: int,
     num_input_prompt: int,
     model_id: str,
     prompt_txt: str,
     golden_json: str,
 ):
-    engine_args = AsyncEngineArgs(model=model_id,
-                                  device="auto",
-                                  max_num_seqs=batch_size,
-                                  max_num_batched_tokens=max_seq_len,
-                                  max_model_len=max_seq_len,
-                                  block_size=kvcache_block_size)
+    engine_args = AsyncEngineArgs(model=model_id)
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     prompt = get_input_prompts(prompt_txt)
@@ -124,20 +117,15 @@ async def main(
 
 
 def entry_point(
-    batch_size: int = 2,
     max_seq_len: int = 4096,
-    kvcache_block_size: int = 4096,
     num_input_prompt: int = 1,
     model_id: str = "/llama2-7b_batch2",
     prompt_txt: str = "/prompts/copy_prompts.txt",
     golden_json: str = "/golden/golden_llama7b_result_copy_prompts.json",
 ):
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(
+    asyncio.run(
         main(
-            batch_size=batch_size,
             max_seq_len=max_seq_len,
-            kvcache_block_size=kvcache_block_size,
             num_input_prompt=num_input_prompt,
             model_id=model_id,
            prompt_txt=prompt_txt,
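Note the other recurring simplification: the multi-argument AsyncEngineArgs(model=..., device=..., max_num_seqs=..., ...) constructor collapses to AsyncEngineArgs(model=model_id), leaving batching and sequence-length settings to the engine's defaults; for precompiled RBLN models those shapes are presumably fixed at compile time anyway. A minimal sketch of driving vLLM's async engine with only a model path (prompt, sampling values, and model path are illustrative):

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main(model_id: str) -> None:
    # Only the model path is passed; everything else falls back to defaults.
    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model=model_id))
    params = SamplingParams(temperature=0.0, max_tokens=64)

    # generate() is an async generator; the last yielded item is the
    # finished RequestOutput.
    final = None
    async for output in engine.generate("Hello, my name is", params, request_id="0"):
        final = output
    print(final.outputs[0].text)


if __name__ == "__main__":
    asyncio.run(main("/llama2-7b_batch2"))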

examples/optimum/run_encoder_decoder.py

Lines changed: 5 additions & 18 deletions

@@ -72,17 +72,10 @@ def compare(result):
 
 
 async def main(
-    batch_size: int,
-    max_seq_len: int,
     num_input_prompt: int,
     model_id: str,
 ):
-    engine_args = AsyncEngineArgs(model=model_id,
-                                  device="auto",
-                                  max_num_seqs=batch_size,
-                                  max_num_batched_tokens=max_seq_len,
-                                  max_model_len=max_seq_len,
-                                  block_size=max_seq_len)
+    engine_args = AsyncEngineArgs(model=model_id)
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     prompt = get_input_prompts(num_prompts=num_input_prompt)
@@ -104,19 +97,13 @@ async def main(
 
 
 def entry_point(
-    batch_size: int = 2,
-    max_seq_len: int = 512,
     num_input_prompt: int = 10,
     model_id: str = "/rbln_bart-small_batch2",
 ):
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(
-        main(
-            batch_size=batch_size,
-            max_seq_len=max_seq_len,
-            num_input_prompt=num_input_prompt,
-            model_id=model_id,
-        ))
+    asyncio.run(main(
+        num_input_prompt=num_input_prompt,
+        model_id=model_id,
+    ))
 
 
 if __name__ == "__main__":

examples/optimum/run_encoder_only.py

Lines changed: 5 additions & 16 deletions

@@ -45,7 +45,7 @@ def compare_copy_prompt_task_result(scores: list[float], golden_json: str):
 
 
 async def encode(engine, prompt, request_id):
-    pooling_params = PoolingParams()
+    pooling_params = PoolingParams(task="embed")
     results_generator = engine.encode(prompt=prompt,
                                       pooling_params=pooling_params,
                                       request_id=str(request_id))
@@ -69,15 +69,9 @@ async def get_result(engine, model_id, prompt, num_input_prompt):
     return results
 
 
-async def main(model_id: str, max_seq_len: int, batch_size: int,
-               num_input_prompt: int, q_prompt_txt: str, p_prompt_txt: str,
-               golden_json: str):
-    engine_args = AsyncEngineArgs(model=model_id,
-                                  device="auto",
-                                  max_num_seqs=batch_size,
-                                  max_num_batched_tokens=max_seq_len,
-                                  block_size=max_seq_len,
-                                  max_model_len=max_seq_len)
+async def main(model_id: str, num_input_prompt: int, q_prompt_txt: str,
+               p_prompt_txt: str, golden_json: str):
+    engine_args = AsyncEngineArgs(model=model_id)
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     q_prompt = get_input_prompts(q_prompt_txt)
@@ -105,19 +99,14 @@ async def main(model_id: str, num_input_prompt: int, q_prompt_txt: str,
 
 
 def entry_point(
-    max_seq_len: int = 4096,
-    batch_size: int = 4,
     num_input_prompt: int = 3,
     model_id: str = "/bge-m3-1k-batch4",
     q_prompt_txt: str = "/prompts/q_prompts.txt",
     p_prompt_txt: str = "/prompts/p_prompts.txt",
     golden_json: str = "/golden/golden_bge_m3_result_qp_prompts.json",
 ):
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(
+    asyncio.run(
        main(
-            max_seq_len=max_seq_len,
-            batch_size=batch_size,
            num_input_prompt=num_input_prompt,
            model_id=model_id,
            q_prompt_txt=q_prompt_txt,
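Beyond the shared cleanup, this file has one functional change: PoolingParams() becomes PoolingParams(task="embed"), explicitly requesting the embedding task that newer vLLM versions expect for pooling models. A minimal sketch of fetching one embedding through the async engine under that assumption (model path and prompt are illustrative, and the exact layout of the pooling output varies across vLLM versions):

import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def embed_one(model_id: str, prompt: str):
    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model=model_id))

    # encode() is an async generator; for a single prompt the last yielded
    # item carries the final pooled result.
    final = None
    async for output in engine.encode(prompt=prompt,
                                      pooling_params=PoolingParams(task="embed"),
                                      request_id="0"):
        final = output
    # final.outputs holds the pooled embedding (field names differ slightly
    # between vLLM versions).
    return final


if __name__ == "__main__":
    result = asyncio.run(embed_one("/bge-m3-1k-batch4", "what is vllm?"))
    print(result)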

examples/optimum/run_gemma3.py

Lines changed: 5 additions & 19 deletions

@@ -84,18 +84,10 @@ async def generate(engine: AsyncLLMEngine, tokenizer, request_id, request):
 
 
 async def main(
-    batch_size: int,
-    max_seq_len: int,
-    kvcache_partition_len: int,
     num_input_prompt: int,
     model_id: str,
 ):
-    engine_args = AsyncEngineArgs(model=model_id,
-                                  device="auto",
-                                  max_num_seqs=batch_size,
-                                  max_num_batched_tokens=max_seq_len,
-                                  max_model_len=max_seq_len,
-                                  block_size=kvcache_partition_len)
+    engine_args = AsyncEngineArgs(model=model_id)
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -120,19 +112,13 @@ async def main(
 
 
 def entry_point(
-    batch_size: int = 4,
-    max_seq_len: int = 32768,
-    kvcache_partition_len: int = 16384,
     num_input_prompt: int = 10,
     model_id: str = "/gemma3-4b-conditional-b4-flash",
 ):
-    loop = asyncio.get_event_loop()
-    loop.run_until_complete(
-        main(batch_size=batch_size,
-             max_seq_len=max_seq_len,
-             kvcache_partition_len=kvcache_partition_len,
-             num_input_prompt=num_input_prompt,
-             model_id=model_id))
+    asyncio.run(main(
+        num_input_prompt=num_input_prompt,
+        model_id=model_id,
+    ))
 
 
 if __name__ == "__main__":
