ATOM/atom/model_engine/model_runner.py at 7f50a8b36b2feb414b595a43aab50c171499e5c6 · ROCm/ATOM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# SPDX-License-Identifier: MIT
# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

import logging
import math
import os
import time
import gzip
from contextlib import nullcontext
from typing import Any, Optional, Union

import numpy as np
import torch
import torch.profiler as torch_profiler
import tqdm
from aiter import destroy_dist_env, dtypes, init_dist_env
from aiter.dist.parallel_state import (
    get_dp_group,
    get_pp_group,
    get_tp_group,
    graph_capture,
)
from aiter.dist.utils import get_distributed_init_method
from torch.profiler import record_function
from atom.config import Config, KVCacheTensor, set_current_atom_config
from atom.utils import envs
from atom.model_engine.scheduler import ScheduledBatch, ScheduledBatchOutput
from atom.model_engine.sequence import Sequence, SequenceStatus, SequenceType
from atom.model_loader.loader import load_model
from atom.model_ops.rejection_sampler import RejectionSampler
from atom.model_ops.sampler import SAMPLER_EPS, Sampler
from atom.spec_decode.eagle import EagleProposer
from atom.utils import (
    CpuGpuBuffer,
    get_hf_text_config,
    init_exit_handler,
    resolve_obj_by_qualname,
)
from atom.utils.forward_context import (
    Context,
    DPMetadata,
    get_forward_context,
    reset_forward_context,
    set_forward_context,
    set_kv_cache_data,
)
from atom.utils.selector import get_attn_backend

logger = logging.getLogger("atom")

support_model_arch_dict = {
    "Qwen3ForCausalLM": "atom.models.qwen3.Qwen3ForCausalLM",
    "Qwen3MoeForCausalLM": "atom.models.qwen3_moe.Qwen3MoeForCausalLM",
    "LlamaForCausalLM": "atom.models.llama.LlamaForCausalLM",
    "MixtralForCausalLM": "atom.models.mixtral.MixtralForCausalLM",
    "DeepseekV3ForCausalLM": "atom.models.deepseek_v2.DeepseekV2ForCausalLM",
    "DeepseekV32ForCausalLM": "atom.models.deepseek_v2.DeepseekV2ForCausalLM",
    "GptOssForCausalLM": "atom.models.gpt_oss.GptOssForCausalLM",
    "GlmMoeDsaForCausalLM": "atom.models.deepseek_v2.GlmMoeDsaForCausalLM",
    "Glm4MoeForCausalLM": "atom.models.glm4_moe.Glm4MoeForCausalLM",
    "Qwen3NextForCausalLM": "atom.models.qwen3_next.Qwen3NextForCausalLM",
}
# seed = 34567
# np.random.seed(seed)
# torch.cuda.manual_seed_all(seed)


class tokenIDProcessor:

    def __init__(
        self,
        runner: "ModelRunner",
        max_num_batched_tokens: int,
        use_spec: bool = False,
        num_spec_tokens: int = 0,
    ):
        """Asynchronously copy the sampled_token_ids tensor to the host."""
        # self.is_deferred_out = False
        self.is_deferred_out = True

        self.runner = runner
        device = runner.device
        self.input_ids = CpuGpuBuffer(
            max_num_batched_tokens + 1, dtype=torch.int32, device=device
        )
        self.input_ids_loc = CpuGpuBuffer(
            max_num_batched_tokens, dtype=torch.int64, device=device
        )
        self.use_spec = use_spec
        self.num_spec_tokens = num_spec_tokens

        # Event on the copy stream so we can synchronize the non-blocking copy.
        self.async_copy_event = torch.cuda.Event()
        self.async_copy_stream = torch.cuda.Stream()
        self.default_num_rejected_tokens = torch.zeros(
            max_num_batched_tokens, dtype=torch.int32, device=device
        )
        self.clean()

    def send_to_cpu_async(
        self,
        gpu_tensor: torch.Tensor,
        cpu_tensor_handle,
        data_ready: torch.cuda.Event,
        copy_done: Optional[torch.cuda.Event] = None,
    ):
        copy_done = copy_done or torch.cuda.Event()
        with torch.cuda.stream(self.async_copy_stream):
            data_ready.wait(stream=self.async_copy_stream)
            cpu_tensor = gpu_tensor.to("cpu", non_blocking=True)
            copy_done.record(self.async_copy_stream)
        cpu_tensor_handle.append((cpu_tensor, copy_done))

    def recv_async_output(self, cpu_tensor_handle) -> list[int]:
        if not cpu_tensor_handle:
            return []
        cpu_tensor, event = cpu_tensor_handle.pop(0)
        event.synchronize()
        token_ids = cpu_tensor.tolist()
        return token_ids

    def send_to_cpu_async_draft(self, gpu_tensor: torch.Tensor):
        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(self.async_copy_stream):
            self.async_copy_stream.wait_stream(default_stream)
            cpu_tensor = gpu_tensor.to("cpu", non_blocking=True)
            event = torch.cuda.Event()
            event.record(self.async_copy_stream)
        self.draft_token_ids_cpu.append((cpu_tensor, event))

    def recv_async_output_draft(self) -> np.ndarray:
        if not self.draft_token_ids_cpu:
            return np.array([], dtype=np.int32)
        token_ids, event = self.draft_token_ids_cpu.pop(0)
        event.synchronize()
        return token_ids.numpy()

    def send_mtp_status_to_cpu_async(
        self, num_rejected: torch.Tensor, num_bonus: torch.Tensor
    ):
        # rejected num and bonus num are slightly different info for mtp
        # take mtp=1 for example:
        #   first decode after prefill have 0 rej, 0 bonus
        #   prev acc decode have 0 rej, 1 bonus
        #   prev rej decode have 1 rej, 0 bonus
        # It is clear that only rejected number is not sufficient for all status tracking, bonus number is also needed.
        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(self.async_copy_stream):
            self.async_copy_stream.wait_stream(default_stream)
            num_rejected_cpu = num_rejected.to("cpu", non_blocking=True)
            num_bonus_cpu = num_bonus.to("cpu", non_blocking=True)
            self.async_copy_event.record(self.async_copy_stream)
        self.rejected_tokens_cpu.append(num_rejected_cpu)
        self.bonus_tokens_cpu.append(num_bonus_cpu)

    def recv_mtp_status_async(
        self,
    ) -> tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        if not self.rejected_tokens_cpu:
            return None, None
        self.async_copy_event.synchronize()
        return (
            self.rejected_tokens_cpu.pop(0).numpy(),
            self.bonus_tokens_cpu.pop(0).numpy(),
        )

    def clean(self):
        self.token_ids_cpu: list[torch.Tensor] = []

        self.prev_batch: Optional[ScheduledBatch] = None

        self.pre_num_decode_token_per_seq = 1
        self.draft_token_ids: Optional[torch.Tensor] = None
        self.draft_token_ids_cpu: list[torch.Tensor] = []
        self.rejected_tokens_cpu: list[torch.Tensor] = (
            []
        )  # Async queue for num_bonus_tokens
        self.bonus_tokens_cpu: list[torch.Tensor] = []
        self.mapped_bonus_list: Optional[list[int]] = (
            None  # Mapped to current batch order
        )

    @staticmethod
    def _batch_process_token_ids(token_ids: list) -> list[tuple[int, ...]]:
        """Batch process token_ids: vectorized -1 truncation using numpy."""
        arr = np.array(token_ids, dtype=np.int64)
        mask = arr == -1
        if not mask.any():
            # No -1 sentinel in any row, convert each row to tuple directly
            return [tuple(row) for row in arr.tolist()]
        # Per-row: find first -1, truncate
        # Use argmax on mask; rows without -1 get 0, disambiguate with ~mask.any(axis=1)
        has_sentinel = mask.any(axis=1)
        first_neg = mask.argmax(axis=1)
        result = []
        rows = arr.tolist()
        for i, row in enumerate(rows):
            if has_sentinel[i]:
                result.append(tuple(row[: first_neg[i]]))
            else:
                result.append(tuple(row))
        return result

    def prepare_sampled_ids(
        self,
        batch: ScheduledBatch,
        sampled_token_ids: torch.Tensor,
        sync_event: torch.cuda.Event,
    ) -> tuple[list[int], list[tuple[int, ...]]]:
        if not self.is_deferred_out:
            token_ids = sampled_token_ids.tolist()
            req_ids = batch.req_ids
            if token_ids and isinstance(token_ids[0], list):
                processed = self._batch_process_token_ids(token_ids)
            else:
                processed = [(tid,) for tid in token_ids]
            return req_ids, processed

        token_ids = self.recv_async_output(self.token_ids_cpu)
        self.send_to_cpu_async(sampled_token_ids, self.token_ids_cpu, sync_event)
        req_ids_out: list[int] = []
        processed_out: list[tuple[int, ...]] = []
        self.prev_req_ids = None
        if self.prev_batch is not None:
            self.prev_req_ids = self.prev_batch.req_ids
            req_ids_out = self.prev_req_ids
            if token_ids and isinstance(token_ids[0], list):
                processed_out = self._batch_process_token_ids(token_ids)
            else:
                processed_out = [(tid,) for tid in token_ids]

        self.prev_batch = batch
        self.prev_token_ids = sampled_token_ids

        return req_ids_out, processed_out

    def get_token_locations(
        self, batch: ScheduledBatch
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, bool]:
        prev_req_ids = self.prev_batch.req_ids
        cur_req_ids = batch.req_ids
        num_prev = len(prev_req_ids)
        num_cur = len(cur_req_ids)

        prev_id_to_idx = dict(zip(prev_req_ids, range(num_prev)))

        deferred_curr = np.empty(num_cur, dtype=np.intp)
        deferred_prev = np.empty(num_cur, dtype=np.intp)
        new_curr = np.empty(num_cur, dtype=np.intp)
        n_deferred = 0
        n_new = 0

        for cur_idx in range(num_cur):
            prev_idx = prev_id_to_idx.get(cur_req_ids[cur_idx])
            if prev_idx is not None:
                deferred_curr[n_deferred] = cur_idx
                deferred_prev[n_deferred] = prev_idx
                n_deferred += 1
            else:
                new_curr[n_new] = cur_idx
                n_new += 1

        deferred_curr = deferred_curr[:n_deferred]
        deferred_prev = deferred_prev[:n_deferred]
        new_curr = new_curr[:n_new]

        is_all_same = (
            n_new == 0
            and n_deferred == num_prev
            and np.array_equal(deferred_curr, deferred_prev)
        )

        return deferred_curr, deferred_prev, new_curr, is_all_same

    def prepare_input_ids(
        self,
        batch: ScheduledBatch,
    ) -> torch.Tensor:
        """Prepare the input IDs for the current batch.

        Carefully handles the `prev_sampled_token_ids` which can be cached
        from the previous engine iteration, in which case those tokens on the
        GPU need to be copied into the corresponding slots into input_ids."""
        scheduled_tokens = batch.scheduled_tokens  # tokens per req
        total_tokens = batch.total_tokens_num
        total_tokens_prefill = batch.total_tokens_num_prefill
        total_tokens_decode = batch.total_tokens_num_decode
        total_reqs_prefill = batch.total_seqs_num_prefill
        """for prefill: all input ids are new"""
        self.input_ids.np[:total_tokens_prefill] = scheduled_tokens[
            :total_tokens_prefill
        ]
        self.input_ids.copy_to_gpu(total_tokens_prefill)

        self.prev_rejected_num, self.prev_bonus_num = self.recv_mtp_status_async()

        # TODO: remove this when we support mixed prefill and decode in one batch
        if total_reqs_prefill > 0:
            return self.input_ids.gpu[:total_tokens_prefill]

        if not self.is_deferred_out:
            token_ids = scheduled_tokens[
                total_tokens_prefill : total_tokens_prefill + total_tokens_decode
            ]
            if self.use_spec:
                token_ids[:, 1:] = batch.scheduled_spec_decode_tokens

            self.input_ids.np[:total_tokens_decode] = token_ids
            return self.input_ids.copy_to_gpu(total_tokens_decode)

        """for decode: input ids are from prev_sampled_token_ids"""
        deferred_curr_indices, deferred_prev_indices, new_curr_indices, is_all_same = (
            self.get_token_locations(batch)
        )
        num_deferred_seqs = len(deferred_curr_indices)
        num_new_seqs = len(new_curr_indices)

        # Calculate token counts: in MTP mode, each seq has multiple tokens
        if self.use_spec:
            tokens_per_seq = self.num_spec_tokens + 1
            num_deferred_tokens = num_deferred_seqs * tokens_per_seq
            num_new_tokens = num_new_seqs * tokens_per_seq
        else:
            tokens_per_seq = 1
            num_deferred_tokens = num_deferred_seqs
            num_new_tokens = num_new_seqs

        # Receive and map bonus_list to current batch order
        self.num_rejected = batch.num_rejected
        self.num_bonus = batch.num_bonus
        if num_deferred_seqs > 0 and self.prev_rejected_num is not None:
            # Map: prev_bonus_list[prev_idx] → mapped_bonus_list[curr_idx]
            self.num_rejected[deferred_curr_indices] = self.prev_rejected_num[
                deferred_prev_indices
            ]
            self.num_bonus[deferred_curr_indices] = self.prev_bonus_num[
                deferred_prev_indices
            ]

        if is_all_same:
            # All requests are the same, only deferred tokens
            if self.use_spec:
                # MTP mode: combine prev_token_ids and draft_token_ids
                if (
                    self.draft_token_ids is not None
                    and self.pre_num_decode_token_per_seq > 1
                ):
                    combined = torch.cat(
                        [
                            self.prev_token_ids.unsqueeze(1),  # (num_seqs, 1)
                            self.draft_token_ids,  # (num_seqs, mtp_n_grams-1)
                        ],
                        dim=1,
                    ).reshape(
                        -1
                    )  # (num_deferred_tokens,)
                else:
                    combined = self.prev_token_ids
                self.input_ids.gpu[:num_deferred_tokens] = combined
            else:
                # Non-MTP mode: only prev_token_ids
                self.input_ids.gpu[:num_deferred_tokens] = self.prev_token_ids
        else:
            """
            (1) prev_batch=[301], cur_batch=[0..255, 301] → Layout: [301 prefill | new | deferred]
            (2) prev_batch=[0..255], cur_batch=[0..253, 256, 257] → Layout: [deferred | new 256, 257] when conc > max_num_seq
            """
            is_prev_prefill = self.prev_batch.total_tokens_num_prefill > 0
            new_decode_front = (
                is_prev_prefill
                and np.array_equal(new_curr_indices, np.arange(num_new_seqs))
                and np.array_equal(
                    deferred_curr_indices,
                    np.arange(num_new_seqs, num_new_seqs + num_deferred_seqs),
                )
            )

            gathered_tokens = None
            # old requests (deferred)
            if num_deferred_seqs > 0:
                self.input_ids_loc.np[:num_deferred_seqs] = deferred_prev_indices
                deferred_indices_gpu = self.input_ids_loc.copy_to_gpu(num_deferred_seqs)
                gathered_prev = torch.gather(
                    self.prev_token_ids,
                    0,
                    deferred_indices_gpu,
                )
                if self.use_spec:
                    # MTP mode: combine prev_token_ids and draft_token_ids
                    if (
                        self.draft_token_ids is not None
                        and self.pre_num_decode_token_per_seq > 1
                    ):
                        # draft_token_ids is 2D (num_seqs, mtp_n_grams-1), use direct indexing
                        gathered_draft = self.draft_token_ids[deferred_indices_gpu]
                        gathered_tokens = torch.cat(
                            [
                                gathered_prev.unsqueeze(1),  # (num_deferred_seqs, 1)
                                gathered_draft,  # (num_deferred_seqs, mtp_n_grams-1)
                            ],
                            dim=1,
                        ).reshape(
                            -1
                        )  # (num_deferred_tokens,)
                    else:
                        # normal decode (fallback)
                        gathered_tokens = gathered_prev
                else:
                    # Non-MTP mode: only prev_token_ids
                    gathered_tokens = gathered_prev

            if new_decode_front:
                # Layout: [new | deferred]
                if gathered_tokens is not None:
                    self.input_ids.gpu[
                        num_new_tokens : num_new_tokens + num_deferred_tokens
                    ] = gathered_tokens
                if num_new_tokens > 0:
                    token_ids = scheduled_tokens[
                        total_tokens_prefill : total_tokens_prefill + num_new_tokens
                    ].reshape(num_new_seqs, tokens_per_seq)
                    if self.use_spec:
                        token_ids[:, 1:] = batch.scheduled_spec_decode_tokens[
                            :num_new_seqs
                        ]
                    self.input_ids.np[:num_new_tokens] = token_ids.flatten()
                    self.input_ids.copy_to_gpu(num_new_tokens)
            else:
                # Layout: [deferred | new] - deferred at front, new is from previous finished prefill and waiting for decode
                if num_new_tokens > 0:
                    new_token_ids = scheduled_tokens[new_curr_indices].reshape(
                        num_new_seqs, tokens_per_seq
                    )
                    if self.use_spec:
                        # MTP mode: combine scheduled_tokens and draft_tokens
                        # For new_decode_front=False, use new_curr_indices to get the right sequences
                        draft_tokens = batch.scheduled_spec_decode_tokens[
                            new_curr_indices
                        ]
                        new_token_ids[:, 1:] = draft_tokens
                    self.input_ids.np[:num_new_tokens] = new_token_ids.flatten()
                    self.input_ids.gpu[
                        num_deferred_tokens : num_deferred_tokens + num_new_tokens
                    ].copy_(self.input_ids.cpu[:num_new_tokens], non_blocking=True)
                if gathered_tokens is not None:
                    self.input_ids.gpu[:num_deferred_tokens] = gathered_tokens
        input_ids = self.input_ids.gpu[:total_tokens]
        return input_ids

    def prepare_draft_ids(
        self, batch: ScheduledBatch, draft_token_ids: torch.Tensor
    ) -> np.ndarray:
        if not self.is_deferred_out:
            ret = draft_token_ids.numpy()
        else:
            self.draft_token_ids = draft_token_ids
            self.pre_num_decode_token_per_seq = self.num_spec_tokens + 1
            token_ids = self.recv_async_output_draft()
            self.send_to_cpu_async_draft(draft_token_ids)
            ret = (
                token_ids
                if self.prev_req_ids is not None
                else np.array([], dtype=np.int32)
            )
        return ret


class ModelRunner:

    def __init__(self, rank: int, config: Config):
        self.config = config
        self.mark_trace = getattr(config, "mark_trace", False)
        from atom.utils.graph_marker import set_graph_marker_enabled

        set_graph_marker_enabled(self.mark_trace)
        set_current_atom_config(config)
        hf_config = config.hf_config
        self.block_size = config.kv_cache_block_size
        self.enforce_eager = config.enforce_eager
        self.world_size = config.tensor_parallel_size
        self.rank = rank
        self.label = f"Model Runner{rank}/{self.world_size}"
        self.hf_text_config = get_hf_text_config(hf_config)
        if self.hf_text_config.model_type in ["llama"] and self.config.torch_dtype in [
            torch.bfloat16,
            torch.float16,
        ]:
            os.environ["AITER_QUICK_REDUCE_QUANTIZATION"] = "INT4"
        self.use_mla = self.is_deepseek_mla()
        self.use_gdn = self.is_qwen_next()
        self.is_deepseek_v32 = (
            hasattr(hf_config, "index_topk") if self.use_mla else False
        )
        # Calculate local device rank considering both TP and DP
        # When data parallelism is enabled on the same node, different DP ranks
        # need to use different sets of GPUs
        dp_rank_local = config.parallel_config.data_parallel_rank_local
        if dp_rank_local is None:
            dp_rank_local = 0
        local_device_rank = dp_rank_local * config.tensor_parallel_size + rank
        num_gpus = torch.cuda.device_count()
        if local_device_rank >= num_gpus:
            raise ValueError(
                f"Calculated local_device_rank={local_device_rank} exceeds available GPUs ({num_gpus}). "
            )

        device = torch.device(f"cuda:{local_device_rank}")
        logger.info(
            f"ModelRunner rank={rank}, dp_rank_local={dp_rank_local}, local_device_rank={local_device_rank}, device={device}"
        )
        self.device = device

        # Initialize profiler for this rank
        self.profiler = None
        self.profiler_dir = None
        if config.torch_profiler_dir is not None:
            # Create rank-specific profiler directory
            if dp_rank_local > 0 or config.parallel_config.data_parallel_size > 1:
                rank_name = f"dp{dp_rank_local}_tp{rank}"
            else:
                rank_name = f"rank_{rank}"
            self.profiler_dir = os.path.join(config.torch_profiler_dir, rank_name)
            os.makedirs(self.profiler_dir, exist_ok=True)

        self.graph_bs = [0]  # for eager fallback

        torch.cuda.set_device(self.device)
        os.environ["MASTER_ADDR"] = self.config.master_addr
        os.environ["MASTER_PORT"] = str(self.config.port)
        distributed_init_method = get_distributed_init_method(
            config.parallel_config.data_parallel_master_ip,
            config.parallel_config.data_parallel_base_port,
        )
        init_dist_env(
            config.tensor_parallel_size,
            rankID=rank,
            backend="nccl",
            distributed_init_method=distributed_init_method,
            data_parallel_size=config.parallel_config.data_parallel_size,
            data_parallel_rank=config.parallel_config.data_parallel_rank,
        )
        init_exit_handler(self)
        default_dtype = self.config.torch_dtype
        torch.set_default_dtype(default_dtype)
        torch.set_default_device(self.device)
        self.attn_backend = get_attn_backend(
            self.block_size,
            use_mla=self.use_mla,
            use_gdn=self.use_gdn,
        )
        if self.config.speculative_config and get_pp_group().is_last_rank:
            from atom.utils.backends import set_model_tag

            with set_model_tag("drafter"):
                self.drafter = EagleProposer(self.config, self.device, self)
            self.rejection_sampler = RejectionSampler()
            self.mtp_total_draft_tokens = 0
            self.mtp_total_accepted_tokens = 0
        self.num_spec_tokens = self.drafter.mtp_k if hasattr(self, "drafter") else 0
        self.tokenID_processor = tokenIDProcessor(
            self,
            self.config.max_num_batched_tokens,
            hasattr(self, "drafter"),
            self.num_spec_tokens,
        )
        self.sampler = Sampler()
        self.arange_np = np.arange(
            max(
                self.config.max_num_seqs + 1,
                self.config.max_model_len,
                self.config.max_num_batched_tokens,
            ),
            dtype=np.int64,
        )

        model_class = resolve_obj_by_qualname(support_model_arch_dict[hf_config.architectures[0]])  # type: ignore
        # The model construction depends on quant_config,
        # so we must complete the remapping for layers before constructing the model.
        config.quant_config.remap_layer_name(
            config.hf_config, getattr(model_class, "packed_modules_mapping", {})
        )
        self.model = model_class(config)
        torch.set_default_device(None)
        load_model(self.model, config.model, config.hf_config, config.load_dummy)
        logger.info(f"Model load done: {config.model}")

        if hasattr(self, "drafter"):
            logger.info("Loading drafter model...")
            self.drafter.load_model(self.model)
        torch.set_default_device(self.device)
        self.allocate_forward_vars()
        self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
            model_runner=self
        )
        self.physical_block_size = self.attn_metadata_builder.block_size
        self.forward_done_event = torch.cuda.Event()
        self.warmup_model()
        logger.info(f"Model warmup done: {config.model}")

        torch.set_default_device("cpu")
        torch.set_default_dtype(default_dtype)

        if self.config.compilation_config.level == 1:
            self.model = torch.compile(self.model, fullgraph=True, backend="eager")
            if hasattr(self, "drafter"):
                self.drafter.model = torch.compile(
                    self.drafter.model, fullgraph=True, backend="eager"
                )

    def is_deepseek_mla(self) -> bool:
        if not hasattr(self.hf_text_config, "model_type"):
            return False
        elif self.hf_text_config.model_type in (
            "deepseek_v2",
            "deepseek_v3",
            "deepseek_v32",
            "deepseek_mtp",
            "glm_moe_dsa",
        ):
            return self.hf_text_config.kv_lora_rank is not None
        elif self.hf_text_config.model_type == "eagle":
            # if the model is an EAGLE module, check for the
            # underlying architecture
            return (
                self.hf_text_config.model.model_type in ("deepseek_v2", "deepseek_v3")
                and self.hf_text_config.kv_lora_rank is not None
            )
        return False

    def is_qwen_next(self) -> bool:
        if not hasattr(self.hf_text_config, "model_type"):
            return False
        elif self.hf_text_config.model_type in ("qwen3_next", "qwen3_next_mtp"):
            return True
        return False

    def _make_buffer(
        self, *size: Union[int, torch.SymInt], dtype: torch.dtype, numpy: bool = True
    ) -> CpuGpuBuffer:
        # Bfloat16 torch tensors cannot be directly cast to a numpy array, so
        # if a bfloat16 buffer is needed without a corresponding numpy array,
        # don't bother instantiating the numpy array.
        return CpuGpuBuffer(
            *size, dtype=dtype, device=self.device, pin_memory=True, with_numpy=numpy
        )

    def _get_cumsum_and_arange(
        self,
        num_tokens: np.ndarray,
        cumsum_dtype: Optional[np.dtype] = None,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Get the cumulative sum and batched arange of the given array.
        # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
        # Equivalent to but faster than:
        # np.concatenate([np.arange(n) for n in num_tokens])
        """
        # Step 1. [2, 5, 3] -> [2, 7, 10]
        cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
        total_num_tokens = cu_num_tokens[-1]
        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
        cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
        arange = self.arange_np[:total_num_tokens] - cumsums_offsets

        return cu_num_tokens, arange

    def exit(self):
        if not self.still_running:
            return
        self.still_running = False
        # 1. Destroy distributed env (NCCL + CustomAllreduce + process groups)
        #    Must happen while ops module is still alive for CustomAllreduce cleanup.
        destroy_dist_env()
        # 2. Release CUDA graphs
        if not self.enforce_eager:
            self.graphs = self.graph_pool = None  # type: ignore
        # 3. Release GPU tensors
        for attr in (
            "kv_cache",
            "kv_scale",
            "index_cache",
            "mamba_k_cache",
            "mamba_v_cache",
        ):
            if hasattr(self, attr):
                delattr(self, attr)
        if hasattr(self, "model"):
            del self.model
        if hasattr(self, "drafter"):
            del self.drafter
        torch.cuda.empty_cache()
        return True

    def start_profiler(self, trace_name: Optional[str] = None):
        """
        Start profiling for this rank.

        The ATOM_PROFILER_MORE environment variable controls detailed profiling features:
        - Set to "1" to enable record_shapes, with_stack, and profile_memory.
        - Set to "0" or unset to disable these features (default).
        """
        if self.profiler_dir is not None and self.profiler is None:
            enable_detailed_profiling = envs.ATOM_PROFILER_MORE
            model_name = os.path.basename(self.config.model.rstrip("/"))
            safe_model_name = "".join(
                c if c.isalnum() or c in ("_", "-", ".") else "_" for c in model_name
            )
            worker_name = safe_model_name or "trace"
            if isinstance(trace_name, str) and trace_name:
                worker_name = "".join(
                    c if c.isalnum() or c in ("_", "-", ".") else "_"
                    for c in trace_name
                )
            if worker_name == "capture_graph":
                if safe_model_name:
                    worker_name = f"{worker_name}_{safe_model_name}"
            output_prefix = os.path.join(self.profiler_dir, worker_name)

            def _on_trace_ready(prof):
                # Use a short human-readable timestamp in file name.
                ts = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                ms = int((time.time() % 1) * 1000)
                output_path = f"{output_prefix}_ts_{ts}_{ms:03d}.pt.trace.json.gz"
                tmp_json_path = output_path[:-3]
                prof.export_chrome_trace(tmp_json_path)
                with (
                    open(tmp_json_path, "rb") as src,
                    gzip.open(output_path, "wb") as dst,
                ):
                    dst.write(src.read())
                os.remove(tmp_json_path)

            self.profiler = torch_profiler.profile(
                activities=[
                    torch_profiler.ProfilerActivity.CPU,
                    torch_profiler.ProfilerActivity.CUDA,
                ],
                record_shapes=enable_detailed_profiling,
                with_stack=enable_detailed_profiling,
                profile_memory=enable_detailed_profiling,
                on_trace_ready=_on_trace_ready,
            )
            self.profiler.__enter__()
        return True

    def stop_profiler(self):
        """Stop profiling for this rank"""
        if self.profiler is not None:
            self.profiler.__exit__(None, None, None)
            self.profiler = None
        return True

    def debug(self, *args: Any):
        if self.rank == 0:
            logger.info(*args)

    def dummy_execution(self):
        """Execute dummy decode batch for DP synchronization."""
        num_tokens_original = 1

        seq = Sequence([0] * num_tokens_original, block_size=self.block_size)
        seq.status = SequenceStatus.RUNNING
        seq.type = SequenceType.DECODE
        seq.block_table = [0]
        bs = 1

        dummy_batch = ScheduledBatch(
            seqs={seq.id: seq},
            num_scheduled_tokens=np.array([num_tokens_original], dtype=np.int32),
            total_tokens_num=num_tokens_original,  # original value
            total_tokens_num_decode=num_tokens_original,
            total_seqs_num=1,
            total_seqs_num_decode=1,
            is_dummy_run=True,
        )

        bs = self.prepare_inputs(dummy_batch)
        actual_num_tokens = dummy_batch.total_tokens_num

        # self.tokenID_processor.input_ids.np[:actual_num_tokens] = [0] * actual_num_tokens
        # self.tokenID_processor.input_ids.copy_to_gpu(actual_num_tokens)
        # input_ids = self.tokenID_processor.input_ids.gpu[:actual_num_tokens]
        # input_ids = torch.zeros(actual_num_tokens, dtype=torch.int32, device=self.device)
        self.forward_vars["input_ids"].gpu[:bs].zero_()
        input_ids = self.forward_vars["input_ids"].gpu[:bs]

        self.run_model(input_ids)

        reset_forward_context()
        logger.debug(
            f"{self.label}: dummy batch executed with {actual_num_tokens} tokens"
        )
        return True

    def dummy_prefill_execution(self, num_tokens: int):
        """
        Execute dummy prefill batch for DP synchronization.
        """
        if num_tokens <= 0:
            num_tokens = 1
        seq = Sequence([0] * num_tokens, block_size=self.block_size)
        seqs = {seq.id: seq}

        dummy_batch = ScheduledBatch(
            seqs=seqs,
            num_scheduled_tokens=np.array([num_tokens], dtype=np.int32),
            total_tokens_num=num_tokens,
            total_tokens_num_prefill=num_tokens,
            total_seqs_num=1,
            total_seqs_num_prefill=1,
            is_dummy_run=True,
        )

        bs = self.prepare_inputs(dummy_batch)

        # self.tokenID_processor.input_ids.np[:num_tokens] = [0] * num_tokens
        # self.tokenID_processor.input_ids.copy_to_gpu(num_tokens)
        # input_ids = self.tokenID_processor.input_ids.gpu[:num_tokens]
        # input_ids= torch.zeros(num_tokens, dtype=torch.int32, device=self.device)
        self.forward_vars["input_ids"].gpu[:bs].zero_()
        input_ids = self.forward_vars["input_ids"].gpu[:bs]

        # not exe run_model and synchronize: acc 0.79

        with torch.no_grad():
            self.run_model(input_ids)

        torch.cuda.synchronize()

        reset_forward_context()

        logger.info(
            f"{self.label}: dummy PREFILL batch executed with {num_tokens} tokens"
        )
        return True

    def warmup_model(self):
        start_time = time.time()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        max_num_batched_tokens, max_model_len = (
            self.config.max_num_batched_tokens,
            self.config.max_model_len,
        )
        dp_size = get_dp_group().world_size
        warmup_max_tokens = max_num_batched_tokens // dp_size

        num_seqs = min(warmup_max_tokens // max_model_len, self.config.max_num_seqs)

        if num_seqs == 0:
            num_seqs = 1
            seq_len = min(warmup_max_tokens, max_model_len)
            if seq_len == 0:
                seq_len = 1
            logger.warning(
                f"{self.label}: DP size={dp_size} too large, warmup_max_tokens={warmup_max_tokens} < max_model_len={max_model_len}. "
                f"Using {num_seqs} seq with length {seq_len} for warmup."
            )
        else:
            seq_len = max_model_len

        seqs = [
            Sequence([0] * seq_len, block_size=self.block_size) for _ in range(num_seqs)
        ]
        seqs = {seq.id: seq for seq in seqs}

        num_scheduled_tokens = np.array([seq_len] * num_seqs, dtype=np.int32)
        total_tokens_num = int(num_scheduled_tokens.sum())

        dummy_batch = ScheduledBatch(
            seqs=seqs,
            num_scheduled_tokens=num_scheduled_tokens,
            total_tokens_num=total_tokens_num,
            total_tokens_num_prefill=total_tokens_num,
            total_seqs_num=num_seqs,
            total_seqs_num_prefill=num_seqs,
            is_dummy_run=True,
        )
        self.forward(dummy_batch)
        self.tokenID_processor.clean()
        torch.cuda.empty_cache()
        logger.info(
            f"{self.label}: warmup_model {time.time() - start_time:.2f} seconds with {num_seqs} reqs {total_tokens_num} tokens"
        )

    def allocate_forward_vars(self):
        config = self.config
        hidden_size = config.hf_config.hidden_size
        hidden_type = config.torch_dtype
        self.max_bs = self.config.max_num_seqs
        self.max_num_batched_tokens = config.max_num_batched_tokens
        i64_kwargs = {"dtype": torch.int64, "device": self.device}
        i32_kwargs = {"dtype": torch.int32, "device": self.device}
        f32_kwargs = {"dtype": torch.float, "device": self.device}

        # TODO: remove it in forward_context
        self.forward_vars = {
            "input_ids": self.tokenID_processor.input_ids,
            "positions": CpuGpuBuffer(self.max_num_batched_tokens, **i64_kwargs),
            "temperatures": CpuGpuBuffer(self.max_bs, **f32_kwargs),
            "top_ks": CpuGpuBuffer(self.max_bs, **i32_kwargs),
            "top_ps": CpuGpuBuffer(self.max_bs, **f32_kwargs),
            # Keep enough space for MTP decode (max_q_len > 1).
            "outputs": torch.empty(
                self.max_num_batched_tokens, hidden_size, dtype=hidden_type
            ),
        }
        if hasattr(self, "drafter"):
            self.forward_vars["mtp_k"] = self.drafter.mtp_k
            self.forward_vars["num_accepted_tokens"] = CpuGpuBuffer(
                self.max_bs, **i32_kwargs
            )

    def _get_num_kv_heads(self):
        """Return the per-rank number of KV heads."""
        hf_config = self.config.hf_config
        if hf_config.num_key_value_heads >= self.world_size:
            assert hf_config.num_key_value_heads % self.world_size == 0
            return hf_config.num_key_value_heads // self.world_size
        else:
            assert self.world_size % hf_config.num_key_value_heads == 0
            return 1

    def _get_total_num_layers(self):
        """Return total layer count including draft (MTP) layers."""
        total = self.config.hf_config.num_hidden_layers
        if self.config.speculative_config and hasattr(self, "drafter"):
            draft_hf = self.config.speculative_config.draft_model_hf_config
            total += getattr(draft_hf, "num_nextn_predict_layers", 1)
        return total

    def _compute_block_bytes(self):
        """Compute the TRUE per-block memory cost including all tensors.

        This must match exactly what allocate_kv_cache() allocates.
        Includes: kv_cache tensor + kv_scale tensor + draft model layers.
        """
        config = self.config
        hf_config = config.hf_config
        num_kv_heads = self._get_num_kv_heads()
        total_num_layers = self._get_total_num_layers()
        kv_dtype_size = dtypes.d_dtypes[config.kv_cache_dtype].itemsize

        if self.use_mla:
            # MLA: shape [total_layers, blocks, block_size, 576]
            # No kv_scale for MLA
            block_bytes = total_num_layers * self.block_size * 576 * kv_dtype_size
            if self.is_deepseek_v32:
                index_dim = hf_config.index_head_dim + 4
                aligned_index_dim = ((index_dim + 15) // 16) * 16
                block_bytes += (
                    hf_config.num_hidden_layers
                    * self.block_size
                    * aligned_index_dim
                    * dtypes.fp8.itemsize
                )
        elif self.is_qwen_next():
            self.full_attention_interval = hf_config.full_attention_interval
            self.num_full_attn = (
                hf_config.num_hidden_layers // self.full_attention_interval
            )
            self.num_gdn_attn_state = hf_config.num_hidden_layers - self.num_full_attn
            num_draft_layers = total_num_layers - hf_config.num_hidden_layers
            full_attn_layers = self.num_full_attn + num_draft_layers

            # full attention kv_cache bytes
            block_bytes = (
                2
                * full_attn_layers
                * self.physical_block_size
                * num_kv_heads
                * hf_config.head_dim
                * kv_dtype_size
            )

            # kv_scale for full attention: [2, full_attn_layers, blocks, kv_heads, phys_block_size] float32
            block_bytes += (
                2
                * full_attn_layers
                * num_kv_heads
                * self.physical_block_size
                * 4  # float32
            )

            # gdn attn bytes
            mamba_shape = self.gated_delta_net_state_shape(
                get_tp_group().world_size,
                hf_config.linear_num_key_heads,
                hf_config.linear_num_value_heads,
                hf_config.linear_key_head_dim,
                hf_config.linear_value_head_dim,
                hf_config.linear_conv_kernel_dim,
                self.num_spec_tokens,
            )
            one_layer_byte = (
                sum(math.prod(subtuple) for subtuple in mamba_shape) * kv_dtype_size
            )
            block_bytes += self.num_gdn_attn_state * one_layer_byte
        else:
            # Standard attention: kv_cache [2, num_hidden_layers, blocks, ...]