import mlx.core as mx
from mlx_lm.tokenizer_utils import TokenizerWrapper
from .kv_cache import BatchingKvCache, TinyKvFullCache
from .qwen2_week2 import Qwen2ModelWeek2
from typing import Any
from datetime import datetime


def _step(model, y, offsets, kv_cache):
    # One forward pass over the batch; keep only the logits at the last position.
    logits = model(y, offsets, kv_cache)
    logits = logits[:, -1, :]
    # Normalize to log-probabilities per sequence before sampling (greedy argmax).
    logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
    sampler = lambda x: mx.argmax(x, axis=-1)
    y = sampler(logprobs)
    return y


class Request:
    def __init__(
        self,
        model: Any,
        tokenizer: TokenizerWrapper,
        prompt: str,
        prefill_max_step: int = 128,
        prompt_idx: int = 0,
    ):
        self.prompt = prompt
        # One KV cache per transformer layer, owned by this request during prefill.
        self.kv_cache = [TinyKvFullCache() for _ in range(model.num_hidden_layers)]
        self.model = model
        # Give each request its own detokenizer instance so that streaming
        # state is not shared across concurrent requests.
        self.detokenizer = tokenizer.detokenizer.__class__(tokenizer._tokenizer)
        self.prefill_tokens = mx.array(
            tokenizer.encode(prompt, add_special_tokens=False)
        )
        self.prefill_max_step = prefill_max_step
        self.is_done = False
        self.is_prefill_done = False
        self.eos_token_id = tokenizer.eos_token_id
        self.next_token = None
        self.offset = 0
        self.prompt_idx = prompt_idx

    def try_prefill(self):
        """
        Prefill this request by up to `prefill_max_step` tokens per call.
        Returns None if the prefill is not yet complete.
        """
        if self.is_prefill_done:
            raise ValueError("prefill called after done")
        # TODO: in task 4, prefill the full request at once; in task 5, prefill a chunk at a time
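        # A minimal sketch of one possible solution, assuming the model
        # callable takes (tokens, offsets, kv_cache) exactly as `_step` passes
        # them. Task 4 behavior falls out of choosing `prefill_max_step` >= the
        # prompt length; otherwise the prompt is prefilled chunk by chunk (task 5).
        remaining = self.prefill_tokens.size - self.offset
        step = min(self.prefill_max_step, remaining)
        tokens = self.prefill_tokens[self.offset : self.offset + step].reshape(1, -1)
        if step == remaining:
            # Final chunk: sample the first generated token from the last logits.
            self.next_token = _step(
                self.model, tokens, mx.array([self.offset]), self.kv_cache
            )
            mx.eval(self.next_token)
            self.is_prefill_done = True
        else:
            # Intermediate chunk: run the model just to populate the KV cache.
            mx.eval(self.model(tokens, mx.array([self.offset]), self.kv_cache))
        self.offset += step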

    def decode_done(self, token, update_offset=True):
        if self.is_done:
            raise ValueError("decode called after done")
        if token == self.eos_token_id:
            self.is_done = True
            return
        # TODO: update the offset and add the token to the detokenizer
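        # A minimal sketch: advance the sequence offset (unless the caller
        # manages offsets itself) and feed the token to this request's
        # streaming detokenizer.
        if update_offset:
            self.offset += 1
        self.detokenizer.add_token(token)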

    def text(self):
        return self.detokenizer.text


def _print_progress(
    requests: list[Request | None],
    is_idle: list[bool],
    pending_prefill_request: Request | None,
    queue_size: int,
    progress_cnt: int,
    start_time: datetime,
):
    print(f" --- {datetime.now() - start_time}")
    animation_frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
    animation_frame = animation_frames[progress_cnt % len(animation_frames)]
    for i in range(len(requests)):
        if is_idle[i]:
            print(f" Decode #{i}: idle", flush=True)
        else:
            text_preview = requests[i].text()[-80:].replace("\n", " ")
            print(
                f"{animation_frame} Decode [req {requests[i].prompt_idx}, {requests[i].offset}]: {text_preview}",
                flush=True,
            )
    if pending_prefill_request is not None:
        if pending_prefill_request.is_prefill_done:
            print(
                f" Prefill [req {pending_prefill_request.prompt_idx}]: done, waiting for slot, {queue_size} requests in queue",
                flush=True,
            )
            return
        percentage = (
            pending_prefill_request.offset / pending_prefill_request.prefill_tokens.size
        ) * 100
        print(
            f"{animation_frame} Prefill [req {pending_prefill_request.prompt_idx}]: {percentage:.2f}% ({pending_prefill_request.prefill_tokens.size - pending_prefill_request.offset} remaining tokens)",
            flush=True,
        )
    else:
        print(f" Prefill: idle, {queue_size} requests in queue", flush=True)


def batch_generate(
    model: Any,
    tokenizer: TokenizerWrapper,
    prompts: list[str],
    max_seq_len=512,
    batch_size=5,
    prefill_step=128,
):
    decode_requests: list[Request | None] = [None] * batch_size
    is_idle = [True] * batch_size
    # One batching KV cache per layer, shared by all active decode slots.
    kv_cache = [
        BatchingKvCache(max_active_requests=batch_size, max_seq_len=max_seq_len)
        for _ in range(model.num_hidden_layers)
    ]
    result = []
    pending_prefill_request = None
    next_request_idx = 0
    progress_cnt = 0
    start_time = datetime.now()

    while True:
        if len(prompts) == 0 and all(is_idle):
            break
        # If the prefill slot is free, pop the next prompt and start prefilling it
        if len(prompts) > 0 and pending_prefill_request is None:
            prompt = prompts.pop(0)
            pending_prefill_request = Request(
                model, tokenizer, prompt, prefill_step, next_request_idx
            )
            next_request_idx += 1

        # In every iteration, we do a prefill step first
        if pending_prefill_request is not None:
            made_progress = False
            if not pending_prefill_request.is_prefill_done:
                pending_prefill_request.try_prefill()
                made_progress = True
            if pending_prefill_request.is_prefill_done:
                # Implement this: find an idle slot and add the request to the decode requests
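                # A minimal sketch, assuming BatchingKvCache exposes an
                # `add_request(cache, slot)` method (a course-internal API from
                # the kv_cache module, not verified here): claim the first idle
                # slot and hand over the request's per-layer prefilled caches.
                for i in range(batch_size):
                    if is_idle[i]:
                        for prefilled, batch_cache in zip(
                            pending_prefill_request.kv_cache, kv_cache
                        ):
                            batch_cache.add_request(prefilled, i)
                        decode_requests[i] = pending_prefill_request
                        is_idle[i] = False
                        pending_prefill_request = None
                        break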
            if made_progress:
                _print_progress(
                    decode_requests,
                    is_idle,
                    pending_prefill_request,
                    len(prompts),
                    progress_cnt,
                    start_time,
                )
                progress_cnt += 1

        # After the prefill request moves forward one step, we do the decode
        if not all(is_idle):
            next_tokens = []
            offsets = []
            # TODO: collect the next tokens and offsets from the decode requests
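            # A minimal sketch: every slot contributes one entry so the batch
            # shape stays fixed; idle slots get a dummy token at offset 0,
            # which the batching KV cache is assumed to mask out.
            for i in range(batch_size):
                if is_idle[i]:
                    next_tokens.append(0)
                    offsets.append(0)
                else:
                    next_tokens.append(decode_requests[i].next_token.item())
                    offsets.append(decode_requests[i].offset)
            next_tokens = mx.array(next_tokens)
            offsets = mx.array(offsets)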
            next_tokens = _step(model, next_tokens.reshape(-1, 1), offsets, kv_cache)
            for i in range(batch_size):
                # TODO: check if the decode has finished by comparing against EOS or
                # the sequence length. If so, remove the request from the decode
                # requests and add the result to the result list; otherwise, call
                # `decode_done` to update the offset and add the token to the detokenizer
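                # A minimal sketch: finish a request on EOS or once it reaches
                # max_seq_len. `remove_request(slot)` is assumed to be the
                # BatchingKvCache counterpart of `add_request`; results carry
                # prompt_idx so callers can restore submission order.
                if is_idle[i]:
                    continue
                request = decode_requests[i]
                request.decode_done(next_tokens[i].item())
                if request.is_done or request.offset >= max_seq_len:
                    request.detokenizer.finalize()
                    result.append((request.prompt_idx, request.text()))
                    for batch_cache in kv_cache:
                        batch_cache.remove_request(i)
                    decode_requests[i] = None
                    is_idle[i] = True
                else:
                    request.next_token = next_tokens[i]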
            _print_progress(
                decode_requests,
                is_idle,
                pending_prefill_request,
                len(prompts),
                progress_cnt,
                start_time,
            )
            progress_cnt += 1
    return result
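

# A hypothetical usage sketch (not part of the original skeleton): the model
# wrapper construction and checkpoint name below are assumptions for
# illustration; mlx_lm's `load` returns an (mlx model, TokenizerWrapper) pair.
if __name__ == "__main__":
    from mlx_lm import load

    mlx_model, tokenizer = load("Qwen/Qwen2-0.5B-Instruct")
    model = Qwen2ModelWeek2(mlx_model)  # assumed wrapper signature
    prompts = ["What is MLX?", "Explain KV caching in one paragraph."]
    for idx, text in sorted(batch_generate(model, tokenizer, prompts)):
        print(f"=== Response {idx} ===\n{text}")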