@@ -1851,7 +1851,7 @@ def _prepare_chunked_prefill(self, req_id: str):
 
         # Case III
 
-        No left paddings and more than one chunk
+        No left padding and more than one chunk
 
         13 tokens
         4 blocks
@@ -1862,8 +1862,8 @@ def _prepare_chunked_prefill(self, req_id: str):
 
         NOTE: The goal of this "illustration" is to depict strategies to write
         code to create the chunks, not necessarily enumerate the possible
-        scenario. Of course there are interpretations where these cases
-        overlaps.
+        scenarios. Of course there are interpretations where these cases
+        overlap.
 
         '''
 
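To make the chunking cases above concrete, here is a small standalone sketch (not code from this file) of how a 13-token prompt maps onto 4 blocks when chunks are block-aligned and there is no left padding, as in Case III; `split_into_chunks` is a hypothetical helper, not a function from this repository:

```python
# Illustrative only: chunk a prompt into block-sized pieces with no
# left padding (Case III of the docstring above).
import math

def split_into_chunks(prompt_len: int, block_size: int) -> list[range]:
    """Return one range of token positions per block-sized chunk."""
    n_blocks = math.ceil(prompt_len / block_size)
    return [
        range(i * block_size, min((i + 1) * block_size, prompt_len))
        for i in range(n_blocks)
    ]

# 13 tokens with block_size=4 -> 4 blocks, the last one only partly full:
# [range(0, 4), range(4, 8), range(8, 12), range(12, 13)]
print(split_into_chunks(13, 4))
```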
@@ -2105,7 +2105,7 @@ def add_new_request(self, request: NewRequestData):
         new_tokens = (sampling_params.max_tokens
                       if sampling_params is not None else 0)
         total_tokens = prompt_len + new_tokens - 1
-        # subtract the padding blocks from the reserved blocks
+        # calculate the number of reserved blocks
         n_reserved_blocks = math.ceil(total_tokens / self.block_size)
 
         self.req_ids2num_reserved_blocks[req_id] = n_reserved_blocks
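As a quick sanity check of the reservation formula in this hunk, the arithmetic for an illustrative request looks like this (the values are made up, and the reading of the `-1` is our assumption, not stated in the diff):

```python
import math

# Made-up request values, chosen to match the 13-token example above.
block_size = 4
prompt_len = 13
max_tokens = 4                     # sampling_params.max_tokens

# prompt + new tokens, minus one: we assume the last sampled token is
# returned but never written back, so it needs no KV-cache slot.
total_tokens = prompt_len + max_tokens - 1          # 16
n_reserved_blocks = math.ceil(total_tokens / block_size)
print(n_reserved_blocks)                            # 4
```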
@@ -2216,8 +2216,7 @@ def check_incomplete_prefill(self, scheduler_output: SchedulerOutput):
             return False
 
         # possible prefill
-        req_id = new_reqs[0].req_id if \
-            len(new_reqs) == 1 else \
+        req_id = new_reqs[0].req_id if len(new_reqs) == 1 else \
             cached_reqs.req_ids[0]
 
         num_scheduled_tokens = \
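The hunk above only shows how the request id is picked; the body of the check is not part of the diff. A rough guess at the overall shape of an incomplete-prefill test, with `is_incomplete_prefill` and all of its parameters being hypothetical stand-ins:

```python
# Assumed shape of the check; the real method body is not shown in this
# hunk, so the names and signature here are hypothetical.
def is_incomplete_prefill(num_computed_tokens: int,
                          num_scheduled_tokens: int,
                          prompt_len: int) -> bool:
    # The prefill is still incomplete if, even after this step's chunk,
    # fewer tokens than the full prompt will have been computed.
    return num_computed_tokens + num_scheduled_tokens < prompt_len

# 8 tokens done, 4 scheduled now, 13 in the prompt -> one chunk remains.
print(is_incomplete_prefill(8, 4, 13))   # True
```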
@@ -2302,6 +2301,9 @@ def execute_model(
         if not self.is_driver_worker:
             return self.get_empty_output()
 
+        t1 = time.time() - t0
+        logger.debug("t_forward_pass: %.2fms [prefill single chunk]",
+                     (t1 * 1000))
         return CPSpyreModelRunnerOutput(
             req_ids=list(req_id_to_index.keys()),
             req_id_to_index=req_id_to_index,
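The added lines follow the usual stdlib timing-plus-lazy-logging pattern; reduced to a self-contained example (the logger setup here is ours, the runner obtains its logger elsewhere):

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

t0 = time.time()
sum(x * x for x in range(1_000_000))    # stand-in for the forward pass
t1 = time.time() - t0
# %-style arguments keep formatting lazy: it only happens if DEBUG
# logging is actually enabled.
logger.debug("t_forward_pass: %.2fms [prefill single chunk]", t1 * 1000)
```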
@@ -2319,19 +2321,14 @@ def execute_model(
             sampling_metadata=self.get_sampling_metadata(is_prefill),
         )
         t1 = time.time() - t0
-        logger.debug("t_token: %.2fms", (t1 * 1000))
-
-        # Add the sampled token(s) to the request cache
-        req_ids = ([r.req_id for r in scheduler_output.scheduled_new_reqs]
-                   if len(scheduler_output.scheduled_new_reqs) > 0 \
-                   else self.input_batch.sorted_requests_ids)
+        step_type = "[prefill last chunk]" if is_prefill else "[decode]"
+        logger.debug("t_token: %.2fms %s", (t1 * 1000), step_type)
 
         # Get the right batch: if this is the last chunk to conclude the
         # prefill, we'll generate a token and we should get it from the
         # prefill batch, because input_batch may have other requests that
         # were not processed at this step.
-        batch = self.prefill_batch if is_prefill \
-            else self.input_batch
+        batch = self.prefill_batch if is_prefill else self.input_batch
 
         # Add the sampled token(s) to the request cache
         req_ids = ([r.req_id for r in scheduler_output.scheduled_new_reqs]
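To see why the comment above insists on reading from the prefill batch, here is a toy model (none of these classes are the real vllm-spyre ones) in which `input_batch` still holds a request that was not processed in this step:

```python
from dataclasses import dataclass, field

@dataclass
class Batch:
    # Toy stand-in; the real batch types carry far more state.
    req_ids: list = field(default_factory=list)

prefill_batch = Batch(req_ids=["req-a"])          # only the request just prefilled
input_batch = Batch(req_ids=["req-a", "req-b"])   # "req-b" was not run this step

is_prefill = True                                 # last prefill chunk just finished
batch = prefill_batch if is_prefill else input_batch
print(batch.req_ids)                              # ['req-a']
```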