
Commit a5be9f2

Author: Xu Xiong (committed)
Commit message: optimize code
Parent: 5b186f0

8 files changed

Lines changed: 90 additions & 1310 deletions


benchmarks/benchmark_speculative_decoding.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ def benchmark_inference(process_idx, args, result_pipe):
     drafter = MultiSSMDrafter(
         ssm_model_name="JackFram/llama-68m",
-        num_workers=2,
+        num_workers=1,
         device="cuda"
     )
     model = AutoDistributedSpeculativeModel.from_pretrained(

src/bloombee/models/llama/spe_dec_tree.py

Lines changed: 42 additions & 28 deletions
@@ -195,7 +195,11 @@ def prepare_incremental_tree_batch(
 ) -> Tuple[torch.Tensor, torch.Tensor, List[List[List[TreeNode]]]]:
     """
     Prepare an incremental tree batch, supporting different sequence lengths.
+    The attention mask is emitted directly as float scores: 0.0 means attendable, -65504.0 means masked.
     """
+    MASKED = -65504.0
+    ATTEND = 0.0
+
     batch_size = len(trees)
 
     if not trees or all(tree.total_nodes <= 1 for tree in trees):
@@ -228,64 +232,71 @@ def prepare_incremental_tree_batch(
         batch_tree_tokens.append(padded_tokens)
 
         tree_len = len(tree_token_ids)  # excludes root
-        inputs_len = tree_len + 1  # root + tree tokens
+        inputs_len = tree_len + 1  # root + tree tokens
 
         if is_prefill:
-            # ============ Prefill phase (unchanged) ============
+            # ============ Prefill phase ============
             past_len = input_ids.shape[1]
             total_len = past_len + tree_len
-            mask = torch.zeros(1, total_len, total_len, dtype=torch.bool, device=device)
+            mask = torch.full((1, total_len, total_len), MASKED, dtype=torch.float, device=device)
 
             prompt_len = curr_seq_len - 1 if curr_seq_len > 0 else 0
             root_pos = prompt_len
 
+            # Prompt part: causal mask
             if prompt_len > 0:
                 row_idx = torch.arange(prompt_len, device=device).view(-1, 1)
                 col_idx = torch.arange(prompt_len, device=device).view(1, -1)
-                causal_mask = row_idx >= col_idx
-                mask[0, :prompt_len, :prompt_len] = causal_mask
+                causal_mask = row_idx >= col_idx  # bool
+                mask[0, :prompt_len, :prompt_len] = torch.where(causal_mask, ATTEND, MASKED)
 
+            # Root attends to prompt + itself
             if prompt_len > 0:
-                mask[0, root_pos, :prompt_len] = True
-            mask[0, root_pos, root_pos] = True
+                mask[0, root_pos, :prompt_len] = ATTEND
+            mask[0, root_pos, root_pos] = ATTEND
 
+            # Tree tokens attend to prompt + root
             if tree_len > 0:
                 if prompt_len > 0:
-                    mask[0, past_len:past_len + tree_len, :prompt_len] = True
-                mask[0, past_len:past_len + tree_len, root_pos] = True
+                    mask[0, past_len:past_len + tree_len, :prompt_len] = ATTEND
+                mask[0, past_len:past_len + tree_len, root_pos] = ATTEND
 
+            # Among the tree tokens themselves
             if tree_len > 0:
                 tree_mask = build_tree_attention_mask_with_root(tree_len, parent_indices, device)
                 mask[0, past_len:past_len + tree_len, past_len:past_len + tree_len] = tree_mask
 
+            # Padding
             if tree_len < max_tree_size:
                 total_padded_len = past_len + max_tree_size
-                padded_mask = torch.zeros(1, total_padded_len, total_padded_len, dtype=torch.bool, device=device)
+                padded_mask = torch.full(
+                    (1, total_padded_len, total_padded_len), MASKED, dtype=torch.float, device=device
+                )
                 padded_mask[0, :total_len, :total_len] = mask[0]
+                # Padding rows attend to the prompt (to avoid NaN)
                 if curr_seq_len > 0:
-                    padded_mask[0, total_len:, :curr_seq_len] = True
+                    padded_mask[0, total_len:, :curr_seq_len] = ATTEND
                 mask = padded_mask
 
         else:
             # ============ Generation phase ============
-            # total length = cache + this round's inputs
             total_len = cache_len + inputs_len
+            mask = torch.full((1, inputs_len, total_len), MASKED, dtype=torch.float, device=device)
 
-            mask = torch.zeros(1, inputs_len, total_len, dtype=torch.bool, device=device)
-
-            # Compute the valid positions in the cache
-            cache_valid_mask = _compute_single_cache_valid_mask(
+            # Compute the valid positions in the cache (bool), then map them to scores
+            cache_valid_bool = _compute_single_cache_valid_mask(
                 kv_cache_position_ids[i], cache_len, device
-            )
-
+            )  # shape: (cache_len,), dtype: bool
+            cache_scores = torch.where(cache_valid_bool, ATTEND, MASKED)  # float scores
 
             # 1. Root attends to cache + itself
-            mask[0, 0, :cache_len] = cache_valid_mask
-            mask[0, 0, cache_len] = True  # root attends to itself
+            mask[0, 0, :cache_len] = cache_scores
+            mask[0, 0, cache_len] = ATTEND  # root attends to itself
 
             # 2. Tree tokens attend to cache + root
             if tree_len > 0:
-                mask[0, 1:inputs_len, :cache_len] = cache_valid_mask.unsqueeze(0).expand(tree_len, cache_len)
-                mask[0, 1:inputs_len, cache_len] = True  # tree tokens attend to root
+                mask[0, 1:inputs_len, :cache_len] = cache_scores.unsqueeze(0).expand(tree_len, cache_len)
+                mask[0, 1:inputs_len, cache_len] = ATTEND  # tree tokens attend to root
 
             # 3. Among the tree tokens themselves
             if tree_len > 0:
@@ -297,10 +308,12 @@ def prepare_incremental_tree_batch(
             if inputs_len < max_inputs_len:
                 pad_len = max_inputs_len - inputs_len
                 total_padded_len = cache_len + max_inputs_len
-                padded_mask = torch.zeros(1, max_inputs_len, total_padded_len, dtype=torch.bool, device=device)
+                padded_mask = torch.full(
+                    (1, max_inputs_len, total_padded_len), MASKED, dtype=torch.float, device=device
+                )
                 padded_mask[0, :inputs_len, :total_len] = mask[0]
                 # Padding rows attend to the cache (to avoid NaN)
-                padded_mask[0, inputs_len:, :cache_len] = cache_valid_mask.unsqueeze(0).expand(pad_len, cache_len)
+                padded_mask[0, inputs_len:, :cache_len] = cache_scores.unsqueeze(0).expand(pad_len, cache_len)
                 mask = padded_mask
 
         batch_attention_masks.append(mask)
@@ -358,16 +371,17 @@ def build_tree_attention_mask_with_root(
 ) -> torch.Tensor:
     """
     Build the attention mask among the tree tokens (root not included).
+    Returns a float score mask directly: 0.0 means attendable, -65504.0 means masked.
     """
-    mask = torch.zeros(tree_len, tree_len, dtype=torch.bool, device=device)
+    mask = torch.full((tree_len, tree_len), -65504.0, dtype=torch.float, device=device)
 
     for i in range(tree_len):
-        mask[i, i] = True
+        mask[i, i] = 0.0
         current = i
         while current >= 0:
             parent = parent_indices[current]
             if parent >= 0:
-                mask[i, parent] = True
+                mask[i, parent] = 0.0
                 current = parent
             else:
                 break
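
Note on the new masking convention: the masks above are no longer boolean; they hold additive float scores that are summed with the raw attention logits, so a masked position receives a very negative logit and effectively vanishes after softmax. Below is a minimal, self-contained sketch of the idea (illustrative only, not code from this repository; toy_tree_score_mask and the example parent_indices are made up, though the 0.0 / -65504.0 sentinel values match the diff):

import torch

MASKED, ATTEND = -65504.0, 0.0  # same sentinel values as in the diff above

def toy_tree_score_mask(parent_indices, device="cpu"):
    """Illustrative ancestor-path mask: token i may attend to itself and to
    every ancestor reachable through parent_indices (-1 marks a root child)."""
    n = len(parent_indices)
    mask = torch.full((n, n), MASKED, dtype=torch.float, device=device)
    for i in range(n):
        mask[i, i] = ATTEND
        current = i
        while parent_indices[current] >= 0:
            mask[i, parent_indices[current]] = ATTEND
            current = parent_indices[current]
    return mask

# Tiny draft tree: node 0 hangs off the root; nodes 1 and 2 are children of node 0.
score_mask = toy_tree_score_mask([-1, 0, 0])
print(score_mask)

# The mask is simply added to the raw attention logits; masked entries end up
# with ~0 probability after softmax, so no bool-to-score conversion is needed.
logits = torch.randn(3, 3)
print(torch.softmax(logits + score_mask, dim=-1))

Because the mask is already in score form, downstream consumers can add it to the logits directly instead of converting a boolean mask first, which is what the backend.py change further below relies on.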

src/bloombee/models/llama/speculative_model.py

Lines changed: 1 addition & 1 deletion
@@ -134,7 +134,7 @@ def _sample_with_session(
         has_printed_first_reach = False  # make sure this is printed only once
         sample_finish_times = [None] * batch_size
         sample_finished = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)
-        while not finished and (seq_lengths - initial_seq_lengths).min().item() < max_new_tokens:
+        while not finished and (seq_lengths - initial_seq_lengths).max().item() < max_new_tokens:
             # 1. Build speculative trees using SSM - pass in seq_lengths
             t1 = time.perf_counter()
             spec_trees = drafter.build_trees_parallel(
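
The one-line change above flips the loop's stopping rule: with .min() the batch kept decoding until the slowest sample had produced max_new_tokens new tokens, whereas with .max() the loop exits as soon as the fastest sample reaches the budget (per-sample completion is presumably still tracked via sample_finished). A toy illustration with made-up numbers:

import torch

generated = torch.tensor([5, 2, 7])   # hypothetical new-token counts per sample
max_new_tokens = 6

print(generated.min().item() < max_new_tokens)  # True  -> old condition: keep decoding
print(generated.max().item() < max_new_tokens)  # False -> new condition: exit the loop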

src/bloombee/server/backend.py

Lines changed: 25 additions & 7 deletions
@@ -141,7 +141,7 @@ def __init__(
             BatchTensorDescriptor((), dtype=self.dtype),
             BatchTensorDescriptor((), dtype=torch.int64),
             BatchTensorDescriptor(
-                1, 64, 64, dtype=self.dtype
+                1, 64, 64, dtype=torch.float
             ),  # tree_attention_mask
             BatchTensorDescriptor(
                 128, dtype=torch.int64
@@ -284,7 +284,7 @@ def inference_step( # Each block will execute once
 
         self._ensure_model_on_device()
 
-        # t0 = time.perf_counter()
+        t0 = time.perf_counter()
         with self.cache_manager.use_cache(
             *inference_info.cache_handles  # Use cache to reduce memory requirements
         ) as cache_tensors, self._peft_module.using_adapter(inference_info.active_adapter):  # Use adapter for inference
@@ -319,6 +319,8 @@ def _flag_to_bool(value) -> bool:
                         f"micro_batch_size={inference_info.micro_batch_size}, "
                         f"full_batch_size={inference_info.full_batch_size}")
 
+            t1 = time.perf_counter()
+
             if kv_cache_position_ids is not None and kv_cache_position_ids.numel() > 0:
                 k_pkv, v_pkv, cache_len = self.cache_manager.select_cache_without_reorder(
                     kv_cache_position_ids,
@@ -339,8 +341,8 @@ def _flag_to_bool(value) -> bool:
                 )
                 cache_len = k_pkv.shape[2] if k_pkv is not None else 0
 
-            # t2 = time.perf_counter()
-            # logger.info(f"inference_step: cache reorder (if needed) and selection took {t2 - t1:.4f} seconds")
+            t2 = time.perf_counter()
+            logger.info(f"inference_step: cache reorder (if needed) and selection took {t2 - t1:.4f} seconds")
 
             layer_past = (k_pkv, v_pkv) if k_pkv is not None else None
 
@@ -349,11 +351,15 @@ def _flag_to_bool(value) -> bool:
 
             if self._is_spec_decoding:
                 full_mask = inference_info.tree_attention_mask.to(device)
-                attention_mask = self.convert_mask_to_scores(full_mask) if full_mask is not None else None
+                attention_mask = full_mask
                 if full_mask == None:
                     full_mask = self._create_causal_attention_mask(batch_size, (seq_len + cache_len), cache_len, hidden_states.device)
                     attention_mask = self.convert_mask_to_scores(full_mask) if full_mask is not None else None
 
+            t3 = time.perf_counter()
+            logger.info(f"convert_mask_to_scores took {t3 - t2:.4f} seconds")
+
+
             for offset in range(0, seq_len, max_chunk_length):  # Iterate through sequence to process hidden states in chunks only run offset=0
                 hidden_states_chunk = hidden_states[:, offset : offset + max_chunk_length, :]  # Get current hidden states chunk
                 # print('transformer backend inference step() offset ', offset )
@@ -378,6 +384,9 @@ def _flag_to_bool(value) -> bool:
                         target_seq_len=seq_len)
                 else:
                     rotary_position_ids = None
+
+                t4 = time.perf_counter()
+                logger.info(f"_create_tree_position_ids_with_invalid_cache took {t4 - t3:.4f} seconds")
 
                 try:
                     # Fixed: Properly handle forward method return values with position_ids
@@ -391,8 +400,8 @@ def _flag_to_bool(value) -> bool:
                         rotary_position_ids=rotary_position_ids,
                     )
 
-                    # t5 = time.perf_counter()
-                    # logger.info(f"inference_step: module.forward call took {t5 - t4:.4f} seconds")
+                    t5 = time.perf_counter()
+                    logger.info(f"inference_step: module.forward call took {t5 - t4:.4f} seconds")
 
                     if forward_result is None:
                         logger.info(f" ERROR: module.forward returned None!")
@@ -438,6 +447,10 @@ def _flag_to_bool(value) -> bool:
                     batch_offset=inference_info.batch_offset,
                     full_batch_size=inference_info.full_batch_size,
                     micro_batch_size=inference_info.micro_batch_size,)
+
+            t6 = time.perf_counter()
+            logger.info(f"update_cache_and_async_reorder took {t6 - t5:.4f} seconds")
+
 
             keep_indices = self._normalize_keep_indices(
                 inference_info.keep_indices,
@@ -471,6 +484,11 @@ def _flag_to_bool(value) -> bool:
                 norm_hidden_states = self.module.rms_norm(output_hidden_states)
                 keep_indices = self.prune_draft_tree(norm_hidden_states, inference_info.draft_tokens, full_mask)
                 keep_indices = keep_indices
+                t7 = time.perf_counter()
+                logger.info(f"prune_draft_tree took {t7 - t6:.4f} seconds")
+
+
 
             if not training_mode and self._is_spec_decoding and self._is_last_block:
                 original_hidden_states = output_hidden_states
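
Most of the backend.py changes re-enable the previously commented-out timing probes: a time.perf_counter() checkpoint is recorded after each stage (cache selection, mask handling, position-id construction, module.forward, cache update, draft-tree pruning) and the per-stage delta is logged. A minimal sketch of that pattern, with hypothetical stage names and a simulate_stage stand-in that are not part of the repository:

import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("inference_step")

def simulate_stage(seconds: float) -> None:
    """Stand-in for a real stage such as cache selection or module.forward."""
    time.sleep(seconds)

# Record a perf_counter checkpoint after each stage and log the delta to the
# previous checkpoint, mirroring the t0..t7 checkpoints added in the diff above.
t_prev = time.perf_counter()
for stage_name, cost in [("cache selection", 0.010),
                         ("module.forward", 0.020),
                         ("prune_draft_tree", 0.005)]:
    simulate_stage(cost)
    t_now = time.perf_counter()
    logger.info(f"{stage_name} took {t_now - t_prev:.4f} seconds")
    t_prev = t_now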