evaluate_sd_mb

Xu Xiong · Xu Xiong · commit 5cbbe854b448 · 2026-03-31T09:06:15.000Z
diff --git a/benchmarks/benchmark_speculative_decoding.py b/benchmarks/benchmark_speculative_decoding.py
@@ -74,25 +74,9 @@ def benchmark_inference(process_idx, args, result_pipe):
     indices = random.sample(range(len(dataset)), batch_size)
     sampled = dataset.select(indices)
     test_prompts = []
-    # for item in sampled:
-        # test_prompts.append(item["instruction"])
-        
-    base_prompt = (
-        "Quantum mechanics explains the behavior of particles at very small scales. "
-        "Neural networks learn patterns by adjusting weights through backpropagation. "
-        "Distributed systems require robust consensus mechanisms to maintain state. "
-        "Optimization algorithms like gradient descent are fundamental to machine learning. "
-        "Transformer architectures rely on attention mechanisms to capture dependencies. "
-        "Reinforcement learning optimizes actions by maximizing cumulative rewards. "
-        "Bayesian inference updates beliefs based on observed evidence and prior knowledge. "
-        "Convex optimization problems guarantee global minima under certain conditions. "
-        "Signal processing extracts meaningful information from noisy measurements. "
-    )
-    prompts = [
-        f"{base_prompt} Example {i + 1} discusses large-scale AI systems and scientific discovery."
-        for i in range(batch_size)
-    ]
-    test_prompts = prompts
+    for item in sampled:
+        test_prompts.append(item["instruction"])
+    
 
     tokenizer.pad_token = tokenizer.eos_token
     input_ids = tokenizer(test_prompts, return_tensors="pt", padding=True).to(device)["input_ids"]
diff --git a/eval_indices.json b/eval_indices.json
@@ -0,0 +1 @@
+[[41905, 7296, 1639, 48598, 18024, 16049, 14628, 9144, 48265, 6717, 44348, 48540, 35741, 5697, 38698, 27651, 2082, 1952, 6140, 14328, 15247, 33118, 39453, 1739, 36781, 13031, 46925, 42590, 45962, 35713, 27493, 14446], [29439, 38618, 18231, 425, 49729, 10463, 45753, 27696, 22298, 18210, 10189, 14110, 50036, 22059, 6698, 6078, 24898, 6338, 23526, 22541, 39565, 17335, 2847, 47823, 30108, 35142, 8180, 24807, 5164, 36178, 19213, 41198], [40535, 23700, 37837, 12601, 46174, 4558, 3003, 43336, 14935, 50663, 18965, 5229, 15256, 6619, 24911, 18217, 29714, 41660, 23909, 10659, 24260, 23283, 13730, 43920, 17496, 45994, 44796, 42469, 4679, 39920, 41613, 11215], [35005, 47784, 16043, 10708, 30294, 24867, 17691, 41943, 45099, 36500, 14392, 44866, 21252, 50352, 50855, 3665, 15010, 2103, 20673, 26290, 17546, 4337, 13826, 37170, 47049, 20622, 13934, 42954, 32717, 25928, 42129, 30071], [9363, 17359, 9150, 16162, 48823, 36789, 35322, 17219, 48956, 38311, 28077, 38242, 26175, 23723, 14373, 9065, 33392, 32343, 5957, 49530, 3087, 7185, 10016, 41120, 10484, 51909, 44596, 27666, 39086, 4163, 25216, 25009], [39052, 30674, 34676, 16476, 36256, 752, 44583, 47233, 7507, 44676, 35190, 49209, 17486, 50370, 42006, 22293, 7310, 19234, 28492, 10365, 29735, 212, 47323, 47164, 17261, 32806, 49935, 11708, 33271, 6973, 40979, 19558], [41874, 33270, 39909, 13035, 10016, 24504, 49971, 10587, 35348, 51028, 34757, 37, 39252, 21243, 32021, 1276, 7331, 23788, 20153, 15692, 3796, 15785, 37182, 5161, 5613, 47966, 31849, 4535, 49846, 34911, 50189, 8241], [8414, 43237, 31148, 36031, 10821, 17370, 34581, 39753, 27730, 13880, 35343, 49497, 47836, 45211, 13182, 46723, 20428, 26148, 44019, 42590, 24472, 28711, 33919, 29588, 7930, 16246, 14725, 4196, 22156, 1378, 38555, 36301], [15080, 38564, 14432, 471, 4652, 46389, 41359, 3858, 15003, 4417, 2058, 21654, 4643, 33695, 15597, 18250, 43842, 31812, 14040, 35339, 8671, 47405, 37423, 37762, 30976, 15925, 51420, 30996, 26677, 12478, 6181, 6352], [43187, 28249, 23219, 
27759, 26941, 30606, 47780, 3550, 44129, 42824, 42348, 6449, 3972, 26386, 47724, 22236, 7161, 16295, 12556, 12465, 35146, 29400, 9186, 27648, 12025, 18254, 30318, 16371, 4940, 29041, 36066, 6416]]
diff --git a/src/bloombee/models/llama/speculative_model.py b/src/bloombee/models/llama/speculative_model.py
@@ -35,8 +35,8 @@ def generate(
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         streamer: Optional["BaseStreamer"] = None,
-        beam_width: int = 1,
-        max_tree_depth: int = 4,
+        beam_width: int = 2,
+        max_tree_depth: int = 3,
         use_kv_cache: bool = True,
         kv_cache_window: int = 2048,
         max_new_tokens: int = 128,
diff --git a/src/bloombee/server/backend.py b/src/bloombee/server/backend.py
@@ -369,8 +369,8 @@ def _flag_to_bool(value) -> bool:
                     position_ids = self._position_ids_cache[cache_key] + (cache_len + offset)
                     if self._is_spec_decoding:
                         rotary_position_ids = self._create_tree_position_ids_with_invalid_cache(
-                            width=1,
-                            depth=4,
+                            width=2,
+                            depth=3,
                             prefill_length=inference_info.prefill_length - 1,
                             kv_cache_position_ids=kv_cache_position_ids,
                             batch_offset=inference_info.batch_offset,
@@ -472,15 +472,7 @@ def _flag_to_bool(value) -> bool:
                     keep_indices = self.prune_draft_tree(norm_hidden_states, inference_info.draft_tokens, full_mask)
                     keep_indices = keep_indices
                     
-                if not training_mode and self._is_spec_decoding and self._is_last_block:
-                    original_hidden_states = output_hidden_states
-                    batch_size, seq_len, hidden_size = original_hidden_states.shape
-                    device = original_hidden_states.device
-                    valid_mask = keep_indices >= 0
-                    batch_idx = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(keep_indices)
-                    valid_hidden_states = original_hidden_states[batch_idx[valid_mask], keep_indices[valid_mask], :]
-                    output_hidden_states = valid_hidden_states.unsqueeze(0)
-                    
+                
                 self._last_keep_indices = keep_indices + cache_len
                 return (output_hidden_states, keep_indices) # Return output hidden states
                 
diff --git a/src/bloombee/server/block_functions.py b/src/bloombee/server/block_functions.py
@@ -1571,8 +1571,21 @@ async def process_microbatch_merged(mb_idx: int, mb_start: int, mb_end: int, tot
                     micro_hidden_list = [r[0] for r in results]
                     micro_keep_list = [r[1] for r in results]
                     
+
+                    padded_keep_list = []
+                    for keep_indices in micro_keep_list:
+                        current_len = keep_indices.shape[1]  # size of dim 1 (currently 1)
+                        pad_size = length_increment - current_len
+                        if pad_size > 0:
+                            # pad shape: [batch_size, pad_size]
+                            pad_shape = (keep_indices.shape[0], pad_size)
+                            padding = torch.full(pad_shape, -1, dtype=keep_indices.dtype, device=keep_indices.device)
+                            keep_indices = torch.cat([keep_indices, padding], dim=1)  # -> [keep_indices.shape[0], length_increment]
+                        padded_keep_list.append(keep_indices)
+                    
+                    
                     hidden_states = merge_microbatch_outputs(micro_hidden_list, dim=0)
-                    keep_indices = merge_microbatch_outputs(micro_keep_list, dim=0)
+                    keep_indices = merge_microbatch_outputs(padded_keep_list, dim=0)
                     
                     # Calculate overlap statistics
                     total_pipeline_time = (pipeline_end_time - pipeline_start_time) * 1000  # ms
@@ -1860,6 +1873,16 @@ async def process_microbatch(mb_idx: int, mb_start: int, mb_end: int):
                 dtype=torch.int64,
                 device=hidden_states.device
             ).unsqueeze(0).expand(hidden_states.shape[0], -1)
+            
+        if is_spec_dec:
+            original_hidden_states = hidden_states
+            batch_size, seq_len, hidden_size = original_hidden_states.shape
+            device = original_hidden_states.device
+            valid_mask = keep_indices >= 0
+            batch_idx = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(keep_indices)
+            valid_hidden_states = original_hidden_states[batch_idx[valid_mask], keep_indices[valid_mask], :]
+            hidden_states = valid_hidden_states.unsqueeze(0)
+            
         
         serialize_start = perf_counter()
         need_pruning_next = torch.tensor(0)
diff --git a/src/bloombee/server/server.py b/src/bloombee/server/server.py
@@ -292,7 +292,7 @@ def __init__(
 
         self.policy = Policy(
             gpu_batch_size, 1,        # gpu_batch_size controls GPU cache allocation
-            50, 50,                   # w_gpu_percent, w_cpu_percent
+            100, 0,                   # w_gpu_percent, w_cpu_percent
             100, 0,                   # cache_gpu_percent=100% (GPU cache only holds micro_batch_size slots)
             100, 0,                   # act_gpu_percent, act_cpu_percent (activations on GPU)
             overlap=False, sep_layer=True, pin_weight=True,
@@ -324,13 +324,13 @@ def __init__(
         self.weight_home = array_1d(self.num_blocks, ValueHolder)
         self.path = os.path.join(tempfile.gettempdir(), 'data', 'llama_weights')
         
-        hidden_size = 4096
+        hidden_size = 6656
         vocab_size = 32000
         
         # Create configuration
         config = PruningConfig(
             method=PruningMethod.ADAPTIVE_NEURAL,
-            neural_threshold=0.9,
+            neural_threshold=0.5,
             simple_threshold=0.1
         )
         
diff --git a/src/bloombee/server/speculative_pruner/adaptive_neural_pruner.py b/src/bloombee/server/speculative_pruner/adaptive_neural_pruner.py
@@ -55,7 +55,7 @@ def __init__(
         self.lm_head = MidLMHead(hidden_size=hidden_size, vocab_size=vocab_size).to("cuda")
         lm_head_weights_path = hf_hub_download(
             repo_id="xxiong59/lm-head-for-speculative-pruning",
-            filename="lm_head_weights_15.pt",
+            filename="lm_head_llama30B-15.pt",
             cache_dir="./cache"
         )
         lm_head_checkpoint = torch.load(lm_head_weights_path, map_location="cuda")
diff --git a/src/bloombee/server/speculative_pruner/pruner_manager.py b/src/bloombee/server/speculative_pruner/pruner_manager.py
@@ -42,7 +42,7 @@ def __init__(
         self.iteration = 0
         self.middle_states = None
         
-        train_lm_head_mode = True
+        train_lm_head_mode = False
         self.lm_head_trainer = LM_head_trainer(hidden_size, vocab_size, device, config) if train_lm_head_mode else None
         
     def switch_method(self, method: Union[str, PruningMethod], keep_stats: bool = False):
diff --git a/src/bloombee/utils/microbatch_config.py b/src/bloombee/utils/microbatch_config.py
@@ -24,7 +24,7 @@
 
 # Default values
 # Micro-batch size for pipeline overlap. Each micro-batch writes to its own slice of the KV cache.
-DEFAULT_MICRO_BATCH_SIZE = 0  # Default micro-batch size for pipeline overlap
+DEFAULT_MICRO_BATCH_SIZE = 16  # Default micro-batch size for pipeline overlap
 
 
 def _is_microbatch_flag_enabled() -> bool:

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+[[41905, 7296, 1639, 48598, 18024, 16049, 14628, 9144, 48265, 6717, 44348, 48540, 35741, 5697, 38698, 27651, 2082, 1952, 6140, 14328, 15247, 33118, 39453, 1739, 36781, 13031, 46925, 42590, 45962, 35713, 27493, 14446], [29439, 38618, 18231, 425, 49729, 10463, 45753, 27696, 22298, 18210, 10189, 14110, 50036, 22059, 6698, 6078, 24898, 6338, 23526, 22541, 39565, 17335, 2847, 47823, 30108, 35142, 8180, 24807, 5164, 36178, 19213, 41198], [40535, 23700, 37837, 12601, 46174, 4558, 3003, 43336, 14935, 50663, 18965, 5229, 15256, 6619, 24911, 18217, 29714, 41660, 23909, 10659, 24260, 23283, 13730, 43920, 17496, 45994, 44796, 42469, 4679, 39920, 41613, 11215], [35005, 47784, 16043, 10708, 30294, 24867, 17691, 41943, 45099, 36500, 14392, 44866, 21252, 50352, 50855, 3665, 15010, 2103, 20673, 26290, 17546, 4337, 13826, 37170, 47049, 20622, 13934, 42954, 32717, 25928, 42129, 30071], [9363, 17359, 9150, 16162, 48823, 36789, 35322, 17219, 48956, 38311, 28077, 38242, 26175, 23723, 14373, 9065, 33392, 32343, 5957, 49530, 3087, 7185, 10016, 41120, 10484, 51909, 44596, 27666, 39086, 4163, 25216, 25009], [39052, 30674, 34676, 16476, 36256, 752, 44583, 47233, 7507, 44676, 35190, 49209, 17486, 50370, 42006, 22293, 7310, 19234, 28492, 10365, 29735, 212, 47323, 47164, 17261, 32806, 49935, 11708, 33271, 6973, 40979, 19558], [41874, 33270, 39909, 13035, 10016, 24504, 49971, 10587, 35348, 51028, 34757, 37, 39252, 21243, 32021, 1276, 7331, 23788, 20153, 15692, 3796, 15785, 37182, 5161, 5613, 47966, 31849, 4535, 49846, 34911, 50189, 8241], [8414, 43237, 31148, 36031, 10821, 17370, 34581, 39753, 27730, 13880, 35343, 49497, 47836, 45211, 13182, 46723, 20428, 26148, 44019, 42590, 24472, 28711, 33919, 29588, 7930, 16246, 14725, 4196, 22156, 1378, 38555, 36301], [15080, 38564, 14432, 471, 4652, 46389, 41359, 3858, 15003, 4417, 2058, 21654, 4643, 33695, 15597, 18250, 43842, 31812, 14040, 35339, 8671, 47405, 37423, 37762, 30976, 15925, 51420, 30996, 26677, 12478, 6181, 6352], [43187, 28249, 23219, 
27759, 26941, 30606, 47780, 3550, 44129, 42824, 42348, 6449, 3972, 26386, 47724, 22236, 7161, 16295, 12556, 12465, 35146, 29400, 9186, 27648, 12025, 18254, 30318, 16371, 4940, 29041, 36066, 6416]]
Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@ def __init__(`
`55`	`55`	`self.lm_head = MidLMHead(hidden_size=hidden_size, vocab_size=vocab_size).to("cuda")`
`56`	`56`	`lm_head_weights_path = hf_hub_download(`
`57`	`57`	`repo_id="xxiong59/lm-head-for-speculative-pruning",`
`58`		`- filename="lm_head_weights_15.pt",`
	`58`	`+ filename="lm_head_llama30B-15.pt",`
`59`	`59`	`cache_dir="./cache"`
`60`	`60`	`)`
`61`	`61`	`lm_head_checkpoint = torch.load(lm_head_weights_path, map_location="cuda")`