Skip to content

Commit 5b186f0

Browse files
author
Xu Xiong
committed
llama13B
1 parent bca8ebd commit 5b186f0

5 files changed

Lines changed: 8 additions & 7 deletions

File tree

benchmarks/benchmark_speculative_decoding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def benchmark_inference(process_idx, args, result_pipe):
7373

7474
drafter = MultiSSMDrafter(
7575
ssm_model_name="JackFram/llama-68m",
76-
num_workers=4,
76+
num_workers=2,
7777
device="cuda"
7878
)
7979
model = AutoDistributedSpeculativeModel.from_pretrained(
@@ -88,6 +88,7 @@ def benchmark_inference(process_idx, args, result_pipe):
8888
test_prompts = []
8989
for item in sampled:
9090
test_prompts.append(item["instruction"])
91+
# test_prompts.append("Generate a list of the best places to eat in London.")
9192

9293
# base_prompt = (
9394
# "Quantum mechanics explains the behavior of particles at very small scales. "

src/bloombee/server/backend.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,8 +468,8 @@ def _flag_to_bool(value) -> bool:
468468
self.pruner_manager.train_lm_head(middle_norm_hidden_states, norm_hidden_states)
469469

470470
if not training_mode and self._is_spec_decoding and self._need_pruning and self._is_last_block:
471-
# norm_hidden_states = self.module.rms_norm(output_hidden_states)
472-
# keep_indices = self.prune_draft_tree(norm_hidden_states, inference_info.draft_tokens, full_mask)
471+
norm_hidden_states = self.module.rms_norm(output_hidden_states)
472+
keep_indices = self.prune_draft_tree(norm_hidden_states, inference_info.draft_tokens, full_mask)
473473
keep_indices = keep_indices
474474

475475
if not training_mode and self._is_spec_decoding and self._is_last_block:

src/bloombee/server/handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,7 @@ async def _cross_stage_push_wrapper(mb_hidden, mb_keep, push_metadata):
795795
push_tensor_bytes = sum(len(t.buffer) for t in next_tensors)
796796

797797
# Simulate network transmission delay
798-
NETWORK_SPEED_BYTES_PER_SEC = 10 * 1024 * 1024 # 10 MB/s
798+
NETWORK_SPEED_BYTES_PER_SEC = 50 * 1024 * 1024  # 50 MB/s
799799
transfer_delay = push_tensor_bytes / NETWORK_SPEED_BYTES_PER_SEC
800800
await asyncio.sleep(transfer_delay)
801801
task = asyncio.create_task(self._push_outputs(request, output_tensors, step_metadata))

src/bloombee/server/server.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,13 @@ def __init__(
324324
self.weight_home = array_1d(self.num_blocks, ValueHolder)
325325
self.path = os.path.join(tempfile.gettempdir(), 'data', 'llama_weights')
326326

327-
hidden_size = 6656
327+
hidden_size = 5120
328328
vocab_size = 32000
329329

330330
# Create configuration
331331
config = PruningConfig(
332332
method=PruningMethod.ADAPTIVE_NEURAL,
333-
neural_threshold=0.6,
333+
neural_threshold=0.5,
334334
simple_threshold=0.1
335335
)
336336

src/bloombee/server/speculative_pruner/adaptive_neural_pruner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(
5555
self.lm_head = MidLMHead(hidden_size=hidden_size, vocab_size=vocab_size).to("cuda")
5656
lm_head_weights_path = hf_hub_download(
5757
repo_id="xxiong59/lm-head-for-speculative-pruning",
58-
filename="lm_head_llama30B-15.pt",
58+
filename="lm_head_llama13B-20.pt",
5959
cache_dir="./cache"
6060
)
6161
lm_head_checkpoint = torch.load(lm_head_weights_path, map_location="cuda")

0 commit comments

Comments
 (0)