Skip to content

Commit c8b346d

Browse files
author
Xu Xiong
committed
llama-30B
1 parent c799935 commit c8b346d

3 files changed

Lines changed: 4 additions & 4 deletions

File tree

src/bloombee/server/handler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -795,8 +795,8 @@ async def _cross_stage_push_wrapper(mb_hidden, mb_keep, push_metadata):
795795
push_tensor_bytes = sum(len(t.buffer) for t in next_tensors)
796796

797797
# 模拟网络传输延时 (simulate network transfer latency)
798-
NETWORK_SPEED_BYTES_PER_SEC = 31.25 * 1024 * 1024
799-
transfer_delay = push_tensor_bytes / NETWORK_SPEED_BYTES_PER_SEC + 0.05
798+
NETWORK_SPEED_BYTES_PER_SEC = 15.6 * 1024 * 1024
799+
transfer_delay = push_tensor_bytes / NETWORK_SPEED_BYTES_PER_SEC + 0.075
800800
await asyncio.sleep(transfer_delay)
801801
task = asyncio.create_task(self._push_outputs(request, output_tensors, step_metadata))
802802
background_tasks.add(task) # Keep reference until it is done to save it from GC

src/bloombee/server/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def __init__(
324324
self.weight_home = array_1d(self.num_blocks, ValueHolder)
325325
self.path = os.path.join(tempfile.gettempdir(), 'data', 'llama_weights')
326326

327-
hidden_size = 5120
327+
hidden_size = 6656
328328
vocab_size = 32000
329329

330330
# Create configuration

src/bloombee/server/speculative_pruner/adaptive_neural_pruner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(
5555
self.lm_head = MidLMHead(hidden_size=hidden_size, vocab_size=vocab_size).to("cuda")
5656
lm_head_weights_path = hf_hub_download(
5757
repo_id="xxiong59/lm-head-for-speculative-pruning",
58-
filename="lm_head_llama13B-20.pt",
58+
filename="lm_head_llama30B-20.pt",
5959
cache_dir="./cache"
6060
)
6161
lm_head_checkpoint = torch.load(lm_head_weights_path, map_location="cuda")

0 commit comments

Comments (0)