Skip to content

Commit ccc4a10

Browse files
Author: Xu Xiong (committed)
Commit message: llama30B
1 parent: 5765675 · commit: ccc4a10

9 files changed

Lines changed: 49 additions & 42 deletions

File tree

benchmarks/benchmark_speculative_decoding.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def benchmark_inference(process_idx, args, result_pipe):
7373

7474
drafter = MultiSSMDrafter(
7575
ssm_model_name="JackFram/llama-68m",
76-
num_workers=1,
76+
num_workers=4,
7777
device="cuda"
7878
)
7979
model = AutoDistributedSpeculativeModel.from_pretrained(
@@ -82,12 +82,12 @@ def benchmark_inference(process_idx, args, result_pipe):
8282
tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
8383

8484
batch_size = getattr(args, 'batch_size', 8)
85-
# dataset = load_dataset("tatsu-lab/alpaca")["train"]
86-
# indices = random.sample(range(len(dataset)), batch_size)
87-
# sampled = dataset.select(indices)
88-
# test_prompts = []
89-
# for item in sampled:
90-
# test_prompts.append(item["instruction"])
85+
dataset = load_dataset("tatsu-lab/alpaca")["train"]
86+
indices = random.sample(range(len(dataset)), batch_size)
87+
sampled = dataset.select(indices)
88+
test_prompts = []
89+
for item in sampled:
90+
test_prompts.append(item["instruction"])
9191

9292
# base_prompt = (
9393
# "Quantum mechanics explains the behavior of particles at very small scales. "
@@ -104,11 +104,11 @@ def benchmark_inference(process_idx, args, result_pipe):
104104
# f"{base_prompt} Example {i + 1} discusses large-scale AI systems and scientific discovery."
105105
# for i in range(batch_size)
106106
# ]
107-
prompt_indices = [args.prompt_start_index + i for i in range(batch_size)]
108-
if "{i}" not in args.prompt_template:
109-
raise ValueError("--prompt_template must include '{i}' placeholder")
110-
prompts = [args.prompt_template.format(i=i) for i in prompt_indices]
111-
test_prompts = prompts
107+
# prompt_indices = [args.prompt_start_index + i for i in range(batch_size)]
108+
# if "{i}" not in args.prompt_template:
109+
# raise ValueError("--prompt_template must include '{i}' placeholder")
110+
# prompts = [args.prompt_template.format(i=i) for i in prompt_indices]
111+
# test_prompts = prompts
112112

113113
tokenizer.pad_token = tokenizer.eos_token
114114
input_ids = tokenizer(test_prompts, return_tensors="pt", padding=True).to(device)["input_ids"]

src/bloombee/models/llama/spec_decoding_drafter.py

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,26 @@ def __init__(self, ssm_model_name: str, num_workers: int = 2, device: str = 'cud
1515
from transformers import AutoModelForCausalLM
1616

1717
self.num_workers = num_workers
18-
self.device = torch.device(device)
1918

2019
self.ssms = []
2120
self.streams = []
22-
for _ in range(num_workers):
21+
self.devices = []
22+
23+
for i in range(num_workers):
24+
device_i = torch.device(f'cuda:{i}')
25+
self.devices.append(device_i)
26+
2327
ssm = AutoModelForCausalLM.from_pretrained(
2428
ssm_model_name,
2529
torch_dtype=torch.float16)
26-
ssm = ssm.to(self.device)
30+
ssm = ssm.to(device_i)
2731
ssm.eval()
2832
self.ssms.append(ssm)
29-
self.streams.append(torch.cuda.Stream(device=self.device))
33+
self.streams.append(torch.cuda.Stream(device=device_i))
3034

3135
with torch.no_grad():
32-
dummy = torch.ones(1, 8, dtype=torch.long, device=self.device)
33-
for ssm in self.ssms:
36+
for i, ssm in enumerate(self.ssms):
37+
dummy = torch.ones(1, 8, dtype=torch.long, device=self.devices[i])
3438
ssm(dummy, attention_mask=torch.ones_like(dummy))
3539

3640
def build_trees_parallel(
@@ -49,10 +53,11 @@ def build_trees_parallel(
4953
def worker_fn(worker_idx: int, batch_indices: List[int]):
5054
ssm = self.ssms[worker_idx]
5155
stream = self.streams[worker_idx]
56+
device = self.devices[worker_idx]
5257

5358
with torch.cuda.stream(stream):
5459
results = self._build_trees_batched(
55-
batch_indices, input_ids, seq_lengths, ssm, beam_width, max_depth
60+
batch_indices, input_ids, seq_lengths, ssm, beam_width, max_depth, device
5661
)
5762
for batch_idx, tree in results:
5863
all_results[batch_idx] = tree
@@ -72,8 +77,9 @@ def worker_fn(worker_idx: int, batch_indices: List[int]):
7277
t.join()
7378

7479
# 同步所有 streams
75-
for stream in self.streams:
76-
stream.synchronize()
80+
for i, stream in enumerate(self.streams):
81+
with torch.cuda.device(self.devices[i]):
82+
stream.synchronize()
7783

7884
return all_results
7985

@@ -85,9 +91,10 @@ def _build_trees_batched(
8591
ssm,
8692
beam_width: int,
8793
max_depth: int,
94+
device: torch.device,
8895
) -> List:
8996

90-
pad_token_id = getattr(ssm.config, 'pad_token_id', 0)
97+
pad_token_id = getattr(ssm.config, 'pad_token_id', None) or 0
9198

9299
trees = {}
93100
valid_inputs = {}
@@ -96,7 +103,7 @@ def _build_trees_batched(
96103

97104
for batch_idx in batch_indices:
98105
actual_len = seq_lengths[batch_idx].item()
99-
valid_input_ids = input_ids[batch_idx, :actual_len]
106+
valid_input_ids = input_ids[batch_idx, :actual_len].to(device)
100107
valid_inputs[batch_idx] = valid_input_ids
101108
prefix_lengths[batch_idx] = max(actual_len - 1, 0)
102109

@@ -118,23 +125,23 @@ def _build_trees_batched(
118125
if pf_len > 0:
119126
prefix = valid_inputs[batch_idx][:-1]
120127
else:
121-
prefix = torch.tensor([], dtype=torch.long, device=self.device)
128+
prefix = torch.tensor([], dtype=torch.long, device=device)
122129

123130
pad_len = max_prefix_len - pf_len
124131

125132
if pf_len > 0:
126133
padded_prefixes.append(torch.cat([
127-
torch.full((pad_len,), pad_token_id, dtype=torch.long, device=self.device),
134+
torch.full((pad_len,), pad_token_id, dtype=torch.long, device=device),
128135
prefix
129136
]))
130137
else:
131138
padded_prefixes.append(
132-
torch.full((max_prefix_len,), pad_token_id, dtype=torch.long, device=self.device)
139+
torch.full((max_prefix_len,), pad_token_id, dtype=torch.long, device=device)
133140
)
134141

135142
prefix_masks.append(torch.cat([
136-
torch.zeros(pad_len, dtype=torch.long, device=self.device),
137-
torch.ones(pf_len, dtype=torch.long, device=self.device)
143+
torch.zeros(pad_len, dtype=torch.long, device=device),
144+
torch.ones(pf_len, dtype=torch.long, device=device)
138145
]))
139146

140147
batch_prefixes = torch.stack(padded_prefixes)
@@ -166,7 +173,7 @@ def _build_trees_batched(
166173

167174
for node in tree.get_nodes_at_depth(depth):
168175
path = node.get_path_from_root()
169-
path_tokens = torch.tensor([root_token] + path, dtype=torch.long, device=self.device)
176+
path_tokens = torch.tensor([root_token] + path, dtype=torch.long, device=device)
170177
all_paths.append(path_tokens)
171178
node_mapping.append((batch_idx, node))
172179
cache_indices.append(idx_map[batch_idx])
@@ -180,8 +187,8 @@ def _build_trees_batched(
180187
total_mask_len = max_pf_len + max_path_len
181188

182189
# 预分配
183-
batch_paths = torch.full((num_nodes, max_path_len), pad_token_id, dtype=torch.long, device=self.device)
184-
batch_path_masks = torch.zeros((num_nodes, total_mask_len), dtype=torch.long, device=self.device)
190+
batch_paths = torch.full((num_nodes, max_path_len), pad_token_id, dtype=torch.long, device=device)
191+
batch_path_masks = torch.zeros((num_nodes, total_mask_len), dtype=torch.long, device=device)
185192

186193
# 填充
187194
for i, path in enumerate(all_paths):
@@ -212,7 +219,7 @@ def _build_trees_batched(
212219
all_logits = outputs.logits[:, -1, :]
213220

214221
t_forward += time.perf_counter() - t0
215-
t0 = time.perf_counter()
222+
t0 = time.perf_counter()
216223
# 批量 topk
217224
_, all_top_k_indices = torch.topk(all_logits, k=beam_width, dim=-1)
218225
all_probs = torch.softmax(all_logits, dim=-1)

src/bloombee/models/llama/speculative_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def generate(
3535
logits_processor: Optional[LogitsProcessorList] = None,
3636
stopping_criteria: Optional[StoppingCriteriaList] = None,
3737
streamer: Optional["BaseStreamer"] = None,
38-
beam_width: int = 1,
39-
max_tree_depth: int = 4,
38+
beam_width: int = 2,
39+
max_tree_depth: int = 3,
4040
use_kv_cache: bool = True,
4141
kv_cache_window: int = 2048,
4242
max_new_tokens: int = 128,

src/bloombee/server/backend.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,8 @@ def _flag_to_bool(value) -> bool:
369369
position_ids = self._position_ids_cache[cache_key] + (cache_len + offset)
370370
if self._is_spec_decoding:
371371
rotary_position_ids = self._create_tree_position_ids_with_invalid_cache(
372-
width=1,
373-
depth=4,
372+
width=2,
373+
depth=3,
374374
prefill_length=inference_info.prefill_length - 1,
375375
kv_cache_position_ids=kv_cache_position_ids,
376376
batch_offset=inference_info.batch_offset,

src/bloombee/server/handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,7 @@ async def _cross_stage_push_wrapper(mb_hidden, mb_keep, push_metadata):
795795
push_tensor_bytes = sum(len(t.buffer) for t in next_tensors)
796796

797797
# 模拟网络传输延时
798-
NETWORK_SPEED_BYTES_PER_SEC = 5 * 1024 * 1024 # 10 MB/s
798+
NETWORK_SPEED_BYTES_PER_SEC = 10 * 1024 * 1024 # 10 MB/s
799799
transfer_delay = push_tensor_bytes / NETWORK_SPEED_BYTES_PER_SEC
800800
await asyncio.sleep(transfer_delay)
801801
task = asyncio.create_task(self._push_outputs(request, output_tensors, step_metadata))

src/bloombee/server/server.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,13 @@ def __init__(
324324
self.weight_home = array_1d(self.num_blocks, ValueHolder)
325325
self.path = os.path.join(tempfile.gettempdir(), 'data', 'llama_weights')
326326

327-
hidden_size = 4096
327+
hidden_size = 6656
328328
vocab_size = 32000
329329

330330
# Create configuration
331331
config = PruningConfig(
332332
method=PruningMethod.ADAPTIVE_NEURAL,
333-
neural_threshold=0.9,
333+
neural_threshold=0.6,
334334
simple_threshold=0.1
335335
)
336336

src/bloombee/server/speculative_pruner/adaptive_neural_pruner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(
5555
self.lm_head = MidLMHead(hidden_size=hidden_size, vocab_size=vocab_size).to("cuda")
5656
lm_head_weights_path = hf_hub_download(
5757
repo_id="xxiong59/lm-head-for-speculative-pruning",
58-
filename="lm_head_weights_15.pt",
58+
filename="lm_head_llama30B-15.pt",
5959
cache_dir="./cache"
6060
)
6161
lm_head_checkpoint = torch.load(lm_head_weights_path, map_location="cuda")

src/bloombee/server/speculative_pruner/pruner_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(
4242
self.iteration = 0
4343
self.middle_states = None
4444

45-
train_lm_head_mode = True
45+
train_lm_head_mode = False
4646
self.lm_head_trainer = LM_head_trainer(hidden_size, vocab_size, device, config) if train_lm_head_mode else None
4747

4848
def switch_method(self, method: Union[str, PruningMethod], keep_stats: bool = False):

src/bloombee/utils/lossless_wrapper_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
# 0 = disable lossless wrapper, 1 = enable
8-
ENABLE_LOSSLESS_WRAPPER = 0
8+
ENABLE_LOSSLESS_WRAPPER = 1
99

1010
# "zstd" (recommended), "zlib", "none"
1111
LOSSLESS_ALGO = "zstd"

0 commit comments

Comments (0)