Commit 5302f2a

Fix duplicate BOS issue; add_bos_token defaults to None (EleutherAI#3347)
* fix duplicate `bos` token when `context==""`
* add docs
* check tokenizer.add_bos_token for bos control
* fix params
* skip duplicate bos
* fix bos token handling
* fix bos token handling
* fix box_token handling
* fixup! default add_special_tokens as unset
* `self.tokenizer.bos_token` can be None
* fix type
* Update lm_eval/models/huggingface.py

  Co-authored-by: Cyrus Leung <[email protected]>

* refactor bos token handling logic
* add tests for bos
* fix tests

---------

Co-authored-by: Cyrus Leung <[email protected]>
1 parent 90950a8 commit 5302f2a

File tree

5 files changed: +772 -72 lines changed

lm_eval/api/model.py

Lines changed: 72 additions & 12 deletions
@@ -324,10 +324,11 @@ class TemplateLM(LM):
     """
 
     tokenizer = None
+    backend = "causal"
 
     @property
     @abc.abstractmethod
-    def eot_token_id(self):
+    def eot_token_id(self) -> int:
         pass
 
     @property
@@ -336,9 +337,13 @@ def prefix_token_id(self):
         return self.eot_token_id
 
     @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> list[int]:
+    def tok_encode(
+        self, string: str, add_special_tokens: Optional[bool] = None, **kwargs
+    ) -> list[int]:
         """
         Tokenize a string using the model's tokenizer and return a list of token IDs.
+        NOTE: This method is expected to handle strings which already contain the BOS token (when add_special_tokens=None).
+        Otherwise, will use add_special_tokens if specified.
         """
         pass
 
@@ -351,38 +356,93 @@ def _loglikelihood_tokens(
     def _encode_pair(
         self, context: str, continuation: str
     ) -> tuple[list[int], list[int]]:
-        import transformers
+        """
+        Encode a context-continuation pair into separate token ID lists.
+
+        This method handles the tokenization of context and continuation strings while
+        preserving proper boundary handling. Trailing spaces in the context are moved
+        to the beginning of the continuation to ensure correct tokenization at the
+        word boundary.
+
+        For Seq2Seq models (encoder-decoder), context and continuation are encoded
+        separately. For other model types (decoder-only), the full sequence is encoded
+        together to ensure proper tokenization, then split at the context boundary.
+
+        :param context: str
+            The context string. Can be empty (will be handled by the caller).
+        :param continuation: str
+            The continuation string to be scored.
+
+        :return: tuple[list[int], list[int]]
+            A tuple of (context_enc, continuation_enc) where:
+            - context_enc: Token IDs for the context
+            - continuation_enc: Token IDs for the continuation
+
+        Note:
+            This method does NOT handle empty context. The caller should
+            handle empty context (see loglikelihood method).
+        """
+        assert context, "Context cannot be empty!"
 
         n_spaces = len(context) - len(context.rstrip())
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
             context = context[:-n_spaces]
 
-        model_class = getattr(self, "AUTO_MODEL_CLASS", None)
-
-        if model_class == transformers.AutoModelForSeq2SeqLM:
-            context_enc = self.tok_encode(context)
-            continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
-        else:
+        if self.backend == "causal":
             whole_enc = self.tok_encode(context + continuation)
             context_enc = self.tok_encode(context)
 
             context_enc_len = len(context_enc)
             continuation_enc = whole_enc[context_enc_len:]
+        else:
+            # for SEQ2SEQ case we need to encode separately
+            context_enc = self.tok_encode(context)
+            continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
 
         return context_enc, continuation_enc
 
     def loglikelihood(
         self, requests: list["Instance"], disable_tqdm: bool = False
     ) -> list[tuple[float, bool]]:
+        """
+        Compute log-likelihood of generating continuations from contexts.
+
+        This is the concrete implementation for TemplateLM and its subclasses.
+        It tokenizes context-continuation pairs and delegates scoring to
+        _loglikelihood_tokens.
+
+        **IMPORTANT**: This method is expected to handle empty context strings.
+        When context is empty (""), it uses the model's prefix_token_id (typically
+        BOS or EOS token) as context. If the continuation already starts with the
+        prefix token, it reuses that token as context instead of duplicating it.
+
+        :param requests: list[Instance]
+            List of Instance objects with property `args` returning (context, continuation) tuples.
+        :param disable_tqdm: bool
+            Whether to disable the progress bar in _loglikelihood_tokens.
+
+        :return: list[tuple[float, bool]]
+            List of (log_prob, is_greedy) tuples for each request.
+
+        Implementation details:
+        - Empty context: Uses prefix_token_id (BOS/EOS) as context
+        - Non-empty context: Uses _encode_pair for proper tokenization
+        - Avoids token duplication when continuation starts with prefix_token_id
+        """
         new_reqs = []
         for context, continuation in [req.args for req in requests]:
             if context == "":
-                # BOS or EOS as context
+                continuation_enc = self.tok_encode(
+                    continuation, add_special_tokens=False
+                )
+                # BOS or EOS as context: handle when context is empty -> (context + continuation) -> (BOS + continuation
                 context_enc, continuation_enc = (
-                    [self.prefix_token_id],
-                    self.tok_encode(continuation),
+                    ([self.prefix_token_id], continuation_enc)
+                    if self.prefix_token_id != continuation_enc[0]
+                    else (continuation_enc[:1], continuation_enc[1:])
                 )
+                # BOS or EOS as context
             else:
                 context_enc, continuation_enc = self._encode_pair(context, continuation)
 
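To make the empty-context behaviour in the hunk above concrete, here is a minimal standalone sketch of the same dedup rule, outside the diff. The function name split_empty_context and the example BOS id 1 are illustrative only; in the actual class this logic lives inline in loglikelihood and uses self.prefix_token_id.

def split_empty_context(
    continuation_enc: list[int], prefix_token_id: int
) -> tuple[list[int], list[int]]:
    # Empty context: use the prefix token (BOS/EOS) as context, but if the
    # tokenizer already prepended it to the continuation, reuse that token
    # instead of scoring a duplicated BOS.
    if continuation_enc and continuation_enc[0] == prefix_token_id:
        return continuation_enc[:1], continuation_enc[1:]
    return [prefix_token_id], continuation_enc

# With a BOS id of 1:
#   split_empty_context([1, 52, 53], 1) -> ([1], [52, 53])   (BOS already present, not duplicated)
#   split_empty_context([52, 53], 1)    -> ([1], [52, 53])   (BOS absent, prefix token supplied)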
lm_eval/models/huggingface.py

Lines changed: 23 additions & 25 deletions
@@ -32,10 +32,12 @@
 from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
     Collator,
+    _add_special_kwargs,
     clear_torch_cache,
     configure_pad_token,
     get_dtype,
     handle_stop_sequences,
+    has_bos_prefix,
     pad_and_concat,
     postprocess_generated_text,
     stop_sequences_criteria,
@@ -84,7 +86,7 @@ def __init__(
         max_batch_size: int | None = 64,
         trust_remote_code: bool | None = False,
         use_fast_tokenizer: bool | None = True,
-        add_bos_token: bool | None = False,
+        add_bos_token: bool | None = None,
         prefix_token_id: int | None = None,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
@@ -258,11 +260,6 @@ def __init__(
         )
 
         self.add_bos_token = add_bos_token
-        if "gemma" in getattr(self.config, "model_type", ""):
-            self.add_bos_token = True
-            eval_logger.info(
-                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
-            )
 
         self._max_length = max_length
         self.pretrained = pretrained
@@ -744,7 +741,7 @@ def _create_tokenizer(
         trust_remote_code: bool | None = False,
         use_fast_tokenizer: bool | None = True,
         gguf_file: str | None = None,
-        add_bos_token: bool | None = False,
+        add_bos_token: bool | None = None,
         subfolder: str | None = "",
     ) -> None:
         """Helper method during initialization.
@@ -763,8 +760,8 @@ def _create_tokenizer(
         else:
             kwargs["use_fast"] = use_fast_tokenizer
 
-        if add_bos_token:
-            kwargs["add_bos_token"] = True
+        if add_bos_token is not None:
+            kwargs["add_bos_token"] = add_bos_token
 
         if subfolder:
             kwargs["subfolder"] = subfolder
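The net effect of the two hunks above is a three-state add_bos_token: None means "defer to the tokenizer's own default" (presumably why the hard-coded Gemma override earlier in the diff could be dropped, since Gemma-style tokenizers already default to adding BOS), while an explicit True or False is forwarded to the tokenizer. A minimal sketch of that forwarding, assuming a tokenizer that exposes an add_bos_token attribute (e.g. Llama/Gemma-style tokenizers); load_tokenizer is an illustrative helper, not the class's _create_tokenizer.

from transformers import AutoTokenizer


def load_tokenizer(pretrained: str, add_bos_token: bool | None = None):
    kwargs = {}
    # None -> leave the tokenizer's own default untouched
    # True/False -> explicit user override, passed through to the tokenizer
    if add_bos_token is not None:
        kwargs["add_bos_token"] = add_bos_token
    return AutoTokenizer.from_pretrained(pretrained, **kwargs)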
@@ -858,24 +855,20 @@ def forward_batch(batch_size: int):
     def tok_encode(
         self,
         string: str,
-        left_truncate_len: int | None = None,
         add_special_tokens: bool | None = None,
+        left_truncate_len: int | None = None,
+        **kwargs,
     ) -> list[int]:
-        """ """
         # default for None - empty dict, use predefined tokenizer param
         # used for all models except for CausalLM or predefined value
-        special_tokens_kwargs = {}
-
-        # by default for CausalLM - false or self.add_bos_token is set
-        if add_special_tokens is None:
-            if self.backend == "causal":
-                special_tokens_kwargs = {
-                    "add_special_tokens": False or self.add_bos_token
-                }
-        # otherwise the method explicitly defines the value
-        else:
-            special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
-
+        special_tokens_kwargs = _add_special_kwargs(
+            add_special_tokens, self.add_bos_token
+        )
+        # set add_special_tokens=False if the string already starts with BOS token.
+        if add_special_tokens is None and has_bos_prefix(
+            string, self.tokenizer.decode(self.prefix_token_id)
+        ):
+            special_tokens_kwargs["add_special_tokens"] = False
         encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
 
         # left-truncate the encoded context to be at most `left_truncate_len` tokens long
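Read together with the helpers added in lm_eval/models/utils.py further down, the rewritten tok_encode resolves special-token handling in this order: an explicit add_special_tokens argument always wins; otherwise, if the string already begins with the BOS text, automatic insertion is switched off; otherwise a user-set add_bos_token applies; and if nothing is set, the tokenizer's own default is used. A small sketch of that precedence, assuming the two helpers are importable; resolve_special_tokens_kwargs and its bos_text parameter are illustrative stand-ins for the inline logic and self.tokenizer.decode(self.prefix_token_id).

from lm_eval.models.utils import _add_special_kwargs, has_bos_prefix


def resolve_special_tokens_kwargs(
    string: str,
    add_special_tokens: bool | None,
    add_bos_token: bool | None,
    bos_text: str | None,
) -> dict:
    # explicit caller choice (or a configured add_bos_token) first
    kwargs = _add_special_kwargs(add_special_tokens, add_bos_token)
    # never let the tokenizer prepend a second BOS to a string that already has one
    if add_special_tokens is None and has_bos_prefix(string, bos_text):
        kwargs["add_special_tokens"] = False
    return kwargs

# e.g. a prompt that already starts with "<s>":
#   resolve_special_tokens_kwargs("<s>Hello", None, None, "<s>") -> {"add_special_tokens": False}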
@@ -897,7 +890,12 @@ def tok_batch_encode(
 
         add_special_tokens = {}
         if self.backend == "causal":
-            add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
+            if has_bos_prefix(strings[0], getattr(self.tokenizer, "bos_token", None)):
+                add_special_tokens = {"add_special_tokens": False}
+            elif self.add_bos_token is not None:
+                add_special_tokens = {"add_special_tokens": self.add_bos_token}
+            else:
+                add_special_tokens = {}
 
         encoding = self.tokenizer(
             strings,
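Note that the batched path inspects only strings[0] for a BOS prefix, on the assumption that all prompts in a batch share the same formatting: if the first prompt already carries BOS, automatic insertion is disabled for the whole batch; otherwise an explicit add_bos_token setting (or, failing that, the tokenizer default) applies, mirroring tok_encode above.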
@@ -971,7 +969,7 @@ def _model_generate(
         context,
         max_length: int,
         stop: list[str],
-        **generation_kwargs: dict[str, Any],
+        **generation_kwargs,
     ) -> torch.Tensor:
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:

lm_eval/models/utils.py

Lines changed: 18 additions & 1 deletion
@@ -150,7 +150,7 @@ def get_original(self, grouped_dict):
 
 def pad_and_concat(
     max_length: int,
-    tensors: List[torch.Tensor],
+    tensors: list[torch.Tensor],
     padding_side: Literal["right", "left"] = "right",
 ):
     """
@@ -881,3 +881,20 @@ def postprocess_generated_text(
         generation = generation.split(think_end_token)[-1].lstrip()
 
     return generation
+
+
+def has_bos_prefix(sequence: str, bos_str: str | Iterable[str] | None = None):
+    if bos_str is None:
+        return False
+    elif isinstance(bos_str, str):
+        return sequence.startswith(bos_str)
+    else:
+        return any(sequence.startswith(x) for x in bos_str)
+
+
+def _add_special_kwargs(add_special_tokens: bool | None, add_bos: bool | None = None):
+    if add_special_tokens is not None:
+        return {"add_special_tokens": add_special_tokens}
+    if add_bos is not None:
+        return {"add_special_tokens": add_bos}
+    return {}
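A few illustrative calls for the two helpers above (the BOS strings "<s>" and "<bos>" are examples, not constants from the library; the helpers are assumed to be in scope, e.g. imported from lm_eval.models.utils):

has_bos_prefix("<s>Once upon a time", "<s>")   # True  -> caller should suppress automatic BOS insertion
has_bos_prefix("Once upon a time", "<s>")      # False
has_bos_prefix("<bos>Hi", ["<s>", "<bos>"])    # True  -> any candidate in an iterable of BOS strings counts
_add_special_kwargs(None, None)                # {}    -> defer to the tokenizer's default
_add_special_kwargs(None, True)                # {"add_special_tokens": True}
_add_special_kwargs(False, True)               # {"add_special_tokens": False}  (explicit argument wins)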
