Skip to content

Commit 4998915

Browse files
committed
y random
1 parent 0e1a276 commit 4998915

1 file changed

Lines changed: 7 additions & 0 deletions

File tree

benchmark/compare_models.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ def _create_random_text_with_num_tokens(
8686
seed: int | None = 42,
8787
**tokenize_kwargs,
8888
) -> str:
89+
"""
90+
Return a text of random single-character words that tokenizes to exactly `target_num_tokens` tokens.
91+
"""
92+
93+
# Random chars should give more diverse embeddings than repeating. Want to avoid numerical drift flying under the
94+
# radar in the correctness check. Ideally we have a small corpus of single-token words to pick from, but it's not
95+
# clear to me that it would be more than a marginal improvement.
8996
def count_tokens(text):
9097
return tokenize_fn([text], **tokenize_kwargs)["input_ids"].shape[1]
9198

0 commit comments

Comments
 (0)