We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 0e1a276 commit 4998915Copy full SHA for 4998915
1 file changed
benchmark/compare_models.py
@@ -86,6 +86,13 @@ def _create_random_text_with_num_tokens(
86
seed: int | None = 42,
87
**tokenize_kwargs,
88
) -> str:
89
+ """
90
+ Return a text of random single-character words that tokenizes to exactly `target_num_tokens` tokens.
91
+ """
92
+
93
+ # Random chars should give more diverse embeddings than repeating. Want to avoid numerical drift flying under the
94
+ # radar in the correctness check. Ideally we have a small corpus of single-token words to pick from, but it's not
95
# clear to me that it would be more than a marginal improvement.
96
def count_tokens(text):
97
return tokenize_fn([text], **tokenize_kwargs)["input_ids"].shape[1]
98
0 commit comments