"""
Verify tokenization script to compare Go-generated tokens with Python sentence_transformers tokens.
"""
import requests
from transformers import AutoTokenizer


# Function to read tokens from the file generated by Go
def read_go_tokens(filename):
    token_ids = []
    tokens = []
    with open(filename, 'r') as f:
        # Skip the four header lines
        for _ in range(4):
            next(f)
        for line in f:
            if line.strip():
                parts = line.strip().split('\t')
                if len(parts) == 2:
                    token_id, token = parts
                    token_ids.append(int(token_id))
                    tokens.append(token)
    return {'tokens': tokens, 'token_ids': token_ids}
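
# For reference, read_go_tokens() expects a file shaped roughly like this:
# four header lines, then one tab-separated "<token_id>\t<token>" pair per
# line. This layout is inferred from the parser above rather than from the
# Go source, and the sample IDs are illustrative BERT-vocabulary values:
#
#     <header line 1>
#     ...
#     <header line 4>
#     101     [CLS]      (tab-separated in the real file)
#     7592    hello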


# Function to fetch and process text from norvig.com/big.txt
def fetch_norvig_text():
    url = "https://norvig.com/big.txt"
    try:
        # A timeout keeps the script from hanging if the host is unreachable
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises an exception for 4XX/5XX responses
        text_content = response.text
        # Trim to a reasonable size for testing
        max_content_length = 20000
        if len(text_content) > max_content_length:
            text_content = text_content[:max_content_length]
        return text_content
    except Exception as e:
        print(f"Error fetching text from norvig.com: {e}")
        return "Failed to download text from norvig.com/big.txt. Using this text as fallback."


# Function to tokenize text with the model's Hugging Face tokenizer
def tokenize_with_sentence_transformers(text, model_id):
    # AutoTokenizer loads the same WordPiece tokenizer that
    # sentence_transformers uses internally for this model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    encoding = tokenizer(text, truncation=True, max_length=512)
    # Get tokens and token IDs
    token_ids = encoding.input_ids
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return {'tokens': tokens, 'token_ids': token_ids}
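
# An equivalent route through sentence_transformers itself, sketched here as
# an untested alternative: SentenceTransformer exposes the model's underlying
# tokenizer via its .tokenizer attribute, so this should yield the same IDs:
#
#     from sentence_transformers import SentenceTransformer
#     model = SentenceTransformer(model_id)
#     encoding = model.tokenizer(text, truncation=True, max_length=512)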


def main():
    # Read the Go-generated token file; bail out if it does not exist yet
    try:
        go_tokens = read_go_tokens("/tmp/norvig_big_tokens.txt")
        print(f"Successfully read {len(go_tokens['tokens'])} tokens from Go-generated file")
    except FileNotFoundError:
        print("Token file not found. Please run the Go test first to generate norvig_big_tokens.txt")
        return

    # Use the same model as in the Go test
    model_id = "sentence-transformers/all-MiniLM-L6-v2"

    # Fetch the text content and tokenize it
    print("Fetching text from norvig.com/big.txt...")
    text_content = fetch_norvig_text()
    print(f"Content length: {len(text_content)} characters")

    print(f"Tokenizing with {model_id}...")
    py_tokenization = tokenize_with_sentence_transformers(text_content, model_id)

    # Compare results
    go_tokens_count = len(go_tokens['tokens'])
    py_tokens_count = len(py_tokenization['tokens'])
    print("\n=== Tokenization Comparison ===")
    print(f"Go tokens count: {go_tokens_count}")
    print(f"Python tokens count: {py_tokens_count}")

    # Count position-by-position matches over the shared prefix
    min_length = min(go_tokens_count, py_tokens_count)
    matches = 0
    mismatches = []
    for i in range(min_length):
        if go_tokens['tokens'][i] == py_tokenization['tokens'][i]:
            matches += 1
        else:
            mismatches.append((i, go_tokens['tokens'][i], py_tokenization['tokens'][i]))

    match_percentage = (matches / min_length) * 100 if min_length > 0 else 0
    print(f"\nToken match rate: {match_percentage:.2f}% ({matches}/{min_length})")

    # Show a few mismatches for inspection
    if mismatches:
        print("\nSample mismatches (first 5):")
        print("INDEX\tGO_TOKEN\tPYTHON_TOKEN")
        for i, go_token, py_token in mismatches[:5]:
            print(f"{i}\t{go_token}\t{py_token}")


if __name__ == "__main__":
    main()