"""
Verify tokenization script to compare Go-generated tokens with Python sentence_transformers tokens.
"""
import requests
from transformers import AutoTokenizer


# Function to read tokens from the file generated by Go
def read_go_tokens(filename):
    token_ids = []
    tokens = []
    with open(filename, 'r') as f:
        # Skip the four header lines
        for _ in range(4):
            next(f)
        for line in f:
            if line.strip():
                parts = line.strip().split('\t')
                if len(parts) == 2:
                    token_id, token = parts
                    token_ids.append(int(token_id))
                    tokens.append(token)
    return {'tokens': tokens, 'token_ids': token_ids}
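
# For reference, read_go_tokens() expects a file shaped roughly like this:
# four header lines, then one tab-separated "<token_id>\t<token>" pair per
# line. This layout is inferred from the parser above rather than from the
# Go source, and the sample IDs are illustrative BERT-vocabulary values:
#
#     <header line 1>
#     ...
#     <header line 4>
#     101     [CLS]      (tab-separated in the real file)
#     7592    hello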


# Function to fetch and process text from norvig.com/big.txt
def fetch_norvig_text():
    url = "https://norvig.com/big.txt"
    try:
        # A timeout keeps the script from hanging if the host is unreachable
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises an exception for 4XX/5XX responses
        text_content = response.text
        # Trim to a reasonable size for testing
        max_content_length = 20000
        if len(text_content) > max_content_length:
            text_content = text_content[:max_content_length]
        return text_content
    except Exception as e:
        print(f"Error fetching text from norvig.com: {e}")
        return "Failed to download text from norvig.com/big.txt. Using this text as fallback."


# Function to tokenize text with the model's Hugging Face tokenizer
def tokenize_with_sentence_transformers(text, model_id):
    # AutoTokenizer loads the same WordPiece tokenizer that
    # sentence_transformers uses internally for this model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    encoding = tokenizer(text, truncation=True, max_length=512)
    # Get tokens and token IDs
    token_ids = encoding.input_ids
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return {'tokens': tokens, 'token_ids': token_ids}
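
# An equivalent route through sentence_transformers itself, sketched here as
# an untested alternative: SentenceTransformer exposes the model's underlying
# tokenizer via its .tokenizer attribute, so this should yield the same IDs:
#
#     from sentence_transformers import SentenceTransformer
#     model = SentenceTransformer(model_id)
#     encoding = model.tokenizer(text, truncation=True, max_length=512)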


def main():
    # Read the Go-generated token file; bail out if it does not exist yet
    try:
        go_tokens = read_go_tokens("/tmp/norvig_big_tokens.txt")
        print(f"Successfully read {len(go_tokens['tokens'])} tokens from Go-generated file")
    except FileNotFoundError:
        print("Token file not found. Please run the Go test first to generate norvig_big_tokens.txt")
        return

    # Use the same model as in the Go test
    model_id = "sentence-transformers/all-MiniLM-L6-v2"

    # Fetch the text content and tokenize it
    print("Fetching text from norvig.com/big.txt...")
    text_content = fetch_norvig_text()
    print(f"Content length: {len(text_content)} characters")

    print(f"Tokenizing with {model_id}...")
    py_tokenization = tokenize_with_sentence_transformers(text_content, model_id)

    # Compare results
    go_tokens_count = len(go_tokens['tokens'])
    py_tokens_count = len(py_tokenization['tokens'])
    print("\n=== Tokenization Comparison ===")
    print(f"Go tokens count: {go_tokens_count}")
    print(f"Python tokens count: {py_tokens_count}")

    # Count position-by-position matches over the shared prefix
    min_length = min(go_tokens_count, py_tokens_count)
    matches = 0
    mismatches = []
    for i in range(min_length):
        if go_tokens['tokens'][i] == py_tokenization['tokens'][i]:
            matches += 1
        else:
            mismatches.append((i, go_tokens['tokens'][i], py_tokenization['tokens'][i]))

    match_percentage = (matches / min_length) * 100 if min_length > 0 else 0
    print(f"\nToken match rate: {match_percentage:.2f}% ({matches}/{min_length})")

    # Show a few mismatches for inspection
    if mismatches:
        print("\nSample mismatches (first 5):")
        print("INDEX\tGO_TOKEN\tPYTHON_TOKEN")
        for i, go_token, py_token in mismatches[:5]:
            print(f"{i}\t{go_token}\t{py_token}")


if __name__ == "__main__":
    main()