-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
Copy pathmegatron_tokenizer.py
116 lines (116 loc) · 4.17 KB
/
megatron_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import json
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Any
import numpy
class MegatronTokenizer(ABC):
    """Abstract base class for tokenizers.

    Absent a config or class-specific tracking of which objects are uniquely
    identifying, all positional paths and all keyword options passed to the
    constructor are recorded as unique identifiers.

    Subclasses must implement ``tokenize`` and the ``vocab``, ``inv_vocab`` and
    ``vocab_size`` properties. ``detokenize``, ``offsets`` and the special-token
    properties (``cls``, ``sep``, ``pad``, ``eod``, ``bos``, ``eos``, ``mask``)
    are optional: the defaults here raise ``NotImplementedError``.

    Args:
        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes
        tokenizer_options (Dict[str, Any]): All tokenizer options

    Attributes:
        unique_identifiers (OrderedDict): The concrete class name under
            ``"class"``, the list of paths under ``"tokenizer_path"``, then one
            entry per keyword option holding ``str(value)``.
        unique_description (str): ``unique_identifiers`` serialized as JSON
            with an indent of 4.
    """

    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
        identifiers = OrderedDict()
        identifiers["class"] = type(self).__name__
        identifiers["tokenizer_path"] = list(tokenizer_paths)
        # NOTE: options are written after the two reserved keys, so an option
        # literally named "class" or "tokenizer_path" replaces that entry.
        for option, value in tokenizer_options.items():
            identifiers[option] = str(value)
        self.unique_identifiers = identifiers

        # Options are already stringified above; `default=str` applies the same
        # treatment to any path entry json cannot encode natively (for example
        # a pathlib path) instead of failing with TypeError. Values json could
        # already encode are serialized exactly as before.
        self.unique_description = json.dumps(identifiers, indent=4, default=str)

        super().__init__()

    @abstractmethod
    def tokenize(self, text: str) -> numpy.ndarray:
        """Convert text to embedding ids

        Args:
            text (str): The text to convert

        Returns:
            numpy.ndarray: The converted embedding ids
        """

    def detokenize(self, ids: numpy.ndarray) -> str:
        """Convert embedding ids to text

        Args:
            ids (numpy.ndarray): The ids to convert

        Returns:
            str: The converted text

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        raise NotImplementedError(f"{type(self).__name__} has no method 'detokenize'")

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Convert embedding ids to text offsets

        Args:
            ids (list[int]): The ids to convert
            text (str): The text to convert

        Returns:
            list[int]: The converted offsets

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        raise NotImplementedError(f"{type(self).__name__} has no method 'offsets'")

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token"""

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token"""

    @property
    @abstractmethod
    def vocab_size(self):
        """The vocabulary size"""

    @property
    def cls(self):
        """The CLS token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'cls'")

    @property
    def sep(self):
        """The SEP token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'sep'")

    @property
    def pad(self):
        """The PAD token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'pad'")

    @property
    def eod(self):
        """The EOD token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eod'")

    @property
    def bos(self):
        """The BOS token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'bos'")

    @property
    def eos(self):
        """The EOS token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eos'")

    @property
    def mask(self):
        """The MASK token id

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'mask'")