training/DeepSpeed-Domino/megatron/core/datasets/megatron_tokenizer.py

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import json
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Any
import numpy
class MegatronTokenizer(ABC):
    """Abstract base class shared by all tokenizers.

    No config object or per-class bookkeeping says which constructor inputs
    uniquely identify a tokenizer, so every positional path and every keyword
    option is recorded as part of the identity.

    Args:
        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes

        tokenizer_options (Dict[str, Any]): All tokenizer options
    """

    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
        # Seed the mapping with the two fixed entries, then append the options
        # in the order they were passed; option values are stored as strings.
        self.unique_identifiers = OrderedDict(
            [
                ("class", type(self).__name__),
                ("tokenizer_path", list(tokenizer_paths)),
            ]
        )
        self.unique_identifiers.update(
            (name, str(value)) for name, value in tokenizer_options.items()
        )

        # JSON rendering of the identifiers, kept alongside the mapping itself.
        self.unique_description = json.dumps(self.unique_identifiers, indent=4)

        super().__init__()

    @abstractmethod
    def tokenize(self, text: str) -> numpy.ndarray:
        """Turn a piece of text into embedding ids.

        Args:
            text (str): The text to convert

        Returns:
            numpy.ndarray: The converted embedding ids
        """

    def detokenize(self, ids: numpy.ndarray) -> str:
        """Turn embedding ids back into text.

        Args:
            ids (numpy.ndarray): The ids to convert

        Returns:
            str: The converted text

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) method
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no method 'detokenize'")

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Turn embedding ids into offsets within the text.

        Args:
            ids (list[int]): The ids to convert

            text (str): The text to convert

        Returns:
            list[int]: The converted offsets

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) method
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no method 'offsets'")

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token"""

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token"""

    @property
    @abstractmethod
    def vocab_size(self):
        """The vocabulary size"""

    @property
    def cls(self):
        """The CLS token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'cls'")

    @property
    def sep(self):
        """The SEP token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'sep'")

    @property
    def pad(self):
        """The PAD token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'pad'")

    @property
    def eod(self):
        """The EOD token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'eod'")

    @property
    def bos(self):
        """The BOS token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'bos'")

    @property
    def eos(self):
        """The EOS token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'eos'")

    @property
    def mask(self):
        """The MASK token id.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                optional (non-abstract) attribute
        """
        owner = type(self).__name__
        raise NotImplementedError(f"{owner} has no attribute 'mask'")