
Tokenizer

Tokenizer interface.

Tokenizers are used to split text into tokens. Typically, this is used in a Metric that requires word-level statistics.

Source code in flexeval/core/tokenizer/base.py
class Tokenizer(ABC):
    """
    Tokenizer interface.

    Tokenizers are used to split text into tokens.
    Typically, this is used in `Metric` that requires word-level statistics.
    """

    @abstractmethod
    def tokenize(self, text: str) -> list[str]:
        raise NotImplementedError

tokenize abstractmethod

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/base.py
@abstractmethod
def tokenize(self, text: str) -> list[str]:
    raise NotImplementedError
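
Implementations subclass Tokenizer and override tokenize. A minimal sketch of a custom implementation, assuming Tokenizer is importable from the module path shown above (CommaTokenizer is a hypothetical name used only for illustration):

from flexeval.core.tokenizer.base import Tokenizer

class CommaTokenizer(Tokenizer):
    """Hypothetical tokenizer that splits text on commas."""

    def tokenize(self, text: str) -> list[str]:
        # Split on commas and drop empty or whitespace-only fragments.
        return [token.strip() for token in text.split(",") if token.strip()]

CommaTokenizer().tokenize("alpha, beta, gamma")  # ['alpha', 'beta', 'gamma']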

MecabTokenizer

MeCab tokenizer for Japanese text.

Source code in flexeval/core/tokenizer/mecab.py
class MecabTokenizer(Tokenizer):
    """
    MeCab tokenizer for Japanese text.
    """

    def __init__(self) -> None:
        import fugashi

        self._tagger = fugashi.Tagger("-Owakati")

    def tokenize(self, text: str) -> list[str]:
        tokens = self._tagger(text)
        return [token.surface for token in tokens]

__init__

__init__() -> None
Source code in flexeval/core/tokenizer/mecab.py
def __init__(self) -> None:
    import fugashi

    self._tagger = fugashi.Tagger("-Owakati")

tokenize

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/mecab.py
def tokenize(self, text: str) -> list[str]:
    tokens = self._tagger(text)
    return [token.surface for token in tokens]
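
A usage sketch; this assumes the optional fugashi dependency (along with a MeCab dictionary such as unidic-lite) is installed:

from flexeval.core.tokenizer.mecab import MecabTokenizer

tokenizer = MecabTokenizer()
tokenizer.tokenize("吾輩は猫である")
# e.g. ['吾輩', 'は', '猫', 'で', 'ある']; the exact segmentation depends on the installed dictionary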

SacreBleuTokenizer

A tokenizer that uses the sacrebleu library.

Parameters:

  • name (str) –

    The name of the tokenizer.

Source code in flexeval/core/tokenizer/sacrebleu_tokenizer.py
class SacreBleuTokenizer(Tokenizer):
    """
    A tokenizer that uses the sacrebleu library.

    Args:
        name: The name of the tokenizer.
    """

    def __init__(self, name: str) -> None:
        self.tokenizer = _get_tokenizer(name)()

    def tokenize(self, text: str) -> list[str]:
        return self.tokenizer(text).split(" ")

tokenizer instance-attribute

tokenizer = _get_tokenizer(name)()

__init__

__init__(name: str) -> None
Source code in flexeval/core/tokenizer/sacrebleu_tokenizer.py
def __init__(self, name: str) -> None:
    self.tokenizer = _get_tokenizer(name)()

tokenize

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/sacrebleu_tokenizer.py
def tokenize(self, text: str) -> list[str]:
    return self.tokenizer(text).split(" ")
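
A usage sketch; "13a" is the tokenizer that sacrebleu applies by default when computing BLEU, so it should be a valid name here:

from flexeval.core.tokenizer.sacrebleu_tokenizer import SacreBleuTokenizer

tokenizer = SacreBleuTokenizer(name="13a")
tokenizer.tokenize("Hello, world!")  # ['Hello', ',', 'world', '!']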

TiktokenTokenizer
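
A tokenizer that uses the tiktoken library.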

Source code in flexeval/core/tokenizer/tiktoken_tokenizer.py
class TiktokenTokenizer(Tokenizer):
    def __init__(self, tokenizer_name: str | None = None, model_name: str | None = None) -> None:
        # Raise an error if both tokenizer_name and model_name are provided.
        if tokenizer_name is not None and model_name is not None:
            msg = "Only one of tokenizer_name or model_name can be provided."
            raise ValueError(msg)

        if tokenizer_name:
            self.encoding = tiktoken.get_encoding(tokenizer_name)
        elif model_name:
            self.encoding = tiktoken.encoding_for_model(model_name)
        else:
            msg = "Either tokenizer_name or model_name must be provided."
            raise ValueError(msg)

    def tokenize(self, text: str) -> list[str]:
        token_ids = self.encoding.encode(text)
        return [self.encoding.decode([token_id]) for token_id in token_ids]

encoding instance-attribute

encoding = get_encoding(tokenizer_name)

__init__

__init__(
    tokenizer_name: str | None = None,
    model_name: str | None = None,
) -> None
Source code in flexeval/core/tokenizer/tiktoken_tokenizer.py
def __init__(self, tokenizer_name: str | None = None, model_name: str | None = None) -> None:
    # Raise an error if both tokenizer_name and model_name are provided.
    if tokenizer_name is not None and model_name is not None:
        msg = "Only one of tokenizer_name or model_name can be provided."
        raise ValueError(msg)

    if tokenizer_name:
        self.encoding = tiktoken.get_encoding(tokenizer_name)
    elif model_name:
        self.encoding = tiktoken.encoding_for_model(model_name)
    else:
        msg = "Either tokenizer_name or model_name must be provided."
        raise ValueError(msg)

tokenize

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/tiktoken_tokenizer.py
def tokenize(self, text: str) -> list[str]:
    token_ids = self.encoding.encode(text)
    return [self.encoding.decode([token_id]) for token_id in token_ids]
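
A usage sketch; "cl100k_base" is a tiktoken encoding name and "gpt-4" a model name that tiktoken can resolve to an encoding:

from flexeval.core.tokenizer.tiktoken_tokenizer import TiktokenTokenizer

tokenizer = TiktokenTokenizer(tokenizer_name="cl100k_base")
tokenizer.tokenize("Hello, world!")  # e.g. ['Hello', ',', ' world', '!']

# Equivalently, resolve the encoding from a model name:
tokenizer = TiktokenTokenizer(model_name="gpt-4")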

TransformersTokenizer
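
A tokenizer that wraps a Hugging Face transformers tokenizer loaded via AutoTokenizer.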

Source code in flexeval/core/tokenizer/transformers_tokenizer.py
class TransformersTokenizer(Tokenizer):
    def __init__(
        self,
        path: str,
        init_kwargs: dict[str, Any] | None = None,
        tokenize_kwargs: dict[str, Any] | None = None,
    ) -> None:
        init_kwargs = init_kwargs or {}
        self.tokenizer = AutoTokenizer.from_pretrained(path, **init_kwargs)
        self.tokenize_kwargs = tokenize_kwargs or {}

    def tokenize(self, text: str) -> list[str]:
        return self.tokenizer.tokenize(text, **self.tokenize_kwargs)

tokenizer instance-attribute

tokenizer = from_pretrained(path, **init_kwargs)

tokenize_kwargs instance-attribute

tokenize_kwargs = tokenize_kwargs or {}

__init__

__init__(
    path: str,
    init_kwargs: dict[str, Any] | None = None,
    tokenize_kwargs: dict[str, Any] | None = None,
) -> None
Source code in flexeval/core/tokenizer/transformers_tokenizer.py
def __init__(
    self,
    path: str,
    init_kwargs: dict[str, Any] | None = None,
    tokenize_kwargs: dict[str, Any] | None = None,
) -> None:
    init_kwargs = init_kwargs or {}
    self.tokenizer = AutoTokenizer.from_pretrained(path, **init_kwargs)
    self.tokenize_kwargs = tokenize_kwargs or {}

tokenize

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/transformers_tokenizer.py
def tokenize(self, text: str) -> list[str]:
    return self.tokenizer.tokenize(text, **self.tokenize_kwargs)
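
A usage sketch; "bert-base-uncased" is a public model id on the Hugging Face Hub:

from flexeval.core.tokenizer.transformers_tokenizer import TransformersTokenizer

tokenizer = TransformersTokenizer(path="bert-base-uncased")
tokenizer.tokenize("Tokenization example")
# e.g. ['token', '##ization', 'example'] (WordPiece subwords)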

WhitespaceTokenizer

A simple whitespace tokenizer.

Source code in flexeval/core/tokenizer/whitespace.py
class WhitespaceTokenizer(Tokenizer):
    """
    A simple whitespace tokenizer.
    """

    def tokenize(self, text: str) -> list[str]:
        return text.split()

tokenize

tokenize(text: str) -> list[str]
Source code in flexeval/core/tokenizer/whitespace.py
def tokenize(self, text: str) -> list[str]:
    return text.split()
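
A usage sketch; str.split() with no arguments splits on runs of any whitespace and drops empty strings:

from flexeval.core.tokenizer.whitespace import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
tokenizer.tokenize("the  quick\tbrown fox")  # ['the', 'quick', 'brown', 'fox']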