Skip to content

TextDataset

TextDataset

This class represents a dataset of text examples.

Source code in flexeval/core/text_dataset/base.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class TextDataset(Sequence[TextInstance], ABC):
    """
    Abstract base class for an indexable collection of `TextInstance` objects.

    Concrete datasets implement `__len__` and `__getitem__`.
    """

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of instances in the dataset."""

    @abstractmethod
    def __getitem__(self, item: int) -> TextInstance:
        """Return the instance stored at position `item`."""

    def __repr__(self) -> str:
        """Return `<ClassName>(num_instances=<len(self)>)`."""
        # Built from the runtime class so that subclasses report their own name.
        class_name = self.__class__.__name__
        instance_count = len(self)
        return f"{class_name}(num_instances={instance_count})"

__len__ abstractmethod

__len__() -> int
Source code in flexeval/core/text_dataset/base.py
19
20
21
@abstractmethod
def __len__(self) -> int:
    """Return the number of instances; declared abstract, with no implementation here."""

__getitem__ abstractmethod

__getitem__(item: int) -> TextInstance
Source code in flexeval/core/text_dataset/base.py
23
24
25
@abstractmethod
def __getitem__(self, item: int) -> TextInstance:
    """Return the instance at position `item`; declared abstract, with no implementation here."""

__repr__

__repr__() -> str
Source code in flexeval/core/text_dataset/base.py
27
28
def __repr__(self) -> str:
    """Return `<ClassName>(num_instances=<len(self)>)` for this object."""
    # Built from the runtime class so that subclasses report their own name.
    class_name = self.__class__.__name__
    instance_count = len(self)
    return f"{class_name}(num_instances={instance_count})"

TextInstance dataclass

Source code in flexeval/core/text_dataset/base.py
 8
 9
10
11
@dataclass
class TextInstance:
    """A single text example: a required `text` string plus an optional `prefix` string."""

    # The text of the example.
    text: str
    # Prefix string accompanying the text; defaults to the empty string.
    # NOTE(review): how consumers combine `prefix` with `text` is not visible here — confirm.
    prefix: str = ""

text instance-attribute

text: str

prefix class-attribute instance-attribute

prefix: str = ''

__init__

__init__(text: str, prefix: str = '') -> None

HFTextDataset

This class represents a dataset of text examples loaded from Hugging Face datasets.

Parameters:

  • path (str) –

    The name of the dataset to load.

  • split (str) –

    The split of the dataset to load.

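  • text_template (str) –

    A Jinja2 template for the text.

  • prefix_template (str | None, default: None ) –

    A Jinja2 template for the prefix. When omitted, every instance has an empty prefix.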

  • subset (str | None, default: None ) –

    The subset of the dataset to load.

  • keep_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to filter certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.

  • remove_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to remove certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.

  • dataset_kwargs (dict[str, Any] | None, default: None ) –

    Additional keyword arguments for datasets.load_dataset.

Source code in flexeval/core/text_dataset/hf.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class HFTextDataset(TextDataset):
    """
    This class represents a dataset of text examples loaded from Hugging Face datasets.

    Args:
        path: The name of the dataset to load.
        split: The split of the dataset to load.
        text_template: A Jinja2 template for the text.
        prefix_template: A Jinja2 template for the prefix.
            If it is omitted (or empty), every instance gets an empty prefix.
        subset: The subset of the dataset to load.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.
        dataset_kwargs: Additional keyword arguments for `datasets.load_dataset`.
    """

    def __init__(
        self,
        path: str,
        split: str,
        text_template: str,
        prefix_template: str | None = None,
        subset: str | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
        dataset_kwargs: dict[str, Any] | None = None,
    ) -> None:
        dataset_kwargs = dataset_kwargs or {}
        self.dataset = datasets.load_dataset(path, split=split, name=subset, **dataset_kwargs)

        # The compiled template and the expected value are bound as lambda defaults,
        # so each predicate captures the values of its own loop iteration.
        keep_conditions = keep_conditions or {}
        for template_str, value_to_keep in keep_conditions.items():
            filter_template = JINJA2_ENV.from_string(template_str)
            self.dataset = self.dataset.filter(lambda x, t=filter_template, v=value_to_keep: t.render(**x) == v)
        remove_conditions = remove_conditions or {}
        for template_str, value_to_remove in remove_conditions.items():
            filter_template = JINJA2_ENV.from_string(template_str)
            self.dataset = self.dataset.filter(lambda x, t=filter_template, v=value_to_remove: t.render(**x) != v)

        self.text_template = JINJA2_ENV.from_string(text_template)
        # None marks "no prefix configured"; __getitem__ then falls back to "".
        self.prefix_template = JINJA2_ENV.from_string(prefix_template) if prefix_template else None

    def __len__(self) -> int:
        """Return the number of items left in the dataset after filtering."""
        return len(self.dataset)

    def __getitem__(self, i: int) -> TextInstance:
        """Render the item at index `i` through the text (and optional prefix) template."""
        item = self.dataset[i]
        text = self.text_template.render(**item)
        prefix = ""
        # Compare against None explicitly rather than relying on the template object's truthiness.
        if self.prefix_template is not None:
            prefix = self.prefix_template.render(**item)
        return TextInstance(text=text, prefix=prefix)

dataset instance-attribute

dataset = filter(
    lambda x, t=filter_template, v=value_to_remove: render(
        **x
    )
    != v
)

text_template instance-attribute

text_template = from_string(text_template)

prefix_template instance-attribute

prefix_template = None

__init__

__init__(
    path: str,
    split: str,
    text_template: str,
    prefix_template: str | None = None,
    subset: str | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
) -> None
Source code in flexeval/core/text_dataset/hf.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def __init__(
    self,
    path: str,
    split: str,
    text_template: str,
    prefix_template: str | None = None,
    subset: str | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
) -> None:
    """
    Load the dataset, apply the keep/remove filters, then compile the templates.

    Args:
        path: Passed to `datasets.load_dataset` as the dataset to load.
        split: Passed to `datasets.load_dataset` as `split`.
        text_template: Template source compiled into `self.text_template`.
        prefix_template: Template source compiled into `self.prefix_template`;
            when omitted or empty, `self.prefix_template` is `None`.
        subset: Passed to `datasets.load_dataset` as `name`.
        keep_conditions: Maps a template source to the rendered value an item must have to be kept.
        remove_conditions: Maps a template source to the rendered value that gets an item removed.
        dataset_kwargs: Extra keyword arguments forwarded to `datasets.load_dataset`.
    """
    self.dataset = datasets.load_dataset(path, split=split, name=subset, **(dataset_kwargs or {}))

    # Template and expected value are bound as lambda defaults so every predicate
    # keeps the pair from its own iteration.
    for keep_source, kept_value in (keep_conditions or {}).items():
        keep_template = JINJA2_ENV.from_string(keep_source)
        self.dataset = self.dataset.filter(
            lambda row, tmpl=keep_template, expected=kept_value: tmpl.render(**row) == expected
        )
    for remove_source, removed_value in (remove_conditions or {}).items():
        remove_template = JINJA2_ENV.from_string(remove_source)
        self.dataset = self.dataset.filter(
            lambda row, tmpl=remove_template, rejected=removed_value: tmpl.render(**row) != rejected
        )

    self.text_template = JINJA2_ENV.from_string(text_template)
    self.prefix_template = JINJA2_ENV.from_string(prefix_template) if prefix_template else None

__len__

__len__() -> int
Source code in flexeval/core/text_dataset/hf.py
56
57
def __len__(self) -> int:
    """Return the length of `self.dataset`."""
    row_count = len(self.dataset)
    return row_count

__getitem__

__getitem__(i: int) -> TextInstance
Source code in flexeval/core/text_dataset/hf.py
59
60
61
62
63
64
65
def __getitem__(self, i: int) -> TextInstance:
    """
    Build the `TextInstance` for the item at index `i`.

    The item is expanded as keyword arguments into the text template and, if one
    is set, the prefix template; without a prefix template the prefix is "".
    """
    row = self.dataset[i]
    # Text is rendered before the prefix, matching the original evaluation order.
    rendered_text = self.text_template.render(**row)
    rendered_prefix = self.prefix_template.render(**row) if self.prefix_template else ""
    return TextInstance(text=rendered_text, prefix=rendered_prefix)

JsonlTextDataset

This class represents a dataset of text examples loaded from a JSONL file.

Parameters:

  • path (str | PathLike[str]) –

    The path to the JSONL file.

  • field (str) –

    The field to extract from the JSONL file.

Source code in flexeval/core/text_dataset/jsonl.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
class JsonlTextDataset(TextDataset):
    """
    This class represents a dataset of text examples loaded from a JSONL file.

    The file is read as UTF-8 and blank lines are skipped.

    Args:
        path: The path to the JSONL file.
        field: The field to extract from the JSONL file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed JSON object has no `field` key.
    """

    def __init__(self, path: str | PathLike[str], field: str) -> None:
        self._text_list: list[str] = []
        # Read as UTF-8 explicitly instead of depending on the platform's default encoding.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. an extra newline at the end of the file).
                if not line.strip():
                    continue
                item = json.loads(line)
                self._text_list.append(item[field])

    def __len__(self) -> int:
        """Return the number of texts read from the file."""
        return len(self._text_list)

    def __getitem__(self, item: int) -> TextInstance:
        """Return the text at position `item` wrapped in a `TextInstance`."""
        return TextInstance(self._text_list[item])

__init__

__init__(path: str | PathLike[str], field: str) -> None
Source code in flexeval/core/text_dataset/jsonl.py
20
21
22
23
24
25
def __init__(self, path: str | PathLike[str], field: str) -> None:
    self._text_list: list[str] = []
    with open(path) as f:
        for line in f:
            item = json.loads(line)
            self._text_list.append(item[field])

__len__

__len__() -> int
Source code in flexeval/core/text_dataset/jsonl.py
27
28
def __len__(self) -> int:
    """Return how many texts are held in `self._text_list`."""
    text_count = len(self._text_list)
    return text_count

__getitem__

__getitem__(item: int) -> TextInstance
Source code in flexeval/core/text_dataset/jsonl.py
30
31
def __getitem__(self, item: int) -> TextInstance:
    """Wrap the text stored at position `item` of `self._text_list` in a `TextInstance`."""
    selected_text = self._text_list[item]
    return TextInstance(selected_text)