Skip to content

ChatDataset

ChatDataset

A dataset holding ChatInstance.

Source code in flexeval/core/chat_dataset/base.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class ChatDataset(Sequence[ChatInstance], ABC):
    """A dataset holding `ChatInstance`.

    Subclasses only need to implement `__len__` and `__getitem__`;
    `Sequence` then supplies iteration and membership tests on top of them.
    """

    @abstractmethod
    def __len__(self) -> int:
        """
        Returns the number of chat instances in the dataset.
        """
        raise NotImplementedError

    @abstractmethod
    def __getitem__(self, i: int) -> ChatInstance:
        """
        Returns the i-th chat instance.
        """
        raise NotImplementedError

    def require_incremental_response(self) -> bool:
        """If true, the inputs consist of multiple user utterances and the
        model should generate responses for each utterance incrementally.

        Otherwise, the model just has to continue the conversation from the last user utterance.
        """
        return False

    def __repr__(self) -> str:
        """Return a debug representation showing the concrete class and instance count."""
        return f"{self.__class__.__name__}(num_instances={len(self)})"

__len__ abstractmethod

__len__() -> int

Returns the number of chat instances in the dataset.

Source code in flexeval/core/chat_dataset/base.py
65
66
67
68
69
70
@abstractmethod
def __len__(self) -> int:
    """
    Returns the number of chat instances in the dataset.
    """
    raise NotImplementedError

__getitem__ abstractmethod

__getitem__(i: int) -> ChatInstance

Returns the i-th chat instance.

Source code in flexeval/core/chat_dataset/base.py
72
73
74
75
76
77
@abstractmethod
def __getitem__(self, i: int) -> ChatInstance:
    """
    Returns the i-th chat instance.
    """
    raise NotImplementedError

require_incremental_response

require_incremental_response() -> bool

If true, the inputs consist of multiple user utterances and the model should generate responses for each utterance incrementally.

Otherwise, the model just has to continue the conversation from the last user utterance.

Source code in flexeval/core/chat_dataset/base.py
79
80
81
82
83
84
85
def require_incremental_response(self) -> bool:
    """If true, the inputs consist of multiple user utterances and the
    model should generate responses for each utterance incrementally.

    Otherwise, the model just has to continue the conversation from the last user utterance.
    """
    return False

__repr__

__repr__() -> str
Source code in flexeval/core/chat_dataset/base.py
87
88
def __repr__(self) -> str:
    return f"{self.__class__.__name__}(num_instances={len(self)})"

ChatInstance dataclass

A dataclass representing a single chat that will be fed to a chat language model.

Source code in flexeval/core/chat_dataset/base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@dataclass
class ChatInstance:
    """
    A dataclass representing a single chat that will be fed to a chat language model.
    """

    messages: list[dict[str, Any]]
    """
    A list of messages in the chat.
    The format of messages typically follows [OpenAI's Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
    ```json
    [
        {
            "role": "assistant",
            "content": "Hello! How can I help you today?"
        },
        {
            "role": "user",
            "content": "I'd like to book a flight to Paris."
        }
    ]
    ```
    """
    references: list[str] = field(default_factory=list)
    """
    A list of reference responses to the user's last message.
    The model's response will be evaluated against these references.
    """
    extra_info: dict[str, Any] = field(default_factory=dict)
    """
    Extra information that can be used by passing to `Metric`.
    """

    def __post_init__(self) -> None:
        # "messages" is reserved for the dataclass attribute of the same name;
        # leaving it in extra_info would let it shadow the real messages downstream,
        # so warn the caller and drop the key.
        if "messages" in self.extra_info:
            msg = (
                "'extra_info' in ChatInstance cannot contain a key named 'messages', "
                "as it will conflict with the 'messages' attribute. "
                "The key 'messages' will be removed."
            )
            warnings.warn(msg, stacklevel=2)
            self.extra_info.pop("messages")

    @property
    def inputs(self) -> list[dict[str, Any]]:
        """
        Alias for `messages` (hence the same element type as `messages`).
        This is used in `FewShotGenerator` so that it can access the inputs with the same attribute name as
        `GenerationInstance` and `MultipleChoiceInstance`.
        """
        return self.messages

messages instance-attribute

messages: list[dict[str, Any]]

A list of messages in the chat. The format of messages typically follows OpenAI's Chat Completions API.

[
    {
        "role": "assistant",
        "content": "Hello! How can I help you today?"
    },
    {
        "role": "user",
        "content": "I'd like to book a flight to Paris."
    }
]

references class-attribute instance-attribute

references: list[str] = field(default_factory=list)

A list of reference responses to the user's last message. The model's response will be evaluated against these references.

extra_info class-attribute instance-attribute

extra_info: dict[str, Any] = field(default_factory=dict)

Extra information that can be used by passing to Metric.

inputs property

inputs: list[dict[str, str]]

Alias for messages. This is used in FewShotGenerator so that it can access the inputs with the same attribute name as GenerationInstance and MultipleChoiceInstance.

__init__

__init__(
    messages: list[dict[str, Any]],
    references: list[str] = list(),
    extra_info: dict[str, Any] = dict(),
) -> None

__post_init__

__post_init__() -> None
Source code in flexeval/core/chat_dataset/base.py
42
43
44
45
46
47
48
49
50
def __post_init__(self) -> None:
    if "messages" in self.extra_info:
        msg = (
            "'extra_info' in ChatInstance cannot contain a key named 'messages', "
            "as it will conflict with the 'messages' attribute. "
            "The key 'messages' will be removed."
        )
        warnings.warn(msg, stacklevel=2)
        self.extra_info.pop("messages")

HFChatDataset

Load ChatInstances from a Hugging Face dataset.

Parameters:

  • path (str) –

    The path to the Hugging Face dataset.

  • split (str) –

    The split of the dataset.

  • subset (str | None, default: None ) –

    The subset of the dataset.

  • dataset_kwargs (dict[str, Any] | None, default: None ) –

    The keyword arguments to pass to the Hugging Face dataset.

Source code in flexeval/core/chat_dataset/template_based.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
class HFChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a Hugging Face dataset.

    Args:
        path: The path to the Hugging Face dataset.
        split: The split of the dataset.
        input_template: A Jinja2 template for the user input.
        subset: The subset of the dataset.
        dataset_kwargs: The keyword arguments to pass to the Hugging Face dataset.

    All remaining keyword arguments (`reference_template`, `reference_list_template`,
    `require_incremental_response`, `extra_info_templates`, `system_message_template`,
    `data_range`, `keep_conditions`, `remove_conditions`) are forwarded unchanged to
    `TemplateChatDataset`; see its docstring for their meaning.
    """

    def __init__(
        self,
        path: str,
        split: str,
        input_template: str,
        subset: str | None = None,
        dataset_kwargs: dict[str, Any] | None = None,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        dataset_kwargs = dataset_kwargs or {}
        dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
        # Materialize the (possibly lazily loaded) HF dataset rows as plain dicts
        # so the template-based base class can render them.
        items = [dict(item) for item in dataset]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def __init__(
    self,
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    dataset_kwargs = dataset_kwargs or {}
    dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
    items = [dict(item) for item in dataset]

    super().__init__(
        items=items,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

JsonlChatDataset

Load ChatInstances from a JSONL file.

Parameters:

  • path (str) –

    The path to the JSONL file.

Source code in flexeval/core/chat_dataset/template_based.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
class JsonlChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a JSONL file (one JSON object per line).

    Args:
        path: The path to the JSONL file.
        input_template: A Jinja2 template for the user input.

    All remaining keyword arguments are forwarded unchanged to
    `TemplateChatDataset`; see its docstring for their meaning.
    """

    def __init__(
        self,
        path: str,
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # NOTE(review): relies on the platform default encoding — consider
        # open(path, encoding="utf-8") for portability; confirm with maintainers.
        with open(path) as f:
            items = [json.loads(line) for line in f]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def __init__(
    self,
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    with open(path) as f:
        items = [json.loads(line) for line in f]

    super().__init__(
        items=items,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

TemplateChatDataset

This class only supports single-turn chat.

Parameters:

  • items (list[dict[str, Any]]) –

    A list of items in a dict format.

  • input_template (str) –

    A Jinja2 template for the user input.

  • reference_template (str | None, default: None ) –

    Specify the Jinja2 template to render the reference string if the dataset has a single reference.

  • reference_list_template (str | None, default: None ) –

    Specify the Jinja2 template to render a list of reference strings if the dataset has multiple references.

  • require_incremental_response (bool, default: False ) –

    Whether the dataset requires incremental response.

  • extra_info_templates (dict[str, str] | None, default: None ) –

    A dictionary of Jinja2 templates for extra information.

  • system_message_template (str | None, default: None ) –

    A Jinja2 template for the system message.

  • data_range (tuple[int, int] | None, default: None ) –

    The range of data to use.

  • keep_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to filter certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.

  • remove_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to remove certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.

Source code in flexeval/core/chat_dataset/template_based.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class TemplateChatDataset(ChatDataset):
    """
    This class only supports single-turn chat.

    Args:
        items: A list of items in a dict format.
        input_template: A Jinja2 template for the user input.
        reference_template: Specify the Jinja2 template to render the reference string
            if the dataset has a single reference.
        reference_list_template: Specify the Jinja2 template to render a list of reference strings
            if the dataset has multiple references.
        require_incremental_response: Whether the dataset requires incremental response.
        extra_info_templates: A dictionary of Jinja2 templates for extra information.
        system_message_template: A Jinja2 template for the system message.
        data_range: The range of data to use.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.

    Raises:
        ValueError: If both `reference_template` and `reference_list_template` are given.
    """

    def __init__(
        self,
        items: list[dict[str, Any]],
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # The two reference mechanisms are mutually exclusive.
        if reference_template and reference_list_template:
            msg = "Only one of reference_template and reference_list_template can be set."
            raise ValueError(msg)

        # Slice first, then filter: keep/remove conditions below apply only to
        # the selected range.
        if data_range:
            start, end = data_range
            items = items[start:end]

        keep_conditions = keep_conditions or {}
        for template_str, value_to_keep in keep_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) == value_to_keep]
        remove_conditions = remove_conditions or {}
        for template_str, value_to_remove in remove_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) != value_to_remove]

        self.items = items

        # Templates are compiled once here and re-rendered per item in __getitem__.
        self.input_template = JINJA2_ENV.from_string(input_template)
        self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
        self.reference_list_template = (
            JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
        )

        extra_info_templates = extra_info_templates or {}
        self._extra_info_templates: dict[str, Template] = {
            key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
        }

        self._system_message_template: Template | None = (
            JINJA2_ENV.from_string(system_message_template) if system_message_template else None
        )

        self._require_incremental_response = require_incremental_response

    def require_incremental_response(self) -> bool:
        return self._require_incremental_response

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, i: int) -> ChatInstance:
        item = self.items[i]
        input_utterance = self.input_template.render(**item)
        messages = [{"role": "user", "content": input_utterance}]

        # The system message, if any, must precede the user turn.
        if self._system_message_template:
            system_message = self._system_message_template.render(**item)
            messages.insert(0, {"role": "system", "content": system_message})

        reference_list: list[str] = []
        if self.reference_template:
            reference_string = self.reference_template.render(**item)
            reference_list.append(reference_string)
        if self.reference_list_template:
            reference_list_string = self.reference_list_template.render(**item)
            # Cheap sanity check that the rendered text looks like a Python/JSON
            # list before handing it to literal_eval.
            if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
                msg = (
                    f"The reference_list_template should render a list of strings "
                    f"but we got `{reference_list_string}`."
                )
                raise ValueError(msg)
            reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

        # Raw item fields go into extra_info; rendered extra_info templates
        # override raw fields on key collision.
        extra_info = dict(item.items())
        extra_info_from_templates = {
            key: template.render(**item) for key, template in self._extra_info_templates.items()
        }
        extra_info.update(extra_info_from_templates)

        return ChatInstance(messages=messages, references=reference_list, extra_info=extra_info)

items instance-attribute

items = items

input_template instance-attribute

input_template = from_string(input_template)

reference_template instance-attribute

reference_template = (
    from_string(reference_template)
    if reference_template
    else None
)

reference_list_template instance-attribute

reference_list_template = (
    from_string(reference_list_template)
    if reference_list_template
    else None
)

__init__

__init__(
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def __init__(
    self,
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    if reference_template and reference_list_template:
        msg = "Only one of reference_template and reference_list_template can be set."
        raise ValueError(msg)

    if data_range:
        start, end = data_range
        items = items[start:end]

    keep_conditions = keep_conditions or {}
    for template_str, value_to_keep in keep_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) == value_to_keep]
    remove_conditions = remove_conditions or {}
    for template_str, value_to_remove in remove_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) != value_to_remove]

    self.items = items

    self.input_template = JINJA2_ENV.from_string(input_template)
    self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
    self.reference_list_template = (
        JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
    )

    extra_info_templates = extra_info_templates or {}
    self._extra_info_templates: dict[str, Template] = {
        key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
    }

    self._system_message_template: Template | None = (
        JINJA2_ENV.from_string(system_message_template) if system_message_template else None
    )

    self._require_incremental_response = require_incremental_response

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/template_based.py
86
87
def require_incremental_response(self) -> bool:
    return self._require_incremental_response

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/template_based.py
89
90
def __len__(self) -> int:
    return len(self.items)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/template_based.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def __getitem__(self, i: int) -> ChatInstance:
    item = self.items[i]
    input_utterance = self.input_template.render(**item)
    messages = [{"role": "user", "content": input_utterance}]

    if self._system_message_template:
        system_message = self._system_message_template.render(**item)
        messages.insert(0, {"role": "system", "content": system_message})

    reference_list: list[str] = []
    if self.reference_template:
        reference_string = self.reference_template.render(**item)
        reference_list.append(reference_string)
    if self.reference_list_template:
        reference_list_string = self.reference_list_template.render(**item)
        if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
            msg = (
                f"The reference_list_template should render a list of strings "
                f"but we got `{reference_list_string}`."
            )
            raise ValueError(msg)
        reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

    extra_info = dict(item.items())
    extra_info_from_templates = {
        key: template.render(**item) for key, template in self._extra_info_templates.items()
    }
    extra_info.update(extra_info_from_templates)

    return ChatInstance(messages=messages, references=reference_list, extra_info=extra_info)

ChatbotBench

This class loads data with the jsonl format used in chat evaluation benchmarks such as MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

Example of a line from a jsonl file:

{
  "question_id": 0,
  "category": "writing",
  "turns": [
    "Compose an engaging travel blog post about a recent trip to Hawaii.",
    "Rewrite your previous response. Start every sentence with the letter A."
  ]
}

Source code in flexeval/core/chat_dataset/chatbot_bench.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class ChatbotBench(ChatDataset):
    """This class loads data with the jsonl format used in chat evaluation benchmarks such as
    MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

    Example of a line from a jsonl file:
        {
          "question_id": 0,
          "category": "writing",
          "turns": [
            "Compose an engaging travel blog post about a recent trip to Hawaii.",
            "Rewrite your previous response. Start every sentence with the letter A."
          ]
        }

    Args:
        path_or_name: Path (or name resolvable by `resolve_path_or_name`) of the
            question jsonl file.
        ref_path_or_name: Optional path/name of a jsonl file with reference answers,
            read from `item["choices"][0]["turns"]` keyed by `question_id`.
        need_ref_categories: Categories whose instances get references attached.
            Defaults to `["math", "coding", "reasoning"]`.
        load_only_first_n: If set, keep only the first n user turns of each question.
    """

    def __init__(
        self,
        path_or_name: str,
        ref_path_or_name: str | None = None,
        need_ref_categories: list[str] | None = None,
        load_only_first_n: int | None = None,
    ) -> None:
        file_path = resolve_path_or_name(path_or_name)

        self._id_to_question_id: list[int | str] = []  # dataset index -> question_id
        self._id_to_category: list[str] = []  # dataset index -> category
        self._messages_dict: dict[int | str, list[dict[str, str]]] = {}  # question_id -> user turns
        with open(file_path) as f:
            for line in f:
                item = json.loads(line)
                self._id_to_question_id.append(item["question_id"])
                self._id_to_category.append(item["category"])
                input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
                if load_only_first_n is not None:
                    input_messages = input_messages[:load_only_first_n]
                self._messages_dict[item["question_id"]] = input_messages

        # question_id -> list of reference answer turns (empty if no reference file given)
        self._references_dict: dict[int | str, list[str]] = {}
        if ref_path_or_name is not None:
            ref_file_path = resolve_path_or_name(ref_path_or_name)
            with open(ref_file_path) as f:
                for line in f:
                    item = json.loads(line)
                    self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

        self.need_ref_categories = need_ref_categories or [
            "math",
            "coding",
            "reasoning",
        ]

    def require_incremental_response(self) -> bool:
        """Always true: each question holds multiple user turns answered one by one."""
        return True

    def __len__(self) -> int:
        return len(self._id_to_question_id)

    def __getitem__(self, i: int) -> ChatInstance:
        question_id = self._id_to_question_id[i]
        category = self._id_to_category[i]
        # Only categories listed in need_ref_categories carry references;
        # missing question_ids fall back to an empty list.
        references: list[str] = []
        if category in self.need_ref_categories:
            references = self._references_dict.get(question_id, [])
        return ChatInstance(self._messages_dict[question_id], references=references, extra_info={"category": category})

need_ref_categories instance-attribute

need_ref_categories = need_ref_categories or [
    "math",
    "coding",
    "reasoning",
]

__init__

__init__(
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None
Source code in flexeval/core/chat_dataset/chatbot_bench.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def __init__(
    self,
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None:
    file_path = resolve_path_or_name(path_or_name)

    self._id_to_question_id: list[int | str] = []
    self._id_to_category: list[str] = []
    self._messages_dict: dict[int | str, list[dict[str, str]]] = {}
    with open(file_path) as f:
        for line in f:
            item = json.loads(line)
            self._id_to_question_id.append(item["question_id"])
            self._id_to_category.append(item["category"])
            input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
            if load_only_first_n is not None:
                input_messages = input_messages[:load_only_first_n]
            self._messages_dict[item["question_id"]] = input_messages

    self._references_dict: dict[int | str, list[str]] = {}
    if ref_path_or_name is not None:
        ref_file_path = resolve_path_or_name(ref_path_or_name)
        with open(ref_file_path) as f:
            for line in f:
                item = json.loads(line)
                self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

    self.need_ref_categories = need_ref_categories or [
        "math",
        "coding",
        "reasoning",
    ]

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/chatbot_bench.py
73
74
def require_incremental_response(self) -> bool:
    return True

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/chatbot_bench.py
76
77
def __len__(self) -> int:
    return len(self._id_to_question_id)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/chatbot_bench.py
79
80
81
82
83
84
85
def __getitem__(self, i: int) -> ChatInstance:
    question_id = self._id_to_question_id[i]
    category = self._id_to_category[i]
    references: list[str] = []
    if category in self.need_ref_categories:
        references = self._references_dict.get(question_id, [])
    return ChatInstance(self._messages_dict[question_id], references=references, extra_info={"category": category})

SacreBleuChatDataset

Load datasets from the sacrebleu library. The available datasets are defined in sacrebleu.DATASETS.

Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class SacreBleuChatDataset(ChatDataset):
    """Load datasets from the [sacrebleu](https://github.com/mjpost/sacrebleu) library.
    The available datasets are defined in sacrebleu.DATASETS.

    Args:
        name: The dataset name as registered in `sacrebleu.DATASETS`.
        langpair: The language pair of the dataset to load.

    Raises:
        ValueError: If the number of source and reference entries differ.
    """

    def __init__(self, name: str, langpair: str) -> None:
        self._source_list: list[str] = list(sacrebleu.DATASETS[name].source(langpair))
        # Each source sentence may have multiple references; strip surrounding whitespace.
        self._references_list: list[list[str]] = [
            [r.strip() for r in refs] for refs in sacrebleu.DATASETS[name].references(langpair)
        ]

        if len(self._source_list) != len(self._references_list):
            msg = "The number of source and reference pairs should be the same."
            raise ValueError(msg)

    def require_incremental_response(self) -> bool:
        """Always false: each instance is a single source sentence to translate."""
        return False

    def __len__(self) -> int:
        return len(self._source_list)

    def __getitem__(self, i: int) -> ChatInstance:
        # Each source sentence becomes a one-turn chat with its references attached.
        return ChatInstance(
            messages=[{"role": "user", "content": self._source_list[i]}],
            references=self._references_list[i],
            extra_info={},
        )

__init__

__init__(name: str, langpair: str) -> None
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
11
12
13
14
15
16
17
18
19
def __init__(self, name: str, langpair: str) -> None:
    self._source_list: list[str] = list(sacrebleu.DATASETS[name].source(langpair))
    self._references_list: list[list[str]] = [
        [r.strip() for r in refs] for refs in sacrebleu.DATASETS[name].references(langpair)
    ]

    if len(self._source_list) != len(self._references_list):
        msg = "The number of source and reference pairs should be the same."
        raise ValueError(msg)

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
21
22
def require_incremental_response(self) -> bool:
    return False

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
24
25
def __len__(self) -> int:
    return len(self._source_list)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
27
28
29
30
31
32
def __getitem__(self, i: int) -> ChatInstance:
    return ChatInstance(
        messages=[{"role": "user", "content": self._source_list[i]}],
        references=self._references_list[i],
        extra_info={},
    )