Skip to content

ChatDataset

ChatDataset

A dataset holding ChatInstance.

Source code in flexeval/core/chat_dataset/base.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class ChatDataset(Sequence[ChatInstance], ABC):
    """Abstract base class for datasets made up of `ChatInstance` objects."""

    @abstractmethod
    def __len__(self) -> int:
        """Return how many chat instances the dataset contains."""
        raise NotImplementedError

    @abstractmethod
    def __getitem__(self, i: int) -> ChatInstance:
        """Return the chat instance stored at index `i`."""
        raise NotImplementedError

    def require_incremental_response(self) -> bool:
        """Tell whether responses must be generated turn by turn.

        When this returns ``True``, the inputs consist of multiple user
        utterances and the model should generate a response for each
        utterance incrementally. When it returns ``False`` (the default),
        the model just has to continue the conversation from the last
        user utterance.
        """
        return False

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(num_instances={len(self)})"

__len__ abstractmethod

__len__() -> int

Returns the number of chat instances in the dataset.

Source code in flexeval/core/chat_dataset/base.py
71
72
73
74
75
76
@abstractmethod
def __len__(self) -> int:
    """Return the number of chat instances contained in the dataset."""
    raise NotImplementedError

__getitem__ abstractmethod

__getitem__(i: int) -> ChatInstance

Returns the i-th chat instance.

Source code in flexeval/core/chat_dataset/base.py
78
79
80
81
82
83
@abstractmethod
def __getitem__(self, i: int) -> ChatInstance:
    """Return the chat instance stored at index `i`."""
    raise NotImplementedError

require_incremental_response

require_incremental_response() -> bool

If true, the inputs consist of multiple user utterances and the model should generate responses for each utterance incrementally.

Otherwise, the model just has to continue the conversation from the last user utterance.

Source code in flexeval/core/chat_dataset/base.py
85
86
87
88
89
90
91
def require_incremental_response(self) -> bool:
    """Tell whether the model should answer each user utterance in turn.

    When this returns ``True``, the inputs contain multiple user utterances
    and the model should generate a response for each one incrementally.
    When it returns ``False`` (the default), the model only has to continue
    the conversation from the last user utterance.
    """
    return False

__repr__

__repr__() -> str
Source code in flexeval/core/chat_dataset/base.py
93
94
def __repr__(self) -> None:
    # Renders e.g. "SomeChatDataset(num_instances=123)"; len(self) delegates
    # to the concrete subclass implementation of __len__.
    return f"{self.__class__.__name__}(num_instances={len(self)})"

ChatInstance dataclass

A dataclass representing a single chat that will be fed to a chat language model.

Source code in flexeval/core/chat_dataset/base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
@dataclass
class ChatInstance:
    """
    A dataclass representing a single chat that will be fed to a chat language model.
    """

    messages: list[dict[str, Any]]
    """
    A list of messages in the chat.
    The format of messages typically follows [OpenAI's Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
    ```json
    [
        {
            "role": "assistant",
            "content": "Hello! How can I help you today?"
        },
        {
            "role": "user",
            "content": "I'd like to book a flight to Paris."
        }
    ]
    ```
    """
    tools: list[dict[str, Any]] | None = None
    """
    A list of definitions of tools in the chat.
    The format of tools typically follows [OpenAI's Chat Completion API](https://platform.openai.com/docs/guides/function-calling#function-calling-steps).
    Currently, only function calling (tools with type="function") is supported.
    """
    references: list[str] = field(default_factory=list)
    """
    A list of reference responses to the user's last message.
    The model's response will be evaluated against these references.
    """
    extra_info: dict[str, Any] = field(default_factory=dict)
    """
    Extra information that can be used by passing to `Metric`.
    """

    def __post_init__(self) -> None:
        # A "messages" key inside extra_info would conflict with the
        # `messages` attribute (see the warning text), so warn and drop it.
        if "messages" in self.extra_info:
            msg = (
                "'extra_info' in ChatInstance cannot contain a key named 'messages', "
                "as it will conflict with the 'messages' attribute. "
                "The key 'messages' will be removed."
            )
            warnings.warn(msg, stacklevel=2)
            self.extra_info.pop("messages")

    @property
    def inputs(self) -> list[dict[str, Any]]:
        """
        Alias for `messages`.
        This is used in `FewShotGenerator` so that it can access the inputs with the same attribute name as
        `GenerationInstance` and `MultipleChoiceInstance`.
        """
        # Fix: the return annotation previously said list[dict[str, str]],
        # which contradicted the declared type of `messages`.
        return self.messages

messages instance-attribute

messages: list[dict[str, Any]]

A list of messages in the chat. The format of messages typically follows OpenAI's Chat Completions API.

[
    {
        "role": "assistant",
        "content": "Hello! How can I help you today?"
    },
    {
        "role": "user",
        "content": "I'd like to book a flight to Paris."
    }
]

tools class-attribute instance-attribute

tools: list[dict[str, Any]] | None = None

A list of definitions of tools in the chat. The format of tools typically follows OpenAI's Chat Completion API. Currently, only function calling (tools with type="function") is supported.

references class-attribute instance-attribute

references: list[str] = field(default_factory=list)

A list of reference responses to the user's last message. The model's response will be evaluated against these references.

extra_info class-attribute instance-attribute

extra_info: dict[str, Any] = field(default_factory=dict)

Extra information that can be used by passing to Metric.

inputs property

inputs: list[dict[str, str]]

Alias for messages. This is used in FewShotGenerator so that it can access the inputs with the same attribute name as GenerationInstance and MultipleChoiceInstance.

__init__

__init__(
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None = None,
    references: list[str] = list(),
    extra_info: dict[str, Any] = dict(),
) -> None

__post_init__

__post_init__() -> None
Source code in flexeval/core/chat_dataset/base.py
48
49
50
51
52
53
54
55
56
def __post_init__(self) -> None:
    """Drop a conflicting 'messages' key from `extra_info`, emitting a warning."""
    if "messages" not in self.extra_info:
        return
    msg = (
        "'extra_info' in ChatInstance cannot contain a key named 'messages', "
        "as it will conflict with the 'messages' attribute. "
        "The key 'messages' will be removed."
    )
    warnings.warn(msg, stacklevel=2)
    del self.extra_info["messages"]

HFChatDataset

Load ChatInstances from a Hugging Face dataset.

Parameters:

  • path (str) –

    The path to the Hugging Face dataset.

  • split (str) –

    The split of the dataset.

  • subset (str | None, default: None ) –

    The subset of the dataset.

  • dataset_kwargs (dict[str, Any] | None, default: None ) –

    The keyword arguments to pass to the Hugging Face dataset.

Source code in flexeval/core/chat_dataset/template_based.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
class HFChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a Hugging Face dataset.

    Args:
        path: The path to the Hugging Face dataset.
        split: The split of the dataset.
        input_template: A Jinja2 template for the user input.
        subset: The subset of the dataset.
        dataset_kwargs: The keyword arguments to pass to the Hugging Face dataset.
        reference_template: Specify the Jinja2 template to render the reference string
            if the dataset has a single reference.
        reference_list_template: Specify the Jinja2 template to render a list of reference strings
            if the dataset has multiple references.
        require_incremental_response: Whether the dataset requires incremental response.
        extra_info_templates: A dictionary of Jinja2 templates for extra information.
        system_message_template: A Jinja2 template for the system message.
        data_range: The range of data to use.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.
    """

    def __init__(
        self,
        path: str,
        split: str,
        input_template: str,
        subset: str | None = None,
        dataset_kwargs: dict[str, Any] | None = None,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        dataset_kwargs = dataset_kwargs or {}
        # Materialize each dataset row as a plain dict so TemplateChatDataset
        # can slice, filter, and render it with Jinja2 templates.
        dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
        items = [dict(item) for item in dataset]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def __init__(
    self,
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    """Load a Hugging Face dataset and hand its rows to `TemplateChatDataset`."""
    if dataset_kwargs is None:
        dataset_kwargs = {}
    hf_dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
    # Convert each row into a plain dict for template-based processing.
    rows = [dict(row) for row in hf_dataset]

    super().__init__(
        items=rows,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

JsonlChatDataset

Load ChatInstances from a JSONL file.

Parameters:

  • path (str) –

    The path to the JSONL file.

Source code in flexeval/core/chat_dataset/template_based.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class JsonlChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a JSONL file.

    Args:
        path: The path to the JSONL file. Each line must be a single JSON object
            representing one item.

    The remaining constructor arguments are forwarded unchanged to
    `TemplateChatDataset`.
    """

    def __init__(
        self,
        path: str,
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # Read as UTF-8 explicitly: the platform default encoding may differ,
        # which would break or corrupt non-ASCII dataset content.
        with open(path, encoding="utf-8") as f:
            items = [json.loads(line) for line in f]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def __init__(
    self,
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    """Load one JSON object per line from `path` and delegate to `TemplateChatDataset`."""
    # Read as UTF-8 explicitly: the platform default encoding may differ,
    # which would break or corrupt non-ASCII dataset content.
    with open(path, encoding="utf-8") as f:
        items = [json.loads(line) for line in f]

    super().__init__(
        items=items,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

TemplateChatDataset

This class only supports single-turn chat.

Parameters:

  • items (list[dict[str, Any]]) –

    A list of items in a dict format. The "tools" key for each item can contain the list of function definitions. They should be in JSON Schema format as in the OpenAI Chat Completion API. https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions

  • input_template (str) –

    A Jinja2 template for the user input.

  • reference_template (str | None, default: None ) –

    Specify the Jinja2 template to render the reference string if the dataset has a single reference.

  • reference_list_template (str | None, default: None ) –

    Specify the Jinja2 template to render a list of reference strings if the dataset has multiple references.

  • require_incremental_response (bool, default: False ) –

    Whether the dataset requires incremental response.

  • extra_info_templates (dict[str, str] | None, default: None ) –

    A dictionary of Jinja2 templates for extra information.

  • system_message_template (str | None, default: None ) –

    A Jinja2 template for the system message.

  • data_range (tuple[int, int] | None, default: None ) –

    The range of data to use.

  • keep_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to filter certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.

  • remove_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to remove certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.

Source code in flexeval/core/chat_dataset/template_based.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class TemplateChatDataset(ChatDataset):
    """
    This class only supports single-turn chat.

    Args:
        items: A list of items in a dict format.
            The "tools" key for each item can contain the list of function definitions.
            They should be in JSON Schema format as in the OpenAI Chat Completion API.
            https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions
        input_template: A Jinja2 template for the user input.
        reference_template: Specify the Jinja2 template to render the reference string
            if the dataset has a single reference.
        reference_list_template: Specify the Jinja2 template to render a list of reference strings
            if the dataset has multiple references.
        require_incremental_response: Whether the dataset requires incremental response.
        extra_info_templates: A dictionary of Jinja2 templates for extra information.
        system_message_template: A Jinja2 template for the system message.
        data_range: The range of data to use.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.
    """

    def __init__(
        self,
        items: list[dict[str, Any]],
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # The two reference mechanisms are mutually exclusive: one renders a
        # single string, the other a Python-literal list of strings.
        if reference_template and reference_list_template:
            msg = "Only one of reference_template and reference_list_template can be set."
            raise ValueError(msg)

        # Optionally restrict to a sub-range of the items before filtering.
        if data_range:
            start, end = data_range
            items = items[start:end]

        # Keep only items whose rendered template value equals the given value.
        keep_conditions = keep_conditions or {}
        for template_str, value_to_keep in keep_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) == value_to_keep]
        # Drop items whose rendered template value equals the given value.
        remove_conditions = remove_conditions or {}
        for template_str, value_to_remove in remove_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) != value_to_remove]

        self.items = items

        # Compile all Jinja2 templates once here so __getitem__ only renders.
        self.input_template = JINJA2_ENV.from_string(input_template)
        self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
        self.reference_list_template = (
            JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
        )

        extra_info_templates = extra_info_templates or {}
        self._extra_info_templates: dict[str, Template] = {
            key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
        }

        self._system_message_template: Template | None = (
            JINJA2_ENV.from_string(system_message_template) if system_message_template else None
        )

        self._require_incremental_response = require_incremental_response

    def require_incremental_response(self) -> bool:
        # Configured at construction time rather than derived from the data.
        return self._require_incremental_response

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, i: int) -> ChatInstance:
        """Render the i-th item's templates into a single-turn `ChatInstance`."""
        item = self.items[i]
        input_utterance = self.input_template.render(**item)
        messages = [{"role": "user", "content": input_utterance}]

        # The system message, if configured, must precede the user message.
        if self._system_message_template:
            system_message = self._system_message_template.render(**item)
            messages.insert(0, {"role": "system", "content": system_message})

        reference_list: list[str] = []
        if self.reference_template:
            reference_string = self.reference_template.render(**item)
            reference_list.append(reference_string)
        if self.reference_list_template:
            reference_list_string = self.reference_list_template.render(**item)
            # The rendered text is parsed with literal_eval, so it must at least
            # look like a Python list; fail fast with a clear error otherwise.
            if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
                msg = (
                    f"The reference_list_template should render a list of strings "
                    f"but we got `{reference_list_string}`."
                )
                raise ValueError(msg)
            reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

        # Expose the raw item fields (plus rendered extra-info templates,
        # which take precedence on key collisions) via extra_info.
        extra_info = dict(item.items())
        extra_info_from_templates = {
            key: template.render(**item) for key, template in self._extra_info_templates.items()
        }
        extra_info.update(extra_info_from_templates)

        return ChatInstance(
            messages=messages, tools=item.get("tools"), references=reference_list, extra_info=extra_info
        )

items instance-attribute

items = items

input_template instance-attribute

input_template = from_string(input_template)

reference_template instance-attribute

reference_template = (
    from_string(reference_template)
    if reference_template
    else None
)

reference_list_template instance-attribute

reference_list_template = (
    from_string(reference_list_template)
    if reference_list_template
    else None
)

__init__

__init__(
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def __init__(
    self,
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    # The two reference mechanisms are mutually exclusive: one renders a
    # single string, the other a Python-literal list of strings.
    if reference_template and reference_list_template:
        msg = "Only one of reference_template and reference_list_template can be set."
        raise ValueError(msg)

    # Optionally restrict to a sub-range of the items before filtering.
    if data_range:
        start, end = data_range
        items = items[start:end]

    # Keep only items whose rendered template value equals the given value.
    keep_conditions = keep_conditions or {}
    for template_str, value_to_keep in keep_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) == value_to_keep]
    # Drop items whose rendered template value equals the given value.
    remove_conditions = remove_conditions or {}
    for template_str, value_to_remove in remove_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) != value_to_remove]

    self.items = items

    # Compile all Jinja2 templates once here so __getitem__ only renders.
    self.input_template = JINJA2_ENV.from_string(input_template)
    self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
    self.reference_list_template = (
        JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
    )

    extra_info_templates = extra_info_templates or {}
    self._extra_info_templates: dict[str, Template] = {
        key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
    }

    self._system_message_template: Template | None = (
        JINJA2_ENV.from_string(system_message_template) if system_message_template else None
    )

    self._require_incremental_response = require_incremental_response

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/template_based.py
89
90
def require_incremental_response(self) -> bool:
    """Report the `require_incremental_response` flag given at construction."""
    return self._require_incremental_response

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/template_based.py
92
93
def __len__(self) -> int:
    """Return the number of loaded (and filtered) items."""
    return len(self.items)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/template_based.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def __getitem__(self, i: int) -> ChatInstance:
    """Render the i-th item's templates into a single-turn `ChatInstance`."""
    item = self.items[i]
    input_utterance = self.input_template.render(**item)
    messages = [{"role": "user", "content": input_utterance}]

    # The system message, if configured, must precede the user message.
    if self._system_message_template:
        system_message = self._system_message_template.render(**item)
        messages.insert(0, {"role": "system", "content": system_message})

    reference_list: list[str] = []
    if self.reference_template:
        reference_string = self.reference_template.render(**item)
        reference_list.append(reference_string)
    if self.reference_list_template:
        reference_list_string = self.reference_list_template.render(**item)
        # The rendered text is parsed with literal_eval, so it must at least
        # look like a Python list; fail fast with a clear error otherwise.
        if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
            msg = (
                f"The reference_list_template should render a list of strings "
                f"but we got `{reference_list_string}`."
            )
            raise ValueError(msg)
        reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

    # Expose the raw item fields (plus rendered extra-info templates,
    # which take precedence on key collisions) via extra_info.
    extra_info = dict(item.items())
    extra_info_from_templates = {
        key: template.render(**item) for key, template in self._extra_info_templates.items()
    }
    extra_info.update(extra_info_from_templates)

    return ChatInstance(
        messages=messages, tools=item.get("tools"), references=reference_list, extra_info=extra_info
    )

ChatbotBench

This class loads data with the jsonl format used in chat evaluation benchmarks such as MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

Example of a line from a jsonl file

{ "question_id": 00, "category": "writing", "turns": [ "Compose an engaging travel blog post about a recent trip to Hawaii.", "Rewrite your previous response. Start every sentence with the letter A." ] # 'tools' key is optional. # It should be in the same format as FunctionCalling in the OpenAI ChatCompletion API. # https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get current temperature for a given location.", "parameters": { "type": "object", "properties": { "location": {"type": "string", "description": "City and country e.g. Bogotá, Colombia"}, }, "required": ["location"], "additionalProperties": False}, "strict": True }, }, ], # 'system_message' key is optional. # If set, it will be inserted in the first turn as a system prompt "system_message": "You are a helpful assistant." }

Source code in flexeval/core/chat_dataset/chatbot_bench.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class ChatbotBench(ChatDataset):
    """This class loads data with the jsonl format used in chat evaluation benchmarks such as
    MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

    Example of a line from a jsonl file:
        {
          "question_id": 00,
          "category": "writing",
          "turns": [
            "Compose an engaging travel blog post about a recent trip to Hawaii.",
            "Rewrite your previous response. Start every sentence with the letter A."
          ]
          # 'tools' key is optional.
          # It should be in the same format as FunctionCalling in the OpenAI ChatCompletion API.
          # https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions
          "tools": [
            {
              "type": "function",
              "function": {
                "name": "get_weather",
                "description": "Get current temperature for a given location.",
                "parameters": {
                  "type": "object",
                  "properties": {
                    "location": {"type": "string", "description": "City and country e.g. Bogotá, Colombia"},
                  },
                  "required": ["location"],
                  "additionalProperties": False},
                "strict": True
              },
            },
          ],
          # 'system_message' key is optional.
          # If set, it will be inserted in the first turn as a system prompt
          "system_message": "You are a helpful assistant."
        }
    """

    def __init__(
        self,
        path_or_name: str,
        ref_path_or_name: str | None = None,
        need_ref_categories: list[str] | None = None,
        load_only_first_n: int | None = None,
    ) -> None:
        """Load questions (and optionally reference answers) from jsonl files.

        Args:
            path_or_name: Path to, or registered name of, the question jsonl file.
            ref_path_or_name: Optional path/name of a jsonl file with reference
                answers, keyed by "question_id".
            need_ref_categories: Categories whose instances get references
                attached. Defaults to ["math", "coding", "reasoning"].
            load_only_first_n: If set, keep only the first n input messages
                of each item.
        """
        file_path = resolve_path_or_name(path_or_name)

        self._id_to_question_id: list[int | str] = []
        self._id_to_category: list[str] = []
        self._messages_dict: dict[int | str, list[dict[str, str]]] = {}
        # FIX: item.get("tools") returns either a list of tool dicts or None, so the
        # value type is an optional list of dicts, not a list of optional dicts.
        self._tools_dict: dict[int | str, list[dict[str, Any]] | None] = {}
        with open(file_path) as f:
            for line in f:
                item = json.loads(line)
                self._id_to_question_id.append(item["question_id"])
                self._id_to_category.append(item["category"])
                input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
                if item.get("system_message"):
                    # Prepend the optional system prompt before the user turns.
                    input_messages = [{"role": "system", "content": item["system_message"]}, *input_messages]
                if load_only_first_n is not None:
                    # NOTE(review): a prepended system message counts toward the
                    # first-n truncation — confirm this is intended.
                    input_messages = input_messages[:load_only_first_n]
                self._messages_dict[item["question_id"]] = input_messages
                self._tools_dict[item["question_id"]] = item.get("tools")

        self._references_dict: dict[int | str, list[str]] = {}
        if ref_path_or_name is not None:
            ref_file_path = resolve_path_or_name(ref_path_or_name)
            with open(ref_file_path) as f:
                for line in f:
                    item = json.loads(line)
                    # Reference answers are taken from the first choice's turns.
                    self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

        # Only instances in these categories get references attached in __getitem__.
        self.need_ref_categories = need_ref_categories or [
            "math",
            "coding",
            "reasoning",
        ]

    def require_incremental_response(self) -> bool:
        """Each item holds multiple user turns, so responses are generated per turn."""
        return True

    def __len__(self) -> int:
        """Return the number of loaded questions."""
        return len(self._id_to_question_id)

    def __getitem__(self, i: int) -> ChatInstance:
        """Return the i-th question as a ChatInstance.

        References are attached only when the question's category is in
        `need_ref_categories` and a reference entry exists for its question_id.
        """
        question_id = self._id_to_question_id[i]
        category = self._id_to_category[i]
        references: list[str] = []
        if category in self.need_ref_categories:
            # Missing reference entries fall back to an empty list.
            references = self._references_dict.get(question_id, [])
        return ChatInstance(
            self._messages_dict[question_id],
            tools=self._tools_dict[question_id],
            references=references,
            extra_info={"category": category},
        )

need_ref_categories instance-attribute

# Categories whose questions get reference answers attached (falls back to defaults).
need_ref_categories = need_ref_categories or [
    "math",
    "coding",
    "reasoning",
]

__init__

__init__(
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None
Source code in flexeval/core/chat_dataset/chatbot_bench.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def __init__(
    self,
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None:
    """Load questions (and optionally reference answers) from jsonl files.

    Args:
        path_or_name: Path to, or registered name of, the question jsonl file.
        ref_path_or_name: Optional path/name of a jsonl file with reference
            answers, keyed by "question_id".
        need_ref_categories: Categories whose instances get references attached.
            Defaults to ["math", "coding", "reasoning"].
        load_only_first_n: If set, keep only the first n input messages of each item.
    """
    file_path = resolve_path_or_name(path_or_name)

    self._id_to_question_id: list[int | str] = []
    self._id_to_category: list[str] = []
    self._messages_dict: dict[int | str, list[dict[str, str]]] = {}
    # item.get("tools") yields a list of tool dicts or None, hence the optional list type.
    self._tools_dict: dict[int | str, list[dict[str, Any]] | None] = {}
    with open(file_path) as f:
        for line in f:
            item = json.loads(line)
            self._id_to_question_id.append(item["question_id"])
            self._id_to_category.append(item["category"])
            input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
            if item.get("system_message"):
                # Prepend the optional system prompt before the user turns.
                input_messages = [{"role": "system", "content": item["system_message"]}, *input_messages]
            if load_only_first_n is not None:
                # NOTE(review): a prepended system message counts toward the
                # first-n truncation — confirm this is intended.
                input_messages = input_messages[:load_only_first_n]
            self._messages_dict[item["question_id"]] = input_messages
            self._tools_dict[item["question_id"]] = item.get("tools")

    self._references_dict: dict[int | str, list[str]] = {}
    if ref_path_or_name is not None:
        ref_file_path = resolve_path_or_name(ref_path_or_name)
        with open(ref_file_path) as f:
            for line in f:
                item = json.loads(line)
                # Reference answers are taken from the first choice's turns.
                self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

    # Only instances in these categories get references attached in __getitem__.
    self.need_ref_categories = need_ref_categories or [
        "math",
        "coding",
        "reasoning",
    ]

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/chatbot_bench.py
101
102
def require_incremental_response(self) -> bool:
    """Each item holds multiple user turns, so responses are generated per turn."""
    return True

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/chatbot_bench.py
104
105
def __len__(self) -> int:
    """Return the number of loaded questions."""
    return len(self._id_to_question_id)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/chatbot_bench.py
107
108
109
110
111
112
113
114
115
116
117
118
def __getitem__(self, i: int) -> ChatInstance:
    """Return the i-th question; references are attached only for categories in need_ref_categories."""
    question_id = self._id_to_question_id[i]
    category = self._id_to_category[i]
    references: list[str] = []
    if category in self.need_ref_categories:
        # Missing reference entries fall back to an empty list.
        references = self._references_dict.get(question_id, [])
    return ChatInstance(
        self._messages_dict[question_id],
        tools=self._tools_dict[question_id],
        references=references,
        extra_info={"category": category},
    )

OpenAIMessagesDataset

This class loads data in an OpenAI-like format from a jsonl file. The difference is that this class supports a 'tool_definitions' field, in which the available tools are listed.

Parameters:

  • file_path (str | list[str] | None, default: None ) –

    Path or list of paths to .jsonl file(s).

  • message_key (str, default: 'messages' ) –

    Key used to extract the list of messages from each JSON object.

  • tool_definitions_key (str | None, default: None ) –

    Key used to extract the list of tool definitions from each JSON object. Set to None (default) for data without tool_calls.

  • drop_if_last_from_assistant (bool, default: False ) –

    If true, when the last utterance is given by assistant, drop it.

In the jsonl file, each line must have the following structure:

{
  '<message_key>': [
    {
      'role': 'user',
      'content': 'こんにちわ。元気になる言葉を教えて下さい。'
    },
    {
      'role': 'assistant',
      'content': 'こんなのはどうでしょう。どんどんやってください!'
    }
  ],
  '<tool_definitions_key>': [
    {
      'type': 'function',
      'function': { ... }
    }
  ]
}

Source code in flexeval/core/chat_dataset/openai_messages.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class OpenAIMessagesDataset(ChatDataset):
    """This class loads data in an OpenAI-like format from jsonl file(s).
    The difference is that this class supports a tool-definitions field, in which
    the available tools are listed.

    Parameters:
        file_path (str | list[str] | None): Path or list of paths to `.jsonl` file(s).
        message_key (str): Key used to extract the list of messages from each JSON object.
        tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object.
            Set to `None` (default) for data without tool_calls.
        drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it.

    In the jsonl file, each line must have the following structure:

    {
      '<message_key>': [
        {
          'role': 'user',
          'content': 'こんにちわ。元気になる言葉を教えて下さい。'
        },
        {
          'role': 'assistant',
          'content': 'こんなのはどうでしょう。どんどんやってください!'
        }
      ],
      '<tool_definitions_key>': [
        {
          'type': 'function',
          'function': { ... }
        }
      ]
    }
    """

    def __init__(
        self,
        file_path: str | list[str] | None = None,
        message_key: str = "messages",
        tool_definitions_key: str | None = None,
        drop_if_last_from_assistant: bool = False,
    ) -> None:
        """Parse every line of the given jsonl file(s) into a ChatInstance.

        Raises:
            ValueError: If `file_path` is None.
        """
        if file_path is None:
            # FIX: previously open(None) raised an opaque TypeError.
            msg = "file_path must be a path or a list of paths to jsonl file(s)."
            raise ValueError(msg)
        # FIX: the docstring promises str | list[str], but only a single str was handled.
        file_paths = [file_path] if isinstance(file_path, str) else list(file_path)

        self.conversations: list[ChatInstance] = []
        for path in file_paths:
            with open(path) as f:
                dataset = [json.loads(line) for line in f]
            for sample in dataset:
                tool_dicts = None
                if tool_definitions_key is not None:
                    tool_dicts = sample.get(tool_definitions_key, None)

                messages: list[dict[str, Any]] = sample.pop(message_key)
                # FIX: guard against an empty message list before inspecting the last turn.
                if drop_if_last_from_assistant and messages and messages[-1]["role"] == "assistant":
                    messages = messages[:-1]
                # Remaining keys of the sample travel along as extra_info.
                self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample))

    def __len__(self) -> int:
        """Return the number of loaded conversations."""
        return len(self.conversations)

    def __getitem__(self, idx: int) -> ChatInstance:
        """Return the idx-th parsed conversation."""
        return self.conversations[idx]

conversations instance-attribute

conversations: list[ChatInstance] = []

__init__

__init__(
    file_path: str | None = None,
    message_key: str = "messages",
    tool_definitions_key: str | None = None,
    drop_if_last_from_assistant: bool = False,
) -> None
Source code in flexeval/core/chat_dataset/openai_messages.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __init__(
    self,
    file_path: str | None = None,
    message_key: str = "messages",
    tool_definitions_key: str | None = None,
    drop_if_last_from_assistant: bool = False,
) -> None:
    """Parse every line of the given jsonl file into a ChatInstance.

    NOTE(review): the class documentation advertises `str | list[str]` for
    file_path, but this implementation only handles a single path string,
    and the default None fails in open() — confirm and reconcile.
    """
    self.conversations: list[ChatInstance] = []
    with open(file_path) as f:
        dataset = [json.loads(line) for line in f]
    for sample in dataset:
        tool_dicts = None
        if tool_definitions_key is not None:
            tool_dicts = sample.get(tool_definitions_key, None)

        # The messages are removed from the sample; the remaining keys become extra_info.
        messages: list[dict[str, Any]] = sample.pop(message_key)
        if drop_if_last_from_assistant and messages[-1]["role"] == "assistant":
            messages = messages[:-1]
        self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample))

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/openai_messages.py
68
69
def __len__(self) -> int:
    """Return the number of loaded conversations."""
    return len(self.conversations)

__getitem__

__getitem__(idx: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/openai_messages.py
71
72
def __getitem__(self, idx: int) -> ChatInstance:
    """Return the idx-th parsed conversation."""
    return self.conversations[idx]

SacreBleuChatDataset

Load datasets from the sacrebleu library. The available datasets are defined in sacrebleu.DATASETS.

Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class SacreBleuChatDataset(ChatDataset):
    """Chat dataset backed by the [sacrebleu](https://github.com/mjpost/sacrebleu) library.

    Any dataset listed in `sacrebleu.DATASETS` can be loaded by name and language pair.
    """

    def __init__(self, name: str, langpair: str) -> None:
        """Fetch source sentences and their references for the given dataset/langpair."""
        dataset = sacrebleu.DATASETS[name]
        self._source_list: list[str] = list(dataset.source(langpair))
        self._references_list: list[list[str]] = [
            [reference.strip() for reference in reference_group]
            for reference_group in dataset.references(langpair)
        ]

        # Every source sentence must come with exactly one reference group.
        if len(self._references_list) != len(self._source_list):
            msg = "The number of source and reference pairs should be the same."
            raise ValueError(msg)

    def require_incremental_response(self) -> bool:
        """Single-turn data: the model just continues from the one user message."""
        return False

    def __len__(self) -> int:
        """Return the number of source sentences."""
        return len(self._source_list)

    def __getitem__(self, i: int) -> ChatInstance:
        """Wrap the i-th source sentence as a single user message with its references."""
        source_text = self._source_list[i]
        return ChatInstance(
            messages=[{"role": "user", "content": source_text}],
            references=self._references_list[i],
            extra_info={},
        )

__init__

__init__(name: str, langpair: str) -> None
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
11
12
13
14
15
16
17
18
19
def __init__(self, name: str, langpair: str) -> None:
    """Fetch source sentences and references for `langpair` from sacrebleu dataset `name`."""
    self._source_list: list[str] = list(sacrebleu.DATASETS[name].source(langpair))
    self._references_list: list[list[str]] = [
        [r.strip() for r in refs] for refs in sacrebleu.DATASETS[name].references(langpair)
    ]

    # Every source sentence must come with exactly one reference group.
    if len(self._source_list) != len(self._references_list):
        msg = "The number of source and reference pairs should be the same."
        raise ValueError(msg)

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
21
22
def require_incremental_response(self) -> bool:
    """Single-turn data: the model just continues from the one user message."""
    return False

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
24
25
def __len__(self) -> int:
    """Return the number of source sentences."""
    return len(self._source_list)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
27
28
29
30
31
32
def __getitem__(self, i: int) -> ChatInstance:
    """Wrap the i-th source sentence as a single user message with its references."""
    return ChatInstance(
        messages=[{"role": "user", "content": self._source_list[i]}],
        references=self._references_list[i],
        extra_info={},
    )