Skip to content

ChatDataset

ChatDataset

A dataset holding ChatInstance.

Source code in flexeval/core/chat_dataset/base.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class ChatDataset(Sequence[ChatInstance], ABC):
    """Abstract base class for datasets made up of `ChatInstance` objects."""

    @abstractmethod
    def __len__(self) -> int:
        """Return how many chat instances the dataset contains."""
        raise NotImplementedError

    @abstractmethod
    def __getitem__(self, i: int) -> ChatInstance:
        """Return the chat instance stored at index `i`."""
        raise NotImplementedError

    def require_incremental_response(self) -> bool:
        """Tell whether responses must be generated turn by turn.

        When this returns ``True``, the inputs consist of multiple user
        utterances and the model should generate a response for each
        utterance incrementally. When it returns ``False`` (the default),
        the model just has to continue the conversation from the last
        user utterance.
        """
        return False

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(num_instances={len(self)})"

__len__ abstractmethod

__len__() -> int

Returns the number of chat instances in the dataset.

Source code in flexeval/core/chat_dataset/base.py
71
72
73
74
75
76
@abstractmethod
def __len__(self) -> int:
    """Return the number of chat instances contained in the dataset."""
    raise NotImplementedError

__getitem__ abstractmethod

__getitem__(i: int) -> ChatInstance

Returns the i-th chat instance.

Source code in flexeval/core/chat_dataset/base.py
78
79
80
81
82
83
@abstractmethod
def __getitem__(self, i: int) -> ChatInstance:
    """Return the chat instance stored at index `i`."""
    raise NotImplementedError

require_incremental_response

require_incremental_response() -> bool

If true, the inputs consist of multiple user utterances and the model should generate responses for each utterance incrementally.

Otherwise, the model just has to continue the conversation from the last user utterance.

Source code in flexeval/core/chat_dataset/base.py
85
86
87
88
89
90
91
def require_incremental_response(self) -> bool:
    """Tell whether the model should answer each user utterance in turn.

    When this returns ``True``, the inputs contain multiple user utterances
    and the model should generate a response for each one incrementally.
    When it returns ``False`` (the default), the model only has to continue
    the conversation from the last user utterance.
    """
    return False

__repr__

__repr__() -> str
Source code in flexeval/core/chat_dataset/base.py
93
94
def __repr__(self) -> None:
    # Renders e.g. "SomeChatDataset(num_instances=123)"; len(self) delegates
    # to the concrete subclass implementation of __len__.
    return f"{self.__class__.__name__}(num_instances={len(self)})"

ChatInstance dataclass

A dataclass representing a single chat that will be fed to a chat language model.

Source code in flexeval/core/chat_dataset/base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
@dataclass
class ChatInstance:
    """
    A dataclass representing a single chat that will be fed to a chat language model.
    """

    messages: list[dict[str, Any]]
    """
    A list of messages in the chat.
    The format of messages typically follows [OpenAI's Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
    ```json
    [
        {
            "role": "assistant",
            "content": "Hello! How can I help you today?"
        },
        {
            "role": "user",
            "content": "I'd like to book a flight to Paris."
        }
    ]
    ```
    """
    tools: list[dict[str, Any]] | None = None
    """
    A list of definitions of tools in the chat.
    The format of tools typically follows [OpenAI's Chat Completion API](https://platform.openai.com/docs/guides/function-calling#function-calling-steps).
    Currently, only function calling (tools with type="function") is supported.
    """
    references: list[str] = field(default_factory=list)
    """
    A list of reference responses to the user's last message.
    The model's response will be evaluated against these references.
    """
    extra_info: dict[str, Any] = field(default_factory=dict)
    """
    Extra information that can be used by passing to `Metric`.
    """

    def __post_init__(self) -> None:
        # A "messages" key inside extra_info would conflict with the
        # `messages` attribute (see the warning text), so warn and drop it.
        if "messages" in self.extra_info:
            msg = (
                "'extra_info' in ChatInstance cannot contain a key named 'messages', "
                "as it will conflict with the 'messages' attribute. "
                "The key 'messages' will be removed."
            )
            warnings.warn(msg, stacklevel=2)
            self.extra_info.pop("messages")

    @property
    def inputs(self) -> list[dict[str, Any]]:
        """
        Alias for `messages`.
        This is used in `FewShotGenerator` so that it can access the inputs with the same attribute name as
        `GenerationInstance` and `MultipleChoiceInstance`.
        """
        # Fix: the return annotation previously said list[dict[str, str]],
        # which contradicted the declared type of `messages`.
        return self.messages

messages instance-attribute

messages: list[dict[str, Any]]

A list of messages in the chat. The format of messages typically follows OpenAI's Chat Completions API.

[
    {
        "role": "assistant",
        "content": "Hello! How can I help you today?"
    },
    {
        "role": "user",
        "content": "I'd like to book a flight to Paris."
    }
]

tools class-attribute instance-attribute

tools: list[dict[str, Any]] | None = None

A list of definitions of tools in the chat. The format of tools typically follows OpenAI's Chat Completion API. Currently, only function calling (tools with type="function") is supported.

references class-attribute instance-attribute

references: list[str] = field(default_factory=list)

A list of reference responses to the user's last message. The model's response will be evaluated against these references.

extra_info class-attribute instance-attribute

extra_info: dict[str, Any] = field(default_factory=dict)

Extra information that can be used by passing to Metric.

inputs property

inputs: list[dict[str, str]]

Alias for messages. This is used in FewShotGenerator so that it can access the inputs with the same attribute name as GenerationInstance and MultipleChoiceInstance.

__init__

__init__(
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None = None,
    references: list[str] = list(),
    extra_info: dict[str, Any] = dict(),
) -> None

__post_init__

__post_init__() -> None
Source code in flexeval/core/chat_dataset/base.py
48
49
50
51
52
53
54
55
56
def __post_init__(self) -> None:
    """Drop a conflicting 'messages' key from `extra_info`, emitting a warning."""
    if "messages" not in self.extra_info:
        return
    msg = (
        "'extra_info' in ChatInstance cannot contain a key named 'messages', "
        "as it will conflict with the 'messages' attribute. "
        "The key 'messages' will be removed."
    )
    warnings.warn(msg, stacklevel=2)
    del self.extra_info["messages"]

HFChatDataset

Load ChatInstances from a Hugging Face dataset.

Parameters:

  • path (str) –

    The path to the Hugging Face dataset.

  • split (str) –

    The split of the dataset.

  • subset (str | None, default: None ) –

    The subset of the dataset.

  • dataset_kwargs (dict[str, Any] | None, default: None ) –

    The keyword arguments to pass to the Hugging Face dataset.

Source code in flexeval/core/chat_dataset/template_based.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
class HFChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a Hugging Face dataset.

    Args:
        path: The path to the Hugging Face dataset.
        split: The split of the dataset.
        input_template: A Jinja2 template for the user input.
        subset: The subset of the dataset.
        dataset_kwargs: The keyword arguments to pass to the Hugging Face dataset.
        reference_template: Specify the Jinja2 template to render the reference string
            if the dataset has a single reference.
        reference_list_template: Specify the Jinja2 template to render a list of reference strings
            if the dataset has multiple references.
        require_incremental_response: Whether the dataset requires incremental response.
        extra_info_templates: A dictionary of Jinja2 templates for extra information.
        system_message_template: A Jinja2 template for the system message.
        data_range: The range of data to use.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.
    """

    def __init__(
        self,
        path: str,
        split: str,
        input_template: str,
        subset: str | None = None,
        dataset_kwargs: dict[str, Any] | None = None,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        dataset_kwargs = dataset_kwargs or {}
        # Materialize each dataset row as a plain dict so TemplateChatDataset
        # can slice, filter, and render it with Jinja2 templates.
        dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
        items = [dict(item) for item in dataset]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def __init__(
    self,
    path: str,
    split: str,
    input_template: str,
    subset: str | None = None,
    dataset_kwargs: dict[str, Any] | None = None,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    """Load a Hugging Face dataset and hand its rows to `TemplateChatDataset`."""
    if dataset_kwargs is None:
        dataset_kwargs = {}
    hf_dataset = datasets.load_dataset(path, name=subset, split=split, **dataset_kwargs)
    # Convert each row into a plain dict for template-based processing.
    rows = [dict(row) for row in hf_dataset]

    super().__init__(
        items=rows,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

JsonlChatDataset

Load ChatInstances from a JSONL file.

Parameters:

  • path (str) –

    The path to the JSONL file.

Source code in flexeval/core/chat_dataset/template_based.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class JsonlChatDataset(TemplateChatDataset):
    """
    Load ChatInstances from a JSONL file.

    Args:
        path: The path to the JSONL file. Each line must be a single JSON object
            representing one item.

    The remaining constructor arguments are forwarded unchanged to
    `TemplateChatDataset`.
    """

    def __init__(
        self,
        path: str,
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # Read as UTF-8 explicitly: the platform default encoding may differ,
        # which would break or corrupt non-ASCII dataset content.
        with open(path, encoding="utf-8") as f:
            items = [json.loads(line) for line in f]

        super().__init__(
            items=items,
            input_template=input_template,
            reference_template=reference_template,
            reference_list_template=reference_list_template,
            require_incremental_response=require_incremental_response,
            extra_info_templates=extra_info_templates,
            system_message_template=system_message_template,
            data_range=data_range,
            keep_conditions=keep_conditions,
            remove_conditions=remove_conditions,
        )

__init__

__init__(
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def __init__(
    self,
    path: str,
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    """Load one JSON object per line from `path` and delegate to `TemplateChatDataset`."""
    # Read as UTF-8 explicitly: the platform default encoding may differ,
    # which would break or corrupt non-ASCII dataset content.
    with open(path, encoding="utf-8") as f:
        items = [json.loads(line) for line in f]

    super().__init__(
        items=items,
        input_template=input_template,
        reference_template=reference_template,
        reference_list_template=reference_list_template,
        require_incremental_response=require_incremental_response,
        extra_info_templates=extra_info_templates,
        system_message_template=system_message_template,
        data_range=data_range,
        keep_conditions=keep_conditions,
        remove_conditions=remove_conditions,
    )

TemplateChatDataset

This class only supports single-turn chat.

Parameters:

  • items (list[dict[str, Any]]) –

    A list of items in a dict format. The "tools" key for each item can contain the list of function definitions. They should be in JSON Schema format as in the OpenAI Chat Completion API. https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions

  • input_template (str) –

    A Jinja2 template for the user input.

  • reference_template (str | None, default: None ) –

    Specify the Jinja2 template to render the reference string if the dataset has a single reference.

  • reference_list_template (str | None, default: None ) –

    Specify the Jinja2 template to render a list of reference strings if the dataset has multiple references.

  • require_incremental_response (bool, default: False ) –

    Whether the dataset requires incremental response.

  • extra_info_templates (dict[str, str] | None, default: None ) –

    A dictionary of Jinja2 templates for extra information.

  • system_message_template (str | None, default: None ) –

    A Jinja2 template for the system message.

  • data_range (tuple[int, int] | None, default: None ) –

    The range of data to use.

  • keep_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to filter certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.

  • remove_conditions (dict[str, str] | None, default: None ) –

    A dictionary to indicate the condition to remove certain items. The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.

Source code in flexeval/core/chat_dataset/template_based.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class TemplateChatDataset(ChatDataset):
    """
    This class only supports single-turn chat.

    Args:
        items: A list of items in a dict format.
            The "tools" key for each item can contain the list of function definitions.
            They should be in JSON Schema format as in the OpenAI Chat Completion API.
            https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions
        input_template: A Jinja2 template for the user input.
        reference_template: Specify the Jinja2 template to render the reference string
            if the dataset has a single reference.
        reference_list_template: Specify the Jinja2 template to render a list of reference strings
            if the dataset has multiple references.
        require_incremental_response: Whether the dataset requires incremental response.
        extra_info_templates: A dictionary of Jinja2 templates for extra information.
        system_message_template: A Jinja2 template for the system message.
        data_range: The range of data to use.
        keep_conditions: A dictionary to indicate the condition to filter certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to keep.
        remove_conditions: A dictionary to indicate the condition to remove certain items.
            The key is a Jinja2 template string to embed the item into a string, and the value is the value to remove.
    """

    def __init__(
        self,
        items: list[dict[str, Any]],
        input_template: str,
        reference_template: str | None = None,
        reference_list_template: str | None = None,
        require_incremental_response: bool = False,
        extra_info_templates: dict[str, str] | None = None,
        system_message_template: str | None = None,
        data_range: tuple[int, int] | None = None,
        keep_conditions: dict[str, str] | None = None,
        remove_conditions: dict[str, str] | None = None,
    ) -> None:
        # The two reference mechanisms are mutually exclusive: one renders a
        # single string, the other a Python-literal list of strings.
        if reference_template and reference_list_template:
            msg = "Only one of reference_template and reference_list_template can be set."
            raise ValueError(msg)

        # Optionally restrict to a sub-range of the items before filtering.
        if data_range:
            start, end = data_range
            items = items[start:end]

        # Keep only items whose rendered template value equals the given value.
        keep_conditions = keep_conditions or {}
        for template_str, value_to_keep in keep_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) == value_to_keep]
        # Drop items whose rendered template value equals the given value.
        remove_conditions = remove_conditions or {}
        for template_str, value_to_remove in remove_conditions.items():
            key_template = JINJA2_ENV.from_string(template_str)
            items = [item for item in items if key_template.render(**item) != value_to_remove]

        self.items = items

        # Compile all Jinja2 templates once here so __getitem__ only renders.
        self.input_template = JINJA2_ENV.from_string(input_template)
        self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
        self.reference_list_template = (
            JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
        )

        extra_info_templates = extra_info_templates or {}
        self._extra_info_templates: dict[str, Template] = {
            key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
        }

        self._system_message_template: Template | None = (
            JINJA2_ENV.from_string(system_message_template) if system_message_template else None
        )

        self._require_incremental_response = require_incremental_response

    def require_incremental_response(self) -> bool:
        # Configured at construction time rather than derived from the data.
        return self._require_incremental_response

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, i: int) -> ChatInstance:
        """Render the i-th item's templates into a single-turn `ChatInstance`."""
        item = self.items[i]
        input_utterance = self.input_template.render(**item)
        messages = [{"role": "user", "content": input_utterance}]

        # The system message, if configured, must precede the user message.
        if self._system_message_template:
            system_message = self._system_message_template.render(**item)
            messages.insert(0, {"role": "system", "content": system_message})

        reference_list: list[str] = []
        if self.reference_template:
            reference_string = self.reference_template.render(**item)
            reference_list.append(reference_string)
        if self.reference_list_template:
            reference_list_string = self.reference_list_template.render(**item)
            # The rendered text is parsed with literal_eval, so it must at least
            # look like a Python list; fail fast with a clear error otherwise.
            if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
                msg = (
                    f"The reference_list_template should render a list of strings "
                    f"but we got `{reference_list_string}`."
                )
                raise ValueError(msg)
            reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

        # Expose the raw item fields (plus rendered extra-info templates,
        # which take precedence on key collisions) via extra_info.
        extra_info = dict(item.items())
        extra_info_from_templates = {
            key: template.render(**item) for key, template in self._extra_info_templates.items()
        }
        extra_info.update(extra_info_from_templates)

        return ChatInstance(
            messages=messages, tools=item.get("tools"), references=reference_list, extra_info=extra_info
        )

items instance-attribute

items = items

input_template instance-attribute

input_template = from_string(input_template)

reference_template instance-attribute

reference_template = (
    from_string(reference_template)
    if reference_template
    else None
)

reference_list_template instance-attribute

reference_list_template = (
    from_string(reference_list_template)
    if reference_list_template
    else None
)

__init__

__init__(
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None
Source code in flexeval/core/chat_dataset/template_based.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def __init__(
    self,
    items: list[dict[str, Any]],
    input_template: str,
    reference_template: str | None = None,
    reference_list_template: str | None = None,
    require_incremental_response: bool = False,
    extra_info_templates: dict[str, str] | None = None,
    system_message_template: str | None = None,
    data_range: tuple[int, int] | None = None,
    keep_conditions: dict[str, str] | None = None,
    remove_conditions: dict[str, str] | None = None,
) -> None:
    # The two reference mechanisms are mutually exclusive: one renders a
    # single string, the other a Python-literal list of strings.
    if reference_template and reference_list_template:
        msg = "Only one of reference_template and reference_list_template can be set."
        raise ValueError(msg)

    # Optionally restrict to a sub-range of the items before filtering.
    if data_range:
        start, end = data_range
        items = items[start:end]

    # Keep only items whose rendered template value equals the given value.
    keep_conditions = keep_conditions or {}
    for template_str, value_to_keep in keep_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) == value_to_keep]
    # Drop items whose rendered template value equals the given value.
    remove_conditions = remove_conditions or {}
    for template_str, value_to_remove in remove_conditions.items():
        key_template = JINJA2_ENV.from_string(template_str)
        items = [item for item in items if key_template.render(**item) != value_to_remove]

    self.items = items

    # Compile all Jinja2 templates once here so __getitem__ only renders.
    self.input_template = JINJA2_ENV.from_string(input_template)
    self.reference_template = JINJA2_ENV.from_string(reference_template) if reference_template else None
    self.reference_list_template = (
        JINJA2_ENV.from_string(reference_list_template) if reference_list_template else None
    )

    extra_info_templates = extra_info_templates or {}
    self._extra_info_templates: dict[str, Template] = {
        key: JINJA2_ENV.from_string(template) for key, template in extra_info_templates.items()
    }

    self._system_message_template: Template | None = (
        JINJA2_ENV.from_string(system_message_template) if system_message_template else None
    )

    self._require_incremental_response = require_incremental_response

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/template_based.py
89
90
def require_incremental_response(self) -> bool:
    """Report the `require_incremental_response` flag given at construction."""
    return self._require_incremental_response

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/template_based.py
92
93
def __len__(self) -> int:
    """Return the number of loaded (and filtered) items."""
    return len(self.items)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/template_based.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def __getitem__(self, i: int) -> ChatInstance:
    """Render the i-th item's templates into a single-turn `ChatInstance`."""
    item = self.items[i]
    input_utterance = self.input_template.render(**item)
    messages = [{"role": "user", "content": input_utterance}]

    # The system message, if configured, must precede the user message.
    if self._system_message_template:
        system_message = self._system_message_template.render(**item)
        messages.insert(0, {"role": "system", "content": system_message})

    reference_list: list[str] = []
    if self.reference_template:
        reference_string = self.reference_template.render(**item)
        reference_list.append(reference_string)
    if self.reference_list_template:
        reference_list_string = self.reference_list_template.render(**item)
        # The rendered text is parsed with literal_eval, so it must at least
        # look like a Python list; fail fast with a clear error otherwise.
        if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
            msg = (
                f"The reference_list_template should render a list of strings "
                f"but we got `{reference_list_string}`."
            )
            raise ValueError(msg)
        reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])

    # Expose the raw item fields (plus rendered extra-info templates,
    # which take precedence on key collisions) via extra_info.
    extra_info = dict(item.items())
    extra_info_from_templates = {
        key: template.render(**item) for key, template in self._extra_info_templates.items()
    }
    extra_info.update(extra_info_from_templates)

    return ChatInstance(
        messages=messages, tools=item.get("tools"), references=reference_list, extra_info=extra_info
    )

ChatbotBench

This class loads data with the jsonl format used in chat evaluation benchmarks such as MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

Example of a line from a jsonl file

{ "question_id": 00, "category": "writing", "turns": [ "Compose an engaging travel blog post about a recent trip to Hawaii.", "Rewrite your previous response. Start every sentence with the letter A." ] # 'tools' key is optional. # It should be in the same format as FunctionCalling in the OpenAI ChatCompletion API. # https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get current temperature for a given location.", "parameters": { "type": "object", "properties": { "location": {"type": "string", "description": "City and country e.g. Bogotá, Colombia"}, }, "required": ["location"], "additionalProperties": False}, "strict": True }, }, ], # 'system_message' key is optional. # If set, it will be inserted in the first turn as a system prompt "system_message": "You are a helpful assistant." }

Source code in flexeval/core/chat_dataset/chatbot_bench.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class ChatbotBench(ChatDataset):
    """This class loads data with the jsonl format used in chat evaluation benchmarks such as
    MT-Bench (Multi-turn Benchmark) or Vicuna QA Benchmark.

    Example of a line from a jsonl file:
        {
          "question_id": 00,
          "category": "writing",
          "turns": [
            "Compose an engaging travel blog post about a recent trip to Hawaii.",
            "Rewrite your previous response. Start every sentence with the letter A."
          ]
          # 'tools' key is optional.
          # It should be in the same format as FunctionCalling in the OpenAI ChatCompletion API.
          # https://platform.openai.com/docs/guides/function-calling?api-mode=chat#defining-functions
          "tools": [
            {
              "type": "function",
              "function": {
                "name": "get_weather",
                "description": "Get current temperature for a given location.",
                "parameters": {
                  "type": "object",
                  "properties": {
                    "location": {"type": "string", "description": "City and country e.g. Bogotá, Colombia"},
                  },
                  "required": ["location"],
                  "additionalProperties": False},
                "strict": True
              },
            },
          ],
          # 'system_message' key is optional.
          # If set, it will be inserted in the first turn as a system prompt
          "system_message": "You are a helpful assistant."
        }
    """

    def __init__(
        self,
        path_or_name: str,
        ref_path_or_name: str | None = None,
        need_ref_categories: list[str] | None = None,
        load_only_first_n: int | None = None,
    ) -> None:
        """Load questions (and optionally reference answers) from jsonl files.

        Args:
            path_or_name: Path to, or registered name of, the question jsonl file.
            ref_path_or_name: Optional path/name of a jsonl file with reference
                answers, keyed by "question_id".
            need_ref_categories: Categories whose instances get references
                attached. Defaults to ["math", "coding", "reasoning"].
            load_only_first_n: If set, keep only the first n input messages
                of each item.
        """
        file_path = resolve_path_or_name(path_or_name)

        self._id_to_question_id: list[int | str] = []
        self._id_to_category: list[str] = []
        self._messages_dict: dict[int | str, list[dict[str, str]]] = {}
        # FIX: item.get("tools") returns either a list of tool dicts or None, so the
        # value type is an optional list of dicts, not a list of optional dicts.
        self._tools_dict: dict[int | str, list[dict[str, Any]] | None] = {}
        with open(file_path) as f:
            for line in f:
                item = json.loads(line)
                self._id_to_question_id.append(item["question_id"])
                self._id_to_category.append(item["category"])
                input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
                if item.get("system_message"):
                    # Prepend the optional system prompt before the user turns.
                    input_messages = [{"role": "system", "content": item["system_message"]}, *input_messages]
                if load_only_first_n is not None:
                    # NOTE(review): a prepended system message counts toward the
                    # first-n truncation — confirm this is intended.
                    input_messages = input_messages[:load_only_first_n]
                self._messages_dict[item["question_id"]] = input_messages
                self._tools_dict[item["question_id"]] = item.get("tools")

        self._references_dict: dict[int | str, list[str]] = {}
        if ref_path_or_name is not None:
            ref_file_path = resolve_path_or_name(ref_path_or_name)
            with open(ref_file_path) as f:
                for line in f:
                    item = json.loads(line)
                    # Reference answers are taken from the first choice's turns.
                    self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

        # Only instances in these categories get references attached in __getitem__.
        self.need_ref_categories = need_ref_categories or [
            "math",
            "coding",
            "reasoning",
        ]

    def require_incremental_response(self) -> bool:
        """Each item holds multiple user turns, so responses are generated per turn."""
        return True

    def __len__(self) -> int:
        """Return the number of loaded questions."""
        return len(self._id_to_question_id)

    def __getitem__(self, i: int) -> ChatInstance:
        """Return the i-th question as a ChatInstance.

        References are attached only when the question's category is in
        `need_ref_categories` and a reference entry exists for its question_id.
        """
        question_id = self._id_to_question_id[i]
        category = self._id_to_category[i]
        references: list[str] = []
        if category in self.need_ref_categories:
            # Missing reference entries fall back to an empty list.
            references = self._references_dict.get(question_id, [])
        return ChatInstance(
            self._messages_dict[question_id],
            tools=self._tools_dict[question_id],
            references=references,
            extra_info={"category": category},
        )

need_ref_categories instance-attribute

# Categories whose questions get reference answers attached (falls back to defaults).
need_ref_categories = need_ref_categories or [
    "math",
    "coding",
    "reasoning",
]

__init__

__init__(
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None
Source code in flexeval/core/chat_dataset/chatbot_bench.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def __init__(
    self,
    path_or_name: str,
    ref_path_or_name: str | None = None,
    need_ref_categories: list[str] | None = None,
    load_only_first_n: int | None = None,
) -> None:
    """Load questions (and optionally reference answers) from jsonl files.

    Args:
        path_or_name: Path to, or registered name of, the question jsonl file.
        ref_path_or_name: Optional path/name of a jsonl file with reference
            answers, keyed by "question_id".
        need_ref_categories: Categories whose instances get references attached.
            Defaults to ["math", "coding", "reasoning"].
        load_only_first_n: If set, keep only the first n input messages of each item.
    """
    file_path = resolve_path_or_name(path_or_name)

    self._id_to_question_id: list[int | str] = []
    self._id_to_category: list[str] = []
    self._messages_dict: dict[int | str, list[dict[str, str]]] = {}
    # item.get("tools") yields a list of tool dicts or None, hence the optional list type.
    self._tools_dict: dict[int | str, list[dict[str, Any]] | None] = {}
    with open(file_path) as f:
        for line in f:
            item = json.loads(line)
            self._id_to_question_id.append(item["question_id"])
            self._id_to_category.append(item["category"])
            input_messages = [{"role": "user", "content": turn} for turn in item["turns"]]
            if item.get("system_message"):
                # Prepend the optional system prompt before the user turns.
                input_messages = [{"role": "system", "content": item["system_message"]}, *input_messages]
            if load_only_first_n is not None:
                # NOTE(review): a prepended system message counts toward the
                # first-n truncation — confirm this is intended.
                input_messages = input_messages[:load_only_first_n]
            self._messages_dict[item["question_id"]] = input_messages
            self._tools_dict[item["question_id"]] = item.get("tools")

    self._references_dict: dict[int | str, list[str]] = {}
    if ref_path_or_name is not None:
        ref_file_path = resolve_path_or_name(ref_path_or_name)
        with open(ref_file_path) as f:
            for line in f:
                item = json.loads(line)
                # Reference answers are taken from the first choice's turns.
                self._references_dict[item["question_id"]] = item["choices"][0]["turns"]

    # Only instances in these categories get references attached in __getitem__.
    self.need_ref_categories = need_ref_categories or [
        "math",
        "coding",
        "reasoning",
    ]

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/chatbot_bench.py
101
102
def require_incremental_response(self) -> bool:
    """Each item holds multiple user turns, so responses are generated per turn."""
    return True

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/chatbot_bench.py
104
105
def __len__(self) -> int:
    """Return the number of loaded questions."""
    return len(self._id_to_question_id)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/chatbot_bench.py
107
108
109
110
111
112
113
114
115
116
117
118
def __getitem__(self, i: int) -> ChatInstance:
    """Return the i-th question; references are attached only for categories in need_ref_categories."""
    question_id = self._id_to_question_id[i]
    category = self._id_to_category[i]
    references: list[str] = []
    if category in self.need_ref_categories:
        # Missing reference entries fall back to an empty list.
        references = self._references_dict.get(question_id, [])
    return ChatInstance(
        self._messages_dict[question_id],
        tools=self._tools_dict[question_id],
        references=references,
        extra_info={"category": category},
    )

OpenAIMessagesDataset

This class loads data in an OpenAI-like format from a jsonl file. The difference is that this class supports a 'tool_definitions' field, in which the available tools are listed.

Parameters:

  • file_path (str | list[str] | None, default: None ) –

    Path or list of paths to .jsonl file(s).

  • message_key (str, default: 'messages' ) –

    Key used to extract the list of messages from each JSON object.

  • tool_definitions_key (str | None, default: None ) –

    Key used to extract the list of tool definitions from each JSON object. Set to None (default) for data without tool_calls.

  • drop_if_last_from_assistant (bool, default: False ) –

    If true, when the last utterance is given by assistant, drop it.

In the jsonl file, each line must have the following structure:

{
  '<message_key>': [
    {
      'role': 'user',
      'content': 'こんにちわ。元気になる言葉を教えて下さい。'
    },
    {
      'role': 'assistant',
      'content': 'こんなのはどうでしょう。どんどんやってください!'
    }
  ],
  '<tool_definitions_key>': [
    {
      'type': 'function',
      'function': { ... }
    }
  ]
}

Source code in flexeval/core/chat_dataset/openai_messages.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class OpenAIMessagesDataset(ChatDataset):
    """This class loads data in an OpenAI-like format from jsonl file(s).
    The difference is that this class supports a tool-definitions field, in which
    the available tools are listed.

    Parameters:
        file_path (str | list[str] | None): Path or list of paths to `.jsonl` file(s).
        message_key (str): Key used to extract the list of messages from each JSON object.
        tool_definitions_key (str | None): Key used to extract the list of tool definitions from each JSON object.
            Set to `None` (default) for data without tool_calls.
        drop_if_last_from_assistant (bool): If true, when the last utterance is given by assistant, drop it.

    In the jsonl file, each line must have the following structure:

    {
      '<message_key>': [
        {
          'role': 'user',
          'content': 'こんにちわ。元気になる言葉を教えて下さい。'
        },
        {
          'role': 'assistant',
          'content': 'こんなのはどうでしょう。どんどんやってください!'
        }
      ],
      '<tool_definitions_key>': [
        {
          'type': 'function',
          'function': { ... }
        }
      ]
    }
    """

    def __init__(
        self,
        file_path: str | list[str] | None = None,
        message_key: str = "messages",
        tool_definitions_key: str | None = None,
        drop_if_last_from_assistant: bool = False,
    ) -> None:
        """Parse every line of the given jsonl file(s) into a ChatInstance.

        Raises:
            ValueError: If `file_path` is None.
        """
        if file_path is None:
            # FIX: previously open(None) raised an opaque TypeError.
            msg = "file_path must be a path or a list of paths to jsonl file(s)."
            raise ValueError(msg)
        # FIX: the docstring promises str | list[str], but only a single str was handled.
        file_paths = [file_path] if isinstance(file_path, str) else list(file_path)

        self.conversations: list[ChatInstance] = []
        for path in file_paths:
            with open(path) as f:
                dataset = [json.loads(line) for line in f]
            for sample in dataset:
                tool_dicts = None
                if tool_definitions_key is not None:
                    tool_dicts = sample.get(tool_definitions_key, None)

                messages: list[dict[str, Any]] = sample.pop(message_key)
                # FIX: guard against an empty message list before inspecting the last turn.
                if drop_if_last_from_assistant and messages and messages[-1]["role"] == "assistant":
                    messages = messages[:-1]
                # Remaining keys of the sample travel along as extra_info.
                self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample))

    def __len__(self) -> int:
        """Return the number of loaded conversations."""
        return len(self.conversations)

    def __getitem__(self, idx: int) -> ChatInstance:
        """Return the idx-th parsed conversation."""
        return self.conversations[idx]

conversations instance-attribute

conversations: list[ChatInstance] = []

__init__

__init__(
    file_path: str | None = None,
    message_key: str = "messages",
    tool_definitions_key: str | None = None,
    drop_if_last_from_assistant: bool = False,
) -> None
Source code in flexeval/core/chat_dataset/openai_messages.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __init__(
    self,
    file_path: str | None = None,
    message_key: str = "messages",
    tool_definitions_key: str | None = None,
    drop_if_last_from_assistant: bool = False,
) -> None:
    """Parse every line of the given jsonl file into a ChatInstance.

    NOTE(review): the class documentation advertises `str | list[str]` for
    file_path, but this implementation only handles a single path string,
    and the default None fails in open() — confirm and reconcile.
    """
    self.conversations: list[ChatInstance] = []
    with open(file_path) as f:
        dataset = [json.loads(line) for line in f]
    for sample in dataset:
        tool_dicts = None
        if tool_definitions_key is not None:
            tool_dicts = sample.get(tool_definitions_key, None)

        # The messages are removed from the sample; the remaining keys become extra_info.
        messages: list[dict[str, Any]] = sample.pop(message_key)
        if drop_if_last_from_assistant and messages[-1]["role"] == "assistant":
            messages = messages[:-1]
        self.conversations.append(ChatInstance(messages=messages, tools=tool_dicts, extra_info=sample))

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/openai_messages.py
68
69
def __len__(self) -> int:
    """Return the number of loaded conversations."""
    return len(self.conversations)

__getitem__

__getitem__(idx: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/openai_messages.py
71
72
def __getitem__(self, idx: int) -> ChatInstance:
    """Return the idx-th parsed conversation."""
    return self.conversations[idx]

SacreBleuChatDataset

Load datasets from the sacrebleu library. The available datasets are defined in sacrebleu.DATASETS.

Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class SacreBleuChatDataset(ChatDataset):
    """Chat dataset backed by the [sacrebleu](https://github.com/mjpost/sacrebleu) library.

    Any dataset listed in `sacrebleu.DATASETS` can be loaded by name and language pair.
    """

    def __init__(self, name: str, langpair: str) -> None:
        """Fetch source sentences and their references for the given dataset/langpair."""
        dataset = sacrebleu.DATASETS[name]
        self._source_list: list[str] = list(dataset.source(langpair))
        self._references_list: list[list[str]] = [
            [reference.strip() for reference in reference_group]
            for reference_group in dataset.references(langpair)
        ]

        # Every source sentence must come with exactly one reference group.
        if len(self._references_list) != len(self._source_list):
            msg = "The number of source and reference pairs should be the same."
            raise ValueError(msg)

    def require_incremental_response(self) -> bool:
        """Single-turn data: the model just continues from the one user message."""
        return False

    def __len__(self) -> int:
        """Return the number of source sentences."""
        return len(self._source_list)

    def __getitem__(self, i: int) -> ChatInstance:
        """Wrap the i-th source sentence as a single user message with its references."""
        source_text = self._source_list[i]
        return ChatInstance(
            messages=[{"role": "user", "content": source_text}],
            references=self._references_list[i],
            extra_info={},
        )

__init__

__init__(name: str, langpair: str) -> None
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
11
12
13
14
15
16
17
18
19
def __init__(self, name: str, langpair: str) -> None:
    """Fetch source sentences and references for `langpair` from sacrebleu dataset `name`."""
    self._source_list: list[str] = list(sacrebleu.DATASETS[name].source(langpair))
    self._references_list: list[list[str]] = [
        [r.strip() for r in refs] for refs in sacrebleu.DATASETS[name].references(langpair)
    ]

    # Every source sentence must come with exactly one reference group.
    if len(self._source_list) != len(self._references_list):
        msg = "The number of source and reference pairs should be the same."
        raise ValueError(msg)

require_incremental_response

require_incremental_response() -> bool
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
21
22
def require_incremental_response(self) -> bool:
    """Single-turn data: the model just continues from the one user message."""
    return False

__len__

__len__() -> int
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
24
25
def __len__(self) -> int:
    """Return the number of source sentences."""
    return len(self._source_list)

__getitem__

__getitem__(i: int) -> ChatInstance
Source code in flexeval/core/chat_dataset/sacrebleu_dataset.py
27
28
29
30
31
32
def __getitem__(self, i: int) -> ChatInstance:
    """Wrap the i-th source sentence as a single user message with its references."""
    return ChatInstance(
        messages=[{"role": "user", "content": self._source_list[i]}],
        references=self._references_list[i],
        extra_info={},
    )