PairwiseJudge

Judge which model is better given two items.

The output is a tuple of the winner and the rationale.

Source code in flexeval/core/pairwise_comparison/judge/base.py
class PairwiseJudge(ABC):
    """Judge which model is better given two items.

    The output is a tuple of the winner and the rationale.
    """

    @abstractmethod
    def batch_judge(
        self,
        batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
    ) -> list[tuple[Winner, str]]:
        """
        Judge which model is better given a batch of item pairs.

        Args:
            batch_model_items: A list of tuples, each containing two model items.
        """

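To make the contract concrete, here is a minimal sketch of a custom judge. It assumes the module path implied by the "Source code in" note above; the "lm_output" key is a hypothetical placeholder for wherever your item dicts keep each model's output.

from typing import Any

from flexeval.core.pairwise_comparison.judge.base import PairwiseJudge, Winner


class LongerOutputJudge(PairwiseJudge):
    """Toy judge that prefers the longer output (illustration only)."""

    def batch_judge(
        self,
        batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
    ) -> list[tuple[Winner, str]]:
        results: list[tuple[Winner, str]] = []
        for model1_item, model2_item in batch_model_items:
            # "lm_output" is a hypothetical key; substitute whichever key
            # your items actually store the model output under.
            len1 = len(str(model1_item.get("lm_output", "")))
            len2 = len(str(model2_item.get("lm_output", "")))
            if len1 > len2:
                results.append((Winner.MODEL1, f"model1 is longer ({len1} > {len2})"))
            elif len2 > len1:
                results.append((Winner.MODEL2, f"model2 is longer ({len2} > {len1})"))
            else:
                results.append((Winner.DRAW, "outputs have equal length"))
        return results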
batch_judge (abstractmethod)

batch_judge(
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]

Judge which model is better given a batch of item pairs.

Parameters:

  • batch_model_items (list[tuple[dict[str, Any], dict[str, Any]]]) – A list of tuples, each containing two model items.

Source code in flexeval/core/pairwise_comparison/judge/base.py
@abstractmethod
def batch_judge(
    self,
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]:
    """
    Judge which model is better given a batch of item pairs.

    Args:
        batch_model_items: A list of tuples, each containing two model items.
    """

Winner

Enum class to indicate the winner of a pairwise comparison.

Source code in flexeval/core/pairwise_comparison/judge/base.py
class Winner(Enum):
    """
    Enum class to indicate the winner of a pairwise comparison.
    """

    MODEL1 = "model1"
    MODEL2 = "model2"
    DRAW = "draw"

    def __str__(self) -> str:
        # used when serializing to JSON
        return self.value

MODEL1 (class attribute)

MODEL1 = 'model1'

MODEL2 (class attribute)

MODEL2 = 'model2'

DRAW (class attribute)

DRAW = 'draw'

__str__

__str__() -> str

Return the enum's string value; used when serializing to JSON.

Source code in flexeval/core/pairwise_comparison/judge/base.py
def __str__(self) -> str:
    # used when serializing to JSON
    return self.value
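
A quick sketch of the enum's behavior, assuming the module path implied by the "Source code in" note above:

import json

from flexeval.core.pairwise_comparison.judge.base import Winner

print(Winner.MODEL1)                              # model1
print(json.dumps({"winner": str(Winner.DRAW)}))   # {"winner": "draw"}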

ChatLLMPairwiseJudge

Pairwise judge using a chat language model to compare two model outputs.

Parameters:

  • language_model (LanguageModel) – The language model to use for pairwise comparison.

  • prompt_template (PromptTemplate) – The prompt template into which the model outputs to be compared are embedded.

  • system_message (str | PromptTemplate | None, default: None) – The system message to prepend to the chat messages.

Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
class ChatLLMPairwiseJudge(PairwiseJudge):
    """
    Pairwise judge using a chat language model to compare two model outputs.

    Args:
        language_model: The language model to use for pairwise comparison.
        prompt_template: The prompt template into which the model outputs to be compared are embedded.
        system_message: The system message to prepend to the chat messages.
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message

    @staticmethod
    def _parse_judge_output(judge_output: str) -> tuple[Winner, str]:
        """Extract the last integer value from the judge output and return the
        corresponding Winner and its rationale.

        Return `Winner.DRAW` if parsing fails.
        """
        try:
            matched = re.findall(r"(\d+)", judge_output)
            value = int(matched[-1])
            winner: Winner
            rationale = judge_output
            if value == 1:
                winner = Winner.MODEL1
            elif value == 2:
                winner = Winner.MODEL2
            elif value == 3:
                winner = Winner.DRAW
            else:
                logger.warning(f"Invalid number {value} was extracted:\n\n{judge_output}")
                winner = Winner.DRAW
                rationale = f"Invalid judge '{value}': {judge_output}"
        except (IndexError, ValueError):
            logger.warning(f"Failed to extract the judgment result:\n\n{judge_output}")
            return Winner.DRAW, f"Parsing failure: {judge_output}"
        else:
            return winner, rationale

    def batch_judge(self, batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]]) -> list[tuple[Winner, str]]:
        input_chat_messages_list: list[list[dict[str, str]]] = []
        for model1_item, model2_item in batch_model_items:
            references = model1_item["references"]
            prompt_inputs = {
                "model1_item": model1_item,
                "model2_item": model2_item,
                "references": references,
            }
            judge_input = self.prompt_template.embed_inputs(prompt_inputs)
            input_chat_messages = [{"role": "user", "content": judge_input}]
            if self.system_message:
                if isinstance(self.system_message, str):
                    system_message = self.system_message
                else:
                    system_message = self.system_message.embed_inputs(prompt_inputs)
                input_chat_messages.insert(
                    0,
                    {"role": "system", "content": system_message},
                )
            input_chat_messages_list.append(input_chat_messages)
        judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list)
        return [self._parse_judge_output(output.text) for output in judge_outputs]
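
Because _parse_judge_output is a staticmethod, its behavior is easy to check in isolation: it keys on the last integer in the judge's reply (1 for model1, 2 for model2, 3 for a draw) and falls back to a draw when no number can be extracted. A small sketch, assuming the module path implied by the "Source code in" note above:

from flexeval.core.pairwise_comparison.judge.llm_judge import ChatLLMPairwiseJudge

parse = ChatLLMPairwiseJudge._parse_judge_output

winner, rationale = parse("Response 1 is longer, but 2 is more accurate. Verdict: 2")
print(winner)      # model2  (the last integer, 2, decides the winner)

winner, _ = parse("Both answers are equally good. Verdict: 3")
print(winner)      # draw

winner, rationale = parse("No clear winner here.")  # no integer at all
print(winner)      # draw
print(rationale)   # Parsing failure: No clear winner here.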

language_model (instance attribute)

language_model = language_model

prompt_template (instance attribute)

prompt_template = prompt_template

system_message (instance attribute)

system_message = system_message

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
) -> None
Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message

batch_judge

batch_judge(
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]
Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
def batch_judge(self, batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]]) -> list[tuple[Winner, str]]:
    input_chat_messages_list: list[list[dict[str, str]]] = []
    for model1_item, model2_item in batch_model_items:
        references = model1_item["references"]
        prompt_inputs = {
            "model1_item": model1_item,
            "model2_item": model2_item,
            "references": references,
        }
        judge_input = self.prompt_template.embed_inputs(prompt_inputs)
        input_chat_messages = [{"role": "user", "content": judge_input}]
        if self.system_message:
            if isinstance(self.system_message, str):
                system_message = self.system_message
            else:
                system_message = self.system_message.embed_inputs(prompt_inputs)
            input_chat_messages.insert(
                0,
                {"role": "system", "content": system_message},
            )
        input_chat_messages_list.append(input_chat_messages)
    judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list)
    return [self._parse_judge_output(output.text) for output in judge_outputs]
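
For an end-to-end view, here is a runnable sketch that wires ChatLLMPairwiseJudge to stand-in stubs. The stubs only mimic the two calls the judge actually makes (prompt_template.embed_inputs and language_model.generate_chat_response), and the "lm_output" key is a hypothetical choice of where the items keep each model's output; in practice you would pass a real flexeval LanguageModel and PromptTemplate instead.

from dataclasses import dataclass
from typing import Any

from flexeval.core.pairwise_comparison.judge.llm_judge import ChatLLMPairwiseJudge


@dataclass
class _FakeResponse:
    text: str


class _FakeChatModel:
    """Stub for a LanguageModel; always answers with a draw verdict."""

    def generate_chat_response(
        self, messages_list: list[list[dict[str, str]]]
    ) -> list[_FakeResponse]:
        return [_FakeResponse("Both responses look comparable. Verdict: 3") for _ in messages_list]


class _FakeTemplate:
    """Stub for a PromptTemplate; renders the two items into one prompt."""

    def embed_inputs(self, inputs: dict[str, Any]) -> str:
        return (
            f"Model 1: {inputs['model1_item']['lm_output']}\n"
            f"Model 2: {inputs['model2_item']['lm_output']}\n"
            f"References: {inputs['references']}\n"
            "Reply with 1, 2, or 3 (draw)."
        )


judge = ChatLLMPairwiseJudge(
    language_model=_FakeChatModel(),
    prompt_template=_FakeTemplate(),
    system_message="You are an impartial judge.",
)

# Each item must carry a "references" key, which batch_judge reads directly.
batch = [
    (
        {"lm_output": "Paris.", "references": ["Paris"]},
        {"lm_output": "The capital of France is Paris.", "references": ["Paris"]},
    ),
]
for winner, rationale in judge.batch_judge(batch):
    print(winner, "->", rationale)   # draw -> Both responses look comparable. Verdict: 3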