PairwiseJudge

Judge which model is better given two items.

The output is a tuple of the winner and the rationale.

Source code in flexeval/core/pairwise_comparison/judge/base.py
class PairwiseJudge(ABC):
    """Judge which model is better given two items.

    The output is a tuple of the winner and the rationale.
    """

    @abstractmethod
    def batch_judge(
        self,
        batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
    ) -> list[tuple[Winner, str]]:
        """
        Judge which model is better given a batch of item pairs.

        Args:
            batch_model_items: A list of tuples, each containing two model items.
        """

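To make the contract concrete, here is a minimal sketch of a custom judge. It assumes the module path implied by the "Source code in" note above; the "lm_output" key is a hypothetical placeholder for wherever your item dicts keep each model's output.

from typing import Any

from flexeval.core.pairwise_comparison.judge.base import PairwiseJudge, Winner


class LongerOutputJudge(PairwiseJudge):
    """Toy judge that prefers the longer output (illustration only)."""

    def batch_judge(
        self,
        batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
    ) -> list[tuple[Winner, str]]:
        results: list[tuple[Winner, str]] = []
        for model1_item, model2_item in batch_model_items:
            # "lm_output" is a hypothetical key; substitute whichever key
            # your items actually store the model output under.
            len1 = len(str(model1_item.get("lm_output", "")))
            len2 = len(str(model2_item.get("lm_output", "")))
            if len1 > len2:
                results.append((Winner.MODEL1, f"model1 is longer ({len1} > {len2})"))
            elif len2 > len1:
                results.append((Winner.MODEL2, f"model2 is longer ({len2} > {len1})"))
            else:
                results.append((Winner.DRAW, "outputs have equal length"))
        return results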
batch_judge (abstractmethod)

batch_judge(
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]

Judge which model is better given a batch of item pairs.

Parameters:

  • batch_model_items (list[tuple[dict[str, Any], dict[str, Any]]]) – A list of tuples, each containing two model items.

Source code in flexeval/core/pairwise_comparison/judge/base.py
@abstractmethod
def batch_judge(
    self,
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]:
    """
    Judge which model is better given a batch of item pairs.

    Args:
        batch_model_items: A list of tuples, each containing two model items.
    """

Winner

Enum class to indicate the winner of a pairwise comparison.

Source code in flexeval/core/pairwise_comparison/judge/base.py
class Winner(Enum):
    """
    Enum class to indicate the winner of a pairwise comparison.
    """

    MODEL1 = "model1"
    MODEL2 = "model2"
    DRAW = "draw"

    def __str__(self) -> str:
        # used when serializing to JSON
        return self.value

MODEL1 (class attribute)

MODEL1 = 'model1'

MODEL2 (class attribute)

MODEL2 = 'model2'

DRAW (class attribute)

DRAW = 'draw'

__str__

__str__() -> str

Return the enum's string value; used when serializing to JSON.

Source code in flexeval/core/pairwise_comparison/judge/base.py
def __str__(self) -> str:
    # used when serializing to JSON
    return self.value
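
A quick sketch of the enum's behavior, assuming the module path implied by the "Source code in" note above:

import json

from flexeval.core.pairwise_comparison.judge.base import Winner

print(Winner.MODEL1)                              # model1
print(json.dumps({"winner": str(Winner.DRAW)}))   # {"winner": "draw"}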

ChatLLMPairwiseJudge

Pairwise judge using a chat language model to compare two model outputs.

Parameters:

  • language_model (LanguageModel) – The language model to use for pairwise comparison.

  • prompt_template (PromptTemplate) – The prompt template into which the model outputs to be compared are embedded.

  • system_message (str | PromptTemplate | None, default: None) – The system message to prepend to the chat messages.

Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
class ChatLLMPairwiseJudge(PairwiseJudge):
    """
    Pairwise judge using a chat language model to compare two model outputs.

    Args:
        language_model: The language model to use for pairwise comparison.
        prompt_template: The prompt template into which the model outputs to be compared are embedded.
        system_message: The system message to prepend to the chat messages.
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message

    @staticmethod
    def _parse_judge_output(judge_output: str) -> tuple[Winner, str]:
        """Extract the last integer value from the judge output and return the
        corresponding Winner and its rationale.

        Return `Winner.DRAW` if parsing fails.
        """
        try:
            matched = re.findall(r"(\d+)", judge_output)
            value = int(matched[-1])
            winner: Winner
            rationale = judge_output
            if value == 1:
                winner = Winner.MODEL1
            elif value == 2:
                winner = Winner.MODEL2
            elif value == 3:
                winner = Winner.DRAW
            else:
                logger.warning(f"Invalid number {value} was extracted:\n\n{judge_output}")
                winner = Winner.DRAW
                rationale = f"Invalid judge '{value}': {judge_output}"
        except (IndexError, ValueError):
            logger.warning(f"Failed to extract the judgment result:\n\n{judge_output}")
            return Winner.DRAW, f"Parsing failure: {judge_output}"
        else:
            return winner, rationale

    def batch_judge(self, batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]]) -> list[tuple[Winner, str]]:
        input_chat_messages_list: list[list[dict[str, str]]] = []
        for model1_item, model2_item in batch_model_items:
            references = model1_item["references"]
            prompt_inputs = {
                "model1_item": model1_item,
                "model2_item": model2_item,
                "references": references,
            }
            judge_input = self.prompt_template.embed_inputs(prompt_inputs)
            input_chat_messages = [{"role": "user", "content": judge_input}]
            if self.system_message:
                if isinstance(self.system_message, str):
                    system_message = self.system_message
                else:
                    system_message = self.system_message.embed_inputs(prompt_inputs)
                input_chat_messages.insert(
                    0,
                    {"role": "system", "content": system_message},
                )
            input_chat_messages_list.append(input_chat_messages)
        judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list)
        return [self._parse_judge_output(output.text) for output in judge_outputs]
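
Because _parse_judge_output is a staticmethod, its behavior is easy to check in isolation: it keys on the last integer in the judge's reply (1 for model1, 2 for model2, 3 for a draw) and falls back to a draw when no number can be extracted. A small sketch, assuming the module path implied by the "Source code in" note above:

from flexeval.core.pairwise_comparison.judge.llm_judge import ChatLLMPairwiseJudge

parse = ChatLLMPairwiseJudge._parse_judge_output

winner, rationale = parse("Response 1 is longer, but 2 is more accurate. Verdict: 2")
print(winner)      # model2  (the last integer, 2, decides the winner)

winner, _ = parse("Both answers are equally good. Verdict: 3")
print(winner)      # draw

winner, rationale = parse("No clear winner here.")  # no integer at all
print(winner)      # draw
print(rationale)   # Parsing failure: No clear winner here.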

language_model (instance attribute)

language_model = language_model

prompt_template (instance attribute)

prompt_template = prompt_template

system_message (instance attribute)

system_message = system_message

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
) -> None
Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message

batch_judge

batch_judge(
    batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]],
) -> list[tuple[Winner, str]]
Source code in flexeval/core/pairwise_comparison/judge/llm_judge.py
def batch_judge(self, batch_model_items: list[tuple[dict[str, Any], dict[str, Any]]]) -> list[tuple[Winner, str]]:
    input_chat_messages_list: list[list[dict[str, str]]] = []
    for model1_item, model2_item in batch_model_items:
        references = model1_item["references"]
        prompt_inputs = {
            "model1_item": model1_item,
            "model2_item": model2_item,
            "references": references,
        }
        judge_input = self.prompt_template.embed_inputs(prompt_inputs)
        input_chat_messages = [{"role": "user", "content": judge_input}]
        if self.system_message:
            if isinstance(self.system_message, str):
                system_message = self.system_message
            else:
                system_message = self.system_message.embed_inputs(prompt_inputs)
            input_chat_messages.insert(
                0,
                {"role": "system", "content": system_message},
            )
        input_chat_messages_list.append(input_chat_messages)
    judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list)
    return [self._parse_judge_output(output.text) for output in judge_outputs]
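
For an end-to-end view, here is a runnable sketch that wires ChatLLMPairwiseJudge to stand-in stubs. The stubs only mimic the two calls the judge actually makes (prompt_template.embed_inputs and language_model.generate_chat_response), and the "lm_output" key is a hypothetical choice of where the items keep each model's output; in practice you would pass a real flexeval LanguageModel and PromptTemplate instead.

from dataclasses import dataclass
from typing import Any

from flexeval.core.pairwise_comparison.judge.llm_judge import ChatLLMPairwiseJudge


@dataclass
class _FakeResponse:
    text: str


class _FakeChatModel:
    """Stub for a LanguageModel; always answers with a draw verdict."""

    def generate_chat_response(
        self, messages_list: list[list[dict[str, str]]]
    ) -> list[_FakeResponse]:
        return [_FakeResponse("Both responses look comparable. Verdict: 3") for _ in messages_list]


class _FakeTemplate:
    """Stub for a PromptTemplate; renders the two items into one prompt."""

    def embed_inputs(self, inputs: dict[str, Any]) -> str:
        return (
            f"Model 1: {inputs['model1_item']['lm_output']}\n"
            f"Model 2: {inputs['model2_item']['lm_output']}\n"
            f"References: {inputs['references']}\n"
            "Reply with 1, 2, or 3 (draw)."
        )


judge = ChatLLMPairwiseJudge(
    language_model=_FakeChatModel(),
    prompt_template=_FakeTemplate(),
    system_message="You are an impartial judge.",
)

# Each item must carry a "references" key, which batch_judge reads directly.
batch = [
    (
        {"lm_output": "Paris.", "references": ["Paris"]},
        {"lm_output": "The capital of France is Paris.", "references": ["Paris"]},
    ),
]
for winner, rationale in judge.batch_judge(batch):
    print(winner, "->", rationale)   # draw -> Both responses look comparable. Verdict: 3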