
RewardModel

RewardModel

Base class for reward models.

Source code in flexeval/core/reward_model/base.py
class RewardModel(ABC):
    """Base class for reward models."""

    @abstractmethod
    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        """Judge a batch of reward bench instances.

        Args:
            batch_reward_bench_instances (list[RewardBenchInstance]): A list of instances, each pairing a chosen response with a rejected response for the same prompt.

        Returns:
            tuple[list[bool], list[dict[str, Any]]]: A tuple with the following elements:
                - chosen_is_betters: Whether the model judges each `chosen` item to be better than its `rejected` counterpart.
                - judge_outputs: A list of auxiliary outputs (rationale, score, etc.) from the model.
        """

batch_judge abstractmethod

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]

Judge a batch of reward bench instances.

Parameters:

  • batch_reward_bench_instances (list[RewardBenchInstance]) –

    A list of RewardBenchInstance objects, each pairing a chosen response with a rejected response for the same prompt.

Returns:

  • tuple[list[bool], list[dict[str, Any]]]

    A tuple with the following elements:

    - chosen_is_betters: Whether the model judges each chosen item to be better than its rejected counterpart.
    - judge_outputs: A list of auxiliary outputs (rationale, score, etc.) from the model.

Source code in flexeval/core/reward_model/base.py
@abstractmethod
def batch_judge(
    self,
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]:
    """Judge a batch of reward bench instances.

    Args:
        batch_reward_bench_instances (list[RewardBenchInstance]): A list of instances, each pairing a chosen response with a rejected response for the same prompt.

    Returns:
        tuple[list[bool], list[dict[str, Any]]]: A tuple with the following elements:
            - chosen_is_betters: Whether the model judges each `chosen` item to be better than its `rejected` counterpart.
            - judge_outputs: A list of auxiliary outputs (rationale, score, etc.) from the model.
    """

LogProbRewardModel

A reward model that judges the quality of a response based on the log probability computed by the auto-regressive language model.

Source code in flexeval/core/reward_model/log_prob.py
class LogProbRewardModel(RewardModel):
    """
    A reward model that judges the quality of a response
    based on the log probability computed by the auto-regressive language model.
    """

    def __init__(self, language_model: LanguageModel) -> None:
        self.language_model = language_model

    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        if not all(len(instance.chosen) == 1 for instance in batch_reward_bench_instances):
            msg = "`chosen` field must have exactly one element."
            raise ValueError(msg)
        if not all(len(instance.rejected) == 1 for instance in batch_reward_bench_instances):
            msg = "`rejected` field must have exactly one element."
            raise ValueError(msg)

        chosen_log_probs = self.language_model.compute_chat_log_probs(
            prompt=[instance.prompt for instance in batch_reward_bench_instances],
            response=[instance.chosen[0] for instance in batch_reward_bench_instances],
        )
        rejected_log_probs = self.language_model.compute_chat_log_probs(
            prompt=[instance.prompt for instance in batch_reward_bench_instances],
            response=[instance.rejected[0] for instance in batch_reward_bench_instances],
        )
        chosen_is_better = [
            chosen_log_prob > rejected_log_prob
            for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
        ]
        outputs = [
            {
                "chosen_log_prob": chosen_log_prob,
                "rejected_log_prob": rejected_log_prob,
            }
            for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
        ]
        return chosen_is_better, outputs

language_model instance-attribute

language_model = language_model

__init__

__init__(language_model: LanguageModel) -> None
Source code in flexeval/core/reward_model/log_prob.py
def __init__(self, language_model: LanguageModel) -> None:
    self.language_model = language_model

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
Source code in flexeval/core/reward_model/log_prob.py
def batch_judge(
    self,
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]:
    if not all(len(instance.chosen) == 1 for instance in batch_reward_bench_instances):
        msg = "`chosen` field must have exactly one element."
        raise ValueError(msg)
    if not all(len(instance.rejected) == 1 for instance in batch_reward_bench_instances):
        msg = "`rejected` field must have exactly one element."
        raise ValueError(msg)

    chosen_log_probs = self.language_model.compute_chat_log_probs(
        prompt=[instance.prompt for instance in batch_reward_bench_instances],
        response=[instance.chosen[0] for instance in batch_reward_bench_instances],
    )
    rejected_log_probs = self.language_model.compute_chat_log_probs(
        prompt=[instance.prompt for instance in batch_reward_bench_instances],
        response=[instance.rejected[0] for instance in batch_reward_bench_instances],
    )
    chosen_is_better = [
        chosen_log_prob > rejected_log_prob
        for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
    ]
    outputs = [
        {
            "chosen_log_prob": chosen_log_prob,
            "rejected_log_prob": rejected_log_prob,
        }
        for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
    ]
    return chosen_is_better, outputs
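A rough usage sketch follows. The HuggingFaceLM class name, its constructor arguments, and the import paths are assumptions; any LanguageModel that implements compute_chat_log_probs for chat messages should work.

# Hypothetical usage sketch; class names other than LogProbRewardModel are assumptions.
from flexeval import HuggingFaceLM, RewardBenchInstance  # assumed re-exports
from flexeval.core.reward_model.log_prob import LogProbRewardModel

lm = HuggingFaceLM("my-org/my-chat-model")  # placeholder checkpoint, not a recommendation
reward_model = LogProbRewardModel(language_model=lm)

instances = [
    RewardBenchInstance(
        prompt=[{"role": "user", "content": "What is the capital of France?"}],
        chosen=[{"role": "assistant", "content": "Paris."}],
        rejected=[{"role": "assistant", "content": "The capital is Berlin."}],
    ),
]
chosen_is_better, outputs = reward_model.batch_judge(instances)
print(chosen_is_better)  # e.g. [True]
print(outputs[0])        # {"chosen_log_prob": ..., "rejected_log_prob": ...}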

PairwiseJudgeRewardModel

Pairwise judge using a chat language model to compare two model or human outputs.

Parameters:

  • language_model (LanguageModel) –

    The language model to use for pairwise comparison. This model is expected to output PairwiseChoice.

  • prompt_template (PromptTemplate) –

    The prompt template to embed the model outputs to be compared. Be sure to include {{prompt}}, {{answer_a}}, and {{answer_b}} (see the example template after this list).

  • system_message (str | PromptTemplate | None, default: None ) –

    The system message to prepend to the chat messages.

  • gen_kwargs (dict[str, Any] | None, default: None ) –

    Generation kwargs for the language model.
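For reference, a hypothetical template might look like the sketch below. The Jinja2PromptTemplate class name and its template keyword are assumptions; the only firm requirements from the parameters above are that the template references {{prompt}}, {{answer_a}}, and {{answer_b}}, and that the instructed output format matches the values of PairwiseChoice.

# Hypothetical template sketch; adapt the class name and the output-format instruction to your setup.
from flexeval import Jinja2PromptTemplate  # assumed re-export

prompt_template = Jinja2PromptTemplate(
    template=(
        "[Question]\n{{ prompt }}\n\n"
        "[Answer A]\n{{ answer_a }}\n\n"
        "[Answer B]\n{{ answer_b }}\n\n"
        # Assumes the PairwiseChoice values are "A" and "B"; check the enum in your version.
        "Which answer is better? Reply with the single letter A or B."
    )
)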

Source code in flexeval/core/reward_model/pairwise_judge_reward_model.py
class PairwiseJudgeRewardModel(RewardModel):
    """Pairwise judge using a chat language model to compare two model or human
    outputs.

    Args:
        language_model: The language model to use for pairwise comparison.
                        This model is expected to output PairwiseChoice.
        prompt_template: The prompt template to embed the model outputs to be compared.
                         Be sure to include {{prompt}}, {{answer_a}}, and {{answer_b}}.
        system_message: The system message to prepend to the chat messages.
        gen_kwargs: Generation kwargs for the language model.
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
        gen_kwargs: dict[str, Any] | None = None,
    ) -> None:
        if gen_kwargs is None:
            gen_kwargs = {}
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message
        self.gen_kwargs = gen_kwargs

    def _create_input_chat_messages_list(self, pairwise_instance: PairwiseInstance) -> list[dict[str, str]]:
        pairwise_instance_asdict = asdict(pairwise_instance)
        judge_input = self.prompt_template.embed_inputs(pairwise_instance_asdict)
        input_chat_messages = [{"role": "user", "content": judge_input}]
        if self.system_message:
            if isinstance(self.system_message, str):
                system_message = self.system_message
            elif isinstance(self.system_message, PromptTemplate):
                system_message = self.system_message.embed_inputs(pairwise_instance_asdict)
            else:
                msg = "system_message should be str or PromptTemplate."
                raise ValueError(msg)
            input_chat_messages.insert(
                0,
                {"role": "system", "content": system_message},
            )
        return input_chat_messages

    def _is_correct_llm_answer(self, llm_answer: str, pairwise_choice: PairwiseChoice) -> bool:
        # If the answer mentions both choices, it is ambiguous and counted as incorrect.
        if PairwiseChoice.A.value in llm_answer and PairwiseChoice.B.value in llm_answer:
            return False
        if pairwise_choice.value in llm_answer:
            return True
        return False

    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        input_chat_messages_list: list[list[dict[str, str]]] = []
        all_pairwise_instances: list[PairwiseInstance] = []
        outputs: list[dict[str, Any]] = []
        for reward_bench_instance in batch_reward_bench_instances:
            pairwise_instance_answer_a_is_chosen = PairwiseInstance(
                prompt=reward_bench_instance.prompt,
                answer_a=reward_bench_instance.chosen,
                answer_b=reward_bench_instance.rejected,
                answer_label=PairwiseChoice.A,
            )
            input_chat_messages_a_is_chosen = self._create_input_chat_messages_list(
                pairwise_instance_answer_a_is_chosen
            )
            input_chat_messages_list.append(input_chat_messages_a_is_chosen)

            pairwise_instance_answer_b_is_chosen = PairwiseInstance(
                prompt=reward_bench_instance.prompt,
                answer_a=reward_bench_instance.rejected,
                answer_b=reward_bench_instance.chosen,
                answer_label=PairwiseChoice.B,
            )
            input_chat_messages_b_is_chosen = self._create_input_chat_messages_list(
                pairwise_instance_answer_b_is_chosen
            )
            input_chat_messages_list.append(input_chat_messages_b_is_chosen)
            all_pairwise_instances += [pairwise_instance_answer_a_is_chosen, pairwise_instance_answer_b_is_chosen]

            output = {
                "llm_inputs": [input_chat_messages_a_is_chosen, input_chat_messages_b_is_chosen],
            }
            outputs.append(output)
        judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list, **self.gen_kwargs)
        chosen_is_betters: list[bool] = [
            self._is_correct_llm_answer(judge_output.text, shuffle_pairwise_instance.answer_label)
            for judge_output, shuffle_pairwise_instance in zip(judge_outputs, all_pairwise_instances)
        ]

        if len(outputs) * 2 != len(chosen_is_betters):
            msg = "The number of outputs should be twice the number of inputs."
            raise ValueError(msg)

        for i in range(len(outputs)):
            outputs[i]["llm_outputs"] = [judge_outputs[i * 2].text, judge_outputs[i * 2 + 1].text]
            outputs[i]["evaluation_results"] = [chosen_is_betters[i * 2], chosen_is_betters[i * 2 + 1]]

        return chosen_is_betters, outputs

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

system_message instance-attribute

system_message = system_message

gen_kwargs instance-attribute

gen_kwargs = gen_kwargs

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    gen_kwargs: dict[str, Any] | None = None,
) -> None
Source code in flexeval/core/reward_model/pairwise_judge_reward_model.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    gen_kwargs: dict[str, Any] | None = None,
) -> None:
    if gen_kwargs is None:
        gen_kwargs = {}
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message
    self.gen_kwargs = gen_kwargs

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
Source code in flexeval/core/reward_model/pairwise_judge_reward_model.py
def batch_judge(
    self,
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]:
    input_chat_messages_list: list[list[dict[str, str]]] = []
    all_pairwise_instances: list[PairwiseInstance] = []
    outputs: list[dict[str, Any]] = []
    for reward_bench_instance in batch_reward_bench_instances:
        pairwise_instance_answer_a_is_chosen = PairwiseInstance(
            prompt=reward_bench_instance.prompt,
            answer_a=reward_bench_instance.chosen,
            answer_b=reward_bench_instance.rejected,
            answer_label=PairwiseChoice.A,
        )
        input_chat_messages_a_is_chosen = self._create_input_chat_messages_list(
            pairwise_instance_answer_a_is_chosen
        )
        input_chat_messages_list.append(input_chat_messages_a_is_chosen)

        pairwise_instance_answer_b_is_chosen = PairwiseInstance(
            prompt=reward_bench_instance.prompt,
            answer_a=reward_bench_instance.rejected,
            answer_b=reward_bench_instance.chosen,
            answer_label=PairwiseChoice.B,
        )
        input_chat_messages_b_is_chosen = self._create_input_chat_messages_list(
            pairwise_instance_answer_b_is_chosen
        )
        input_chat_messages_list.append(input_chat_messages_b_is_chosen)
        all_pairwise_instances += [pairwise_instance_answer_a_is_chosen, pairwise_instance_answer_b_is_chosen]

        output = {
            "llm_inputs": [input_chat_messages_a_is_chosen, input_chat_messages_b_is_chosen],
        }
        outputs.append(output)
    judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list, **self.gen_kwargs)
    chosen_is_betters: list[bool] = [
        self._is_correct_llm_answer(judge_output.text, shuffle_pairwise_instance.answer_label)
        for judge_output, shuffle_pairwise_instance in zip(judge_outputs, all_pairwise_instances)
    ]

    if len(outputs) * 2 != len(chosen_is_betters):
        msg = "The number of outputs should be twice the number of inputs."
        raise ValueError(msg)

    for i in range(len(outputs)):
        outputs[i]["llm_outputs"] = [judge_outputs[i * 2].text, judge_outputs[i * 2 + 1].text]
        outputs[i]["evaluation_results"] = [chosen_is_betters[i * 2], chosen_is_betters[i * 2 + 1]]

    return chosen_is_betters, outputs
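Note that batch_judge evaluates every instance twice, once with the chosen answer in position A and once in position B, to control for position bias, so the returned lists contain two judgements per input instance. A hypothetical end-to-end sketch, reusing prompt_template and instances from the earlier sketches and assuming chat_lm is any chat-capable LanguageModel:

# Hypothetical usage sketch; gen_kwargs keys depend on the underlying LanguageModel.
judge = PairwiseJudgeRewardModel(
    language_model=chat_lm,
    prompt_template=prompt_template,
    system_message="You are a strict and fair judge.",
    gen_kwargs={"max_new_tokens": 16},
)

chosen_is_betters, outputs = judge.batch_judge(instances)
# len(chosen_is_betters) == 2 * len(instances): entries 2*i and 2*i+1 correspond to
# instance i with the chosen answer placed in position A and position B, respectively.
# outputs[i] collects "llm_inputs", "llm_outputs", and "evaluation_results" for instance i.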

SequenceClassificationRewardModel

A reward model that scores each response with a Hugging Face sequence classification model (a trained reward model head) and prefers the response with the higher score.

Source code in flexeval/core/reward_model/sequence_classification.py
class SequenceClassificationRewardModel(RewardModel):
    """Pairwise judge using a chat language model to compare two model or human
    outputs.
    """

    def __init__(
        self,
        model: str,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer: str | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
    ) -> None:
        tokenizer = tokenizer if tokenizer else model
        tokenizer_kwargs = tokenizer_kwargs or {}
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, **tokenizer_kwargs)

        model_kwargs = get_default_model_kwargs(model_kwargs)
        self.model = AutoModelForSequenceClassification.from_pretrained(model, **model_kwargs)
        # Set pad_token_id if not set
        # to avoid "ValueError: Cannot handle batch sizes > 1 if no padding token is defined." in self.model()
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id
        self.model.eval()

    @torch.inference_mode()
    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        chosen_messages = [instance.prompt + instance.chosen for instance in batch_reward_bench_instances]
        chosen_inputs = self.tokenizer.apply_chat_template(
            chosen_messages, return_tensors="pt", padding=True, return_dict=True
        )
        chosen_outputs = self.model(**{k: v.to(self.model.device) for k, v in chosen_inputs.items()})
        chosen_rewards = chosen_outputs.logits[:, 0]

        rejected_messages = [instance.prompt + instance.rejected for instance in batch_reward_bench_instances]
        rejected_inputs = self.tokenizer.apply_chat_template(
            rejected_messages, return_tensors="pt", padding=True, return_dict=True
        )
        rejected_outputs = self.model(**{k: v.to(self.model.device) for k, v in rejected_inputs.items()})
        rejected_rewards = rejected_outputs.logits[:, 0]

        chosen_is_better = (chosen_rewards > rejected_rewards).tolist()
        outputs = [
            {
                "chosen_reward": chosen_reward.item(),
                "rejected_reward": rejected_reward.item(),
            }
            for chosen_reward, rejected_reward in zip(chosen_rewards, rejected_rewards)
        ]
        return chosen_is_better, outputs
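A hypothetical usage sketch; the checkpoint name is a placeholder, and the model must be loadable with AutoModelForSequenceClassification and ship a tokenizer whose chat template accepts the concatenated prompt and response messages.

# Hypothetical usage sketch; the checkpoint name is a placeholder, not a recommendation.
from flexeval.core.reward_model.sequence_classification import SequenceClassificationRewardModel

reward_model = SequenceClassificationRewardModel(
    model="my-org/my-sequence-classification-reward-model",
)
chosen_is_better, outputs = reward_model.batch_judge(instances)  # instances as in the earlier sketch
print(outputs[0])  # {"chosen_reward": ..., "rejected_reward": ...}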

tokenizer instance-attribute

tokenizer = from_pretrained(tokenizer, **tokenizer_kwargs)

model instance-attribute

model = from_pretrained(model, **model_kwargs)

__init__

__init__(
    model: str,
    model_kwargs: dict[str, Any] | None = None,
    tokenizer: str | None = None,
    tokenizer_kwargs: dict[str, Any] | None = None,
) -> None
Source code in flexeval/core/reward_model/sequence_classification.py
def __init__(
    self,
    model: str,
    model_kwargs: dict[str, Any] | None = None,
    tokenizer: str | None = None,
    tokenizer_kwargs: dict[str, Any] | None = None,
) -> None:
    tokenizer = tokenizer if tokenizer else model
    tokenizer_kwargs = tokenizer_kwargs or {}
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, **tokenizer_kwargs)

    model_kwargs = get_default_model_kwargs(model_kwargs)
    self.model = AutoModelForSequenceClassification.from_pretrained(model, **model_kwargs)
    # Set pad_token_id if not set
    # to avoid "ValueError: Cannot handle batch sizes > 1 if no padding token is defined." in self.model()
    if self.model.config.pad_token_id is None:
        self.model.config.pad_token_id = self.tokenizer.pad_token_id
    self.model.eval()

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
Source code in flexeval/core/reward_model/sequence_classification.py
@torch.inference_mode()
def batch_judge(
    self,
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]:
    chosen_messages = [instance.prompt + instance.chosen for instance in batch_reward_bench_instances]
    chosen_inputs = self.tokenizer.apply_chat_template(
        chosen_messages, return_tensors="pt", padding=True, return_dict=True
    )
    chosen_outputs = self.model(**{k: v.to(self.model.device) for k, v in chosen_inputs.items()})
    chosen_rewards = chosen_outputs.logits[:, 0]

    rejected_messages = [instance.prompt + instance.rejected for instance in batch_reward_bench_instances]
    rejected_inputs = self.tokenizer.apply_chat_template(
        rejected_messages, return_tensors="pt", padding=True, return_dict=True
    )
    rejected_outputs = self.model(**{k: v.to(self.model.device) for k, v in rejected_inputs.items()})
    rejected_rewards = rejected_outputs.logits[:, 0]

    chosen_is_better = (chosen_rewards > rejected_rewards).tolist()
    outputs = [
        {
            "chosen_reward": chosen_reward.item(),
            "rejected_reward": rejected_reward.item(),
        }
        for chosen_reward, rejected_reward in zip(chosen_rewards, rejected_rewards)
    ]
    return chosen_is_better, outputs