RewardModel

Base class for reward models.

Source code in flexeval/core/reward_model/base.py
class RewardModel(ABC):
    """Base class for reward models."""

    @abstractmethod
    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        """Judge a batch of reward bench instances.

        Args:
            batch_reward_bench_instances (list[RewardBenchInstance]): A list of instances, each pairing a prompt with a `chosen` and a `rejected` response.

        Returns:
            tuple[list[bool], list[dict[str, Any]]]: A tuple with the following elements:
                - chosen_is_betters: Whether each `chosen` item is judged better than its `rejected` counterpart.
                - judge_outputs: A list of outputs (rationale, score, etc.) from the model.
        """

batch_judge abstractmethod

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]

Judge a batch of reward bench instances.

Parameters:

  • batch_reward_bench_instances (list[RewardBenchInstance]) –

    A list of instances, each pairing a prompt with a chosen and a rejected response.

Returns:

  • tuple[list[bool], list[dict[str, Any]]]

    A tuple with the following elements:

      • chosen_is_betters: Whether each chosen item is judged better than its rejected counterpart.
      • judge_outputs: A list of outputs (rationale, score, etc.) from the model.


LogProbRewardModel

A reward model that judges the quality of a response based on the log probability computed by an autoregressive language model.

Source code in flexeval/core/reward_model/log_prob.py
class LogProbRewardModel(RewardModel):
    """
    A reward model that judges the quality of a response
    based on the log probability computed by an autoregressive language model.
    """

    def __init__(self, language_model: LanguageModel) -> None:
        self.language_model = language_model

    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        if not all(len(instance.chosen) == 1 for instance in batch_reward_bench_instances):
            msg = "`chosen` field must have exactly one element."
            raise ValueError(msg)
        if not all(len(instance.rejected) == 1 for instance in batch_reward_bench_instances):
            msg = "`rejected` field must have exactly one element."
            raise ValueError(msg)

        chosen_log_probs = self.language_model.compute_chat_log_probs(
            prompt=[instance.prompt for instance in batch_reward_bench_instances],
            response=[instance.chosen[0] for instance in batch_reward_bench_instances],
        )
        rejected_log_probs = self.language_model.compute_chat_log_probs(
            prompt=[instance.prompt for instance in batch_reward_bench_instances],
            response=[instance.rejected[0] for instance in batch_reward_bench_instances],
        )
        chosen_is_better = [
            chosen_log_prob > rejected_log_prob
            for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
        ]
        outputs = [
            {
                "chosen_log_prob": chosen_log_prob,
                "rejected_log_prob": rejected_log_prob,
            }
            for chosen_log_prob, rejected_log_prob in zip(chosen_log_probs, rejected_log_probs)
        ]
        return chosen_is_better, outputs

language_model instance-attribute

language_model = language_model

__init__

__init__(language_model: LanguageModel) -> None

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
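A minimal usage sketch; the language model class and checkpoint name are assumptions, and any flexeval LanguageModel that implements compute_chat_log_probs should work:

from flexeval import HuggingFaceLM, RewardBenchInstance  # assumed import paths

lm = HuggingFaceLM(model="my-org/my-chat-lm")  # hypothetical checkpoint
reward_model = LogProbRewardModel(language_model=lm)

instance = RewardBenchInstance(
    prompt=[{"role": "user", "content": "What is 2 + 2?"}],
    chosen=[{"role": "assistant", "content": "4"}],
    rejected=[{"role": "assistant", "content": "5"}],
)
chosen_is_better, outputs = reward_model.batch_judge([instance])
# outputs[0] -> {"chosen_log_prob": <float>, "rejected_log_prob": <float>}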

PairwiseJudgeRewardModel

Pairwise judge using a chat language model to compare two model or human outputs. The reward model’s judgment is counted as correct only if it is order‑invariant: when given (A = chosen, B = rejected) it prefers A, and when the inputs are swapped (A = rejected, B = chosen) it prefers B.

Examples:

  • ✅ Correct (order‑invariant):
      • judge(prompt, A=chosen, B=rejected) → A
      • judge(prompt, A=rejected, B=chosen) → B
  • ❌ Incorrect (position bias; same answer regardless of order):
      • judge(prompt, A=chosen, B=rejected) → A
      • judge(prompt, A=rejected, B=chosen) → A
  • ❌ Incorrect (both wrong):
      • judge(prompt, A=chosen, B=rejected) → B
      • judge(prompt, A=rejected, B=chosen) → A

Parameters:

  • language_model (LanguageModel) –

    The language model to use for pairwise comparison. This model is expected to output PairwiseChoice.

  • prompt_template (PromptTemplate) –

    The prompt template to embed the model outputs to be compared. Be sure to include {{prompt}}, {{answer_a}}, and {{answer_b}} (see the example template after this list).

  • system_message (str | PromptTemplate | None, default: None ) –

    The system message to prepend to the chat messages.

  • gen_kwargs (dict[str, Any] | None, default: None ) –

    Generation kwargs for the language model.
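
For reference, a minimal Jinja2-style template along these lines satisfies the placeholder requirement (the wording is an illustration, not a template shipped with flexeval):

Given a prompt and two candidate answers, reply with exactly "A" or "B" to indicate the better answer.

Prompt: {{ prompt }}

Answer A: {{ answer_a }}

Answer B: {{ answer_b }}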

Source code in flexeval/core/reward_model/pairwise_judge_reward_model.py
class PairwiseJudgeRewardModel(RewardModel):
    """Pairwise judge using a chat language model to compare two model or human
    outputs.
    The reward model’s judgment is counted as **correct only if it is order‑invariant**:
    when given (A = chosen, B = rejected) it prefers **A**, and when the inputs are swapped
    (A = rejected, B = chosen) it prefers **B**.

    Examples:
    - ✅ Correct (order‑invariant):
      - judge(prompt, A=chosen, B=rejected) → **A**
      - judge(prompt, A=rejected, B=chosen) → **B**
    - ❌ Incorrect (position bias; same answer regardless of order):
      - judge(prompt, A=chosen, B=rejected) → **A**
      - judge(prompt, A=rejected, B=chosen) → **A**
    - ❌ Incorrect (both wrong):
      - judge(prompt, A=chosen, B=rejected) → **B**
      - judge(prompt, A=rejected, B=chosen) → **A**

    Args:
        language_model: The language model to use for pairwise comparison.
                        This model is expected to output PairwiseChoice.
        prompt_template: The prompt template to embed the model outputs to be compared.
                         Be sure to include {{prompt}}, {{answer_a}}, and {{answer_b}}.
        system_message: The system message to prepend to the chat messages.
        gen_kwargs: Generation kwargs for the language model.
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
        gen_kwargs: dict[str, Any] | None = None,
    ) -> None:
        if gen_kwargs is None:
            gen_kwargs = {}
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message
        self.gen_kwargs = gen_kwargs

    def _create_input_chat_messages_list(self, pairwise_instance: PairwiseInstance) -> list[dict[str, str]]:
        pairwise_instance_asdict = asdict(pairwise_instance)
        judge_input = self.prompt_template.embed_inputs(pairwise_instance_asdict)
        input_chat_messages = [{"role": "user", "content": judge_input}]
        if self.system_message:
            if isinstance(self.system_message, str):
                system_message = self.system_message
            elif isinstance(self.system_message, PromptTemplate):
                system_message = self.system_message.embed_inputs(pairwise_instance_asdict)
            else:
                msg = "system_message should be str or PromptTemplate."
                raise ValueError(msg)
            input_chat_messages.insert(
                0,
                {"role": "system", "content": system_message},
            )
        return input_chat_messages

    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        input_chat_messages_list: list[list[dict[str, str]]] = []
        all_pairwise_instances: list[PairwiseInstance] = []
        outputs: list[dict[str, Any]] = []
        for reward_bench_instance in batch_reward_bench_instances:
            # to address position biases, create two inputs by swapping chosen/rejected orderings
            pairwise_instance_answer_a_is_chosen = PairwiseInstance(
                prompt=reward_bench_instance.prompt,
                answer_a=reward_bench_instance.chosen,
                answer_b=reward_bench_instance.rejected,
                answer_label=PairwiseChoice.A,
            )
            input_chat_messages_a_is_chosen = self._create_input_chat_messages_list(
                pairwise_instance_answer_a_is_chosen
            )
            input_chat_messages_list.append(input_chat_messages_a_is_chosen)

            pairwise_instance_answer_b_is_chosen = PairwiseInstance(
                prompt=reward_bench_instance.prompt,
                answer_a=reward_bench_instance.rejected,
                answer_b=reward_bench_instance.chosen,
                answer_label=PairwiseChoice.B,
            )
            input_chat_messages_b_is_chosen = self._create_input_chat_messages_list(
                pairwise_instance_answer_b_is_chosen
            )
            input_chat_messages_list.append(input_chat_messages_b_is_chosen)
            all_pairwise_instances += [pairwise_instance_answer_a_is_chosen, pairwise_instance_answer_b_is_chosen]

            output = {
                "llm_inputs": [input_chat_messages_a_is_chosen, input_chat_messages_b_is_chosen],
            }
            outputs.append(output)
        judge_outputs = self.language_model.generate_chat_response(input_chat_messages_list, **self.gen_kwargs)
        chosen_is_better_list: list[bool] = [
            evaluate_model_output(judge_output.text, pairwise_instance.answer_label)
            for judge_output, pairwise_instance in zip(judge_outputs, all_pairwise_instances)
        ]

        if len(outputs) * 2 != len(chosen_is_better_list):
            msg = "The number of outputs should be twice the number of inputs."
            raise ValueError(msg)

        aggregated_results, aggregated_outputs = aggregate_judge_results(outputs, judge_outputs, chosen_is_better_list)

        return aggregated_results, aggregated_outputs

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

system_message instance-attribute

system_message = system_message

gen_kwargs instance-attribute

gen_kwargs = gen_kwargs

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    gen_kwargs: dict[str, Any] | None = None,
) -> None

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
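Because each instance is judged twice (chosen as A, then chosen as B), the per-instance verdict reduces to a logical AND over the two orderings. A minimal sketch of that reduction follows; it illustrates the idea and is not flexeval's aggregate_judge_results implementation:

def aggregate_pairwise_verdicts(per_ordering_verdicts: list[bool]) -> list[bool]:
    """Collapse [a_first_0, b_first_0, a_first_1, b_first_1, ...] into one verdict per instance."""
    if len(per_ordering_verdicts) % 2 != 0:
        raise ValueError("Expected two verdicts per instance.")
    # An instance counts as correct only if both orderings prefer the chosen answer.
    return [
        per_ordering_verdicts[i] and per_ordering_verdicts[i + 1]
        for i in range(0, len(per_ordering_verdicts), 2)
    ]

# aggregate_pairwise_verdicts([True, True, True, False]) -> [True, False]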

SequenceClassificationRewardModel

A reward model that scores each response with a Hugging Face sequence classification model and prefers the response with the higher scalar reward.

Source code in flexeval/core/reward_model/sequence_classification.py
class SequenceClassificationRewardModel(RewardModel):
    """A reward model that scores each response with a Hugging Face sequence
    classification model and prefers the response with the higher scalar reward.
    """

    def __init__(
        self,
        model: str,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer: str | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
    ) -> None:
        tokenizer = tokenizer if tokenizer else model
        tokenizer_kwargs = tokenizer_kwargs or {}
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, **tokenizer_kwargs)

        model_kwargs = get_default_model_kwargs(model_kwargs)
        self.model = AutoModelForSequenceClassification.from_pretrained(model, **model_kwargs)
        # Set pad_token_id if not set
        # to avoid "ValueError: Cannot handle batch sizes > 1 if no padding token is defined." in self.model()
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id
        self.model.eval()

    @torch.inference_mode()
    def batch_judge(
        self,
        batch_reward_bench_instances: list[RewardBenchInstance],
    ) -> tuple[list[bool], list[dict[str, Any]]]:
        chosen_messages = [instance.prompt + instance.chosen for instance in batch_reward_bench_instances]
        chosen_inputs = self.tokenizer.apply_chat_template(
            chosen_messages, return_tensors="pt", padding=True, return_dict=True
        )
        chosen_outputs = self.model(**{k: v.to(self.model.device) for k, v in chosen_inputs.items()})
        chosen_rewards = chosen_outputs.logits[:, 0]

        rejected_messages = [instance.prompt + instance.rejected for instance in batch_reward_bench_instances]
        rejected_inputs = self.tokenizer.apply_chat_template(
            rejected_messages, return_tensors="pt", padding=True, return_dict=True
        )
        rejected_outputs = self.model(**{k: v.to(self.model.device) for k, v in rejected_inputs.items()})
        rejected_rewards = rejected_outputs.logits[:, 0]

        chosen_is_better = (chosen_rewards > rejected_rewards).tolist()
        outputs = [
            {
                "chosen_reward": chosen_reward.item(),
                "rejected_reward": rejected_reward.item(),
            }
            for chosen_reward, rejected_reward in zip(chosen_rewards, rejected_rewards)
        ]
        return chosen_is_better, outputs

tokenizer instance-attribute

tokenizer = from_pretrained(tokenizer, **tokenizer_kwargs)

model instance-attribute

model = from_pretrained(model, **model_kwargs)

__init__

__init__(
    model: str,
    model_kwargs: dict[str, Any] | None = None,
    tokenizer: str | None = None,
    tokenizer_kwargs: dict[str, Any] | None = None,
) -> None

batch_judge

batch_judge(
    batch_reward_bench_instances: list[RewardBenchInstance],
) -> tuple[list[bool], list[dict[str, Any]]]
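A minimal usage sketch; the checkpoint name is a placeholder for any Hugging Face sequence classification reward model whose tokenizer defines a chat template:

from flexeval import RewardBenchInstance  # assumed import path

reward_model = SequenceClassificationRewardModel(model="my-org/my-reward-model")  # hypothetical checkpoint

instance = RewardBenchInstance(
    prompt=[{"role": "user", "content": "Summarize photosynthesis in one sentence."}],
    chosen=[{"role": "assistant", "content": "Plants turn light, water, and CO2 into sugar and oxygen."}],
    rejected=[{"role": "assistant", "content": "Photosynthesis is how animals digest food."}],
)
chosen_is_better, outputs = reward_model.batch_judge([instance])
# outputs[0] -> {"chosen_reward": <float>, "rejected_reward": <float>}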