EvalSetup

Abstract class to give evaluation functions a common interface.

Source code in flexeval/core/eval_setups.py
class EvalSetup(ABC):
    """Abstract class to give evaluation functions a common interface."""

    @abstractmethod
    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        pass

evaluate_lm abstractmethod

evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
@abstractmethod
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    pass
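
Concrete setups subclass EvalSetup and implement evaluate_lm. Below is a minimal sketch of a custom setup, assuming only the import path shown above; the dummy metric and the prompts attribute are illustrative placeholders, not flexeval APIs.

```python
from __future__ import annotations  # keep annotations lazy; LanguageModel is not imported here

from dataclasses import dataclass
from typing import Any

from flexeval.core.eval_setups import EvalSetup


@dataclass
class PromptCountSetup(EvalSetup):
    """Toy setup that reports a single dummy metric without running the model."""

    prompts: list[str]

    def evaluate_lm(
        self,
        language_model: "LanguageModel",  # a flexeval LanguageModel instance
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        # A real setup would generate or score with `language_model` here.
        metrics = {"num_prompts": float(len(self.prompts))}
        per_instance_outputs = None  # the second element of the tuple may be None
        return metrics, per_instance_outputs
```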

ChatResponse dataclass

Evaluation setup for chat response generation. In this setup, the model receives context in a chat format like:

[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "The capital of France is Paris."}
]

Source code in flexeval/core/eval_setups.py
@dataclass
class ChatResponse(EvalSetup):
    """
    Evaluation setup for chat response generation.
    In this setup, the model receives context in a chat format like:
    ```json
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."}
    ]
    ```
    """

    eval_dataset: ChatDataset
    gen_kwargs: dict[str, Any]
    few_shot_generator: FewShotGenerator | None = None
    metrics: list[Metric] | Metric | None = None
    batch_size: int = 4
    max_instances: int | None = None

    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        metrics = self.metrics or []
        if isinstance(metrics, Metric):
            metrics = [metrics]

        return evaluate_chat_response(
            language_model=language_model,
            gen_kwargs=self.gen_kwargs,
            eval_dataset=self.eval_dataset,
            metrics=metrics,
            batch_size=self.batch_size,
            max_instances=self.max_instances,
            few_shot_generator=self.few_shot_generator,
        )

eval_dataset instance-attribute

eval_dataset: ChatDataset

gen_kwargs instance-attribute

gen_kwargs: dict[str, Any]

few_shot_generator class-attribute instance-attribute

few_shot_generator: FewShotGenerator | None = None

metrics class-attribute instance-attribute

metrics: list[Metric] | Metric | None = None

batch_size class-attribute instance-attribute

batch_size: int = 4

max_instances class-attribute instance-attribute

max_instances: int | None = None

__init__

__init__(
    eval_dataset: ChatDataset,
    gen_kwargs: dict[str, Any],
    few_shot_generator: FewShotGenerator | None = None,
    metrics: list[Metric] | Metric | None = None,
    batch_size: int = 4,
    max_instances: int | None = None,
) -> None
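
A usage sketch based on the constructor above, assuming `chat_dataset` (a ChatDataset) and `model` (a LanguageModel) have already been built from flexeval components; those variable names and the gen_kwargs keys are placeholders.

```python
from flexeval.core.eval_setups import ChatResponse

setup = ChatResponse(
    eval_dataset=chat_dataset,           # any ChatDataset implementation
    gen_kwargs={"max_new_tokens": 256},  # keys depend on the LanguageModel used
    batch_size=8,
)
metrics, instance_outputs = setup.evaluate_lm(model)
# `metrics` is a dict[str, float] of aggregated scores;
# `instance_outputs` holds per-example records (or None).
```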

evaluate_lm

evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    metrics = self.metrics or []
    if isinstance(metrics, Metric):
        metrics = [metrics]

    return evaluate_chat_response(
        language_model=language_model,
        gen_kwargs=self.gen_kwargs,
        eval_dataset=self.eval_dataset,
        metrics=metrics,
        batch_size=self.batch_size,
        max_instances=self.max_instances,
        few_shot_generator=self.few_shot_generator,
    )

Generation dataclass

Evaluation setup for text generation. The model receives a prompt in a plain text format and generates its continuation.

Source code in flexeval/core/eval_setups.py
@dataclass
class Generation(EvalSetup):
    """
    Evaluation setup for text generation.
    The model receives a prompt in a plain text format and generates its continuation.
    """

    eval_dataset: GenerationDataset
    prompt_template: PromptTemplate | str
    gen_kwargs: dict[str, Any]
    few_shot_generator: FewShotGenerator | None = None
    metrics: list[Metric] | Metric | None = None
    batch_size: int = 4
    max_instances: int | None = None

    def __post_init__(self) -> None:
        if isinstance(self.prompt_template, str):
            self.prompt_template = instantiate_prompt_template_from_string(self.prompt_template)

    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        metrics = self.metrics or []
        if isinstance(metrics, Metric):
            metrics = [metrics]

        return evaluate_generation(
            language_model=language_model,
            gen_kwargs=self.gen_kwargs,
            eval_dataset=self.eval_dataset,
            prompt_template=self.prompt_template,
            few_shot_generator=self.few_shot_generator,
            metrics=metrics,
            batch_size=self.batch_size,
            max_instances=self.max_instances,
        )

eval_dataset instance-attribute

eval_dataset: GenerationDataset

prompt_template instance-attribute

prompt_template: PromptTemplate | str

gen_kwargs instance-attribute

gen_kwargs: dict[str, Any]

few_shot_generator class-attribute instance-attribute

few_shot_generator: FewShotGenerator | None = None

metrics class-attribute instance-attribute

metrics: list[Metric] | Metric | None = None

batch_size class-attribute instance-attribute

batch_size: int = 4

max_instances class-attribute instance-attribute

max_instances: int | None = None

__init__

__init__(
    eval_dataset: GenerationDataset,
    prompt_template: PromptTemplate | str,
    gen_kwargs: dict[str, Any],
    few_shot_generator: FewShotGenerator | None = None,
    metrics: list[Metric] | Metric | None = None,
    batch_size: int = 4,
    max_instances: int | None = None,
) -> None
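
A usage sketch, assuming `generation_dataset` (a GenerationDataset) and `model` (a LanguageModel) exist; the placeholder syntax inside the template string is an assumption about how PromptTemplate renders fields.

```python
from flexeval.core.eval_setups import Generation

setup = Generation(
    eval_dataset=generation_dataset,
    # A plain string is converted to a PromptTemplate in __post_init__.
    prompt_template="Question: {{ question }}\nAnswer:",
    gen_kwargs={"max_new_tokens": 64},  # keys depend on the LanguageModel used
)
metrics, instance_outputs = setup.evaluate_lm(model)
```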

__post_init__

__post_init__() -> None
Source code in flexeval/core/eval_setups.py
def __post_init__(self) -> None:
    if isinstance(self.prompt_template, str):
        self.prompt_template = instantiate_prompt_template_from_string(self.prompt_template)

evaluate_lm

evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    metrics = self.metrics or []
    if isinstance(metrics, Metric):
        metrics = [metrics]

    return evaluate_generation(
        language_model=language_model,
        gen_kwargs=self.gen_kwargs,
        eval_dataset=self.eval_dataset,
        prompt_template=self.prompt_template,
        few_shot_generator=self.few_shot_generator,
        metrics=metrics,
        batch_size=self.batch_size,
        max_instances=self.max_instances,
    )

MultipleChoice dataclass

Evaluation setup for multiple choice questions. The model receives a prompt and a list of choices and selects the answer with the highest probability.

Source code in flexeval/core/eval_setups.py
@dataclass
class MultipleChoice(EvalSetup):
    """
    Evaluation setup for multiple choice questions.
    The model receives a prompt and a list of choices and selects the answer with the highest probability.
    """

    eval_dataset: MultipleChoiceDataset
    prompt_template: PromptTemplate | str
    few_shot_generator: FewShotGenerator | None = None
    batch_size: int = 4
    max_instances: int | None = None

    def __post_init__(self) -> None:
        if isinstance(self.prompt_template, str):
            self.prompt_template = instantiate_prompt_template_from_string(self.prompt_template)

    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        return evaluate_multiple_choice(
            language_model=language_model,
            eval_dataset=self.eval_dataset,
            prompt_template=self.prompt_template,
            few_shot_generator=self.few_shot_generator,
            batch_size=self.batch_size,
            max_instances=self.max_instances,
        )

eval_dataset instance-attribute

eval_dataset: MultipleChoiceDataset

prompt_template instance-attribute

prompt_template: PromptTemplate | str

few_shot_generator class-attribute instance-attribute

few_shot_generator: FewShotGenerator | None = None

batch_size class-attribute instance-attribute

batch_size: int = 4

max_instances class-attribute instance-attribute

max_instances: int | None = None

__init__

__init__(
    eval_dataset: MultipleChoiceDataset,
    prompt_template: PromptTemplate | str,
    few_shot_generator: FewShotGenerator | None = None,
    batch_size: int = 4,
    max_instances: int | None = None,
) -> None
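
A usage sketch, assuming `mc_dataset` (a MultipleChoiceDataset) and `model` (a LanguageModel) exist; as with Generation, a string prompt_template is converted in __post_init__, and the placeholder name inside it is an assumption.

```python
from flexeval.core.eval_setups import MultipleChoice

setup = MultipleChoice(
    eval_dataset=mc_dataset,
    prompt_template="{{ question }}\nAnswer:",
    batch_size=8,
)
metrics, instance_outputs = setup.evaluate_lm(model)
```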

__post_init__

__post_init__() -> None
Source code in flexeval/core/eval_setups.py
def __post_init__(self) -> None:
    if isinstance(self.prompt_template, str):
        self.prompt_template = instantiate_prompt_template_from_string(self.prompt_template)

evaluate_lm

evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    return evaluate_multiple_choice(
        language_model=language_model,
        eval_dataset=self.eval_dataset,
        prompt_template=self.prompt_template,
        few_shot_generator=self.few_shot_generator,
        batch_size=self.batch_size,
        max_instances=self.max_instances,
    )

Perplexity dataclass

Evaluation setup for perplexity. The model receives plain text and computes the perplexity of the text.

Source code in flexeval/core/eval_setups.py
@dataclass
class Perplexity(EvalSetup):
    """
    Evaluation setup for perplexity.
    The model receives plain text and computes the perplexity of the text.
    """

    eval_dataset: TextDataset
    batch_size: int = 4
    tokenizer: Tokenizer | None = None
    max_instances: int | None = None

    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        metrics = evaluate_perplexity(
            language_model=language_model,
            eval_dataset=self.eval_dataset,
            batch_size=self.batch_size,
            tokenizer=self.tokenizer,
            max_instances=self.max_instances,
        )
        return metrics, None

eval_dataset instance-attribute

eval_dataset: TextDataset

batch_size class-attribute instance-attribute

batch_size: int = 4

tokenizer class-attribute instance-attribute

tokenizer: Tokenizer | None = None

max_instances class-attribute instance-attribute

max_instances: int | None = None

__init__

__init__(
    eval_dataset: TextDataset,
    batch_size: int = 4,
    tokenizer: Tokenizer | None = None,
    max_instances: int | None = None,
) -> None
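
A usage sketch, assuming `text_dataset` (a TextDataset) and `model` (a LanguageModel) exist; the optional tokenizer argument is simply passed through to evaluate_perplexity.

```python
from flexeval.core.eval_setups import Perplexity

setup = Perplexity(
    eval_dataset=text_dataset,
    max_instances=1000,  # cap the number of evaluated texts
)
metrics, instance_outputs = setup.evaluate_lm(model)
# Perplexity always returns (metrics, None): there are no per-instance outputs.
```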

evaluate_lm

evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    metrics = evaluate_perplexity(
        language_model=language_model,
        eval_dataset=self.eval_dataset,
        batch_size=self.batch_size,
        tokenizer=self.tokenizer,
        max_instances=self.max_instances,
    )
    return metrics, None