Evaluation setup for chat response generation.
In this setup, the model receives context in a chat format like:
```json
[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "The capital of France is Paris."}
]
```
Source code in flexeval/core/eval_setups.py
````python
@dataclass
class ChatResponse(EvalSetup):
    """
    Evaluation setup for chat response generation.

    In this setup, the model receives context in a chat format like:

    ```json
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."}
    ]
    ```
    """

    eval_dataset: ChatDataset
    gen_kwargs: dict[str, Any] = field(default_factory=dict)
    few_shot_generator: FewShotGenerator | None = None
    metrics: list[Metric] | Metric | None = None
    batch_size: int = 4
    max_instances: int | None = None
    random_seed: int = 42

    def evaluate_lm(
        self,
        language_model: LanguageModel,
        **kwargs,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        metrics = self.metrics or []
        if isinstance(metrics, Metric):
            metrics = [metrics]
        metrics += [FinishReasonCount(), OutputLengthStats()]
        language_model.set_random_seed(self.random_seed)
        return evaluate_chat_response(
            language_model=language_model,
            gen_kwargs=self.gen_kwargs,
            eval_dataset=self.eval_dataset,
            metrics=metrics,
            batch_size=self.batch_size,
            max_instances=self.max_instances,
            few_shot_generator=self.few_shot_generator,
            cleanup_after_generation=kwargs.pop("cleanup_after_generation", False),
        )
````
eval_dataset
instance-attribute
eval_dataset: ChatDataset
gen_kwargs
class-attribute
instance-attribute
gen_kwargs: dict[str, Any] = field(default_factory=dict)
few_shot_generator
class-attribute
instance-attribute
few_shot_generator: FewShotGenerator | None = None
metrics
class-attribute
instance-attribute
metrics: list[Metric] | Metric | None = None
batch_size
class-attribute
instance-attribute
batch_size: int = 4
max_instances
class-attribute
instance-attribute
max_instances: int | None = None
random_seed
class-attribute
instance-attribute
random_seed: int = 42
__init__
__init__(
    eval_dataset: ChatDataset,
    gen_kwargs: dict[str, Any] = dict(),
    few_shot_generator: FewShotGenerator | None = None,
    metrics: list[Metric] | Metric | None = None,
    batch_size: int = 4,
    max_instances: int | None = None,
    random_seed: int = 42,
) -> None
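A minimal construction sketch is shown below, assuming a concrete ChatDataset and LanguageModel are already available; `my_chat_dataset` and `my_language_model` are placeholders for illustration, not names defined by flexeval, and the gen_kwargs keys depend on the model backend in use.

```python
# Sketch only: my_chat_dataset / my_language_model are placeholder objects,
# standing in for any ChatDataset and LanguageModel implementations.
from flexeval.core.eval_setups import ChatResponse

setup = ChatResponse(
    eval_dataset=my_chat_dataset,        # chat-format evaluation instances
    gen_kwargs={"max_new_tokens": 256},  # forwarded to generation; valid keys depend on the model backend
    batch_size=8,                        # overrides the default of 4
    max_instances=100,                   # evaluate at most 100 instances
)

# Returns (aggregate metric values, per-instance outputs or None).
metrics_summary, outputs = setup.evaluate_lm(my_language_model)
```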
evaluate_lm
evaluate_lm(
    language_model: LanguageModel, **kwargs
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
Source code in flexeval/core/eval_setups.py
```python
def evaluate_lm(
    self,
    language_model: LanguageModel,
    **kwargs,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    metrics = self.metrics or []
    if isinstance(metrics, Metric):
        metrics = [metrics]
    metrics += [FinishReasonCount(), OutputLengthStats()]
    language_model.set_random_seed(self.random_seed)
    return evaluate_chat_response(
        language_model=language_model,
        gen_kwargs=self.gen_kwargs,
        eval_dataset=self.eval_dataset,
        metrics=metrics,
        batch_size=self.batch_size,
        max_instances=self.max_instances,
        few_shot_generator=self.few_shot_generator,
        cleanup_after_generation=kwargs.pop("cleanup_after_generation", False),
    )
```
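As the listing shows, evaluate_lm always appends FinishReasonCount() and OutputLengthStats() to the configured metrics, seeds the model with random_seed, and forwards only the cleanup_after_generation keyword (popped from **kwargs, defaulting to False) to evaluate_chat_response. A hedged call sketch, reusing the placeholder objects from the construction example above:

```python
# Sketch only: `setup` and `my_language_model` are the placeholders from above.
metrics_summary, outputs = setup.evaluate_lm(
    my_language_model,
    cleanup_after_generation=True,  # the only **kwargs key this method reads
)

# The summary always includes finish-reason counts and output-length statistics
# alongside any metrics configured on the setup.
print(metrics_summary)
```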