Evaluation setup for chat response generation.

In this setup, the model receives context in a chat format like:

```json
[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "The capital of France is Paris."}
]
```
Source code in `flexeval/core/eval_setups.py`

````python
@dataclass
class ChatResponse(EvalSetup):
    """
    Evaluation setup for chat response generation.

    In this setup, the model receives context in a chat format like:

    ```json
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."}
    ]
    ```
    """

    eval_dataset: ChatDataset
    gen_kwargs: dict[str, Any]
    few_shot_generator: FewShotGenerator | None = None
    metrics: list[Metric] | Metric | None = None
    batch_size: int = 4
    max_instances: int | None = None

    def evaluate_lm(
        self,
        language_model: LanguageModel,
    ) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
        metrics = self.metrics or []
        if isinstance(metrics, Metric):
            metrics = [metrics]
        return evaluate_chat_response(
            language_model=language_model,
            gen_kwargs=self.gen_kwargs,
            eval_dataset=self.eval_dataset,
            metrics=metrics,
            batch_size=self.batch_size,
            max_instances=self.max_instances,
            few_shot_generator=self.few_shot_generator,
        )
````
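For orientation, a minimal construction sketch follows. The objects `my_chat_dataset` and `my_metric` are hypothetical placeholders for whatever `ChatDataset` and `Metric` implementations your project configures, and the `gen_kwargs` keys depend on the `LanguageModel` implementation in use:

```python
# Hypothetical sketch: my_chat_dataset and my_metric stand in for
# concrete ChatDataset / Metric implementations from your own setup.
setup = ChatResponse(
    eval_dataset=my_chat_dataset,
    gen_kwargs={"max_new_tokens": 256},  # forwarded to the model's generation call
    metrics=my_metric,                   # a single Metric or a list of Metrics
    batch_size=8,
    max_instances=100,                   # cap evaluation at the first 100 instances
)
```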
Attributes:

- `eval_dataset: ChatDataset` (instance attribute)
- `gen_kwargs: dict[str, Any]` (instance attribute)
- `few_shot_generator: FewShotGenerator | None = None` (class attribute, instance attribute)
- `metrics: list[Metric] | Metric | None = None` (class attribute, instance attribute)
- `batch_size: int = 4` (class attribute, instance attribute)
- `max_instances: int | None = None` (class attribute, instance attribute)
__init__

```python
__init__(
    eval_dataset: ChatDataset,
    gen_kwargs: dict[str, Any],
    few_shot_generator: FewShotGenerator | None = None,
    metrics: list[Metric] | Metric | None = None,
    batch_size: int = 4,
    max_instances: int | None = None,
) -> None
```
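Because `evaluate_lm` normalizes `metrics` before use, a single `Metric` and a one-element list are interchangeable at construction time. A quick sketch, with `my_chat_dataset` and `my_metric` as hypothetical placeholders:

```python
# Equivalent after normalization in evaluate_lm; my_metric is a
# hypothetical Metric instance used only for illustration.
ChatResponse(eval_dataset=my_chat_dataset, gen_kwargs={}, metrics=my_metric)
ChatResponse(eval_dataset=my_chat_dataset, gen_kwargs={}, metrics=[my_metric])
```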
evaluate_lm

Run generation over the evaluation dataset and compute the configured metrics.

```python
evaluate_lm(
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]
```
Source code in `flexeval/core/eval_setups.py`

```python
def evaluate_lm(
    self,
    language_model: LanguageModel,
) -> tuple[dict[str, float], list[dict[str, Any]] | None]:
    metrics = self.metrics or []
    if isinstance(metrics, Metric):
        metrics = [metrics]
    return evaluate_chat_response(
        language_model=language_model,
        gen_kwargs=self.gen_kwargs,
        eval_dataset=self.eval_dataset,
        metrics=metrics,
        batch_size=self.batch_size,
        max_instances=self.max_instances,
        few_shot_generator=self.few_shot_generator,
    )
```
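The return value pairs aggregated scores with optional per-instance outputs. A hedged sketch of consuming it, reusing `setup` from the construction example above and a hypothetical `my_language_model`:

```python
scores, instance_outputs = setup.evaluate_lm(my_language_model)

# scores: dict[str, float], one entry per aggregated metric value
for name, value in scores.items():
    print(f"{name}: {value:.3f}")

# instance_outputs: list[dict[str, Any]] | None, per-example details
if instance_outputs is not None:
    print(f"evaluated {len(instance_outputs)} instances")
```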