Metric

Base class for metrics.

Source code in flexeval/core/metric/base.py
class Metric(ABC):
    """
    Base class for metrics.
    """

    @abstractmethod
    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        """
        Evaluate the outputs of `LanguageModel` against the references.

        Args:
            lm_outputs: List of model outputs.
            references_list: List of reference outputs.
            task_inputs_list: List of task inputs.
        """
        raise NotImplementedError

evaluate abstractmethod

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult

Evaluate the outputs of LanguageModel against the references.

Parameters:

  • lm_outputs (list[str]) –

    List of model outputs.

  • references_list (list[list[str]]) –

    List of reference outputs.

  • task_inputs_list (list[dict[str, str]] | None, default: None ) –

    List of task inputs.

Source code in flexeval/core/metric/base.py
@abstractmethod
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    """
    Evaluate the outputs of `LanguageModel` against the references.

    Args:
        lm_outputs: List of model outputs.
        references_list: List of reference outputs.
        task_inputs_list: List of task inputs.
    """
    raise NotImplementedError
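
To define a custom metric, subclass Metric and implement evaluate so that it returns a MetricResult. The sketch below is a hypothetical word-count metric; it assumes Metric and MetricResult can be imported from the package root, as the other examples on this page do for flexeval classes.

from flexeval import Metric, MetricResult  # assumed to be re-exported at the package root

class AverageWordCount(Metric):
    """A hypothetical metric: the average number of whitespace-separated words per output."""

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # references_list and task_inputs_list are ignored by this toy metric
        counts = [len(output.split()) for output in lm_outputs]
        return MetricResult(
            summary={"average_word_count": sum(counts) / len(counts)},
            instance_details=[{"word_count": c} for c in counts],
        )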

MetricResult dataclass

A dataclass representing the result of a metric evaluation.

Source code in flexeval/core/metric/base.py
@dataclass
class MetricResult:
    """
    A dataclass representing the result of a metric evaluation.
    """

    summary: dict[str, Any]
    """
    Summary containing aggregated metric values.
    """
    instance_details: list[dict[str, Any]] | None = None
    """
    A list of evaluation details for each instance.
    Useful for error analysis.
    """

summary instance-attribute

summary: dict[str, Any]

Summary containing aggregated metric values.

instance_details class-attribute instance-attribute

instance_details: list[dict[str, Any]] | None = None

A list of evaluation details for each instance. Useful for error analysis.

__init__

__init__(
    summary: dict[str, Any],
    instance_details: list[dict[str, Any]] | None = None,
) -> None
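
A MetricResult can also be constructed directly, for example when aggregating scores outside a Metric. A minimal sketch; the "accuracy" metric name is only illustrative, and the import again assumes the class is re-exported at the package root.

from flexeval import MetricResult  # assumed to be re-exported at the package root

result = MetricResult(
    summary={"accuracy": 0.5},
    instance_details=[{"accuracy": 1.0}, {"accuracy": 0.0}],
)
print(result.summary["accuracy"])   # 0.5
print(result.instance_details[1])   # {'accuracy': 0.0}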

BLEU

An implementation of BLEU. The calculation is based on the sacrebleu library.

Parameters:

  • tokenize_option (str | None, default: None ) –

    Tokenization option for sacrebleu. If None, sacrebleu will use the default tokenization.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

Examples:

>>> from flexeval import BLEU
>>> bleu = BLEU()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = bleu.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={
        'bleu_score': 1.0,
        'bleu_bp': 1.0,
        'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1},
        instance_details=[
            {'bleu_score': 1.0, 'bleu_bp': 1.0},
            {'bleu_score': 1.0, 'bleu_bp': 1.0}
        ]
    )
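
The category_key option adds category-wise mean sentence BLEU scores to the summary, computed from a key in the task inputs. A minimal sketch, assuming each task input carries a hypothetical "domain" key:

from flexeval import BLEU

bleu = BLEU(category_key="domain")
lm_outputs = ["I am a student .", "I am a teacher ."]
references_list = [["I am a student ."], ["I am a teacher ."]]
task_inputs_list = [{"domain": "essay"}, {"domain": "chat"}]
result = bleu.evaluate(lm_outputs, references_list, task_inputs_list=task_inputs_list)
# In addition to "bleu_score", "bleu_bp", and "bleu_signature", the summary should
# contain "sentence_bleu_score/essay" and "sentence_bleu_score/chat".
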
Source code in flexeval/core/metric/bleu.py
class BLEU(Metric):
    """An implementation of [BLEU](https://aclanthology.org/P02-1040/).
    The calculation is based on the [sacrebleu](https://github.com/mjpost/sacrebleu) library.

    Args:
        tokenize_option: Tokenization option for sacrebleu.
            If `None`, sacrebleu will use the default tokenization.
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import BLEU
        >>> bleu = BLEU()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = bleu.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={
                'bleu_score': 1.0,
                'bleu_bp': 1.0,
                'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1},
                instance_details=[
                    {'bleu_score': 1.0, 'bleu_bp': 1.0},
                    {'bleu_score': 1.0, 'bleu_bp': 1.0}
                ]
            )
    """

    def __init__(
        self,
        tokenize_option: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
        # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
        self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"lm_outputs and references_list must have the same length, "
                f"but got {len(lm_outputs)} and {len(references_list)}."
            )
            raise ValueError(msg)

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references_list = [
                [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
                for references in references_list
            ]

        # we need to restructure the references to match the format expected by sacrebleu
        max_num_refs = max(len(refs) for refs in references_list)
        references_for_sacrebleu: list[list[str]] = []
        for i in range(max_num_refs):
            set_of_references: list[str] = []
            for refs_for_source in references_list:
                if i < len(refs_for_source):
                    set_of_references.append(refs_for_source[i])
                else:
                    set_of_references.append("")
            references_for_sacrebleu.append(set_of_references)

        bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
        sentence_bleu_list = [
            self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
        ]

        summary = {
            "bleu_score": bleu.score / 100,
            "bleu_bp": bleu.bp,
            "bleu_signature": self._corpus_bleu.get_signature(),
        }

        if self.category_key:
            categories = [task_input[self.category_key] for task_input in task_inputs_list]
            sentence_bleu_score_list = [b.score / 100 for b in sentence_bleu_list]
            category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"sentence_bleu_score/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"bleu_score": b.score / 100, "bleu_bp": b.bp} for b in sentence_bleu_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/bleu.py
def __init__(
    self,
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
    # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
    self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    if isinstance(reference_processor, StringProcessor):
        reference_processor = [reference_processor]

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/bleu.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"lm_outputs and references_list must have the same length, "
            f"but got {len(lm_outputs)} and {len(references_list)}."
        )
        raise ValueError(msg)

    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    if self.reference_processors:
        references_list = [
            [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
            for references in references_list
        ]

    # we need to restructure the references to match the format expected by sacrebleu
    max_num_refs = max(len(refs) for refs in references_list)
    references_for_sacrebleu: list[list[str]] = []
    for i in range(max_num_refs):
        set_of_references: list[str] = []
        for refs_for_source in references_list:
            if i < len(refs_for_source):
                set_of_references.append(refs_for_source[i])
            else:
                set_of_references.append("")
        references_for_sacrebleu.append(set_of_references)

    bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
    sentence_bleu_list = [
        self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
    ]

    summary = {
        "bleu_score": bleu.score / 100,
        "bleu_bp": bleu.bp,
        "bleu_signature": self._corpus_bleu.get_signature(),
    }

    if self.category_key:
        categories = [task_input[self.category_key] for task_input in task_inputs_list]
        sentence_bleu_score_list = [b.score / 100 for b in sentence_bleu_list]
        category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"sentence_bleu_score/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"bleu_score": b.score / 100, "bleu_bp": b.bp} for b in sentence_bleu_list],
    )

CharF1

A metric that calculates how many characters in the output string are included in the characters of the expected output. If there are multiple expected outputs, the highest score is adopted.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to apply to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

Examples:

>>> from flexeval import CharF1
>>> char_f1 = CharF1()
>>> lm_outputs = ["abcd", "efgh"]
>>> references_list = [["abcd", "ABCD"], ["efGH"]]
>>> result = char_f1.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
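
Because the score is a character-level similarity (fuzz.ratio scaled to [0, 1]) and the best score over the references is adopted, partially matching outputs still receive credit. A small sketch with an approximate expected value:

from flexeval import CharF1

char_f1 = CharF1()
result = char_f1.evaluate(["2024-01-01"], [["2024-01-02", "January 1, 2024"]])
# The first reference is the closer match, so its similarity (roughly 0.9) is adopted
# for instance_details[0]["char_f1"].
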
Source code in flexeval/core/metric/char_f1.py
class CharF1(Metric):
    """
    A metric that calculates how many characters in the output string are included
    in the characters of the expected output.
    If there are multiple expected outputs, the highest score is adopted.

    Args:
        lm_output_processor: StringProcessor or a list of StringProcessor to apply to the model outputs before comparison.
        reference_processor: StringProcessor or a list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import CharF1
        >>> char_f1 = CharF1()
        >>> lm_outputs = ["abcd", "efgh"]
        >>> references_list = [["abcd", "ABCD"], ["efGH"]]
        >>> result = char_f1.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references_list = [
                [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
                for references in references_list
            ]

        char_f1_scores: list[float] = []
        for lm_output, expected_output in zip(lm_outputs, references_list):
            score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
            char_f1_scores.append(score)

        summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

        if self.category_key:
            categories = [task_input[self.category_key] for task_input in task_inputs_list]
            category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"char_f1/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"char_f1": s} for s in char_f1_scores],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/char_f1.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    if isinstance(reference_processor, StringProcessor):
        reference_processor = [reference_processor]

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/char_f1.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    if self.reference_processors:
        references_list = [
            [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
            for references in references_list
        ]

    char_f1_scores: list[float] = []
    for lm_output, expected_output in zip(lm_outputs, references_list):
        score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
        char_f1_scores.append(score)

    summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

    if self.category_key:
        categories = [task_input[self.category_key] for task_input in task_inputs_list]
        category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"char_f1/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"char_f1": s} for s in char_f1_scores],
    )

CodeEval

A metric that evaluates generated code with test cases.

Parameters:

  • code_template (str | None, default: None ) –

    A Jinja2 template string to make the generated code. The template can contain variables from task inputs. If None, the code prompt will be the generated text itself.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    String processors applied to model outputs before evaluation.

  • evaluate_module (str, default: 'code_eval' ) –

    An evaluate module to use.

Examples:

>>> from flexeval import CodeEval
>>> code_eval = CodeEval()
>>> lm_outputs = ["def add(a, b):\n    return a + b", "def is_equal(a, b):\n    return a = b"]
>>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
>>> result = code_eval.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'pass@1': 0.5},
    instance_details=[
        {'passed': True, 'result': 'passed'},
        {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
    ]
)
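
The code_template option is useful when the model generates only part of a program and the rest comes from the task inputs. A minimal sketch, where the "prompt" key in the task inputs is a hypothetical field holding the function header:

from flexeval import CodeEval

code_eval = CodeEval(code_template="{{ prompt }}{{ lm_output }}")
lm_outputs = ["    return a + b"]
references_list = [["assert add(1, 2) == 3"]]
task_inputs_list = [{"prompt": "def add(a, b):\n"}]
result = code_eval.evaluate(lm_outputs, references_list, task_inputs_list=task_inputs_list)
# The rendered code is the prompt concatenated with the model output, and the
# reference assertions are appended as the test case.
# Note: the underlying `code_eval` evaluate module may require HF_ALLOW_CODE_EVAL=1
# to be set in the environment before it will execute generated code.
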
Source code in flexeval/core/metric/code_eval.py
class CodeEval(Metric):
    """
    A metric that evaluates generated code with test cases.

    Args:
        code_template: A Jinja2 template string to make the generated code.
            The template can contain variables from task inputs.
            If `None`, the code prompt will be the generated text itself.
        lm_output_processor: String processors applied to model outputs before evaluation.
        evaluate_module: An evaluate module to use.

    Examples:
        >>> from flexeval import CodeEval
        >>> code_eval = CodeEval()
        >>> lm_outputs = ["def add(a, b):\\n    return a + b", "def is_equal(a, b):\\n    return a = b"]
        >>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
        >>> result = code_eval.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'pass@1': 0.5},
            instance_details=[
                {'passed': True, 'result': 'passed'},
                {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
            ]
        )
    """

    def __init__(
        self,
        code_template: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        evaluate_module: str = "code_eval",
    ) -> None:
        if code_template is None:
            code_template = "{{ lm_output }}"

        self.code_template = JINJA2_ENV.from_string(code_template)
        self.code_eval = evaluate.load(evaluate_module)

        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        generated_code_list: list[str] = []
        test_case_list: list[str] = []
        # in code generation tasks, references_list contains the test cases
        for lm_output, task_inputs, test_cases in zip(
            lm_outputs,
            task_inputs_list,
            references_list,
        ):
            generated_code = self.code_template.render(lm_output=lm_output, **task_inputs)
            generated_code_list.append(generated_code)
            test_case_list.append("\n".join(test_cases))
        pass_at_k, results = self.code_eval.compute(
            references=test_case_list,
            predictions=[[c] for c in generated_code_list],
            k=[1],
        )

        # `results` contains the detailed results for each test case
        # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
        results: dict[int, list[tuple[int, dict[str, Any]]]]

        instance_details: list[dict[str, Any]] = []
        for i in range(len(lm_outputs)):
            first_result = results[i][0]  # we only assume one candidate code per instance, so we take the first result
            _, detail_result = first_result  # the first element is just the index so we ignore it
            # remove unnecessary fields to save space
            detail_result.pop("completion_id")
            detail_result.pop("task_id")
            instance_details.append(detail_result)

        return MetricResult(pass_at_k, instance_details=instance_details)

code_template instance-attribute

code_template = from_string(code_template)

code_eval instance-attribute

code_eval = load(evaluate_module)

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    code_template: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    evaluate_module: str = "code_eval",
) -> None
Source code in flexeval/core/metric/code_eval.py
def __init__(
    self,
    code_template: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    evaluate_module: str = "code_eval",
) -> None:
    if code_template is None:
        code_template = "{{ lm_output }}"

    self.code_template = JINJA2_ENV.from_string(code_template)
    self.code_eval = evaluate.load(evaluate_module)

    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/code_eval.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]

    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    generated_code_list: list[str] = []
    test_case_list: list[str] = []
    # in code generation tasks, references_list contains the test cases
    for lm_output, task_inputs, test_cases in zip(
        lm_outputs,
        task_inputs_list,
        references_list,
    ):
        generated_code = self.code_template.render(lm_output=lm_output, **task_inputs)
        generated_code_list.append(generated_code)
        test_case_list.append("\n".join(test_cases))
    pass_at_k, results = self.code_eval.compute(
        references=test_case_list,
        predictions=[[c] for c in generated_code_list],
        k=[1],
    )

    # `results` contains the detailed results for each test case
    # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
    results: dict[int, list[tuple[int, dict[str, Any]]]]

    instance_details: list[dict[str, Any]] = []
    for i in range(len(lm_outputs)):
        first_result = results[i][0]  # we only assume one candidate code per instance, so we take the first result
        _, detail_result = first_result  # the first element is just the index so we ignore it
        # remove unnecessary fields to save space
        detail_result.pop("completion_id")
        detail_result.pop("task_id")
        instance_details.append(detail_result)

    return MetricResult(pass_at_k, instance_details=instance_details)

CommonPrefixLength

A metric that calculates the length of the longest common prefix between the model output and the reference.

Examples:

>>> from flexeval import CommonPrefixLength
>>> common_prefix_length = CommonPrefixLength()
>>> lm_outputs = ["ABCDEFG"]
>>> references_list = [["ABCdefg"]]
>>> result = common_prefix_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
    instance_details=[{"common_prefix_length": 3}],
)
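
When there are multiple references, the longest prefix match among them is used for each instance. A small sketch:

from flexeval import CommonPrefixLength

common_prefix_length = CommonPrefixLength()
result = common_prefix_length.evaluate(["ABCDEFG"], [["ABx", "ABCDEzz"]])
# The prefix shared with "ABCDEzz" ("ABCDE", length 5) is longer than the one shared
# with "ABx", so instance_details[0]["common_prefix_length"] == 5.
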
Source code in flexeval/core/metric/common_prefix_length.py
class CommonPrefixLength(Metric):
    """
    A metric that calculates the length of the longest common prefix between the model output and the reference.

    Examples:
        >>> from flexeval import CommonPrefixLength
        >>> common_prefix_length = CommonPrefixLength()
        >>> lm_outputs = ["ABCDEFG"]
        >>> references_list = [["ABCdefg"]]
        >>> result = common_prefix_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
            instance_details=[{"common_prefix_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        common_prefix_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
            common_prefix_length_list.append(common_prefix_length)

        return MetricResult(
            {
                "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
                "longest_common_prefix_length": max(common_prefix_length_list),
            },
            instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_prefix_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    common_prefix_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
        common_prefix_length_list.append(common_prefix_length)

    return MetricResult(
        {
            "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
            "longest_common_prefix_length": max(common_prefix_length_list),
        },
        instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
    )

CommonStringLength

A metric that calculates the length of the longest common substring between the model output and the reference.

Examples:

>>> from flexeval import CommonStringLength
>>> common_string_length = CommonStringLength()
>>> lm_outputs = ["aBCDEFG"]
>>> references_list = [["ABCDefg"]]
>>> result = common_string_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
    instance_details=[{"common_string_length": 3}],
)
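
Unlike CommonPrefixLength, the common substring may occur anywhere in the two strings. A small sketch:

from flexeval import CommonStringLength

common_string_length = CommonStringLength()
result = common_string_length.evaluate(["xxHELLOyy"], [["abHELLOcd"]])
# The longest common substring is "HELLO", so
# instance_details[0]["common_string_length"] == 5.
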
Source code in flexeval/core/metric/common_string_length.py
class CommonStringLength(Metric):
    """
    A metric that calculates the length of the longest common substring between the model output and the reference.

    Examples:
        >>> from flexeval import CommonStringLength
        >>> common_string_length = CommonStringLength()
        >>> lm_outputs = ["aBCDEFG"]
        >>> references_list = [["ABCDefg"]]
        >>> result = common_string_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
            instance_details=[{"common_string_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        common_string_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
            common_string_length_list.append(common_string_length)

        return MetricResult(
            {
                "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
                "longest_common_string_length": max(common_string_length_list),
            },
            instance_details=[{"common_string_length": s} for s in common_string_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_string_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    common_string_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
        common_string_length_list.append(common_string_length)

    return MetricResult(
        {
            "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
            "longest_common_string_length": max(common_string_length_list),
        },
        instance_details=[{"common_string_length": s} for s in common_string_length_list],
    )

Correlation

Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients. The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

Parameters:

  • method (Literal['pearson', 'spearman', 'kendall'], default: 'pearson' ) –

    The correlation method to use ('pearson', 'spearman', 'kendall').

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before computing the correlation. If a list is provided, the processors will be applied in order.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the references before computing the correlation. If a list is provided, the processors will be applied in order.

Examples:

>>> from flexeval import Correlation
>>> correlation = Correlation(method='pearson')
>>> lm_outputs = ["1", "2", "3", "4", "5"]
>>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
>>> result = correlation.evaluate(lm_outputs, references)
>>> print(result)
MetricResult(
    summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
    instance_details=[],
)
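
Other correlation methods are selected the same way, and non-numeric model outputs fall back to 0 with a warning (the references themselves must be convertible to float). A small sketch using Spearman rank correlation:

from flexeval import Correlation

correlation = Correlation(method="spearman")
lm_outputs = ["1", "2", "not a number", "4"]
references_list = [["1"], ["2"], ["3"], ["4"]]
result = correlation.evaluate(lm_outputs, references_list)
# "not a number" is treated as 0.0 (with a warning); the summary contains
# "spearman_correlation" and "spearman_pvalue".
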
Source code in flexeval/core/metric/correlation.py
class Correlation(Metric):
    """
    Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients.
    The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

    Args:
        method: The correlation method to use ('pearson', 'spearman', 'kendall').
        lm_output_processor: StringProcessor or a list of StringProcessor to be applied to the model outputs before
            computing the correlation. If a list is provided, the processors will be applied in order.
        reference_processor: StringProcessor or a list of StringProcessor to be applied to the references before
            computing the correlation. If a list is provided, the processors will be applied in order.

    Examples:
        >>> from flexeval import Correlation
        >>> correlation = Correlation(method='pearson')
        >>> lm_outputs = ["1", "2", "3", "4", "5"]
        >>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
        >>> result = correlation.evaluate(lm_outputs, references)
        >>> print(result)
        MetricResult(
            summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
            instance_details=[],
        )
    """

    def __init__(
        self,
        method: Literal["pearson", "spearman", "kendall"] = "pearson",
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        if method not in {"pearson", "spearman", "kendall"}:
            msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
            raise ValueError(msg)
        self.method = method

        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"Number of model outputs ({len(lm_outputs)}) and number of references ({len(references_list)}) "
                "should be the same."
            )
            raise ValueError(msg)

        # We only use the first reference here
        references = [refs[0] for refs in references_list]

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references = [
                functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references
            ]

        # The model output should be converted to float; if conversion fails, it is treated as 0
        lm_outputs_as_float: list[float] = []
        for output in lm_outputs:
            try:
                lm_outputs_as_float.append(float(output))
            except ValueError:  # noqa:PERF203
                warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
                lm_outputs_as_float.append(0.0)

        # The reference should be converted to float
        references_as_float = [float(ref) for ref in references]

        # Compute correlation
        if self.method == "pearson":
            correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
        elif self.method == "spearman":
            correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
        elif self.method == "kendall":
            correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
        else:
            msg = f"Unsupported method: {self.method}"
            raise ValueError(msg)

        return MetricResult(
            {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
            instance_details=[],
        )

method instance-attribute

method = method

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    method: Literal[
        "pearson", "spearman", "kendall"
    ] = "pearson",
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/correlation.py
def __init__(
    self,
    method: Literal["pearson", "spearman", "kendall"] = "pearson",
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    if method not in {"pearson", "spearman", "kendall"}:
        msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
        raise ValueError(msg)
    self.method = method

    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    if isinstance(reference_processor, StringProcessor):
        reference_processor = [reference_processor]
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/correlation.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"Number of model outputs ({len(lm_outputs)}) and number of references ({len(references_list)}) "
            "should be the same."
        )
        raise ValueError(msg)

    # We only use the first reference here
    references = [refs[0] for refs in references_list]

    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    if self.reference_processors:
        references = [
            functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references
        ]

    # The model output should be converted to float; if conversion fails, it is treated as 0
    lm_outputs_as_float: list[float] = []
    for output in lm_outputs:
        try:
            lm_outputs_as_float.append(float(output))
        except ValueError:  # noqa:PERF203
            warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
            lm_outputs_as_float.append(0.0)

    # The reference should be converted to float
    references_as_float = [float(ref) for ref in references]

    # Compute correlation
    if self.method == "pearson":
        correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
    elif self.method == "spearman":
        correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
    elif self.method == "kendall":
        correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
    else:
        msg = f"Unsupported method: {self.method}"
        raise ValueError(msg)

    return MetricResult(
        {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
        instance_details=[],
    )

ExactMatch

Exact match metric. If there are multiple references, the output is considered correct if it matches any of the references.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

Examples:

>>> from flexeval import ExactMatch
>>> exact_match = ExactMatch()
>>> lm_outputs = ["ABC", "DEF"]
>>> references_list = [["ABC"], ["DEFG"]]
>>> result = exact_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"exact_match": 0.5},
    instance_details=[{"exact_match": True}, {"exact_match": False}],
)
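
With multiple references, a single exact match is enough for an instance to count as correct. A small sketch:

from flexeval import ExactMatch

exact_match = ExactMatch()
result = exact_match.evaluate(["ABC"], [["abc", "ABC"]])
# "ABC" matches the second reference exactly, so
# result.summary == {"exact_match": 1.0}
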
Source code in flexeval/core/metric/exact_match.py
class ExactMatch(Metric):
    """
    Exact match metric.
    If there are multiple references, the output is considered correct if it matches any of the references.

    Args:
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import ExactMatch
        >>> exact_match = ExactMatch()
        >>> lm_outputs = ["ABC", "DEF"]
        >>> references_list = [["ABC"], ["DEFG"]]
        >>> result = exact_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"exact_match": 0.5},
            instance_details=[{"exact_match": True}, {"exact_match": False}],
        )
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"Number of model outputs ({len(lm_outputs)}) and number of references ({len(references_list)}) "
                "should be the same."
            )
            raise ValueError(msg)

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references_list = [
                [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
                for references in references_list
            ]

        exact_match_list = [
            lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
        ]
        summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

        if self.category_key:
            categories = [task_input[self.category_key] for task_input in task_inputs_list]
            category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"exact_match/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"exact_match": s} for s in exact_match_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/exact_match.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    if isinstance(reference_processor, StringProcessor):
        reference_processor = [reference_processor]

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/exact_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"Number of model outputs ({len(lm_outputs)}) and number of references ({len(references_list)}) "
            "should be the same."
        )
        raise ValueError(msg)

    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    if self.reference_processors:
        references_list = [
            [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
            for references in references_list
        ]

    exact_match_list = [
        lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
    ]
    summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

    if self.category_key:
        categories = [task_input[self.category_key] for task_input in task_inputs_list]
        category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"exact_match/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"exact_match": s} for s in exact_match_list],
    )

ChatLLMGEvalScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response. Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and calculates a weighted score over them. Note that, due to a constraint of OpenAI models, the number of valid scores must not exceed 20.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

  • prob_threshold (float, default: 0 ) –

    To handle cases where the probability mass on the valid scores is low, the score is returned as None (invalid) if the summed probability over all valid scores is less than this value.

Examples:

>>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.1509989197179789,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.06977498531341553,
                '2': -3.687819004058838,
                '3': -3.937819480895996,
                '4': -5.812800884246826,
                '5': -3.937807083129883
            },
            'llm_geval_score_generation_probs': {
                1: 0.932603645815178,
                2: 0.02502652531327666,
                3: 0.01949066821765914,
                4: 0.002989046364034347,
                5: 0.019490909859903
            }
        },
        {
            'llm_geval_score': 1.208961908628065,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.13043057918548584,
                '2': -2.8754935264587402,
                '3': -3.000467538833618,
                '4': -4.750283241271973,
                '5': -5.000345706939697
            },
            'llm_geval_score_generation_probs': {
                1: 0.8777174226922144,
                2: 0.05638830351569556,
                3: 0.04976379642068341,
                4: 0.008649245032977617,
                5: 0.006735618046639277
            }
        }
    ])
Source code in flexeval/core/metric/llm_geval_score.py
class ChatLLMGEvalScore(Metric):
    """A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.
    Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and
    computes a probability-weighted score over them.
    Note that, due to a constraint of the OpenAI API, the number of valid scores must not exceed 20.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.
        prob_threshold: A threshold on the total probability mass assigned to the valid scores.
            If the sum of the probabilities over all valid scores is less than this value,
            the score is treated as None (invalid).


    Examples:
        >>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.1509989197179789,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.06977498531341553,
                        '2': -3.687819004058838,
                        '3': -3.937819480895996,
                        '4': -5.812800884246826,
                        '5': -3.937807083129883
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.932603645815178,
                        2: 0.02502652531327666,
                        3: 0.01949066821765914,
                        4: 0.002989046364034347,
                        5: 0.019490909859903
                    }
                },
                {
                    'llm_geval_score': 1.208961908628065,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.13043057918548584,
                        '2': -2.8754935264587402,
                        '3': -3.000467538833618,
                        '4': -4.750283241271973,
                        '5': -5.000345706939697
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.8777174226922144,
                        2: 0.05638830351569556,
                        3: 0.04976379642068341,
                        4: 0.008649245032977617,
                        5: 0.006735618046639277
                    }
                }
            ])
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        system_message: str | PromptTemplate | None = None,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.system_message = system_message
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            task_inputs_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

system_message instance-attribute

system_message = system_message

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.system_message = system_message
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        task_inputs_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMGEvalScore

Lets a LanguageModel evaluate the output of another LanguageModel. Unlike LLMScore, this metric lets the model output logprobs for all valid scores and computes a probability-weighted score over them. Note that, due to a constraint of the OpenAI API, the number of valid scores must not exceed 20. For details, see https://aclanthology.org/2023.emnlp-main.153/

You can specify the evaluation criteria in PromptTemplate.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key used to compute category-wise mean scores in addition to the overall mean. The key is looked up in each instance's task inputs (see the sketch after this parameter list).

  • prob_threshold (float, default: 0 ) –

    A threshold on the total probability mass assigned to the valid scores. If the sum of the probabilities over all valid scores is less than this value, the score is treated as None (invalid).

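Since category_key behaves the same way across the metrics in this family, here is a minimal sketch of what category-wise averaging amounts to. The helper name is an assumption for illustration, not the library's summarize_evaluator_geval_scores.

from collections import defaultdict

def category_mean_scores(
    scores: list[float | None],
    task_inputs_list: list[dict[str, str]],
    category_key: str,
) -> dict[str, float]:
    """Illustrative sketch: mean of the valid scores within each category."""
    buckets: dict[str, list[float]] = defaultdict(list)
    for score, task_inputs in zip(scores, task_inputs_list):
        if score is None:
            continue  # failed parses are excluded from the averages
        category = task_inputs.get(category_key)
        if category is not None:
            buckets[category].append(score)
    return {category: sum(values) / len(values) for category, values in buckets.items()}

With task inputs such as {"category": "fluency"}, this yields one mean per category value in addition to the overall mean reported in the summary.
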
Examples:

>>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.418920817254956,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.0625,
                '2': -7.75,
                '3': -8.25,
                '4': -8.0625,
                '5': -6.4375
            },
            'llm_geval_score_generation_probs': {
                1: 0.017205950425851383,
                2: 0.00043074254057568753,
                3: 0.00026125855730166754,
                4: 0.000315137974737356,
                5: 0.0016004026902445643
            }
        },
        {
            'llm_geval_score': 1.461075369003141,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.25,
                '2': -8.1875,
                '3': -8.375,
                '4': -8.125,
                '5': -6.5
            },
            'llm_geval_score_generation_probs': {
                1: 0.014264233908999256,
                2: 0.00027810828659249914,
                3: 0.00023055986759244163,
                4: 0.0002960447300568554,
                5: 0.0015034391929775724
            }
        }
    ]
)
Source code in flexeval/core/metric/llm_geval_score.py
class LLMGEvalScore(Metric):
    """Let LanguageModel evaluate the output of another LanguageModel.
    Unlike LLMScore, this metric let the model output logprobs for all valid scores and
    calculate weighted score among them.
    Note that due to constraint for OpenAI models, the number of valid scores must not exceed 20.
    For detail, see https://aclanthology.org/2023.emnlp-main.153/

    You can specify the evaluation criteria in `PromptTemplate`.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.
        prob_threshold: A threshold on the total probability mass assigned to the valid scores.
            If the sum of the probabilities over all valid scores is less than this value,
            the score is treated as None (invalid).

    Examples:
        >>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.418920817254956,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.0625,
                        '2': -7.75,
                        '3': -8.25,
                        '4': -8.0625,
                        '5': -6.4375
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.017205950425851383,
                        2: 0.00043074254057568753,
                        3: 0.00026125855730166754,
                        4: 0.000315137974737356,
                        5: 0.0016004026902445643
                    }
                },
                {
                    'llm_geval_score': 1.461075369003141,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.25,
                        '2': -8.1875,
                        '3': -8.375,
                        '4': -8.125,
                        '5': -6.5
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.014264233908999256,
                        2: 0.00027810828659249914,
                        3: 0.00023055986759244163,
                        4: 0.0002960447300568554,
                        5: 0.0015034391929775724
                    }
                }
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            task_inputs_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        task_inputs_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMLabel

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points (scores) assigned to the labels in label_names. If omitted, the first label receives 1.0 and the remaining labels 0.0 (see the sketch after this parameter list).

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

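The default scoring of labels follows the rule described for label_points above; a minimal sketch (illustrative names only, not the library's internals) is:

def build_label_to_point(
    label_names: list[str],
    label_points: list[float] | None = None,
) -> dict[str, float]:
    """Illustrative sketch: map each label to its point value."""
    if label_points is None:
        # Default: the first label counts as 1.0, every other label as 0.0.
        label_points = [1.0] + [0.0] * (len(label_names) - 1)
    if len(label_names) != len(label_points):
        raise ValueError("The lengths of label_names and label_points do not match.")
    return dict(zip(label_names, (float(point) for point in label_points)))

# build_label_to_point(["Good", "Bad"]) == {"Good": 1.0, "Bad": 0.0}

The per-instance llm_score is then the point of the parsed label, and llm_label_distribution in the summary is the relative frequency of each label.
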
Examples:

>>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class ChatLLMLabel(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
        )

        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_label_list: list[str] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            task_inputs_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
    )

    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_label_list: list[str] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        task_inputs_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMLabel

Lets a LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last label found in the evaluator's output is used to compute the evaluation score, and you can assign a point value to each label. The final output is the average score and the distribution of labels (a minimal parsing sketch is shown after the parameter list below).

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points for each label specified in label_names.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

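For intuition, here is a minimal sketch of the "last label wins" parsing described above. The function name and the [[...]] convention follow the example prompt; they are illustrative assumptions rather than the library's exact parser.

import re

def parse_last_label(evaluator_output: str, label_names: list[str]) -> str | None:
    """Illustrative sketch: return the last bracketed label found in the evaluator output."""
    pattern = r"\[\[(" + "|".join(re.escape(label) for label in label_names) + r")\]\]"
    matches = re.findall(pattern, evaluator_output)
    return matches[-1] if matches else None

# parse_last_label("This text is natural, ... [[Good]]", ["Good", "Bad"]) == "Good"
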
Examples:

>>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
>>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class LLMLabel(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last label value found in the output of the evaluator is used to compute the evaluation score.
    You can assign a score to each label.
    The final output is the average score and the distribution of the labels.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
        >>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_label_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            task_inputs_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_label_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        task_inputs_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score falls outside this range, it is ignored (see the sketch after this parameter list).

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

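As the example prompt suggests, the evaluator is asked to end its answer with a bracketed score such as [[4]]. A minimal sketch of that parsing step (illustrative names, not the library's exact implementation) could look like this:

import re

def parse_bracketed_score(
    evaluator_output: str,
    valid_score_range: tuple[int, int] | None = None,
) -> int | None:
    """Illustrative sketch: extract the last [[N]] score, dropping out-of-range values."""
    matches = re.findall(r"\[\[(\d+)\]\]", evaluator_output)
    if not matches:
        return None
    score = int(matches[-1])
    if valid_score_range is not None:
        low, high = valid_score_range
        if not (low <= score <= high):
            return None
    return score

# parse_bracketed_score("... good but basic. [[4]]") == 4
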
Examples:

>>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
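
Because system_message is typed as str | PromptTemplate | None, it can also be supplied as a template object; for a fixed message this is equivalent to the plain-string form used above (illustration only):

>>> system_message = Jinja2PromptTemplate("This is the system message.")
>>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)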
Source code in flexeval/core/metric/llm_score.py
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
class ChatLLMScore(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_score_list: list[int] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            task_inputs_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template, self.system_message
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_score_list: list[int] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        task_inputs_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
380
381
382
383
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMScore

Let a LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last integer value in the output of the evaluator is used as the evaluation score.
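
For intuition, extracting "the last integer value" can be done with a regular expression, as in the minimal sketch below. This is an illustration only; the parsing used by this metric lives in parse_score_from_evaluator_output, which may handle edge cases and valid_score_range differently.

import re

def parse_last_int(text: str, valid_range: tuple[int, int] | None = None) -> int | None:
    """Return the last integer in `text`, or None if there is none or it is out of range."""
    matches = re.findall(r"-?\d+", text)
    if not matches:
        return None
    score = int(matches[-1])
    if valid_range is not None and not (valid_range[0] <= score <= valid_range[1]):
        return None
    return score

parse_last_int("Therefore, its quality is average. [[2]]", valid_range=(1, 5))  # -> 2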

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

Examples:

>>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMScore(language_model, prompt_template)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
Source code in flexeval/core/metric/llm_score.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
class LLMScore(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last integer value in the output of the evaluator is used as the evaluation score.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.

    Examples:
        >>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMScore(language_model, prompt_template)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if task_inputs_list is None:
            task_inputs_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, task_inputs_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_score_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            task_inputs_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if task_inputs_list is None:
        task_inputs_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, task_inputs_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_score_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        task_inputs_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
270
271
272
273
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

OutputLengthStats

Compute statistics on the length of the outputs.

Examples:

>>> from flexeval import OutputLengthStats
>>> output_length_stats = OutputLengthStats()
>>> lm_outputs = ["123456", "123456789"]
>>> result = output_length_stats.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
    instance_details=[{'output_length': 6}, {'output_length': 9}]
)
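
Lengths are character counts (Python len on the output string), not token counts:

>>> len("123456789")
9
>>> len("こんにちは")
5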
Source code in flexeval/core/metric/output_length_stats.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class OutputLengthStats(Metric):
    """
    Compute statistics on the length of the outputs.

    Examples:
        >>> from flexeval import OutputLengthStats
        >>> output_length_stats = OutputLengthStats()
        >>> lm_outputs = ["123456", "123456789"]
        >>> result = output_length_stats.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
            instance_details=[{'output_length': 6}, {'output_length': 9}]
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        output_length_list = [len(output) for output in lm_outputs]
        return MetricResult(
            {
                "avg_output_length": sum(output_length_list) / len(output_length_list),
                "max_output_length": max(output_length_list),
                "min_output_length": min(output_length_list),
            },
            instance_details=[{"output_length": s} for s in output_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/output_length_stats.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    output_length_list = [len(output) for output in lm_outputs]
    return MetricResult(
        {
            "avg_output_length": sum(output_length_list) / len(output_length_list),
            "max_output_length": max(output_length_list),
            "min_output_length": min(output_length_list),
        },
        instance_details=[{"output_length": s} for s in output_length_list],
    )

PerspectiveAPI

A metric that evaluates text outputs using the Perspective API. The PERSPECTIVE_API_KEY environment variable must be set. Empty outputs receive a score of 0.0 for every attribute.
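
If you prefer to set the key from Python rather than your shell, export it before importing and constructing the metric (this assumes the key is read from the environment at that point):

>>> import os
>>> os.environ["PERSPECTIVE_API_KEY"] = "<your-api-key>"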

Parameters:

  • languages (list[str]) –

    A list of languages to analyze.

Examples:

>>> from flexeval import PerspectiveAPI
>>> perspective_api = PerspectiveAPI(languages=["en"])
>>> lm_outputs = ["I love you", "I hate you"]
>>> result = perspective_api.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
    instance_details=[
        {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
        {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
        ]
    )
Source code in flexeval/core/metric/perspective_api.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class PerspectiveAPI(Metric):
    """A metric that evaluates text outputs using the Perspective API.
    Please set `PERSPECTIVE_API_KEY` in the environment variable.

    Args:
        languages: A list of languages to analyze.

    Examples:
        >>> from flexeval import PerspectiveAPI
        >>> perspective_api = PerspectiveAPI(languages=["en"])
        >>> lm_outputs = ["I love you", "I hate you"]
        >>> result = perspective_api.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
            instance_details=[
                {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
                {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
                ]
            )
    """

    def __init__(self, languages: list[str]) -> None:
        self.client = discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=PERSPECTIVE_API_KEY,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        )
        self.languages = languages
        self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        instance_details = []
        for lm_output in lm_outputs:
            if lm_output == "":
                instance_details.append({att: 0.0 for att in self.attributes})
                continue
            analyze_request = {
                "comment": {"text": lm_output},
                "languages": self.languages,
                "requestedAttributes": {att: {} for att in self.attributes},
            }
            response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
            instance_details.append(
                {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
            )
        scores_for_attribute = {att: [] for att in self.attributes}
        for instance in instance_details:
            for att in self.attributes:
                scores_for_attribute[att].append(instance[att])
        average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
        return MetricResult(average_scores, instance_details=instance_details)

client instance-attribute

client = build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=PERSPECTIVE_API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

languages instance-attribute

languages = languages

attributes instance-attribute

attributes = [
    "TOXICITY",
    "SEVERE_TOXICITY",
    "IDENTITY_ATTACK",
    "INSULT",
    "PROFANITY",
    "THREAT",
]

__init__

__init__(languages: list[str]) -> None
Source code in flexeval/core/metric/perspective_api.py
59
60
61
62
63
64
65
66
67
68
def __init__(self, languages: list[str]) -> None:
    self.client = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=PERSPECTIVE_API_KEY,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
    self.languages = languages
    self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/perspective_api.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    instance_details = []
    for lm_output in lm_outputs:
        if lm_output == "":
            instance_details.append({att: 0.0 for att in self.attributes})
            continue
        analyze_request = {
            "comment": {"text": lm_output},
            "languages": self.languages,
            "requestedAttributes": {att: {} for att in self.attributes},
        }
        response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
        instance_details.append(
            {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
        )
    scores_for_attribute = {att: [] for att in self.attributes}
    for instance in instance_details:
        for att in self.attributes:
            scores_for_attribute[att].append(instance[att])
    average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
    return MetricResult(average_scores, instance_details=instance_details)

RepetitionCount

A metric that counts the number of repetitions of the most repeated pattern in the model's output.

Parameters:

  • count_threshold (int, default: 30 ) –

    The repetition count at or above which an output is flagged as a repetition.

  • threshold_length (int, default: 10 ) –

    The length of the pattern searched for by get_most_repeated_pattern.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to apply to the model outputs before analysis.

Examples:

>>> from flexeval import RepetitionCount
>>> repetition_count = RepetitionCount()
>>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
>>> references_list = [[]]  # Not used for this metric
>>> result = repetition_count.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'repetition_ratio': 1.0},
    instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
)
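
get_most_repeated_pattern itself is defined elsewhere in flexeval. Purely to illustrate the idea, a naive way to find the most repeated fixed-length pattern is to count every substring of length threshold_length (this sketch is not the library's implementation):

from collections import Counter

def most_repeated_pattern_naive(text: str, threshold_length: int = 10) -> tuple[str, int]:
    """Return the most frequent substring of length `threshold_length` and its count."""
    if len(text) < threshold_length:
        return text, 1
    counts = Counter(text[i : i + threshold_length] for i in range(len(text) - threshold_length + 1))
    pattern, count = counts.most_common(1)[0]
    return pattern, count

most_repeated_pattern_naive("hello hello hello hello")  # -> ('hello hell', 3)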
Source code in flexeval/core/metric/repetition_count.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class RepetitionCount(Metric):
    """
    A metric that counts the number of repetitions of the most repeated pattern in the model's output.

    Args:
        lm_output_processor: StringProcessor or list of Normalizers to apply to the model outputs before analysis.

    Examples:
        >>> from flexeval import RepetitionCount
        >>> repetition_count = RepetitionCount()
        >>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
        >>> references_list = [[]]  # Not used for this metric
        >>> result = repetition_count.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'repetition_ratio': 1.0},
            instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
        )
    """

    def __init__(
        self,
        count_threshold: int = 30,
        threshold_length: int = 10,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        self.count_threshold = count_threshold
        self.threshold_length = threshold_length

        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],  # Not used in this metric
        task_inputs_list: list[dict[str, str]] | None = None,  # Not used in this metric
    ) -> MetricResult:
        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        repetition_details: list[dict[str, Any]] = []
        num_repetitions = 0
        for output in lm_outputs:
            most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
            is_repetition = count >= self.count_threshold
            repetition_details.append(
                {
                    "most_repeated_pattern": most_repeated_pattern,
                    "repetition_count": count,
                    "is_repetition": is_repetition,
                }
            )
            num_repetitions += int(is_repetition)

        repetition_rate = num_repetitions / len(lm_outputs)

        return MetricResult(
            summary={"repetition_ratio": repetition_rate},
            instance_details=repetition_details,
        )

count_threshold instance-attribute

count_threshold = count_threshold

threshold_length instance-attribute

threshold_length = threshold_length

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/repetition_count.py
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    self.count_threshold = count_threshold
    self.threshold_length = threshold_length

    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/repetition_count.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],  # Not used in this metric
    task_inputs_list: list[dict[str, str]] | None = None,  # Not used in this metric
) -> MetricResult:
    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    repetition_details: list[dict[str, Any]] = []
    num_repetitions = 0
    for output in lm_outputs:
        most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
        is_repetition = count >= self.count_threshold
        repetition_details.append(
            {
                "most_repeated_pattern": most_repeated_pattern,
                "repetition_count": count,
                "is_repetition": is_repetition,
            }
        )
        num_repetitions += int(is_repetition)

    repetition_rate = num_repetitions / len(lm_outputs)

    return MetricResult(
        summary={"repetition_ratio": repetition_rate},
        instance_details=repetition_details,
    )

ROUGE

An implementation of ROUGE.

The calculation is based on the rouge library. Note that only the first reference in each entry of references_list is used.
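
For reference, the per-instance rouge1 and rouge2 values reported here are the standard n-gram overlap F1 scores (the rouge library adds a small smoothing constant in the denominator, which is why perfect matches appear as 0.999999995 rather than 1.0):

$$
P_n = \frac{|G_n(\text{output}) \cap G_n(\text{reference})|}{|G_n(\text{output})|}, \qquad
R_n = \frac{|G_n(\text{output}) \cap G_n(\text{reference})|}{|G_n(\text{reference})|}, \qquad
F_n = \frac{2 P_n R_n}{P_n + R_n},
$$

where G_n(x) denotes the multiset of n-grams of x. rougeL is computed analogously from the longest common subsequence.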

Parameters:

  • tokenizer (Tokenizer) –

    An instance of Tokenizer to tokenize the input and output strings.

Examples:

>>> from flexeval import ROUGE
>>> from flexeval import WhitespaceTokenizer
>>> tokenizer = WhitespaceTokenizer()
>>> rouge = ROUGE(tokenizer)
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = rouge.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
    instance_details=[
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
    ]
)
Source code in flexeval/core/metric/rouge.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class ROUGE(Metric):
    """An implementation of [ROUGE](https://aclanthology.org/W04-1013/).

    The calculation is based on the [rouge](https://github.com/pltrdy/rouge) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import ROUGE
        >>> from flexeval import WhitespaceTokenizer
        >>> tokenizer = WhitespaceTokenizer()
        >>> rouge = ROUGE(tokenizer)
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = rouge.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
            instance_details=[
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
            ]
        )
    """

    def __init__(self, tokenizer: Tokenizer) -> None:
        self._tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"lm_outputs and references_list must have the same length, "
                f"but got {len(lm_outputs)} and {len(references_list)}."
            )
            raise ValueError(msg)

        # we only need the first reference
        target_summaries = [references[0] for references in references_list]

        tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_target_summaries = [
            " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
        ]

        # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
        tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

        rouge = RougeCalculator()
        score_outputs = rouge.get_scores(
            tokenized_lm_outputs,
            tokenized_target_summaries,
        )

        rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
        rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
        rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

        # we only need the f1 score
        return MetricResult(
            {
                "rouge1": sum(rouge1_list) / len(rouge1_list),
                "rouge2": sum(rouge2_list) / len(rouge2_list),
                "rougeL": sum(rouge_l_list) / len(rouge_l_list),
            },
            instance_details=[
                {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
            ],
        )

__init__

__init__(tokenizer: Tokenizer) -> None
Source code in flexeval/core/metric/rouge.py
36
37
def __init__(self, tokenizer: Tokenizer) -> None:
    self._tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/rouge.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"lm_outputs and references_list must have the same length, "
            f"but got {len(lm_outputs)} and {len(references_list)}."
        )
        raise ValueError(msg)

    # we only need the first reference
    target_summaries = [references[0] for references in references_list]

    tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
    tokenized_target_summaries = [
        " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
    ]

    # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
    tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

    rouge = RougeCalculator()
    score_outputs = rouge.get_scores(
        tokenized_lm_outputs,
        tokenized_target_summaries,
    )

    rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
    rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
    rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

    # we only need the f1 score
    return MetricResult(
        {
            "rouge1": sum(rouge1_list) / len(rouge1_list),
            "rouge2": sum(rouge2_list) / len(rouge2_list),
            "rougeL": sum(rouge_l_list) / len(rouge_l_list),
        },
        instance_details=[
            {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
        ],
    )

SARI

An implementation of SARI, a metric for evaluating text simplification.

Based on the original implementation [1], modified to allow configurable settings for the maximum n-gram size and tokenizer. Additionally, it fixes a bug present in the original implementation [2]. When used with the default parameters, it produces scores that are consistent with the HuggingFace/evaluate implementation [3].

[1] https://github.com/cocoxu/simplification/blob/master/SARI.py
[2] https://github.com/cocoxu/simplification/issues/6
[3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py
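
Concretely, as implemented in _calc_sentence_sari and _sari_n below, each sentence-level score averages an add F1, a keep F1, and a delete precision over n-gram orders 1 through max_ngrams:

$$
\text{SARI} = \frac{1}{N} \sum_{n=1}^{N} \frac{F_{\text{add}}^{(n)} + F_{\text{keep}}^{(n)} + P_{\text{del}}^{(n)}}{3},
$$

where N is max_ngrams, and the reported sari_add, sari_keep, and sari_del are the corresponding components averaged over n.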

Parameters:

  • source_key (str) –

    The key in the task inputs that holds the source sentence to be simplified.

  • tokenizer (Tokenizer | Literal['default'], default: 'default' ) –

    An instance of Tokenizer to tokenize the input and output strings.

  • max_ngrams (int, default: 4 ) –

    The maximum n-gram order to consider. Defaults to 4.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in task inputs.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None | Literal['default'], default: 'default' ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None | Literal['default'], default: 'default' ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • source_processor (StringProcessor | list[StringProcessor] | None | Literal['default'], default: 'default' ) –

    StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

Examples:

>>> from flexeval import SARI
>>> sari_scorer = SARI(source_key="source")
>>> lm_outputs = ["About 95 you now get in."]
>>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
>>> task_inputs_list = [{"source": "About 95 species are currently accepted."}]
>>> result = sari_scorer.evaluate(lm_outputs, references_list, task_inputs_list)
>>> print(result)
MetricResult(
    summary={
        'sari_score': 0.2695360195360195,
        'sari_add': 0.08333333333333333,
        'sari_keep': 0.22527472527472525,
        'sari_del': 0.5
    },
    instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
)
Source code in flexeval/core/metric/sari.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
class SARI(Metric):
    """An implementation of SARI, a metric for evaluating text simplification.

    Based on the original implementation [1], modified to allow configurable settings
    for the maximum n-gram size and tokenizer.
    Additionally, it fixes a bug present in the original implementation [2].
    When used with the default parameters, it produces scores that are
    consistent with the HuggingFace/evaluate implementation [3].

    [1] https://github.com/cocoxu/simplification/blob/master/SARI.py
    [2] https://github.com/cocoxu/simplification/issues/6
    [3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.
        max_ngrams: The maximum n-gram order to consider. Defaults to `4`.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in task inputs.
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        source_processor: StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

    Examples:
        >>> from flexeval import SARI
        >>> sari_scorer = SARI(source_key="source")
        >>> lm_outputs = ["About 95 you now get in."]
        >>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
        >>> task_inputs_list = [{"source": "About 95 species are currently accepted."}]
        >>> result = sari_scorer.evaluate(lm_outputs, references_list, task_inputs_list)
        >>> print(result)
        MetricResult(
            summary={
                'sari_score': 0.2695360195360195,
                'sari_add': 0.08333333333333333,
                'sari_keep': 0.22527472527472525,
                'sari_del': 0.5
            },
            instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
        )
    """  # noqa: E501

    def __init__(
        self,
        source_key: str,
        tokenizer: Tokenizer | Literal["default"] = "default",
        max_ngrams: int = 4,
        category_key: str | None = None,
        source_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
        lm_output_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
        reference_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
    ) -> None:
        if tokenizer == "default":
            tokenizer = SacreBleuTokenizer("13a")
        self._tokenizer = tokenizer
        self.source_key = source_key
        self.max_ngrams = max_ngrams
        self.category_key = category_key
        if source_processor == "default":
            source_processor = StringLower()
        if lm_output_processor == "default":
            lm_output_processor = StringLower()
        if reference_processor == "default":
            reference_processor = StringLower()
        if isinstance(source_processor, StringProcessor):
            source_processor = [source_processor]
        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]
        self.source_processors = source_processor
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(self, lm_outputs, references_list, task_inputs_list=None) -> MetricResult:  # noqa: ANN001
        if task_inputs_list is None:
            msg = "SARI requires task_inputs_list"
            raise ValueError(msg)
        sources = [task_input[self.source_key] for task_input in task_inputs_list]

        if not (len(sources) == len(lm_outputs) == len(references_list)):
            msg = (
                f"sources, lm_outputs and references_list must have the same length, "
                f"but got {len(sources)}, {len(lm_outputs)} and {len(references_list)}."
            )
            raise ValueError(msg)

        if self.source_processors:
            sources = [functools.reduce(lambda x, norm: norm(x), self.source_processors, src) for src in sources]

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references_list = [
                [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
                for references in references_list
            ]

        sari_instance_list = [
            self._calc_sentence_sari(source, lm_output, references)
            for source, lm_output, references in zip(sources, lm_outputs, references_list)
        ]

        metric_name2scores = {
            name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
        }

        num_instances = len(sari_instance_list)
        summary = {
            metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
        }

        if self.category_key:
            categories = [task_input[self.category_key] for task_input in task_inputs_list]
            for metric_name, score_list in metric_name2scores.items():
                category_wise_scores = aggregate_category_wise_scores(score_list, categories)
                for category, category_wise_score in category_wise_scores.items():
                    summary[f"{metric_name}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=sari_instance_list,
        )

    def _calc_sentence_sari(self, source: str, lm_output: str, references: list[str]) -> dict[str, float]:
        s_words = self._tokenizer.tokenize(source)
        c_words = self._tokenizer.tokenize(lm_output)
        r_words_list = [self._tokenizer.tokenize(reference) for reference in references]

        sari_score, sari_add, sari_keep, sari_del = 0.0, 0.0, 0.0, 0.0
        for n in range(1, self.max_ngrams + 1):
            s_ngrams = to_ngram(s_words, n)
            c_ngrams = to_ngram(c_words, n)
            r_ngrams_list = [to_ngram(r_words, n) for r_words in r_words_list]

            sari_n_score, sari_n_add, sari_n_keep, sari_n_del = self._sari_n(s_ngrams, c_ngrams, r_ngrams_list)
            sari_score += sari_n_score
            sari_add += sari_n_add
            sari_keep += sari_n_keep
            sari_del += sari_n_del

        sari_score /= self.max_ngrams
        sari_add /= self.max_ngrams
        sari_keep /= self.max_ngrams
        sari_del /= self.max_ngrams

        return {"sari_score": sari_score, "sari_add": sari_add, "sari_keep": sari_keep, "sari_del": sari_del}

    def _sari_n(
        self, s_grams: list[str], c_grams: list[str], r_grams_list: list[list[str]]
    ) -> tuple[float, float, float, float]:
        num_ref = len(r_grams_list)
        r_grams_all = [r_gram for r_grams in r_grams_list for r_gram in r_grams]
        r_gram_counter = Counter(r_grams_all)

        s_gram_counter = Counter(s_grams)
        c_gram_counter = Counter(c_grams)

        s_gram_rep = Counter({k: v * num_ref for k, v in s_gram_counter.items()})
        c_gram_rep = Counter({k: v * num_ref for k, v in c_gram_counter.items()})

        # ADD
        add_grams = set(c_gram_counter) - set(s_gram_counter)
        add_good = add_grams & set(r_gram_counter)
        add_all = set(r_gram_counter) - set(s_gram_counter)

        add_prec = len(add_good) / len(add_grams) if add_grams else 1
        add_recall = len(add_good) / len(add_all) if add_all else 1
        add_f1 = 2 * add_prec * add_recall / (add_prec + add_recall) if (add_prec + add_recall) > 0 else 0

        # KEEP
        keep_rep = s_gram_rep & c_gram_rep
        keep_good = keep_rep & r_gram_counter
        keep_all = s_gram_rep & r_gram_counter

        keep_prec = sum(keep_good[g] / keep_rep[g] for g in keep_good) / len(keep_rep) if keep_rep else 1
        keep_recall = sum(keep_good[g] for g in keep_good) / sum(keep_all.values()) if keep_all else 1
        keep_f1 = 2 * keep_prec * keep_recall / (keep_prec + keep_recall) if (keep_prec + keep_recall) > 0 else 0

        # DELETE
        del_rep = s_gram_rep - c_gram_rep
        del_good = del_rep - r_gram_counter

        del_prec = sum(del_good[g] / del_rep[g] for g in del_good) / len(del_rep) if del_rep else 1

        return (add_f1 + keep_f1 + del_prec) / 3, add_f1, keep_f1, del_prec

source_key instance-attribute

source_key = source_key

max_ngrams instance-attribute

max_ngrams = max_ngrams

category_key instance-attribute

category_key = category_key

source_processors instance-attribute

source_processors = source_processor

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor
    | list[StringProcessor]
    | None
    | Literal["default"] = "default",
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None
    | Literal["default"] = "default",
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None
    | Literal["default"] = "default",
) -> None
Source code in flexeval/core/metric/sari.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def __init__(
    self,
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
    lm_output_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
    reference_processor: StringProcessor | list[StringProcessor] | None | Literal["default"] = "default",
) -> None:
    if tokenizer == "default":
        tokenizer = SacreBleuTokenizer("13a")
    self._tokenizer = tokenizer
    self.source_key = source_key
    self.max_ngrams = max_ngrams
    self.category_key = category_key
    if source_processor == "default":
        source_processor = StringLower()
    if lm_output_processor == "default":
        lm_output_processor = StringLower()
    if reference_processor == "default":
        reference_processor = StringLower()
    if isinstance(source_processor, StringProcessor):
        source_processor = [source_processor]
    if isinstance(lm_output_processor, StringProcessor):
        lm_output_processor = [lm_output_processor]
    if isinstance(reference_processor, StringProcessor):
        reference_processor = [reference_processor]
    self.source_processors = source_processor
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs, references_list, task_inputs_list=None
) -> MetricResult
Source code in flexeval/core/metric/sari.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def evaluate(self, lm_outputs, references_list, task_inputs_list=None) -> MetricResult:  # noqa: ANN001
    if task_inputs_list is None:
        msg = "SARI requires task_inputs_list"
        raise ValueError(msg)
    sources = [task_input[self.source_key] for task_input in task_inputs_list]

    if not (len(sources) == len(lm_outputs) == len(references_list)):
        msg = (
            f"sources, lm_outputs and references_list must have the same length, "
            f"but got {len(sources)}, {len(lm_outputs)} and {len(references_list)}."
        )
        raise ValueError(msg)

    if self.source_processors:
        sources = [functools.reduce(lambda x, norm: norm(x), self.source_processors, src) for src in sources]

    if self.lm_output_processors:
        lm_outputs = [
            functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
        ]

    if self.reference_processors:
        references_list = [
            [functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references]
            for references in references_list
        ]

    sari_instance_list = [
        self._calc_sentence_sari(source, lm_output, references)
        for source, lm_output, references in zip(sources, lm_outputs, references_list)
    ]

    metric_name2scores = {
        name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
    }

    num_instances = len(sari_instance_list)
    summary = {
        metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
    }

    if self.category_key:
        categories = [task_input[self.category_key] for task_input in task_inputs_list]
        for metric_name, score_list in metric_name2scores.items():
            category_wise_scores = aggregate_category_wise_scores(score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"{metric_name}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=sari_instance_list,
    )

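Because evaluate looks up the simplification source under source_key in task_inputs_list, calling it without that argument raises a ValueError. A minimal end-to-end sketch, with illustrative sentences and an assumed top-level import of SARI (mirroring the other metrics in this reference):

>>> from flexeval import SARI
>>> sari = SARI(source_key="source")
>>> lm_outputs = ["the cat sat on the mat ."]
>>> references_list = [["the cat sat on the mat .", "a cat was sitting on the mat ."]]
>>> task_inputs_list = [{"source": "the cat was sitting on the mat ."}]
>>> result = sari.evaluate(lm_outputs, references_list, task_inputs_list)
>>> sorted(result.summary)
['sari_add', 'sari_del', 'sari_keep', 'sari_score']
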
SubstringMatch

A metric that calculates the fraction of outputs that contain the expected substrings ("any" or "all" of them, depending on mode).

Parameters:

  • mode (Literal['any', 'all'], default: 'any' ) –

    The mode to calculate the substring match.
    - "any": If any of the expected substrings are in the output, it is a match.
    - "all": If all of the expected substrings are in the output, it is a match.

  • category_key (str | None, default: None ) –

    Optional key to group scores by category from task_inputs_list.

Examples:

>>> from flexeval import SubstringMatch
>>> substring_match = SubstringMatch()
>>> lm_outputs = ["This is a cat .", "This is a dog ."]
>>> references_list = [["cat", "dog"], ["mouse"]]
>>> result = substring_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'substring_match-any': 0.5},
    instance_details=[{'substring_match': True}, {'substring_match': False}]
)
Source code in flexeval/core/metric/substring_match.py
class SubstringMatch(Metric):
    """
    A metric that calculates how many outputs contain any of the expected substrings.

    Args:
        mode: The mode to calculate the substring match.
            - "any": If any of the expected substrings are in the output, it is a match.
            - "all": If all of the expected substrings are in the output, it is a match.
        category_key: Optional key to group scores by category from task_inputs_list.

    Examples:
        >>> from flexeval import SubstringMatch
        >>> substring_match = SubstringMatch()
        >>> lm_outputs = ["This is a cat .", "This is a dog ."]
        >>> references_list = [["cat", "dog"], ["mouse"]]
        >>> result = substring_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'substring_match-any': 0.5},
            instance_details=[{'substring_match': True}, {'substring_match': False}]
        )
    """

    def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
        self.mode = mode
        self.category_key = category_key
        if mode == "all":
            self.match_func = all
        elif mode == "any":
            self.match_func = any
        else:
            msg = f"mode must be 'any' or 'all', but got '{mode}'."
            raise ValueError(msg)

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"lm_outputs and references_list must have the same length, "
                f"but got {len(lm_outputs)} and {len(references_list)}."
            )
            raise ValueError(msg)

        match_list = [
            self.match_func(substring in lm_output for substring in expected_output)
            for lm_output, expected_output in zip(lm_outputs, references_list)
        ]

        score = 0.0
        if len(match_list):
            score = sum(match_list) / len(match_list)

        summary = {f"substring_match-{self.mode}": score}

        if self.category_key:
            categories = [task_input[self.category_key] for task_input in task_inputs_list]
            category_wise_scores = aggregate_category_wise_scores(match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"substring_match": match} for match in match_list],
        )

mode instance-attribute

mode = mode

category_key instance-attribute

category_key = category_key

match_func instance-attribute

match_func = all

__init__

__init__(
    mode: Literal["any", "all"] = "any",
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/substring_match.py
def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
    self.mode = mode
    self.category_key = category_key
    if mode == "all":
        self.match_func = all
    elif mode == "any":
        self.match_func = any
    else:
        msg = f"mode must be 'any' or 'all', but got '{mode}'."
        raise ValueError(msg)

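The constructor only selects between Python's built-in any and all, but the choice also changes the summary key, which carries the mode as a suffix. A short sketch with illustrative inputs:

>>> from flexeval import SubstringMatch
>>> strict = SubstringMatch(mode="all")
>>> strict.evaluate(["a cat and a dog .", "a dog ."], [["cat", "dog"], ["cat", "dog"]]).summary
{'substring_match-all': 0.5}
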
evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/substring_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"lm_outputs and references_list must have the same length, "
            f"but got {len(lm_outputs)} and {len(references_list)}."
        )
        raise ValueError(msg)

    match_list = [
        self.match_func(substring in lm_output for substring in expected_output)
        for lm_output, expected_output in zip(lm_outputs, references_list)
    ]

    score = 0.0
    if len(match_list):
        score = sum(match_list) / len(match_list)

    summary = {f"substring_match-{self.mode}": score}

    if self.category_key:
        categories = [task_input[self.category_key] for task_input in task_inputs_list]
        category_wise_scores = aggregate_category_wise_scores(match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"substring_match": match} for match in match_list],
    )

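When category_key is set, evaluate additionally reports one aggregate per category found in task_inputs_list, under keys of the form substring_match-<mode>/<category>. A short sketch (the "domain" key and its values are illustrative):

>>> from flexeval import SubstringMatch
>>> sm = SubstringMatch(category_key="domain")
>>> lm_outputs = ["This is a cat .", "This is a dog ."]
>>> references_list = [["cat"], ["mouse"]]
>>> task_inputs_list = [{"domain": "animals"}, {"domain": "animals"}]
>>> result = sm.evaluate(lm_outputs, references_list, task_inputs_list)
>>> sorted(result.summary)
['substring_match-any', 'substring_match-any/animals']
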
XER

Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references. The calculation is based on the jiwer library.

Parameters:

  • tokenizer (Tokenizer | None, default: None ) –

    An instance of Tokenizer to tokenize the input and output strings.

Examples:

>>> from flexeval import XER
>>> xer = XER()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
>>> result = xer.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
    instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}
    ]
)
Source code in flexeval/core/metric/xer.py
class XER(Metric):
    """
    Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references.
    The calculation is based on the [jiwer](https://github.com/jitsi/jiwer) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import XER
        >>> xer = XER()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
        >>> result = xer.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
            instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}
            ]
        )
    """

    def __init__(self, tokenizer: Tokenizer | None = None) -> None:
        self.tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"lm_outputs and references_list must have the same length, "
                f"but got {len(lm_outputs)} and {len(references_list)}."
            )
            raise ValueError(msg)

        # we only need the first reference
        references = [references[0] for references in references_list]

        if self.tokenizer:
            tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
            tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
        else:
            tokenized_lm_outputs = lm_outputs
            tokenized_references = references

        cer_score = cer(references, lm_outputs)
        wer_score = wer(tokenized_references, tokenized_lm_outputs)

        return MetricResult(
            {
                "cer_score": cer_score,
                "wer_score": wer_score,
            },
            instance_details=[
                {
                    "cer_score": cer(reference, lm_output),
                    "wer_score": wer(reference, lm_output),
                }
                for lm_output, reference in zip(lm_outputs, references)
            ],
        )

tokenizer instance-attribute

tokenizer = tokenizer

__init__

__init__(tokenizer: Tokenizer | None = None) -> None
Source code in flexeval/core/metric/xer.py
def __init__(self, tokenizer: Tokenizer | None = None) -> None:
    self.tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/xer.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    task_inputs_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if len(lm_outputs) != len(references_list):
        msg = (
            f"lm_outputs and references_list must have the same length, "
            f"but got {len(lm_outputs)} and {len(references_list)}."
        )
        raise ValueError(msg)

    # we only need the first reference
    references = [references[0] for references in references_list]

    if self.tokenizer:
        tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
    else:
        tokenized_lm_outputs = lm_outputs
        tokenized_references = references

    cer_score = cer(references, lm_outputs)
    wer_score = wer(tokenized_references, tokenized_lm_outputs)

    return MetricResult(
        {
            "cer_score": cer_score,
            "wer_score": wer_score,
        },
        instance_details=[
            {
                "cer_score": cer(reference, lm_output),
                "wer_score": wer(reference, lm_output),
            }
            for lm_output, reference in zip(lm_outputs, references)
        ],
    )
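One detail worth noting from the source above: only the first reference of each instance is scored, so any additional references are ignored. A minimal sketch with illustrative strings:

>>> from flexeval import XER
>>> xer = XER()
>>> result = xer.evaluate(["hello world"], [["hello world", "a completely different reference"]])
>>> result.summary
{'cer_score': 0.0, 'wer_score': 0.0}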