Metric

Base class for metrics.

Subclasses must implement the evaluate method to perform metric computation. Use utility functions from flexeval.core.metric.utils for common patterns like string processing and category-wise aggregation.
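
For orientation, here is a minimal sketch of a custom metric, assuming `Metric` and `MetricResult` can be imported from the top-level `flexeval` package in the same way as the classes documented below; the metric itself is hypothetical:

from flexeval import Metric, MetricResult


class OutputLength(Metric):
    """A hypothetical metric that reports the character length of each model output."""

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Per-instance values go to instance_details; aggregated values go to summary.
        lengths = [len(output) for output in lm_outputs]
        return MetricResult(
            summary={"average_output_length": sum(lengths) / len(lengths)},
            instance_details=[{"output_length": length} for length in lengths],
        )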

Source code in flexeval/core/metric/base.py
class Metric(ABC):
    """
    Base class for metrics.

    Subclasses must implement the `evaluate` method to perform metric computation.
    Use utility functions from `flexeval.core.metric.utils` for common patterns
    like string processing and category-wise aggregation.
    """

    @abstractmethod
    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        """
        Evaluate the outputs of `LanguageModel` against the references.

        Args:
            lm_outputs: List of model outputs.
            references_list: List of reference outputs.
            extra_info_list: List of task inputs and some extra information.
        """

evaluate abstractmethod

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult

Evaluate the outputs of LanguageModel against the references.

Parameters:

  • lm_outputs (list[str]) –

    List of model outputs.

  • references_list (list[list[str]]) –

    List of reference outputs.

  • extra_info_list (list[dict[str, str]] | None, default: None ) –

    List of task inputs and some extra information.

Source code in flexeval/core/metric/base.py
@abstractmethod
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    """
    Evaluate the outputs of `LanguageModel` against the references.

    Args:
        lm_outputs: List of model outputs.
        references_list: List of reference outputs.
        extra_info_list: List of task inputs and some extra information.
    """

MetricResult dataclass

A dataclass representing the result of a metric evaluation.

Source code in flexeval/core/metric/base.py
@dataclass
class MetricResult:
    """
    A dataclass representing the result of a metric evaluation.
    """

    summary: dict[str, Any]
    """
    Summary containing aggregated metric values.
    """
    instance_details: list[dict[str, Any]] | None = None
    """
    A list of evaluation details for each instance.
    Useful for error analysis.
    """

summary instance-attribute

summary: dict[str, Any]

Summary containing aggregated metric values.

instance_details class-attribute instance-attribute

instance_details: list[dict[str, Any]] | None = None

A list of evaluation details for each instance. Useful for error analysis.

__init__

__init__(
    summary: dict[str, Any],
    instance_details: list[dict[str, Any]] | None = None,
) -> None
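
A short usage sketch: since `MetricResult` is a plain dataclass, custom metrics construct and return it directly (the field values below are illustrative, and the top-level import is assumed to work as in the other examples):

from flexeval import MetricResult

result = MetricResult(
    summary={"accuracy": 0.5},
    instance_details=[{"accuracy": 1.0}, {"accuracy": 0.0}],
)
print(result.summary["accuracy"])  # 0.5
print(result.instance_details[1])  # {'accuracy': 0.0}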

BLEU

An implementation of BLEU. The calculation is based on the sacrebleu library.

Parameters:

  • tokenize_option (str | None, default: None ) –

    Tokenization option for sacrebleu. If None, sacrebleu will use the default tokenization. For details, see sacreBLEU https://github.com/mjpost/sacrebleu/blob/aa3cc4351af6/sacrebleu/sacrebleu.py#L121-L124

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import BLEU
>>> bleu = BLEU()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = bleu.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={
        'bleu_score': 100.0,
        'bleu_bp': 1.0,
        'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1
    },
    instance_details=[
        {'bleu_score': 100.0, 'bleu_bp': 1.0},
        {'bleu_score': 100.0, 'bleu_bp': 1.0}
    ]
)
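
A hedged sketch of category-wise aggregation with category_key: each entry of extra_info_list is assumed to carry its category under that key (the key name "domain" is made up for illustration):

from flexeval import BLEU

bleu = BLEU(category_key="domain")
lm_outputs = ["I am a student .", "I am a teacher ."]
references_list = [["I am a student ."], ["I am a teacher ."]]
extra_info_list = [{"domain": "school"}, {"domain": "work"}]
result = bleu.evaluate(lm_outputs, references_list, extra_info_list)
# Besides the corpus-level entries, the summary gains keys such as
# "sentence_bleu_score/school" and "sentence_bleu_score/work", each holding
# the mean sentence-level BLEU of the instances in that category.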
Source code in flexeval/core/metric/bleu.py
class BLEU(Metric):
    """An implementation of [BLEU](https://aclanthology.org/P02-1040/).
    The calculation is based on the [sacrebleu](https://github.com/mjpost/sacrebleu) library.

    Args:
        tokenize_option: Tokenization option for sacrebleu.
            If `None`, sacrebleu will use the default tokenization.
            For details, see sacreBLEU
            https://github.com/mjpost/sacrebleu/blob/aa3cc4351af6/sacrebleu/sacrebleu.py#L121-L124
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import BLEU
        >>> bleu = BLEU()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = bleu.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={
                'bleu_score': 100.0,
                'bleu_bp': 1.0,
                'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1
            },
            instance_details=[
                {'bleu_score': 100.0, 'bleu_bp': 1.0},
                {'bleu_score': 100.0, 'bleu_bp': 1.0}
            ]
        )
    """

    def __init__(
        self,
        tokenize_option: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
        # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
        self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Restructure references for sacrebleu format
        max_num_refs = max(len(refs) for refs in references_list)
        references_for_sacrebleu: list[list[str]] = []
        for i in range(max_num_refs):
            set_of_references: list[str] = []
            for refs_for_source in references_list:
                if i < len(refs_for_source):
                    set_of_references.append(refs_for_source[i])
                else:
                    set_of_references.append("")
            references_for_sacrebleu.append(set_of_references)

        # Compute metrics
        bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
        sentence_bleu_list = [
            self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
        ]

        summary = {
            "bleu_score": bleu.score,
            "bleu_bp": bleu.bp,
            "bleu_signature": self._corpus_bleu.get_signature(),
        }

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            sentence_bleu_score_list = [b.score for b in sentence_bleu_list]
            category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"sentence_bleu_score/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"bleu_score": b.score, "bleu_bp": b.bp} for b in sentence_bleu_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/bleu.py
def __init__(
    self,
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
    # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
    self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/bleu.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Restructure references for sacrebleu format
    max_num_refs = max(len(refs) for refs in references_list)
    references_for_sacrebleu: list[list[str]] = []
    for i in range(max_num_refs):
        set_of_references: list[str] = []
        for refs_for_source in references_list:
            if i < len(refs_for_source):
                set_of_references.append(refs_for_source[i])
            else:
                set_of_references.append("")
        references_for_sacrebleu.append(set_of_references)

    # Compute metrics
    bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
    sentence_bleu_list = [
        self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
    ]

    summary = {
        "bleu_score": bleu.score,
        "bleu_bp": bleu.bp,
        "bleu_signature": self._corpus_bleu.get_signature(),
    }

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        sentence_bleu_score_list = [b.score for b in sentence_bleu_list]
        category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"sentence_bleu_score/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"bleu_score": b.score, "bleu_bp": b.bp} for b in sentence_bleu_list],
    )

CharF1

A metric that calculates how many characters in the output string are included in the characters of the expected output. If there are multiple expected outputs, the highest score is adopted.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import CharF1
>>> char_f1 = CharF1()
>>> lm_outputs = ["abcd", "efgh"]
>>> references_list = [["abcd", "ABCD"], ["efGH"]]
>>> result = char_f1.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
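
For intuition, the per-instance scores in the example above can be reproduced directly with the similarity ratio; this sketch assumes the `fuzz` module used internally comes from the `rapidfuzz` package (the import is not shown in this excerpt):

from rapidfuzz import fuzz

# "efgh" vs. "efGH": only "ef" matches case-sensitively, so the normalized
# similarity is 2 * 2 / (4 + 4) = 0.5.
print(fuzz.ratio("abcd", "abcd") / 100)  # 1.0
print(fuzz.ratio("efgh", "efGH") / 100)  # 0.5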
Source code in flexeval/core/metric/char_f1.py
class CharF1(Metric):
    """
    A metric that calculates how many characters in the output string are included
    in the characters of the expected output.
    If there are multiple expected outputs, the highest score is adopted.

    Args:
        lm_output_processor: StringProcessor or list of StringProcessor to apply to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import CharF1
        >>> char_f1 = CharF1()
        >>> lm_outputs = ["abcd", "efgh"]
        >>> references_list = [["abcd", "ABCD"], ["efGH"]]
        >>> result = char_f1.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        char_f1_scores: list[float] = []
        for lm_output, expected_output in zip(lm_outputs, references_list):
            score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
            char_f1_scores.append(score)

        summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"char_f1/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"char_f1": s} for s in char_f1_scores],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/char_f1.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/char_f1.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    char_f1_scores: list[float] = []
    for lm_output, expected_output in zip(lm_outputs, references_list):
        score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
        char_f1_scores.append(score)

    summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"char_f1/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"char_f1": s} for s in char_f1_scores],
    )

CodeEval

A metric that evaluates generated code with test cases.

Parameters:

  • code_template (str | None, default: None ) –

    A Jinja2 template string used to construct the code to be evaluated. The template can reference variables from extra_info. If None, the generated text itself is used as the code.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    String processors applied to model outputs before evaluation.

  • evaluate_module (str, default: 'code_eval' ) –

    An evaluate module to use.

Examples:

>>> from flexeval import CodeEval
>>> code_eval = CodeEval()
>>> lm_outputs = ["def add(a, b):\n    return a + b", "def is_equal(a, b):\n    return a = b"]
>>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
>>> result = code_eval.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'pass@1': 0.5},
    instance_details=[
        {'passed': True, 'result': 'passed'},
        {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
    ]
)
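
A hedged sketch of code_template: the template is rendered with lm_output plus any keys from extra_info, and the assembled code is executed against the test cases in references_list. The "prompt" key below is only an assumption about what extra_info might contain, and the underlying code_eval module from the evaluate library must be enabled via the HF_ALLOW_CODE_EVAL environment variable:

import os

from flexeval import CodeEval

os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # required by the evaluate `code_eval` module

# Prepend the function signature from extra_info to the model's completion.
code_eval = CodeEval(code_template="{{ prompt }}{{ lm_output }}")
lm_outputs = ["    return a + b"]
references_list = [["assert add(1, 2) == 3"]]
extra_info_list = [{"prompt": "def add(a, b):\n"}]
result = code_eval.evaluate(lm_outputs, references_list, extra_info_list)
# Expected: the summary reports pass@1 == 1.0, since the assembled code passes its test case.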
Source code in flexeval/core/metric/code_eval.py
class CodeEval(Metric):
    """
    A metric that evaluates generated code with test cases.

    Args:
        code_template: A Jinja2 template string used to construct the code to be evaluated.
            The template can reference variables from extra_info.
            If `None`, the generated text itself is used as the code.
        lm_output_processor: String processors applied to model outputs before evaluation.
        evaluate_module: An evaluate module to use.

    Examples:
        >>> from flexeval import CodeEval
        >>> code_eval = CodeEval()
        >>> lm_outputs = ["def add(a, b):\\n    return a + b", "def is_equal(a, b):\\n    return a = b"]
        >>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
        >>> result = code_eval.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'pass@1': 0.5},
            instance_details=[
                {'passed': True, 'result': 'passed'},
                {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
            ]
        )
    """

    def __init__(
        self,
        code_template: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        evaluate_module: str = "code_eval",
    ) -> None:
        if code_template is None:
            code_template = "{{ lm_output }}"

        self.code_template = JINJA2_ENV.from_string(code_template)
        self.code_eval = evaluate.load(evaluate_module)

        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

        # Compute metrics
        generated_code_list: list[str] = []
        test_case_list: list[str] = []
        # in code generation tasks, references_list contains the test cases
        for lm_output, extra_info, test_cases in zip(
            lm_outputs,
            extra_info_list,
            references_list,
        ):
            generated_code = self.code_template.render(lm_output=lm_output, **extra_info)
            generated_code_list.append(generated_code)
            test_case_list.append("\n".join(test_cases))
        pass_at_k, results = self.code_eval.compute(
            references=test_case_list,
            predictions=[[c] for c in generated_code_list],
            k=[1],
        )

        # `results` contain the detailed results for each test case
        # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
        results: dict[int, list[tuple[int, dict[str, Any]]]]

        instance_details: list[dict[str, Any]] = []
        for i in range(len(lm_outputs)):
            first_result = results[i][0]  # we assume only one candidate code per instance, so we take the first result
            _, detail_result = first_result  # the first element is just the index so we ignore it
            # remove unnecessary fields to save space
            detail_result.pop("completion_id")
            detail_result.pop("task_id")
            instance_details.append(detail_result)

        return MetricResult(pass_at_k, instance_details=instance_details)

code_template instance-attribute

code_template = from_string(code_template)

code_eval instance-attribute

code_eval = load(evaluate_module)

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    code_template: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    evaluate_module: str = "code_eval",
) -> None
Source code in flexeval/core/metric/code_eval.py
def __init__(
    self,
    code_template: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    evaluate_module: str = "code_eval",
) -> None:
    if code_template is None:
        code_template = "{{ lm_output }}"

    self.code_template = JINJA2_ENV.from_string(code_template)
    self.code_eval = evaluate.load(evaluate_module)

    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/code_eval.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

    # Compute metrics
    generated_code_list: list[str] = []
    test_case_list: list[str] = []
    # in code generation tasks, references_list contains the test cases
    for lm_output, extra_info, test_cases in zip(
        lm_outputs,
        extra_info_list,
        references_list,
    ):
        generated_code = self.code_template.render(lm_output=lm_output, **extra_info)
        generated_code_list.append(generated_code)
        test_case_list.append("\n".join(test_cases))
    pass_at_k, results = self.code_eval.compute(
        references=test_case_list,
        predictions=[[c] for c in generated_code_list],
        k=[1],
    )

    # `results` contain the detailed results for each test case
    # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
    results: dict[int, list[tuple[int, dict[str, Any]]]]

    instance_details: list[dict[str, Any]] = []
    for i in range(len(lm_outputs)):
        first_result = results[i][0]  # we assume only one candidate code per instance, so we take the first result
        _, detail_result = first_result  # the first element is just the index so we ignore it
        # remove unnecessary fields to save space
        detail_result.pop("completion_id")
        detail_result.pop("task_id")
        instance_details.append(detail_result)

    return MetricResult(pass_at_k, instance_details=instance_details)

CommonPrefixLength

A metric that calculates the length of the longest common prefix between the model output and the reference.

Examples:

>>> from flexeval import CommonPrefixLength
>>> common_prefix_length = CommonPrefixLength()
>>> lm_outputs = ["ABCDEFG"]
>>> references_list = [["ABCdefg"]]
>>> result = common_prefix_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
    instance_details=[{"common_prefix_length": 3}],
)
Source code in flexeval/core/metric/common_prefix_length.py
class CommonPrefixLength(Metric):
    """
    A metric that calculates the length of the longest common prefix between the model output and the reference.

    Examples:
        >>> from flexeval import CommonPrefixLength
        >>> common_prefix_length = CommonPrefixLength()
        >>> lm_outputs = ["ABCDEFG"]
        >>> references_list = [["ABCdefg"]]
        >>> result = common_prefix_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
            instance_details=[{"common_prefix_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        common_prefix_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
            common_prefix_length_list.append(common_prefix_length)

        return MetricResult(
            {
                "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
                "longest_common_prefix_length": max(common_prefix_length_list),
            },
            instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_prefix_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    common_prefix_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
        common_prefix_length_list.append(common_prefix_length)

    return MetricResult(
        {
            "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
            "longest_common_prefix_length": max(common_prefix_length_list),
        },
        instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
    )

CommonStringLength

A metric that calculates the length of the longest common substring between the model output and the reference.

Examples:

>>> from flexeval import CommonStringLength
>>> common_string_length = CommonStringLength()
>>> lm_outputs = ["aBCDEFG"]
>>> references_list = [["ABCDefg"]]
>>> result = common_string_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
    instance_details=[{"common_string_length": 3}],
)
Source code in flexeval/core/metric/common_string_length.py
class CommonStringLength(Metric):
    """
    A metric that calculates the length of the longest common substring between the model output and the reference.

    Examples:
        >>> from flexeval import CommonStringLength
        >>> common_string_length = CommonStringLength()
        >>> lm_outputs = ["aBCDEFG"]
        >>> references_list = [["ABCDefg"]]
        >>> result = common_string_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
            instance_details=[{"common_string_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        common_string_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
            common_string_length_list.append(common_string_length)

        return MetricResult(
            {
                "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
                "longest_common_string_length": max(common_string_length_list),
            },
            instance_details=[{"common_string_length": s} for s in common_string_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_string_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    common_string_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
        common_string_length_list.append(common_string_length)

    return MetricResult(
        {
            "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
            "longest_common_string_length": max(common_string_length_list),
        },
        instance_details=[{"common_string_length": s} for s in common_string_length_list],
    )

Correlation

Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients. The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

Parameters:

  • method (Literal['pearson', 'spearman', 'kendall'], default: 'pearson' ) –

    The correlation method to use ('pearson', 'spearman', 'kendall').

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before computing the correlation. If a list is provided, the processors will be applied in order.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the references before computing the correlation. If a list is provided, the processors will be applied in order.

Examples:

>>> from flexeval import Correlation
>>> correlation = Correlation(method='pearson')
>>> lm_outputs = ["1", "2", "3", "4", "5"]
>>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
>>> result = correlation.evaluate(lm_outputs, references)
>>> print(result)
MetricResult(
    summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
    instance_details=[],
)
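
A small sketch of a fallback visible in the source listing below: model outputs that cannot be parsed as floats are scored as 0.0 (with a warning), which can noticeably distort the coefficient:

from flexeval import Correlation

correlation = Correlation(method="spearman")
lm_outputs = ["1", "2", "not a number", "4", "5"]
references_list = [["1"], ["2"], ["3"], ["4"], ["5"]]
result = correlation.evaluate(lm_outputs, references_list)
# "not a number" is replaced by 0.0 before Spearman's rho is computed,
# so the reported correlation drops below the otherwise perfect 1.0.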
Source code in flexeval/core/metric/correlation.py
class Correlation(Metric):
    """
    Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients.
    The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

    Args:
        method: The correlation method to use ('pearson', 'spearman', 'kendall').
        lm_output_processor: StringProcessor or a list of StringProcessor to be applied to the model outputs before
            computing the correlation. If a list is provided, the processors will be applied in order.
        reference_processor: StringProcessor or a list of StringProcessor to be applied to the references before
            computing the correlation. If a list is provided, the processors will be applied in order.

    Examples:
        >>> from flexeval import Correlation
        >>> correlation = Correlation(method='pearson')
        >>> lm_outputs = ["1", "2", "3", "4", "5"]
        >>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
        >>> result = correlation.evaluate(lm_outputs, references)
        >>> print(result)
        MetricResult(
            summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
            instance_details=[],
        )
    """

    def __init__(
        self,
        method: Literal["pearson", "spearman", "kendall"] = "pearson",
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        if method not in {"pearson", "spearman", "kendall"}:
            msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
            raise ValueError(msg)
        self.method = method

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only use the first reference here
        references = [refs[0] for refs in references_list]
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references = [apply_string_processors(ref, self.reference_processors) for ref in references]

        # Convert to numeric values
        lm_outputs_as_float: list[float] = []
        for output in lm_outputs:
            try:
                lm_outputs_as_float.append(float(output))
            except ValueError:  # noqa:PERF203
                warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
                lm_outputs_as_float.append(0.0)

        references_as_float = [float(ref) for ref in references]

        # Compute metrics
        if self.method == "pearson":
            correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
        elif self.method == "spearman":
            correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
        elif self.method == "kendall":
            correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
        else:
            msg = f"Unsupported method: {self.method}"
            raise ValueError(msg)

        return MetricResult(
            {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
            instance_details=[],
        )

method instance-attribute

method = method

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    method: Literal[
        "pearson", "spearman", "kendall"
    ] = "pearson",
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/correlation.py
def __init__(
    self,
    method: Literal["pearson", "spearman", "kendall"] = "pearson",
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    if method not in {"pearson", "spearman", "kendall"}:
        msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
        raise ValueError(msg)
    self.method = method

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/correlation.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only use the first reference here
    references = [refs[0] for refs in references_list]
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references = [apply_string_processors(ref, self.reference_processors) for ref in references]

    # Convert to numeric values
    lm_outputs_as_float: list[float] = []
    for output in lm_outputs:
        try:
            lm_outputs_as_float.append(float(output))
        except ValueError:  # noqa:PERF203
            warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
            lm_outputs_as_float.append(0.0)

    references_as_float = [float(ref) for ref in references]

    # Compute metrics
    if self.method == "pearson":
        correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
    elif self.method == "spearman":
        correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
    elif self.method == "kendall":
        correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
    else:
        msg = f"Unsupported method: {self.method}"
        raise ValueError(msg)

    return MetricResult(
        {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
        instance_details=[],
    )

ExactMatch

Exact match metric. If there are multiple references, the output is considered correct if it matches any of the references.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import ExactMatch
>>> exact_match = ExactMatch()
>>> lm_outputs = ["ABC", "DEF"]
>>> references_list = [["ABC"], ["DEFG"]]
>>> result = exact_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"exact_match": 0.5},
    instance_details=[{"exact_match": True}, {"exact_match": False}],
)
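
A short sketch of the multiple-reference behavior described above: an output counts as correct if it matches any reference for that instance:

from flexeval import ExactMatch

exact_match = ExactMatch()
lm_outputs = ["DEF"]
references_list = [["ABC", "DEF", "GHI"]]  # any single match counts as correct
result = exact_match.evaluate(lm_outputs, references_list)
# summary == {"exact_match": 1.0}, instance_details == [{"exact_match": True}]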
Source code in flexeval/core/metric/exact_match.py
class ExactMatch(Metric):
    """
    Exact match metric.
    If there are multiple references, the output is considered correct if it matches any of the references.

    Args:
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ExactMatch
        >>> exact_match = ExactMatch()
        >>> lm_outputs = ["ABC", "DEF"]
        >>> references_list = [["ABC"], ["DEFG"]]
        >>> result = exact_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"exact_match": 0.5},
            instance_details=[{"exact_match": True}, {"exact_match": False}],
        )
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        exact_match_list = [
            lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
        ]
        summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"exact_match/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"exact_match": s} for s in exact_match_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/exact_match.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/exact_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    exact_match_list = [
        lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
    ]
    summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"exact_match/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"exact_match": s} for s in exact_match_list],
    )

ChatLLMGEvalScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response. Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and calculates a weighted score over them. Note that due to a constraint of the OpenAI API, the number of valid scores must not exceed 20.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • prob_threshold (float, default: 0 ) –

    A guard against low-probability predictions: the instance score is None (invalid) if the total probability assigned to all valid scores is less than this value.

Examples:

>>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.1509989197179789,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.06977498531341553,
                '2': -3.687819004058838,
                '3': -3.937819480895996,
                '4': -5.812800884246826,
                '5': -3.937807083129883
            },
            'llm_geval_score_generation_probs': {
                1: 0.932603645815178,
                2: 0.02502652531327666,
                3: 0.01949066821765914,
                4: 0.002989046364034347,
                5: 0.019490909859903
            }
        },
        {
            'llm_geval_score': 1.208961908628065,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.13043057918548584,
                '2': -2.8754935264587402,
                '3': -3.000467538833618,
                '4': -4.750283241271973,
                '5': -5.000345706939697
            },
            'llm_geval_score_generation_probs': {
                1: 0.8777174226922144,
                2: 0.05638830351569556,
                3: 0.04976379642068341,
                4: 0.008649245032977617,
                5: 0.006735618046639277
            }
        }
    ])
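
A minimal sketch of the weighted-average scoring described above, using the logprobs from the first instance of the example; the normalization by the total probability mass over valid labels is inferred from the reported scores and from the role of prob_threshold:

import math

# Illustrative logprobs over the valid labels "1".."5" (taken from the example above).
logprobs = {
    "1": -0.06977498531341553,
    "2": -3.687819004058838,
    "3": -3.937819480895996,
    "4": -5.812800884246826,
    "5": -3.937807083129883,
}
probs = {int(label): math.exp(lp) for label, lp in logprobs.items()}
total = sum(probs.values())  # this mass is compared against prob_threshold
score = sum(s * p for s, p in probs.items()) / total
print(round(score, 4))  # ~1.151, matching 'llm_geval_score' above
# If `total` fell below `prob_threshold`, the instance score would be None instead.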
Source code in flexeval/core/metric/llm_geval_score.py
class ChatLLMGEvalScore(Metric):
    """A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.
    Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and
    calculates a weighted score over them.
    Note that due to a constraint of the OpenAI API, the number of valid scores must not exceed 20.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        prob_threshold: A guard against low-probability predictions: the instance score is None (invalid)
            if the total probability assigned to all valid scores is less than this value.


    Examples:
        >>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.1509989197179789,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.06977498531341553,
                        '2': -3.687819004058838,
                        '3': -3.937819480895996,
                        '4': -5.812800884246826,
                        '5': -3.937807083129883
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.932603645815178,
                        2: 0.02502652531327666,
                        3: 0.01949066821765914,
                        4: 0.002989046364034347,
                        5: 0.019490909859903
                    }
                },
                {
                    'llm_geval_score': 1.208961908628065,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.13043057918548584,
                        '2': -2.8754935264587402,
                        '3': -3.000467538833618,
                        '4': -4.750283241271973,
                        '5': -5.000345706939697
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.8777174226922144,
                        2: 0.05638830351569556,
                        3: 0.04976379642068341,
                        4: 0.008649245032977617,
                        5: 0.006735618046639277
                    }
                }
            ])
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        system_message: str | PromptTemplate | None = None,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.system_message = system_message
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

system_message instance-attribute

system_message = system_message

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.system_message = system_message
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMGEvalScore

Let LanguageModel evaluate the output of another LanguageModel. Unlike LLMScore, this metric lets the model output logprobs for all valid scores and calculates a probability-weighted score over them. Note that, due to a constraint of OpenAI models, the number of valid scores must not exceed 20. For details, see https://aclanthology.org/2023.emnlp-main.153/

You can specify the evaluation criteria in PromptTemplate.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • prob_threshold (float, default: 0 ) –

    To guard against cases where all valid scores have low probability: return None (invalid) if the total probability mass over the valid scores is less than this value (see the sketch below).
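
The summary value is a probability-weighted average: the logprob of each valid score is exponentiated into a probability, the weighted mean over the valid range is taken, and prob_threshold rejects instances whose total probability mass is too small. The following is a minimal sketch of that computation (illustrative only, not the library's internal calculate_weighted_average; weighted_score is a hypothetical helper), reusing the logprob values from the example below.

import math

# Illustrative sketch, not the library's internal implementation.
def weighted_score(logprobs, valid_score_range, prob_threshold=0.0):
    low, high = valid_score_range
    # Turn the logprob of each valid score into a probability.
    probs = {s: math.exp(logprobs[str(s)]) for s in range(low, high + 1) if str(s) in logprobs}
    total = sum(probs.values())
    if total <= 0 or total < prob_threshold:
        return None, probs  # treated as a failed score parse
    # Probability-weighted average over the valid scores.
    return sum(s * p for s, p in probs.items()) / total, probs

logprobs = {'1': -4.0625, '2': -7.75, '3': -8.25, '4': -8.0625, '5': -6.4375}
score, probs = weighted_score(logprobs, (1, 5))
print(round(score, 6))  # 1.418921, matching the first instance in the example below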

Examples:

>>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.418920817254956,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.0625,
                '2': -7.75,
                '3': -8.25,
                '4': -8.0625,
                '5': -6.4375
            },
            'llm_geval_score_generation_probs': {
                1: 0.017205950425851383,
                2: 0.00043074254057568753,
                3: 0.00026125855730166754,
                4: 0.000315137974737356,
                5: 0.0016004026902445643
            }
        },
        {
            'llm_geval_score': 1.461075369003141,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.25,
                '2': -8.1875,
                '3': -8.375,
                '4': -8.125,
                '5': -6.5
            },
            'llm_geval_score_generation_probs': {
                1: 0.014264233908999256,
                2: 0.00027810828659249914,
                3: 0.00023055986759244163,
                4: 0.0002960447300568554,
                5: 0.0015034391929775724
            }
        }
    ]
)
Source code in flexeval/core/metric/llm_geval_score.py
class LLMGEvalScore(Metric):
    """Let LanguageModel evaluate the output of another LanguageModel.
    Unlike LLMScore, this metric lets the model output logprobs for all valid scores and
    calculates a probability-weighted score over them.
    Note that, due to a constraint of OpenAI models, the number of valid scores must not exceed 20.
    For details, see https://aclanthology.org/2023.emnlp-main.153/

    You can specify the evaluation criteria in `PromptTemplate`.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        prob_threshold: To guard against cases where all valid scores have low probability,
            return None (invalid) if the total probability mass over the valid scores is less than this value.

    Examples:
        >>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.418920817254956,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.0625,
                        '2': -7.75,
                        '3': -8.25,
                        '4': -8.0625,
                        '5': -6.4375
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.017205950425851383,
                        2: 0.00043074254057568753,
                        3: 0.00026125855730166754,
                        4: 0.000315137974737356,
                        5: 0.0016004026902445643
                    }
                },
                {
                    'llm_geval_score': 1.461075369003141,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.25,
                        '2': -8.1875,
                        '3': -8.375,
                        '4': -8.125,
                        '5': -6.5
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.014264233908999256,
                        2: 0.00027810828659249914,
                        3: 0.00023055986759244163,
                        4: 0.0002960447300568554,
                        5: 0.0015034391929775724
                    }
                }
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMLabel

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points for each label specified in label_names.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.
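
Because this metric targets chat models, the evaluator receives its input as a list of chat messages rather than plain text: the rendered prompt becomes the user turn, preceded by system_message when one is given. Below is a rough illustration of that input shape (assumed behaviour, not the library's prepare_chat_input_for_evaluator).

from jinja2 import Template

# Hypothetical illustration of the chat-format evaluator input.
template = Template(
    "Evaluate the quality of this text on a scale of Good/Bad.\n"
    "`{{ lm_output }}`\nPut the label at the end like [[Good]]."
)
system_message = "This is the system message."
lm_output = "Hello, world!"

messages = []
if system_message is not None:
    messages.append({"role": "system", "content": system_message})
messages.append({"role": "user", "content": template.render(lm_output=lm_output)})
print(messages)  # [{'role': 'system', ...}, {'role': 'user', ...}]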

Examples:

>>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class ChatLLMLabel(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )

        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_label_list: list[str] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            extra_info_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )

    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_label_list: list[str] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        extra_info_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMLabel

Let LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last label value found in the output of the evaluator is used to compute the evaluation score. You can assign a score to each label. The final output is the average score and the distribution of the labels.
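
In other words, the evaluator may explain its reasoning freely as long as one of the label names appears in its reply; the last occurrence wins and is mapped to its point value. A small sketch of that parsing and scoring logic (illustrative only, not the library's parse_label_from_evaluator_output):

import re

label_names = ["Good", "Bad"]
label_points = {"Good": 1.0, "Bad": 0.0}

def last_label(evaluator_output):
    # Find every occurrence of a valid label and keep the last one.
    matches = re.findall("|".join(re.escape(name) for name in label_names), evaluator_output)
    return matches[-1] if matches else None

evaluator_outputs = [
    "This text is natural, so I label it [[Good]]",
    "This text contains a spelling error, so I label it [[Bad]]",
]
labels = [last_label(text) for text in evaluator_outputs]
scores = [label_points[label] for label in labels if label is not None]
print(labels)                     # ['Good', 'Bad']
print(sum(scores) / len(scores))  # 0.5, the 'llm_score' summary value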

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points for each label specified in label_names.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
>>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class LLMLabel(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last label value found in the output of the evaluator is used to compute the evaluation score.
    You can assign a score to each label.
    The final output is the average score and the distribution of the labels.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
        >>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_label_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            extra_info_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_label_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        extra_info_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.
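
The evaluator is expected to wrap its verdict like [[4]]; a score is parsed from the reply, discarded if it falls outside valid_score_range, and the remaining scores are averaged (per category as well, when category_key is set). Below is a minimal parsing sketch (assumed behaviour, not the library's exact parser).

import re

valid_score_range = (1, 5)

def parse_score(evaluator_output):
    # Take the last [[N]] pattern in the reply and keep it only if it is in range.
    matches = re.findall(r"\[\[(\d+)\]\]", evaluator_output)
    if not matches:
        return None
    score = int(matches[-1])
    low, high = valid_score_range
    return score if low <= score <= high else None

evaluator_outputs = [
    "This text is very simple, ... Therefore, its quality is average. [[2]]",
    "... Overall, the quality of the text is good but basic. [[4]]",
]
scores = [parse_score(text) for text in evaluator_outputs]
valid_scores = [s for s in scores if s is not None]
print(sum(valid_scores) / len(valid_scores))  # 3.0, matching 'llm_score' in the example below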

Examples:

>>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
Source code in flexeval/core/metric/llm_score.py
class ChatLLMScore(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_score_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_score_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMScore

Let a LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last integer value in the output of the evaluator is used as the evaluation score.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMScore(language_model, prompt_template)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
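
As noted above, the last integer in the evaluator's reply is taken as the score. The snippet below is a minimal re-implementation of that idea for illustration only; the library's own parse_score_from_evaluator_output may differ in details such as how bracketed scores or negative numbers are handled.

from __future__ import annotations

import re

def last_integer(text: str, valid_score_range: tuple[int, int] | None = None) -> int | None:
    """Illustrative only: return the last integer in `text`, or None if absent or out of range."""
    matches = re.findall(r"-?\d+", text)
    if not matches:
        return None
    score = int(matches[-1])
    if valid_score_range is not None and not (valid_score_range[0] <= score <= valid_score_range[1]):
        return None
    return score

print(last_integer("Overall, the quality is good but basic. [[4]]"))  # 4
print(last_integer("I would give it an 11 out of 10.", valid_score_range=(1, 5)))  # None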
Source code in flexeval/core/metric/llm_score.py
class LLMScore(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last integer value in the output of the evaluator is used as the evaluation score.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMScore(language_model, prompt_template)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_score_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_score_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

OutputLengthStats

Compute statistics on the length of the outputs.

Examples:

>>> from flexeval import OutputLengthStats
>>> output_length_stats = OutputLengthStats()
>>> lm_outputs = ["123456", "123456789"]
>>> result = output_length_stats.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
    instance_details=[{'output_length': 6}, {'output_length': 9}]
)
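
Lengths here are plain character counts (len of each output string). Since the per-instance details carry the same numbers, they can be used directly to flag unusually long generations, as in this small sketch:

from flexeval import OutputLengthStats

stats = OutputLengthStats()
result = stats.evaluate(["short answer", "a" * 2000])

# Flag outputs longer than some character budget (1000 here is an arbitrary choice).
too_long = [i for i, d in enumerate(result.instance_details) if d["output_length"] > 1000]
print(result.summary["avg_output_length"], too_long)  # 1006.0 [1]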
Source code in flexeval/core/metric/output_length_stats.py
class OutputLengthStats(Metric):
    """
    Compute statistics on the length of the outputs.

    Examples:
        >>> from flexeval import OutputLengthStats
        >>> output_length_stats = OutputLengthStats()
        >>> lm_outputs = ["123456", "123456789"]
        >>> result = output_length_stats.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
            instance_details=[{'output_length': 6}, {'output_length': 9}]
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Compute metrics
        output_length_list = [len(output) for output in lm_outputs]
        return MetricResult(
            {
                "avg_output_length": sum(output_length_list) / len(output_length_list),
                "max_output_length": max(output_length_list),
                "min_output_length": min(output_length_list),
            },
            instance_details=[{"output_length": s} for s in output_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/output_length_stats.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    # Compute metrics
    output_length_list = [len(output) for output in lm_outputs]
    return MetricResult(
        {
            "avg_output_length": sum(output_length_list) / len(output_length_list),
            "max_output_length": max(output_length_list),
            "min_output_length": min(output_length_list),
        },
        instance_details=[{"output_length": s} for s in output_length_list],
    )

PerspectiveAPI

A metric that evaluates text outputs using the Perspective API. Please set the PERSPECTIVE_API_KEY environment variable.

Parameters:

  • languages (list[str]) –

    A list of languages to analyze.

Examples:

>>> from flexeval import PerspectiveAPI
>>> perspective_api = PerspectiveAPI(languages=["en"])
>>> lm_outputs = ["I love you", "I hate you"]
>>> result = perspective_api.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
    instance_details=[
        {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
        {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
        ]
    )
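
The API key is read from the PERSPECTIVE_API_KEY environment variable. Exactly when the package reads it depends on its internals, so the safest option is to export the variable in the shell before starting Python; the sketch below shows the in-process alternative of setting it before flexeval is imported.

import os

# Must be set before flexeval reads it; exporting it in the shell beforehand is safer.
os.environ["PERSPECTIVE_API_KEY"] = "<your-perspective-api-key>"

from flexeval import PerspectiveAPI

perspective_api = PerspectiveAPI(languages=["en"])
result = perspective_api.evaluate(["I love you"])
print(result.summary["TOXICITY"])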
Source code in flexeval/core/metric/perspective_api.py
class PerspectiveAPI(Metric):
    """A metric that evaluates text outputs using the Perspective API.
    Please set the `PERSPECTIVE_API_KEY` environment variable.

    Args:
        languages: A list of languages to analyze.

    Examples:
        >>> from flexeval import PerspectiveAPI
        >>> perspective_api = PerspectiveAPI(languages=["en"])
        >>> lm_outputs = ["I love you", "I hate you"]
        >>> result = perspective_api.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
            instance_details=[
                {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
                {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
                ]
            )
    """

    def __init__(self, languages: list[str]) -> None:
        self.client = discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=PERSPECTIVE_API_KEY,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        )
        self.languages = languages
        self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Compute metrics
        instance_details = []
        for lm_output in lm_outputs:
            if lm_output == "":
                instance_details.append({att: 0.0 for att in self.attributes})
                continue
            analyze_request = {
                "comment": {"text": lm_output},
                "languages": self.languages,
                "requestedAttributes": {att: {} for att in self.attributes},
            }
            response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
            instance_details.append(
                {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
            )
        scores_for_attribute = {att: [] for att in self.attributes}
        for instance in instance_details:
            for att in self.attributes:
                scores_for_attribute[att].append(instance[att])
        average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
        return MetricResult(average_scores, instance_details=instance_details)

client instance-attribute

client = build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=PERSPECTIVE_API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

languages instance-attribute

languages = languages

attributes instance-attribute

attributes = [
    "TOXICITY",
    "SEVERE_TOXICITY",
    "IDENTITY_ATTACK",
    "INSULT",
    "PROFANITY",
    "THREAT",
]

__init__

__init__(languages: list[str]) -> None
Source code in flexeval/core/metric/perspective_api.py
def __init__(self, languages: list[str]) -> None:
    self.client = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=PERSPECTIVE_API_KEY,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
    self.languages = languages
    self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/perspective_api.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    # Compute metrics
    instance_details = []
    for lm_output in lm_outputs:
        if lm_output == "":
            instance_details.append({att: 0.0 for att in self.attributes})
            continue
        analyze_request = {
            "comment": {"text": lm_output},
            "languages": self.languages,
            "requestedAttributes": {att: {} for att in self.attributes},
        }
        response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
        instance_details.append(
            {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
        )
    scores_for_attribute = {att: [] for att in self.attributes}
    for instance in instance_details:
        for att in self.attributes:
            scores_for_attribute[att].append(instance[att])
    average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
    return MetricResult(average_scores, instance_details=instance_details)

RepetitionCount

A metric that counts the number of repetitions of the most repeated pattern in the model's output.

Parameters:

  • count_threshold (int, default: 30 ) –

    The minimum repetition count at which an output is flagged as a repetition.

  • threshold_length (int, default: 10 ) –

    The length of the character pattern whose repetitions are counted.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessors to apply to the model outputs before analysis.

Examples:

>>> from flexeval import RepetitionCount
>>> repetition_count = RepetitionCount()
>>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
>>> references_list = [[]]  # Not used for this metric
>>> result = repetition_count.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'repetition_ratio': 1.0},
    instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
)
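
Conceptually, the metric finds the fixed-length character pattern that recurs most often in each output and flags the output once that count reaches count_threshold. The snippet below re-implements that idea with a Counter purely for illustration; the actual get_most_repeated_pattern helper may count occurrences differently.

from collections import Counter

def most_repeated_pattern(text: str, pattern_length: int = 10) -> tuple[str, int]:
    """Illustrative stand-in: the most frequent substring of `pattern_length` characters."""
    counts = Counter(text[i : i + pattern_length] for i in range(len(text) - pattern_length + 1))
    if not counts:
        return "", 0
    pattern, count = counts.most_common(1)[0]
    return pattern, count

pattern, count = most_repeated_pattern("spam " * 50)
print(repr(pattern), count, count >= 30)  # flagged at the default count_threshold of 30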
Source code in flexeval/core/metric/repetition_count.py
class RepetitionCount(Metric):
    """
    A metric that counts the number of repetitions of the most repeated pattern in the model's output.

    Args:
        lm_output_processor: StringProcessor or a list of StringProcessors to apply to the model outputs before analysis.

    Examples:
        >>> from flexeval import RepetitionCount
        >>> repetition_count = RepetitionCount()
        >>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
        >>> references_list = [[]]  # Not used for this metric
        >>> result = repetition_count.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'repetition_ratio': 1.0},
            instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
        )
    """

    def __init__(
        self,
        count_threshold: int = 30,
        threshold_length: int = 10,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        self.count_threshold = count_threshold
        self.threshold_length = threshold_length
        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],  # Not used in this metric
        extra_info_list: list[dict[str, str]] | None = None,  # Not used in this metric
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)
        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

        # Compute metrics
        repetition_details: list[dict[str, Any]] = []
        num_repetitions = 0
        for output in lm_outputs:
            most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
            is_repetition = count >= self.count_threshold
            repetition_details.append(
                {
                    "most_repeated_pattern": most_repeated_pattern,
                    "repetition_count": count,
                    "is_repetition": is_repetition,
                }
            )
            num_repetitions += int(is_repetition)

        repetition_rate = num_repetitions / len(lm_outputs)

        return MetricResult(
            summary={"repetition_ratio": repetition_rate},
            instance_details=repetition_details,
        )

count_threshold instance-attribute

count_threshold = count_threshold

threshold_length instance-attribute

threshold_length = threshold_length

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/repetition_count.py
def __init__(
    self,
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    self.count_threshold = count_threshold
    self.threshold_length = threshold_length
    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/repetition_count.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],  # Not used in this metric
    extra_info_list: list[dict[str, str]] | None = None,  # Not used in this metric
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)
    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

    # Compute metrics
    repetition_details: list[dict[str, Any]] = []
    num_repetitions = 0
    for output in lm_outputs:
        most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
        is_repetition = count >= self.count_threshold
        repetition_details.append(
            {
                "most_repeated_pattern": most_repeated_pattern,
                "repetition_count": count,
                "is_repetition": is_repetition,
            }
        )
        num_repetitions += int(is_repetition)

    repetition_rate = num_repetitions / len(lm_outputs)

    return MetricResult(
        summary={"repetition_ratio": repetition_rate},
        instance_details=repetition_details,
    )

ROUGE

An implementation of ROUGE.

The calculation is based on the rouge library.

Parameters:

  • tokenizer (Tokenizer) –

    An instance of Tokenizer to tokenize the input and output strings.

Examples:

>>> from flexeval import ROUGE
>>> from flexeval import WhitespaceTokenizer
>>> tokenizer = WhitespaceTokenizer()
>>> rouge = ROUGE(tokenizer)
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = rouge.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
    instance_details=[
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
    ]
)
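
Note that, as the source below shows, only the first reference of each instance is scored. Within ROUGE the tokenizer is used only through its tokenize method, so for outputs that are not whitespace-delimited a character-level tokenizer can be duck-typed in, as in this sketch (a proper setup would subclass flexeval's Tokenizer):

from flexeval import ROUGE

class CharacterTokenizer:
    """Duck-typed stand-in for a flexeval Tokenizer: splits a string into characters."""

    def tokenize(self, text: str) -> list[str]:
        return list(text)

rouge = ROUGE(CharacterTokenizer())
result = rouge.evaluate(["abcd"], [["abce", "this second reference is ignored"]])
print(result.summary)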
Source code in flexeval/core/metric/rouge.py
class ROUGE(Metric):
    """An implementation of [ROUGE](https://aclanthology.org/W04-1013/).

    The calculation is based on the [rouge](https://github.com/pltrdy/rouge) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import ROUGE
        >>> from flexeval import WhitespaceTokenizer
        >>> tokenizer = WhitespaceTokenizer()
        >>> rouge = ROUGE(tokenizer)
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = rouge.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
            instance_details=[
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
            ]
        )
    """

    def __init__(self, tokenizer: Tokenizer) -> None:
        self._tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only need the first reference
        target_summaries = [references[0] for references in references_list]

        tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_target_summaries = [
            " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
        ]

        # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
        tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

        # Compute metrics
        rouge = RougeCalculator()
        score_outputs = rouge.get_scores(
            tokenized_lm_outputs,
            tokenized_target_summaries,
        )

        rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
        rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
        rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

        # we only need the f1 score
        return MetricResult(
            {
                "rouge1": sum(rouge1_list) / len(rouge1_list),
                "rouge2": sum(rouge2_list) / len(rouge2_list),
                "rougeL": sum(rouge_l_list) / len(rouge_l_list),
            },
            instance_details=[
                {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
            ],
        )

__init__

__init__(tokenizer: Tokenizer) -> None
Source code in flexeval/core/metric/rouge.py
def __init__(self, tokenizer: Tokenizer) -> None:
    self._tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/rouge.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only need the first reference
    target_summaries = [references[0] for references in references_list]

    tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
    tokenized_target_summaries = [
        " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
    ]

    # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
    tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

    # Compute metrics
    rouge = RougeCalculator()
    score_outputs = rouge.get_scores(
        tokenized_lm_outputs,
        tokenized_target_summaries,
    )

    rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
    rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
    rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

    # we only need the f1 score
    return MetricResult(
        {
            "rouge1": sum(rouge1_list) / len(rouge1_list),
            "rouge2": sum(rouge2_list) / len(rouge2_list),
            "rougeL": sum(rouge_l_list) / len(rouge_l_list),
        },
        instance_details=[
            {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
        ],
    )

SARI

An implementation of SARI, a metric for evaluating text simplification.

Based on the original implementation [1], modified to allow configurable settings for the maximum n-gram size and tokenizer. Additionally, it fixes a bug present in the original implementation [2]. When used with the default parameters, it produces scores that are consistent with the HuggingFace/evaluate implementation [3].

[1] https://github.com/cocoxu/simplification/blob/master/SARI.py
[2] https://github.com/cocoxu/simplification/issues/6
[3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py

Parameters:

  • tokenizer (Tokenizer | Literal['default'], default: 'default' ) –

    An instance of Tokenizer to tokenize the input and output strings.

  • max_ngrams (int, default: 4 ) –

    The maximum n-gram order to consider. Defaults to 4.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • source_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

Examples:

>>> from flexeval import SARI
>>> sari_scorer = SARI(source_key="source")
>>> lm_outputs = ["About 95 you now get in."]
>>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
>>> extra_info_list = [{"source": "About 95 species are currently accepted."}]
>>> result = sari_scorer.evaluate(lm_outputs, references_list, extra_info_list)
>>> print(result)
MetricResult(
    summary={
        'sari_score': 0.2695360195360195,
        'sari_add': 0.08333333333333333,
        'sari_keep': 0.22527472527472525,
        'sari_del': 0.5
    },
    instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
)
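
As the implementation below shows, the overall sari_score is simply the arithmetic mean of the add F1, keep F1, and delete precision components (each already averaged over n-gram orders 1 through max_ngrams). The numbers from the example above confirm this:

sari_add, sari_keep, sari_del = 0.08333333333333333, 0.22527472527472525, 0.5
sari_score = (sari_add + sari_keep + sari_del) / 3
print(sari_score)  # ~0.26953601..., matching 'sari_score' in the summary above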
Source code in flexeval/core/metric/sari.py
class SARI(Metric):
    """An implementation of SARI, a metric for evaluating text simplification.

    Based on the original implementation [1], modified to allow configurable settings
    for the maximum n-gram size and tokenizer.
    Additionally, it fixes a bug present in the original implementation [2].
    When used with the default parameters, it produces scores that are
    consistent with the HuggingFace/evaluate implementation [3].

    [1] https://github.com/cocoxu/simplification/blob/master/SARI.py
    [2] https://github.com/cocoxu/simplification/issues/6
    [3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.
        max_ngrams: The maximum n-gram order to consider. Defaults to `4`.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        source_processor: StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

    Examples:
        >>> from flexeval import SARI
        >>> sari_scorer = SARI(source_key="source")
        >>> lm_outputs = ["About 95 you now get in."]
        >>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
        >>> extra_info_list = [{"source": "About 95 species are currently accepted."}]
        >>> result = sari_scorer.evaluate(lm_outputs, references_list, extra_info_list)
        >>> print(result)
        MetricResult(
            summary={
                'sari_score': 0.2695360195360195,
                'sari_add': 0.08333333333333333,
                'sari_keep': 0.22527472527472525,
                'sari_del': 0.5
            },
            instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
        )
    """  # noqa: E501

    def __init__(
        self,
        source_key: str,
        tokenizer: Tokenizer | Literal["default"] = "default",
        max_ngrams: int = 4,
        category_key: str | None = None,
        source_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
        reference_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    ) -> None:
        if tokenizer == "default":
            tokenizer = SacreBleuTokenizer("13a")
        self._tokenizer = tokenizer
        self.source_key = source_key
        self.max_ngrams = max_ngrams
        self.category_key = category_key

        self.source_processors = source_processor
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(self, lm_outputs, references_list, extra_info_list=None) -> MetricResult:  # noqa: ANN001
        validate_inputs(lm_outputs, references_list, extra_info_list)

        if extra_info_list is None:
            msg = "SARI requires extra_info_list"
            raise ValueError(msg)
        sources = [extra_info[self.source_key] for extra_info in extra_info_list]

        # Normalize text data
        sources = [apply_string_processors(src, self.source_processors) for src in sources]
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        sari_instance_list = [
            self._calc_sentence_sari(source, lm_output, references)
            for source, lm_output, references in zip(sources, lm_outputs, references_list)
        ]

        metric_name2scores = {
            name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
        }

        num_instances = len(sari_instance_list)
        summary = {
            metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
        }

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            for metric_name, score_list in metric_name2scores.items():
                category_wise_scores = aggregate_category_wise_scores(score_list, categories)
                for category, category_wise_score in category_wise_scores.items():
                    summary[f"{metric_name}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=sari_instance_list,
        )

    def _calc_sentence_sari(self, source: str, lm_output: str, references: list[str]) -> dict[str, float]:
        s_words = self._tokenizer.tokenize(source)
        c_words = self._tokenizer.tokenize(lm_output)
        r_words_list = [self._tokenizer.tokenize(reference) for reference in references]

        sari_score, sari_add, sari_keep, sari_del = 0.0, 0.0, 0.0, 0.0
        for n in range(1, self.max_ngrams + 1):
            s_ngrams = to_ngram(s_words, n)
            c_ngrams = to_ngram(c_words, n)
            r_ngrams_list = [to_ngram(r_words, n) for r_words in r_words_list]

            sari_n_score, sari_n_add, sari_n_keep, sari_n_del = self._sari_n(s_ngrams, c_ngrams, r_ngrams_list)
            sari_score += sari_n_score
            sari_add += sari_n_add
            sari_keep += sari_n_keep
            sari_del += sari_n_del

        sari_score /= self.max_ngrams
        sari_add /= self.max_ngrams
        sari_keep /= self.max_ngrams
        sari_del /= self.max_ngrams

        return {"sari_score": sari_score, "sari_add": sari_add, "sari_keep": sari_keep, "sari_del": sari_del}

    def _sari_n(
        self, s_grams: list[str], c_grams: list[str], r_grams_list: list[list[str]]
    ) -> tuple[float, float, float, float]:
        num_ref = len(r_grams_list)
        r_grams_all = [r_gram for r_grams in r_grams_list for r_gram in r_grams]
        r_gram_counter = Counter(r_grams_all)

        s_gram_counter = Counter(s_grams)
        c_gram_counter = Counter(c_grams)

        s_gram_rep = Counter({k: v * num_ref for k, v in s_gram_counter.items()})
        c_gram_rep = Counter({k: v * num_ref for k, v in c_gram_counter.items()})

        # ADD
        add_grams = set(c_gram_counter) - set(s_gram_counter)
        add_good = add_grams & set(r_gram_counter)
        add_all = set(r_gram_counter) - set(s_gram_counter)

        add_prec = len(add_good) / len(add_grams) if add_grams else 1
        add_recall = len(add_good) / len(add_all) if add_all else 1
        add_f1 = 2 * add_prec * add_recall / (add_prec + add_recall) if (add_prec + add_recall) > 0 else 0

        # KEEP
        keep_rep = s_gram_rep & c_gram_rep
        keep_good = keep_rep & r_gram_counter
        keep_all = s_gram_rep & r_gram_counter

        keep_prec = sum(keep_good[g] / keep_rep[g] for g in keep_good) / len(keep_rep) if keep_rep else 1
        keep_recall = sum(keep_good[g] for g in keep_good) / sum(keep_all.values()) if keep_all else 1
        keep_f1 = 2 * keep_prec * keep_recall / (keep_prec + keep_recall) if (keep_prec + keep_recall) > 0 else 0

        # DELETE
        del_rep = s_gram_rep - c_gram_rep
        del_good = del_rep - r_gram_counter

        del_prec = sum(del_good[g] / del_rep[g] for g in del_good) / len(del_rep) if del_rep else 1

        return (add_f1 + keep_f1 + del_prec) / 3, add_f1, keep_f1, del_prec

source_key instance-attribute

source_key = source_key

max_ngrams instance-attribute

max_ngrams = max_ngrams

category_key instance-attribute

category_key = category_key

source_processors instance-attribute

source_processors = source_processor

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
) -> None
Source code in flexeval/core/metric/sari.py
def __init__(
    self,
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    reference_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
) -> None:
    if tokenizer == "default":
        tokenizer = SacreBleuTokenizer("13a")
    self._tokenizer = tokenizer
    self.source_key = source_key
    self.max_ngrams = max_ngrams
    self.category_key = category_key

    self.source_processors = source_processor
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs, references_list, extra_info_list=None
) -> MetricResult
Source code in flexeval/core/metric/sari.py
def evaluate(self, lm_outputs, references_list, extra_info_list=None) -> MetricResult:  # noqa: ANN001
    validate_inputs(lm_outputs, references_list, extra_info_list)

    if extra_info_list is None:
        msg = "SARI requires extra_info_list"
        raise ValueError(msg)
    sources = [extra_info[self.source_key] for extra_info in extra_info_list]

    # Normalize text data
    sources = [apply_string_processors(src, self.source_processors) for src in sources]
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    sari_instance_list = [
        self._calc_sentence_sari(source, lm_output, references)
        for source, lm_output, references in zip(sources, lm_outputs, references_list)
    ]

    metric_name2scores = {
        name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
    }

    num_instances = len(sari_instance_list)
    summary = {
        metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
    }

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        for metric_name, score_list in metric_name2scores.items():
            category_wise_scores = aggregate_category_wise_scores(score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"{metric_name}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=sari_instance_list,
    )

SubstringMatch

A metric that calculates how many outputs contain any of the expected substrings.

Parameters:

  • mode (Literal['any', 'all'], default: 'any' ) –

    The mode to calculate the substring match.
    - "any": If any of the expected substrings are in the output, it is a match.
    - "all": If all of the expected substrings are in the output, it is a match.

  • category_key (str | None, default: None ) –

    Optional key to group scores by category from extra_info_list.

Examples:

>>> from flexeval import SubstringMatch
>>> substring_match = SubstringMatch()
>>> lm_outputs = ["This is a cat .", "This is a dog ."]
>>> references_list = [["cat", "dog"], ["mouse"]]
>>> result = substring_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'substring_match-any': 0.5},
    instance_details=[{'substring_match': True}, {'substring_match': False}]
)
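
With mode="all", every reference substring has to appear in the output, and the mode is reflected in the summary key (substring_match-all). A small sketch:

from flexeval import SubstringMatch

strict_match = SubstringMatch(mode="all")
lm_outputs = ["The cat chased the dog .", "Only a cat here ."]
references_list = [["cat", "dog"], ["cat", "dog"]]
result = strict_match.evaluate(lm_outputs, references_list)
print(result.summary)  # {'substring_match-all': 0.5}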
Source code in flexeval/core/metric/substring_match.py
class SubstringMatch(Metric):
    """
    A metric that calculates how many outputs contain any of the expected substrings.

    Args:
        mode: The mode to calculate the substring match.
            - "any": If any of the expected substrings are in the output, it is a match.
            - "all": If all of the expected substrings are in the output, it is a match.
        category_key: Optional key to group scores by category from extra_info_list.

    Examples:
        >>> from flexeval import SubstringMatch
        >>> substring_match = SubstringMatch()
        >>> lm_outputs = ["This is a cat .", "This is a dog ."]
        >>> references_list = [["cat", "dog"], ["mouse"]]
        >>> result = substring_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'substring_match-any': 0.5},
            instance_details=[{'substring_match': True}, {'substring_match': False}]
        )
    """

    def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
        self.mode = mode
        self.category_key = category_key
        if mode == "all":
            self.match_func = all
        elif mode == "any":
            self.match_func = any
        else:
            msg = f"mode must be 'any' or 'all', but got '{mode}'."
            raise ValueError(msg)

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        match_list = [
            self.match_func(substring in lm_output for substring in expected_output)
            for lm_output, expected_output in zip(lm_outputs, references_list)
        ]

        score = 0.0
        if len(match_list):
            score = sum(match_list) / len(match_list)

        summary = {f"substring_match-{self.mode}": score}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"substring_match": match} for match in match_list],
        )

mode instance-attribute

mode = mode

category_key instance-attribute

category_key = category_key

match_func instance-attribute

match_func = all

__init__

__init__(
    mode: Literal["any", "all"] = "any",
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/substring_match.py
def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
    self.mode = mode
    self.category_key = category_key
    if mode == "all":
        self.match_func = all
    elif mode == "any":
        self.match_func = any
    else:
        msg = f"mode must be 'any' or 'all', but got '{mode}'."
        raise ValueError(msg)

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/substring_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    match_list = [
        self.match_func(substring in lm_output for substring in expected_output)
        for lm_output, expected_output in zip(lm_outputs, references_list)
    ]

    score = 0.0
    if len(match_list):
        score = sum(match_list) / len(match_list)

    summary = {f"substring_match-{self.mode}": score}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"substring_match": match} for match in match_list],
    )
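
The category-wise values in the summary come from aggregate_category_wise_scores, a helper in flexeval.core.metric.utils whose implementation is not shown on this page. The sketch below is a rough stand-in under the assumption that it computes the mean score per category; it is illustrative, not the library's actual helper.

from collections import defaultdict

def mean_score_per_category(scores: list[float | bool], categories: list[str]) -> dict[str, float]:
    # Assumed behavior: group the per-instance scores by category and average them.
    grouped: dict[str, list[float]] = defaultdict(list)
    for score, category in zip(scores, categories):
        grouped[category].append(float(score))
    return {category: sum(values) / len(values) for category, values in grouped.items()}

# e.g. mean_score_per_category([True, False, True], ["qa", "qa", "chat"])
# -> {'qa': 0.5, 'chat': 1.0}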

XER

Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references. Only the first reference of each instance is used. The calculation is based on the jiwer library.

Parameters:

  • tokenizer (Tokenizer | None, default: None ) –

    An instance of Tokenizer used to tokenize the outputs and references before computing WER; CER is computed on the raw strings.

Examples:

>>> from flexeval import XER
>>> xer = XER()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
>>> result = xer.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
    instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}]
)
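
Since the implementation below delegates to jiwer's top-level cer and wer functions, the corpus-level numbers in the example can be reproduced with jiwer directly. A minimal sketch, using only the first reference of each instance to mirror the code below:

from jiwer import cer, wer

lm_outputs = ["I am a student .", "I am a teacher ."]
references = ["I am a student .", "Are you the student ?"]  # first reference per instance

print(cer(references, lm_outputs))  # corpus-level CER, ~0.432 as in the example above
print(wer(references, lm_outputs))  # corpus-level WER, 0.5 as in the example above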
Source code in flexeval/core/metric/xer.py
class XER(Metric):
    """
    Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references.
    The calculation is based on the [jiwer](https://github.com/jitsi/jiwer) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import XER
        >>> xer = XER()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
        >>> result = xer.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
            instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}]
        )
    """

    def __init__(self, tokenizer: Tokenizer | None = None) -> None:
        self.tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only need the first reference
        references = [references[0] for references in references_list]

        if self.tokenizer:
            tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
            tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
        else:
            tokenized_lm_outputs = lm_outputs
            tokenized_references = references

        # Compute metrics
        cer_score = cer(references, lm_outputs)
        wer_score = wer(tokenized_references, tokenized_lm_outputs)

        return MetricResult(
            {
                "cer_score": cer_score,
                "wer_score": wer_score,
            },
            instance_details=[
                {
                    "cer_score": cer(reference, lm_output),
                    "wer_score": wer(reference, lm_output),
                }
                for lm_output, reference in zip(lm_outputs, references)
            ],
        )

tokenizer instance-attribute

tokenizer = tokenizer

__init__

__init__(tokenizer: Tokenizer | None = None) -> None
Source code in flexeval/core/metric/xer.py
def __init__(self, tokenizer: Tokenizer | None = None) -> None:
    self.tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/xer.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only need the first reference
    references = [references[0] for references in references_list]

    if self.tokenizer:
        tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
    else:
        tokenized_lm_outputs = lm_outputs
        tokenized_references = references

    # Compute metrics
    cer_score = cer(references, lm_outputs)
    wer_score = wer(tokenized_references, tokenized_lm_outputs)

    return MetricResult(
        {
            "cer_score": cer_score,
            "wer_score": wer_score,
        },
        instance_details=[
            {
                "cer_score": cer(reference, lm_output),
                "wer_score": wer(reference, lm_output),
            }
            for lm_output, reference in zip(lm_outputs, references)
        ],
    )
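
The tokenizer is only used through its tokenize method in the code above. The sketch below shows a whitespace tokenizer that would satisfy that interface; it is a hypothetical, duck-typed stand-in, not one of the library's actual Tokenizer implementations.

from flexeval import XER

class WhitespaceTokenizer:
    # Hypothetical stand-in: XER above only calls tokenizer.tokenize(text).
    def tokenize(self, text: str) -> list[str]:
        return text.split()

xer = XER(tokenizer=WhitespaceTokenizer())
# With this tokenizer, WER is computed over whitespace-separated tokens,
# while CER is still computed over the raw character strings.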